This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 19c3b8e682 TIKA-4692-improve-ooxml-sax-parsers (#2731)
19c3b8e682 is described below

commit 19c3b8e682399ce6cde2e6513a6ac31f32a6f689
Author: Tim Allison <[email protected]>
AuthorDate: Fri Apr 3 18:28:40 2026 -0400

    TIKA-4692-improve-ooxml-sax-parsers (#2731)
---
 .../org/apache/tika/sax/XHTMLContentHandler.java   |   1 +
 .../tika/parser/microsoft/OfficeParserConfig.java  |  18 ++
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |  26 ++-
 ...neTagManager.java => FormattingTagManager.java} | 140 +++++++-------
 .../microsoft/ooxml/OOXMLPartContentCollector.java |  48 ++---
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  | 179 ++++-------------
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |  97 +++++-----
 .../tika/parser/microsoft/ooxml/RunProperties.java |  11 ++
 .../ooxml/SXSLFPowerPointExtractorDecorator.java   | 213 ++++++---------------
 .../ooxml/SXWPFWordExtractorDecorator.java         |  89 ++++-----
 .../microsoft/ooxml/XWPFBodyContentsHandler.java   |  11 --
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    |  53 +++--
 .../parser/microsoft/ooxml/OOXMLDocxSAXTest.java   |  31 ++-
 13 files changed, 393 insertions(+), 524 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java 
b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
index 3fd7766d03..bae8c4b885 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
@@ -94,6 +94,7 @@ public class XHTMLContentHandler extends SafeContentHandler {
     private boolean headStarted = false;
     private boolean headEnded = false;
     private boolean useFrameset = false;
+
     public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
         this(handler, metadata, null);
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index c8886e5fdf..9f21b0b798 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -39,6 +39,7 @@ public class OfficeParserConfig implements Serializable {
 
     private boolean writeSelectHeadersInBody = false;
 
+    private boolean includeGlossary = true;
     private String dateOverrideFormat = null;
     private int maxOverride = 0;//ignore
 
@@ -213,6 +214,23 @@ public class OfficeParserConfig implements Serializable {
         this.concatenatePhoneticRuns = concatenatePhoneticRuns;
     }
 
+    public boolean isIncludeGlossary() {
+        return includeGlossary;
+    }
+
+    /**
+     * Whether or not to include the glossary (building blocks / AutoText) 
document
+     * from docx files.  The glossary can contain template content such as 
form field
+     * placeholders that may duplicate content already present in the main 
body.
+     * <p>
+     * Default: <code>true</code>
+     *
+     * @param includeGlossary whether or not to include glossary content
+     */
+    public void setIncludeGlossary(boolean includeGlossary) {
+        this.includeGlossary = includeGlossary;
+    }
+
     public boolean isIncludeMissingRows() {
         return includeMissingRows;
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index dd7c5eafaf..6beef7c1c4 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -623,6 +623,27 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
      * @param parentPart
      * @param contentHandler
      */
+    /**
+     * Safely resolves a related part, returning null if the part cannot be 
found
+     * instead of throwing {@link IllegalArgumentException}.
+     */
+    public static PackagePart safeGetRelatedPart(PackagePart source,
+                                           PackageRelationship relationship)
+            throws InvalidFormatException {
+        if (source == null || relationship == null) {
+            return null;
+        }
+        if (!source.isRelationshipExists(relationship)) {
+            return null;
+        }
+        try {
+            return source.getRelatedPart(relationship);
+        } catch (IllegalArgumentException e) {
+            // Relationship exists but target part is missing from the package
+            return null;
+        }
+    }
+
     void handleGeneralTextContainingPart(String contentType, String 
xhtmlClassLabel,
                                          PackagePart parentPart, Metadata 
parentMetadata,
                                          ContentHandler contentHandler) throws 
SAXException {
@@ -646,7 +667,10 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
                         relatedPartPRC.getRelationship(i);
                 try {
                     PackagePart relatedPartPart =
-                            
parentPart.getRelatedPart(relatedPartPackageRelationship);
+                            safeGetRelatedPart(parentPart, 
relatedPartPackageRelationship);
+                    if (relatedPartPart == null) {
+                        continue;
+                    }
                     try (InputStream stream = 
relatedPartPart.getInputStream()) {
                         XMLReaderUtils.parseSAX(stream,
                                 new EmbeddedContentHandler(contentHandler), 
context);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
similarity index 58%
rename from 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java
rename to 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
index 45eee33b57..0545cd0037 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
@@ -16,94 +16,112 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.util.Objects;
+
 import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.sax.XHTMLContentHandler;
 
 /**
- * Manages all inline XHTML elements (hyperlinks and formatting tags) as a
- * unified state machine, ensuring proper nesting.  The nesting order from
- * outermost to innermost is:
- * <pre>
- *   {@code <a> <b> <i> <s> <u> text </u> </s> </i> </b> </a>}
- * </pre>
- * <p>
- * When a change occurs to an outer element, all inner elements are closed
- * first, the change is applied, then inner elements are reopened as needed.
- * This prevents generating malformed XHTML with overlapping or unbalanced 
tags.
+ * Single owner of all run-scoped XHTML wrapper tags, ensuring proper nesting.
+ * Nesting order from outermost to innermost:
+ * {@code <a href="..."><b><i><s><u>text</u></s></i></b></a>}.
  * <p>
- * This class replaces the separate {@code FormattingTagManager} and the
- * {@code wroteHyperlinkStart} boolean that were previously tracked 
independently
- * in {@link OOXMLTikaBodyPartHandler}.
+ * Hyperlinks come from two OOXML sources with different lifecycles:
+ * <ul>
+ *   <li><b>Wrapper hyperlinks</b> (DOCX {@code <w:hyperlink>}, field-code 
HYPERLINK):
+ *       opened/closed explicitly via {@link #openHyperlink}/{@link 
#closeHyperlink},
+ *       span multiple runs.</li>
+ *   <li><b>Run-property hyperlinks</b> (PPTX {@code <a:hlinkClick>}):
+ *       set on {@link RunProperties#setHlinkClickUrl}, managed automatically
+ *       by {@link #applyFormatting} per-run.</li>
+ * </ul>
+ * Both emit the same {@code <a href="...">} XHTML. Wrapper hyperlinks take
+ * precedence — run properties cannot override an active wrapper.
  */
-class InlineTagManager {
+class FormattingTagManager {
 
     private final XHTMLContentHandler xhtml;
 
-    private boolean hyperlinkOpen = false;
+    // Outermost to innermost: hyperlink > bold > italic > strike > underline
+    private String currentHyperlink = null;
+    private boolean wrapperHyperlinkActive = false;
     private boolean isBold = false;
     private boolean isItalics = false;
     private boolean isStrikeThrough = false;
     private boolean isUnderline = false;
 
-    InlineTagManager(XHTMLContentHandler xhtml) {
+    FormattingTagManager(XHTMLContentHandler xhtml) {
         this.xhtml = xhtml;
     }
 
     /**
-     * Opens a hyperlink.  Since {@code <a>} is the outermost inline element,
-     * any existing inline elements (including a prior hyperlink) are closed
-     * first.
-     *
-     * @param href the link target; if {@code null} this is a no-op
+     * Opens a wrapper-style hyperlink (DOCX {@code <w:hyperlink>} or 
field-code).
+     * Closes any open formatting tags first to maintain nesting.
+     * No-op if url is null.
      */
-    void openHyperlink(String href) throws SAXException {
-        if (href == null) {
+    void openHyperlink(String url) throws SAXException {
+        if (url == null) {
             return;
         }
-        // Close everything — formatting then any existing hyperlink
-        closeAll();
-        xhtml.startElement("a", "href", href);
-        hyperlinkOpen = true;
+        closeFormattingTags();
+        if (currentHyperlink != null) {
+            xhtml.endElement("a");
+        }
+        xhtml.startElement("a", "href", url);
+        currentHyperlink = url;
+        wrapperHyperlinkActive = true;
     }
 
     /**
-     * Closes the current hyperlink and all formatting inside it.
-     * No-op if no hyperlink is open.
+     * Closes the active wrapper-style hyperlink. No-op if none was opened.
      */
     void closeHyperlink() throws SAXException {
-        if (!hyperlinkOpen) {
-            return;
+        if (currentHyperlink != null && wrapperHyperlinkActive) {
+            closeFormattingTags();
+            xhtml.endElement("a");
+            currentHyperlink = null;
+            wrapperHyperlinkActive = false;
         }
-        closeFormatting();
-        xhtml.endElement("a");
-        hyperlinkOpen = false;
     }
 
     /**
-     * Returns {@code true} if a hyperlink is currently open.
+     * Returns true if any hyperlink (wrapper or run-property) is currently 
open.
      */
-    boolean isHyperlinkOpen() {
-        return hyperlinkOpen;
+    boolean isHyperlinkActive() {
+        return currentHyperlink != null;
     }
 
     /**
      * Reconciles the current formatting state with the given run properties,
      * opening and closing XHTML tags as needed to maintain proper nesting.
-     * The nesting order for formatting is: {@code <b> <i> <s> <u>}.
      */
     void applyFormatting(RunProperties runProperties) throws SAXException {
-        if (runProperties.isBold() != isBold) {
-            // Bold is outermost formatting — close everything inside it
-            if (isUnderline) {
-                xhtml.endElement("u");
-                isUnderline = false;
+        // Run-property hyperlinks only when no wrapper is active
+        if (!wrapperHyperlinkActive) {
+            String newHyperlink = runProperties.getHlinkClickUrl();
+            if (!Objects.equals(newHyperlink, currentHyperlink)) {
+                closeFormattingTags();
+                if (currentHyperlink != null) {
+                    xhtml.endElement("a");
+                }
+                if (newHyperlink != null) {
+                    xhtml.startElement("a", "href", newHyperlink);
+                }
+                currentHyperlink = newHyperlink;
             }
+        }
+
+        if (runProperties.isBold() != isBold) {
             if (isStrikeThrough) {
                 xhtml.endElement("s");
                 isStrikeThrough = false;
             }
+            if (isUnderline) {
+                xhtml.endElement("u");
+                isUnderline = false;
+            }
             if (isItalics) {
                 xhtml.endElement("i");
                 isItalics = false;
@@ -117,14 +135,14 @@ class InlineTagManager {
         }
 
         if (runProperties.isItalics() != isItalics) {
-            if (isUnderline) {
-                xhtml.endElement("u");
-                isUnderline = false;
-            }
             if (isStrikeThrough) {
                 xhtml.endElement("s");
                 isStrikeThrough = false;
             }
+            if (isUnderline) {
+                xhtml.endElement("u");
+                isUnderline = false;
+            }
             if (runProperties.isItalics()) {
                 xhtml.startElement("i");
             } else {
@@ -158,10 +176,18 @@ class InlineTagManager {
     }
 
     /**
-     * Closes all currently open formatting tags in proper nesting order
-     * (innermost first: u, s, i, b).  Does NOT close the hyperlink.
+     * Closes all currently open tags in proper nesting order.
      */
-    void closeFormatting() throws SAXException {
+    void closeAll() throws SAXException {
+        closeFormattingTags();
+        if (currentHyperlink != null) {
+            xhtml.endElement("a");
+            currentHyperlink = null;
+            wrapperHyperlinkActive = false;
+        }
+    }
+
+    private void closeFormattingTags() throws SAXException {
         if (isUnderline) {
             xhtml.endElement("u");
             isUnderline = false;
@@ -179,18 +205,4 @@ class InlineTagManager {
             isBold = false;
         }
     }
-
-    /**
-     * Closes ALL open inline elements — formatting first, then hyperlink.
-     * This is the primary safety mechanism: call at every structural boundary
-     * (end of paragraph, table cell, table row, table, etc.) to guarantee
-     * well-formed XHTML.
-     */
-    void closeAll() throws SAXException {
-        closeFormatting();
-        if (hyperlinkOpen) {
-            xhtml.endElement("a");
-            hyperlinkOpen = false;
-        }
-    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
index 4b21831638..6cece158e8 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
@@ -54,9 +54,6 @@ class OOXMLPartContentCollector extends DefaultHandler {
     private String currentId = null;
     private ByteArrayOutputStream buffer = null;
     private int depth = 0;
-    // Prefix mappings that fired since the last startElement — need to be
-    // emitted as xmlns declarations on the next element inside a collected 
fragment.
-    private final java.util.List<String[]> pendingPrefixMappings = new 
java.util.ArrayList<>();
 
     /**
      * @param wrapperElementNames local names of wrapper elements to collect
@@ -79,12 +76,6 @@ class OOXMLPartContentCollector extends DefaultHandler {
     @Override
     public void startPrefixMapping(String prefix, String uri) {
         namespaceMappings.put(prefix, uri);
-        // Track prefix mappings that fire within a collected fragment —
-        // these need to be emitted as xmlns declarations on the next
-        // startElement so that re-parsed fragments have valid namespace 
bindings.
-        if (currentId != null) {
-            pendingPrefixMappings.add(new String[]{prefix, uri});
-        }
     }
 
     Map<String, byte[]> getContentMap() {
@@ -105,7 +96,9 @@ class OOXMLPartContentCollector extends DefaultHandler {
             if (id != null && !skipIds.contains(id)) {
                 currentId = id;
                 buffer = new ByteArrayOutputStream();
-                writeString(buildWrapperOpenTag());
+                // Don't write wrapper open tag yet — inline xmlns declarations
+                // (e.g., xmlns:a on nested elements) haven't been captured via
+                // startPrefixMapping. Defer to endElement when all are known.
                 depth = 0;
             }
         }
@@ -119,8 +112,16 @@ class OOXMLPartContentCollector extends DefaultHandler {
         }
 
         if (depth == 0) {
-            writeString("</w:body>");
-            contentMap.put(currentId, buffer.toByteArray());
+            // Build the wrapper now — all startPrefixMapping calls from nested
+            // elements have been captured, so inline xmlns declarations are 
included.
+            byte[] wrapperOpen = 
buildWrapperOpenTag().getBytes(StandardCharsets.UTF_8);
+            byte[] content = buffer.toByteArray();
+            ByteArrayOutputStream combined =
+                    new ByteArrayOutputStream(wrapperOpen.length + 
content.length + 16);
+            combined.write(wrapperOpen, 0, wrapperOpen.length);
+            combined.write(content, 0, content.length);
+            writeString(combined, "</w:body>");
+            contentMap.put(currentId, combined.toByteArray());
             currentId = null;
             buffer = null;
             return;
@@ -166,23 +167,6 @@ class OOXMLPartContentCollector extends DefaultHandler {
         String tagName = (qName != null && !qName.isEmpty()) ? qName : 
localName;
         StringBuilder sb = new StringBuilder();
         sb.append('<').append(tagName);
-        // Emit any namespace declarations that fired since the last element.
-        // In namespace-aware SAX, xmlns:prefix attributes are reported as
-        // startPrefixMapping events, NOT as attributes — so they must be
-        // re-serialized explicitly for the fragment to be re-parseable.
-        if (!pendingPrefixMappings.isEmpty()) {
-            for (String[] mapping : pendingPrefixMappings) {
-                String prefix = mapping[0];
-                String nsUri = mapping[1];
-                if (prefix == null || prefix.isEmpty()) {
-                    sb.append(" xmlns=\"").append(escape(nsUri)).append("\"");
-                } else {
-                    sb.append(" xmlns:").append(prefix).append("=\"")
-                            .append(escape(nsUri)).append("\"");
-                }
-            }
-            pendingPrefixMappings.clear();
-        }
         for (int i = 0; i < atts.getLength(); i++) {
             String attName = atts.getQName(i);
             if (attName == null || attName.isEmpty()) {
@@ -197,8 +181,12 @@ class OOXMLPartContentCollector extends DefaultHandler {
     }
 
     private void writeString(String s) {
+        writeString(buffer, s);
+    }
+
+    private static void writeString(ByteArrayOutputStream target, String s) {
         byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
-        buffer.write(bytes, 0, bytes.length);
+        target.write(bytes, 0, bytes.length);
     }
 
     static String escape(String s) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index ac46090d1c..a18f52a4d2 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -55,7 +55,7 @@ public class OOXMLTikaBodyPartHandler
     private int pDepth = 0; //paragraph depth
     private int tableDepth = 0;//table depth
     private int sdtDepth = 0;//
-    private final InlineTagManager inlineTags;
+    private FormattingTagManager formattingTags;
 
     //TODO: fix this
     //pWithinCell should be an array/stack of given cell depths
@@ -68,14 +68,9 @@ public class OOXMLTikaBodyPartHandler
     //will need to replace this with a stack
     //if we're marking more than the first level <p/> element
     private String paragraphTag = null;
-    private boolean pendingParagraph = false;
-    private boolean paragraphTagOpen = false;
-    private ParagraphProperties pendingParagraphProperties = null;
-    private String pendingHyperlinkHref = null;
 
     private OOXMLInlineBodyPartMap inlinePartMap = 
OOXMLInlineBodyPartMap.EMPTY;
     private ParseContext parseContext = null;
-    private final java.util.List<String[]> pendingNoteIds = new 
java.util.ArrayList<>();
     private final java.util.List<String> pendingCommentIds = new 
java.util.ArrayList<>();
     private final java.util.Set<String> emittedCommentIds = new 
java.util.HashSet<>();
     private final Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap = 
new HashMap<>();
@@ -87,7 +82,7 @@ public class OOXMLTikaBodyPartHandler
     public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, Metadata 
metadata) {
         this.xhtml = xhtml;
         this.metadata = metadata;
-        this.inlineTags = new InlineTagManager(xhtml);
+        this.formattingTags = new FormattingTagManager(xhtml);
         this.styles = XWPFStylesShim.EMPTY_STYLES;
         this.listManager = XWPFListManager.EMPTY_LIST;
         this.includeDeletedText = false;
@@ -105,7 +100,7 @@ public class OOXMLTikaBodyPartHandler
                                     OfficeParserConfig parserConfig, Metadata 
metadata) {
         this.xhtml = xhtml;
         this.metadata = metadata;
-        this.inlineTags = new InlineTagManager(xhtml);
+        this.formattingTags = new FormattingTagManager(xhtml);
         this.styles = styles;
         this.listManager = listManager;
         this.includeDeletedText = parserConfig.isIncludeDeletedContent();
@@ -125,100 +120,35 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void run(RunProperties runProperties, String contents) throws 
SAXException {
-        ensureParagraphOpen();
-        flushPendingHyperlink();
-        inlineTags.applyFormatting(runProperties);
+        formattingTags.applyFormatting(runProperties);
         xhtml.characters(contents);
     }
 
-    private void flushPendingHyperlink() throws SAXException {
-        if (pendingHyperlinkHref != null) {
-            inlineTags.openHyperlink(pendingHyperlinkHref);
-            pendingHyperlinkHref = null;
-        }
-    }
-
     @Override
     public void hyperlinkStart(String link) throws SAXException {
-        // Defer hyperlink opening if no paragraph is open yet.
-        // Shape-level hyperlinks (cNvPr/hlinkClick) fire before any <p>,
-        // so we store the link and open it when the paragraph opens.
-        if (pendingParagraph || pDepth == 0) {
-            pendingHyperlinkHref = link;
-        } else {
-            inlineTags.openHyperlink(link);
-        }
+        formattingTags.openHyperlink(link);
     }
 
     @Override
     public void hyperlinkEnd() throws SAXException {
-        if (pendingHyperlinkHref != null) {
-            pendingHyperlinkHref = null;
-        } else {
-            inlineTags.closeHyperlink();
-        }
-    }
-
-    /**
-     * Closes any open inline elements (hyperlinks, formatting tags) in
-     * the correct nesting order.  Called before closing any structural
-     * element (paragraph, table cell, table row, table, etc.) to ensure
-     * well-formed XHTML.
-     */
-    void closeInlineElements() throws SAXException {
-        inlineTags.closeAll();
+        formattingTags.closeHyperlink();
     }
 
-
     @Override
     public void startParagraph(ParagraphProperties paragraphProperties) throws 
SAXException {
+
         //if you're in a table cell and you're after the first paragraph
         //make sure to prepend a \n
         if (tableCellDepth > 0 && pWithinCell > 0) {
             xhtml.characters(NEWLINE, 0, 1);
         }
-        // If we're about to nest a paragraph (e.g. inside a text box / shape),
-        // force-open the outer paragraph first so that inner content ends up
-        // inside the outer <p> tag rather than floating as raw text.
-        if (pendingParagraph && pDepth > 0) {
-            ensureParagraphOpen();
-        }
-        // Record the paragraph as pending — don't emit <p> yet.
-        // We defer opening until the first content arrives (via 
ensureParagraphOpen)
-        // so that style info from pPr is available.
-        pendingParagraph = true;
-        pendingParagraphProperties = paragraphProperties;
-        pDepth++;
-    }
-
-    @Override
-    public void setParagraphProperties(ParagraphProperties paragraphProperties)
-            throws SAXException {
-        // Copy the properties — the caller may reset the object after this 
call.
-        // The <p> tag hasn't been emitted yet, so this style will be applied 
when it opens.
-        if (pendingParagraph) {
-            pendingParagraphProperties = new 
ParagraphProperties(paragraphProperties);
-        }
-    }
-
-    /**
-     * Ensures the current paragraph's XHTML tag is open.  Called before any
-     * content is written (runs, hyperlinks, etc.) so that the deferred
-     * {@code <p>} tag is emitted with the correct style.
-     */
-    private void ensureParagraphOpen() throws SAXException {
-        if (!pendingParagraph) {
-            return;
-        }
-        pendingParagraph = false;
 
-        if (pDepth == 1 && tableDepth == 0 && sdtDepth == 0) {
+        if (pDepth == 0 && tableDepth == 0 && sdtDepth == 0) {
             paragraphTag = P;
             String styleClass = null;
-            ParagraphProperties pp = pendingParagraphProperties;
             //TIKA-2144 check that styles is not null
-            if (pp != null && pp.getStyleID() != null && styles != null) {
-                String styleName = styles.getStyleName(pp.getStyleID());
+            if (paragraphProperties.getStyleID() != null && styles != null) {
+                String styleName = 
styles.getStyleName(paragraphProperties.getStyleID());
                 if (styleName != null) {
                     WordExtractor.TagAndStyle tas =
                             WordExtractor.buildParagraphTagAndStyle(styleName, 
false);
@@ -227,39 +157,33 @@ public class OOXMLTikaBodyPartHandler
                 }
             }
 
+
             if (styleClass == null) {
                 xhtml.startElement(paragraphTag);
             } else {
                 xhtml.startElement(paragraphTag, "class", styleClass);
             }
-            paragraphTagOpen = true;
         }
 
-        if (pendingParagraphProperties != null) {
-            writeParagraphNumber(pendingParagraphProperties.getNumId(),
-                    pendingParagraphProperties.getIlvl(), listManager, xhtml);
-        }
-        pendingParagraphProperties = null;
+        writeParagraphNumber(paragraphProperties.getNumId(), 
paragraphProperties.getIlvl(),
+                listManager, xhtml);
+        pDepth++;
     }
 
+
     @Override
     public void endParagraph() throws SAXException {
-        ensureParagraphOpen();
-        closeInlineElements();
-        if (paragraphTagOpen) {
+        formattingTags.closeAll();
+        if (pDepth == 1 && tableDepth == 0) {
             xhtml.endElement(paragraphTag);
-            paragraphTagOpen = false;
         } else if (tableCellDepth > 0 && pWithinCell > 0) {
             xhtml.characters(NEWLINE, 0, 1);
         } else if (tableCellDepth == 0) {
             xhtml.characters(NEWLINE, 0, 1);
         }
 
-        // Emit any pending footnote/endnote and comment content after the
-        // paragraph closes.  Inlining mid-paragraph would create <div> inside
-        // <p>, and the inner handler's endParagraph() would close the outer
-        // <p> tag, corrupting state.
-        emitPendingNotes();
+        // Emit any pending comment content after the paragraph closes
+        // (matching the DOM parser's behavior of appending comments after 
paragraphs)
         emitPendingComments();
 
         if (tableCellDepth > 0) {
@@ -268,27 +192,6 @@ public class OOXMLTikaBodyPartHandler
         pDepth--;
     }
 
-    private void emitPendingNotes() throws SAXException {
-        if (pendingNoteIds.isEmpty()) {
-            return;
-        }
-        for (String[] noteTypeAndId : pendingNoteIds) {
-            String noteType = noteTypeAndId[0];
-            String id = noteTypeAndId[1];
-            byte[] xml = "footnote".equals(noteType)
-                    ? inlinePartMap.getFootnote(id)
-                    : inlinePartMap.getEndnote(id);
-            if (xml != null) {
-                inlineNoteContent(xml, noteType);
-            } else {
-                xhtml.characters("[");
-                xhtml.characters(id);
-                xhtml.characters("]");
-            }
-        }
-        pendingNoteIds.clear();
-    }
-
     private void emitPendingComments() throws SAXException {
         if (pendingCommentIds.isEmpty()) {
             return;
@@ -313,12 +216,7 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void startTable() throws SAXException {
-        // Close any open paragraph — <table> can't nest inside <p> in XHTML
-        closeInlineElements();
-        if (paragraphTagOpen) {
-            xhtml.endElement(paragraphTag);
-            paragraphTagOpen = false;
-        }
+
         xhtml.startElement("table");
         tableDepth++;
 
@@ -326,7 +224,7 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void endTable() throws SAXException {
-        closeInlineElements();
+
         xhtml.endElement("table");
         tableDepth--;
 
@@ -339,7 +237,6 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void endTableRow() throws SAXException {
-        closeInlineElements();
         xhtml.endElement("tr");
     }
 
@@ -351,7 +248,6 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void endTableCell() throws SAXException {
-        closeInlineElements();
         xhtml.endElement("td");
         pWithinCell = 0;
         tableCellDepth--;
@@ -359,7 +255,7 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void startSDT() throws SAXException {
-        inlineTags.closeAll();
+        formattingTags.closeAll();
         sdtDepth++;
     }
 
@@ -389,10 +285,14 @@ public class OOXMLTikaBodyPartHandler
         if (id == null) {
             return;
         }
-        // Defer footnote emission to after the paragraph closes.
-        // Inlining mid-paragraph creates <div> inside <p>, and the inner
-        // handler's endParagraph() closes the outer <p> tag, corrupting state.
-        pendingNoteIds.add(new String[]{"footnote", id});
+        byte[] xml = inlinePartMap.getFootnote(id);
+        if (xml != null) {
+            inlineNoteContent(xml, "footnote");
+        } else {
+            xhtml.characters("[");
+            xhtml.characters(id);
+            xhtml.characters("]");
+        }
     }
 
     @Override
@@ -400,7 +300,14 @@ public class OOXMLTikaBodyPartHandler
         if (id == null) {
             return;
         }
-        pendingNoteIds.add(new String[]{"endnote", id});
+        byte[] xml = inlinePartMap.getEndnote(id);
+        if (xml != null) {
+            inlineNoteContent(xml, "endnote");
+        } else {
+            xhtml.characters("[");
+            xhtml.characters(id);
+            xhtml.characters("]");
+        }
     }
 
     @Override
@@ -411,25 +318,19 @@ public class OOXMLTikaBodyPartHandler
     }
 
     private void inlineNoteContent(byte[] xml, String cssClass) throws 
SAXException {
-        // Close any open inline elements before inlining note content
-        // to ensure the <div> nests correctly
-        closeInlineElements();
         // Use the inline part map's relationship map which includes 
relationships
         // from the footnote/endnote parts (needed for picture resolution)
         Map<String, String> noteRelationships = 
inlinePartMap.getLinkedRelationships();
         xhtml.startElement("div", "class", cssClass);
-        OOXMLTikaBodyPartHandler innerHandler = new 
OOXMLTikaBodyPartHandler(xhtml);
         try {
             XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml),
                     new EmbeddedContentHandler(
                             new OOXMLWordAndPowerPointTextHandler(
-                                    innerHandler,
+                                    new OOXMLTikaBodyPartHandler(xhtml),
                                     noteRelationships)),
                     parseContext);
         } catch (TikaException | IOException e) {
             xhtml.characters("[" + cssClass + " parse error]");
-        } finally {
-            innerHandler.closeInlineElements();
         }
         xhtml.endElement("div");
     }
@@ -529,7 +430,7 @@ public class OOXMLTikaBodyPartHandler
     @Override
     public void startBookmark(String id, String name) throws SAXException {
         //skip bookmarks within hyperlinks
-        if (name != null && !inlineTags.isHyperlinkOpen()) {
+        if (name != null && !formattingTags.isHyperlinkActive()) {
             xhtml.startElement("a", "name", name);
             xhtml.endElement("a");
         }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index 00f1ff6c4a..46e25b299d 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -111,10 +111,6 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private final static String COMMENT_REFERENCE = "commentReference";
     private static final String TEXTBOX = "textbox";
     private static final String TXBX = "txbx"; // DrawingML text box (wps:txbx 
in mc:Choice)
-    private static final String SDT = "sdt";
-    private static final String SDT_PR = "sdtPr";
-    private static final String SDT_CONTENT = "sdtContent";
-    private static final String SHOWING_PLCHDR = "showingPlcHdr";
     private final static String FLD_CHAR = "fldChar";
     private final static String INSTR_TEXT = "instrText";
     private final static String FLD_CHAR_TYPE = "fldCharType";
@@ -141,7 +137,14 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private boolean inRPr = false;
     private boolean inNumPr = false;
     private boolean inRt = false;
-    private boolean inPPr = false;
+    //mechanism used to determine when to
+    //signal the start of the p, and still
+    //handle p with pPr and those without
+    private boolean lastStartElementWasP = false;
+    //have we signaled the start of a p?
+    //pPr can happen multiple times within a p
+    //<p><pPr/><r><t>text</t></r><pPr/></p>
+    private boolean pStarted = false;
     //alternate content can be embedded in itself.
     //need to track depth.
     //preferACChoice controls which branch is processed:
@@ -151,9 +154,13 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private int inACFallbackDepth = 0;
     private boolean inDelText = false;
     //buffers rt in ruby sections (see 17.3.3.25)
-    private boolean inHlinkClick = false;
     private boolean inTextBox = false;
     private boolean inV = false; //in c:v in chart file
+    // True when we're inside a <pPr> that was a direct child of <p> (the 
first child).
+    // Only those pPr elements should trigger startParagraph on close.
+    // pPr elements nested inside other elements (e.g., <a:pPr> inside <a:fld>)
+    // must not be treated as paragraph-level properties.
+    private boolean inParagraphLevelPPr = false;
     // Field code tracking for instrText-based hyperlinks
     private boolean inField = false;
     private boolean inInstrText = false;
@@ -164,10 +171,6 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private DateUtils dateUtils = new DateUtils();
 
     private boolean hiddenSlide = false;
-    // SDT (structured document tag) placeholder tracking
-    private boolean inSdtPr = false;
-    private boolean sdtIsPlaceholder = false;
-    private int sdtPlaceholderDepth = 0;
     private boolean hasAnimations = false;
 
     public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler 
bodyContentsHandler,
@@ -226,6 +229,17 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             throws SAXException {
         //TODO: checkBox, textBox, sym, headerReference, footerReference, 
commentRangeEnd
 
+        if (lastStartElementWasP && PPR.equals(localName)) {
+            // pPr is the first child of <p> — this is a paragraph-level pPr.
+            // Defer startParagraph until </pPr> so properties (style, 
numbering) are set first.
+            inParagraphLevelPPr = true;
+        } else if (lastStartElementWasP) {
+            // First child of <p> is not pPr — start paragraph immediately 
with defaults.
+            bodyContentsHandler.startParagraph(currPProperties);
+        }
+
+        lastStartElementWasP = false;
+
         if (uri != null && uri.equals(MC_NS)) {
             if (CHOICE.equals(localName)) {
                 inACChoiceDepth++;
@@ -254,9 +268,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         } else if (TAB.equals(localName)) {
             runBuffer.append(TAB_CHAR);
         } else if (P.equals(localName)) {
-            bodyContentsHandler.startParagraph(currPProperties);
-        } else if (PPR.equals(localName)) {
-            inPPr = true;
+            lastStartElementWasP = true;
         } else if (B.equals(localName)) { //TODO: add bCs
             if (inR && inRPr) {
                 currRunProperties.setBold(true);
@@ -318,8 +330,14 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             String hyperlink = null;
             if (hyperlinkId != null) {
                 hyperlink = linkedRelationships.get(hyperlinkId);
-                bodyContentsHandler.hyperlinkStart(hyperlink);
-                inHlinkClick = true;
+                if (inR) {
+                    // hlinkClick inside a run — treat as run property.
+                    // FormattingTagManager opens/closes <a> with the run 
lifecycle.
+                    currRunProperties.setHlinkClickUrl(hyperlink);
+                } else if (hyperlink != null) {
+                    // hlinkClick on a shape/picture (not in a run) — emit as 
self-closing ref
+                    bodyContentsHandler.externalRef("hlinkClick", hyperlink);
+                }
             }
         } else if (TBL.equals(localName)) {
             bodyContentsHandler.startTable();
@@ -329,20 +347,8 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             pictureTracker.setDescription(atts.getValue("", "descr"));
         } else if (PIC.equals(localName)) {
             pictureTracker.startPic(); //check for PIC_NS?
-        } else if (SDT.equals(localName)) {
-            // SDTs can nest; only track placeholder at outermost level
-            if (sdtPlaceholderDepth == 0) {
-                sdtIsPlaceholder = false;
-            }
-        } else if (SDT_PR.equals(localName)) {
-            inSdtPr = true;
-        } else if (SHOWING_PLCHDR.equals(localName) && inSdtPr) {
-            sdtIsPlaceholder = true;
-        } else if (SDT_CONTENT.equals(localName)) {
-            if (sdtIsPlaceholder) {
-                sdtPlaceholderDepth++;
-            }
-        } else if (FOOTNOTE_REFERENCE.equals(localName)) {
+        } //TODO: add sdt, sdtPr, sdtContent; statistically, those checks belong here
+        else if (FOOTNOTE_REFERENCE.equals(localName)) {
             String id = atts.getValue(W_NS, "id");
             bodyContentsHandler.footnoteReference(id);
         } else if (IMAGEDATA.equals(localName)) {
@@ -500,10 +506,6 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         }
         if (PIC.equals(localName)) { //PIC_NS
             pictureTracker.endPicture();
-            if (inHlinkClick) {
-                bodyContentsHandler.hyperlinkEnd();
-                inHlinkClick = false;
-            }
             return;
         } else if (RPR.equals(localName)) {
             inRPr = false;
@@ -511,10 +513,15 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             handleEndOfRun();
         } else if (T.equals(localName)) {
             inT = false;
-        } else if (PPR.equals(localName)) {
-            inPPr = false;
-            bodyContentsHandler.setParagraphProperties(currPProperties);
+        } else if (PPR.equals(localName) && inParagraphLevelPPr) {
+            // Only process as paragraph properties if this pPr was a direct 
child of <p>.
+            // pPr inside other elements (e.g., <a:fld> fields) must be 
ignored.
+            if (!pStarted) {
+                bodyContentsHandler.startParagraph(currPProperties);
+                pStarted = true;
+            }
             currPProperties.reset();
+            inParagraphLevelPPr = false;
         } else if (P.equals(localName)) {
             if (runBuffer.length() > 0) {
                 //<p><tab></p>...this will treat that as if it were
@@ -522,6 +529,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
                 bodyContentsHandler.run(currRunProperties, 
runBuffer.toString());
                 runBuffer.setLength(0);
             }
+            pStarted = false;
             bodyContentsHandler.endParagraph();
         } else if (TC.equals(localName)) {
             bodyContentsHandler.endTableCell();
@@ -538,14 +546,6 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             editType = EditType.NONE;
         } else if (HYPERLINK.equals(localName)) {
             bodyContentsHandler.hyperlinkEnd();
-        } else if (SDT_PR.equals(localName)) {
-            inSdtPr = false;
-        } else if (SDT_CONTENT.equals(localName)) {
-            if (sdtPlaceholderDepth > 0) {
-                sdtPlaceholderDepth--;
-            }
-        } else if (SDT.equals(localName)) {
-            sdtIsPlaceholder = false;
         } else if (PICT.equals(localName)) {
             pictureTracker.endPicture();
         } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a 
chart
@@ -571,16 +571,13 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
 
     private void handleEndOfRun() throws SAXException {
         bodyContentsHandler.run(currRunProperties, runBuffer.toString());
-        if (inHlinkClick) {
-            bodyContentsHandler.hyperlinkEnd();
-            inHlinkClick = false;
-        }
         inR = false;
         runBuffer.setLength(0);
         currRunProperties.setBold(false);
         currRunProperties.setItalics(false);
         currRunProperties.setStrike(false);
         currRunProperties.setUnderline(UnderlinePatterns.NONE.name());
+        currRunProperties.setHlinkClickUrl(null);
     }
 
     @Override
@@ -590,8 +587,6 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             return;
         } else if (!includeTextBox && inTextBox) {
             return;
-        } else if (sdtPlaceholderDepth > 0) {
-            return;
         }
 
         if (editType.equals(EditType.MOVE_FROM) && inT) {
@@ -617,8 +612,6 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             return;
         } else if (!includeTextBox && inTextBox) {
             return;
-        } else if (sdtPlaceholderDepth > 0) {
-            return;
         }
 
         if (inT) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
index 54d149f333..efed9c1348 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
@@ -30,6 +30,9 @@ public class RunProperties {
 
     UnderlinePatterns underline = UnderlinePatterns.NONE;
 
+    // PPTX hlinkClick hyperlink URL — set from <a:hlinkClick> inside <a:rPr>
+    String hlinkClickUrl = null;
+
     public boolean isItalics() {
         return italics;
     }
@@ -68,4 +71,12 @@ public class RunProperties {
             underline = UnderlinePatterns.SINGLE;
         }
     }
+
+    public String getHlinkClickUrl() {
+        return hlinkClickUrl;
+    }
+
+    public void setHlinkClickUrl(String url) {
+        this.hlinkClickUrl = url;
+    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index d6f5b9759d..7d8030afad 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -20,7 +20,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.zip.ZipException;
@@ -34,9 +33,7 @@ import 
org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
-import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -106,15 +103,28 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
         loadCommentAuthors();
         addCommentAuthorMetadata();
 
-        List<PackagePart> orderedSlides = getOrderedSlideParts();
+        PackageRelationshipCollection slidesPRC = null;
+        try {
+            slidesPRC = 
mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
+        } catch (InvalidFormatException e) {
+            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                    ExceptionUtils.getStackTrace(e));
+        }
 
         int hiddenSlideCount = 0;
-        for (PackagePart slidePart : orderedSlides) {
-            try {
-                hiddenSlideCount += handleSlidePart(slidePart, xhtml);
-            } catch (ZipException e) {
-                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
-                        ExceptionUtils.getStackTrace(e));
+        if (slidesPRC != null && slidesPRC.size() > 0) {
+            for (int i = 0; i < slidesPRC.size(); i++) {
+                try {
+                    PackagePart slidePart =
+                            safeGetRelatedPart(mainDocument, 
slidesPRC.getRelationship(i));
+                    if (slidePart == null) {
+                        continue;
+                    }
+                    hiddenSlideCount += handleSlidePart(slidePart, xhtml);
+                } catch (InvalidFormatException | ZipException e) {
+                    
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                            ExceptionUtils.getStackTrace(e));
+                }
             }
         }
         if (hiddenSlideCount > 0) {
@@ -122,9 +132,16 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
         }
 
         if (config.isIncludeSlideMasterContent()) {
-            // Handout master is presentation-level, not per-slide
-            handleTextPartWithCleanup(HANDOUT_MASTER, "slide-handout-master", 
mainDocument,
-                    xhtml, new HashMap<>(), false);
+            
handleGeneralTextContainingPart(XSLFRelation.SLIDE_MASTER.getRelation(), 
"slide-master",
+                    mainDocument, metadata, new PlaceHolderSkipper(
+                            new OOXMLWordAndPowerPointTextHandler(
+                                    new OOXMLTikaBodyPartHandler(xhtml),
+                                    new HashMap<>())));
+
+            handleGeneralTextContainingPart(HANDOUT_MASTER, 
"slide-handout-master", mainDocument,
+                    metadata,
+                    new OOXMLWordAndPowerPointTextHandler(new 
OOXMLTikaBodyPartHandler(xhtml),
+                            new HashMap<>()));
         }
     }
 
@@ -143,7 +160,7 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
         for (int i = 0; i < prc.size(); i++) {
             PackagePart commentAuthorsPart = null;
             try {
-                commentAuthorsPart = 
mainDocument.getRelatedPart(prc.getRelationship(i));
+                commentAuthorsPart = safeGetRelatedPart(mainDocument, 
prc.getRelationship(i));
             } catch (InvalidFormatException e) {
                 metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                         ExceptionUtils.getStackTrace(e));
@@ -171,103 +188,6 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
         }
     }
 
-    /**
-     * Returns the first related part for the given relationship type,
-     * or null if none found.
-     */
-    private PackagePart getRelatedPartByType(PackagePart source, String 
relationType) {
-        try {
-            PackageRelationshipCollection prc = 
source.getRelationshipsByType(relationType);
-            if (prc != null && prc.size() > 0) {
-                return source.getRelatedPart(prc.getRelationship(0));
-            }
-        } catch (InvalidFormatException | IllegalArgumentException e) {
-            // missing part
-        }
-        return null;
-    }
-
-    /**
-     * Returns slide parts in presentation order by parsing the sldIdLst
-     * from presentation.xml.  Any slides found in .rels but not in
-     * the sldIdLst are appended at the end.
-     */
-    private List<PackagePart> getOrderedSlideParts() {
-        // Step 1: parse presentation.xml to get ordered rIds from sldIdLst
-        List<String> orderedRIds = new ArrayList<>();
-        try (InputStream is = mainDocument.getInputStream()) {
-            XMLReaderUtils.parseSAX(is, new DefaultHandler() {
-                private boolean inSldIdLst = false;
-
-                @Override
-                public void startElement(String uri, String localName, String 
qName,
-                                         Attributes atts) {
-                    if ("sldIdLst".equals(localName)) {
-                        inSldIdLst = true;
-                    } else if (inSldIdLst && "sldId".equals(localName)) {
-                        String rId = atts.getValue(
-                                
"http://schemas.openxmlformats.org/officeDocument/2006/relationships";,
-                                "id");
-                        if (rId != null) {
-                            orderedRIds.add(rId);
-                        }
-                    }
-                }
-
-                @Override
-                public void endElement(String uri, String localName, String 
qName) {
-                    if ("sldIdLst".equals(localName)) {
-                        inSldIdLst = false;
-                    }
-                }
-            }, context);
-        } catch (Exception e) {
-            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
-                    ExceptionUtils.getStackTrace(e));
-        }
-
-        // Step 2: build rId -> PackagePart map from relationships
-        Map<String, PackagePart> rIdToSlide = new LinkedHashMap<>();
-        try {
-            PackageRelationshipCollection slidesPRC =
-                    
mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
-            if (slidesPRC != null) {
-                for (int i = 0; i < slidesPRC.size(); i++) {
-                    PackageRelationship rel = slidesPRC.getRelationship(i);
-                    try {
-                        PackagePart part = mainDocument.getRelatedPart(rel);
-                        if (part != null) {
-                            rIdToSlide.put(rel.getId(), part);
-                        }
-                    } catch (InvalidFormatException | IllegalArgumentException 
e) {
-                        // skip missing parts
-                    }
-                }
-            }
-        } catch (InvalidFormatException e) {
-            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
-                    ExceptionUtils.getStackTrace(e));
-        }
-
-        // Step 3: assemble in presentation order, then append orphans
-        List<PackagePart> result = new ArrayList<>();
-        for (String rId : orderedRIds) {
-            PackagePart part = rIdToSlide.remove(rId);
-            if (part != null) {
-                result.add(part);
-            }
-        }
-        // append any slides in .rels but not in sldIdLst
-        if (!rIdToSlide.isEmpty()) {
-            metadata.set(Office.NUM_UNLISTED_SLIDES, rIdToSlide.size());
-            for (PackagePart part : rIdToSlide.values()) {
-                metadata.add(Office.UNLISTED_SLIDE_NAMES, 
part.getPartName().getName());
-            }
-            result.addAll(rIdToSlide.values());
-        }
-        return result;
-    }
-
     /**
      * @return 1 if the slide is hidden, 0 otherwise
      */
@@ -278,10 +198,9 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
 
         int hidden = 0;
         xhtml.startElement("div", "class", "slide-content");
-        OOXMLTikaBodyPartHandler bodyHandler = new 
OOXMLTikaBodyPartHandler(xhtml, metadata);
         try (InputStream stream = slidePart.getInputStream()) {
             OOXMLWordAndPowerPointTextHandler wordAndPPTHandler = new 
OOXMLWordAndPowerPointTextHandler(
-                    bodyHandler, linkedRelationships);
+                    new OOXMLTikaBodyPartHandler(xhtml, metadata), 
linkedRelationships);
             XMLReaderUtils.parseSAX(stream,
                     new EmbeddedContentHandler(wordAndPPTHandler), context);
             if (wordAndPPTHandler.isHiddenSlide()) {
@@ -295,64 +214,42 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                     ExceptionUtils.getStackTrace(e));
         }
-        bodyHandler.closeInlineElements();
+
         xhtml.endElement("div");
 
         if (config.isIncludeSlideMasterContent()) {
-            // Extract the slide layout (per-slide)
-            PackagePart layoutPart = getRelatedPartByType(slidePart,
-                    XSLFRelation.SLIDE_LAYOUT.getRelation());
-            if (layoutPart != null) {
-                
handleTextPartWithCleanup(XSLFRelation.SLIDE_LAYOUT.getRelation(),
-                        "slide-master-content", slidePart, xhtml, 
linkedRelationships, true);
-                // Follow layout → slide master chain
-                
handleTextPartWithCleanup(XSLFRelation.SLIDE_MASTER.getRelation(),
-                        "slide-master-content", layoutPart, xhtml, 
linkedRelationships, true);
-            }
+            
handleGeneralTextContainingPart(XSLFRelation.SLIDE_LAYOUT.getRelation(),
+                    "slide-master-content", slidePart, metadata, new 
PlaceHolderSkipper(
+                            new OOXMLWordAndPowerPointTextHandler(
+                                    new OOXMLTikaBodyPartHandler(xhtml), 
linkedRelationships)));
         }
         if (config.isIncludeSlideNotes()) {
-            handleTextPartWithCleanup(XSLFRelation.NOTES.getRelation(), 
"slide-notes",
-                    slidePart, xhtml, linkedRelationships, false);
+            handleGeneralTextContainingPart(XSLFRelation.NOTES.getRelation(), 
"slide-notes",
+                    slidePart, metadata,
+                    new OOXMLWordAndPowerPointTextHandler(new 
OOXMLTikaBodyPartHandler(xhtml),
+                            linkedRelationships));
             if (config.isIncludeSlideMasterContent()) {
-                
handleTextPartWithCleanup(XSLFRelation.NOTES_MASTER.getRelation(),
-                        "slide-notes-master", slidePart, xhtml, 
linkedRelationships, false);
+                
handleGeneralTextContainingPart(XSLFRelation.NOTES_MASTER.getRelation(),
+                        "slide-notes-master", slidePart, metadata,
+                        new OOXMLWordAndPowerPointTextHandler(new 
OOXMLTikaBodyPartHandler(xhtml),
+                                linkedRelationships));
+
             }
         }
         handleGeneralTextContainingPart(XSLFRelation.COMMENTS.getRelation(), 
null, slidePart,
                 metadata, new XSLFCommentsHandler(xhtml, commentAuthors));
 
-        handleTextPartWithCleanup(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
-                "diagram-data", slidePart, xhtml, linkedRelationships, false);
+        
handleGeneralTextContainingPart(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
+                "diagram-data", slidePart, metadata,
+                new OOXMLWordAndPowerPointTextHandler(new 
OOXMLTikaBodyPartHandler(xhtml),
+                        linkedRelationships));
 
-        handleTextPartWithCleanup(XSLFRelation.CHART.getRelation(), "chart", 
slidePart,
-                xhtml, linkedRelationships, false);
+        handleGeneralTextContainingPart(XSLFRelation.CHART.getRelation(), 
"chart", slidePart,
+                metadata, new OOXMLWordAndPowerPointTextHandler(new 
OOXMLTikaBodyPartHandler(xhtml),
+                        linkedRelationships));
         return hidden;
     }
 
-    /**
-     * Handles a text-containing part with guaranteed inline element cleanup.
-     * Creates an OOXMLTikaBodyPartHandler, parses the part, then calls
-     * closeInlineElements() to ensure no unclosed tags leak into subsequent 
output.
-     *
-     * @param usePlaceholderSkipper if true, wraps the handler in a 
PlaceHolderSkipper
-     */
-    private void handleTextPartWithCleanup(String contentType, String 
xhtmlClassLabel,
-                                           PackagePart parentPart, 
XHTMLContentHandler xhtml,
-                                           Map<String, String> 
linkedRelationships,
-                                           boolean usePlaceholderSkipper) 
throws SAXException {
-        OOXMLTikaBodyPartHandler bodyHandler = new 
OOXMLTikaBodyPartHandler(xhtml);
-        OOXMLWordAndPowerPointTextHandler textHandler =
-                new OOXMLWordAndPowerPointTextHandler(bodyHandler, 
linkedRelationships);
-        DefaultHandler handler = usePlaceholderSkipper
-                ? new PlaceHolderSkipper(textHandler) : textHandler;
-        try {
-            handleGeneralTextContainingPart(contentType, xhtmlClassLabel, 
parentPart,
-                    metadata, handler);
-        } finally {
-            bodyHandler.closeInlineElements();
-        }
-    }
-
     /**
      * In PowerPoint files, slides have things embedded in them,
      * and slide drawings which have the images
@@ -374,7 +271,7 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
             for (int i = 0; i < slidePRC.size(); i++) {
                 PackagePart slidePart = null;
                 try {
-                    slidePart = 
mainDocument.getRelatedPart(slidePRC.getRelationship(i));
+                    slidePart = safeGetRelatedPart(mainDocument, 
slidePRC.getRelationship(i));
                 } catch (InvalidFormatException e) {
                     
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                             ExceptionUtils.getStackTrace(e));
@@ -397,7 +294,7 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
                 for (int i = 0; i < prc.size(); i++) {
                     PackagePart pp = null;
                     try {
-                        pp = 
mainDocument.getRelatedPart(prc.getRelationship(i));
+                        pp = safeGetRelatedPart(mainDocument, 
prc.getRelationship(i));
                     } catch (InvalidFormatException e) {
                         
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                                 ExceptionUtils.getStackTrace(e));
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 393370662b..88c7aaa0f7 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -45,6 +45,7 @@ import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.EMFParser;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFFeatureExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim;
@@ -126,16 +127,21 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
             }
         }
         //handle glossary document
-        pps = 
opcPackage.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
-        if (pps != null) {
-            if (pps.size() > 0) {
-                xhtml.startElement("div", "class", "glossary");
-
-                for (PackagePart pp : pps) {
-                    //likely only one, but why not...
-                    handleDocumentPart(pp, xhtml);
+        OfficeParserConfig officeParserConfig = 
context.get(OfficeParserConfig.class,
+                new OfficeParserConfig());
+        if (officeParserConfig.isIncludeGlossary()) {
+            pps = opcPackage.getPartsByContentType(
+                    XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
+            if (pps != null) {
+                if (pps.size() > 0) {
+                    xhtml.startElement("div", "class", "glossary");
+
+                    for (PackagePart pp : pps) {
+                        //likely only one, but why not...
+                        handleDocumentPart(pp, xhtml);
+                    }
+                    xhtml.endElement("div");
                 }
-                xhtml.endElement("div");
             }
         }
 
@@ -223,23 +229,8 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                     }
                 }
             }
-        } catch (InvalidFormatException | IOException | TikaException | 
SAXException |
-                 IllegalArgumentException e) {
-            // swallow -- POI throws IllegalArgumentException when
-            // a relationship references a part missing from the package
-        }
-    }
-
-    /**
-     * Safely resolves a related part from a relationship.  Returns {@code 
null}
-     * instead of throwing {@link IllegalArgumentException} when the target
-     * part is missing from the package (e.g. truncated / salvaged zips).
-     */
-    private static PackagePart safeGetRelatedPart(PackagePart source, 
PackageRelationship rel) {
-        try {
-            return source.getRelatedPart(rel);
-        } catch (InvalidFormatException | IllegalArgumentException e) {
-            return null;
+        } catch (InvalidFormatException | IOException | TikaException | 
SAXException e) {
+            // swallow
         }
     }
 
@@ -286,10 +277,11 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                     for (int i = 0; i < headersPRC.size(); i++) {
                         PackagePart header =
                                 safeGetRelatedPart(documentPart, 
headersPRC.getRelationship(i));
-                        if (header != null) {
-                            handlePart(header, styles, listManager, xhtml,
-                                    OOXMLInlineBodyPartMap.EMPTY);
+                        if (header == null) {
+                            continue;
                         }
+                        handlePart(header, styles, listManager, xhtml,
+                                OOXMLInlineBodyPartMap.EMPTY);
                     }
                 }
             } catch (InvalidFormatException | ZipException e) {
@@ -327,10 +319,11 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                     for (int i = 0; i < prc.size(); i++) {
                         PackagePart packagePart =
                                 safeGetRelatedPart(documentPart, 
prc.getRelationship(i));
-                        if (packagePart != null) {
-                            handlePart(packagePart, styles, listManager, xhtml,
-                                    OOXMLInlineBodyPartMap.EMPTY);
+                        if (packagePart == null) {
+                            continue;
                         }
+                        handlePart(packagePart, styles, listManager, xhtml,
+                                OOXMLInlineBodyPartMap.EMPTY);
                     }
                 }
             } catch (InvalidFormatException | ZipException e) {
@@ -391,11 +384,10 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                             linkedRelationships, 
config.isIncludeShapeBasedContent(),
                             config.isConcatenatePhoneticRuns(),
                             config.isPreferAlternateContentChoice())), 
context);
-        } catch (TikaException | IOException | SAXException e) {
+        } catch (TikaException | IOException e) {
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                     ExceptionUtils.getStackTrace(e));
         }
-        bodyHandler.closeInlineElements();
         Map<String, EmbeddedPartMetadata> partMetadata = 
bodyHandler.getEmbeddedPartMetadataMap();
         resolveEmfNames(packagePart, partMetadata);
         embeddedPartMetadataMap.putAll(partMetadata);
@@ -410,7 +402,7 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                 continue;
             }
             try {
-                PackagePart emfPart = documentPart.getRelatedPart(
+                PackagePart emfPart = safeGetRelatedPart(documentPart,
                         documentPart.getRelationship(emfRId));
                 if (emfPart == null || emfPart.getContentType() == null) {
                     continue;
@@ -471,23 +463,20 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
             OOXMLPartContentCollector collector =
                     new OOXMLPartContentCollector(wrapperElements, skipIds);
             for (int i = 0; i < prc.size(); i++) {
-                try {
-                    PackagePart part = 
documentPart.getRelatedPart(prc.getRelationship(i));
-                    // collect the part's linked relationships (for picture 
resolution)
-                    Map<String, String> partRels =
-                            loadLinkedRelationships(part, true, metadata);
-                    allRelationships.putAll(partRels);
-                    try (InputStream stream = part.getInputStream()) {
-                        XMLReaderUtils.parseSAX(stream, collector, context);
-                    }
-                } catch (InvalidFormatException | IOException | TikaException |
-                         SAXException e) {
-                    
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
-                            ExceptionUtils.getStackTrace(e));
+                PackagePart part = safeGetRelatedPart(documentPart, 
prc.getRelationship(i));
+                if (part == null) {
+                    continue;
+                }
+                // collect the part's linked relationships (for picture 
resolution)
+                Map<String, String> partRels =
+                        loadLinkedRelationships(part, true, metadata);
+                allRelationships.putAll(partRels);
+                try (InputStream stream = part.getInputStream()) {
+                    XMLReaderUtils.parseSAX(stream, collector, context);
                 }
             }
             return collector.getContentMap();
-        } catch (InvalidFormatException e) {
+        } catch (InvalidFormatException | IOException | TikaException | 
SAXException e) {
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                     ExceptionUtils.getStackTrace(e));
             return Collections.emptyMap();
@@ -530,7 +519,7 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                 }
                 return new XWPFNumberingShim(numberingPart, context);
             }
-        } catch (Exception e) {
+        } catch (IOException | InvalidFormatException | TikaException | 
SAXException e) {
             //swallow
         }
         return null;
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java
index dbd6996886..a9eb400e98 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java
@@ -48,17 +48,6 @@ public interface XWPFBodyContentsHandler {
 
     void startParagraph(ParagraphProperties paragraphProperties) throws 
SAXException;
 
-    /**
-     * Updates the properties (style, numbering) for the current pending 
paragraph.
-     * Called when {@code </pPr>} is encountered, after {@link 
#startParagraph} but
-     * before any content.  The body handler defers opening the XHTML {@code 
<p>}
-     * tag until the first content arrives, so this style info will be 
available.
-     */
-    default void setParagraphProperties(ParagraphProperties 
paragraphProperties)
-            throws SAXException {
-        // Default no-op for backward compatibility
-    }
-
     void endParagraph() throws SAXException;
 
     void startTable() throws SAXException;
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 5a4676d631..74976ea34e 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -42,6 +42,7 @@ import org.apache.tika.exception.RuntimeSAXException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
 import org.apache.tika.parser.microsoft.ooxml.EditType;
 import 
org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
 import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
@@ -61,6 +62,7 @@ public class XWPFEventBasedWordExtractor implements 
POIXMLTextExtractor {
 
     private OPCPackage container;
     private POIXMLProperties properties;
+    private boolean includeGlossary = true;
 
     public XWPFEventBasedWordExtractor(OPCPackage container)
             throws OpenXML4JException, IOException {
@@ -84,6 +86,10 @@ public class XWPFEventBasedWordExtractor implements 
POIXMLTextExtractor {
         return this.container;
     }
 
+    public void setIncludeGlossary(boolean includeGlossary) {
+        this.includeGlossary = includeGlossary;
+    }
+
     public POIXMLProperties.CoreProperties getCoreProperties() {
         POIXMLProperties props = getOrCreateProperties();
         return props != null ? props.getCoreProperties() : null;
@@ -130,23 +136,26 @@ public class XWPFEventBasedWordExtractor implements 
POIXMLTextExtractor {
             }
         }
         //handle glossary document
-        pps = 
container.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
-
-        if (pps != null) {
-            for (PackagePart pp : pps) {
-                //likely only one, but why not...
-                try {
-                    handleDocumentPart(pp, sb);
-                } catch (IOException e) {
-                    LOG.warn("IOException handling glossary document part", e);
-                } catch (SAXException e) {
-                    if (WriteLimitReachedException.isWriteLimitReached(e)) {
-                        throw new RuntimeSAXException(e);
+        if (includeGlossary) {
+            pps = container.getPartsByContentType(
+                    XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
+
+            if (pps != null) {
+                for (PackagePart pp : pps) {
+                    //likely only one, but why not...
+                    try {
+                        handleDocumentPart(pp, sb);
+                    } catch (IOException e) {
+                        LOG.warn("IOException handling glossary document 
part", e);
+                    } catch (SAXException e) {
+                        if (WriteLimitReachedException.isWriteLimitReached(e)) 
{
+                            throw new RuntimeSAXException(e);
+                        }
+                        //swallow this because we don't actually call it
+                        LOG.warn("SAXException handling glossary document 
part", e);
+                    } catch (TikaException e) {
+                        LOG.warn("ParseException handling document part", e);
                     }
-                    //swallow this because we don't actually call it
-                    LOG.warn("SAXException handling glossary document part", 
e);
-                } catch (TikaException e) {
-                    LOG.warn("ParseException handling document part", e);
                 }
             }
         }
@@ -184,7 +193,11 @@ public class XWPFEventBasedWordExtractor implements 
POIXMLTextExtractor {
                     
documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
             if (headersPRC != null) {
                 for (int i = 0; i < headersPRC.size(); i++) {
-                    PackagePart header = 
documentPart.getRelatedPart(headersPRC.getRelationship(i));
+                    PackagePart header = 
AbstractOOXMLExtractor.safeGetRelatedPart(
+                            documentPart, headersPRC.getRelationship(i));
+                    if (header == null) {
+                        continue;
+                    }
                     handlePart(header, xwpfListManager, sb);
                 }
             }
@@ -204,7 +217,11 @@ public class XWPFEventBasedWordExtractor implements 
POIXMLTextExtractor {
                 if (prc != null) {
                     for (int i = 0; i < prc.size(); i++) {
                         PackagePart packagePart =
-                                
documentPart.getRelatedPart(prc.getRelationship(i));
+                                AbstractOOXMLExtractor.safeGetRelatedPart(
+                                        documentPart, prc.getRelationship(i));
+                        if (packagePart == null) {
+                            continue;
+                        }
                         handlePart(packagePart, xwpfListManager, sb);
                     }
                 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
index 38f0cb080b..ac533400e0 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
@@ -30,6 +30,7 @@ import org.junit.jupiter.api.Test;
 import org.apache.tika.config.loader.TikaLoader;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -116,7 +117,7 @@ public class OOXMLDocxSAXTest extends AbstractOOXMLDocxTest 
{
                 content);
 
         assertContains("<td>Embedded table r1c1", content);
-        assertContainsCount("This is text within a shape", content, 1);
+        assertContainsCount("<p>This is text within a shape", content, 1);
         assertContains("<p>Rich text content control", content);
         assertContains("<p>Simple text content control", content);
         assertContains("Repeating content", content);
@@ -363,4 +364,32 @@ public class OOXMLDocxSAXTest extends 
AbstractOOXMLDocxTest {
         Metadata m = metadataList.get(0);
         assertEquals("true", m.get(Office.HAS_FRAMESETS));
     }
+
+    /**
+     * Test with external DOCX files known to trigger "prefix not bound"
+     * from missing namespace declarations in footnote/endnote fragments.
+     * Enable by setting system property "tika.test.docx.namespace" to a file 
path.
+     */
+    @Test
+    public void testNamespaceInFragments() throws Exception {
+        String filePath = System.getProperty("tika.test.docx.namespace");
+        if (filePath == null) {
+            return;
+        }
+        java.io.File f = new java.io.File(filePath);
+        if (!f.isFile()) {
+            return;
+        }
+        AutoDetectParser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        org.xml.sax.ContentHandler handler =
+                new org.apache.tika.sax.BodyContentHandler(-1);
+        try (TikaInputStream tis = TikaInputStream.get(f.toPath())) {
+            parser.parse(tis, handler, metadata, getParseContext());
+        }
+        String[] warnings = 
metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING);
+        for (String w : warnings) {
+            assertNotContained("not bound", w);
+        }
+    }
 }

Reply via email to