This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4fb2252ea8da832cef27e51913b4d8cd263f2590
Author: tallison <[email protected]>
AuthorDate: Wed Mar 18 06:44:56 2026 -0400

    improve sax ooxml - WIP
---
 .../tika/parser/microsoft/OfficeParserConfig.java  |  28 +++
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  |  42 +++-
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   | 256 ++++-----------------
 .../ooxml/SXSLFPowerPointExtractorDecorator.java   |  31 ++-
 .../ooxml/SXWPFWordExtractorDecorator.java         |   6 +-
 .../xslf/XSLFEventBasedPowerPointExtractor.java    |   7 +-
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    |   6 +-
 7 files changed, 156 insertions(+), 220 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 7bba9647b1..db6d4e78e9 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -35,6 +35,8 @@ public class OfficeParserConfig implements Serializable {
     private boolean useSAXDocxExtractor = false;
     private boolean useSAXPptxExtractor = false;
 
+    private boolean preferAlternateContentChoice = true;
+
     private boolean writeSelectHeadersInBody = false;
 
     private boolean extractAllAlternativesFromMSG = false;
@@ -167,6 +169,32 @@ public class OfficeParserConfig implements Serializable {
         this.useSAXPptxExtractor = useSAXPptxExtractor;
     }
 
+    /**
+     * In OOXML, {@code mc:AlternateContent} wraps {@code mc:Choice} 
(newer/richer
+     * rendering, e.g. DrawingML text boxes) and {@code mc:Fallback} (degraded 
VML
+     * for older consumers). When {@code true} (default), the SAX parser 
processes
+     * the Choice branch and skips Fallback. When {@code false}, it processes
+     * Fallback and skips Choice (legacy behavior prior to Tika 4.x).
+     * <p>
+     * For text extraction, Choice typically contains equal or more content 
than
+     * Fallback.
+     * <p>
+     * Default: {@code true}
+     *
+     * @return whether to prefer mc:Choice over mc:Fallback
+     */
+    public boolean isPreferAlternateContentChoice() {
+        return preferAlternateContentChoice;
+    }
+
+    /**
+     * @param preferAlternateContentChoice whether to prefer mc:Choice over 
mc:Fallback
+     * @see #isPreferAlternateContentChoice()
+     */
+    public void setPreferAlternateContentChoice(boolean 
preferAlternateContentChoice) {
+        this.preferAlternateContentChoice = preferAlternateContentChoice;
+    }
+
     public boolean isConcatenatePhoneticRuns() {
         return concatenatePhoneticRuns;
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index 4bc445fb5e..1595436ad2 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -24,13 +24,15 @@ import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.WordExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
 import org.apache.tika.sax.XHTMLContentHandler;
 
 public class OOXMLTikaBodyPartHandler
-        implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
+        implements XWPFBodyContentsHandler {
 
     private static final String P = "p";
 
@@ -41,6 +43,7 @@ public class OOXMLTikaBodyPartHandler
     private final boolean includeDeletedText;
     private final boolean includeMoveFromText;
     private final XWPFStylesShim styles;
+    private final Metadata metadata;
 
     private int pDepth = 0; //paragraph depth
     private int tableDepth = 0;//table depth
@@ -64,7 +67,12 @@ public class OOXMLTikaBodyPartHandler
     private String paragraphTag = null;
 
     public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml) {
+        this(xhtml, null);
+    }
+
+    public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, Metadata 
metadata) {
         this.xhtml = xhtml;
+        this.metadata = metadata;
         this.styles = XWPFStylesShim.EMPTY_STYLES;
         this.listManager = XWPFListManager.EMPTY_LIST;
         this.includeDeletedText = false;
@@ -72,8 +80,16 @@ public class OOXMLTikaBodyPartHandler
     }
 
     public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim 
styles,
-                                    XWPFListManager listManager, 
OfficeParserConfig parserConfig) {
+                                    XWPFListManager listManager,
+                                    OfficeParserConfig parserConfig) {
+        this(xhtml, styles, listManager, parserConfig, null);
+    }
+
+    public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim 
styles,
+                                    XWPFListManager listManager,
+                                    OfficeParserConfig parserConfig, Metadata 
metadata) {
         this.xhtml = xhtml;
+        this.metadata = metadata;
         this.styles = styles;
         this.listManager = listManager;
         this.includeDeletedText = parserConfig.isIncludeDeletedContent();
@@ -272,7 +288,7 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void startEditedSection(String editor, Date date,
-                                   OOXMLWordAndPowerPointTextHandler.EditType 
editType) {
+                                   EditType editType) {
         //no-op
     }
 
@@ -326,6 +342,9 @@ public class OOXMLTikaBodyPartHandler
         if (relId == null) {
             return;
         }
+        if (metadata != null) {
+            metadata.set(Office.HAS_LINKED_OLE_OBJECTS, true);
+        }
         // Emit as an external reference anchor - linked OLE objects reference 
external files
         AttributesImpl attributes = new AttributesImpl();
         attributes.addAttribute("", "class", "class", "CDATA", 
"external-ref-linkedOle");
@@ -351,11 +370,28 @@ public class OOXMLTikaBodyPartHandler
 
     }
 
+    @Override
+    public void fieldCodeHyperlinkStart(String link) throws SAXException {
+        if (metadata != null) {
+            metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
+        }
+        hyperlinkStart(link);
+    }
+
     @Override
     public void externalRef(String fieldType, String url) throws SAXException {
         if (url == null || url.isEmpty()) {
             return;
         }
+        if (metadata != null) {
+            if ("hlinkHover".equals(fieldType)) {
+                metadata.set(Office.HAS_HOVER_HYPERLINKS, true);
+            } else if ("vml-shape-href".equals(fieldType)) {
+                metadata.set(Office.HAS_VML_HYPERLINKS, true);
+            } else {
+                metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
+            }
+        }
         AttributesImpl attr = new AttributesImpl();
         attr.addAttribute("", "class", "class", "CDATA", "external-ref-" + 
fieldType);
         attr.addAttribute("", "href", "href", "CDATA", url);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index 3569398a28..8e952b9a55 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -19,16 +19,12 @@ package org.apache.tika.parser.microsoft.ooxml;
 
 import java.util.Date;
 import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
 import org.apache.tika.utils.DateUtils;
 
 /**
@@ -90,6 +86,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private static final String VAL = "val";
     private static final String SLIDE = "sld";
     private static final String SHOW = "show";
+    private static final String TIMING = "timing"; // p:timing — slide 
animations
     private final static String MC_NS =
             "http://schemas.openxmlformats.org/markup-compatibility/2006";
     private final static String O_NS = 
"urn:schemas-microsoft-com:office:office";
@@ -112,6 +109,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private final static String MOVE_TO = "moveTo";
     private final static String ENDNOTE_REFERENCE = "endnoteReference";
     private static final String TEXTBOX = "textbox";
+    private static final String TXBX = "txbx"; // DrawingML text box (wps:txbx 
in mc:Choice)
     private final static String FLD_CHAR = "fldChar";
     private final static String INSTR_TEXT = "instrText";
     private final static String FLD_CHAR_TYPE = "fldCharType";
@@ -122,24 +120,14 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private final static String SHAPE = "shape";
     private final static String HREF = "href";
 
-    // Patterns for extracting URLs from field codes
-    private static final Pattern HYPERLINK_PATTERN =
-            Pattern.compile("HYPERLINK\\s{1,100}\"([^\"]{1,10000})\"", 
Pattern.CASE_INSENSITIVE);
-    private static final Pattern INCLUDEPICTURE_PATTERN =
-            Pattern.compile("INCLUDEPICTURE\\s{1,100}\"([^\"]{1,10000})\"", 
Pattern.CASE_INSENSITIVE);
-    private static final Pattern INCLUDETEXT_PATTERN =
-            Pattern.compile("INCLUDETEXT\\s{1,100}\"([^\"]{1,10000})\"", 
Pattern.CASE_INSENSITIVE);
-    private static final Pattern IMPORT_PATTERN =
-            Pattern.compile("IMPORT\\s{1,100}\"([^\"]{1,10000})\"", 
Pattern.CASE_INSENSITIVE);
-    private static final Pattern LINK_PATTERN =
-            
Pattern.compile("LINK\\s{1,100}[\\w.]{1,50}\\s{1,100}\"([^\"]{1,10000})\"", 
Pattern.CASE_INSENSITIVE);
     private final XWPFBodyContentsHandler bodyContentsHandler;
     private final Map<String, String> linkedRelationships;
+    private final OOXMLPictureTracker pictureTracker;
     private final RunProperties currRunProperties = new RunProperties();
     private final ParagraphProperties currPProperties = new 
ParagraphProperties();
     private final boolean includeTextBox;
     private final boolean concatenatePhoneticRuns;
-    private final Metadata metadata;
+    private final boolean preferACChoice;
     private final StringBuilder runBuffer = new StringBuilder();
     private final StringBuilder rubyBuffer = new StringBuilder();
     private boolean inR = false;
@@ -148,11 +136,6 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private boolean inRPr = false;
     private boolean inNumPr = false;
     private boolean inRt = false;
-    private boolean inPic = false;
-    private boolean inPict = false;
-    private String picDescription = null;
-    private String picRId = null;
-    private String picFilename = null;
     //mechanism used to determine when to
     //signal the start of the p, and still
     //handle p with pPr and those without
@@ -163,7 +146,9 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private boolean pStarted = false;
     //alternate content can be embedded in itself.
     //need to track depth.
-    //if in alternate, choose fallback, maybe make this configurable?
+    //preferACChoice controls which branch is processed:
+    //  true  -> process Choice, skip Fallback (richer content)
+    //  false -> process Fallback, skip Choice (legacy behavior)
     private int inACChoiceDepth = 0;
     private int inACFallbackDepth = 0;
     private boolean inDelText = false;
@@ -176,31 +161,34 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private boolean inInstrText = false;
     private boolean inFieldHyperlink = false;
     private final StringBuilder instrTextBuffer = new StringBuilder();
-    private OOXMLWordAndPowerPointTextHandler.EditType editType =
-            OOXMLWordAndPowerPointTextHandler.EditType.NONE;
+    private EditType editType =
+            EditType.NONE;
     private DateUtils dateUtils = new DateUtils();
 
     private boolean hiddenSlide = false;
+    private boolean hasAnimations = false;
 
     public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler 
bodyContentsHandler,
                                              Map<String, String> hyperlinks) {
-        this(bodyContentsHandler, hyperlinks, true, true, null);
+        this(bodyContentsHandler, hyperlinks, true, true, true);
     }
 
     public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler 
bodyContentsHandler,
                                              Map<String, String> hyperlinks, 
boolean includeTextBox,
                                              boolean concatenatePhoneticRuns) {
-        this(bodyContentsHandler, hyperlinks, includeTextBox, 
concatenatePhoneticRuns, null);
+        this(bodyContentsHandler, hyperlinks, includeTextBox, 
concatenatePhoneticRuns, true);
     }
 
     public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler 
bodyContentsHandler,
                                              Map<String, String> hyperlinks, 
boolean includeTextBox,
-                                             boolean concatenatePhoneticRuns, 
Metadata metadata) {
+                                             boolean concatenatePhoneticRuns,
+                                             boolean preferACChoice) {
         this.bodyContentsHandler = bodyContentsHandler;
         this.linkedRelationships = hyperlinks;
+        this.pictureTracker = new OOXMLPictureTracker(hyperlinks, 
bodyContentsHandler);
         this.includeTextBox = includeTextBox;
         this.concatenatePhoneticRuns = concatenatePhoneticRuns;
-        this.metadata = metadata;
+        this.preferACChoice = preferACChoice;
     }
 
     @Override
@@ -219,6 +207,18 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     public void endPrefixMapping(String prefix) throws SAXException {
     }
 
+    /**
+     * Returns true if content should be skipped due to AlternateContent 
handling.
+     * When preferACChoice is true, skip Fallback; when false, skip Choice.
+     */
+    private boolean inSkippedAlternateContent() {
+        if (preferACChoice) {
+            return inACFallbackDepth > 0;
+        } else {
+            return inACChoiceDepth > 0;
+        }
+    }
+
     @Override
     public void startElement(String uri, String localName, String qName, 
Attributes atts)
             throws SAXException {
@@ -238,11 +238,11 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             }
         }
 
-        if (inACChoiceDepth > 0) {
+        if (inSkippedAlternateContent()) {
             return;
         }
 
-        if (!includeTextBox && localName.equals(TEXTBOX)) {
+        if (!includeTextBox && (localName.equals(TEXTBOX) || 
localName.equals(TXBX))) {
             inTextBox = true;
             return;
         }
@@ -326,18 +326,18 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         } else if (TBL.equals(localName)) {
             bodyContentsHandler.startTable();
         } else if (BLIP.equals(localName)) { //check for DRAWING_NS
-            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
+            
pictureTracker.setBlipRId(atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed"));
         } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
-            picDescription = atts.getValue("", "descr");
+            pictureTracker.setDescription(atts.getValue("", "descr"));
         } else if (PIC.equals(localName)) {
-            inPic = true; //check for PIC_NS?
+            pictureTracker.startPic(); //check for PIC_NS?
         } //TODO: add sdt, sdtPr, sdtContent goes here statistically
         else if (FOOTNOTE_REFERENCE.equals(localName)) {
             String id = atts.getValue(W_NS, "id");
             bodyContentsHandler.footnoteReference(id);
         } else if (IMAGEDATA.equals(localName)) {
-            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
-            picDescription = atts.getValue(O_NS, "title");
+            
pictureTracker.setImageDataRId(atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id"));
+            pictureTracker.setImageDataDescription(atts.getValue(O_NS, 
"title"));
         } else if (INS.equals(localName)) {
             startEditedSection(editType.INSERT, atts);
         } else if (DEL_TEXT.equals(localName)) {
@@ -365,11 +365,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             if ("Embed".equals(type)) {
                 bodyContentsHandler.embeddedOLERef(refId);
             } else if ("Link".equals(type)) {
-                // Linked OLE object - references external file
                 bodyContentsHandler.linkedOLERef(refId);
-                if (metadata != null) {
-                    metadata.set(Office.HAS_LINKED_OLE_OBJECTS, true);
-                }
             }
         } else if (CR.equals(localName)) {
             runBuffer.append(NEWLINE);
@@ -385,6 +381,8 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             if ("0".equals(val) || "false".equals(val)) {
                 hiddenSlide = true;
             }
+        } else if (TIMING.equals(localName)) {
+            hasAnimations = true;
         } else if (FLD_CHAR.equals(localName)) {
             String fldCharType = atts.getValue(W_NS, FLD_CHAR_TYPE);
             if ("begin".equals(fldCharType)) {
@@ -392,22 +390,17 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
                 instrTextBuffer.setLength(0);
             } else if ("separate".equals(fldCharType)) {
                 // Parse instrText for HYPERLINK
-                String url = 
parseHyperlinkFromInstrText(instrTextBuffer.toString());
+                String url = 
FieldCodeParser.parseHyperlinkFromInstrText(instrTextBuffer.toString());
                 if (url != null) {
-                    bodyContentsHandler.hyperlinkStart(url);
+                    bodyContentsHandler.fieldCodeHyperlinkStart(url);
                     inFieldHyperlink = true;
-                    if (metadata != null) {
-                        metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
-                    }
                 } else {
                     // Check for external reference fields (INCLUDEPICTURE, 
INCLUDETEXT, etc.)
                     StringBuilder fieldType = new StringBuilder();
-                    String extUrl = 
parseExternalRefFromInstrText(instrTextBuffer.toString(), fieldType);
+                    String extUrl = 
FieldCodeParser.parseExternalRefFromInstrText(
+                            instrTextBuffer.toString(), fieldType);
                     if (extUrl != null) {
                         bodyContentsHandler.externalRef(fieldType.toString(), 
extUrl);
-                        if (metadata != null) {
-                            metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
-                        }
                     }
                 }
             } else if ("end".equals(fldCharType)) {
@@ -427,9 +420,6 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
                 String hyperlink = linkedRelationships.get(hyperlinkId);
                 if (hyperlink != null) {
                     bodyContentsHandler.externalRef("hlinkHover", hyperlink);
-                    if (metadata != null) {
-                        metadata.set(Office.HAS_HOVER_HYPERLINKS, true);
-                    }
                 }
             }
         } else if (SHAPE.equals(localName) && V_NS.equals(uri)) {
@@ -440,9 +430,6 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             }
             if (href != null && !href.isEmpty()) {
                 bodyContentsHandler.externalRef("vml-shape-href", href);
-                if (metadata != null) {
-                    metadata.set(Office.HAS_VML_HYPERLINKS, true);
-                }
             }
         }
 
@@ -479,65 +466,6 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         return -1;
     }
 
-    /**
-     * Parses a HYPERLINK URL from instrText field code content.
-     * Field codes like: HYPERLINK "https://example.com"
-     *
-     * @param instrText the accumulated instrText content
-     * @return the URL if found, or null
-     */
-    private String parseHyperlinkFromInstrText(String instrText) {
-        if (instrText == null || instrText.isEmpty()) {
-            return null;
-        }
-        Matcher m = HYPERLINK_PATTERN.matcher(instrText.trim());
-        if (m.find()) {
-            return m.group(1);
-        }
-        return null;
-    }
-
-    /**
-     * Parses URLs from instrText field codes that reference external 
resources.
-     * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, and LINK fields.
-     *
-     * @param instrText the accumulated instrText content
-     * @param fieldType output parameter - will contain the field type if found
-     * @return the URL if found, or null
-     */
-    private String parseExternalRefFromInstrText(String instrText, 
StringBuilder fieldType) {
-        if (instrText == null || instrText.isEmpty()) {
-            return null;
-        }
-        String trimmed = instrText.trim();
-
-        Matcher m = INCLUDEPICTURE_PATTERN.matcher(trimmed);
-        if (m.find()) {
-            fieldType.append("INCLUDEPICTURE");
-            return m.group(1);
-        }
-
-        m = INCLUDETEXT_PATTERN.matcher(trimmed);
-        if (m.find()) {
-            fieldType.append("INCLUDETEXT");
-            return m.group(1);
-        }
-
-        m = IMPORT_PATTERN.matcher(trimmed);
-        if (m.find()) {
-            fieldType.append("IMPORT");
-            return m.group(1);
-        }
-
-        m = LINK_PATTERN.matcher(trimmed);
-        if (m.find()) {
-            fieldType.append("LINK");
-            return m.group(1);
-        }
-
-        return null;
-    }
-
     @Override
     public void endElement(String uri, String localName, String qName) throws 
SAXException {
 
@@ -546,17 +474,16 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         } else if (FALLBACK.equals(localName)) {
             inACFallbackDepth--;
         }
-        if (inACChoiceDepth > 0) {
+        if (inSkippedAlternateContent()) {
             return;
         }
 
-        if (!includeTextBox && localName.equals(TEXTBOX)) {
+        if (!includeTextBox && (localName.equals(TEXTBOX) || 
localName.equals(TXBX))) {
             inTextBox = false;
             return;
         }
         if (PIC.equals(localName)) { //PIC_NS
-            handlePict();
-            inPic = false;
+            pictureTracker.endPicture();
             return;
         } else if (RPR.equals(localName)) {
             inRPr = false;
@@ -595,7 +522,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         } else if (HYPERLINK.equals(localName)) {
             bodyContentsHandler.hyperlinkEnd();
         } else if (PICT.equals(localName)) {
-            handlePict();
+            pictureTracker.endPicture();
         } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a 
chart
             inV = false;
             handleEndOfRun();
@@ -631,21 +558,10 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         currRunProperties.setUnderline(UnderlinePatterns.NONE.name());
     }
 
-    private void handlePict() throws SAXException {
-        String picFileName = null;
-        if (picRId != null) {
-            picFileName = linkedRelationships.get(picRId);
-        }
-        bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
-        picDescription = null;
-        picRId = null;
-        inPic = false;
-    }
-
     @Override
     public void characters(char[] ch, int start, int length) throws 
SAXException {
 
-        if (inACChoiceDepth > 0) {
+        if (inSkippedAlternateContent()) {
             return;
         } else if (!includeTextBox && inTextBox) {
             return;
@@ -670,7 +586,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
 
     @Override
     public void ignorableWhitespace(char[] ch, int start, int length) throws 
SAXException {
-        if (inACChoiceDepth > 0) {
+        if (inSkippedAlternateContent()) {
             return;
         } else if (!includeTextBox && inTextBox) {
             return;
@@ -691,81 +607,11 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         }
     }
 
-    public enum EditType {
-        NONE, INSERT, DELETE, MOVE_TO, MOVE_FROM
-    }
-
-    public interface XWPFBodyContentsHandler {
-
-        void run(RunProperties runProperties, String contents) throws 
SAXException;
-
-        /**
-         * @param link the link; can be null
-         */
-        void hyperlinkStart(String link) throws SAXException;
-
-        void hyperlinkEnd() throws SAXException;
-
-        void startParagraph(ParagraphProperties paragraphProperties) throws 
SAXException;
-
-        void endParagraph() throws SAXException;
-
-        void startTable() throws SAXException;
-
-        void endTable() throws SAXException;
-
-        void startTableRow() throws SAXException;
-
-        void endTableRow() throws SAXException;
-
-        void startTableCell() throws SAXException;
-
-        void endTableCell() throws SAXException;
-
-        void startSDT() throws SAXException;
-
-        void endSDT() throws SAXException;
-
-        void startEditedSection(String editor, Date date, EditType editType) 
throws SAXException;
-
-        void endEditedSection() throws SAXException;
-
-        boolean isIncludeDeletedText() throws SAXException;
-
-        void footnoteReference(String id) throws SAXException;
-
-        void endnoteReference(String id) throws SAXException;
-
-        boolean isIncludeMoveFromText() throws SAXException;
-
-        void embeddedOLERef(String refId) throws SAXException;
-
-        /**
-         * Called when a linked (vs embedded) OLE object is found.
-         * These reference external files and are a security concern.
-         */
-        void linkedOLERef(String refId) throws SAXException;
-
-        void embeddedPicRef(String picFileName, String picDescription) throws 
SAXException;
-
-        void startBookmark(String id, String name) throws SAXException;
-
-        void endBookmark(String id) throws SAXException;
-
-        /**
-         * Called when an external reference URL is found in a field code.
-         * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK fields,
-         * and DrawingML/VML hyperlinks on shapes.
-         *
-         * @param fieldType the type of field (e.g., "INCLUDEPICTURE", 
"hlinkHover", "vml-href")
-         * @param url the external URL
-         */
-        default void externalRef(String fieldType, String url) throws 
SAXException {
-            // Default no-op implementation for backward compatibility
-        }
-    }
-
     public boolean isHiddenSlide() {
         return hiddenSlide;
     }
+
+    public boolean hasAnimations() {
+        return hasAnimations;
+    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index 45f4657f75..e8c112241b 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -99,7 +99,7 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, 
IOException {
 
         loadCommentAuthors();
-
+        addCommentAuthorMetadata();
 
         PackageRelationshipCollection slidesPRC = null;
         try {
@@ -109,10 +109,12 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
                     ExceptionUtils.getStackTrace(e));
         }
 
+        int hiddenSlideCount = 0;
         if (slidesPRC != null && slidesPRC.size() > 0) {
             for (int i = 0; i < slidesPRC.size(); i++) {
                 try {
-                    
handleSlidePart(mainDocument.getRelatedPart(slidesPRC.getRelationship(i)),
+                    hiddenSlideCount += handleSlidePart(
+                            
mainDocument.getRelatedPart(slidesPRC.getRelationship(i)),
                             xhtml);
                 } catch (InvalidFormatException | ZipException e) {
                     
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
@@ -120,6 +122,9 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
                 }
             }
         }
+        if (hiddenSlideCount > 0) {
+            metadata.set(Office.NUM_HIDDEN_SLIDES, hiddenSlideCount);
+        }
 
         if (config.isIncludeSlideMasterContent()) {
             handleGeneralTextContainingPart(XSLFRelation.SLIDE_MASTER.getRelation(), "slide-master",
@@ -170,20 +175,35 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
 
     }
 
-    private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml)
+    private void addCommentAuthorMetadata() {
+        for (String name : commentAuthors.nameMap.values()) {
+            if (name != null && !name.isBlank()) {
+                metadata.add(Office.COMMENT_PERSONS, name);
+            }
+        }
+    }
+
+    /**
+     * @return 1 if the slide is hidden, 0 otherwise
+     */
+    private int handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml)
             throws IOException, SAXException {
         Map<String, String> linkedRelationships =
                 loadLinkedRelationships(slidePart, false, metadata);
 
-//        Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+        int hidden = 0;
         xhtml.startElement("div", "class", "slide-content");
         try (InputStream stream = slidePart.getInputStream()) {
             OOXMLWordAndPowerPointTextHandler wordAndPPTHandler = new OOXMLWordAndPowerPointTextHandler(
-                    new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships);
+                    new OOXMLTikaBodyPartHandler(xhtml, metadata), linkedRelationships);
             XMLReaderUtils.parseSAX(stream,
                     new EmbeddedContentHandler(wordAndPPTHandler), context);
             if (wordAndPPTHandler.isHiddenSlide()) {
                 metadata.set(Office.HAS_HIDDEN_SLIDES, true);
+                hidden = 1;
+            }
+            if (wordAndPPTHandler.hasAnimations()) {
+                metadata.set(Office.HAS_ANIMATIONS, true);
             }
         } catch (TikaException | IOException e) {
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
@@ -222,6 +242,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
         handleGeneralTextContainingPart(XSLFRelation.CHART.getRelation(), "chart", slidePart,
                 metadata, new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
                         linkedRelationships));
+        return hidden;
     }
 
     /**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 60eb91ec94..78eddc280f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -306,9 +306,11 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         try (InputStream stream = packagePart.getInputStream()) {
             XMLReaderUtils.parseSAX(stream,
                     new EmbeddedContentHandler(new OOXMLWordAndPowerPointTextHandler(
-                            new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, config),
+                            new OOXMLTikaBodyPartHandler(xhtml, styles, listManager,
+                                    config, metadata),
                             linkedRelationships, config.isIncludeShapeBasedContent(),
-                            config.isConcatenatePhoneticRuns(), metadata)), context);
+                            config.isConcatenatePhoneticRuns(),
+                            config.isPreferAlternateContentChoice())), context);
         } catch (TikaException | IOException e) {
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                     ExceptionUtils.getStackTrace(e));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index 2950e46be3..07860f13e8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -27,9 +27,10 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.xmlbeans.XmlException;
 
-import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
+import org.apache.tika.parser.microsoft.ooxml.EditType;
 import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
 import org.apache.tika.parser.microsoft.ooxml.RunProperties;
+import org.apache.tika.parser.microsoft.ooxml.XWPFBodyContentsHandler;
 
 public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor {
 
@@ -92,7 +93,7 @@ public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor {
     }
 
     private static class XSLFToTextContentHandler
-            implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
+            implements XWPFBodyContentsHandler {
         private final StringBuilder buffer;
 
         public XSLFToTextContentHandler(StringBuilder buffer) {
@@ -166,7 +167,7 @@ public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor {
 
         @Override
         public void startEditedSection(String editor, Date date,
-                                       OOXMLWordAndPowerPointTextHandler.EditType editType) {
+                                       EditType editType) {
 
         }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 8056f26bfb..7c5dc990fc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -44,9 +44,11 @@ import org.apache.tika.exception.RuntimeSAXException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.EditType;
 import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
 import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
 import org.apache.tika.parser.microsoft.ooxml.RunProperties;
+import org.apache.tika.parser.microsoft.ooxml.XWPFBodyContentsHandler;
 import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
 import org.apache.tika.utils.XMLReaderUtils;
 
@@ -254,7 +256,7 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor {
     }
 
     private static class XWPFToTextContentHandler
-            implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
+            implements XWPFBodyContentsHandler {
         private final StringBuilder buffer;
 
         public XWPFToTextContentHandler(StringBuilder buffer) {
@@ -328,7 +330,7 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor {
 
         @Override
         public void startEditedSection(String editor, Date date,
-                                       OOXMLWordAndPowerPointTextHandler.EditType editType) {
+                                       EditType editType) {
 
         }
 


Reply via email to