This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4fb2252ea8da832cef27e51913b4d8cd263f2590 Author: tallison <[email protected]> AuthorDate: Wed Mar 18 06:44:56 2026 -0400 improve sax ooxml - WIP --- .../tika/parser/microsoft/OfficeParserConfig.java | 28 +++ .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 42 +++- .../ooxml/OOXMLWordAndPowerPointTextHandler.java | 256 ++++----------------- .../ooxml/SXSLFPowerPointExtractorDecorator.java | 31 ++- .../ooxml/SXWPFWordExtractorDecorator.java | 6 +- .../xslf/XSLFEventBasedPowerPointExtractor.java | 7 +- .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 6 +- 7 files changed, 156 insertions(+), 220 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java index 7bba9647b1..db6d4e78e9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java @@ -35,6 +35,8 @@ public class OfficeParserConfig implements Serializable { private boolean useSAXDocxExtractor = false; private boolean useSAXPptxExtractor = false; + private boolean preferAlternateContentChoice = true; + private boolean writeSelectHeadersInBody = false; private boolean extractAllAlternativesFromMSG = false; @@ -167,6 +169,32 @@ public class OfficeParserConfig implements Serializable { this.useSAXPptxExtractor = useSAXPptxExtractor; } + /** + * In OOXML, {@code mc:AlternateContent} wraps {@code mc:Choice} (newer/richer + * rendering, e.g. DrawingML text boxes) and {@code mc:Fallback} (degraded VML + * for older consumers). When {@code true} (default), the SAX parser processes + * the Choice branch and skips Fallback. When {@code false}, it processes + * Fallback and skips Choice (legacy behavior prior to Tika 4.x). + * <p> + * For text extraction, Choice typically contains equal or more content than + * Fallback. + * <p> + * Default: {@code true} + * + * @return whether to prefer mc:Choice over mc:Fallback + */ + public boolean isPreferAlternateContentChoice() { + return preferAlternateContentChoice; + } + + /** + * @param preferAlternateContentChoice whether to prefer mc:Choice over mc:Fallback + * @see #isPreferAlternateContentChoice() + */ + public void setPreferAlternateContentChoice(boolean preferAlternateContentChoice) { + this.preferAlternateContentChoice = preferAlternateContentChoice; + } + public boolean isConcatenatePhoneticRuns() { return concatenatePhoneticRuns; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java index 4bc445fb5e..1595436ad2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java @@ -24,13 +24,15 @@ import org.apache.poi.xwpf.usermodel.UnderlinePatterns; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.WordExtractor; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim; import org.apache.tika.sax.XHTMLContentHandler; public class OOXMLTikaBodyPartHandler - implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler { + implements XWPFBodyContentsHandler { private static final String P = "p"; @@ -41,6 +43,7 @@ public class OOXMLTikaBodyPartHandler private final boolean includeDeletedText; private final boolean includeMoveFromText; private final XWPFStylesShim styles; + private final Metadata metadata; private int pDepth = 0; //paragraph depth private int tableDepth = 0;//table depth @@ -64,7 +67,12 @@ public class OOXMLTikaBodyPartHandler private String paragraphTag = null; public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml) { + this(xhtml, null); + } + + public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, Metadata metadata) { this.xhtml = xhtml; + this.metadata = metadata; this.styles = XWPFStylesShim.EMPTY_STYLES; this.listManager = XWPFListManager.EMPTY_LIST; this.includeDeletedText = false; @@ -72,8 +80,16 @@ public class OOXMLTikaBodyPartHandler } public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim styles, - XWPFListManager listManager, OfficeParserConfig parserConfig) { + XWPFListManager listManager, + OfficeParserConfig parserConfig) { + this(xhtml, styles, listManager, parserConfig, null); + } + + public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim styles, + XWPFListManager listManager, + OfficeParserConfig parserConfig, Metadata metadata) { this.xhtml = xhtml; + this.metadata = metadata; this.styles = styles; this.listManager = listManager; this.includeDeletedText = parserConfig.isIncludeDeletedContent(); @@ -272,7 +288,7 @@ public class OOXMLTikaBodyPartHandler @Override public void startEditedSection(String editor, Date date, - OOXMLWordAndPowerPointTextHandler.EditType editType) { + EditType editType) { //no-op } @@ -326,6 +342,9 @@ public class OOXMLTikaBodyPartHandler if (relId == null) { return; } + if (metadata != null) { + metadata.set(Office.HAS_LINKED_OLE_OBJECTS, true); + } // Emit as an external reference anchor - linked OLE objects reference external files AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "external-ref-linkedOle"); @@ -351,11 +370,28 @@ public class OOXMLTikaBodyPartHandler } + @Override + public void fieldCodeHyperlinkStart(String link) throws SAXException { + if (metadata != null) { + metadata.set(Office.HAS_FIELD_HYPERLINKS, true); + } + hyperlinkStart(link); + } + @Override public void externalRef(String fieldType, String url) throws SAXException { if (url == null || url.isEmpty()) { return; } + if (metadata != null) { + if ("hlinkHover".equals(fieldType)) { + metadata.set(Office.HAS_HOVER_HYPERLINKS, true); + } else if ("vml-shape-href".equals(fieldType)) { + metadata.set(Office.HAS_VML_HYPERLINKS, true); + } else { + metadata.set(Office.HAS_FIELD_HYPERLINKS, true); + } + } AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "class", "class", "CDATA", "external-ref-" + fieldType); attr.addAttribute("", "href", "href", "CDATA", url); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java index 3569398a28..8e952b9a55 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java @@ -19,16 +19,12 @@ package org.apache.tika.parser.microsoft.ooxml; import java.util.Date; import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.poi.xwpf.usermodel.UnderlinePatterns; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; import org.apache.tika.utils.DateUtils; /** @@ -90,6 +86,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private static final String VAL = "val"; private static final String SLIDE = "sld"; private static final String SHOW = "show"; + private static final String TIMING = "timing"; // p:timing — slide animations private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006"; private final static String O_NS = "urn:schemas-microsoft-com:office:office"; @@ -112,6 +109,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private final static String MOVE_TO = "moveTo"; private final static String ENDNOTE_REFERENCE = "endnoteReference"; private static final String TEXTBOX = "textbox"; + private static final String TXBX = "txbx"; // DrawingML text box (wps:txbx in mc:Choice) private final static String FLD_CHAR = "fldChar"; private final static String INSTR_TEXT = "instrText"; private final static String FLD_CHAR_TYPE = "fldCharType"; @@ -122,24 +120,14 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private final static String SHAPE = "shape"; private final static String HREF = "href"; - // Patterns for extracting URLs from field codes - private static final Pattern HYPERLINK_PATTERN = - Pattern.compile("HYPERLINK\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); - private static final Pattern INCLUDEPICTURE_PATTERN = - Pattern.compile("INCLUDEPICTURE\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); - private static final Pattern INCLUDETEXT_PATTERN = - Pattern.compile("INCLUDETEXT\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); - private static final Pattern IMPORT_PATTERN = - Pattern.compile("IMPORT\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); - private static final Pattern LINK_PATTERN = - Pattern.compile("LINK\\s{1,100}[\\w.]{1,50}\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); private final XWPFBodyContentsHandler bodyContentsHandler; private final Map<String, String> linkedRelationships; + private final OOXMLPictureTracker pictureTracker; private final RunProperties currRunProperties = new RunProperties(); private final ParagraphProperties currPProperties = new ParagraphProperties(); private final boolean includeTextBox; private final boolean concatenatePhoneticRuns; - private final Metadata metadata; + private final boolean preferACChoice; private final StringBuilder runBuffer = new StringBuilder(); private final StringBuilder rubyBuffer = new StringBuilder(); private boolean inR = false; @@ -148,11 +136,6 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private boolean inRPr = false; private boolean inNumPr = false; private boolean inRt = false; - private boolean inPic = false; - private boolean inPict = false; - private String picDescription = null; - private String picRId = null; - private String picFilename = null; //mechanism used to determine when to //signal the start of the p, and still //handle p with pPr and those without @@ -163,7 +146,9 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private boolean pStarted = false; //alternate content can be embedded in itself. //need to track depth. - //if in alternate, choose fallback, maybe make this configurable? + //preferACChoice controls which branch is processed: + // true -> process Choice, skip Fallback (richer content) + // false -> process Fallback, skip Choice (legacy behavior) private int inACChoiceDepth = 0; private int inACFallbackDepth = 0; private boolean inDelText = false; @@ -176,31 +161,34 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private boolean inInstrText = false; private boolean inFieldHyperlink = false; private final StringBuilder instrTextBuffer = new StringBuilder(); - private OOXMLWordAndPowerPointTextHandler.EditType editType = - OOXMLWordAndPowerPointTextHandler.EditType.NONE; + private EditType editType = + EditType.NONE; private DateUtils dateUtils = new DateUtils(); private boolean hiddenSlide = false; + private boolean hasAnimations = false; public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler, Map<String, String> hyperlinks) { - this(bodyContentsHandler, hyperlinks, true, true, null); + this(bodyContentsHandler, hyperlinks, true, true, true); } public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler, Map<String, String> hyperlinks, boolean includeTextBox, boolean concatenatePhoneticRuns) { - this(bodyContentsHandler, hyperlinks, includeTextBox, concatenatePhoneticRuns, null); + this(bodyContentsHandler, hyperlinks, includeTextBox, concatenatePhoneticRuns, true); } public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler, Map<String, String> hyperlinks, boolean includeTextBox, - boolean concatenatePhoneticRuns, Metadata metadata) { + boolean concatenatePhoneticRuns, + boolean preferACChoice) { this.bodyContentsHandler = bodyContentsHandler; this.linkedRelationships = hyperlinks; + this.pictureTracker = new OOXMLPictureTracker(hyperlinks, bodyContentsHandler); this.includeTextBox = includeTextBox; this.concatenatePhoneticRuns = concatenatePhoneticRuns; - this.metadata = metadata; + this.preferACChoice = preferACChoice; } @Override @@ -219,6 +207,18 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { public void endPrefixMapping(String prefix) throws SAXException { } + /** + * Returns true if content should be skipped due to AlternateContent handling. + * When preferACChoice is true, skip Fallback; when false, skip Choice. + */ + private boolean inSkippedAlternateContent() { + if (preferACChoice) { + return inACFallbackDepth > 0; + } else { + return inACChoiceDepth > 0; + } + } + @Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { @@ -238,11 +238,11 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } } - if (inACChoiceDepth > 0) { + if (inSkippedAlternateContent()) { return; } - if (!includeTextBox && localName.equals(TEXTBOX)) { + if (!includeTextBox && (localName.equals(TEXTBOX) || localName.equals(TXBX))) { inTextBox = true; return; } @@ -326,18 +326,18 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } else if (TBL.equals(localName)) { bodyContentsHandler.startTable(); } else if (BLIP.equals(localName)) { //check for DRAWING_NS - picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed"); + pictureTracker.setBlipRId(atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed")); } else if ("cNvPr".equals(localName)) { //check for PIC_NS? - picDescription = atts.getValue("", "descr"); + pictureTracker.setDescription(atts.getValue("", "descr")); } else if (PIC.equals(localName)) { - inPic = true; //check for PIC_NS? + pictureTracker.startPic(); //check for PIC_NS? } //TODO: add sdt, sdtPr, sdtContent goes here statistically else if (FOOTNOTE_REFERENCE.equals(localName)) { String id = atts.getValue(W_NS, "id"); bodyContentsHandler.footnoteReference(id); } else if (IMAGEDATA.equals(localName)) { - picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id"); - picDescription = atts.getValue(O_NS, "title"); + pictureTracker.setImageDataRId(atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id")); + pictureTracker.setImageDataDescription(atts.getValue(O_NS, "title")); } else if (INS.equals(localName)) { startEditedSection(editType.INSERT, atts); } else if (DEL_TEXT.equals(localName)) { @@ -365,11 +365,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { if ("Embed".equals(type)) { bodyContentsHandler.embeddedOLERef(refId); } else if ("Link".equals(type)) { - // Linked OLE object - references external file bodyContentsHandler.linkedOLERef(refId); - if (metadata != null) { - metadata.set(Office.HAS_LINKED_OLE_OBJECTS, true); - } } } else if (CR.equals(localName)) { runBuffer.append(NEWLINE); @@ -385,6 +381,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { if ("0".equals(val) || "false".equals(val)) { hiddenSlide = true; } + } else if (TIMING.equals(localName)) { + hasAnimations = true; } else if (FLD_CHAR.equals(localName)) { String fldCharType = atts.getValue(W_NS, FLD_CHAR_TYPE); if ("begin".equals(fldCharType)) { @@ -392,22 +390,17 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { instrTextBuffer.setLength(0); } else if ("separate".equals(fldCharType)) { // Parse instrText for HYPERLINK - String url = parseHyperlinkFromInstrText(instrTextBuffer.toString()); + String url = FieldCodeParser.parseHyperlinkFromInstrText(instrTextBuffer.toString()); if (url != null) { - bodyContentsHandler.hyperlinkStart(url); + bodyContentsHandler.fieldCodeHyperlinkStart(url); inFieldHyperlink = true; - if (metadata != null) { - metadata.set(Office.HAS_FIELD_HYPERLINKS, true); - } } else { // Check for external reference fields (INCLUDEPICTURE, INCLUDETEXT, etc.) StringBuilder fieldType = new StringBuilder(); - String extUrl = parseExternalRefFromInstrText(instrTextBuffer.toString(), fieldType); + String extUrl = FieldCodeParser.parseExternalRefFromInstrText( + instrTextBuffer.toString(), fieldType); if (extUrl != null) { bodyContentsHandler.externalRef(fieldType.toString(), extUrl); - if (metadata != null) { - metadata.set(Office.HAS_FIELD_HYPERLINKS, true); - } } } } else if ("end".equals(fldCharType)) { @@ -427,9 +420,6 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { String hyperlink = linkedRelationships.get(hyperlinkId); if (hyperlink != null) { bodyContentsHandler.externalRef("hlinkHover", hyperlink); - if (metadata != null) { - metadata.set(Office.HAS_HOVER_HYPERLINKS, true); - } } } } else if (SHAPE.equals(localName) && V_NS.equals(uri)) { @@ -440,9 +430,6 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } if (href != null && !href.isEmpty()) { bodyContentsHandler.externalRef("vml-shape-href", href); - if (metadata != null) { - metadata.set(Office.HAS_VML_HYPERLINKS, true); - } } } @@ -479,65 +466,6 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { return -1; } - /** - * Parses a HYPERLINK URL from instrText field code content. - * Field codes like: HYPERLINK "https://example.com" - * - * @param instrText the accumulated instrText content - * @return the URL if found, or null - */ - private String parseHyperlinkFromInstrText(String instrText) { - if (instrText == null || instrText.isEmpty()) { - return null; - } - Matcher m = HYPERLINK_PATTERN.matcher(instrText.trim()); - if (m.find()) { - return m.group(1); - } - return null; - } - - /** - * Parses URLs from instrText field codes that reference external resources. - * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, and LINK fields. - * - * @param instrText the accumulated instrText content - * @param fieldType output parameter - will contain the field type if found - * @return the URL if found, or null - */ - private String parseExternalRefFromInstrText(String instrText, StringBuilder fieldType) { - if (instrText == null || instrText.isEmpty()) { - return null; - } - String trimmed = instrText.trim(); - - Matcher m = INCLUDEPICTURE_PATTERN.matcher(trimmed); - if (m.find()) { - fieldType.append("INCLUDEPICTURE"); - return m.group(1); - } - - m = INCLUDETEXT_PATTERN.matcher(trimmed); - if (m.find()) { - fieldType.append("INCLUDETEXT"); - return m.group(1); - } - - m = IMPORT_PATTERN.matcher(trimmed); - if (m.find()) { - fieldType.append("IMPORT"); - return m.group(1); - } - - m = LINK_PATTERN.matcher(trimmed); - if (m.find()) { - fieldType.append("LINK"); - return m.group(1); - } - - return null; - } - @Override public void endElement(String uri, String localName, String qName) throws SAXException { @@ -546,17 +474,16 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } else if (FALLBACK.equals(localName)) { inACFallbackDepth--; } - if (inACChoiceDepth > 0) { + if (inSkippedAlternateContent()) { return; } - if (!includeTextBox && localName.equals(TEXTBOX)) { + if (!includeTextBox && (localName.equals(TEXTBOX) || localName.equals(TXBX))) { inTextBox = false; return; } if (PIC.equals(localName)) { //PIC_NS - handlePict(); - inPic = false; + pictureTracker.endPicture(); return; } else if (RPR.equals(localName)) { inRPr = false; @@ -595,7 +522,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } else if (HYPERLINK.equals(localName)) { bodyContentsHandler.hyperlinkEnd(); } else if (PICT.equals(localName)) { - handlePict(); + pictureTracker.endPicture(); } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart inV = false; handleEndOfRun(); @@ -631,21 +558,10 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { currRunProperties.setUnderline(UnderlinePatterns.NONE.name()); } - private void handlePict() throws SAXException { - String picFileName = null; - if (picRId != null) { - picFileName = linkedRelationships.get(picRId); - } - bodyContentsHandler.embeddedPicRef(picFileName, picDescription); - picDescription = null; - picRId = null; - inPic = false; - } - @Override public void characters(char[] ch, int start, int length) throws SAXException { - if (inACChoiceDepth > 0) { + if (inSkippedAlternateContent()) { return; } else if (!includeTextBox && inTextBox) { return; @@ -670,7 +586,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { - if (inACChoiceDepth > 0) { + if (inSkippedAlternateContent()) { return; } else if (!includeTextBox && inTextBox) { return; @@ -691,81 +607,11 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } } - public enum EditType { - NONE, INSERT, DELETE, MOVE_TO, MOVE_FROM - } - - public interface XWPFBodyContentsHandler { - - void run(RunProperties runProperties, String contents) throws SAXException; - - /** - * @param link the link; can be null - */ - void hyperlinkStart(String link) throws SAXException; - - void hyperlinkEnd() throws SAXException; - - void startParagraph(ParagraphProperties paragraphProperties) throws SAXException; - - void endParagraph() throws SAXException; - - void startTable() throws SAXException; - - void endTable() throws SAXException; - - void startTableRow() throws SAXException; - - void endTableRow() throws SAXException; - - void startTableCell() throws SAXException; - - void endTableCell() throws SAXException; - - void startSDT() throws SAXException; - - void endSDT() throws SAXException; - - void startEditedSection(String editor, Date date, EditType editType) throws SAXException; - - void endEditedSection() throws SAXException; - - boolean isIncludeDeletedText() throws SAXException; - - void footnoteReference(String id) throws SAXException; - - void endnoteReference(String id) throws SAXException; - - boolean isIncludeMoveFromText() throws SAXException; - - void embeddedOLERef(String refId) throws SAXException; - - /** - * Called when a linked (vs embedded) OLE object is found. - * These reference external files and are a security concern. - */ - void linkedOLERef(String refId) throws SAXException; - - void embeddedPicRef(String picFileName, String picDescription) throws SAXException; - - void startBookmark(String id, String name) throws SAXException; - - void endBookmark(String id) throws SAXException; - - /** - * Called when an external reference URL is found in a field code. - * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK fields, - * and DrawingML/VML hyperlinks on shapes. - * - * @param fieldType the type of field (e.g., "INCLUDEPICTURE", "hlinkHover", "vml-href") - * @param url the external URL - */ - default void externalRef(String fieldType, String url) throws SAXException { - // Default no-op implementation for backward compatibility - } - } - public boolean isHiddenSlide() { return hiddenSlide; } + + public boolean hasAnimations() { + return hasAnimations; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java index 45f4657f75..e8c112241b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java @@ -99,7 +99,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException { loadCommentAuthors(); - + addCommentAuthorMetadata(); PackageRelationshipCollection slidesPRC = null; try { @@ -109,10 +109,12 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { ExceptionUtils.getStackTrace(e)); } + int hiddenSlideCount = 0; if (slidesPRC != null && slidesPRC.size() > 0) { for (int i = 0; i < slidesPRC.size(); i++) { try { - handleSlidePart(mainDocument.getRelatedPart(slidesPRC.getRelationship(i)), + hiddenSlideCount += handleSlidePart( + mainDocument.getRelatedPart(slidesPRC.getRelationship(i)), xhtml); } catch (InvalidFormatException | ZipException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, @@ -120,6 +122,9 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { } } } + if (hiddenSlideCount > 0) { + metadata.set(Office.NUM_HIDDEN_SLIDES, hiddenSlideCount); + } if (config.isIncludeSlideMasterContent()) { handleGeneralTextContainingPart(XSLFRelation.SLIDE_MASTER.getRelation(), "slide-master", @@ -170,20 +175,35 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { } - private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) + private void addCommentAuthorMetadata() { + for (String name : commentAuthors.nameMap.values()) { + if (name != null && !name.isBlank()) { + metadata.add(Office.COMMENT_PERSONS, name); + } + } + } + + /** + * @return 1 if the slide is hidden, 0 otherwise + */ + private int handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) throws IOException, SAXException { Map<String, String> linkedRelationships = loadLinkedRelationships(slidePart, false, metadata); -// Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart); + int hidden = 0; xhtml.startElement("div", "class", "slide-content"); try (InputStream stream = slidePart.getInputStream()) { OOXMLWordAndPowerPointTextHandler wordAndPPTHandler = new OOXMLWordAndPowerPointTextHandler( - new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships); + new OOXMLTikaBodyPartHandler(xhtml, metadata), linkedRelationships); XMLReaderUtils.parseSAX(stream, new EmbeddedContentHandler(wordAndPPTHandler), context); if (wordAndPPTHandler.isHiddenSlide()) { metadata.set(Office.HAS_HIDDEN_SLIDES, true); + hidden = 1; + } + if (wordAndPPTHandler.hasAnimations()) { + metadata.set(Office.HAS_ANIMATIONS, true); } } catch (TikaException | IOException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, @@ -222,6 +242,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { handleGeneralTextContainingPart(XSLFRelation.CHART.getRelation(), "chart", slidePart, metadata, new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships)); + return hidden; } /** diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index 60eb91ec94..78eddc280f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -306,9 +306,11 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { try (InputStream stream = packagePart.getInputStream()) { XMLReaderUtils.parseSAX(stream, new EmbeddedContentHandler(new OOXMLWordAndPowerPointTextHandler( - new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, config), + new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, + config, metadata), linkedRelationships, config.isIncludeShapeBasedContent(), - config.isConcatenatePhoneticRuns(), metadata)), context); + config.isConcatenatePhoneticRuns(), + config.isPreferAlternateContentChoice())), context); } catch (TikaException | IOException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java index 2950e46be3..07860f13e8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java @@ -27,9 +27,10 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.xmlbeans.XmlException; -import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler; +import org.apache.tika.parser.microsoft.ooxml.EditType; import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties; import org.apache.tika.parser.microsoft.ooxml.RunProperties; +import org.apache.tika.parser.microsoft.ooxml.XWPFBodyContentsHandler; public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor { @@ -92,7 +93,7 @@ public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor { } private static class XSLFToTextContentHandler - implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler { + implements XWPFBodyContentsHandler { private final StringBuilder buffer; public XSLFToTextContentHandler(StringBuilder buffer) { @@ -166,7 +167,7 @@ public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor { @Override public void startEditedSection(String editor, Date date, - OOXMLWordAndPowerPointTextHandler.EditType editType) { + EditType editType) { } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java index 8056f26bfb..7c5dc990fc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java @@ -44,9 +44,11 @@ import org.apache.tika.exception.RuntimeSAXException; import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.ooxml.EditType; import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler; import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties; import org.apache.tika.parser.microsoft.ooxml.RunProperties; +import org.apache.tika.parser.microsoft.ooxml.XWPFBodyContentsHandler; import org.apache.tika.parser.microsoft.ooxml.XWPFListManager; import org.apache.tika.utils.XMLReaderUtils; @@ -254,7 +256,7 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { } private static class XWPFToTextContentHandler - implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler { + implements XWPFBodyContentsHandler { private final StringBuilder buffer; public XWPFToTextContentHandler(StringBuilder buffer) { @@ -328,7 +330,7 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { @Override public void startEditedSection(String editor, Date date, - OOXMLWordAndPowerPointTextHandler.EditType editType) { + EditType editType) { }
