This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit 2c878feaa164d92fc171531e8258b74be1784c80 Author: tallison <[email protected]> AuthorDate: Fri Apr 3 16:30:43 2026 -0400 further improvements --- .../org/apache/tika/sax/XHTMLContentHandler.java | 40 +++++ .../microsoft/ooxml/AbstractOOXMLExtractor.java | 26 ++- .../microsoft/ooxml/FormattingTagManager.java | 90 +++++++++- .../parser/microsoft/ooxml/InlineTagManager.java | 196 --------------------- .../microsoft/ooxml/OOXMLPartContentCollector.java | 22 ++- .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 14 +- .../ooxml/OOXMLWordAndPowerPointTextHandler.java | 33 +++- .../tika/parser/microsoft/ooxml/RunProperties.java | 11 ++ .../ooxml/SXSLFPowerPointExtractorDecorator.java | 18 +- .../ooxml/SXWPFWordExtractorDecorator.java | 34 ++-- .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 13 +- .../parser/microsoft/ooxml/OOXMLDocxSAXTest.java | 29 +++ .../parser/microsoft/ooxml/OOXMLPptxSAXTest.java | 39 +++- 13 files changed, 314 insertions(+), 251 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java index 3fd7766d03..6f9af40421 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java @@ -16,8 +16,10 @@ */ package org.apache.tika.sax; +import java.util.ArrayDeque; import java.util.Arrays; import java.util.Collections; +import java.util.Deque; import java.util.HashSet; import java.util.Set; @@ -94,6 +96,15 @@ public class XHTMLContentHandler extends SafeContentHandler { private boolean headStarted = false; private boolean headEnded = false; private boolean useFrameset = false; + + /** + * When true, tracks a stack of opened element names and throws + * a RuntimeException on mismatched endElement calls. This is a + * debugging aid for finding unbalanced SAX events in parsers. + * Enable via {@link #setStrictTagBalanceChecking(boolean)}. + */ + private static boolean strictTagBalanceChecking = false; + private final Deque<String> tagStack = new ArrayDeque<>(); public XHTMLContentHandler(ContentHandler handler, Metadata metadata) { this(handler, metadata, null); } @@ -124,6 +135,17 @@ public class XHTMLContentHandler extends SafeContentHandler { } } + /** + * Enables or disables strict tag balance checking. When enabled, + * every startElement pushes onto a stack and every endElement + * verifies the tag matches, throwing a RuntimeException with the + * full stack trace on mismatch. This is a debugging tool, not for + * production use. + */ + public static void setStrictTagBalanceChecking(boolean strict) { + strictTagBalanceChecking = strict; + } + private static Set<String> unmodifiableSet(String... elements) { return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(elements))); } @@ -282,6 +304,9 @@ public class XHTMLContentHandler extends SafeContentHandler { } super.startElement(uri, local, name, attributes); + if (strictTagBalanceChecking) { + tagStack.push(name); + } } } @@ -292,6 +317,21 @@ public class XHTMLContentHandler extends SafeContentHandler { @Override public void endElement(String uri, String local, String name) throws SAXException { if (!AUTO.contains(name)) { + if (strictTagBalanceChecking) { + if (tagStack.isEmpty()) { + throw new RuntimeException( + "STRICT TAG CHECK: endElement('" + name + + "') but tag stack is empty! No matching startElement."); + } + String expected = tagStack.peek(); + if (!name.equals(expected)) { + throw new RuntimeException( + "STRICT TAG CHECK: endElement('" + name + + "') but expected '" + expected + + "'. Tag stack (top to bottom): " + tagStack); + } + tagStack.pop(); + } super.endElement(uri, local, name); if (XHTML.equals(uri) && ENDLINE.contains(name)) { newline(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 70d5920800..3074a802b8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -624,6 +624,27 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { * @param parentPart * @param contentHandler */ + /** + * Safely resolves a related part, returning null if the part cannot be found + * instead of throwing {@link IllegalArgumentException}. + */ + public static PackagePart safeGetRelatedPart(PackagePart source, + PackageRelationship relationship) + throws InvalidFormatException { + if (source == null || relationship == null) { + return null; + } + if (!source.isRelationshipExists(relationship)) { + return null; + } + try { + return source.getRelatedPart(relationship); + } catch (IllegalArgumentException e) { + // Relationship exists but target part is missing from the package + return null; + } + } + void handleGeneralTextContainingPart(String contentType, String xhtmlClassLabel, PackagePart parentPart, Metadata parentMetadata, ContentHandler contentHandler) throws SAXException { @@ -647,7 +668,10 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { relatedPartPRC.getRelationship(i); try { PackagePart relatedPartPart = - parentPart.getRelatedPart(relatedPartPackageRelationship); + safeGetRelatedPart(parentPart, relatedPartPackageRelationship); + if (relatedPartPart == null) { + continue; + } try (InputStream stream = relatedPartPart.getInputStream()) { XMLReaderUtils.parseSAX(stream, new EmbeddedContentHandler(contentHandler), context); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java index db88eedbae..0545cd0037 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java @@ -16,24 +16,37 @@ */ package org.apache.tika.parser.microsoft.ooxml; +import java.util.Objects; + import org.apache.poi.xwpf.usermodel.UnderlinePatterns; import org.xml.sax.SAXException; import org.apache.tika.sax.XHTMLContentHandler; /** - * Manages XHTML formatting tags (b, i, u, s) as a state machine, - * ensuring proper nesting. Tags are always ordered from outermost to innermost: - * {@code <b><i><s><u>text</u></s></i></b>}. + * Single owner of all run-scoped XHTML wrapper tags, ensuring proper nesting. + * Nesting order from outermost to innermost: + * {@code <a href="..."><b><i><s><u>text</u></s></i></b></a>}. * <p> - * When a formatting change occurs, all tags that are "inside" the changing tag - * must be closed first, then the change applied, then inner tags reopened. - * This avoids generating malformed XHTML with overlapping tags. + * Hyperlinks come from two OOXML sources with different lifecycles: + * <ul> + * <li><b>Wrapper hyperlinks</b> (DOCX {@code <w:hyperlink>}, field-code HYPERLINK): + * opened/closed explicitly via {@link #openHyperlink}/{@link #closeHyperlink}, + * span multiple runs.</li> + * <li><b>Run-property hyperlinks</b> (PPTX {@code <a:hlinkClick>}): + * set on {@link RunProperties#setHlinkClickUrl}, managed automatically + * by {@link #applyFormatting} per-run.</li> + * </ul> + * Both emit the same {@code <a href="...">} XHTML. Wrapper hyperlinks take + * precedence — run properties cannot override an active wrapper. */ class FormattingTagManager { private final XHTMLContentHandler xhtml; + // Outermost to innermost: hyperlink > bold > italic > strike > underline + private String currentHyperlink = null; + private boolean wrapperHyperlinkActive = false; private boolean isBold = false; private boolean isItalics = false; private boolean isStrikeThrough = false; @@ -43,13 +56,64 @@ class FormattingTagManager { this.xhtml = xhtml; } + /** + * Opens a wrapper-style hyperlink (DOCX {@code <w:hyperlink>} or field-code). + * Closes any open formatting tags first to maintain nesting. + * No-op if url is null. + */ + void openHyperlink(String url) throws SAXException { + if (url == null) { + return; + } + closeFormattingTags(); + if (currentHyperlink != null) { + xhtml.endElement("a"); + } + xhtml.startElement("a", "href", url); + currentHyperlink = url; + wrapperHyperlinkActive = true; + } + + /** + * Closes the active wrapper-style hyperlink. No-op if none was opened. + */ + void closeHyperlink() throws SAXException { + if (currentHyperlink != null && wrapperHyperlinkActive) { + closeFormattingTags(); + xhtml.endElement("a"); + currentHyperlink = null; + wrapperHyperlinkActive = false; + } + } + + /** + * Returns true if any hyperlink (wrapper or run-property) is currently open. + */ + boolean isHyperlinkActive() { + return currentHyperlink != null; + } + /** * Reconciles the current formatting state with the given run properties, * opening and closing XHTML tags as needed to maintain proper nesting. */ void applyFormatting(RunProperties runProperties) throws SAXException { + // Run-property hyperlinks only when no wrapper is active + if (!wrapperHyperlinkActive) { + String newHyperlink = runProperties.getHlinkClickUrl(); + if (!Objects.equals(newHyperlink, currentHyperlink)) { + closeFormattingTags(); + if (currentHyperlink != null) { + xhtml.endElement("a"); + } + if (newHyperlink != null) { + xhtml.startElement("a", "href", newHyperlink); + } + currentHyperlink = newHyperlink; + } + } + if (runProperties.isBold() != isBold) { - // Bold is outermost — close everything inside it if (isStrikeThrough) { xhtml.endElement("s"); isStrikeThrough = false; @@ -112,10 +176,18 @@ class FormattingTagManager { } /** - * Closes all currently open formatting tags in proper nesting order - * (innermost first: u, s, i, b). + * Closes all currently open tags in proper nesting order. */ void closeAll() throws SAXException { + closeFormattingTags(); + if (currentHyperlink != null) { + xhtml.endElement("a"); + currentHyperlink = null; + wrapperHyperlinkActive = false; + } + } + + private void closeFormattingTags() throws SAXException { if (isUnderline) { xhtml.endElement("u"); isUnderline = false; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java deleted file mode 100644 index 45eee33b57..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.microsoft.ooxml; - -import org.apache.poi.xwpf.usermodel.UnderlinePatterns; -import org.xml.sax.SAXException; - -import org.apache.tika.sax.XHTMLContentHandler; - -/** - * Manages all inline XHTML elements (hyperlinks and formatting tags) as a - * unified state machine, ensuring proper nesting. The nesting order from - * outermost to innermost is: - * <pre> - * {@code <a> <b> <i> <s> <u> text </u> </s> </i> </b> </a>} - * </pre> - * <p> - * When a change occurs to an outer element, all inner elements are closed - * first, the change is applied, then inner elements are reopened as needed. - * This prevents generating malformed XHTML with overlapping or unbalanced tags. - * <p> - * This class replaces the separate {@code FormattingTagManager} and the - * {@code wroteHyperlinkStart} boolean that were previously tracked independently - * in {@link OOXMLTikaBodyPartHandler}. - */ -class InlineTagManager { - - private final XHTMLContentHandler xhtml; - - private boolean hyperlinkOpen = false; - private boolean isBold = false; - private boolean isItalics = false; - private boolean isStrikeThrough = false; - private boolean isUnderline = false; - - InlineTagManager(XHTMLContentHandler xhtml) { - this.xhtml = xhtml; - } - - /** - * Opens a hyperlink. Since {@code <a>} is the outermost inline element, - * any existing inline elements (including a prior hyperlink) are closed - * first. - * - * @param href the link target; if {@code null} this is a no-op - */ - void openHyperlink(String href) throws SAXException { - if (href == null) { - return; - } - // Close everything — formatting then any existing hyperlink - closeAll(); - xhtml.startElement("a", "href", href); - hyperlinkOpen = true; - } - - /** - * Closes the current hyperlink and all formatting inside it. - * No-op if no hyperlink is open. - */ - void closeHyperlink() throws SAXException { - if (!hyperlinkOpen) { - return; - } - closeFormatting(); - xhtml.endElement("a"); - hyperlinkOpen = false; - } - - /** - * Returns {@code true} if a hyperlink is currently open. - */ - boolean isHyperlinkOpen() { - return hyperlinkOpen; - } - - /** - * Reconciles the current formatting state with the given run properties, - * opening and closing XHTML tags as needed to maintain proper nesting. - * The nesting order for formatting is: {@code <b> <i> <s> <u>}. - */ - void applyFormatting(RunProperties runProperties) throws SAXException { - if (runProperties.isBold() != isBold) { - // Bold is outermost formatting — close everything inside it - if (isUnderline) { - xhtml.endElement("u"); - isUnderline = false; - } - if (isStrikeThrough) { - xhtml.endElement("s"); - isStrikeThrough = false; - } - if (isItalics) { - xhtml.endElement("i"); - isItalics = false; - } - if (runProperties.isBold()) { - xhtml.startElement("b"); - } else { - xhtml.endElement("b"); - } - isBold = runProperties.isBold(); - } - - if (runProperties.isItalics() != isItalics) { - if (isUnderline) { - xhtml.endElement("u"); - isUnderline = false; - } - if (isStrikeThrough) { - xhtml.endElement("s"); - isStrikeThrough = false; - } - if (runProperties.isItalics()) { - xhtml.startElement("i"); - } else { - xhtml.endElement("i"); - } - isItalics = runProperties.isItalics(); - } - - if (runProperties.isStrikeThrough() != isStrikeThrough) { - if (isUnderline) { - xhtml.endElement("u"); - isUnderline = false; - } - if (runProperties.isStrikeThrough()) { - xhtml.startElement("s"); - } else { - xhtml.endElement("s"); - } - isStrikeThrough = runProperties.isStrikeThrough(); - } - - boolean runIsUnderlined = runProperties.getUnderline() != UnderlinePatterns.NONE; - if (runIsUnderlined != isUnderline) { - if (runIsUnderlined) { - xhtml.startElement("u"); - } else { - xhtml.endElement("u"); - } - isUnderline = runIsUnderlined; - } - } - - /** - * Closes all currently open formatting tags in proper nesting order - * (innermost first: u, s, i, b). Does NOT close the hyperlink. - */ - void closeFormatting() throws SAXException { - if (isUnderline) { - xhtml.endElement("u"); - isUnderline = false; - } - if (isStrikeThrough) { - xhtml.endElement("s"); - isStrikeThrough = false; - } - if (isItalics) { - xhtml.endElement("i"); - isItalics = false; - } - if (isBold) { - xhtml.endElement("b"); - isBold = false; - } - } - - /** - * Closes ALL open inline elements — formatting first, then hyperlink. - * This is the primary safety mechanism: call at every structural boundary - * (end of paragraph, table cell, table row, table, etc.) to guarantee - * well-formed XHTML. - */ - void closeAll() throws SAXException { - closeFormatting(); - if (hyperlinkOpen) { - xhtml.endElement("a"); - hyperlinkOpen = false; - } - } -} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java index 48dcc692d5..6cece158e8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java @@ -96,7 +96,9 @@ class OOXMLPartContentCollector extends DefaultHandler { if (id != null && !skipIds.contains(id)) { currentId = id; buffer = new ByteArrayOutputStream(); - writeString(buildWrapperOpenTag()); + // Don't write wrapper open tag yet — inline xmlns declarations + // (e.g., xmlns:a on nested elements) haven't been captured via + // startPrefixMapping. Defer to endElement when all are known. depth = 0; } } @@ -110,8 +112,16 @@ class OOXMLPartContentCollector extends DefaultHandler { } if (depth == 0) { - writeString("</w:body>"); - contentMap.put(currentId, buffer.toByteArray()); + // Build the wrapper now — all startPrefixMapping calls from nested + // elements have been captured, so inline xmlns declarations are included. + byte[] wrapperOpen = buildWrapperOpenTag().getBytes(StandardCharsets.UTF_8); + byte[] content = buffer.toByteArray(); + ByteArrayOutputStream combined = + new ByteArrayOutputStream(wrapperOpen.length + content.length + 16); + combined.write(wrapperOpen, 0, wrapperOpen.length); + combined.write(content, 0, content.length); + writeString(combined, "</w:body>"); + contentMap.put(currentId, combined.toByteArray()); currentId = null; buffer = null; return; @@ -171,8 +181,12 @@ class OOXMLPartContentCollector extends DefaultHandler { } private void writeString(String s) { + writeString(buffer, s); + } + + private static void writeString(ByteArrayOutputStream target, String s) { byte[] bytes = s.getBytes(StandardCharsets.UTF_8); - buffer.write(bytes, 0, bytes.length); + target.write(bytes, 0, bytes.length); } static String escape(String s) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java index e104867db4..a18f52a4d2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java @@ -56,7 +56,6 @@ public class OOXMLTikaBodyPartHandler private int tableDepth = 0;//table depth private int sdtDepth = 0;// private FormattingTagManager formattingTags; - private boolean wroteHyperlinkStart = false; //TODO: fix this //pWithinCell should be an array/stack of given cell depths @@ -127,19 +126,12 @@ public class OOXMLTikaBodyPartHandler @Override public void hyperlinkStart(String link) throws SAXException { - if (link != null) { - xhtml.startElement("a", "href", link); - wroteHyperlinkStart = true; - } + formattingTags.openHyperlink(link); } @Override public void hyperlinkEnd() throws SAXException { - if (wroteHyperlinkStart) { - formattingTags.closeAll(); - wroteHyperlinkStart = false; - xhtml.endElement("a"); - } + formattingTags.closeHyperlink(); } @Override @@ -438,7 +430,7 @@ public class OOXMLTikaBodyPartHandler @Override public void startBookmark(String id, String name) throws SAXException { //skip bookmarks within hyperlinks - if (name != null && !wroteHyperlinkStart) { + if (name != null && !formattingTags.isHyperlinkActive()) { xhtml.startElement("a", "name", name); xhtml.endElement("a"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java index 7b0b5ceea5..46e25b299d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java @@ -154,9 +154,13 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private int inACFallbackDepth = 0; private boolean inDelText = false; //buffers rt in ruby sections (see 17.3.3.25) - private boolean inHlinkClick = false; private boolean inTextBox = false; private boolean inV = false; //in c:v in chart file + // True when we're inside a <pPr> that was a direct child of <p> (the first child). + // Only those pPr elements should trigger startParagraph on close. + // pPr elements nested inside other elements (e.g., <a:pPr> inside <a:fld>) + // must not be treated as paragraph-level properties. + private boolean inParagraphLevelPPr = false; // Field code tracking for instrText-based hyperlinks private boolean inField = false; private boolean inInstrText = false; @@ -225,7 +229,12 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { throws SAXException { //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd - if (lastStartElementWasP && !PPR.equals(localName)) { + if (lastStartElementWasP && PPR.equals(localName)) { + // pPr is the first child of <p> — this is a paragraph-level pPr. + // Defer startParagraph until </pPr> so properties (style, numbering) are set first. + inParagraphLevelPPr = true; + } else if (lastStartElementWasP) { + // First child of <p> is not pPr — start paragraph immediately with defaults. bodyContentsHandler.startParagraph(currPProperties); } @@ -321,8 +330,14 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { String hyperlink = null; if (hyperlinkId != null) { hyperlink = linkedRelationships.get(hyperlinkId); - bodyContentsHandler.hyperlinkStart(hyperlink); - inHlinkClick = true; + if (inR) { + // hlinkClick inside a run — treat as run property. + // FormattingTagManager opens/closes <a> with the run lifecycle. + currRunProperties.setHlinkClickUrl(hyperlink); + } else if (hyperlink != null) { + // hlinkClick on a shape/picture (not in a run) — emit as self-closing ref + bodyContentsHandler.externalRef("hlinkClick", hyperlink); + } } } else if (TBL.equals(localName)) { bodyContentsHandler.startTable(); @@ -498,12 +513,15 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { handleEndOfRun(); } else if (T.equals(localName)) { inT = false; - } else if (PPR.equals(localName)) { + } else if (PPR.equals(localName) && inParagraphLevelPPr) { + // Only process as paragraph properties if this pPr was a direct child of <p>. + // pPr inside other elements (e.g., <a:fld> fields) must be ignored. if (!pStarted) { bodyContentsHandler.startParagraph(currPProperties); pStarted = true; } currPProperties.reset(); + inParagraphLevelPPr = false; } else if (P.equals(localName)) { if (runBuffer.length() > 0) { //<p><tab></p>...this will treat that as if it were @@ -553,16 +571,13 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private void handleEndOfRun() throws SAXException { bodyContentsHandler.run(currRunProperties, runBuffer.toString()); - if (inHlinkClick) { - bodyContentsHandler.hyperlinkEnd(); - inHlinkClick = false; - } inR = false; runBuffer.setLength(0); currRunProperties.setBold(false); currRunProperties.setItalics(false); currRunProperties.setStrike(false); currRunProperties.setUnderline(UnderlinePatterns.NONE.name()); + currRunProperties.setHlinkClickUrl(null); } @Override diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java index 54d149f333..efed9c1348 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java @@ -30,6 +30,9 @@ public class RunProperties { UnderlinePatterns underline = UnderlinePatterns.NONE; + // PPTX hlinkClick hyperlink URL — set from <a:hlinkClick> inside <a:rPr> + String hlinkClickUrl = null; + public boolean isItalics() { return italics; } @@ -68,4 +71,12 @@ public class RunProperties { underline = UnderlinePatterns.SINGLE; } } + + public String getHlinkClickUrl() { + return hlinkClickUrl; + } + + public void setHlinkClickUrl(String url) { + this.hlinkClickUrl = url; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java index 7697452a41..1497a17b92 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java @@ -110,9 +110,12 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { if (slidesPRC != null && slidesPRC.size() > 0) { for (int i = 0; i < slidesPRC.size(); i++) { try { - hiddenSlideCount += handleSlidePart( - mainDocument.getRelatedPart(slidesPRC.getRelationship(i)), - xhtml); + PackagePart slidePart = + safeGetRelatedPart(mainDocument, slidesPRC.getRelationship(i)); + if (slidePart == null) { + continue; + } + hiddenSlideCount += handleSlidePart(slidePart, xhtml); } catch (InvalidFormatException | ZipException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); @@ -152,7 +155,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { for (int i = 0; i < prc.size(); i++) { PackagePart commentAuthorsPart = null; try { - commentAuthorsPart = mainDocument.getRelatedPart(prc.getRelationship(i)); + commentAuthorsPart = safeGetRelatedPart(mainDocument, prc.getRelationship(i)); } catch (InvalidFormatException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); @@ -263,7 +266,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { for (int i = 0; i < slidePRC.size(); i++) { PackagePart slidePart = null; try { - slidePart = mainDocument.getRelatedPart(slidePRC.getRelationship(i)); + slidePart = safeGetRelatedPart(mainDocument, slidePRC.getRelationship(i)); } catch (InvalidFormatException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); @@ -286,7 +289,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { for (int i = 0; i < prc.size(); i++) { PackagePart pp = null; try { - pp = mainDocument.getRelatedPart(prc.getRelationship(i)); + pp = safeGetRelatedPart(mainDocument, prc.getRelationship(i)); } catch (InvalidFormatException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); @@ -305,7 +308,8 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { for (String relation : new String[]{XSLFRelation.VML_DRAWING.getRelation(), XSLFRelation.SLIDE_LAYOUT.getRelation(), XSLFRelation.NOTES_MASTER.getRelation(), - XSLFRelation.NOTES.getRelation()}) { + XSLFRelation.NOTES.getRelation(), XSLFRelation.CHART.getRelation(), + XSLFRelation.DIAGRAM_DRAWING.getRelation()}) { PackageRelationshipCollection prc = null; try { prc = slidePart.getRelationshipsByType(relation); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index 45c2725e67..2f42c7d8a8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -75,7 +75,8 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { new String[]{XWPFRelation.HEADER.getRelation(), XWPFRelation.FOOTER.getRelation(), XWPFRelation.FOOTNOTE.getRelation(), "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes", - "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"}; + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments", + AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA}; // Relationship types for Word settings private static final String SETTINGS_RELATION = @@ -198,7 +199,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { PackageRelationshipCollection settingsRels = documentPart.getRelationshipsByType(SETTINGS_RELATION); if (settingsRels != null && settingsRels.size() > 0) { - PackagePart settingsPart = documentPart.getRelatedPart(settingsRels.getRelationship(0)); + PackagePart settingsPart = safeGetRelatedPart(documentPart, settingsRels.getRelationship(0)); if (settingsPart != null) { try (InputStream is = settingsPart.getInputStream()) { WordSettingsHandler handler = new WordSettingsHandler(xhtml); @@ -218,7 +219,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { PackageRelationshipCollection webSettingsRels = documentPart.getRelationshipsByType(WEB_SETTINGS_RELATION); if (webSettingsRels != null && webSettingsRels.size() > 0) { - PackagePart webSettingsPart = documentPart.getRelatedPart(webSettingsRels.getRelationship(0)); + PackagePart webSettingsPart = safeGetRelatedPart(documentPart, webSettingsRels.getRelationship(0)); if (webSettingsPart != null) { try (InputStream is = webSettingsPart.getInputStream()) { WebSettingsHandler handler = new WebSettingsHandler(xhtml); @@ -276,7 +277,10 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { if (headersPRC != null) { for (int i = 0; i < headersPRC.size(); i++) { PackagePart header = - documentPart.getRelatedPart(headersPRC.getRelationship(i)); + safeGetRelatedPart(documentPart, headersPRC.getRelationship(i)); + if (header == null) { + continue; + } handlePart(header, styles, listManager, xhtml, OOXMLInlineBodyPartMap.EMPTY); } @@ -315,7 +319,10 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { if (prc != null) { for (int i = 0; i < prc.size(); i++) { PackagePart packagePart = - documentPart.getRelatedPart(prc.getRelationship(i)); + safeGetRelatedPart(documentPart, prc.getRelationship(i)); + if (packagePart == null) { + continue; + } handlePart(packagePart, styles, listManager, xhtml, OOXMLInlineBodyPartMap.EMPTY); } @@ -396,7 +403,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { continue; } try { - PackagePart emfPart = documentPart.getRelatedPart( + PackagePart emfPart = safeGetRelatedPart(documentPart, documentPart.getRelationship(emfRId)); if (emfPart == null || emfPart.getContentType() == null) { continue; @@ -457,7 +464,10 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { OOXMLPartContentCollector collector = new OOXMLPartContentCollector(wrapperElements, skipIds); for (int i = 0; i < prc.size(); i++) { - PackagePart part = documentPart.getRelatedPart(prc.getRelationship(i)); + PackagePart part = safeGetRelatedPart(documentPart, prc.getRelationship(i)); + if (part == null) { + continue; + } // collect the part's linked relationships (for picture resolution) Map<String, String> partRels = loadLinkedRelationships(part, true, metadata); @@ -484,7 +494,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { if (stylesRelationShip == null) { return null; } - PackagePart stylesPart = packagePart.getRelatedPart(stylesRelationShip); + PackagePart stylesPart = safeGetRelatedPart(packagePart, stylesRelationShip); if (stylesPart == null) { return null; } @@ -504,7 +514,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { if (numberingRelationShip == null) { return null; } - PackagePart numberingPart = packagePart.getRelatedPart(numberingRelationShip); + PackagePart numberingPart = safeGetRelatedPart(packagePart, numberingRelationShip); if (numberingPart == null) { return null; } @@ -545,8 +555,10 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { if (prc != null) { for (int i = 0; i < prc.size(); i++) { PackagePart packagePart = - documentPart.getRelatedPart(prc.getRelationship(i)); - relatedParts.add(packagePart); + safeGetRelatedPart(documentPart, prc.getRelationship(i)); + if (packagePart != null) { + relatedParts.add(packagePart); + } } } } catch (InvalidFormatException e) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java index ec1a1fa437..45c30355e0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java @@ -43,6 +43,7 @@ import org.apache.tika.exception.RuntimeSAXException; import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor; import org.apache.tika.parser.microsoft.ooxml.EditType; import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler; import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties; @@ -193,7 +194,11 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation()); if (headersPRC != null) { for (int i = 0; i < headersPRC.size(); i++) { - PackagePart header = documentPart.getRelatedPart(headersPRC.getRelationship(i)); + PackagePart header = AbstractOOXMLExtractor.safeGetRelatedPart( + documentPart, headersPRC.getRelationship(i)); + if (header == null) { + continue; + } handlePart(header, xwpfListManager, sb); } } @@ -213,7 +218,11 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { if (prc != null) { for (int i = 0; i < prc.size(); i++) { PackagePart packagePart = - documentPart.getRelatedPart(prc.getRelationship(i)); + AbstractOOXMLExtractor.safeGetRelatedPart( + documentPart, prc.getRelationship(i)); + if (packagePart == null) { + continue; + } handlePart(packagePart, xwpfListManager, sb); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java index 02d5247d44..ac533400e0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java @@ -30,6 +30,7 @@ import org.junit.jupiter.api.Test; import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.OfficeOpenXMLCore; @@ -363,4 +364,32 @@ public class OOXMLDocxSAXTest extends AbstractOOXMLDocxTest { Metadata m = metadataList.get(0); assertEquals("true", m.get(Office.HAS_FRAMESETS)); } + + /** + * Test with external DOCX files known to trigger "prefix not bound" + * from missing namespace declarations in footnote/endnote fragments. + * Enable by setting system property "tika.test.docx.namespace" to a file path. + */ + @Test + public void testNamespaceInFragments() throws Exception { + String filePath = System.getProperty("tika.test.docx.namespace"); + if (filePath == null) { + return; + } + java.io.File f = new java.io.File(filePath); + if (!f.isFile()) { + return; + } + AutoDetectParser parser = new AutoDetectParser(); + Metadata metadata = new Metadata(); + org.xml.sax.ContentHandler handler = + new org.apache.tika.sax.BodyContentHandler(-1); + try (TikaInputStream tis = TikaInputStream.get(f.toPath())) { + parser.parse(tis, handler, metadata, getParseContext()); + } + String[] warnings = metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING); + for (String w : warnings) { + assertNotContained("not bound", w); + } + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPptxSAXTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPptxSAXTest.java index 0d0554d049..81e22df764 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPptxSAXTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPptxSAXTest.java @@ -40,6 +40,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; /** * Runs the shared PPTX tests using the SAX-based streaming parser, @@ -62,7 +63,7 @@ public class OOXMLPptxSAXTest extends AbstractOOXMLPptxTest { List<Metadata> metadataList = getRecursiveMetadata("testPPT_various2.pptx", getParseContext()); - assertEquals(14, metadataList.size(), "right number of attachments"); + assertEquals(15, metadataList.size(), "right number of attachments"); String mainContent = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT); @@ -234,6 +235,42 @@ public class OOXMLPptxSAXTest extends AbstractOOXMLPptxTest { assertContainsAtLeast(parsedBy, metadataList); } + @Test + public void testStrictTagBalancePptx() throws Exception { + XHTMLContentHandler.setStrictTagBalanceChecking(true); + try { + // Test with the standard test file first — should not throw + getRecursiveMetadata("testPPT_various2.pptx", getParseContext()); + } finally { + XHTMLContentHandler.setStrictTagBalanceChecking(false); + } + } + + /** + * Test with external PPTX files known to trigger "suspected zip bomb" + * from unbalanced SAX tags. Enable by setting system property + * "tika.test.pptx.zipbomb" to a file path. + */ + @Test + public void testStrictTagBalanceExternalPptx() throws Exception { + String filePath = System.getProperty("tika.test.pptx.zipbomb"); + if (filePath == null) { + return; + } + java.io.File f = new java.io.File(filePath); + if (!f.isFile()) { + return; + } + XHTMLContentHandler.setStrictTagBalanceChecking(true); + try (TikaInputStream tis = TikaInputStream.get(f.toPath())) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(-1); + AUTO_DETECT_PARSER.parse(tis, handler, metadata, getParseContext()); + } finally { + XHTMLContentHandler.setStrictTagBalanceChecking(false); + } + } + @Test public void testEncrypted() throws Exception { Map<String, String> tests = new HashMap<>();
