This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9a2b3a84244a5d3060978990846d4b689a3c2df6 Author: tallison <[email protected]> AuthorDate: Wed Mar 18 08:00:48 2026 -0400 improve sax ooxml - footnotes and endnotes - WIP --- .../ooxml/OOXMLFootnoteEndnoteCollector.java | 145 --------------------- .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 55 +++++++- .../ooxml/SXWPFWordExtractorDecorator.java | 71 ++++++++-- 3 files changed, 114 insertions(+), 157 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java deleted file mode 100644 index 9f7e663bf7..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.microsoft.ooxml; - -import java.util.HashMap; -import java.util.Map; - -import javax.xml.stream.XMLOutputFactory; -import javax.xml.stream.XMLStreamException; -import javax.xml.stream.XMLStreamWriter; - -import java.io.StringWriter; - -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -/** - * SAX handler that collects raw XML content for each footnote or endnote - * by ID from footnotes.xml or endnotes.xml. The collected XML can then be - * re-parsed through the main handler when a footnote/endnote reference is - * encountered in the document body. - */ -class OOXMLFootnoteEndnoteCollector extends DefaultHandler { - - private static final String W_NS = - "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; - private static final String FOOTNOTE = "footnote"; - private static final String ENDNOTE = "endnote"; - - private final Map<String, byte[]> contentMap = new HashMap<>(); - - private String currentId = null; - private StringWriter currentWriter = null; - private XMLStreamWriter currentXmlWriter = null; - private int depth = 0; - - Map<String, byte[]> getContentMap() { - return contentMap; - } - - @Override - public void startElement(String uri, String localName, String qName, - Attributes atts) throws SAXException { - if (currentId != null) { - depth++; - try { - if (qName != null && !qName.isEmpty()) { - currentXmlWriter.writeStartElement(qName); - } else { - currentXmlWriter.writeStartElement(localName); - } - for (int i = 0; i < atts.getLength(); i++) { - String attQName = atts.getQName(i); - if (attQName != null && !attQName.isEmpty()) { - currentXmlWriter.writeAttribute(attQName, atts.getValue(i)); - } else { - currentXmlWriter.writeAttribute( - atts.getLocalName(i), atts.getValue(i)); - } - } - } catch (XMLStreamException e) { - throw new SAXException(e); - } - return; - } - - if ((FOOTNOTE.equals(localName) || ENDNOTE.equals(localName))) { - String id = atts.getValue(W_NS, "id"); - // skip separator/continuation footnotes (ids 0 and -1) - if (id != null && !id.equals("0") && !id.equals("-1")) { - currentId = id; - currentWriter = new StringWriter(); - try { - currentXmlWriter = XMLOutputFactory.newInstance() - .createXMLStreamWriter(currentWriter); - currentXmlWriter.writeStartDocument(); - // wrap content in a root element - currentXmlWriter.writeStartElement("body"); - } catch (XMLStreamException e) { - throw new SAXException(e); - } - depth = 0; - } - } - } - - @Override - public void endElement(String uri, String localName, String qName) - throws SAXException { - if (currentId == null) { - return; - } - - if (depth == 0) { - // end of the footnote/endnote element itself - try { - currentXmlWriter.writeEndElement(); // close <body> - currentXmlWriter.writeEndDocument(); - currentXmlWriter.flush(); - currentXmlWriter.close(); - } catch (XMLStreamException e) { - throw new SAXException(e); - } - contentMap.put(currentId, - currentWriter.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8)); - currentId = null; - currentWriter = null; - currentXmlWriter = null; - return; - } - - depth--; - try { - currentXmlWriter.writeEndElement(); - } catch (XMLStreamException e) { - throw new SAXException(e); - } - } - - @Override - public void characters(char[] ch, int start, int length) throws SAXException { - if (currentId != null && currentXmlWriter != null) { - try { - currentXmlWriter.writeCharacters(new String(ch, start, length)); - } catch (XMLStreamException e) { - throw new SAXException(e); - } - } - } -} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java index 1595436ad2..dab03ac30f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java @@ -17,19 +17,26 @@ package org.apache.tika.parser.microsoft.ooxml; +import java.io.ByteArrayInputStream; +import java.io.IOException; import java.math.BigInteger; import java.util.Date; +import java.util.Map; import org.apache.poi.xwpf.usermodel.UnderlinePatterns; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.WordExtractor; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim; +import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.XMLReaderUtils; public class OOXMLTikaBodyPartHandler implements XWPFBodyContentsHandler { @@ -66,6 +73,9 @@ public class OOXMLTikaBodyPartHandler //if we're marking more that the first level <p/> element private String paragraphTag = null; + private OOXMLInlineBodyPartMap inlinePartMap = OOXMLInlineBodyPartMap.EMPTY; + private ParseContext parseContext = null; + public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml) { this(xhtml, null); } @@ -96,6 +106,17 @@ public class OOXMLTikaBodyPartHandler this.includeMoveFromText = parserConfig.isIncludeMoveFromContent(); } + /** + * Sets pre-parsed inline body part content (footnotes, endnotes, comments) + * so that references encountered during main document parsing can be + * resolved inline. + */ + public void setInlineBodyPartMap(OOXMLInlineBodyPartMap inlinePartMap, + ParseContext parseContext) { + this.inlinePartMap = inlinePartMap != null ? inlinePartMap : OOXMLInlineBodyPartMap.EMPTY; + this.parseContext = parseContext; + } + @Override public void run(RunProperties runProperties, String contents) throws SAXException { @@ -304,7 +325,13 @@ public class OOXMLTikaBodyPartHandler @Override public void footnoteReference(String id) throws SAXException { - if (id != null) { + if (id == null) { + return; + } + byte[] xml = inlinePartMap.getFootnote(id); + if (xml != null) { + inlineNoteContent(xml, "footnote"); + } else { xhtml.characters("["); xhtml.characters(id); xhtml.characters("]"); @@ -313,13 +340,37 @@ public class OOXMLTikaBodyPartHandler @Override public void endnoteReference(String id) throws SAXException { - if (id != null) { + if (id == null) { + return; + } + byte[] xml = inlinePartMap.getEndnote(id); + if (xml != null) { + inlineNoteContent(xml, "endnote"); + } else { xhtml.characters("["); xhtml.characters(id); xhtml.characters("]"); } } + private void inlineNoteContent(byte[] xml, String cssClass) throws SAXException { + // Use the inline part map's relationship map which includes relationships + // from the footnote/endnote parts (needed for picture resolution) + Map<String, String> noteRelationships = inlinePartMap.getLinkedRelationships(); + xhtml.startElement("div", "class", cssClass); + try { + XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml), + new EmbeddedContentHandler( + new OOXMLWordAndPowerPointTextHandler( + new OOXMLTikaBodyPartHandler(xhtml), + noteRelationships)), + parseContext); + } catch (TikaException | IOException e) { + xhtml.characters("[" + cssClass + " parse error]"); + } + xhtml.endElement("div"); + } + @Override public boolean isIncludeMoveFromText() { return includeMoveFromText; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index 78eddc280f..f8ed0c0f95 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -19,8 +19,10 @@ package org.apache.tika.parser.microsoft.ooxml; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.zip.ZipException; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; @@ -255,7 +257,8 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { for (int i = 0; i < headersPRC.size(); i++) { PackagePart header = documentPart.getRelatedPart(headersPRC.getRelationship(i)); - handlePart(header, styles, listManager, xhtml); + handlePart(header, styles, listManager, xhtml, + OOXMLInlineBodyPartMap.EMPTY); } } } catch (InvalidFormatException | ZipException e) { @@ -264,18 +267,21 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } } + // Pre-collect footnotes, endnotes, and comments so they can be + // inlined at the point of reference in the main document + OOXMLInlineBodyPartMap inlinePartMap = collectInlineParts(documentPart); + //main document try { - handlePart(documentPart, styles, listManager, xhtml); + handlePart(documentPart, styles, listManager, xhtml, inlinePartMap); } catch (ZipException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } - //for now, just dump other components at end + //dump remaining components at end (diagrams, charts, footers, comments) for (String rel : new String[]{AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA, - XSSFRelation.CHART.getRelation(), XWPFRelation.FOOTNOTE.getRelation(), - XWPFRelation.COMMENT.getRelation(), XWPFRelation.FOOTER.getRelation(), - XWPFRelation.ENDNOTE.getRelation(),}) { + XSSFRelation.CHART.getRelation(), + XWPFRelation.COMMENT.getRelation(), XWPFRelation.FOOTER.getRelation()}) { //skip footers if we shouldn't extract them if (!config.isIncludeHeadersAndFooters() && rel.equals(XWPFRelation.FOOTER.getRelation())) { @@ -287,7 +293,8 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { for (int i = 0; i < prc.size(); i++) { PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i)); - handlePart(packagePart, styles, listManager, xhtml); + handlePart(packagePart, styles, listManager, xhtml, + OOXMLInlineBodyPartMap.EMPTY); } } } catch (InvalidFormatException | ZipException e) { @@ -298,16 +305,19 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } private void handlePart(PackagePart packagePart, XWPFStylesShim styles, - XWPFListManager listManager, XHTMLContentHandler xhtml) + XWPFListManager listManager, XHTMLContentHandler xhtml, + OOXMLInlineBodyPartMap inlinePartMap) throws IOException, SAXException { Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart, true, metadata); + OOXMLTikaBodyPartHandler bodyHandler = + new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, config, metadata); + bodyHandler.setInlineBodyPartMap(inlinePartMap, context); try (InputStream stream = packagePart.getInputStream()) { XMLReaderUtils.parseSAX(stream, new EmbeddedContentHandler(new OOXMLWordAndPowerPointTextHandler( - new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, - config, metadata), + bodyHandler, linkedRelationships, config.isIncludeShapeBasedContent(), config.isConcatenatePhoneticRuns(), config.isPreferAlternateContentChoice())), context); @@ -315,7 +325,48 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } + } + + private OOXMLInlineBodyPartMap collectInlineParts(PackagePart documentPart) { + Map<String, String> allRelationships = new java.util.HashMap<>(); + Map<String, byte[]> footnoteMap = collectPartContent(documentPart, + XWPFRelation.FOOTNOTE.getRelation(), Set.of("footnote"), + allRelationships); + String endnoteRel = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes"; + Map<String, byte[]> endnoteMap = collectPartContent(documentPart, + endnoteRel, Set.of("endnote"), allRelationships); + return new OOXMLInlineBodyPartMap(footnoteMap, endnoteMap, + Collections.emptyMap(), allRelationships); + } + private Map<String, byte[]> collectPartContent(PackagePart documentPart, + String relationshipType, Set<String> wrapperElements, + Map<String, String> allRelationships) { + try { + PackageRelationshipCollection prc = + documentPart.getRelationshipsByType(relationshipType); + if (prc == null || prc.size() == 0) { + return Collections.emptyMap(); + } + OOXMLPartContentCollector collector = + new OOXMLPartContentCollector(wrapperElements); + for (int i = 0; i < prc.size(); i++) { + PackagePart part = documentPart.getRelatedPart(prc.getRelationship(i)); + // collect the part's linked relationships (for picture resolution) + Map<String, String> partRels = + loadLinkedRelationships(part, true, metadata); + allRelationships.putAll(partRels); + try (InputStream stream = part.getInputStream()) { + XMLReaderUtils.parseSAX(stream, collector, context); + } + } + return collector.getContentMap(); + } catch (InvalidFormatException | IOException | TikaException | SAXException e) { + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, + ExceptionUtils.getStackTrace(e)); + return Collections.emptyMap(); + } }
