This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9a2b3a84244a5d3060978990846d4b689a3c2df6
Author: tallison <[email protected]>
AuthorDate: Wed Mar 18 08:00:48 2026 -0400

    improve sax ooxml - footnotes and endnotes - WIP
---
 .../ooxml/OOXMLFootnoteEndnoteCollector.java       | 145 ---------------------
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  |  55 +++++++-
 .../ooxml/SXWPFWordExtractorDecorator.java         |  71 ++++++++--
 3 files changed, 114 insertions(+), 157 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java
deleted file mode 100644
index 9f7e663bf7..0000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import java.util.HashMap;
-import java.util.Map;
-
-import javax.xml.stream.XMLOutputFactory;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamWriter;
-
-import java.io.StringWriter;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * SAX handler that collects raw XML content for each footnote or endnote
- * by ID from footnotes.xml or endnotes.xml. The collected XML can then be
- * re-parsed through the main handler when a footnote/endnote reference is
- * encountered in the document body.
- */
-class OOXMLFootnoteEndnoteCollector extends DefaultHandler {
-
-    private static final String W_NS =
-            "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
-    private static final String FOOTNOTE = "footnote";
-    private static final String ENDNOTE = "endnote";
-
-    private final Map<String, byte[]> contentMap = new HashMap<>();
-
-    private String currentId = null;
-    private StringWriter currentWriter = null;
-    private XMLStreamWriter currentXmlWriter = null;
-    private int depth = 0;
-
-    Map<String, byte[]> getContentMap() {
-        return contentMap;
-    }
-
-    @Override
-    public void startElement(String uri, String localName, String qName,
-            Attributes atts) throws SAXException {
-        if (currentId != null) {
-            depth++;
-            try {
-                if (qName != null && !qName.isEmpty()) {
-                    currentXmlWriter.writeStartElement(qName);
-                } else {
-                    currentXmlWriter.writeStartElement(localName);
-                }
-                for (int i = 0; i < atts.getLength(); i++) {
-                    String attQName = atts.getQName(i);
-                    if (attQName != null && !attQName.isEmpty()) {
-                        currentXmlWriter.writeAttribute(attQName, atts.getValue(i));
-                    } else {
-                        currentXmlWriter.writeAttribute(
-                                atts.getLocalName(i), atts.getValue(i));
-                    }
-                }
-            } catch (XMLStreamException e) {
-                throw new SAXException(e);
-            }
-            return;
-        }
-
-        if ((FOOTNOTE.equals(localName) || ENDNOTE.equals(localName))) {
-            String id = atts.getValue(W_NS, "id");
-            // skip separator/continuation footnotes (ids 0 and -1)
-            if (id != null && !id.equals("0") && !id.equals("-1")) {
-                currentId = id;
-                currentWriter = new StringWriter();
-                try {
-                    currentXmlWriter = XMLOutputFactory.newInstance()
-                            .createXMLStreamWriter(currentWriter);
-                    currentXmlWriter.writeStartDocument();
-                    // wrap content in a root element
-                    currentXmlWriter.writeStartElement("body");
-                } catch (XMLStreamException e) {
-                    throw new SAXException(e);
-                }
-                depth = 0;
-            }
-        }
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String qName)
-            throws SAXException {
-        if (currentId == null) {
-            return;
-        }
-
-        if (depth == 0) {
-            // end of the footnote/endnote element itself
-            try {
-                currentXmlWriter.writeEndElement(); // close <body>
-                currentXmlWriter.writeEndDocument();
-                currentXmlWriter.flush();
-                currentXmlWriter.close();
-            } catch (XMLStreamException e) {
-                throw new SAXException(e);
-            }
-            contentMap.put(currentId,
-                    currentWriter.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
-            currentId = null;
-            currentWriter = null;
-            currentXmlWriter = null;
-            return;
-        }
-
-        depth--;
-        try {
-            currentXmlWriter.writeEndElement();
-        } catch (XMLStreamException e) {
-            throw new SAXException(e);
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-        if (currentId != null && currentXmlWriter != null) {
-            try {
-                currentXmlWriter.writeCharacters(new String(ch, start, length));
-            } catch (XMLStreamException e) {
-                throw new SAXException(e);
-            }
-        }
-    }
-}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index 1595436ad2..dab03ac30f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -17,19 +17,26 @@
 package org.apache.tika.parser.microsoft.ooxml;
 
 
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
 import java.math.BigInteger;
 import java.util.Date;
+import java.util.Map;
 
 import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.WordExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
+import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
 
 public class OOXMLTikaBodyPartHandler
         implements XWPFBodyContentsHandler {
@@ -66,6 +73,9 @@ public class OOXMLTikaBodyPartHandler
     //if we're marking more that the first level <p/> element
     private String paragraphTag = null;
 
+    private OOXMLInlineBodyPartMap inlinePartMap = OOXMLInlineBodyPartMap.EMPTY;
+    private ParseContext parseContext = null;
+
     public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml) {
         this(xhtml, null);
     }
@@ -96,6 +106,17 @@ public class OOXMLTikaBodyPartHandler
         this.includeMoveFromText = parserConfig.isIncludeMoveFromContent();
     }
 
+    /**
+     * Sets pre-parsed inline body part content (footnotes, endnotes, comments)
+     * so that references encountered during main document parsing can be
+     * resolved inline.
+     */
+    public void setInlineBodyPartMap(OOXMLInlineBodyPartMap inlinePartMap,
+            ParseContext parseContext) {
+        this.inlinePartMap = inlinePartMap != null ? inlinePartMap : OOXMLInlineBodyPartMap.EMPTY;
+        this.parseContext = parseContext;
+    }
+
     @Override
     public void run(RunProperties runProperties, String contents) throws SAXException {
 
@@ -304,7 +325,13 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void footnoteReference(String id) throws SAXException {
-        if (id != null) {
+        if (id == null) {
+            return;
+        }
+        byte[] xml = inlinePartMap.getFootnote(id);
+        if (xml != null) {
+            inlineNoteContent(xml, "footnote");
+        } else {
             xhtml.characters("[");
             xhtml.characters(id);
             xhtml.characters("]");
@@ -313,13 +340,37 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void endnoteReference(String id) throws SAXException {
-        if (id != null) {
+        if (id == null) {
+            return;
+        }
+        byte[] xml = inlinePartMap.getEndnote(id);
+        if (xml != null) {
+            inlineNoteContent(xml, "endnote");
+        } else {
             xhtml.characters("[");
             xhtml.characters(id);
             xhtml.characters("]");
         }
     }
 
+    private void inlineNoteContent(byte[] xml, String cssClass) throws SAXException {
+        // Use the inline part map's relationship map which includes relationships
+        // from the footnote/endnote parts (needed for picture resolution)
+        Map<String, String> noteRelationships = inlinePartMap.getLinkedRelationships();
+        xhtml.startElement("div", "class", cssClass);
+        try {
+            XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml),
+                    new EmbeddedContentHandler(
+                            new OOXMLWordAndPowerPointTextHandler(
+                                    new OOXMLTikaBodyPartHandler(xhtml),
+                                    noteRelationships)),
+                    parseContext);
+        } catch (TikaException | IOException e) {
+            xhtml.characters("[" + cssClass + " parse error]");
+        }
+        xhtml.endElement("div");
+    }
+
     @Override
     public boolean isIncludeMoveFromText() {
         return includeMoveFromText;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 78eddc280f..f8ed0c0f95 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -19,8 +19,10 @@ package org.apache.tika.parser.microsoft.ooxml;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.zip.ZipException;
 
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -255,7 +257,8 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                     for (int i = 0; i < headersPRC.size(); i++) {
                         PackagePart header =
                                 documentPart.getRelatedPart(headersPRC.getRelationship(i));
-                        handlePart(header, styles, listManager, xhtml);
+                        handlePart(header, styles, listManager, xhtml,
+                                OOXMLInlineBodyPartMap.EMPTY);
                     }
                 }
             } catch (InvalidFormatException | ZipException e) {
@@ -264,18 +267,21 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             }
         }
 
+        // Pre-collect footnotes, endnotes, and comments so they can be
+        // inlined at the point of reference in the main document
+        OOXMLInlineBodyPartMap inlinePartMap = collectInlineParts(documentPart);
+
         //main document
         try {
-            handlePart(documentPart, styles, listManager, xhtml);
+            handlePart(documentPart, styles, listManager, xhtml, inlinePartMap);
         } catch (ZipException e) {
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                     ExceptionUtils.getStackTrace(e));
         }
-        //for now, just dump other components at end
+        //dump remaining components at end (diagrams, charts, footers, comments)
         for (String rel : new String[]{AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
-                XSSFRelation.CHART.getRelation(), XWPFRelation.FOOTNOTE.getRelation(),
-                XWPFRelation.COMMENT.getRelation(), XWPFRelation.FOOTER.getRelation(),
-                XWPFRelation.ENDNOTE.getRelation(),}) {
+                XSSFRelation.CHART.getRelation(),
+                XWPFRelation.COMMENT.getRelation(), XWPFRelation.FOOTER.getRelation()}) {
             //skip footers if we shouldn't extract them
             if (!config.isIncludeHeadersAndFooters() &&
                     rel.equals(XWPFRelation.FOOTER.getRelation())) {
@@ -287,7 +293,8 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                     for (int i = 0; i < prc.size(); i++) {
                         PackagePart packagePart =
                                 documentPart.getRelatedPart(prc.getRelationship(i));
-                        handlePart(packagePart, styles, listManager, xhtml);
+                        handlePart(packagePart, styles, listManager, xhtml,
+                                OOXMLInlineBodyPartMap.EMPTY);
                     }
                 }
             } catch (InvalidFormatException | ZipException e) {
@@ -298,16 +305,19 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     }
 
     private void handlePart(PackagePart packagePart, XWPFStylesShim styles,
-                            XWPFListManager listManager, XHTMLContentHandler xhtml)
+                            XWPFListManager listManager, XHTMLContentHandler xhtml,
+                            OOXMLInlineBodyPartMap inlinePartMap)
             throws IOException, SAXException {
 
         Map<String, String> linkedRelationships =
                 loadLinkedRelationships(packagePart, true, metadata);
+        OOXMLTikaBodyPartHandler bodyHandler =
+                new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, config, metadata);
+        bodyHandler.setInlineBodyPartMap(inlinePartMap, context);
         try (InputStream stream = packagePart.getInputStream()) {
             XMLReaderUtils.parseSAX(stream,
                     new EmbeddedContentHandler(new OOXMLWordAndPowerPointTextHandler(
-                            new OOXMLTikaBodyPartHandler(xhtml, styles, listManager,
-                                    config, metadata),
+                            bodyHandler,
                             linkedRelationships, config.isIncludeShapeBasedContent(),
                             config.isConcatenatePhoneticRuns(),
                             config.isPreferAlternateContentChoice())), context);
@@ -315,7 +325,48 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                     ExceptionUtils.getStackTrace(e));
         }
+    }
+
+    private OOXMLInlineBodyPartMap collectInlineParts(PackagePart documentPart) {
+        Map<String, String> allRelationships = new java.util.HashMap<>();
+        Map<String, byte[]> footnoteMap = collectPartContent(documentPart,
+                XWPFRelation.FOOTNOTE.getRelation(), Set.of("footnote"),
+                allRelationships);
+        String endnoteRel =
+                "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes";
+        Map<String, byte[]> endnoteMap = collectPartContent(documentPart,
+                endnoteRel, Set.of("endnote"), allRelationships);
+        return new OOXMLInlineBodyPartMap(footnoteMap, endnoteMap,
+                Collections.emptyMap(), allRelationships);
+    }
 
+    private Map<String, byte[]> collectPartContent(PackagePart documentPart,
+            String relationshipType, Set<String> wrapperElements,
+            Map<String, String> allRelationships) {
+        try {
+            PackageRelationshipCollection prc =
+                    documentPart.getRelationshipsByType(relationshipType);
+            if (prc == null || prc.size() == 0) {
+                return Collections.emptyMap();
+            }
+            OOXMLPartContentCollector collector =
+                    new OOXMLPartContentCollector(wrapperElements);
+            for (int i = 0; i < prc.size(); i++) {
+                PackagePart part = documentPart.getRelatedPart(prc.getRelationship(i));
+                // collect the part's linked relationships (for picture resolution)
+                Map<String, String> partRels =
+                        loadLinkedRelationships(part, true, metadata);
+                allRelationships.putAll(partRels);
+                try (InputStream stream = part.getInputStream()) {
+                    XMLReaderUtils.parseSAX(stream, collector, context);
+                }
+            }
+            return collector.getContentMap();
+        } catch (InvalidFormatException | IOException | TikaException | SAXException e) {
+            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                    ExceptionUtils.getStackTrace(e));
+            return Collections.emptyMap();
+        }
     }
 
 

Reply via email to