This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5ae759340aeb31e254f2bcf24686fcdfd53ad6ed
Author: tallison <[email protected]>
AuthorDate: Wed Mar 18 06:45:12 2026 -0400

    improve sax ooxml - WIP
---
 .../tika/parser/microsoft/ooxml/EditType.java      |  21 +++
 .../parser/microsoft/ooxml/FieldCodeParser.java    | 109 ++++++++++++++++
 .../ooxml/OOXMLFootnoteEndnoteCollector.java       | 145 +++++++++++++++++++++
 .../microsoft/ooxml/OOXMLPictureTracker.java       |  93 +++++++++++++
 .../microsoft/ooxml/XWPFBodyContentsHandler.java   | 106 +++++++++++++++
 5 files changed, 474 insertions(+)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EditType.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EditType.java
new file mode 100644
index 0000000000..cbc9d1ae86
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EditType.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+/**
+ * Kind of edit reported for an edited section; passed to
+ * {@code XWPFBodyContentsHandler#startEditedSection}.
+ */
+public enum EditType {
+    NONE,
+    INSERT,
+    DELETE,
+    MOVE_TO,
+    MOVE_FROM
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldCodeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldCodeParser.java
new file mode 100644
index 0000000000..d71cbdc7f9
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldCodeParser.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Parses OOXML field codes (instrText) to extract URLs from HYPERLINK,
+ * INCLUDEPICTURE, INCLUDETEXT, IMPORT, and LINK fields.
+ * <p>
+ * Field names are matched case-insensitively and only at a word boundary, so
+ * a name that merely ends a longer word (such as the {@code LINK} in
+ * {@code HYPERLINK}) is not reported as that field.
+ * <p>
+ * This class has no Tika dependencies and could be contributed to POI.
+ */
+public class FieldCodeParser {
+
+    /**
+     * Whitespace followed by a double-quoted argument, captured as group 1.
+     * Both quantifiers are bounded (at most 100 whitespace characters and
+     * 10000 value characters).
+     */
+    private static final String QUOTED_ARG = "\\s{1,100}\"([^\"]{1,10000})\"";
+
+    private static final Pattern HYPERLINK_PATTERN = fieldPattern("HYPERLINK");
+    private static final Pattern INCLUDEPICTURE_PATTERN =
+            fieldPattern("INCLUDEPICTURE");
+    private static final Pattern INCLUDETEXT_PATTERN =
+            fieldPattern("INCLUDETEXT");
+    private static final Pattern IMPORT_PATTERN = fieldPattern("IMPORT");
+    // LINK has one unquoted token of word characters and dots (presumably
+    // the OLE class name) between the field name and the quoted argument
+    private static final Pattern LINK_PATTERN =
+            fieldPattern("LINK\\s{1,100}[\\w.]{1,50}");
+
+    private FieldCodeParser() {
+    }
+
+    /**
+     * Builds the pattern for one field: a word boundary, the given
+     * field-name regex, then {@link #QUOTED_ARG}.
+     */
+    private static Pattern fieldPattern(String fieldRegex) {
+        return Pattern.compile("\\b" + fieldRegex + QUOTED_ARG,
+                Pattern.CASE_INSENSITIVE);
+    }
+
+    /**
+     * Returns group 1 of the first match of {@code pattern} in {@code text},
+     * or null if there is no match.
+     */
+    private static String firstQuotedArg(Pattern pattern, String text) {
+        Matcher m = pattern.matcher(text);
+        return m.find() ? m.group(1) : null;
+    }
+
+    /**
+     * Parses a HYPERLINK URL from instrText field code content.
+     * Field codes like: {@code HYPERLINK "https://example.com"}
+     *
+     * @param instrText the accumulated instrText content; may be null
+     * @return the URL if found, or null
+     */
+    public static String parseHyperlinkFromInstrText(String instrText) {
+        if (instrText == null || instrText.isEmpty()) {
+            return null;
+        }
+        return firstQuotedArg(HYPERLINK_PATTERN, instrText.trim());
+    }
+
+    /**
+     * Parses URLs from instrText field codes that reference external
+     * resources. This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, and
+     * LINK fields, tried in that order; the first field that matches wins.
+     *
+     * @param instrText the accumulated instrText content; may be null
+     * @param fieldType output parameter - the upper-case field name is
+     *                  appended to it when a field is found; may be null
+     *                  if the caller does not need the type
+     * @return the URL if found, or null
+     */
+    public static String parseExternalRefFromInstrText(String instrText,
+            StringBuilder fieldType) {
+        if (instrText == null || instrText.isEmpty()) {
+            return null;
+        }
+        String trimmed = instrText.trim();
+
+        // order matters: it is the precedence when several fields appear
+        String[] names = {"INCLUDEPICTURE", "INCLUDETEXT", "IMPORT", "LINK"};
+        Pattern[] patterns = {INCLUDEPICTURE_PATTERN, INCLUDETEXT_PATTERN,
+                IMPORT_PATTERN, LINK_PATTERN};
+        for (int i = 0; i < patterns.length; i++) {
+            String url = firstQuotedArg(patterns[i], trimmed);
+            if (url != null) {
+                if (fieldType != null) {
+                    fieldType.append(names[i]);
+                }
+                return url;
+            }
+        }
+        return null;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java
new file mode 100644
index 0000000000..9f7e663bf7
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.stream.XMLOutputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamWriter;
+
+import java.io.StringWriter;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler that collects raw XML content for each footnote or endnote
+ * by ID from footnotes.xml or endnotes.xml. The collected XML can then be
+ * re-parsed through the main handler when a footnote/endnote reference is
+ * encountered in the document body.
+ * <p>
+ * The descendants of each note are re-serialized under a synthetic
+ * {@code <body>} root and stored as UTF-8 bytes keyed by the note's
+ * {@code w:id}. Notes whose id is {@code 0} or {@code -1} are skipped.
+ * <p>
+ * NOTE(review): elements and attributes are written by qName and this
+ * handler never writes namespace declarations, so unless the SAX parser
+ * reports {@code xmlns} attributes the stored XML carries undeclared
+ * prefixes - confirm the re-parse does not require namespace awareness.
+ */
+class OOXMLFootnoteEndnoteCollector extends DefaultHandler {
+
+    private static final String W_NS =
+            "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+    private static final String FOOTNOTE = "footnote";
+    private static final String ENDNOTE = "endnote";
+
+    private final Map<String, byte[]> contentMap = new HashMap<>();
+
+    // Created on first use and then shared by every note:
+    // XMLOutputFactory.newInstance() repeats its provider lookup per call.
+    private XMLOutputFactory outputFactory = null;
+
+    // State of the note currently being collected; currentId == null means
+    // the parser is outside any collectable footnote/endnote.
+    private String currentId = null;
+    private StringWriter currentWriter = null;
+    private XMLStreamWriter currentXmlWriter = null;
+    // nesting level below the footnote/endnote element itself
+    private int depth = 0;
+
+    /**
+     * @return the live map from note id to the collected XML (UTF-8 bytes)
+     */
+    Map<String, byte[]> getContentMap() {
+        return contentMap;
+    }
+
+    /**
+     * Inside a note, copies the element and its attributes to the current
+     * writer. Outside a note, starts collecting when a footnote or endnote
+     * element with a usable id is seen.
+     *
+     * @throws SAXException wrapping any {@link XMLStreamException}
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName,
+            Attributes atts) throws SAXException {
+        if (currentId != null) {
+            depth++;
+            try {
+                currentXmlWriter.writeStartElement(
+                        preferQName(qName, localName));
+                for (int i = 0; i < atts.getLength(); i++) {
+                    currentXmlWriter.writeAttribute(
+                            preferQName(atts.getQName(i),
+                                    atts.getLocalName(i)),
+                            atts.getValue(i));
+                }
+            } catch (XMLStreamException e) {
+                throw new SAXException(e);
+            }
+            return;
+        }
+
+        if (!FOOTNOTE.equals(localName) && !ENDNOTE.equals(localName)) {
+            return;
+        }
+        String id = atts.getValue(W_NS, "id");
+        // skip separator/continuation footnotes (ids 0 and -1)
+        if (id == null || id.equals("0") || id.equals("-1")) {
+            return;
+        }
+
+        currentId = id;
+        currentWriter = new StringWriter();
+        try {
+            if (outputFactory == null) {
+                outputFactory = XMLOutputFactory.newInstance();
+            }
+            currentXmlWriter =
+                    outputFactory.createXMLStreamWriter(currentWriter);
+            currentXmlWriter.writeStartDocument();
+            // wrap content in a root element
+            currentXmlWriter.writeStartElement("body");
+        } catch (XMLStreamException e) {
+            throw new SAXException(e);
+        }
+        depth = 0;
+    }
+
+    /**
+     * Closes the matching element in the current writer. When the
+     * footnote/endnote element itself ends, finishes the document, stores
+     * its bytes under the note id and resets the collection state.
+     *
+     * @throws SAXException wrapping any {@link XMLStreamException}
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if (currentId == null) {
+            return;
+        }
+
+        if (depth == 0) {
+            // end of the footnote/endnote element itself
+            try {
+                currentXmlWriter.writeEndElement(); // close <body>
+                currentXmlWriter.writeEndDocument();
+                currentXmlWriter.flush();
+                currentXmlWriter.close();
+            } catch (XMLStreamException e) {
+                throw new SAXException(e);
+            }
+            contentMap.put(currentId, currentWriter.toString()
+                    .getBytes(java.nio.charset.StandardCharsets.UTF_8));
+            currentId = null;
+            currentWriter = null;
+            currentXmlWriter = null;
+            return;
+        }
+
+        depth--;
+        try {
+            currentXmlWriter.writeEndElement();
+        } catch (XMLStreamException e) {
+            throw new SAXException(e);
+        }
+    }
+
+    /**
+     * Copies character data to the current writer while inside a note;
+     * ignored otherwise.
+     *
+     * @throws SAXException wrapping any {@link XMLStreamException}
+     */
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (currentId != null && currentXmlWriter != null) {
+            try {
+                currentXmlWriter.writeCharacters(
+                        new String(ch, start, length));
+            } catch (XMLStreamException e) {
+                throw new SAXException(e);
+            }
+        }
+    }
+
+    /**
+     * Returns {@code qName} when it is non-empty, otherwise
+     * {@code localName}.
+     */
+    private static String preferQName(String qName, String localName) {
+        return (qName != null && !qName.isEmpty()) ? qName : localName;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPictureTracker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPictureTracker.java
new file mode 100644
index 0000000000..dbd55b9f9f
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPictureTracker.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.util.Map;
+
+import org.xml.sax.SAXException;
+
+/**
+ * Holds the state of the picture currently being parsed (PIC, PICT, BLIP,
+ * IMAGEDATA, cNvPr) during OOXML SAX parsing and, once the picture scope
+ * closes, reports it through
+ * {@link XWPFBodyContentsHandler#embeddedPicRef(String, String)}.
+ * <p>
+ * Its only collaborators are the relationship map and the
+ * {@link XWPFBodyContentsHandler} callback it is constructed with.
+ */
+class OOXMLPictureTracker {
+
+    // relationship id -> target, consulted when a picture scope closes
+    private final Map<String, String> linkedRelationships;
+    private final XWPFBodyContentsHandler bodyContentsHandler;
+
+    private boolean inPic = false;
+    private boolean inPict = false;
+    private String picDescription = null;
+    private String picRId = null;
+
+    OOXMLPictureTracker(Map<String, String> linkedRelationships,
+            XWPFBodyContentsHandler bodyContentsHandler) {
+        this.linkedRelationships = linkedRelationships;
+        this.bodyContentsHandler = bodyContentsHandler;
+    }
+
+    /** @return true between {@link #startPic()} and {@link #endPicture()} */
+    boolean isInPic() {
+        return this.inPic;
+    }
+
+    /** @return true between {@link #startPict()} and {@link #endPicture()} */
+    boolean isInPict() {
+        return this.inPict;
+    }
+
+    /** Marks that a PIC scope is open. */
+    void startPic() {
+        this.inPic = true;
+    }
+
+    /** Marks that a PICT scope is open. */
+    void startPict() {
+        this.inPict = true;
+    }
+
+    /** Records the relationship id found on a BLIP element. */
+    void setBlipRId(String rId) {
+        this.picRId = rId;
+    }
+
+    /** Records the description for the current picture. */
+    void setDescription(String description) {
+        this.picDescription = description;
+    }
+
+    /** Records the relationship id found on an IMAGEDATA element. */
+    void setImageDataRId(String rId) {
+        this.picRId = rId;
+    }
+
+    /** Records the description found on an IMAGEDATA element. */
+    void setImageDataDescription(String description) {
+        this.picDescription = description;
+    }
+
+    /**
+     * Called at end of PIC or PICT element. Looks the recorded relationship
+     * id up in the relationship map (the file name stays null when no id
+     * was recorded), emits the embeddedPicRef callback, and then clears all
+     * picture state.
+     *
+     * @throws SAXException if the callback throws; the state is not
+     *                      cleared in that case
+     */
+    void endPicture() throws SAXException {
+        final String resolvedFileName =
+                (picRId == null) ? null : linkedRelationships.get(picRId);
+        bodyContentsHandler.embeddedPicRef(resolvedFileName, picDescription);
+
+        // ready for the next picture
+        inPic = false;
+        inPict = false;
+        picRId = null;
+        picDescription = null;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java
new file mode 100644
index 0000000000..a45a7d63f5
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.util.Date;
+
+import org.xml.sax.SAXException;
+
+/**
+ * Callback interface through which the OOXML SAX dispatcher reports
+ * structured document events. Implementations turn those events into an
+ * output format (e.g., XHTML, Markdown, plain text).
+ */
+public interface XWPFBodyContentsHandler {
+
+    void run(RunProperties runProperties, String contents)
+            throws SAXException;
+
+    /**
+     * @param link the link; can be null
+     */
+    void hyperlinkStart(String link) throws SAXException;
+
+    /**
+     * Called when a hyperlink is found via a field code (instrText
+     * HYPERLINK). Kept separate from relationship-based hyperlinks for
+     * security tracking purposes. By default it delegates to
+     * {@link #hyperlinkStart(String)}.
+     *
+     * @param link the link URL
+     */
+    default void fieldCodeHyperlinkStart(String link) throws SAXException {
+        hyperlinkStart(link);
+    }
+
+    void hyperlinkEnd() throws SAXException;
+
+    void startParagraph(ParagraphProperties paragraphProperties)
+            throws SAXException;
+
+    void endParagraph() throws SAXException;
+
+    void startTable() throws SAXException;
+
+    void endTable() throws SAXException;
+
+    void startTableRow() throws SAXException;
+
+    void endTableRow() throws SAXException;
+
+    void startTableCell() throws SAXException;
+
+    void endTableCell() throws SAXException;
+
+    void startSDT() throws SAXException;
+
+    void endSDT() throws SAXException;
+
+    void startEditedSection(String editor, Date date, EditType editType)
+            throws SAXException;
+
+    void endEditedSection() throws SAXException;
+
+    boolean isIncludeDeletedText() throws SAXException;
+
+    void footnoteReference(String id) throws SAXException;
+
+    void endnoteReference(String id) throws SAXException;
+
+    boolean isIncludeMoveFromText() throws SAXException;
+
+    void embeddedOLERef(String refId) throws SAXException;
+
+    /**
+     * Called when a linked (vs embedded) OLE object is found.
+     * Linked objects reference external files and are a security concern.
+     */
+    void linkedOLERef(String refId) throws SAXException;
+
+    void embeddedPicRef(String picFileName, String picDescription)
+            throws SAXException;
+
+    void startBookmark(String id, String name) throws SAXException;
+
+    void endBookmark(String id) throws SAXException;
+
+    /**
+     * Called when an external reference URL is found in a field code.
+     * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK fields,
+     * and DrawingML/VML hyperlinks on shapes.
+     *
+     * @param fieldType the type of field (e.g., "INCLUDEPICTURE",
+     *                  "hlinkHover", "vml-href")
+     * @param url the external URL
+     */
+    default void externalRef(String fieldType, String url)
+            throws SAXException {
+        // Default no-op implementation for backward compatibility
+    }
+}

Reply via email to