This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1bebbd83fbd84a7bb844138240a03a1dff2c4eea Author: tallison <[email protected]> AuthorDate: Wed Mar 18 08:01:11 2026 -0400 improve sax ooxml - footnotes and endnotes - git add - WIP --- .../microsoft/ooxml/OOXMLInlineBodyPartMap.java | 78 ++++++++ .../microsoft/ooxml/OOXMLPartContentCollector.java | 202 +++++++++++++++++++++ 2 files changed, 280 insertions(+) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLInlineBodyPartMap.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLInlineBodyPartMap.java new file mode 100644 index 0000000000..3b13d910ea --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLInlineBodyPartMap.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Holds pre-parsed XML content fragments for OOXML document parts that are
 * referenced inline from the main document body. Each map stores
 * ID to raw XML bytes for a specific part type.
 * <p>
 * Used for footnotes, endnotes, and comments so that their content can be
 * inlined at the point of reference rather than dumped at the end.
 * <p>
 * A {@code null} map handed to the constructor is treated as an empty map, so
 * the lookup and {@code has*} methods never throw because a part was absent.
 * Non-null maps are stored by reference (no defensive copy is made).
 */
class OOXMLInlineBodyPartMap {

    /** Shared instance that contains no footnotes, endnotes, comments or relationships. */
    static final OOXMLInlineBodyPartMap EMPTY = new OOXMLInlineBodyPartMap(
            Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap(),
            Collections.emptyMap());

    private final Map<String, byte[]> footnotes;
    private final Map<String, byte[]> endnotes;
    private final Map<String, byte[]> comments;
    private final Map<String, String> linkedRelationships;

    /**
     * @param footnotes           footnote id to raw XML bytes; {@code null} means none
     * @param endnotes            endnote id to raw XML bytes; {@code null} means none
     * @param comments            comment id to raw XML bytes; {@code null} means none
     * @param linkedRelationships relationship map exposed unchanged through
     *                            {@link #getLinkedRelationships()}; {@code null} means none
     */
    OOXMLInlineBodyPartMap(Map<String, byte[]> footnotes,
                           Map<String, byte[]> endnotes,
                           Map<String, byte[]> comments,
                           Map<String, String> linkedRelationships) {
        this.footnotes = orEmpty(footnotes);
        this.endnotes = orEmpty(endnotes);
        this.comments = orEmpty(comments);
        this.linkedRelationships = orEmpty(linkedRelationships);
    }

    /** Substitutes an immutable empty map for {@code null}; returns any other map as is. */
    private static <V> Map<String, V> orEmpty(Map<String, V> map) {
        return map == null ? Collections.<String, V>emptyMap() : map;
    }

    /**
     * @return the relationship map given to the constructor, or an empty map
     *         if {@code null} was given; never {@code null}
     */
    Map<String, String> getLinkedRelationships() {
        return linkedRelationships;
    }

    /**
     * @param id footnote id to look up
     * @return the stored bytes for that id, or {@code null} if there is no entry
     */
    byte[] getFootnote(String id) {
        return footnotes.get(id);
    }

    /**
     * @param id endnote id to look up
     * @return the stored bytes for that id, or {@code null} if there is no entry
     */
    byte[] getEndnote(String id) {
        return endnotes.get(id);
    }

    /**
     * @param id comment id to look up
     * @return the stored bytes for that id, or {@code null} if there is no entry
     */
    byte[] getComment(String id) {
        return comments.get(id);
    }

    /** @return {@code true} if at least one footnote is stored */
    boolean hasFootnotes() {
        return !footnotes.isEmpty();
    }

    /** @return {@code true} if at least one endnote is stored */
    boolean hasEndnotes() {
        return !endnotes.isEmpty();
    }

    /** @return {@code true} if at least one comment is stored */
    boolean hasComments() {
        return !comments.isEmpty();
    }
}
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java new file mode 100644 index 0000000000..e5dca8665e --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.io.ByteArrayOutputStream; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Generic SAX handler that collects raw XML content by ID from OOXML part files. + * Works with any part that contains wrapper elements with {@code w:id} attributes + * containing body content (paragraphs, tables, formatting, etc.). 
/**
 * Generic SAX handler that collects raw XML content by ID from OOXML part files.
 * Works with any part that contains wrapper elements with {@code w:id} attributes
 * containing body content (paragraphs, tables, formatting, etc.).
 * <p>
 * Used for:
 * <ul>
 *   <li>footnotes.xml - wrapper element "footnote"</li>
 *   <li>endnotes.xml - wrapper element "endnote"</li>
 *   <li>comments.xml - wrapper element "comment"</li>
 * </ul>
 * <p>
 * IDs "0" and "-1" are skipped (these are separator/continuation elements in
 * footnotes/endnotes).
 * <p>
 * For every collected wrapper the children are re-serialised as UTF-8 bytes
 * inside a synthetic {@code <w:body>} element. That element carries the
 * namespace declarations seen before the wrapper's content started; prefixes
 * declared on elements nested inside the wrapper are written on the element
 * that declares them.
 */
class OOXMLPartContentCollector extends DefaultHandler {

    private static final String W_NS =
            "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    private final Set<String> wrapperElementNames;
    private final Map<String, byte[]> contentMap = new HashMap<>();
    /** Prefix to URI for declarations seen while no wrapper was being collected. */
    private final Map<String, String> namespaceMappings = new HashMap<>();
    /** Declarations reported for the next nested start tag that has not been written yet. */
    private final Map<String, String> pendingMappings = new HashMap<>();

    /** Id of the wrapper currently being collected, or {@code null} when outside one. */
    private String currentId = null;
    private ByteArrayOutputStream buffer = null;
    /** Number of currently open elements below the wrapper being collected. */
    private int depth = 0;

    /**
     * @param wrapperElementNames local names of wrapper elements to collect
     *                            (e.g., "footnote", "endnote", "comment")
     */
    OOXMLPartContentCollector(Set<String> wrapperElementNames) {
        this.wrapperElementNames = wrapperElementNames;
    }

    /**
     * Records a namespace declaration. SAX reports it before the start of the
     * element that carries it, so a declaration arriving while a wrapper is
     * being collected belongs to the next nested start tag and is queued for
     * it; any other declaration is remembered for the synthetic wrapper tag.
     */
    @Override
    public void startPrefixMapping(String prefix, String uri) {
        if (currentId != null) {
            pendingMappings.put(prefix, uri);
        } else {
            namespaceMappings.put(prefix, uri);
        }
    }

    /**
     * @return the live map of wrapper id to the UTF-8 bytes collected for that wrapper
     */
    Map<String, byte[]> getContentMap() {
        return contentMap;
    }

    /**
     * Inside a wrapper, copies the start tag to the buffer. Outside, starts
     * collecting when the element is a configured wrapper with a usable id.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
                             Attributes atts) throws SAXException {
        if (currentId != null) {
            depth++;
            appendStartTag(localName, qName, atts);
            return;
        }

        if (wrapperElementNames.contains(localName)) {
            String id = atts.getValue(W_NS, "id");
            if (id != null && !"0".equals(id) && !"-1".equals(id)) {
                currentId = id;
                buffer = new ByteArrayOutputStream();
                pendingMappings.clear();
                writeString(buildWrapperOpenTag());
                depth = 0;
            }
        }
    }

    /**
     * Copies the end tag of a nested element, or - when the wrapper itself
     * closes (depth 0) - finishes the fragment and stores it under its id.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (currentId == null) {
            return;
        }

        if (depth == 0) {
            writeString("</w:body>");
            contentMap.put(currentId, buffer.toByteArray());
            currentId = null;
            buffer = null;
            pendingMappings.clear();
            return;
        }

        depth--;
        if (qName != null && !qName.isEmpty()) {
            writeString("</" + qName + ">");
        } else {
            writeString("</" + localName + ">");
        }
    }

    /** Copies character data, XML-escaped, while a wrapper is being collected. */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (currentId != null) {
            writeString(escape(new String(ch, start, length)));
        }
    }

    /** Builds the synthetic {@code <w:body ...>} tag that opens every fragment. */
    private String buildWrapperOpenTag() {
        StringBuilder sb = new StringBuilder("<w:body");
        // include all namespace declarations seen so far in the source document
        for (Map.Entry<String, String> entry : namespaceMappings.entrySet()) {
            appendNamespaceDeclaration(sb, entry.getKey(), entry.getValue());
        }
        // ensure w namespace is present
        if (!namespaceMappings.containsKey("w")) {
            sb.append(" xmlns:w=\"").append(W_NS).append("\"");
        }
        sb.append(">");
        return sb.toString();
    }

    /** Writes the start tag of an element nested inside the current wrapper. */
    private void appendStartTag(String localName, String qName, Attributes atts) {
        String tagName = (qName != null && !qName.isEmpty()) ? qName : localName;
        StringBuilder sb = new StringBuilder();
        sb.append('<').append(tagName);
        // Declarations made on this element: write them unless the parser
        // already reports them as ordinary attributes (namespace-prefixes on).
        for (Map.Entry<String, String> entry : pendingMappings.entrySet()) {
            if (atts.getIndex(declarationName(entry.getKey())) < 0) {
                appendNamespaceDeclaration(sb, entry.getKey(), entry.getValue());
            }
        }
        pendingMappings.clear();
        for (int i = 0; i < atts.getLength(); i++) {
            String attName = atts.getQName(i);
            if (attName == null || attName.isEmpty()) {
                attName = atts.getLocalName(i);
            }
            sb.append(' ').append(attName).append("=\"");
            sb.append(escape(atts.getValue(i)));
            sb.append('"');
        }
        sb.append('>');
        writeString(sb.toString());
    }

    /** Returns {@code xmlns} for the default namespace, {@code xmlns:prefix} otherwise. */
    private static String declarationName(String prefix) {
        return (prefix == null || prefix.isEmpty()) ? "xmlns" : "xmlns:" + prefix;
    }

    /** Appends {@code  xmlns[:prefix]="uri"} with the URI escaped for attribute use. */
    private static void appendNamespaceDeclaration(StringBuilder sb, String prefix,
                                                   String nsUri) {
        sb.append(' ').append(declarationName(prefix)).append("=\"")
                .append(escape(nsUri)).append("\"");
    }

    /** Appends the string to the current fragment buffer as UTF-8. */
    private void writeString(String s) {
        byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
        buffer.write(bytes, 0, bytes.length);
    }

    /**
     * Escapes the four characters that are unsafe in XML text and in
     * double-quoted attribute values.
     *
     * @param s text to escape; may be {@code null}
     * @return the empty string for {@code null}, {@code s} itself when nothing
     *         needs escaping, otherwise a copy with the characters replaced by
     *         their predefined entity references
     */
    static String escape(String s) {
        if (s == null) {
            return "";
        }
        // allocated lazily so the common "nothing to escape" case copies nothing
        StringBuilder sb = null;
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            String replacement = null;
            switch (c) {
                case '&':
                    replacement = "&amp;";
                    break;
                case '<':
                    replacement = "&lt;";
                    break;
                case '>':
                    replacement = "&gt;";
                    break;
                case '"':
                    replacement = "&quot;";
                    break;
                default:
                    if (sb != null) {
                        sb.append(c);
                    }
                    continue;
            }
            if (sb == null) {
                sb = new StringBuilder(s.length() + 16);
                sb.append(s, 0, i);
            }
            sb.append(replacement);
        }
        return sb != null ? sb.toString() : s;
    }
}
