This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5ae759340aeb31e254f2bcf24686fcdfd53ad6ed Author: tallison <[email protected]> AuthorDate: Wed Mar 18 06:45:12 2026 -0400 improve sax ooxml - WIP --- .../tika/parser/microsoft/ooxml/EditType.java | 21 +++ .../parser/microsoft/ooxml/FieldCodeParser.java | 109 ++++++++++++++++ .../ooxml/OOXMLFootnoteEndnoteCollector.java | 145 +++++++++++++++++++++ .../microsoft/ooxml/OOXMLPictureTracker.java | 93 +++++++++++++ .../microsoft/ooxml/XWPFBodyContentsHandler.java | 106 +++++++++++++++ 5 files changed, 474 insertions(+) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EditType.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EditType.java new file mode 100644 index 0000000000..cbc9d1ae86 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EditType.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +public enum EditType { + NONE, INSERT, DELETE, MOVE_TO, MOVE_FROM +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldCodeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldCodeParser.java new file mode 100644 index 0000000000..d71cbdc7f9 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldCodeParser.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Parses OOXML field codes (instrText) to extract URLs from HYPERLINK, + * INCLUDEPICTURE, INCLUDETEXT, IMPORT, and LINK fields. + * <p> + * This class has no Tika dependencies and could be contributed to POI. + */ +public class FieldCodeParser { + + private static final Pattern HYPERLINK_PATTERN = + Pattern.compile("HYPERLINK\\s{1,100}\"([^\"]{1,10000})\"", + Pattern.CASE_INSENSITIVE); + private static final Pattern INCLUDEPICTURE_PATTERN = + Pattern.compile("INCLUDEPICTURE\\s{1,100}\"([^\"]{1,10000})\"", + Pattern.CASE_INSENSITIVE); + private static final Pattern INCLUDETEXT_PATTERN = + Pattern.compile("INCLUDETEXT\\s{1,100}\"([^\"]{1,10000})\"", + Pattern.CASE_INSENSITIVE); + private static final Pattern IMPORT_PATTERN = + Pattern.compile("IMPORT\\s{1,100}\"([^\"]{1,10000})\"", + Pattern.CASE_INSENSITIVE); + private static final Pattern LINK_PATTERN = + Pattern.compile( + "LINK\\s{1,100}[\\w.]{1,50}\\s{1,100}\"([^\"]{1,10000})\"", + Pattern.CASE_INSENSITIVE); + + private FieldCodeParser() { + } + + /** + * Parses a HYPERLINK URL from instrText field code content. + * Field codes like: {@code HYPERLINK "https://example.com"} + * + * @param instrText the accumulated instrText content + * @return the URL if found, or null + */ + public static String parseHyperlinkFromInstrText(String instrText) { + if (instrText == null || instrText.isEmpty()) { + return null; + } + Matcher m = HYPERLINK_PATTERN.matcher(instrText.trim()); + if (m.find()) { + return m.group(1); + } + return null; + } + + /** + * Parses URLs from instrText field codes that reference external resources. + * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, and LINK fields. + * + * @param instrText the accumulated instrText content + * @param fieldType output parameter - will contain the field type if found + * @return the URL if found, or null + */ + public static String parseExternalRefFromInstrText(String instrText, + StringBuilder fieldType) { + if (instrText == null || instrText.isEmpty()) { + return null; + } + String trimmed = instrText.trim(); + + Matcher m = INCLUDEPICTURE_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("INCLUDEPICTURE"); + return m.group(1); + } + + m = INCLUDETEXT_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("INCLUDETEXT"); + return m.group(1); + } + + m = IMPORT_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("IMPORT"); + return m.group(1); + } + + m = LINK_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("LINK"); + return m.group(1); + } + + return null; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java new file mode 100644 index 0000000000..9f7e663bf7 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLFootnoteEndnoteCollector.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.util.HashMap; +import java.util.Map; + +import javax.xml.stream.XMLOutputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamWriter; + +import java.io.StringWriter; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * SAX handler that collects raw XML content for each footnote or endnote + * by ID from footnotes.xml or endnotes.xml. The collected XML can then be + * re-parsed through the main handler when a footnote/endnote reference is + * encountered in the document body. + */ +class OOXMLFootnoteEndnoteCollector extends DefaultHandler { + + private static final String W_NS = + "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + private static final String FOOTNOTE = "footnote"; + private static final String ENDNOTE = "endnote"; + + private final Map<String, byte[]> contentMap = new HashMap<>(); + + private String currentId = null; + private StringWriter currentWriter = null; + private XMLStreamWriter currentXmlWriter = null; + private int depth = 0; + + Map<String, byte[]> getContentMap() { + return contentMap; + } + + @Override + public void startElement(String uri, String localName, String qName, + Attributes atts) throws SAXException { + if (currentId != null) { + depth++; + try { + if (qName != null && !qName.isEmpty()) { + currentXmlWriter.writeStartElement(qName); + } else { + currentXmlWriter.writeStartElement(localName); + } + for (int i = 0; i < atts.getLength(); i++) { + String attQName = atts.getQName(i); + if (attQName != null && !attQName.isEmpty()) { + currentXmlWriter.writeAttribute(attQName, atts.getValue(i)); + } else { + currentXmlWriter.writeAttribute( + atts.getLocalName(i), atts.getValue(i)); + } + } + } catch (XMLStreamException e) { + throw new SAXException(e); + } + return; + } + + if ((FOOTNOTE.equals(localName) || ENDNOTE.equals(localName))) { + String id = atts.getValue(W_NS, "id"); + // skip separator/continuation footnotes (ids 0 and -1) + if (id != null && !id.equals("0") && !id.equals("-1")) { + currentId = id; + currentWriter = new StringWriter(); + try { + currentXmlWriter = XMLOutputFactory.newInstance() + .createXMLStreamWriter(currentWriter); + currentXmlWriter.writeStartDocument(); + // wrap content in a root element + currentXmlWriter.writeStartElement("body"); + } catch (XMLStreamException e) { + throw new SAXException(e); + } + depth = 0; + } + } + } + + @Override + public void endElement(String uri, String localName, String qName) + throws SAXException { + if (currentId == null) { + return; + } + + if (depth == 0) { + // end of the footnote/endnote element itself + try { + currentXmlWriter.writeEndElement(); // close <body> + currentXmlWriter.writeEndDocument(); + currentXmlWriter.flush(); + currentXmlWriter.close(); + } catch (XMLStreamException e) { + throw new SAXException(e); + } + contentMap.put(currentId, + currentWriter.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8)); + currentId = null; + currentWriter = null; + currentXmlWriter = null; + return; + } + + depth--; + try { + currentXmlWriter.writeEndElement(); + } catch (XMLStreamException e) { + throw new SAXException(e); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (currentId != null && currentXmlWriter != null) { + try { + currentXmlWriter.writeCharacters(new String(ch, start, length)); + } catch (XMLStreamException e) { + throw new SAXException(e); + } + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPictureTracker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPictureTracker.java new file mode 100644 index 0000000000..dbd55b9f9f --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPictureTracker.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.util.Map; + +import org.xml.sax.SAXException; + +/** + * Tracks the lifecycle of picture elements (PIC, PICT, BLIP, IMAGEDATA, cNvPr) + * during OOXML SAX parsing and emits embeddedPicRef callbacks when the picture + * scope closes. + * <p> + * This class has no Tika dependencies and could be contributed to POI. + */ +class OOXMLPictureTracker { + + private final Map<String, String> linkedRelationships; + private final XWPFBodyContentsHandler bodyContentsHandler; + + private boolean inPic = false; + private boolean inPict = false; + private String picDescription = null; + private String picRId = null; + + OOXMLPictureTracker(Map<String, String> linkedRelationships, + XWPFBodyContentsHandler bodyContentsHandler) { + this.linkedRelationships = linkedRelationships; + this.bodyContentsHandler = bodyContentsHandler; + } + + boolean isInPic() { + return inPic; + } + + boolean isInPict() { + return inPict; + } + + void startPic() { + inPic = true; + } + + void startPict() { + inPict = true; + } + + void setBlipRId(String rId) { + picRId = rId; + } + + void setDescription(String description) { + picDescription = description; + } + + void setImageDataRId(String rId) { + picRId = rId; + } + + void setImageDataDescription(String description) { + picDescription = description; + } + + /** + * Called at end of PIC or PICT element. Resolves the filename from + * the relationship map and emits the embeddedPicRef callback. + */ + void endPicture() throws SAXException { + String picFileName = null; + if (picRId != null) { + picFileName = linkedRelationships.get(picRId); + } + bodyContentsHandler.embeddedPicRef(picFileName, picDescription); + picDescription = null; + picRId = null; + inPic = false; + inPict = false; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java new file mode 100644 index 0000000000..a45a7d63f5 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.util.Date; + +import org.xml.sax.SAXException; + +/** + * Callback interface for receiving structured document events from the + * OOXML SAX dispatcher. Implementations convert these events into output + * formats (e.g., XHTML, Markdown, plain text). + */ +public interface XWPFBodyContentsHandler { + + void run(RunProperties runProperties, String contents) throws SAXException; + + /** + * @param link the link; can be null + */ + void hyperlinkStart(String link) throws SAXException; + + /** + * Called when a hyperlink is found via a field code (instrText HYPERLINK). + * Distinct from relationship-based hyperlinks for security tracking purposes. + * + * @param link the link URL + */ + default void fieldCodeHyperlinkStart(String link) throws SAXException { + hyperlinkStart(link); + } + + void hyperlinkEnd() throws SAXException; + + void startParagraph(ParagraphProperties paragraphProperties) throws SAXException; + + void endParagraph() throws SAXException; + + void startTable() throws SAXException; + + void endTable() throws SAXException; + + void startTableRow() throws SAXException; + + void endTableRow() throws SAXException; + + void startTableCell() throws SAXException; + + void endTableCell() throws SAXException; + + void startSDT() throws SAXException; + + void endSDT() throws SAXException; + + void startEditedSection(String editor, Date date, EditType editType) throws SAXException; + + void endEditedSection() throws SAXException; + + boolean isIncludeDeletedText() throws SAXException; + + void footnoteReference(String id) throws SAXException; + + void endnoteReference(String id) throws SAXException; + + boolean isIncludeMoveFromText() throws SAXException; + + void embeddedOLERef(String refId) throws SAXException; + + /** + * Called when a linked (vs embedded) OLE object is found. + * These reference external files and are a security concern. + */ + void linkedOLERef(String refId) throws SAXException; + + void embeddedPicRef(String picFileName, String picDescription) throws SAXException; + + void startBookmark(String id, String name) throws SAXException; + + void endBookmark(String id) throws SAXException; + + /** + * Called when an external reference URL is found in a field code. + * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK fields, + * and DrawingML/VML hyperlinks on shapes. + * + * @param fieldType the type of field (e.g., "INCLUDEPICTURE", "hlinkHover", "vml-href") + * @param url the external URL + */ + default void externalRef(String fieldType, String url) throws SAXException { + // Default no-op implementation for backward compatibility + } +}
