This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4708-refactor-xlsx in repository https://gitbox.apache.org/repos/asf/tika.git
commit fef22d5cc30affe78ea5718c072a17b2aac96907 Author: tallison <[email protected]> AuthorDate: Fri Apr 3 07:14:44 2026 -0400 refactor xlsx - WIP --- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 5 +- .../parser/microsoft/ooxml/OOXMLExtractor.java | 3 +- .../microsoft/ooxml/OOXMLExtractorFactory.java | 7 +- .../ooxml/SXWPFWordExtractorDecorator.java | 3 +- .../microsoft/ooxml/TikaSheetXMLHandler.java | 405 +++++++++++++++++++++ .../ooxml/XSLFPowerPointExtractorDecorator.java | 3 +- .../ooxml/XSSFBExcelExtractorDecorator.java | 13 +- .../ooxml/XSSFExcelExtractorDecorator.java | 226 ++++++------ .../microsoft/ooxml/XSSFSharedStringsShim.java | 156 ++++++++ .../parser/microsoft/ooxml/XSSFStylesShim.java | 146 ++++++++ .../ooxml/XWPFWordExtractorDecorator.java | 35 +- .../microsoft/ooxml/xps/XPSTextExtractor.java | 10 +- .../xslf/XSLFEventBasedPowerPointExtractor.java | 9 +- .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 3 +- 14 files changed, 875 insertions(+), 149 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 70d5920800..dd7c5eafaf 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -46,7 +46,6 @@ import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xssf.usermodel.XSSFRelation; import org.apache.poi.xwpf.usermodel.XWPFRelation; -import org.apache.xmlbeans.XmlException; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -140,7 +139,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { * ParseContext) */ public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context) - throws SAXException, XmlException, IOException, TikaException { + throws SAXException, IOException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); xhtml.startDocument(); @@ -527,7 +526,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { * Populates the {@link XHTMLContentHandler} object received as parameter. */ protected abstract void buildXHTML(XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException; + throws SAXException, IOException; /** * Return a list of the main parts of the document, used diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java index 673776404c..5c37c9c7b5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java @@ -19,7 +19,6 @@ package org.apache.tika.parser.microsoft.ooxml; import java.io.IOException; import org.apache.poi.ooxml.POIXMLDocument; -import org.apache.xmlbeans.XmlException; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -53,5 +52,5 @@ public interface OOXMLExtractor { * given content handler. */ void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context) - throws SAXException, XmlException, IOException, TikaException; + throws SAXException, IOException, TikaException; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 8558f37c21..f407ffeb62 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -37,7 +37,6 @@ import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFRelation; -import org.apache.xmlbeans.XmlException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; @@ -202,7 +201,7 @@ public class OOXMLExtractorFactory { } else { throw new TikaException("Error creating OOXML extractor", e); } - } catch (OpenXML4JException | XmlException e) { + } catch (OpenXML4JException e) { throw new TikaException("Error creating OOXML extractor", e); } catch (RuntimeSAXException e) { throw (SAXException) e.getCause(); @@ -210,7 +209,7 @@ public class OOXMLExtractorFactory { } private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) - throws TikaException, XmlException, OpenXML4JException, IOException { + throws TikaException, OpenXML4JException, IOException { PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType( "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"); if (packageRelationshipCollection.size() == 0) { @@ -235,7 +234,7 @@ public class OOXMLExtractorFactory { } private static POIXMLTextExtractor tryXSLF(OPCPackage pkg, boolean eventBased) - throws TikaException, XmlException, OpenXML4JException, IOException { + throws TikaException, OpenXML4JException, IOException { PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType( "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index 5fc7d9a846..393370662b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -34,7 +34,6 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.openxml4j.opc.TargetMode; import org.apache.poi.xssf.usermodel.XSSFRelation; import org.apache.poi.xwpf.usermodel.XWPFRelation; -import org.apache.xmlbeans.XmlException; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; @@ -117,7 +116,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { @Override protected void buildXHTML(XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { //handle main document List<PackagePart> pps = getStoryDocumentParts(); if (pps != null) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java new file mode 100644 index 0000000000..c7276e92a5 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java @@ -0,0 +1,405 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Queue; + +import org.apache.poi.ss.usermodel.BuiltinFormats; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.util.CellAddress; +import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; +import org.apache.poi.xssf.model.Comments; +import org.apache.poi.xssf.usermodel.XSSFComment; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Sheet XML handler for XLSX event-based parsing that uses {@link XSSFStylesShim} + * instead of POI's XMLBeans-dependent {@code StylesTable}. + * <p> + * Adapted from Apache POI's {@code XSSFSheetXMLHandler} (Apache 2.0 license). + * The only structural change is replacing the {@code Styles}/{@code XSSFCellStyle} + * lookup with a direct call to our SAX-based styles shim for format resolution. + */ +class TikaSheetXMLHandler extends DefaultHandler { + + private static final Logger LOG = LoggerFactory.getLogger(TikaSheetXMLHandler.class); + + private static final String NS_SPREADSHEETML = + "http://schemas.openxmlformats.org/spreadsheetml/2006/main"; + + enum XssfDataType { + BOOLEAN, + ERROR, + FORMULA, + INLINE_STRING, + SST_STRING, + NUMBER, + } + + private final XSSFStylesShim stylesShim; + private final Comments comments; + private final XSSFSharedStringsShim sharedStringsShim; + private final SheetContentsHandler output; + private final DataFormatter formatter; + private final boolean formulasNotResults; + + private boolean vIsOpen; + private boolean fIsOpen; + private boolean isIsOpen; + private boolean hfIsOpen; + + private XssfDataType nextDataType; + private short formatIndex; + private String formatString; + + private int rowNum; + private int nextRowNum; + private String cellRef; + + private final StringBuilder value = new StringBuilder(64); + private final StringBuilder formula = new StringBuilder(64); + private final StringBuilder headerFooter = new StringBuilder(64); + + private Queue<CellAddress> commentCellRefs; + + TikaSheetXMLHandler(XSSFStylesShim stylesShim, + Comments comments, + XSSFSharedStringsShim sharedStringsShim, + SheetContentsHandler sheetContentsHandler, + DataFormatter dataFormatter, + boolean formulasNotResults) { + this.stylesShim = stylesShim; + this.comments = comments; + this.sharedStringsShim = sharedStringsShim; + this.output = sheetContentsHandler; + this.formatter = dataFormatter; + this.formulasNotResults = formulasNotResults; + this.nextDataType = XssfDataType.NUMBER; + initComments(comments); + } + + TikaSheetXMLHandler(XSSFStylesShim stylesShim, + XSSFSharedStringsShim sharedStringsShim, + SheetContentsHandler sheetContentsHandler, + DataFormatter dataFormatter, + boolean formulasNotResults) { + this(stylesShim, null, sharedStringsShim, sheetContentsHandler, dataFormatter, + formulasNotResults); + } + + private void initComments(Comments commentsTable) { + if (commentsTable != null) { + commentCellRefs = new LinkedList<>(); + for (Iterator<CellAddress> iter = commentsTable.getCellAddresses(); + iter.hasNext(); ) { + commentCellRefs.add(iter.next()); + } + } + } + + private boolean isTextTag(String name) { + if ("v".equals(name)) { + return true; + } + if ("inlineStr".equals(name)) { + return true; + } + return "t".equals(name) && isIsOpen; + } + + @Override + public void startElement(String uri, String localName, String qName, + Attributes attributes) throws SAXException { + if (uri != null && !uri.equals(NS_SPREADSHEETML)) { + return; + } + + if (isTextTag(localName)) { + vIsOpen = true; + if (!isIsOpen) { + value.setLength(0); + } + } else if ("is".equals(localName)) { + isIsOpen = true; + } else if ("f".equals(localName)) { + formula.setLength(0); + if (this.nextDataType == XssfDataType.NUMBER) { + this.nextDataType = XssfDataType.FORMULA; + } + String type = attributes.getValue("t"); + if (type != null && type.equals("shared")) { + String ref = attributes.getValue("ref"); + if (ref != null) { + fIsOpen = true; + } else { + if (formulasNotResults) { + LOG.warn("shared formulas not yet supported!"); + } + } + } else { + fIsOpen = true; + } + } else if ("oddHeader".equals(localName) || "evenHeader".equals(localName) || + "firstHeader".equals(localName) || "firstFooter".equals(localName) || + "oddFooter".equals(localName) || "evenFooter".equals(localName)) { + hfIsOpen = true; + headerFooter.setLength(0); + } else if ("row".equals(localName)) { + String rowNumStr = attributes.getValue("r"); + if (rowNumStr != null) { + rowNum = Integer.parseInt(rowNumStr.trim()) - 1; + } else { + rowNum = nextRowNum; + } + output.startRow(rowNum); + } else if ("c".equals(localName)) { + // Cell element — resolve style to format index/string + this.formula.setLength(0); + this.nextDataType = XssfDataType.NUMBER; + this.formatIndex = -1; + this.formatString = null; + cellRef = attributes.getValue("r"); + String cellType = attributes.getValue("t"); + String cellStyleStr = attributes.getValue("s"); + + if ("b".equals(cellType)) { + nextDataType = XssfDataType.BOOLEAN; + } else if ("e".equals(cellType)) { + nextDataType = XssfDataType.ERROR; + } else if ("inlineStr".equals(cellType)) { + nextDataType = XssfDataType.INLINE_STRING; + } else if ("s".equals(cellType)) { + nextDataType = XssfDataType.SST_STRING; + } else if ("str".equals(cellType)) { + nextDataType = XssfDataType.FORMULA; + } else { + // Number — resolve format via our styles shim + if (stylesShim != null) { + int styleIndex; + if (cellStyleStr != null) { + styleIndex = Integer.parseInt(cellStyleStr.trim()); + } else if (stylesShim.getNumCellStyles() > 0) { + styleIndex = 0; + } else { + styleIndex = -1; + } + if (styleIndex >= 0) { + this.formatIndex = stylesShim.getFormatIndex(styleIndex); + this.formatString = stylesShim.getFormatString(styleIndex); + if (this.formatString == null) { + this.formatString = + BuiltinFormats.getBuiltinFormat(this.formatIndex); + } + } + } + } + } + } + + @Override + public void endElement(String uri, String localName, String qName) + throws SAXException { + if (uri != null && !uri.equals(NS_SPREADSHEETML)) { + return; + } + + if (isTextTag(localName)) { + vIsOpen = false; + if (!isIsOpen) { + outputCell(); + value.setLength(0); + } + } else if ("f".equals(localName)) { + fIsOpen = false; + } else if ("is".equals(localName)) { + isIsOpen = false; + outputCell(); + value.setLength(0); + } else if ("row".equals(localName)) { + checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_ROW); + output.endRow(rowNum); + nextRowNum = rowNum + 1; + } else if ("sheetData".equals(localName)) { + checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_SHEET_DATA); + output.endSheet(); + } else if ("oddHeader".equals(localName) || "evenHeader".equals(localName) || + "firstHeader".equals(localName)) { + hfIsOpen = false; + output.headerFooter(headerFooter.toString(), true, localName); + } else if ("oddFooter".equals(localName) || "evenFooter".equals(localName) || + "firstFooter".equals(localName)) { + hfIsOpen = false; + output.headerFooter(headerFooter.toString(), false, localName); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (vIsOpen) { + value.append(ch, start, length); + } + if (fIsOpen) { + formula.append(ch, start, length); + } + if (hfIsOpen) { + headerFooter.append(ch, start, length); + } + } + + private void outputCell() { + String thisStr = null; + + if (formulasNotResults && formula.length() > 0) { + thisStr = formula.toString(); + } else { + switch (nextDataType) { + case BOOLEAN: + char first = value.charAt(0); + thisStr = first == '0' ? "FALSE" : "TRUE"; + break; + case ERROR: + thisStr = "ERROR:" + value; + break; + case FORMULA: + if (formulasNotResults) { + thisStr = formula.toString(); + } else { + String fv = value.toString(); + if (this.formatString != null) { + try { + double d = Double.parseDouble(fv.trim()); + thisStr = formatter.formatRawCellContents( + d, this.formatIndex, this.formatString); + } catch (Exception e) { + thisStr = fv; + } + } else { + thisStr = fv; + } + } + break; + case INLINE_STRING: + thisStr = value.toString(); + break; + case SST_STRING: + String sstIndex = value.toString().trim(); + if (!sstIndex.isEmpty()) { + try { + int idx = Integer.parseInt(sstIndex); + thisStr = sharedStringsShim.getItemAt(idx); + } catch (NumberFormatException ex) { + LOG.error("Failed to parse SST index '{}'", sstIndex, ex); + } + } + break; + case NUMBER: + String n = value.toString(); + if (this.formatString != null && !n.isEmpty()) { + try { + thisStr = formatter.formatRawCellContents( + Double.parseDouble(n.trim()), + this.formatIndex, this.formatString); + } catch (Exception e) { + thisStr = n; + } + } else { + thisStr = n; + } + break; + default: + thisStr = "(TODO: Unexpected type: " + nextDataType + ")"; + break; + } + } + + checkForEmptyCellComments(EmptyCellCommentsCheckType.CELL); + XSSFComment comment = comments != null ? + comments.findCellComment(new CellAddress(cellRef)) : null; + output.cell(cellRef, thisStr, comment); + } + + private void checkForEmptyCellComments(EmptyCellCommentsCheckType type) { + if (commentCellRefs != null && !commentCellRefs.isEmpty()) { + if (type == EmptyCellCommentsCheckType.END_OF_SHEET_DATA) { + while (!commentCellRefs.isEmpty()) { + outputEmptyCellComment(commentCellRefs.remove()); + } + return; + } + + if (this.cellRef == null) { + if (type == EmptyCellCommentsCheckType.END_OF_ROW) { + while (!commentCellRefs.isEmpty()) { + if (commentCellRefs.peek().getRow() == rowNum) { + outputEmptyCellComment(commentCellRefs.remove()); + } else { + return; + } + } + return; + } else { + throw new IllegalStateException( + "Cell ref should be null only if there are only empty " + + "cells in the row; rowNum: " + rowNum); + } + } + + CellAddress nextCommentCellRef; + do { + CellAddress cellAddr = new CellAddress(this.cellRef); + CellAddress peekCellRef = commentCellRefs.peek(); + if (type == EmptyCellCommentsCheckType.CELL && + cellAddr.equals(peekCellRef)) { + commentCellRefs.remove(); + return; + } else { + int comparison = peekCellRef.compareTo(cellAddr); + if (comparison > 0 && + type == EmptyCellCommentsCheckType.END_OF_ROW && + peekCellRef.getRow() <= rowNum) { + nextCommentCellRef = commentCellRefs.remove(); + outputEmptyCellComment(nextCommentCellRef); + } else if (comparison < 0 && + type == EmptyCellCommentsCheckType.CELL && + peekCellRef.getRow() <= rowNum) { + nextCommentCellRef = commentCellRefs.remove(); + outputEmptyCellComment(nextCommentCellRef); + } else { + nextCommentCellRef = null; + } + } + } while (nextCommentCellRef != null && !commentCellRefs.isEmpty()); + } + } + + private void outputEmptyCellComment(CellAddress cellRef) { + XSSFComment comment = comments.findCellComment(cellRef); + output.cell(cellRef.formatAsString(), null, comment); + } + + private enum EmptyCellCommentsCheckType { + CELL, + END_OF_ROW, + END_OF_SHEET_DATA + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java index d292f5c571..fb9876fce8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java @@ -56,7 +56,6 @@ import org.apache.poi.xslf.usermodel.XSLFTableRow; import org.apache.poi.xslf.usermodel.XSLFTextParagraph; import org.apache.poi.xslf.usermodel.XSLFTextRun; import org.apache.poi.xslf.usermodel.XSLFTextShape; -import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlObject; import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor; import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthorList; @@ -356,7 +355,7 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { PackagePart slidePart; try { slidePart = document.getSlidePart(ctSlide); - } catch (IOException | XmlException e) { + } catch (Exception e) { throw new TikaException("Broken OOXML file", e); } addSlideParts(slidePart, parts); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java index 51a30cdc96..0db32f067a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java @@ -18,7 +18,6 @@ package org.apache.tika.parser.microsoft.ooxml; import java.io.IOException; import java.io.InputStream; -import java.util.List; import java.util.Locale; import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; @@ -32,8 +31,6 @@ import org.apache.poi.xssf.binary.XSSFBStylesTable; import org.apache.poi.xssf.eventusermodel.XSSFBReader; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; -import org.apache.poi.xssf.usermodel.XSSFShape; -import org.apache.xmlbeans.XmlException; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -61,7 +58,7 @@ public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator { @Override public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context) - throws SAXException, XmlException, IOException, TikaException { + throws SAXException, IOException, TikaException { this.metadata = metadata; this.parseContext = context; @@ -75,7 +72,7 @@ public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator { */ @Override protected void buildXHTML(XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { OPCPackage container = extractor.getPackage(); XSSFBSharedStringsTable strings; @@ -92,7 +89,7 @@ public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator { iter = (XSSFBReader.SheetIterator) xssfReader.getSheetsData(); strings = new XSSFBSharedStringsTable(container); } catch (OpenXML4JException e) { - throw new XmlException(e); + throw new IOException(e); } while (iter.hasNext()) { @@ -126,9 +123,7 @@ public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator { for (String footer : sheetExtractor.footers) { extractHeaderFooter(footer, xhtml); } - List<XSSFShape> shapes = iter.getShapes(); - - processShapes(shapes, xhtml); + processDrawings(sheetPart, xhtml); //for now dump sheet hyperlinks at bottom of page //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index e25bdf6d09..fab2601e4a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -21,11 +21,9 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.Set; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; @@ -42,24 +40,12 @@ import org.apache.poi.openxml4j.opc.TargetMode; import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.usermodel.HeaderFooter; import org.apache.poi.ss.util.CellReference; -import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; import org.apache.poi.xssf.eventusermodel.XSSFReader; -import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.model.Comments; -import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.usermodel.XSSFComment; -import org.apache.poi.xssf.usermodel.XSSFDrawing; -import org.apache.poi.xssf.usermodel.XSSFRelation; -import org.apache.poi.xssf.usermodel.XSSFShape; -import org.apache.poi.xssf.usermodel.XSSFSimpleShape; import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper; -import org.apache.xmlbeans.XmlException; -import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink; -import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps; -import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape; -import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; @@ -92,6 +78,18 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { // Power Query stores data in customData parts private static final String POWER_QUERY_CONTENT_TYPE = "application/vnd.ms-excel.customDataProperties+xml"; + private static final String RELATION_DRAWING = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing"; + private static final String RELATION_CHART = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart"; + private static final String RELATION_HYPERLINK = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"; + private static final String NS_DRAWING_ML = + "http://schemas.openxmlformats.org/drawingml/2006/main"; + private static final String NS_RELATIONSHIPS = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships"; + private static final String RELATION_VML_DRAWING = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/vmlDrawing"; /** * Allows access to headers/footers from raw xml strings @@ -135,7 +133,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { @Override public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context) - throws SAXException, XmlException, IOException, TikaException { + throws SAXException, IOException, TikaException { this.metadata = metadata; this.parseContext = context; @@ -149,21 +147,22 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { */ @Override protected void buildXHTML(XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { OPCPackage container = extractor.getPackage(); - ReadOnlySharedStringsTable strings; + XSSFSharedStringsShim stringsShim; XSSFReader.SheetIterator iter; XSSFReader xssfReader; - StylesTable styles; + XSSFStylesShim stylesShim; try { xssfReader = new XSSFReader(container); - styles = xssfReader.getStylesTable(); + stylesShim = new XSSFStylesShim(xssfReader.getStylesData(), parseContext); iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); - strings = new ReadOnlySharedStringsTable(container, config.isConcatenatePhoneticRuns()); - } catch (OpenXML4JException e) { - throw new XmlException(e); + stringsShim = new XSSFSharedStringsShim(xssfReader.getSharedStringsData(), + config.isConcatenatePhoneticRuns(), parseContext); + } catch (OpenXML4JException | TikaException e) { + throw new IOException(e); } while (iter.hasNext()) { SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config, xhtml); @@ -187,7 +186,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { xhtml.startElement("table"); xhtml.startElement("tbody"); - processSheet(sheetExtractor, comments, styles, strings, stream); + processSheet(sheetExtractor, comments, stylesShim, stringsShim, stream); try { getThreadedComments(container, sheetPart, xhtml); } catch (InvalidFormatException | TikaException | IOException e) { @@ -209,8 +208,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { // Do text held in shapes, if required if (config.isIncludeShapeBasedContent()) { - List<XSSFShape> shapes = iter.getShapes(); - processShapes(shapes, xhtml); + processDrawings(sheetPart, xhtml); } //for now dump sheet hyperlinks at bottom of page @@ -669,7 +667,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { protected void addDrawingHyperLinks(PackagePart sheetPart) { try { for (PackageRelationship rel : sheetPart - .getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) { + .getRelationshipsByType(RELATION_DRAWING)) { if (rel.getTargetMode() == TargetMode.INTERNAL) { PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); PackagePart part = rel.getPackage().getPart(relName); @@ -678,7 +676,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { continue; } for (PackageRelationship drawRel : part - .getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) { + .getRelationshipsByType(RELATION_HYPERLINK)) { drawingHyperlinks.put(drawRel.getId(), drawRel.getTargetURI().toString()); } } @@ -696,7 +694,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { throws SAXException { try { for (PackageRelationship rel : sheetPart - .getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) { + .getRelationshipsByType(RELATION_HYPERLINK)) { xhtml.startElement("a", "href", rel.getTargetURI().toString()); xhtml.characters(rel.getTargetURI().toString()); xhtml.endElement("a"); @@ -713,101 +711,124 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } - protected void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) + protected void processDrawings(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException { - if (shapes == null) { - return; - } - //We don't currently have an obvious way to get drawings - //directly from sheetIter. Therefore, we grab the shapes and process those. - //To get the diagrams and charts, we need to get the parent drawing for each - //shape, and we need to make sure that we only process each parent shape once! - //SEE TIKA-2703 TODO: add unit test - Set<String> seenParentDrawings = new HashSet<>(); - for (XSSFShape shape : shapes) { - if (shape instanceof XSSFSimpleShape) { - String sText = ((XSSFSimpleShape) shape).getText(); - if (sText != null && sText.length() > 0) { - xhtml.element("p", sText); + try { + for (PackageRelationship rel : sheetPart + .getRelationshipsByType(RELATION_DRAWING)) { + if (rel.getTargetMode() != TargetMode.INTERNAL) { + continue; } - extractHyperLinksFromShape(((XSSFSimpleShape) shape).getCTShape(), xhtml); - } - - XSSFDrawing parentDrawing = shape.getDrawing(); - if (parentDrawing != null) { - if (!seenParentDrawings - .contains(parentDrawing.getPackagePart().getPartName().toString())) { - //dump diagram data - handleGeneralTextContainingPart(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA, - "diagram-data", parentDrawing.getPackagePart(), metadata, - new OOXMLWordAndPowerPointTextHandler( - new OOXMLTikaBodyPartHandler(xhtml), - new HashMap<>()//empty - )); - //dump chart data - handleGeneralTextContainingPart(XSSFRelation.CHART.getRelation(), "chart", - parentDrawing.getPackagePart(), metadata, - new OOXMLWordAndPowerPointTextHandler( - new OOXMLTikaBodyPartHandler(xhtml), - new HashMap<>()//empty - )); + PackagePartName relName = + PackagingURIHelper.createPartName(rel.getTargetURI()); + PackagePart drawingPart = rel.getPackage().getPart(relName); + if (drawingPart == null) { + continue; } - seenParentDrawings.add(parentDrawing.getPackagePart().getPartName().toString()); + // SAX-parse drawing XML for shape text and hyperlinks + try (InputStream is = drawingPart.getInputStream()) { + XMLReaderUtils.parseSAX(is, + new DrawingShapeHandler(xhtml, drawingHyperlinks), + parseContext); + } catch (IOException | TikaException e) { + //swallow + } + // Process diagram and chart data through drawing part relationships + handleGeneralTextContainingPart( + AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA, + "diagram-data", drawingPart, metadata, + new OOXMLWordAndPowerPointTextHandler( + new OOXMLTikaBodyPartHandler(xhtml), + new HashMap<>())); + handleGeneralTextContainingPart(RELATION_CHART, "chart", + drawingPart, metadata, + new OOXMLWordAndPowerPointTextHandler( + new OOXMLTikaBodyPartHandler(xhtml), + new HashMap<>())); } + } catch (InvalidFormatException e) { + //swallow } } - private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) - throws SAXException { - - if (ctShape == null) { - return; - } + /** + * SAX handler for drawing XML that extracts shape text and hyperlinks + * without requiring XMLBeans or the POI usermodel (XSSFShape, etc.). + */ + private static class DrawingShapeHandler extends DefaultHandler { - CTShapeNonVisual nvSpPR = ctShape.getNvSpPr(); - if (nvSpPR == null) { - return; - } + private final XHTMLContentHandler xhtml; + private final Map<String, String> hyperlinks; - CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr(); - if (cNvPr == null) { - return; - } + private boolean inTxBody; + private boolean inT; + private final StringBuilder textBuffer = new StringBuilder(); + private final StringBuilder shapeText = new StringBuilder(); - CTHyperlink ctHyperlink = cNvPr.getHlinkClick(); - if (ctHyperlink == null) { - return; + DrawingShapeHandler(XHTMLContentHandler xhtml, Map<String, String> hyperlinks) { + this.xhtml = xhtml; + this.hyperlinks = hyperlinks; } - String url = drawingHyperlinks.get(ctHyperlink.getId()); - if (url != null) { - xhtml.startElement("a", "href", url); - xhtml.characters(url); - xhtml.endElement("a"); + @Override + public void startElement(String uri, String localName, String qName, + Attributes atts) throws SAXException { + if ("txBody".equals(localName)) { + inTxBody = true; + shapeText.setLength(0); + } else if ("t".equals(localName) && inTxBody) { + inT = true; + textBuffer.setLength(0); + } else if ("hlinkClick".equals(localName) || "hlinkHover".equals(localName)) { + String rId = atts.getValue(NS_RELATIONSHIPS, "id"); + if (rId == null) { + // try non-namespace-aware fallback + rId = atts.getValue("r:id"); + } + if (rId != null) { + String url = hyperlinks.get(rId); + if (url != null) { + xhtml.startElement("a", "href", url); + xhtml.characters(url); + xhtml.endElement("a"); + } + } + } } - CTHyperlink ctHoverHyperlink = cNvPr.getHlinkHover(); - if (ctHoverHyperlink == null) { - return; + @Override + public void endElement(String uri, String localName, String qName) + throws SAXException { + if ("t".equals(localName) && inT) { + inT = false; + shapeText.append(textBuffer); + } else if ("p".equals(localName) && inTxBody && + shapeText.length() > 0) { + shapeText.append('\n'); + } else if ("txBody".equals(localName)) { + inTxBody = false; + String text = shapeText.toString().trim(); + if (!text.isEmpty()) { + xhtml.element("p", text); + } + } } - url = drawingHyperlinks.get(ctHoverHyperlink.getId()); - if (url != null) { - xhtml.startElement("a", "href", url); - xhtml.characters(url); - xhtml.endElement("a"); + @Override + public void characters(char[] ch, int start, int length) { + if (inT) { + textBuffer.append(ch, start, length); + } } - } public void processSheet(SheetContentsHandler sheetContentsHandler, Comments comments, - StylesTable styles, ReadOnlySharedStringsTable strings, + XSSFStylesShim stylesShim, XSSFSharedStringsShim stringsShim, InputStream sheetInputStream) throws IOException, SAXException { try { - XSSFSheetInterestingPartsCapturer handler = new XSSFSheetInterestingPartsCapturer( - new XSSFSheetXMLHandler(styles, comments, strings, sheetContentsHandler, - formatter, false)); + new TikaSheetXMLHandler(stylesShim, comments, stringsShim, + sheetContentsHandler, formatter, false)); XMLReaderUtils.parseSAX(sheetInputStream, handler, parseContext); sheetInputStream.close(); @@ -825,6 +846,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } + /** * In Excel files, sheets have things embedded in them, * and sheet drawings which have the images @@ -839,7 +861,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { // If it has drawings, return those too try { for (PackageRelationship rel : part - .getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) { + .getRelationshipsByType(RELATION_DRAWING)) { if (rel.getTargetMode() == TargetMode.INTERNAL) { PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); @@ -847,7 +869,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } for (PackageRelationship rel : part - .getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) { + .getRelationshipsByType(RELATION_VML_DRAWING)) { if (rel.getTargetMode() == TargetMode.INTERNAL) { PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFSharedStringsShim.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFSharedStringsShim.java new file mode 100644 index 0000000000..8556d0fbb3 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFSharedStringsShim.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.utils.XMLReaderUtils; + +/** + * SAX-based shim that replaces POI's {@code ReadOnlySharedStringsTable} + * for XLSX event-based parsing. + * <p> + * Parses {@code xl/sharedStrings.xml} and stores each shared string entry + * as a plain {@code String}, avoiding the XMLBeans dependency that + * {@code XSSFRichTextString} requires. Rich text runs within a single + * {@code <si>} are concatenated into a single string. + */ +class XSSFSharedStringsShim { + + private final List<String> strings; + private final boolean includePhoneticRuns; + + XSSFSharedStringsShim(InputStream sharedStringsData, + boolean includePhoneticRuns, + ParseContext parseContext) + throws IOException, SAXException, TikaException { + this.includePhoneticRuns = includePhoneticRuns; + SharedStringsHandler handler = new SharedStringsHandler(); + if (sharedStringsData != null) { + try { + XMLReaderUtils.parseSAX(sharedStringsData, handler, parseContext); + } finally { + sharedStringsData.close(); + } + } + this.strings = handler.strings; + } + + String getItemAt(int idx) { + return strings.get(idx); + } + + int getCount() { + return strings.size(); + } + + private class SharedStringsHandler extends DefaultHandler { + + private static final String NS = + "http://schemas.openxmlformats.org/spreadsheetml/2006/main"; + + final List<String> strings = new ArrayList<>(); + private StringBuilder characters; + private boolean tIsOpen; + private boolean inRPh; + + @Override + public void startElement(String uri, String localName, String qName, + Attributes attributes) { + if (uri != null && !NS.equals(uri)) { + return; + } + switch (localName) { + case "sst": + String uniqueCount = attributes.getValue("uniqueCount"); + if (uniqueCount != null) { + try { + int hint = (int) Long.parseLong(uniqueCount); + // guard against corrupt files with absurd counts + ((ArrayList<String>) strings).ensureCapacity( + Math.min(hint, 100_000)); + } catch (NumberFormatException e) { + // ignore + } + } + characters = new StringBuilder(64); + break; + case "si": + if (characters != null) { + characters.setLength(0); + } + break; + case "t": + tIsOpen = true; + break; + case "rPh": + inRPh = true; + if (includePhoneticRuns && characters != null && + characters.length() > 0) { + characters.append(" "); + } + break; + default: + break; + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + if (uri != null && !NS.equals(uri)) { + return; + } + switch (localName) { + case "si": + if (characters != null) { + strings.add(characters.toString()); + } + break; + case "t": + tIsOpen = false; + break; + case "rPh": + inRPh = false; + break; + default: + break; + } + } + + @Override + public void characters(char[] ch, int start, int length) { + if (tIsOpen && characters != null) { + if (inRPh) { + if (includePhoneticRuns) { + characters.append(ch, start, length); + } + } else { + characters.append(ch, start, length); + } + } + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFStylesShim.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFStylesShim.java new file mode 100644 index 0000000000..ca99c7512e --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFStylesShim.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.poi.ss.usermodel.BuiltinFormats; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.utils.XMLReaderUtils; + +/** + * SAX-based shim that replaces POI's {@code StylesTable} for XLSX event-based parsing. + * <p> + * Parses {@code xl/styles.xml} and extracts only the information needed for text + * extraction: the number format resolution chain (cellXfs index to format string). + * This avoids the XMLBeans dependency that {@code StylesTable} requires. + */ +class XSSFStylesShim { + + private final Map<Short, String> numberFormats = new HashMap<>(); + private final List<Short> cellXfFormatIds = new ArrayList<>(); + + XSSFStylesShim(InputStream stylesData, ParseContext parseContext) + throws IOException, SAXException, TikaException { + if (stylesData != null) { + try { + XMLReaderUtils.parseSAX(stylesData, new StylesHandler(), parseContext); + } finally { + stylesData.close(); + } + } + } + + int getNumCellStyles() { + return cellXfFormatIds.size(); + } + + short getFormatIndex(int styleIndex) { + if (styleIndex < 0 || styleIndex >= cellXfFormatIds.size()) { + return -1; + } + return cellXfFormatIds.get(styleIndex); + } + + String getFormatString(int styleIndex) { + short fmtId = getFormatIndex(styleIndex); + if (fmtId == -1) { + return null; + } + String fmt = numberFormats.get(fmtId); + if (fmt == null) { + fmt = BuiltinFormats.getBuiltinFormat(fmtId); + } + return fmt; + } + + private class StylesHandler extends DefaultHandler { + + private static final String NS = + "http://schemas.openxmlformats.org/spreadsheetml/2006/main"; + + private boolean inCellXfs; + private boolean inNumFmts; + + @Override + public void startElement(String uri, String localName, String qName, + Attributes attributes) { + if (!NS.equals(uri)) { + return; + } + switch (localName) { + case "numFmts": + inNumFmts = true; + break; + case "numFmt": + if (inNumFmts) { + String idStr = attributes.getValue("numFmtId"); + String code = attributes.getValue("formatCode"); + if (idStr != null && code != null) { + try { + numberFormats.put(Short.parseShort(idStr), code); + } catch (NumberFormatException e) { + // skip malformed + } + } + } + break; + case "cellXfs": + inCellXfs = true; + break; + case "xf": + if (inCellXfs) { + String numFmtIdStr = attributes.getValue("numFmtId"); + short numFmtId = 0; + if (numFmtIdStr != null) { + try { + numFmtId = Short.parseShort(numFmtIdStr); + } catch (NumberFormatException e) { + // default to 0 (General) + } + } + cellXfFormatIds.add(numFmtId); + } + break; + default: + break; + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + if (!NS.equals(uri)) { + return; + } + if ("numFmts".equals(localName)) { + inNumFmts = false; + } else if ("cellXfs".equals(localName)) { + inCellXfs = false; + } + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java index 2dd8af7afc..dfb4c71924 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java @@ -60,7 +60,6 @@ import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableRow; import org.apache.xmlbeans.XmlCursor; -import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlObject; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFldChar; @@ -124,7 +123,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { */ @Override protected void buildXHTML(XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); XWPFListManager listManager = new XWPFListManager(loadNumbering()); // headers @@ -187,7 +186,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private void extractIBodyText(IBody bodyElement, XWPFListManager listManager, XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { for (IBodyElement element : bodyElement.getBodyElements()) { if (element instanceof XWPFParagraph) { XWPFParagraph paragraph = (XWPFParagraph) element; @@ -205,7 +204,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { ISDTContent content = element.getContent(); String tag = "p"; xhtml.startElement(tag); @@ -215,7 +214,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager, XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { // If this paragraph is actually a whole new section, then // it could have its own headers and footers // Check and handle if so @@ -375,10 +374,14 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { //Note "w:txbxContent//"...must look for all descendant paragraphs //not just the immediate children of txbxContent -- TIKA-2807 if (config.isIncludeShapeBasedContent()) { - for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath( - "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent//w:p")) { - extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), - paragraph.getBody()), listManager, xhtml); + try { + for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath( + "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent//w:p")) { + extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), + paragraph.getBody()), listManager, xhtml); + } + } catch (Exception e) { + // XmlException from CTP.Factory.parse — swallow for shape content } } @@ -397,7 +400,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private void extractFootnoteEndnoteContent(XWPFParagraph paragraph, XWPFListManager listManager, XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { String nsW = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; QName footnoteRefQName = new QName(nsW, "footnoteReference"); QName endnoteRefQName = new QName(nsW, "endnoteReference"); @@ -553,7 +556,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private void processRun(XWPFRun run, XWPFParagraph paragraph, XHTMLContentHandler xhtml, Deque<FormattingUtils.Tag> formattingState) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { // open/close required tags if run changes formatting FormattingUtils.ensureFormattingState(xhtml, FormattingUtils.toTags(run), formattingState); @@ -581,7 +584,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { xhtml.characters(run.getContent().getText()); } @@ -627,7 +630,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private void extractTable(XWPFTable table, XWPFListManager listManager, XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { xhtml.startElement("table"); xhtml.startElement("tbody"); for (XWPFTableRow row : table.getRows()) { @@ -649,7 +652,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private void extractFooters(XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, XWPFListManager listManager) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { // footers if (hfPolicy.getFirstPageFooter() != null) { extractHeaderText(xhtml, hfPolicy.getFirstPageFooter(), listManager); @@ -664,7 +667,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private void extractHeaders(XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, XWPFListManager listManager) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { if (hfPolicy == null) { return; } @@ -684,7 +687,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header, XWPFListManager listManager) - throws SAXException, XmlException, IOException { + throws SAXException, IOException { for (IBodyElement e : header.getBodyElements()) { if (e instanceof XWPFParagraph) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java index 705ce27487..a05698d969 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java @@ -25,7 +25,6 @@ import org.apache.poi.ooxml.POIXMLProperties; import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; -import org.apache.xmlbeans.XmlException; /** * Currently, mostly a pass-through class to hold pkg and properties @@ -37,10 +36,13 @@ public class XPSTextExtractor implements POIXMLTextExtractor { private final OPCPackage pkg; private final POIXMLProperties properties; - public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException { + public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, IOException { this.pkg = pkg; - this.properties = new POIXMLProperties(pkg); - + try { + this.properties = new POIXMLProperties(pkg); + } catch (Exception e) { + throw new IOException("Failed to read OOXML properties", e); + } } @Override diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java index 193f649a4e..3de987e67d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java @@ -25,7 +25,6 @@ import org.apache.poi.ooxml.POIXMLProperties; import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; -import org.apache.xmlbeans.XmlException; import org.apache.tika.parser.microsoft.ooxml.EditType; import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties; @@ -39,9 +38,13 @@ public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor { private POIXMLProperties properties; public XSLFEventBasedPowerPointExtractor(OPCPackage container) - throws XmlException, OpenXML4JException, IOException { + throws OpenXML4JException, IOException { this.container = container; - this.properties = new POIXMLProperties(container); + try { + this.properties = new POIXMLProperties(container); + } catch (Exception e) { + throw new IOException("Failed to read OOXML properties", e); + } } public OPCPackage getPackage() { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java index 2bb53a3c69..5a4676d631 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java @@ -34,7 +34,6 @@ import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.xwpf.usermodel.XWPFRelation; -import org.apache.xmlbeans.XmlException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; @@ -64,7 +63,7 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { private POIXMLProperties properties; public XWPFEventBasedWordExtractor(OPCPackage container) - throws XmlException, OpenXML4JException, IOException { + throws OpenXML4JException, IOException { this.container = container; // Properties are lazily initialized to avoid requiring ooxml-lite // when SAXBasedMetadataExtractor is used instead
