This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new bfb5096759 TIKA-4708-refactor-xlsx (#2733)
bfb5096759 is described below
commit bfb5096759ab059054a2fcfa78c0a34bf090772a
Author: Tim Allison <[email protected]>
AuthorDate: Fri Apr 3 17:59:05 2026 -0400
TIKA-4708-refactor-xlsx (#2733)
---
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 5 +-
.../parser/microsoft/ooxml/OOXMLExtractor.java | 3 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 7 +-
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 5 +
.../ooxml/SXWPFWordExtractorDecorator.java | 3 +-
.../microsoft/ooxml/TikaSheetContentsHandler.java | 36 ++
.../microsoft/ooxml/TikaSheetXMLHandler.java | 401 +++++++++++++++++++++
.../ooxml/XSLFPowerPointExtractorDecorator.java | 3 +-
.../ooxml/XSSFBExcelExtractorDecorator.java | 13 +-
.../parser/microsoft/ooxml/XSSFCommentsShim.java | 187 ++++++++++
.../ooxml/XSSFExcelExtractorDecorator.java | 295 +++++++++------
.../microsoft/ooxml/XSSFSharedStringsShim.java | 156 ++++++++
.../parser/microsoft/ooxml/XSSFStylesShim.java | 146 ++++++++
.../ooxml/XWPFWordExtractorDecorator.java | 35 +-
.../microsoft/ooxml/xps/XPSTextExtractor.java | 10 +-
.../xslf/XSLFEventBasedPowerPointExtractor.java | 9 +-
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 3 +-
17 files changed, 1161 insertions(+), 156 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 70d5920800..dd7c5eafaf 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -46,7 +46,6 @@ import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -140,7 +139,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
* ParseContext)
*/
public void getXHTML(ContentHandler handler, Metadata metadata,
ParseContext context)
- throws SAXException, XmlException, IOException, TikaException {
+ throws SAXException, IOException, TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata,
context);
xhtml.startDocument();
@@ -527,7 +526,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
* Populates the {@link XHTMLContentHandler} object received as parameter.
*/
protected abstract void buildXHTML(XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException;
+ throws SAXException, IOException;
/**
* Return a list of the main parts of the document, used
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
index 673776404c..5c37c9c7b5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
@@ -19,7 +19,6 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
import org.apache.poi.ooxml.POIXMLDocument;
-import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -53,5 +52,5 @@ public interface OOXMLExtractor {
* given content handler.
*/
void getXHTML(ContentHandler handler, Metadata metadata, ParseContext
context)
- throws SAXException, XmlException, IOException, TikaException;
+ throws SAXException, IOException, TikaException;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 8558f37c21..f407ffeb62 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -37,7 +37,6 @@ import
org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.xmlbeans.XmlException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
@@ -202,7 +201,7 @@ public class OOXMLExtractorFactory {
} else {
throw new TikaException("Error creating OOXML extractor", e);
}
- } catch (OpenXML4JException | XmlException e) {
+ } catch (OpenXML4JException e) {
throw new TikaException("Error creating OOXML extractor", e);
} catch (RuntimeSAXException e) {
throw (SAXException) e.getCause();
@@ -210,7 +209,7 @@ public class OOXMLExtractorFactory {
}
private static POIXMLTextExtractor trySXWPF(OPCPackage pkg)
- throws TikaException, XmlException, OpenXML4JException, IOException {
+ throws TikaException, OpenXML4JException, IOException {
PackageRelationshipCollection packageRelationshipCollection =
pkg.getRelationshipsByType(
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
if (packageRelationshipCollection.size() == 0) {
@@ -235,7 +234,7 @@ public class OOXMLExtractorFactory {
}
private static POIXMLTextExtractor tryXSLF(OPCPackage pkg, boolean
eventBased)
- throws TikaException, XmlException, OpenXML4JException, IOException {
+ throws TikaException, OpenXML4JException, IOException {
PackageRelationshipCollection packageRelationshipCollection =
pkg.getRelationshipsByType(
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index c16b75fd8c..d6f5b9759d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -93,6 +93,11 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
//if mainDocument == null, throw exception
}
+ @Override
+ public MetadataExtractor getMetadataExtractor() {
+ return new SAXBasedMetadataExtractor(opcPackage, context);
+ }
+
/**
* @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
*/
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 5fc7d9a846..393370662b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -34,7 +34,6 @@ import
org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.xmlbeans.XmlException;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -117,7 +116,7 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
@Override
protected void buildXHTML(XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
//handle main document
List<PackagePart> pps = getStoryDocumentParts();
if (pps != null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetContentsHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetContentsHandler.java
new file mode 100644
index 0000000000..44173ec322
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetContentsHandler.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+/**
+ * Sheet contents handler that uses {@link XSSFCommentsShim.CommentData}
+ * instead of POI's XMLBeans-dependent {@code XSSFComment}.
+ */
+interface TikaSheetContentsHandler {
+
+ void startRow(int rowNum);
+
+ void endRow(int rowNum);
+
+ void cell(String cellRef, String formattedValue, XSSFCommentsShim.CommentData comment);
+
+ default void headerFooter(String text, boolean isHeader, String tagName) {
+ }
+
+ default void endSheet() {
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
new file mode 100644
index 0000000000..3ba83dd255
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
@@ -0,0 +1,401 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.Queue;
+
+import org.apache.poi.ss.usermodel.BuiltinFormats;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.util.CellAddress;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Sheet XML handler for XLSX event-based parsing that uses {@link XSSFStylesShim}
+ * and {@link XSSFCommentsShim} instead of POI's XMLBeans-dependent
+ * {@code StylesTable} and {@code CommentsTable}.
+ * <p>
+ * Adapted from Apache POI's {@code XSSFSheetXMLHandler} (Apache 2.0 license).
+ */
+class TikaSheetXMLHandler extends DefaultHandler {
+
+ private static final Logger LOG = LoggerFactory.getLogger(TikaSheetXMLHandler.class);
+
+ private static final String NS_SPREADSHEETML =
+ "http://schemas.openxmlformats.org/spreadsheetml/2006/main";
+
+ enum XssfDataType {
+ BOOLEAN,
+ ERROR,
+ FORMULA,
+ INLINE_STRING,
+ SST_STRING,
+ NUMBER,
+ }
+
+ private final XSSFStylesShim stylesShim;
+ private final XSSFCommentsShim commentsShim;
+ private final XSSFSharedStringsShim sharedStringsShim;
+ private final TikaSheetContentsHandler output;
+ private final DataFormatter formatter;
+ private final boolean formulasNotResults;
+
+ private boolean vIsOpen;
+ private boolean fIsOpen;
+ private boolean isIsOpen;
+ private boolean hfIsOpen;
+
+ private XssfDataType nextDataType;
+ private short formatIndex;
+ private String formatString;
+
+ private int rowNum;
+ private int nextRowNum;
+ private String cellRef;
+
+ private final StringBuilder value = new StringBuilder(64);
+ private final StringBuilder formula = new StringBuilder(64);
+ private final StringBuilder headerFooter = new StringBuilder(64);
+
+ private Queue<CellAddress> commentCellRefs;
+
+ TikaSheetXMLHandler(XSSFStylesShim stylesShim,
+ XSSFCommentsShim commentsShim,
+ XSSFSharedStringsShim sharedStringsShim,
+ TikaSheetContentsHandler sheetContentsHandler,
+ DataFormatter dataFormatter,
+ boolean formulasNotResults) {
+ this.stylesShim = stylesShim;
+ this.commentsShim = commentsShim;
+ this.sharedStringsShim = sharedStringsShim;
+ this.output = sheetContentsHandler;
+ this.formatter = dataFormatter;
+ this.formulasNotResults = formulasNotResults;
+ this.nextDataType = XssfDataType.NUMBER;
+ initComments(commentsShim);
+ }
+
+ TikaSheetXMLHandler(XSSFStylesShim stylesShim,
+ XSSFSharedStringsShim sharedStringsShim,
+ TikaSheetContentsHandler sheetContentsHandler,
+ DataFormatter dataFormatter,
+ boolean formulasNotResults) {
+ this(stylesShim, null, sharedStringsShim, sheetContentsHandler, dataFormatter,
+ formulasNotResults);
+ }
+
+ private void initComments(XSSFCommentsShim commentsShim) {
+ if (commentsShim != null) {
+ commentCellRefs = new LinkedList<>();
+ for (Iterator<CellAddress> iter = commentsShim.getCellAddresses();
+ iter.hasNext(); ) {
+ commentCellRefs.add(iter.next());
+ }
+ }
+ }
+
+ private boolean isTextTag(String name) {
+ if ("v".equals(name)) {
+ return true;
+ }
+ if ("inlineStr".equals(name)) {
+ return true;
+ }
+ return "t".equals(name) && isIsOpen;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName,
+ Attributes attributes) throws SAXException {
+ if (uri != null && !uri.equals(NS_SPREADSHEETML)) {
+ return;
+ }
+
+ if (isTextTag(localName)) {
+ vIsOpen = true;
+ if (!isIsOpen) {
+ value.setLength(0);
+ }
+ } else if ("is".equals(localName)) {
+ isIsOpen = true;
+ } else if ("f".equals(localName)) {
+ formula.setLength(0);
+ if (this.nextDataType == XssfDataType.NUMBER) {
+ this.nextDataType = XssfDataType.FORMULA;
+ }
+ String type = attributes.getValue("t");
+ if (type != null && type.equals("shared")) {
+ String ref = attributes.getValue("ref");
+ if (ref != null) {
+ fIsOpen = true;
+ } else {
+ if (formulasNotResults) {
+ LOG.warn("shared formulas not yet supported!");
+ }
+ }
+ } else {
+ fIsOpen = true;
+ }
+ } else if ("oddHeader".equals(localName) || "evenHeader".equals(localName) ||
+ "firstHeader".equals(localName) || "firstFooter".equals(localName) ||
+ "oddFooter".equals(localName) || "evenFooter".equals(localName)) {
+ hfIsOpen = true;
+ headerFooter.setLength(0);
+ } else if ("row".equals(localName)) {
+ String rowNumStr = attributes.getValue("r");
+ if (rowNumStr != null) {
+ rowNum = Integer.parseInt(rowNumStr.trim()) - 1;
+ } else {
+ rowNum = nextRowNum;
+ }
+ output.startRow(rowNum);
+ } else if ("c".equals(localName)) {
+ // Cell element — resolve style to format index/string
+ this.formula.setLength(0);
+ this.nextDataType = XssfDataType.NUMBER;
+ this.formatIndex = -1;
+ this.formatString = null;
+ cellRef = attributes.getValue("r");
+ String cellType = attributes.getValue("t");
+ String cellStyleStr = attributes.getValue("s");
+
+ if ("b".equals(cellType)) {
+ nextDataType = XssfDataType.BOOLEAN;
+ } else if ("e".equals(cellType)) {
+ nextDataType = XssfDataType.ERROR;
+ } else if ("inlineStr".equals(cellType)) {
+ nextDataType = XssfDataType.INLINE_STRING;
+ } else if ("s".equals(cellType)) {
+ nextDataType = XssfDataType.SST_STRING;
+ } else if ("str".equals(cellType)) {
+ nextDataType = XssfDataType.FORMULA;
+ } else {
+ // Number — resolve format via our styles shim
+ if (stylesShim != null) {
+ int styleIndex;
+ if (cellStyleStr != null) {
+ styleIndex = Integer.parseInt(cellStyleStr.trim());
+ } else if (stylesShim.getNumCellStyles() > 0) {
+ styleIndex = 0;
+ } else {
+ styleIndex = -1;
+ }
+ if (styleIndex >= 0) {
+ this.formatIndex = stylesShim.getFormatIndex(styleIndex);
+ this.formatString = stylesShim.getFormatString(styleIndex);
+ if (this.formatString == null) {
+ this.formatString =
+ BuiltinFormats.getBuiltinFormat(this.formatIndex);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ if (uri != null && !uri.equals(NS_SPREADSHEETML)) {
+ return;
+ }
+
+ if (isTextTag(localName)) {
+ vIsOpen = false;
+ if (!isIsOpen) {
+ outputCell();
+ value.setLength(0);
+ }
+ } else if ("f".equals(localName)) {
+ fIsOpen = false;
+ } else if ("is".equals(localName)) {
+ isIsOpen = false;
+ outputCell();
+ value.setLength(0);
+ } else if ("row".equals(localName)) {
+ checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_ROW);
+ output.endRow(rowNum);
+ nextRowNum = rowNum + 1;
+ } else if ("sheetData".equals(localName)) {
+ checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_SHEET_DATA);
+ output.endSheet();
+ } else if ("oddHeader".equals(localName) || "evenHeader".equals(localName) ||
+ "firstHeader".equals(localName)) {
+ hfIsOpen = false;
+ output.headerFooter(headerFooter.toString(), true, localName);
+ } else if ("oddFooter".equals(localName) || "evenFooter".equals(localName) ||
+ "firstFooter".equals(localName)) {
+ hfIsOpen = false;
+ output.headerFooter(headerFooter.toString(), false, localName);
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (vIsOpen) {
+ value.append(ch, start, length);
+ }
+ if (fIsOpen) {
+ formula.append(ch, start, length);
+ }
+ if (hfIsOpen) {
+ headerFooter.append(ch, start, length);
+ }
+ }
+
+ private void outputCell() {
+ String thisStr = null;
+
+ if (formulasNotResults && formula.length() > 0) {
+ thisStr = formula.toString();
+ } else {
+ switch (nextDataType) {
+ case BOOLEAN:
+ char first = value.charAt(0);
+ thisStr = first == '0' ? "FALSE" : "TRUE";
+ break;
+ case ERROR:
+ thisStr = "ERROR:" + value;
+ break;
+ case FORMULA:
+ if (formulasNotResults) {
+ thisStr = formula.toString();
+ } else {
+ String fv = value.toString();
+ if (this.formatString != null) {
+ try {
+ double d = Double.parseDouble(fv.trim());
+ thisStr = formatter.formatRawCellContents(
+ d, this.formatIndex, this.formatString);
+ } catch (Exception e) {
+ thisStr = fv;
+ }
+ } else {
+ thisStr = fv;
+ }
+ }
+ break;
+ case INLINE_STRING:
+ thisStr = value.toString();
+ break;
+ case SST_STRING:
+ String sstIndex = value.toString().trim();
+ if (!sstIndex.isEmpty()) {
+ try {
+ int idx = Integer.parseInt(sstIndex);
+ thisStr = sharedStringsShim.getItemAt(idx);
+ } catch (NumberFormatException ex) {
+ LOG.error("Failed to parse SST index '{}'", sstIndex, ex);
+ }
+ }
+ break;
+ case NUMBER:
+ String n = value.toString();
+ if (this.formatString != null && !n.isEmpty()) {
+ try {
+ thisStr = formatter.formatRawCellContents(
+ Double.parseDouble(n.trim()),
+ this.formatIndex, this.formatString);
+ } catch (Exception e) {
+ thisStr = n;
+ }
+ } else {
+ thisStr = n;
+ }
+ break;
+ default:
+ thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
+ break;
+ }
+ }
+
+ checkForEmptyCellComments(EmptyCellCommentsCheckType.CELL);
+ XSSFCommentsShim.CommentData comment = commentsShim != null ?
+ commentsShim.findCellComment(new CellAddress(cellRef)) : null;
+ output.cell(cellRef, thisStr, comment);
+ }
+
+ private void checkForEmptyCellComments(EmptyCellCommentsCheckType type) {
+ if (commentCellRefs != null && !commentCellRefs.isEmpty()) {
+ if (type == EmptyCellCommentsCheckType.END_OF_SHEET_DATA) {
+ while (!commentCellRefs.isEmpty()) {
+ outputEmptyCellComment(commentCellRefs.remove());
+ }
+ return;
+ }
+
+ if (this.cellRef == null) {
+ if (type == EmptyCellCommentsCheckType.END_OF_ROW) {
+ while (!commentCellRefs.isEmpty()) {
+ if (commentCellRefs.peek().getRow() == rowNum) {
+ outputEmptyCellComment(commentCellRefs.remove());
+ } else {
+ return;
+ }
+ }
+ return;
+ } else {
+ throw new IllegalStateException(
+ "Cell ref should be null only if there are only empty " +
+ "cells in the row; rowNum: " + rowNum);
+ }
+ }
+
+ CellAddress nextCommentCellRef;
+ do {
+ CellAddress cellAddr = new CellAddress(this.cellRef);
+ CellAddress peekCellRef = commentCellRefs.peek();
+ if (type == EmptyCellCommentsCheckType.CELL &&
+ cellAddr.equals(peekCellRef)) {
+ commentCellRefs.remove();
+ return;
+ } else {
+ int comparison = peekCellRef.compareTo(cellAddr);
+ if (comparison > 0 &&
+ type == EmptyCellCommentsCheckType.END_OF_ROW &&
+ peekCellRef.getRow() <= rowNum) {
+ nextCommentCellRef = commentCellRefs.remove();
+ outputEmptyCellComment(nextCommentCellRef);
+ } else if (comparison < 0 &&
+ type == EmptyCellCommentsCheckType.CELL &&
+ peekCellRef.getRow() <= rowNum) {
+ nextCommentCellRef = commentCellRefs.remove();
+ outputEmptyCellComment(nextCommentCellRef);
+ } else {
+ nextCommentCellRef = null;
+ }
+ }
+ } while (nextCommentCellRef != null && !commentCellRefs.isEmpty());
+ }
+ }
+
+ private void outputEmptyCellComment(CellAddress cellRef) {
+ XSSFCommentsShim.CommentData comment = commentsShim.findCellComment(cellRef);
+ output.cell(cellRef.formatAsString(), null, comment);
+ }
+
+ private enum EmptyCellCommentsCheckType {
+ CELL,
+ END_OF_ROW,
+ END_OF_SHEET_DATA
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index d292f5c571..fb9876fce8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -56,7 +56,6 @@ import org.apache.poi.xslf.usermodel.XSLFTableRow;
import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
import org.apache.poi.xslf.usermodel.XSLFTextRun;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
-import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor;
import
org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthorList;
@@ -356,7 +355,7 @@ public class XSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
PackagePart slidePart;
try {
slidePart = document.getSlidePart(ctSlide);
- } catch (IOException | XmlException e) {
+ } catch (Exception e) {
throw new TikaException("Broken OOXML file", e);
}
addSlideParts(slidePart, parts);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
index 51a30cdc96..0db32f067a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -18,7 +18,6 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
import java.io.InputStream;
-import java.util.List;
import java.util.Locale;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
@@ -32,8 +31,6 @@ import org.apache.poi.xssf.binary.XSSFBStylesTable;
import org.apache.poi.xssf.eventusermodel.XSSFBReader;
import
org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
-import org.apache.poi.xssf.usermodel.XSSFShape;
-import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -61,7 +58,7 @@ public class XSSFBExcelExtractorDecorator extends
XSSFExcelExtractorDecorator {
@Override
public void getXHTML(ContentHandler handler, Metadata metadata,
ParseContext context)
- throws SAXException, XmlException, IOException, TikaException {
+ throws SAXException, IOException, TikaException {
this.metadata = metadata;
this.parseContext = context;
@@ -75,7 +72,7 @@ public class XSSFBExcelExtractorDecorator extends
XSSFExcelExtractorDecorator {
*/
@Override
protected void buildXHTML(XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
OPCPackage container = extractor.getPackage();
XSSFBSharedStringsTable strings;
@@ -92,7 +89,7 @@ public class XSSFBExcelExtractorDecorator extends
XSSFExcelExtractorDecorator {
iter = (XSSFBReader.SheetIterator) xssfReader.getSheetsData();
strings = new XSSFBSharedStringsTable(container);
} catch (OpenXML4JException e) {
- throw new XmlException(e);
+ throw new IOException(e);
}
while (iter.hasNext()) {
@@ -126,9 +123,7 @@ public class XSSFBExcelExtractorDecorator extends
XSSFExcelExtractorDecorator {
for (String footer : sheetExtractor.footers) {
extractHeaderFooter(footer, xhtml);
}
- List<XSSFShape> shapes = iter.getShapes();
-
- processShapes(shapes, xhtml);
+ processDrawings(sheetPart, xhtml);
//for now dump sheet hyperlinks at bottom of page
//consider a double-pass of the inputstream to reunite hyperlinks
with cells/textboxes
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFCommentsShim.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFCommentsShim.java
new file mode 100644
index 0000000000..f3293a0d3c
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFCommentsShim.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.poi.ss.util.CellAddress;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * SAX-based shim that parses {@code xl/commentsN.xml} without XMLBeans.
+ * Replaces POI's {@code CommentsTable} (which depends on poi-ooxml-lite)
+ * for Tika's text extraction needs.
+ *
+ * <p>Only extracts what Tika needs: cell reference → (author, text) mapping.</p>
+ *
+ * <p>Malformed entries are tolerated rather than fatal: an {@code authorId}
+ * that is not an integer yields an empty author, and a comment whose
+ * {@code ref} cannot be turned into a {@link CellAddress} is skipped.</p>
+ */
+class XSSFCommentsShim {
+
+    /** Comments keyed by cell, kept in the order they were encountered. */
+    private final Map<CellAddress, CommentData> commentsByCell;
+
+    /**
+     * Simple holder for comment data needed by Tika.
+     */
+    static class CommentData {
+        private final String author;
+        private final String text;
+
+        CommentData(String author, String text) {
+            this.author = author;
+            this.text = text;
+        }
+
+        public String getAuthor() {
+            return author;
+        }
+
+        public String getText() {
+            return text;
+        }
+    }
+
+    /**
+     * Parse a comments XML stream. The stream is not closed here.
+     *
+     * @param is the {@code xl/commentsN.xml} stream (may be null, in which
+     *           case the shim holds no comments)
+     * @param parseContext parse context for SAX parser configuration
+     * @throws IOException if reading the stream fails
+     * @throws TikaException if the SAX parser cannot be obtained/configured
+     * @throws SAXException if the XML is not well-formed
+     */
+    XSSFCommentsShim(InputStream is, ParseContext parseContext)
+            throws IOException, TikaException, SAXException {
+        commentsByCell = new LinkedHashMap<>();
+        if (is != null) {
+            CommentsHandler handler = new CommentsHandler();
+            XMLReaderUtils.parseSAX(is, handler, parseContext);
+        }
+    }
+
+    /**
+     * @return the number of comments parsed
+     */
+    int getNumberOfComments() {
+        return commentsByCell.size();
+    }
+
+    /**
+     * Find comment data for a given cell address.
+     *
+     * @return CommentData or null if no comment at that cell
+     */
+    CommentData findCellComment(CellAddress cellAddress) {
+        return commentsByCell.get(cellAddress);
+    }
+
+    /**
+     * @return iterator over all cell addresses that have comments, in document order
+     */
+    Iterator<CellAddress> getCellAddresses() {
+        return commentsByCell.keySet().iterator();
+    }
+
+    /**
+     * SAX handler for comments XML. Structure:
+     * <pre>{@code
+     * <comments>
+     *   <authors>
+     *     <author>Name</author>
+     *   </authors>
+     *   <commentList>
+     *     <comment ref="A1" authorId="0">
+     *       <text>
+     *         <r><t>Comment text</t></r>
+     *         or plain <t>Comment text</t>
+     *       </text>
+     *     </comment>
+     *   </commentList>
+     * </comments>
+     * }</pre>
+     */
+    private class CommentsHandler extends DefaultHandler {
+
+        private final List<String> authors = new ArrayList<>();
+        // Shared scratch buffer: holds the text of the current <author> or <t>.
+        private final StringBuilder textBuffer = new StringBuilder();
+
+        private boolean inAuthor;
+        private boolean inT;
+        private boolean inText;
+
+        private String currentRef;
+        private int currentAuthorId;
+        private final StringBuilder commentText = new StringBuilder();
+
+        @Override
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes atts) {
+            if ("author".equals(localName)) {
+                inAuthor = true;
+                textBuffer.setLength(0);
+            } else if ("comment".equals(localName)) {
+                currentRef = atts.getValue("ref");
+                currentAuthorId = parseAuthorId(atts.getValue("authorId"));
+                commentText.setLength(0);
+            } else if ("text".equals(localName)) {
+                inText = true;
+            } else if ("t".equals(localName) && inText) {
+                inT = true;
+                textBuffer.setLength(0);
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) {
+            if ("author".equals(localName)) {
+                inAuthor = false;
+                authors.add(textBuffer.toString());
+            } else if ("t".equals(localName) && inT) {
+                inT = false;
+                // separate consecutive <t> runs with a single space
+                if (commentText.length() > 0) {
+                    commentText.append(' ');
+                }
+                commentText.append(textBuffer);
+            } else if ("text".equals(localName)) {
+                inText = false;
+            } else if ("comment".equals(localName)) {
+                if (currentRef != null) {
+                    storeCurrentComment();
+                }
+                currentRef = null;
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) {
+            if (inAuthor || inT) {
+                textBuffer.append(ch, start, length);
+            }
+        }
+
+        /**
+         * Converts the {@code authorId} attribute to an index into the
+         * authors list.
+         *
+         * @return the parsed id, or -1 when the attribute is absent or is
+         *         not a valid integer (a corrupt file must not abort the parse
+         *         with an unchecked exception)
+         */
+        private int parseAuthorId(String authorIdStr) {
+            if (authorIdStr == null) {
+                return -1;
+            }
+            try {
+                return Integer.parseInt(authorIdStr);
+            } catch (NumberFormatException e) {
+                return -1;
+            }
+        }
+
+        /**
+         * Records the comment being closed under its cell address. Unknown
+         * author ids resolve to an empty author; a ref that cannot be parsed
+         * as a cell address causes the comment to be dropped.
+         */
+        private void storeCurrentComment() {
+            CellAddress address;
+            try {
+                address = new CellAddress(currentRef);
+            } catch (IllegalArgumentException e) {
+                // NOTE(review): assumes CellAddress reports a malformed ref via
+                // IllegalArgumentException/NumberFormatException -- confirm against POI.
+                // Comments are not critical, so skip this one rather than fail the sheet.
+                return;
+            }
+            boolean knownAuthor = currentAuthorId >= 0 && currentAuthorId < authors.size();
+            String author = knownAuthor ? authors.get(currentAuthorId) : "";
+            commentsByCell.put(address, new CommentData(author, commentText.toString()));
+        }
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index e25bdf6d09..899016caf8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -21,11 +21,9 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
-import java.util.Set;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
@@ -42,24 +40,11 @@ import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.ss.util.CellReference;
-import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
-import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import
org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
-import org.apache.poi.xssf.model.Comments;
-import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
-import org.apache.poi.xssf.usermodel.XSSFDrawing;
-import org.apache.poi.xssf.usermodel.XSSFRelation;
-import org.apache.poi.xssf.usermodel.XSSFShape;
-import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
-import org.apache.xmlbeans.XmlException;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
-import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
-import
org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
@@ -92,6 +77,20 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
// Power Query stores data in customData parts
private static final String POWER_QUERY_CONTENT_TYPE =
"application/vnd.ms-excel.customDataProperties+xml";
+ private static final String RELATION_DRAWING =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing";
+ private static final String RELATION_CHART =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart";
+ private static final String RELATION_HYPERLINK =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";
+ private static final String NS_DRAWING_ML =
+ "http://schemas.openxmlformats.org/drawingml/2006/main";
+ private static final String NS_RELATIONSHIPS =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships";
+ private static final String RELATION_VML_DRAWING =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/vmlDrawing";
+ private static final String RELATION_COMMENTS =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments";
/**
* Allows access to headers/footers from raw xml strings
@@ -133,9 +132,14 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
//does no good here.
}
+ @Override
+ public MetadataExtractor getMetadataExtractor() {
+ return new SAXBasedMetadataExtractor(extractor.getPackage(),
parseContext);
+ }
+
@Override
public void getXHTML(ContentHandler handler, Metadata metadata,
ParseContext context)
- throws SAXException, XmlException, IOException, TikaException {
+ throws SAXException, IOException, TikaException {
this.metadata = metadata;
this.parseContext = context;
@@ -149,21 +153,22 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
*/
@Override
protected void buildXHTML(XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
OPCPackage container = extractor.getPackage();
- ReadOnlySharedStringsTable strings;
+ XSSFSharedStringsShim stringsShim;
XSSFReader.SheetIterator iter;
XSSFReader xssfReader;
- StylesTable styles;
+ XSSFStylesShim stylesShim;
try {
xssfReader = new XSSFReader(container);
- styles = xssfReader.getStylesTable();
+ stylesShim = new XSSFStylesShim(xssfReader.getStylesData(),
parseContext);
iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
- strings = new ReadOnlySharedStringsTable(container,
config.isConcatenatePhoneticRuns());
- } catch (OpenXML4JException e) {
- throw new XmlException(e);
+ stringsShim = new
XSSFSharedStringsShim(xssfReader.getSharedStringsData(),
+ config.isConcatenatePhoneticRuns(), parseContext);
+ } catch (OpenXML4JException | TikaException e) {
+ throw new IOException(e);
}
while (iter.hasNext()) {
SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config,
xhtml);
@@ -174,8 +179,8 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
addDrawingHyperLinks(sheetPart);
sheetParts.add(sheetPart);
- Comments comments = iter.getSheetComments();
- if (comments != null && comments.getNumberOfComments() > 0) {
+ XSSFCommentsShim commentsShim = parseSheetComments(sheetPart);
+ if (commentsShim != null && commentsShim.getNumberOfComments()
> 0) {
metadata.set(Office.HAS_COMMENTS, true);
}
@@ -187,7 +192,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
xhtml.startElement("table");
xhtml.startElement("tbody");
- processSheet(sheetExtractor, comments, styles, strings,
stream);
+ processSheet(sheetExtractor, commentsShim, stylesShim,
stringsShim, stream);
try {
getThreadedComments(container, sheetPart, xhtml);
} catch (InvalidFormatException | TikaException | IOException
e) {
@@ -209,8 +214,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
// Do text held in shapes, if required
if (config.isIncludeShapeBasedContent()) {
- List<XSSFShape> shapes = iter.getShapes();
- processShapes(shapes, xhtml);
+ processDrawings(sheetPart, xhtml);
}
//for now dump sheet hyperlinks at bottom of page
@@ -669,7 +673,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
protected void addDrawingHyperLinks(PackagePart sheetPart) {
try {
for (PackageRelationship rel : sheetPart
-
.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
+ .getRelationshipsByType(RELATION_DRAWING)) {
if (rel.getTargetMode() == TargetMode.INTERNAL) {
PackagePartName relName =
PackagingURIHelper.createPartName(rel.getTargetURI());
PackagePart part = rel.getPackage().getPart(relName);
@@ -678,7 +682,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
continue;
}
for (PackageRelationship drawRel : part
-
.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
+ .getRelationshipsByType(RELATION_HYPERLINK)) {
drawingHyperlinks.put(drawRel.getId(),
drawRel.getTargetURI().toString());
}
}
@@ -696,7 +700,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
throws SAXException {
try {
for (PackageRelationship rel : sheetPart
-
.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
+ .getRelationshipsByType(RELATION_HYPERLINK)) {
xhtml.startElement("a", "href", rel.getTargetURI().toString());
xhtml.characters(rel.getTargetURI().toString());
xhtml.endElement("a");
@@ -713,101 +717,125 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
- protected void processShapes(List<XSSFShape> shapes, XHTMLContentHandler
xhtml)
+ protected void processDrawings(PackagePart sheetPart, XHTMLContentHandler
xhtml)
throws SAXException {
- if (shapes == null) {
- return;
- }
- //We don't currently have an obvious way to get drawings
- //directly from sheetIter. Therefore, we grab the shapes and process
those.
- //To get the diagrams and charts, we need to get the parent drawing
for each
- //shape, and we need to make sure that we only process each parent
shape once!
- //SEE TIKA-2703 TODO: add unit test
- Set<String> seenParentDrawings = new HashSet<>();
- for (XSSFShape shape : shapes) {
- if (shape instanceof XSSFSimpleShape) {
- String sText = ((XSSFSimpleShape) shape).getText();
- if (sText != null && sText.length() > 0) {
- xhtml.element("p", sText);
+ try {
+ for (PackageRelationship rel : sheetPart
+ .getRelationshipsByType(RELATION_DRAWING)) {
+ if (rel.getTargetMode() != TargetMode.INTERNAL) {
+ continue;
}
- extractHyperLinksFromShape(((XSSFSimpleShape)
shape).getCTShape(), xhtml);
- }
-
- XSSFDrawing parentDrawing = shape.getDrawing();
- if (parentDrawing != null) {
- if (!seenParentDrawings
-
.contains(parentDrawing.getPackagePart().getPartName().toString())) {
- //dump diagram data
-
handleGeneralTextContainingPart(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
- "diagram-data", parentDrawing.getPackagePart(),
metadata,
- new OOXMLWordAndPowerPointTextHandler(
- new OOXMLTikaBodyPartHandler(xhtml),
- new HashMap<>()//empty
- ));
- //dump chart data
-
handleGeneralTextContainingPart(XSSFRelation.CHART.getRelation(), "chart",
- parentDrawing.getPackagePart(), metadata,
- new OOXMLWordAndPowerPointTextHandler(
- new OOXMLTikaBodyPartHandler(xhtml),
- new HashMap<>()//empty
- ));
+ PackagePartName relName =
+ PackagingURIHelper.createPartName(rel.getTargetURI());
+ PackagePart drawingPart = rel.getPackage().getPart(relName);
+ if (drawingPart == null) {
+ continue;
+ }
+ // SAX-parse drawing XML for shape text and hyperlinks
+ try (InputStream is = drawingPart.getInputStream()) {
+ XMLReaderUtils.parseSAX(is,
+ new DrawingShapeHandler(xhtml, drawingHyperlinks),
+ parseContext);
+ } catch (IOException | TikaException e) {
+ //swallow
}
-
seenParentDrawings.add(parentDrawing.getPackagePart().getPartName().toString());
+ // Process diagram and chart data through drawing part
relationships
+ handleGeneralTextContainingPart(
+ AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
+ "diagram-data", drawingPart, metadata,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<>()));
+ handleGeneralTextContainingPart(RELATION_CHART, "chart",
+ drawingPart, metadata,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<>()));
}
+ } catch (InvalidFormatException e) {
+ //swallow
}
}
- private void extractHyperLinksFromShape(CTShape ctShape,
XHTMLContentHandler xhtml)
- throws SAXException {
-
- if (ctShape == null) {
- return;
- }
+ /**
+ * SAX handler for drawing XML that extracts shape text and hyperlinks
+ * without requiring XMLBeans or the POI usermodel (XSSFShape, etc.).
+ */
+ private static class DrawingShapeHandler extends DefaultHandler {
- CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
- if (nvSpPR == null) {
- return;
- }
+ private final XHTMLContentHandler xhtml;
+ private final Map<String, String> hyperlinks;
- CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
- if (cNvPr == null) {
- return;
- }
+ private boolean inTxBody;
+ private boolean inT;
+ private final StringBuilder textBuffer = new StringBuilder();
+ private final StringBuilder shapeText = new StringBuilder();
- CTHyperlink ctHyperlink = cNvPr.getHlinkClick();
- if (ctHyperlink == null) {
- return;
+ DrawingShapeHandler(XHTMLContentHandler xhtml, Map<String, String>
hyperlinks) {
+ this.xhtml = xhtml;
+ this.hyperlinks = hyperlinks;
}
- String url = drawingHyperlinks.get(ctHyperlink.getId());
- if (url != null) {
- xhtml.startElement("a", "href", url);
- xhtml.characters(url);
- xhtml.endElement("a");
+ @Override
+ public void startElement(String uri, String localName, String qName,
+ Attributes atts) throws SAXException {
+ if ("txBody".equals(localName)) {
+ inTxBody = true;
+ shapeText.setLength(0);
+ } else if ("t".equals(localName) && inTxBody) {
+ inT = true;
+ textBuffer.setLength(0);
+ } else if ("hlinkClick".equals(localName) ||
"hlinkHover".equals(localName)) {
+ String rId = atts.getValue(NS_RELATIONSHIPS, "id");
+ if (rId == null) {
+ // try non-namespace-aware fallback
+ rId = atts.getValue("r:id");
+ }
+ if (rId != null) {
+ String url = hyperlinks.get(rId);
+ if (url != null) {
+ xhtml.startElement("a", "href", url);
+ xhtml.characters(url);
+ xhtml.endElement("a");
+ }
+ }
+ }
}
- CTHyperlink ctHoverHyperlink = cNvPr.getHlinkHover();
- if (ctHoverHyperlink == null) {
- return;
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ if ("t".equals(localName) && inT) {
+ inT = false;
+ shapeText.append(textBuffer);
+ } else if ("p".equals(localName) && inTxBody &&
+ shapeText.length() > 0) {
+ shapeText.append('\n');
+ } else if ("txBody".equals(localName)) {
+ inTxBody = false;
+ String text = shapeText.toString().trim();
+ if (!text.isEmpty()) {
+ xhtml.element("p", text);
+ }
+ }
}
- url = drawingHyperlinks.get(ctHoverHyperlink.getId());
- if (url != null) {
- xhtml.startElement("a", "href", url);
- xhtml.characters(url);
- xhtml.endElement("a");
+ @Override
+ public void characters(char[] ch, int start, int length) {
+ if (inT) {
+ textBuffer.append(ch, start, length);
+ }
}
-
}
- public void processSheet(SheetContentsHandler sheetContentsHandler,
Comments comments,
- StylesTable styles, ReadOnlySharedStringsTable
strings,
+ public void processSheet(TikaSheetContentsHandler sheetContentsHandler,
+ XSSFCommentsShim commentsShim,
+ XSSFStylesShim stylesShim, XSSFSharedStringsShim
stringsShim,
InputStream sheetInputStream) throws IOException,
SAXException {
try {
-
XSSFSheetInterestingPartsCapturer handler = new
XSSFSheetInterestingPartsCapturer(
- new XSSFSheetXMLHandler(styles, comments, strings,
sheetContentsHandler,
- formatter, false));
+ new TikaSheetXMLHandler(stylesShim, commentsShim,
stringsShim,
+ sheetContentsHandler, formatter, false));
XMLReaderUtils.parseSAX(sheetInputStream, handler, parseContext);
sheetInputStream.close();
@@ -825,6 +853,33 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
+ /**
+ * Parse the comments XML for a sheet part via SAX, avoiding XMLBeans.
+ */
+ private XSSFCommentsShim parseSheetComments(PackagePart sheetPart) {
+ try {
+ PackageRelationshipCollection rels =
+ sheetPart.getRelationshipsByType(RELATION_COMMENTS);
+ if (rels.isEmpty()) {
+ return null;
+ }
+ PackageRelationship rel = rels.getRelationship(0);
+ PackagePartName partName =
+ PackagingURIHelper.createPartName(rel.getTargetURI());
+ PackagePart commentsPart = rel.getPackage().getPart(partName);
+ if (commentsPart == null) {
+ return null;
+ }
+ try (InputStream is = commentsPart.getInputStream()) {
+ return new XSSFCommentsShim(is, parseContext);
+ }
+ } catch (InvalidFormatException | IOException | TikaException |
SAXException e) {
+ //swallow — comments are not critical
+ return null;
+ }
+ }
+
+
/**
* In Excel files, sheets have things embedded in them,
* and sheet drawings which have the images
@@ -839,7 +894,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
// If it has drawings, return those too
try {
for (PackageRelationship rel : part
-
.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
+ .getRelationshipsByType(RELATION_DRAWING)) {
if (rel.getTargetMode() == TargetMode.INTERNAL) {
PackagePartName relName =
PackagingURIHelper.createPartName(rel.getTargetURI());
@@ -847,7 +902,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
for (PackageRelationship rel : part
-
.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) {
+ .getRelationshipsByType(RELATION_VML_DRAWING)) {
if (rel.getTargetMode() == TargetMode.INTERNAL) {
PackagePartName relName =
PackagingURIHelper.createPartName(rel.getTargetURI());
@@ -870,7 +925,8 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
/**
* Turns formatted sheet events into HTML
*/
- protected static class SheetTextAsHTML implements SheetContentsHandler {
+ protected static class SheetTextAsHTML
+ implements TikaSheetContentsHandler, SheetContentsHandler {
private final boolean includeHeadersFooters;
private final boolean includeMissingRows;
protected List<String> headers;
@@ -917,7 +973,8 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
- public void cell(String cellRef, String formattedValue, XSSFComment
comment) {
+ public void cell(String cellRef, String formattedValue,
+ XSSFCommentsShim.CommentData comment) {
try {
// Handle any missing cells
int colNum =
@@ -942,7 +999,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
xhtml.endElement("br");
xhtml.characters(comment.getAuthor());
xhtml.characters(": ");
- xhtml.characters(comment.getString().getString());
+ xhtml.characters(comment.getText());
}
xhtml.endElement("td");
@@ -951,6 +1008,21 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
+ /**
+ * Bridge for POI's {@link SheetContentsHandler} interface, used by the
+ * XLSB (binary) path via {@link
org.apache.poi.xssf.binary.XSSFBSheetHandler}.
+ */
+ public void cell(String cellRef, String formattedValue, XSSFComment
comment) {
+ XSSFCommentsShim.CommentData commentData = null;
+ if (comment != null) {
+ String text = comment.getString() != null ?
+ comment.getString().getString() : "";
+ commentData = new XSSFCommentsShim.CommentData(
+ comment.getAuthor(), text);
+ }
+ cell(cellRef, formattedValue, commentData);
+ }
+
public void headerFooter(String text, boolean isHeader, String
tagName) {
if (!includeHeadersFooters) {
return;
@@ -961,6 +1033,11 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
footers.add(text);
}
}
+
+ @Override
+ public void endSheet() {
+ // no-op — satisfies both TikaSheetContentsHandler and
SheetContentsHandler
+ }
}
protected static class HeaderFooterFromString implements HeaderFooter {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFSharedStringsShim.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFSharedStringsShim.java
new file mode 100644
index 0000000000..8556d0fbb3
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFSharedStringsShim.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * SAX-based replacement for POI's {@code ReadOnlySharedStringsTable},
+ * used by the XLSX event-based parsing path.
+ * <p>
+ * Reads {@code xl/sharedStrings.xml} and keeps every shared string item as
+ * a plain {@code String}, so the XMLBeans-backed {@code XSSFRichTextString}
+ * is not needed. All rich text runs inside one {@code <si>} are joined
+ * into a single string.
+ */
+class XSSFSharedStringsShim {
+
+    private final List<String> strings;
+    private final boolean includePhoneticRuns;
+
+    /**
+     * Parses the shared strings part and closes the stream afterwards.
+     *
+     * @param sharedStringsData the shared strings stream; may be null, which
+     *                          results in an empty table
+     * @param includePhoneticRuns whether text inside {@code <rPh>} elements
+     *                            is appended to the owning string
+     * @param parseContext parse context handed to the SAX parsing utility
+     */
+    XSSFSharedStringsShim(InputStream sharedStringsData,
+                          boolean includePhoneticRuns,
+                          ParseContext parseContext)
+            throws IOException, SAXException, TikaException {
+        this.includePhoneticRuns = includePhoneticRuns;
+        SharedStringsHandler collector = new SharedStringsHandler();
+        if (sharedStringsData != null) {
+            try {
+                XMLReaderUtils.parseSAX(sharedStringsData, collector, parseContext);
+            } finally {
+                sharedStringsData.close();
+            }
+        }
+        this.strings = collector.strings;
+    }
+
+    /**
+     * @return the shared string stored at position {@code idx}
+     */
+    String getItemAt(int idx) {
+        return strings.get(idx);
+    }
+
+    /**
+     * @return how many shared strings were read
+     */
+    int getCount() {
+        return strings.size();
+    }
+
+    /**
+     * Collects the text of each {@code <si>} element. Elements whose
+     * namespace URI is non-null and differs from the SpreadsheetML main
+     * namespace are ignored.
+     */
+    private class SharedStringsHandler extends DefaultHandler {
+
+        private static final String NS =
+                "http://schemas.openxmlformats.org/spreadsheetml/2006/main";
+
+        final ArrayList<String> strings = new ArrayList<>();
+        // stays null until the <sst> root has been seen
+        private StringBuilder current;
+        private boolean insideT;
+        private boolean insidePhonetic;
+
+        @Override
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes attributes) {
+            if (isForeign(uri)) {
+                return;
+            }
+            if ("sst".equals(localName)) {
+                reserveCapacity(attributes.getValue("uniqueCount"));
+                current = new StringBuilder(64);
+            } else if ("si".equals(localName)) {
+                if (current != null) {
+                    current.setLength(0);
+                }
+            } else if ("t".equals(localName)) {
+                insideT = true;
+            } else if ("rPh".equals(localName)) {
+                insidePhonetic = true;
+                // phonetic text is set off from the base text by one space
+                if (includePhoneticRuns && current != null && current.length() > 0) {
+                    current.append(" ");
+                }
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) {
+            if (isForeign(uri)) {
+                return;
+            }
+            if ("si".equals(localName)) {
+                if (current != null) {
+                    strings.add(current.toString());
+                }
+            } else if ("t".equals(localName)) {
+                insideT = false;
+            } else if ("rPh".equals(localName)) {
+                insidePhonetic = false;
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) {
+            if (!insideT || current == null) {
+                return;
+            }
+            if (insidePhonetic && !includePhoneticRuns) {
+                return;
+            }
+            current.append(ch, start, length);
+        }
+
+        /** True when the element carries a namespace other than SpreadsheetML main. */
+        private boolean isForeign(String uri) {
+            return uri != null && !NS.equals(uri);
+        }
+
+        /**
+         * Uses the {@code uniqueCount} attribute as a sizing hint for the
+         * list; unparseable values are ignored.
+         */
+        private void reserveCapacity(String uniqueCount) {
+            if (uniqueCount == null) {
+                return;
+            }
+            try {
+                int hint = (int) Long.parseLong(uniqueCount);
+                // guard against corrupt files with absurd counts
+                strings.ensureCapacity(Math.min(hint, 100_000));
+            } catch (NumberFormatException e) {
+                // ignore
+            }
+        }
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFStylesShim.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFStylesShim.java
new file mode 100644
index 0000000000..ca99c7512e
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFStylesShim.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.poi.ss.usermodel.BuiltinFormats;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * SAX-based replacement for POI's {@code StylesTable}, used by the XLSX
+ * event-based parsing path.
+ * <p>
+ * Reads {@code xl/styles.xml} and retains only what text extraction needs:
+ * the chain that resolves a cellXfs index to a number format string.
+ * This sidesteps the XMLBeans dependency that {@code StylesTable} brings in.
+ */
+class XSSFStylesShim {
+
+    /** Custom number formats declared under {@code <numFmts>}, keyed by id. */
+    private final Map<Short, String> numberFormats = new HashMap<>();
+    /** Number format id of each {@code <xf>} under {@code <cellXfs>}, in order. */
+    private final List<Short> cellXfFormatIds = new ArrayList<>();
+
+    /**
+     * Parses the styles part and closes the stream afterwards.
+     *
+     * @param stylesData the styles stream; may be null, which leaves the
+     *                   shim empty
+     * @param parseContext parse context handed to the SAX parsing utility
+     */
+    XSSFStylesShim(InputStream stylesData, ParseContext parseContext)
+            throws IOException, SAXException, TikaException {
+        if (stylesData != null) {
+            try {
+                XMLReaderUtils.parseSAX(stylesData, new StylesHandler(), parseContext);
+            } finally {
+                stylesData.close();
+            }
+        }
+    }
+
+    /**
+     * @return the number of cell formats read from {@code <cellXfs>}
+     */
+    int getNumCellStyles() {
+        return cellXfFormatIds.size();
+    }
+
+    /**
+     * @return the number format id for the given cell style index, or -1
+     *         when the index is out of range
+     */
+    short getFormatIndex(int styleIndex) {
+        if (styleIndex >= 0 && styleIndex < cellXfFormatIds.size()) {
+            return cellXfFormatIds.get(styleIndex);
+        }
+        return -1;
+    }
+
+    /**
+     * @return the format string for the given cell style index: a custom
+     *         format if one was declared for its id, otherwise whatever
+     *         {@code BuiltinFormats} returns; null when the index is out
+     *         of range
+     */
+    String getFormatString(int styleIndex) {
+        short fmtId = getFormatIndex(styleIndex);
+        if (fmtId == -1) {
+            return null;
+        }
+        String custom = numberFormats.get(fmtId);
+        if (custom != null) {
+            return custom;
+        }
+        return BuiltinFormats.getBuiltinFormat(fmtId);
+    }
+
+    /**
+     * Collects {@code <numFmt>} declarations inside {@code <numFmts>} and
+     * the {@code numFmtId} of each {@code <xf>} inside {@code <cellXfs>}.
+     * Elements outside the SpreadsheetML main namespace are ignored.
+     */
+    private class StylesHandler extends DefaultHandler {
+
+        private static final String NS =
+                "http://schemas.openxmlformats.org/spreadsheetml/2006/main";
+
+        private boolean inCellXfs;
+        private boolean inNumFmts;
+
+        @Override
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes attributes) {
+            if (!NS.equals(uri)) {
+                return;
+            }
+            if ("numFmts".equals(localName)) {
+                inNumFmts = true;
+            } else if ("numFmt".equals(localName)) {
+                if (inNumFmts) {
+                    recordNumFmt(attributes);
+                }
+            } else if ("cellXfs".equals(localName)) {
+                inCellXfs = true;
+            } else if ("xf".equals(localName)) {
+                if (inCellXfs) {
+                    recordCellXf(attributes);
+                }
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) {
+            if (!NS.equals(uri)) {
+                return;
+            }
+            if ("cellXfs".equals(localName)) {
+                inCellXfs = false;
+            } else if ("numFmts".equals(localName)) {
+                inNumFmts = false;
+            }
+        }
+
+        /** Stores one custom format; entries lacking either attribute or with a bad id are skipped. */
+        private void recordNumFmt(Attributes attributes) {
+            String rawId = attributes.getValue("numFmtId");
+            String formatCode = attributes.getValue("formatCode");
+            if (rawId == null || formatCode == null) {
+                return;
+            }
+            try {
+                numberFormats.put(Short.parseShort(rawId), formatCode);
+            } catch (NumberFormatException e) {
+                // skip malformed
+            }
+        }
+
+        /** Appends the xf's number format id, using 0 (General) when absent or unparseable. */
+        private void recordCellXf(Attributes attributes) {
+            short formatId = 0;
+            String rawId = attributes.getValue("numFmtId");
+            if (rawId != null) {
+                try {
+                    formatId = Short.parseShort(rawId);
+                } catch (NumberFormatException e) {
+                    // default to 0 (General)
+                }
+            }
+            cellXfFormatIds.add(formatId);
+        }
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 2dd8af7afc..dfb4c71924 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -60,7 +60,6 @@ import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.apache.xmlbeans.XmlCursor;
-import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFldChar;
@@ -124,7 +123,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
*/
@Override
protected void buildXHTML(XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
XWPFListManager listManager = new XWPFListManager(loadNumbering());
// headers
@@ -187,7 +186,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
private void extractIBodyText(IBody bodyElement, XWPFListManager
listManager,
XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
for (IBodyElement element : bodyElement.getBodyElements()) {
if (element instanceof XWPFParagraph) {
XWPFParagraph paragraph = (XWPFParagraph) element;
@@ -205,7 +204,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
}
private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
ISDTContent content = element.getContent();
String tag = "p";
xhtml.startElement(tag);
@@ -215,7 +214,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
private void extractParagraph(XWPFParagraph paragraph, XWPFListManager
listManager,
XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
// If this paragraph is actually a whole new section, then
// it could have its own headers and footers
// Check and handle if so
@@ -375,10 +374,14 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
//Note "w:txbxContent//"...must look for all descendant paragraphs
//not just the immediate children of txbxContent -- TIKA-2807
if (config.isIncludeShapeBasedContent()) {
- for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath(
- "declare namespace
w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare
namespace
wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape'
.//*/wps:txbx/w:txbxContent//w:p")) {
- extractParagraph(new
XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()),
- paragraph.getBody()), listManager, xhtml);
+ try {
+ for (XmlObject embeddedParagraph :
paragraph.getCTP().selectPath(
+ "declare namespace
w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare
namespace
wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape'
.//*/wps:txbx/w:txbxContent//w:p")) {
+ extractParagraph(new
XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()),
+ paragraph.getBody()), listManager, xhtml);
+ }
+ } catch (Exception e) {
+ // XmlException from CTP.Factory.parse — swallow for shape
content
}
}
@@ -397,7 +400,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
private void extractFootnoteEndnoteContent(XWPFParagraph paragraph,
XWPFListManager listManager,
XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
String nsW =
"http://schemas.openxmlformats.org/wordprocessingml/2006/main";
QName footnoteRefQName = new QName(nsW, "footnoteReference");
QName endnoteRefQName = new QName(nsW, "endnoteReference");
@@ -553,7 +556,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
private void processRun(XWPFRun run, XWPFParagraph paragraph,
XHTMLContentHandler xhtml,
Deque<FormattingUtils.Tag> formattingState)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
// open/close required tags if run changes formatting
FormattingUtils.ensureFormattingState(xhtml,
FormattingUtils.toTags(run), formattingState);
@@ -581,7 +584,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
}
private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
xhtml.characters(run.getContent().getText());
}
@@ -627,7 +630,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
private void extractTable(XWPFTable table, XWPFListManager listManager,
XHTMLContentHandler xhtml)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
xhtml.startElement("table");
xhtml.startElement("tbody");
for (XWPFTableRow row : table.getRows()) {
@@ -649,7 +652,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
private void extractFooters(XHTMLContentHandler xhtml,
XWPFHeaderFooterPolicy hfPolicy,
XWPFListManager listManager)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
// footers
if (hfPolicy.getFirstPageFooter() != null) {
extractHeaderText(xhtml, hfPolicy.getFirstPageFooter(),
listManager);
@@ -664,7 +667,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
private void extractHeaders(XHTMLContentHandler xhtml,
XWPFHeaderFooterPolicy hfPolicy,
XWPFListManager listManager)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
if (hfPolicy == null) {
return;
}
@@ -684,7 +687,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter
header,
XWPFListManager listManager)
- throws SAXException, XmlException, IOException {
+ throws SAXException, IOException {
for (IBodyElement e : header.getBodyElements()) {
if (e instanceof XWPFParagraph) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
index 705ce27487..a05698d969 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
@@ -25,7 +25,6 @@ import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
-import org.apache.xmlbeans.XmlException;
/**
* Currently, mostly a pass-through class to hold pkg and properties
@@ -37,10 +36,13 @@ public class XPSTextExtractor implements
POIXMLTextExtractor {
private final OPCPackage pkg;
private final POIXMLProperties properties;
- public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException,
XmlException, IOException {
+ public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException,
IOException {
this.pkg = pkg;
- this.properties = new POIXMLProperties(pkg);
-
+ try {
+ this.properties = new POIXMLProperties(pkg);
+ } catch (Exception e) {
+ throw new IOException("Failed to read OOXML properties", e);
+ }
}
@Override
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index 193f649a4e..3de987e67d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -25,7 +25,6 @@ import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
-import org.apache.xmlbeans.XmlException;
import org.apache.tika.parser.microsoft.ooxml.EditType;
import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
@@ -39,9 +38,13 @@ public class XSLFEventBasedPowerPointExtractor implements
POIXMLTextExtractor {
private POIXMLProperties properties;
public XSLFEventBasedPowerPointExtractor(OPCPackage container)
- throws XmlException, OpenXML4JException, IOException {
+ throws OpenXML4JException, IOException {
this.container = container;
- this.properties = new POIXMLProperties(container);
+ try {
+ this.properties = new POIXMLProperties(container);
+ } catch (Exception e) {
+ throw new IOException("Failed to read OOXML properties", e);
+ }
}
public OPCPackage getPackage() {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 2bb53a3c69..5a4676d631 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -34,7 +34,6 @@ import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.xmlbeans.XmlException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
@@ -64,7 +63,7 @@ public class XWPFEventBasedWordExtractor implements
POIXMLTextExtractor {
private POIXMLProperties properties;
public XWPFEventBasedWordExtractor(OPCPackage container)
- throws XmlException, OpenXML4JException, IOException {
+ throws OpenXML4JException, IOException {
this.container = container;
// Properties are lazily initialized to avoid requiring ooxml-lite
// when SAXBasedMetadataExtractor is used instead