This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit b983e3c1f62d581beecc568a2a877d27e15b79c2 Author: tallison <[email protected]> AuthorDate: Wed Mar 18 17:36:37 2026 -0400 improve sax ooxml - docx-tests - WIP --- .../tika/parser/microsoft/AbstractListManager.java | 2 +- .../microsoft/ooxml/FormattingTagManager.java | 22 +- .../microsoft/ooxml/OOXMLPictureTracker.java | 6 + .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 17 +- .../ooxml/OOXMLWordAndPowerPointTextHandler.java | 7 +- .../ooxml/SXWPFWordExtractorDecorator.java | 63 +- .../microsoft/ooxml/XWPFBodyContentsHandler.java | 2 +- .../parser/microsoft/ooxml/XWPFListManager.java | 122 +--- .../ooxml/XWPFWordExtractorDecorator.java | 24 +- .../xslf/XSLFEventBasedPowerPointExtractor.java | 2 +- .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 37 +- .../microsoft/ooxml/xwpf/XWPFFeatureExtractor.java | 8 +- .../microsoft/ooxml/xwpf/XWPFNumberingShim.java | 268 +++++++- .../parser/microsoft/ooxml/OOXMLParserTest.java | 730 --------------------- .../parser/microsoft/ooxml/OOXMLParserTest.java | 81 +-- .../parser/microsoft/ooxml/SXWPFExtractorTest.java | 100 --- 16 files changed, 431 insertions(+), 1060 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java index 2e5f8f21c4..12d806d09d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java @@ -248,7 +248,7 @@ public abstract class AbstractListManager { } } - protected static class LevelTuple { + public static class LevelTuple { private final int start; private final int restart; private final String lvlText; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java index cf8dfefce9..db88eedbae 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java @@ -22,9 +22,9 @@ import org.xml.sax.SAXException; import org.apache.tika.sax.XHTMLContentHandler; /** - * Manages XHTML formatting tags (b, i, u, strike) as a state machine, + * Manages XHTML formatting tags (b, i, u, s) as a state machine, * ensuring proper nesting. Tags are always ordered from outermost to innermost: - * {@code <b><i><strike><u>text</u></strike></i></b>}. + * {@code <b><i><s><u>text</u></s></i></b>}. * <p> * When a formatting change occurs, all tags that are "inside" the changing tag * must be closed first, then the change applied, then inner tags reopened. @@ -51,7 +51,7 @@ class FormattingTagManager { if (runProperties.isBold() != isBold) { // Bold is outermost — close everything inside it if (isStrikeThrough) { - xhtml.endElement("strike"); + xhtml.endElement("s"); isStrikeThrough = false; } if (isUnderline) { @@ -72,7 +72,7 @@ class FormattingTagManager { if (runProperties.isItalics() != isItalics) { if (isStrikeThrough) { - xhtml.endElement("strike"); + xhtml.endElement("s"); isStrikeThrough = false; } if (isUnderline) { @@ -93,9 +93,9 @@ class FormattingTagManager { isUnderline = false; } if (runProperties.isStrikeThrough()) { - xhtml.startElement("strike"); + xhtml.startElement("s"); } else { - xhtml.endElement("strike"); + xhtml.endElement("s"); } isStrikeThrough = runProperties.isStrikeThrough(); } @@ -113,17 +113,17 @@ class FormattingTagManager { /** * Closes all currently open formatting tags in proper nesting order - * (innermost first: u, strike, i, b). + * (innermost first: u, s, i, b). */ void closeAll() throws SAXException { - if (isStrikeThrough) { - xhtml.endElement("strike"); - isStrikeThrough = false; - } if (isUnderline) { xhtml.endElement("u"); isUnderline = false; } + if (isStrikeThrough) { + xhtml.endElement("s"); + isStrikeThrough = false; + } if (isItalics) { xhtml.endElement("i"); isItalics = false; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPictureTracker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPictureTracker.java index dbd55b9f9f..f6a98ecacf 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPictureTracker.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPictureTracker.java @@ -36,6 +36,7 @@ class OOXMLPictureTracker { private boolean inPict = false; private String picDescription = null; private String picRId = null; + private String lastImageDataRId = null; OOXMLPictureTracker(Map<String, String> linkedRelationships, XWPFBodyContentsHandler bodyContentsHandler) { @@ -69,6 +70,11 @@ class OOXMLPictureTracker { void setImageDataRId(String rId) { picRId = rId; + lastImageDataRId = rId; + } + + String getImageDataRId() { + return lastImageDataRId; } void setImageDataDescription(String description) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java index 0e33acc12c..e104867db4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java @@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.math.BigInteger; import java.util.Date; +import java.util.HashMap; import java.util.Map; import org.xml.sax.SAXException; @@ -73,6 +74,7 @@ public class OOXMLTikaBodyPartHandler private ParseContext parseContext = null; private final java.util.List<String> pendingCommentIds = new java.util.ArrayList<>(); private final java.util.Set<String> emittedCommentIds = new java.util.HashSet<>(); + private final Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap = new HashMap<>(); public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml) { this(xhtml, null); @@ -347,10 +349,19 @@ public class OOXMLTikaBodyPartHandler } @Override - public void embeddedOLERef(String relId) throws SAXException { + public void embeddedOLERef(String relId, String progId, String emfImageRId) + throws SAXException { if (relId == null) { return; } + if ((progId != null && !progId.isEmpty()) || + (emfImageRId != null && !emfImageRId.isEmpty())) { + EmbeddedPartMetadata epm = new EmbeddedPartMetadata(emfImageRId); + if (progId != null && !progId.isEmpty()) { + epm.setProgId(progId); + } + embeddedPartMetadataMap.put(relId, epm); + } AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", relId); @@ -358,6 +369,10 @@ public class OOXMLTikaBodyPartHandler xhtml.endElement("div"); } + public Map<String, EmbeddedPartMetadata> getEmbeddedPartMetadataMap() { + return embeddedPartMetadataMap; + } + @Override public void linkedOLERef(String relId) throws SAXException { if (relId == null) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java index 4eb507c4fb..7b0b5ceea5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java @@ -352,7 +352,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } else if (OLE_OBJECT.equals(localName)) { //check for O_NS? String type = null; String refId = null; - //TODO: clean this up and ...want to get ProgID? + String progId = null; for (int i = 0; i < atts.getLength(); i++) { String attLocalName = atts.getLocalName(i); String attValue = atts.getValue(i); @@ -361,10 +361,13 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) { refId = attValue; + } else if ("ProgID".equals(attLocalName)) { + progId = attValue; } } if ("Embed".equals(type)) { - bodyContentsHandler.embeddedOLERef(refId); + String emfRId = pictureTracker.getImageDataRId(); + bodyContentsHandler.embeddedOLERef(refId, progId, emfRId); } else if ("Link".equals(type)) { bodyContentsHandler.linkedOLERef(refId); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index 3dc076457b..1aca8a9647 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -20,20 +20,19 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.zip.ZipException; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; -import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.openxml4j.opc.TargetMode; import org.apache.poi.xssf.usermodel.XSSFRelation; -import org.apache.poi.xwpf.usermodel.XWPFNumbering; import org.apache.poi.xwpf.usermodel.XWPFRelation; import org.apache.xmlbeans.XmlException; import org.xml.sax.Attributes; @@ -41,11 +40,14 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.EMFParser; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor; +import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFFeatureExtractor; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim; import org.apache.tika.sax.EmbeddedContentHandler; @@ -96,6 +98,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private final OPCPackage opcPackage; private final ParseContext context; private final Metadata metadata; + private final Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap = new HashMap<>(); public SXWPFWordExtractorDecorator(Metadata metadata, ParseContext context, @@ -106,6 +109,10 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { this.opcPackage = extractor.getPackage(); } + @Override + public MetadataExtractor getMetadataExtractor() { + return new SAXBasedMetadataExtractor(opcPackage, context); + } @Override protected void buildXHTML(XHTMLContentHandler xhtml) @@ -145,6 +152,9 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { * subdocuments, and framesets. */ private void detectSecurityFeatures(PackagePart documentPart, XHTMLContentHandler xhtml) { + // Extract document features (hidden text, track changes, comments, comment persons) + new XWPFFeatureExtractor().process(documentPart, metadata, context); + // Check for attached template (external template reference) try { PackageRelationshipCollection templateRels = @@ -236,8 +246,9 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private void handleDocumentPart(PackagePart documentPart, XHTMLContentHandler xhtml) throws IOException, SAXException { //load the numbering/list manager and styles from the main document part - XWPFNumbering numbering = loadNumbering(documentPart); - XWPFListManager listManager = new XWPFListManager(numbering); + XWPFNumberingShim numbering = loadNumbering(documentPart); + XWPFListManager listManager = new XWPFListManager( + numbering != null ? numbering : XWPFNumberingShim.EMPTY); XWPFStylesShim styles = null; try { styles = loadStyles(documentPart); @@ -249,6 +260,9 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } if (config.isIncludeHeadersAndFooters()) { + //TODO: the DOM extractor handles per-section headers/footers by detecting + // sectPr within paragraphs. We extract all headers/footers at the document level, + // which is fine for text extraction since OOXML is flow-based, not page-based. //headers try { PackageRelationshipCollection headersPRC = @@ -362,9 +376,44 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } + Map<String, EmbeddedPartMetadata> partMetadata = bodyHandler.getEmbeddedPartMetadataMap(); + resolveEmfNames(packagePart, partMetadata); + embeddedPartMetadataMap.putAll(partMetadata); return bodyHandler; } + private void resolveEmfNames(PackagePart documentPart, + Map<String, EmbeddedPartMetadata> metadataMap) { + for (EmbeddedPartMetadata epm : metadataMap.values()) { + String emfRId = epm.getEmfRelationshipId(); + if (emfRId == null || emfRId.isEmpty()) { + continue; + } + try { + PackagePart emfPart = documentPart.getRelatedPart( + documentPart.getRelationship(emfRId)); + if (emfPart == null || emfPart.getContentType() == null) { + continue; + } + if ("image/x-emf".equals(emfPart.getContentType())) { + try (TikaInputStream tis = TikaInputStream.get(emfPart.getInputStream())) { + EMFParser p = new EMFParser(); + Metadata m = Metadata.newInstance(context); + p.parse(tis, new org.apache.tika.sax.ToTextContentHandler(), m, context); + epm.setFullName(m.get(EMFParser.EMF_ICON_STRING)); + } + } + } catch (Exception e) { + //swallow + } + } + } + + @Override + protected Map<String, EmbeddedPartMetadata> getEmbeddedPartMetadataMap() { + return embeddedPartMetadataMap; + } + private OOXMLInlineBodyPartMap collectInlineParts(PackagePart documentPart) { Map<String, String> allRelationships = new java.util.HashMap<>(); Map<String, byte[]> footnoteMap = collectPartContent(documentPart, @@ -440,7 +489,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } - private XWPFNumbering loadNumbering(PackagePart packagePart) { + private XWPFNumberingShim loadNumbering(PackagePart packagePart) { try { PackageRelationshipCollection numberingParts = packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation()); @@ -453,9 +502,9 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { if (numberingPart == null) { return null; } - return new XWPFNumberingShim(numberingPart); + return new XWPFNumberingShim(numberingPart, context); } - } catch (IOException | OpenXML4JException e) { + } catch (IOException | InvalidFormatException | TikaException | SAXException e) { //swallow } return null; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java index 2bd479d0a0..a9eb400e98 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java @@ -85,7 +85,7 @@ public interface XWPFBodyContentsHandler { boolean isIncludeMoveFromText() throws SAXException; - void embeddedOLERef(String refId) throws SAXException; + void embeddedOLERef(String refId, String progId, String emfImageRId) throws SAXException; /** * Called when a linked (vs embedded) OLE object is found. diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java index 383c31f0f7..b397a20333 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java @@ -18,17 +18,10 @@ package org.apache.tika.parser.microsoft.ooxml; import java.math.BigInteger; -import org.apache.poi.xwpf.usermodel.XWPFAbstractNum; -import org.apache.poi.xwpf.usermodel.XWPFNum; -import org.apache.poi.xwpf.usermodel.XWPFNumbering; import org.apache.poi.xwpf.usermodel.XWPFParagraph; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTAbstractNum; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNum; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl; import org.apache.tika.parser.microsoft.AbstractListManager; +import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim; public class XWPFListManager extends AbstractListManager { @@ -38,26 +31,10 @@ public class XWPFListManager extends AbstractListManager { * Always returns empty string. */ public final static XWPFListManager EMPTY_LIST = new EmptyListManager(); - private final static boolean OVERRIDE_AVAILABLE; - private final static String SKIP_FORMAT = Character.toString((char) 61623); -//if this shows up as the lvlText, don't show a number - - static { - boolean b = false; - try { - Class.forName("org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl"); - b = true; - } catch (ClassNotFoundException e) { - //swallow - } - b = OVERRIDE_AVAILABLE = false; - - } - private final XWPFNumbering numbering; + private final XWPFNumberingShim numbering; - //map of numId (which paragraph series is this a member of?), levelcounts - public XWPFListManager(XWPFNumbering numbering) { + public XWPFListManager(XWPFNumberingShim numbering) { this.numbering = numbering; } @@ -76,23 +53,21 @@ public class XWPFListManager extends AbstractListManager { } int currNumId = numId.intValue(); - - XWPFNum xwpfNum = numbering.getNum(numId); - - if (xwpfNum == null) { + int currAbNumId = numbering.getAbstractNumId(currNumId); + if (currAbNumId < 0) { return ""; } - CTNum ctNum = xwpfNum.getCTNum(); - CTDecimalNumber abNum = ctNum.getAbstractNumId(); - int currAbNumId = abNum.getVal().intValue(); ParagraphLevelCounter lc = listLevelMap.get(currAbNumId); LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId); if (lc == null) { - lc = loadLevelTuples(abNum); + lc = loadLevelTuples(currAbNumId); + if (lc == null) { + return ""; + } } if (overrideTuples == null) { - overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels()); + overrideTuples = numbering.getOverrideLevels(currNumId, lc.getNumberOfLevels()); } String formattedString = lc.incrementLevel(iLvl, overrideTuples); @@ -104,85 +79,14 @@ public class XWPFListManager extends AbstractListManager { } - private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) { - LevelTuple[] levelTuples = new LevelTuple[length]; - int overrideLength = ctNum.sizeOfLvlOverrideArray(); - if (overrideLength == 0) { + private ParagraphLevelCounter loadLevelTuples(int abstractNumId) { + LevelTuple[] levels = numbering.getAbstractNumLevels(abstractNumId); + if (levels == null) { return null; } - for (int i = 0; i < length; i++) { - LevelTuple tuple; - if (i >= overrideLength) { - tuple = new LevelTuple("%" + i + "."); - } else { - CTNumLvl ctNumLvl = ctNum.getLvlOverrideArray(i); - if (ctNumLvl != null) { - tuple = buildTuple(i, ctNumLvl.getLvl()); - } else { - tuple = new LevelTuple("%" + i + "."); - } - } - levelTuples[i] = tuple; - } - return levelTuples; - } - - - private ParagraphLevelCounter loadLevelTuples(CTDecimalNumber abNum) { - //Unfortunately, we need to go this far into the underlying structure - //to get the abstract num information for the edge case where - //someone skips a level and the format is not context-free, e.g. "1.B.i". - XWPFAbstractNum abstractNum = numbering.getAbstractNum(abNum.getVal()); - CTAbstractNum ctAbstractNum = abstractNum.getCTAbstractNum(); - - LevelTuple[] levels = new LevelTuple[ctAbstractNum.sizeOfLvlArray()]; - for (int i = 0; i < levels.length; i++) { - levels[i] = buildTuple(i, ctAbstractNum.getLvlArray(i)); - } return new ParagraphLevelCounter(levels); } - private LevelTuple buildTuple(int level, CTLvl ctLvl) { - boolean isLegal = false; - int start = 1; - int restart = -1; - String lvlText = "%" + level + "."; - String numFmt = "decimal"; - - - if (ctLvl != null && ctLvl.getIsLgl() != null) { - isLegal = true; - } - - if (ctLvl != null && ctLvl.getNumFmt() != null && ctLvl.getNumFmt().getVal() != null) { - numFmt = ctLvl.getNumFmt().getVal().toString(); - } - if (ctLvl != null && ctLvl.getLvlRestart() != null && - ctLvl.getLvlRestart().getVal() != null) { - restart = ctLvl.getLvlRestart().getVal().intValue(); - } - if (ctLvl != null && ctLvl.getStart() != null && ctLvl.getStart().getVal() != null) { - start = ctLvl.getStart().getVal().intValue(); - } else { - - //this is a hack. Currently, this gets the lowest possible - //start for a given numFmt. We should probably try to grab the - //restartNumberingAfterBreak value in - //e.g. <w:abstractNum w:abstractNumId="12" w15:restartNumberingAfterBreak="0">??? - if ("decimal".equals(numFmt) || "ordinal".equals(numFmt) || - "decimalZero".equals(numFmt)) { - start = 0; - } else { - start = 1; - } - } - if (ctLvl != null && ctLvl.getLvlText() != null && ctLvl.getLvlText().getVal() != null) { - lvlText = ctLvl.getLvlText().getVal(); - } - return new LevelTuple(start, restart, lvlText, numFmt, isLegal); - } - - private static class EmptyListManager extends XWPFListManager { EmptyListManager() { super(null); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java index f40ee517e2..2dd8af7afc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java @@ -29,6 +29,7 @@ import com.microsoft.schemas.vml.impl.CTShapeImpl; import org.apache.poi.ooxml.POIXMLDocumentPart; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.xssf.usermodel.XSSFRelation; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; @@ -81,6 +82,7 @@ import org.apache.tika.parser.microsoft.FormattingUtils; import org.apache.tika.parser.microsoft.WordExtractor; import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFFeatureExtractor; +import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim; import org.apache.tika.sax.ToTextContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.StringUtils; @@ -124,7 +126,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException { XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); - XWPFListManager listManager = new XWPFListManager(document.getNumbering()); + XWPFListManager listManager = new XWPFListManager(loadNumbering()); // headers if (hfPolicy != null && config.isIncludeHeadersAndFooters()) { extractHeaders(xhtml, hfPolicy, listManager); @@ -153,6 +155,26 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } } + private XWPFNumberingShim loadNumbering() { + try { + PackageRelationshipCollection numberingParts = + document.getPackagePart().getRelationshipsByType( + XWPFRelation.NUMBERING.getRelation()); + if (numberingParts.size() > 0) { + PackageRelationship rel = numberingParts.getRelationship(0); + if (rel != null) { + PackagePart numberingPart = document.getPackagePart().getRelatedPart(rel); + if (numberingPart != null) { + return new XWPFNumberingShim(numberingPart, getParseContext()); + } + } + } + } catch (Exception e) { + //swallow + } + return XWPFNumberingShim.EMPTY; + } + private void extractFeatures(XWPFDocument document, Metadata metadata) { XWPFFeatureExtractor ex = new XWPFFeatureExtractor(); ex.process(document, metadata, getParseContext()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java index 8d8db557b9..193f649a4e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java @@ -203,7 +203,7 @@ public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor { @Override - public void embeddedOLERef(String refId) { + public void embeddedOLERef(String refId, String progId, String emfImageRId) { //no-op } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java index 9b30ce15b1..2bb53a3c69 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java @@ -33,7 +33,6 @@ import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; -import org.apache.poi.xwpf.usermodel.XWPFNumbering; import org.apache.poi.xwpf.usermodel.XWPFRelation; import org.apache.xmlbeans.XmlException; import org.slf4j.Logger; @@ -67,7 +66,19 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { public XWPFEventBasedWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { this.container = container; - this.properties = new POIXMLProperties(container); + // Properties are lazily initialized to avoid requiring ooxml-lite + // when SAXBasedMetadataExtractor is used instead + } + + private POIXMLProperties getOrCreateProperties() { + if (properties == null) { + try { + properties = new POIXMLProperties(container); + } catch (Exception e) { + LOG.warn("Couldn't load properties", e); + } + } + return properties; } public OPCPackage getPackage() { @@ -75,15 +86,18 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { } public POIXMLProperties.CoreProperties getCoreProperties() { - return this.properties.getCoreProperties(); + POIXMLProperties props = getOrCreateProperties(); + return props != null ? props.getCoreProperties() : null; } public POIXMLProperties.ExtendedProperties getExtendedProperties() { - return this.properties.getExtendedProperties(); + POIXMLProperties props = getOrCreateProperties(); + return props != null ? props.getExtendedProperties() : null; } public POIXMLProperties.CustomProperties getCustomProperties() { - return this.properties.getCustomProperties(); + POIXMLProperties props = getOrCreateProperties(); + return props != null ? props.getCustomProperties() : null; } @Override @@ -160,8 +174,9 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { private void handleDocumentPart(PackagePart documentPart, StringBuilder sb) throws IOException, SAXException, TikaException { //load the numbering/list manager and styles from the main document part - XWPFNumbering numbering = loadNumbering(documentPart); - XWPFListManager xwpfListManager = new XWPFListManager(numbering); + XWPFNumberingShim numbering = loadNumbering(documentPart); + XWPFListManager xwpfListManager = new XWPFListManager( + numbering != null ? numbering : XWPFNumberingShim.EMPTY); //TODO: XWPFStyles styles = loadStyles(documentPart); //headers @@ -234,7 +249,7 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { return hyperlinks; } - private XWPFNumbering loadNumbering(PackagePart packagePart) throws IOException { + private XWPFNumberingShim loadNumbering(PackagePart packagePart) { try { PackageRelationshipCollection numberingParts = packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation()); @@ -247,9 +262,9 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { if (numberingPart == null) { return null; } - return new XWPFNumbering(numberingPart); + return new XWPFNumberingShim(numberingPart, new ParseContext()); } - } catch (OpenXML4JException e) { + } catch (Exception e) { LOG.warn("Couldn't load numbering", e); } return null; @@ -365,7 +380,7 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { } @Override - public void embeddedOLERef(String refId) { + public void embeddedOLERef(String refId, String progId, String emfImageRId) { //no-op } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java index 06e45afb72..3cb1935cc6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFFeatureExtractor.java @@ -21,6 +21,7 @@ import java.io.InputStream; import java.util.HashSet; import java.util.Set; +import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.xml.sax.Attributes; import org.xml.sax.SAXException; @@ -42,8 +43,11 @@ import org.apache.tika.utils.XMLReaderUtils; public class XWPFFeatureExtractor { public void process(XWPFDocument xwpfDocument, Metadata metadata, ParseContext parseContext) { - try (InputStream is = xwpfDocument.getPackagePart() - .getInputStream()) { + process(xwpfDocument.getPackagePart(), metadata, parseContext); + } + + public void process(PackagePart packagePart, Metadata metadata, ParseContext parseContext) { + try (InputStream is = packagePart.getInputStream()) { FeatureHandler featureHandler = new FeatureHandler(); XMLReaderUtils.parseSAX(is, featureHandler, parseContext); if (featureHandler.hasComments) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFNumberingShim.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFNumberingShim.java index f563223964..7d8d15fc7f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFNumberingShim.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFNumberingShim.java @@ -17,20 +17,274 @@ package org.apache.tika.parser.microsoft.ooxml.xwpf; import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; -import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.PackagePart; -import org.apache.poi.xwpf.usermodel.XWPFNumbering; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.AbstractListManager.LevelTuple; +import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler; +import org.apache.tika.utils.XMLReaderUtils; /** - * Stub class of POI's XWPFNumbering because onDocumentRead() is protected + * SAX-based parser for numbering.xml that replaces the XMLBeans-dependent + * POI XWPFNumbering. This eliminates the need for ooxml-lite schema classes + * in the SAX parsing chain. + * <p> + * Follows the same pattern as {@link XWPFStylesShim}. */ -public class XWPFNumberingShim extends XWPFNumbering { +public class XWPFNumberingShim { + + public static final XWPFNumberingShim EMPTY = new EmptyNumberingShim(); - public XWPFNumberingShim(PackagePart part) throws IOException, OpenXML4JException { - super(part); - onDocumentRead(); + // abstractNumId -> list of LevelTuples (indexed by ilvl) + private final Map<Integer, LevelTuple[]> abstractNumLevels = new HashMap<>(); + // numId -> abstractNumId + private final Map<Integer, Integer> numToAbstractNum = new HashMap<>(); + // numId -> override LevelTuples (indexed by ilvl), null entries for non-overridden levels + private final Map<Integer, Map<Integer, LevelTuple>> overrideLevels = new HashMap<>(); + + private XWPFNumberingShim() { } + public XWPFNumberingShim(PackagePart part, ParseContext parseContext) + throws IOException, TikaException, SAXException { + try (InputStream is = part.getInputStream()) { + XMLReaderUtils.parseSAX(is, new NumberingHandler(), parseContext); + } + } + /** + * @return the abstractNumId for the given numId, or -1 if not found + */ + public int getAbstractNumId(int numId) { + Integer id = numToAbstractNum.get(numId); + return id != null ? id : -1; + } + + /** + * @return the level tuples for the given abstractNumId, or null if not found + */ + public LevelTuple[] getAbstractNumLevels(int abstractNumId) { + return abstractNumLevels.get(abstractNumId); + } + + /** + * Build override level tuples array for a given numId with the specified length. + * Returns null if there are no overrides for this numId. + */ + public LevelTuple[] getOverrideLevels(int numId, int length) { + Map<Integer, LevelTuple> overrides = overrideLevels.get(numId); + if (overrides == null || overrides.isEmpty()) { + return null; + } + LevelTuple[] result = new LevelTuple[length]; + for (int i = 0; i < length; i++) { + LevelTuple override = overrides.get(i); + if (override != null) { + result[i] = override; + } else { + result[i] = new LevelTuple("%" + i + "."); + } + } + return result; + } + + private static class EmptyNumberingShim extends XWPFNumberingShim { + @Override + public int getAbstractNumId(int numId) { + return -1; + } + + @Override + public LevelTuple[] getAbstractNumLevels(int abstractNumId) { + return null; + } + + @Override + public LevelTuple[] getOverrideLevels(int numId, int length) { + return null; + } + } + + private class NumberingHandler extends DefaultHandler { + + private static final String W_NS = OOXMLWordAndPowerPointTextHandler.W_NS; + + // Current context + private boolean inAbstractNum = false; + private int currentAbstractNumId = -1; + private boolean inNum = false; + private int currentNumId = -1; + private boolean inLvl = false; + private boolean inLvlOverride = false; + private int currentIlvl = -1; + + // Level accumulators (reset for each lvl element) + private int lvlStart = -1; + private int lvlRestart = -1; + private String lvlText = null; + private String lvlNumFmt = null; + private boolean lvlIsLegal = false; + + // Collecting levels for current abstractNum + private final Map<Integer, LevelTuple> currentAbstractLevels = new HashMap<>(); + // Collecting overrides for current num + private final Map<Integer, LevelTuple> currentOverrides = new HashMap<>(); + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) + throws SAXException { + if (!W_NS.equals(uri)) { + return; + } + + switch (localName) { + case "abstractNum": + inAbstractNum = true; + currentAbstractNumId = getIntAttr(atts, W_NS, "abstractNumId", -1); + currentAbstractLevels.clear(); + break; + case "num": + inNum = true; + currentNumId = getIntAttr(atts, W_NS, "numId", -1); + currentOverrides.clear(); + break; + case "lvlOverride": + if (inNum) { + inLvlOverride = true; + currentIlvl = getIntAttr(atts, W_NS, "ilvl", -1); + } + break; + case "lvl": + inLvl = true; + currentIlvl = getIntAttr(atts, W_NS, "ilvl", -1); + // Reset accumulators + lvlStart = -1; + lvlRestart = -1; + lvlText = null; + lvlNumFmt = null; + lvlIsLegal = false; + break; + case "start": + if (inLvl) { + lvlStart = getIntAttr(atts, W_NS, "val", -1); + } + break; + case "numFmt": + if (inLvl) { + lvlNumFmt = atts.getValue(W_NS, "val"); + } + break; + case "lvlText": + if (inLvl) { + lvlText = atts.getValue(W_NS, "val"); + } + break; + case "lvlRestart": + if (inLvl) { + lvlRestart = getIntAttr(atts, W_NS, "val", -1); + } + break; + case "isLgl": + if (inLvl) { + lvlIsLegal = true; + } + break; + case "abstractNumId": + if (inNum && !inLvl) { + int absId = getIntAttr(atts, W_NS, "val", -1); + if (currentNumId >= 0 && absId >= 0) { + numToAbstractNum.put(currentNumId, absId); + } + } + break; + default: + break; + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if (!W_NS.equals(uri)) { + return; + } + switch (localName) { + case "lvl": + if (inLvl && currentIlvl >= 0) { + LevelTuple tuple = buildLevelTuple(currentIlvl); + if (inLvlOverride && inNum) { + currentOverrides.put(currentIlvl, tuple); + } else if (inAbstractNum) { + currentAbstractLevels.put(currentIlvl, tuple); + } + } + inLvl = false; + break; + case "lvlOverride": + inLvlOverride = false; + break; + case "abstractNum": + if (inAbstractNum && currentAbstractNumId >= 0 && + !currentAbstractLevels.isEmpty()) { + int maxLevel = currentAbstractLevels.keySet().stream() + .mapToInt(Integer::intValue).max().orElse(-1); + LevelTuple[] levels = new LevelTuple[maxLevel + 1]; + for (int i = 0; i <= maxLevel; i++) { + LevelTuple t = currentAbstractLevels.get(i); + levels[i] = t != null ? t : new LevelTuple("%" + i + "."); + } + abstractNumLevels.put(currentAbstractNumId, levels); + } + inAbstractNum = false; + currentAbstractNumId = -1; + break; + case "num": + if (inNum && currentNumId >= 0 && !currentOverrides.isEmpty()) { + overrideLevels.put(currentNumId, new HashMap<>(currentOverrides)); + } + inNum = false; + currentNumId = -1; + break; + default: + break; + } + } + + private LevelTuple buildLevelTuple(int level) { + int start = lvlStart; + int restart = lvlRestart; + String text = lvlText != null ? lvlText : "%" + level + "."; + String numFmt = lvlNumFmt != null ? lvlNumFmt : "decimal"; + + if (start < 0) { + // Same hack as XWPFListManager.buildTuple + if ("decimal".equals(numFmt) || "ordinal".equals(numFmt) || + "decimalZero".equals(numFmt)) { + start = 0; + } else { + start = 1; + } + } + return new LevelTuple(start, restart, text, numFmt, lvlIsLegal); + } + + private int getIntAttr(Attributes atts, String ns, String localName, int defaultVal) { + String val = atts.getValue(ns, localName); + if (val == null) { + return defaultVal; + } + try { + return Integer.parseInt(val); + } catch (NumberFormatException e) { + return defaultVal; + } + } + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index ecbe1956ec..cf12fbc1d7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -16,18 +16,11 @@ */ package org.apache.tika.parser.microsoft.ooxml; -import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.PrintStream; -import java.io.StringWriter; import java.text.DecimalFormatSymbols; import java.util.Arrays; import java.util.HashMap; @@ -38,10 +31,6 @@ import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.sax.SAXTransformerFactory; -import javax.xml.transform.sax.TransformerHandler; -import javax.xml.transform.stream.StreamResult; import org.apache.poi.util.LocaleUtil; import org.junit.jupiter.api.AfterAll; @@ -56,7 +45,6 @@ import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; import org.apache.tika.exception.EncryptedDocumentException; -import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.DublinCore; import org.apache.tika.metadata.Metadata; @@ -64,7 +52,6 @@ import org.apache.tika.metadata.Office; import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; @@ -73,7 +60,6 @@ import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.OfficeParserTest; import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.utils.XMLReaderUtils; public class OOXMLParserTest extends MultiThreadedTikaTest { @@ -330,157 +316,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { } } - /** - * Test the plain text output of the Word converter - * - * @throws Exception - */ - @Test - public void testWord() throws Exception { - Metadata metadata = new Metadata(); - String content = getText("testWORD.docx", metadata, new ParseContext()); - assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); - assertTrue(content.contains("Sample Word Document")); - - } - - /** - * Test the plain text output of the Word converter - * - * @throws Exception - */ - @Test - public void testWordFootnote() throws Exception { - XMLResult xmlResult = getXML("footnotes.docx"); - assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", - xmlResult.metadata.get(Metadata.CONTENT_TYPE)); - assertTrue(xmlResult.xml.contains("snoska")); - //TIKA-4657 -- footnote content should be in a div with class "footnote" - // and should not be nested inside the paragraph - assertContains("<div class=\"footnote\">", xmlResult.xml); - assertNotContained("<p><div class=\"footnote\">", xmlResult.xml); - } - - @Test - public void testEndnoteWithTable() throws Exception { - XMLResult xmlResult = getXML("testWORD_endnote_table.docx"); - assertContains("Cat Property Act", xmlResult.xml); - //TIKA-4657 -- endnote content should be in a div with class "endnote" - assertContains("<div class=\"endnote\">", xmlResult.xml); - } - - /** - * Test that the word converter is able to generate the - * correct HTML for the document - */ - @Test - public void testWordHTML() throws Exception { - XMLResult result = getXML("testWORD.docx"); - String xml = result.xml; - Metadata metadata = result.metadata; - assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); - assertTrue(xml.contains("Sample Word Document")); - - // Check that custom headings came through - assertTrue(xml.contains("<h1 class=\"title\">")); - // Regular headings - assertTrue(xml.contains("<h1>Heading Level 1</h1>")); - assertTrue(xml.contains("<h2>Heading Level 2</h2>")); - // Headings with anchor tags in them - assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>")); - // Bold and italic - assertTrue(xml.contains("<b>BOLD</b>")); - assertTrue(xml.contains("<i>ITALIC</i>")); - // Table - assertTrue(xml.contains("<table>")); - assertTrue(xml.contains("<td>")); - // Links - assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>")); - // Anchor links - assertContains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>", xml); - // Paragraphs with other styles - assertTrue(xml.contains("<p class=\"signature\">This one")); - - result = getXML("testWORD_3imgs.docx"); - xml = result.xml; - - // Images 2-4 (there is no 1!) - assertTrue(xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />"), - "Image not found in:\n" + xml); - assertTrue(xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />"), - "Image not found in:\n" + xml); - assertTrue(xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />"), - "Image not found in:\n" + xml); - - // Text too - assertTrue(xml.contains("<p>The end!</p>")); - - // TIKA-692: test document containing multiple - // character runs within a bold tag: - xml = getXML("testWORD_bold_character_runs.docx").xml; - - // Make sure bold text arrived as single - // contiguous string even though Word parser - // handled this as 3 character runs - assertTrue(xml.contains("F<b>oob</b>a<b>r</b>"), "Bold text wasn't contiguous: " + xml); - - // TIKA-692: test document containing multiple - // character runs within a bold tag: - xml = getXML("testWORD_bold_character_runs2.docx").xml; - - // Make sure bold text arrived as single - // contiguous string even though Word parser - // handled this as 3 character runs - assertTrue(xml.contains("F<b>oob</b>a<b>r</b>"), "Bold text wasn't contiguous: " + xml); - } - - /** - * Test that we can extract image from docx header - */ - @Test - public void testWordPicturesInHeader() throws Exception { - List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx"); - assertEquals(2, metadataList.size()); - Metadata m = metadataList.get(0); - String mainContent = m.get(TikaCoreProperties.TIKA_CONTENT); - assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", - m.get(Metadata.CONTENT_TYPE)); - // Check that custom headings came through - assertTrue(mainContent.contains("<img")); - } - - @Test - @Disabled("need to add links in xhtml") - public void testPicturesInVariousPlaces() throws Exception { - //test that images are actually extracted from - //headers, footers, comments, endnotes, footnotes - List<Metadata> metadataList = getRecursiveMetadata("testWORD_embedded_pics.docx"); - - //only process embedded resources once - assertEquals(3, metadataList.size()); - String content = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT); - for (int i = 1; i < 4; i++) { - assertContains("header" + i + "_pic", content); - assertContains("footer" + i + "_pic", content); - } - assertContains("body_pic.jpg", content); - assertContains("sdt_pic.jpg", content); - assertContains("deeply_embedded_pic", content); - assertContains("deleted_pic", content);//TODO: don't extract this - assertContains("footnotes_pic", content); - assertContains("comments_pic", content); - assertContains("endnotes_pic", content); -// assertContains("sdt2_pic.jpg", content);//name of file is not stored in image-sdt - - assertContainsCount("<img src=", content, 14); - } - /** * Documents with some sheets are protected, but not all. * See TIKA-364. @@ -513,130 +348,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertContains("Office", xmlResult.xml); } - /** - * Test docx without headers - * TIKA-633 - */ - @Test - public void testNullHeaders() throws Exception { - assertFalse(getXML("NullHeader.docx").xml.isEmpty(), - "Should have found some text"); - } - - @Test - public void testTextDecoration() throws Exception { - String xml = getXML("testWORD_various.docx").xml; - - assertContains("<b>Bold</b>", xml); - assertContains("<i>italic</i>", xml); - assertContains("<u>underline</u>", xml); - assertContains("<s>strikethrough</s>", xml); - } - - @Test - public void testTextDecorationNested() throws Exception { - String xml = getXML("testWORD_various.docx").xml; - - assertContains("<i>ita<s>li</s>c</i>", xml); - assertContains("<i>ita<s>l<u>i</u></s>c</i>", xml); - assertContains("<i><u>unde<s>r</s>line</u></i>", xml); - - //confirm that spaces aren't added for </s> and </u> - String txt = getText("testWORD_various.docx"); - assertContainsCount("italic", txt, 3); - assertNotContained("ita ", txt); - - assertContainsCount("underline", txt, 2); - assertNotContained("unde ", txt); - } - - @Test - public void testVarious() throws Exception { - Metadata metadata = new Metadata(); - - String content = getText("testWORD_various.docx", metadata); - //content = content.replaceAll("\\s+"," "); - assertContains("Footnote appears here", content); - assertContains("This is a footnote.", content); - assertContains("This is the header text.", content); - assertContains("This is the footer text.", content); - assertContains("Here is a text box", content); - assertContains("Bold", content); - assertContains("italic", content); - assertContains("underline", content); - assertContains("superscript", content); - assertContains("subscript", content); - assertContains("Here is a citation:", content); - assertContains("Figure 1 This is a caption for Figure 1", content); - assertContains("(Kramer)", content); - assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", - content.replaceAll("\\s+", " ")); - assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", - content.replaceAll("\\s+", " ")); - assertContains("This is a hyperlink", content); - assertContains("Here is a list:", content); - for (int row = 1; row <= 3; row++) { - //assertContains("·\tBullet " + row, content); - //assertContains("\u00b7\tBullet " + row, content); - assertContains("Bullet " + row, content); - } - assertContains("Here is a numbered list:", content); - for (int row = 1; row <= 3; row++) { - //assertContains(row + ")\tNumber bullet " + row, content); - //assertContains(row + ") Number bullet " + row, content); - // TODO: OOXMLExtractor fails to number the bullets: - assertContains("Number bullet " + row, content); - } - - for (int row = 1; row <= 2; row++) { - for (int col = 1; col <= 3; col++) { - assertContains("Row " + row + " Col " + col, content); - } - } - - assertContains("Keyword1 Keyword2", content); - assertEquals("Keyword1 Keyword2", metadata.get(Office.KEYWORDS)); - assertContains("Keyword1 Keyword2", - Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT))); - - - assertContains("Subject is here", content); - assertContains("Subject is here", - Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT))); - - - assertContains("Suddenly some Japanese text:", content); - // Special version of (GHQ) - assertContains("\uff08\uff27\uff28\uff31\uff09", content); - // 6 other characters - assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", - content); - - assertContains("And then some Gothic text:", content); - assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", - content); - } - - @Test - public void testDOCXHeaderFooterNotExtraction() throws Exception { - ParseContext parseContext = new ParseContext(); - OfficeParserConfig officeParserConfig = new OfficeParserConfig(); - officeParserConfig.setIncludeHeadersAndFooters(false); - parseContext.set(OfficeParserConfig.class, officeParserConfig); - String xml = getXML("testWORD_various.docx", parseContext).xml; - assertNotContained("This is the header text.", xml); - assertNotContained("This is the footer text.", xml); - - //now test configuration via tika-config - Parser configuredParser = TikaLoader.load( - getConfigPath(OfficeParserTest.class, "tika-config-headers-footers.json")) - .loadAutoDetectParser(); - xml = getXML("testWORD_various.docx", configuredParser).xml; - assertNotContained("This is the header text.", xml); - assertNotContained("This is the footer text.", xml); - - } - @Test public void testVariousPPTX() throws Exception { Metadata metadata = new Metadata(); @@ -801,41 +512,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); } - @Test - public void testWordCustomProperties() throws Exception { - Metadata metadata = new Metadata(); - - try (TikaInputStream tis = getResourceAsStream( - "/test-documents/testWORD_custom_props.docx")) { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OOXMLParser().parse(tis, handler, metadata, context); - } - - assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER)); - assertEquals("2011-07-29T16:52:00Z", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION)); - assertEquals("1", metadata.get(Office.PAGE_COUNT)); - assertEquals("2", metadata.get(Office.WORD_COUNT)); - assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("My Keyword", metadata.get(Office.KEYWORDS)); - assertContains("My Keyword", Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT))); - - assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE)); - assertEquals("My subject", metadata.get(DublinCore.SUBJECT)); - assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER)); - assertEquals("true", metadata.get("custom:myCustomBoolean")); - assertEquals("3", metadata.get("custom:myCustomNumber")); - assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); - assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate")); - assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); - } - @Test public void testPowerPointCustomProperties() throws Exception { Metadata metadata = new Metadata(); @@ -863,38 +539,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); } - // TIKA-989: - @Test - public void testEmbeddedPDF() throws Exception { - Metadata metadata = new Metadata(); - StringWriter sw = new StringWriter(); - SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); - TransformerHandler handler = factory.newTransformerHandler(); - handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); - handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); - handler.setResult(new StreamResult(sw)); - - try (TikaInputStream tis = getResourceAsStream( - "/test-documents/testWORD_embedded_pdf.docx")) { - new OOXMLParser().parse(tis, handler, metadata, new ParseContext()); - } - String xml = sw.toString(); - int i = xml.indexOf("Here is the pdf file:"); - int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>"); - int k = xml.indexOf("Bye Bye"); - int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>"); - int m = xml.indexOf("Bye for real."); - assertTrue(i != -1); - assertTrue(j != -1); - assertTrue(k != -1); - assertTrue(l != -1); - assertTrue(m != -1); - assertTrue(i < j); - assertTrue(j < k); - assertTrue(k < l); - assertTrue(l < m); - } - // TIKA-997: @Test public void testEmbeddedZipInPPTX() throws Exception { @@ -912,62 +556,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertTrue(j < k); } - // TIKA-1006 - @Test - public void testWordNullStyle() throws Exception { - String xml = getXML("testWORD_null_style.docx").xml; - assertContains("Test av styrt dokument", xml); - } - - /** - * TIKA-1044 - Handle word documents where parts of the - * text have no formatting or styles applied to them - */ - @Test - public void testNoFormat() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - - try (TikaInputStream tis = getResourceAsStream("/test-documents/testWORD_no_format.docx")) { - new OOXMLParser().parse(tis, handler, metadata, new ParseContext()); - } - - String content = handler.toString(); - assertContains("This is a piece of text that causes an exception", content); - } - - // TIKA-1005: - @Test - public void testTextInsideTextBox() throws Exception { - String xml = getXML("testWORD_text_box.docx").xml; - assertContains("This text is directly in the body of the document.", xml); - assertContains("This text is inside of a text box in the body of the document.", xml); - assertContains("This text is inside of a text box in the header of the document.", xml); - assertContains("This text is inside of a text box in the footer of the document.", xml); - } - - //TIKA-2807 - @Test - public void testSDTInTextBox() throws Exception { - String xml = getXML("testWORD_sdtInTextBox.docx").xml; - assertContains("rich-text-content-control_inside-text-box", xml); - assertContainsCount("inside-text", xml, 1); - } - - //TIKA-2346 - @Test - public void testTurningOffTextBoxExtraction() throws Exception { - ParseContext pc = new ParseContext(); - OfficeParserConfig officeParserConfig = new OfficeParserConfig(); - officeParserConfig.setIncludeShapeBasedContent(false); - pc.set(OfficeParserConfig.class, officeParserConfig); - String xml = getXML("testWORD_text_box.docx", pc).xml; - assertContains("This text is directly in the body of the document.", xml); - assertNotContained("This text is inside of a text box in the body of the document.", xml); - assertNotContained("This text is inside of a text box in the header of the document.", xml); - assertNotContained("This text is inside of a text box in the footer of the document.", xml); - } - // TIKA-1032: @Test public void testEmbeddedPPTXTwoSlides() throws Exception { @@ -976,22 +564,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertContains("<div class=\"embedded\" id=\"slide2_rId7\" />", xml); } - /** - * Test for missing text described in - * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>. - * and TIKA-1317 - */ - @Test - public void testMissingText() throws Exception { - XMLResult xmlResult = getXML("testWORD_missing_text.docx"); - assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", - xmlResult.metadata.get(Metadata.CONTENT_TYPE)); - assertContains("BigCompany", xmlResult.xml); - assertContains("Seasoned", xmlResult.xml); - assertContains("Rich_text_in_cell", xmlResult.xml); - - } - //TIKA-1100: @Test public void testExcelTextBox() throws Exception { @@ -1011,27 +583,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertNotContained("autoshape", xml); } - //TIKA-792; with room for future missing bean tests - @Test - public void testWordMissingOOXMLBeans() throws Exception { - //If a bean is missing, POI prints stack trace to stderr - String[] fileNames = new String[]{"testWORD_missing_ooxml_bean1.docx",//TIKA-792 - }; - PrintStream origErr = System.err; - for (String fileName : fileNames) { - //grab stderr - ByteArrayOutputStream errContent = new ByteArrayOutputStream(); - System.setErr(new PrintStream(errContent, true, UTF_8.name())); - getXML(fileName); - - //return stderr - System.setErr(origErr); - - String err = errContent.toString(UTF_8.name()); - assertTrue(err.isEmpty()); - } - } - //TIKA-817 @Test public void testPPTXAutodate() throws Exception { @@ -1043,17 +594,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { } - @Test - public void testDOCXThumbnail() throws Exception { - String xml = getXML("testDOCX_Thumbnail.docx").xml; - int a = xml.indexOf("This file contains a thumbnail"); - int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />"); - - assertTrue(a != -1); - assertTrue(b != -1); - assertTrue(a < b); - } - @Test public void testXLSXThumbnail() throws Exception { String xml = getXML("testXLSX_Thumbnail.xlsx").xml; @@ -1116,78 +656,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { } } - @Test - public void testDOCXParagraphNumbering() throws Exception { - String xml = getXML("testWORD_numbered_list.docx").xml; - assertContains("1) This", xml); - assertContains("a) Is", xml); - assertContains("i) A multi", xml); - assertContains("ii) Level", xml); - assertContains("1. Within cell 1", xml); - assertContains("b. Cell b", xml); - assertContains("iii) List", xml); - assertContains("2) foo", xml); - assertContains("ii) baz", xml); - assertContains("ii) foo", xml); - assertContains("II. bar", xml); - assertContains("6. six", xml); - assertContains("7. seven", xml); - assertContains("a. seven a", xml); - assertContains("e. seven e", xml); - assertContains("2. A ii 2", xml); - assertContains("3. page break list 3", xml); - assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml); - assertContains("1.1.1. 1.1.1", xml); - assertContains("1.1. 1.2->1.1 //set the value", xml); - - //TODO: comment is not being extracted! - //assertContains("add a list here", xml); - } - - @Test - public void testDOCXOverrideParagraphNumbering() throws Exception { - String xml = getXML("testWORD_override_list_numbering.docx").xml; - - //Test 1 - assertContains("<p>1.1.1.1...1 1.1.1.1...1</p>", xml); - assertContains("1st.2.3someText 1st.2.3someText", xml); - assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml); - assertContains("5th 5th", xml); - - - //Test 2 - assertContains("1.a.I 1.a.I", xml); - //test no reset because level 2 is not sufficient to reset - assertContains("<p>1.b.III 1.b.III</p>", xml); - //test restarted because of level 0's increment to 2 - assertContains("2.a.I 2.a.I", xml); - //test handling of skipped level - assertContains("<p>2.b 2.b</p>", xml); - - //Test 3 - assertContains("(1)) (1))", xml); - //tests start level 1 at 17 and - assertContains("2.17 2.17", xml); - //tests that isLegal turns everything into decimal - assertContains("2.18.2.1 2.18.2.1", xml); - assertContains("<p>2 2</p>", xml); - - //Test4 - assertContains("<p>1 1</p>", xml); - assertContains("<p>A A</p>", xml); - assertContains("<p>B B</p>", xml); - //this tests overrides - assertContains("<p>C C</p>", xml); - assertContains("<p>4 4</p>", xml); - - //Test5 - assertContains(">00 00", xml); - assertContains(">01 01", xml); - assertContains(">01. 01.", xml); - assertContains(">01..1 01..1", xml); - assertContains(">02 02", xml); - } - @Test public void testExcelHeaderAndFooterExtraction() throws Exception { XMLResult xml = getXML("testEXCEL_headers_footers.xlsx"); @@ -1241,19 +709,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { } - @Test - public void testMultiAuthorsManagers() throws Exception { - XMLResult r = getXML("testWORD_multi_authors.docx"); - String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR); - assertEquals(3, authors.length); - assertEquals("author2", authors[1]); - - String[] managers = r.metadata.getValues(OfficeOpenXMLExtended.MANAGER); - assertEquals(2, managers.length); - assertEquals("manager1", managers[0]); - assertEquals("manager2", managers[1]); - } - @Test public void testHyperlinksInXLSX() throws Exception { String xml = getXML("testEXCEL_hyperlinks.xlsx").xml; @@ -1268,16 +723,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { } - @Test - public void testOrigSourcePath() throws Exception { - Metadata embed1_zip_metadata = getRecursiveMetadata("test_recursive_embedded.docx").get(2); - assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip", Arrays.asList( - embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME))); - assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip", - Arrays.asList( - embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME))); - } - @Test public void testBigIntegersWGeneralFormat() throws Exception { //TIKA-2025 @@ -1313,55 +758,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { } - @Test - public void testBoldHyperlink() throws Exception { - //TIKA-1255 - String xml = getXML("testWORD_boldHyperlink.docx").xml; - xml = xml.replaceAll("\\s+", " "); - assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml); - assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold", xml); - } - - @Test - public void testLongForIntExceptionInSummaryDetails() throws Exception { - //TIKA-2055 - assertContains("bold", getXML("testWORD_totalTimeOutOfRange.docx").xml); - } - - @Test - public void testMacrosInDocm() throws Exception { - - //test default is "don't extract macros" - for (Metadata metadata : getRecursiveMetadata("testWORD_macros.docm")) { - if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) { - fail("Shouldn't have extracted macros as default"); - } - } - - //now test that they were extracted - ParseContext context = new ParseContext(); - OfficeParserConfig officeParserConfig = new OfficeParserConfig(); - officeParserConfig.setExtractMacros(true); - context.set(OfficeParserConfig.class, officeParserConfig); - - - Metadata minExpected = new Metadata(); - minExpected.add(TikaCoreProperties.TIKA_CONTENT.getName(), "Sub Embolden()"); - minExpected.add(TikaCoreProperties.TIKA_CONTENT.getName(), "Sub Italicize()"); - minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic"); - minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); - - assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", context)); - - //test configuring via config file - Parser parser = TikaLoader.load( - getConfigPath(OOXMLParserTest.class, "tika-config-dom-macros.json")) - .loadAutoDetectParser(); - assertContainsAtLeast(minExpected, - getRecursiveMetadata("testWORD_macros.docm", parser)); - } - @Test public void testMacrosInPptm() throws Exception { @@ -1429,45 +825,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { getRecursiveMetadata("testEXCEL_macro.xlsm", parser)); } - //@Test //use this for lightweight benchmarking to compare xwpf options - public void testBatch() throws Exception { - OfficeParserConfig officeParserConfig = new OfficeParserConfig(); - officeParserConfig.setUseSAXDocxExtractor(true); - long started = System.currentTimeMillis(); - int ex = 0; - for (int i = 0; i < 100; i++) { - for (File f : getResourceAsFile("/test-documents").listFiles()) { - if (!f.getName().endsWith(".docx")) { - continue; - } - try (TikaInputStream tis = TikaInputStream.get(f.toPath())) { - ParseContext parseContext = new ParseContext(); - parseContext.set(OfficeParserConfig.class, officeParserConfig); - //test only the extraction of the main docx content, not embedded docs - parseContext.set(Parser.class, new EmptyParser()); - XMLResult r = getXML(tis, AUTO_DETECT_PARSER, new Metadata(), parseContext); - } catch (Exception e) { - ex++; - - } - } - } - System.out.println("elapsed: " + (System.currentTimeMillis() - started) + " with " + ex + - " exceptions"); - } - - @Test - public void testInitializationViaConfig() throws Exception { - //NOTE: this test relies on a bug in the DOM extractor that - //is passing over the title information. - //once we fix that, this test will no longer be meaningful! - Parser p = TikaLoader.load( - getConfigPath(OfficeParserTest.class, "tika-config-sax-docx.json")) - .loadAutoDetectParser(); - XMLResult xml = getXML("testWORD_2006ml.docx", p, new Metadata()); - assertContains("engaging title", xml.xml); - } - @Test public void testExcelXLSB() throws Exception { Detector detector = new DefaultDetector(); @@ -1599,11 +956,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertContains("SmartArt", getXML("testEXCEL_diagramData.xlsx").xml); } - @Test - public void testDOCXDiagramData() throws Exception { - assertContains("From here", getXML("testWORD_diagramData.docx").xml); - } - @Test public void testPPTXDiagramData() throws Exception { assertContains("President", getXML("testPPT_diagramData.pptx").xml); @@ -1625,14 +977,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertNotContained("chartSpace", xml); } - @Test - public void testDOCXChartData() throws Exception { - String xml = getXML("testWORD_charts.docx").xml; - assertContains("peach", xml); - assertContains("March\tApril", xml); - assertNotContained("chartSpace", xml); - } - @Test public void testPPTXChartData() throws Exception { String xml = getXML("testPPT_charts.pptx").xml; @@ -1693,21 +1037,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { } - @Test - public void testDOCXPhoneticStrings() throws Exception { - - assertContains("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)", - getXML("testWORD_phonetic.docx").xml); - - OfficeParserConfig config = new OfficeParserConfig(); - config.setConcatenatePhoneticRuns(false); - ParseContext parseContext = new ParseContext(); - parseContext.set(OfficeParserConfig.class, config); - String xml = getXML("testWORD_phonetic.docx", parseContext).xml; - assertContains("\u6771\u4EAC", xml); - assertNotContained("\u3068", xml); - } - @Test public void testEmbeddedMedia() throws Exception { List<Metadata> metadataList = getRecursiveMetadata("testPPT_embeddedMP3.pptx"); @@ -1747,17 +1076,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { } - @Test - public void testTruncatedSAXDocx() throws Exception { - ParseContext pc = new ParseContext(); - OfficeParserConfig c = new OfficeParserConfig(); - c.setUseSAXDocxExtractor(true); - pc.set(OfficeParserConfig.class, c); - assertThrows(TikaException.class, () -> { - getRecursiveMetadata("testWORD_truncated.docx", pc); - }); - } - @Test public void testDateFormat() throws Exception { Parser p = TikaLoader.load( @@ -1810,16 +1128,6 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { } - @Test - public void testFeatureExtraction() throws Exception { - List<Metadata> metadataList = getRecursiveMetadata("testWORD_features.docx"); - Metadata m = metadataList.get(0); - assertContains("Kyle Reese", Arrays.asList(m.getValues(Office.COMMENT_PERSONS))); - assertEquals("true", m.get(Office.HAS_HIDDEN_TEXT)); - assertEquals("true", m.get(Office.HAS_TRACK_CHANGES)); - assertEquals("true", m.get(Office.HAS_COMMENTS)); - } - @Test public void testNoRecordSizeOverflow() throws Exception { //TIKA-4474 -- test: files (passed as stream) no longer have limit on record size as they are spooled @@ -1827,42 +1135,4 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertContains("Repetitive content pattern 3 for compression test row 1", content); } - /** - * Test extraction of field-based hyperlinks using instrText/fldChar. - * These are hyperlinks embedded as field codes rather than relationship-based hyperlinks. - * Uses the DOM-based XWPFWordExtractorDecorator. - */ - @Test - public void testInstrTextHyperlink() throws Exception { - String xml = getXML("testInstrLink.docx").xml; - // The document contains a HYPERLINK field code in instrText - assertContains("<a href=\"https://exmaple.com/file\">", xml); - assertContains("Access Document(s)", xml); - } - - /** - * Test extraction of external reference field codes (INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK). - * These can be used to hide malicious URLs in documents. - */ - @Test - public void testExternalRefFieldCodes() throws Exception { - List<Metadata> metadataList = getRecursiveMetadata("testExternalRefs.docx"); - Metadata m = metadataList.get(0); - // Check metadata flag is set - assertEquals("true", m.get(Office.HAS_FIELD_HYPERLINKS)); - - String xml = getXML("testExternalRefs.docx").xml; - // Test INCLUDEPICTURE field code - assertContains("class=\"external-ref-INCLUDEPICTURE\"", xml); - assertContains("http://example.com/tracking.png", xml); - // Test INCLUDETEXT field code - assertContains("class=\"external-ref-INCLUDETEXT\"", xml); - assertContains("http://example.org/payload.txt", xml); - // Test IMPORT field code - assertContains("class=\"external-ref-IMPORT\"", xml); - assertContains("http://example.net/exploit.wmf", xml); - // Test LINK field code - assertContains("class=\"external-ref-LINK\"", xml); - assertContains("http://test.invalid/cmd.docx", xml); - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 8ff5ccbb27..58a6ff880a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -28,11 +28,9 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.microsoft.EMFParser; import org.apache.tika.parser.microsoft.OfficeParserConfig; public class OOXMLParserTest extends TikaTest { @@ -68,90 +66,21 @@ public class OOXMLParserTest extends TikaTest { assertContains("Hello World", pdfMetadata2.get(TikaCoreProperties.TIKA_CONTENT)); } - @Test - public void testEMFAssociatedWithAttachments() throws Exception { - //TIKA-3968 - List<Metadata> metadataList = getRecursiveMetadata("testWORD_EMFAndAttachments.docx"); - - assertEquals("true", metadataList.get(1).get(EMFParser.EMF_ICON_ONLY)); - assertEquals("true", metadataList.get(3).get(EMFParser.EMF_ICON_ONLY)); - assertEquals("true", metadataList.get(5).get(EMFParser.EMF_ICON_ONLY)); - assertEquals("TestText.txt", metadataList.get(1).get(EMFParser.EMF_ICON_STRING)); - assertEquals("TestPdf.pdf", metadataList.get(3).get(EMFParser.EMF_ICON_STRING)); - assertEquals("testWORD123.docx", metadataList.get(5).get(EMFParser.EMF_ICON_STRING)); - - assertNull(metadataList.get(2).get(Office.PROG_ID)); - assertEquals("AcroExch.Document.DC", metadataList.get(4).get(Office.PROG_ID)); - assertEquals("Word.Document.12", metadataList.get(6).get(Office.PROG_ID)); - - assertEquals("TestText.txt", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertEquals("TestPdf.pdf", metadataList.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertEquals("testWORD123.docx", metadataList.get(6).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - - assertEquals("/TestText.txt", - metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); - assertEquals("/TestPdf.pdf", - metadataList.get(4).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); - assertEquals("/testWORD123.docx", - metadataList.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); - - assertContains("This is Text File", - metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT)); - - assertContains("This is test PDF document for parser.", - metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT)); - - assertContains("This is test word document for parser.", - metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT)); - - assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(), - metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(), - metadataList.get(4).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(), - metadataList.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - - assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(), - metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(), - metadataList.get(3).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(), - metadataList.get(5).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - } - @Disabled("TODO figure out why this doesn't work") - @Test//(expected = org.apache.tika.exception.TikaException.class) + @Test public void testCorruptedZip() throws Exception { //TIKA_2446 getRecursiveMetadata("testZIP_corrupted_oom.zip"); } - @Test - public void testAltFileMHTChunk() throws Exception { - //test file with permission from: - // https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx - List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx"); - assertEquals(3, metadataList.size()); - assertContains("Example of a table", - metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT)); - } - - @Test - public void testAltFileHTMLChunk() throws Exception { - //test file with permission from: - // https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_HTML_container.docx - List<Metadata> metadataList = getRecursiveMetadata("testAltChunkHTML.docx"); - assertEquals(2, metadataList.size()); - assertContains("Example of a table", - metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); - } - @Test public void testDigestTranslator() throws Exception { - TikaLoader loader = TikaLoader.load(getConfigPath(OOXMLParserTest.class, "tika-config-digests.json")); + TikaLoader loader = TikaLoader.load( + getConfigPath(OOXMLParserTest.class, "tika-config-digests.json")); Parser parser = loader.loadAutoDetectParser(); ParseContext parseContext = loader.loadParseContext(); - List<Metadata> metadataList = getRecursiveMetadata("testMSChart-govdocs-428996.pptx", parser, parseContext); + List<Metadata> metadataList = + getRecursiveMetadata("testMSChart-govdocs-428996.pptx", parser, parseContext); assertEquals(4, metadataList.size()); for (Metadata m : metadataList) { assertNotNull(m.get("X-TIKA:digest:SHA256:BASE32")); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java deleted file mode 100644 index ef76cc641e..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.microsoft.ooxml; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNull; - -import java.util.List; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import org.apache.tika.TikaTest; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.microsoft.EMFParser; -import org.apache.tika.parser.microsoft.OfficeParserConfig; - -public class SXWPFExtractorTest extends TikaTest { - - private ParseContext parseContext; - - @BeforeEach - public void setUp() { - parseContext = new ParseContext(); - OfficeParserConfig officeParserConfig = new OfficeParserConfig(); - officeParserConfig.setUseSAXDocxExtractor(true); - officeParserConfig.setUseSAXPptxExtractor(true); - parseContext.set(OfficeParserConfig.class, officeParserConfig); - - } - @Test - @Disabled("TODO -- implement TIKA-3968 for SXWPFExtractor") - public void testEMFAssociatedWithAttachments() throws Exception { - //TIKA-3968 - List<Metadata> metadataList = getRecursiveMetadata("testWORD_EMFAndAttachments.docx", parseContext); - - assertEquals("true", metadataList.get(1).get(EMFParser.EMF_ICON_ONLY)); - assertEquals("true", metadataList.get(3).get(EMFParser.EMF_ICON_ONLY)); - assertEquals("true", metadataList.get(5).get(EMFParser.EMF_ICON_ONLY)); - assertEquals("TestText.txt", metadataList.get(1).get(EMFParser.EMF_ICON_STRING)); - assertEquals("TestPdf.pdf", metadataList.get(3).get(EMFParser.EMF_ICON_STRING)); - assertEquals("testWORD123.docx", metadataList.get(5).get(EMFParser.EMF_ICON_STRING)); - - assertNull(metadataList.get(2).get(Office.PROG_ID)); - assertEquals("AcroExch.Document.DC", metadataList.get(4).get(Office.PROG_ID)); - assertEquals("Word.Document.12", metadataList.get(6).get(Office.PROG_ID)); - - assertEquals("TestText.txt", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertEquals("TestPdf.pdf", metadataList.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertEquals("testWORD123.docx", metadataList.get(6).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - - assertEquals("/TestText.txt", - metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); - assertEquals("/TestPdf.pdf", - metadataList.get(4).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); - assertEquals("/testWORD123.docx", - metadataList.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); - - assertContains("This is Text File", - metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT)); - - assertContains("This is test PDF document for parser.", - metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT)); - - assertContains("This is test word document for parser.", - metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT)); - - assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(), - metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(), - metadataList.get(4).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(), - metadataList.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - - assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(), - metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(), - metadataList.get(3).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(), - metadataList.get(5).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - } -}
