This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4345 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 375e59580270ebd0c0da7a41e43d7739925fede0 Author: tallison <[email protected]> AuthorDate: Thu Nov 7 12:03:57 2024 -0500 TIKA-4345 -- allow configurability for injecting headers into content in msg --- .../parser/microsoft/AbstractOfficeParser.java | 14 +++++ .../apache/tika/parser/microsoft/OfficeParser.java | 1 - .../tika/parser/microsoft/OfficeParserConfig.java | 17 +++++- .../tika/parser/microsoft/OutlookExtractor.java | 71 ++++++++++++---------- .../tika/parser/microsoft/rtf/RTFParser.java | 31 ++++++++-- .../tika/parser/microsoft/rtf/TextExtractor.java | 19 +++--- .../tika/parser/microsoft/OutlookParserTest.java | 39 ++++++++++-- 7 files changed, 141 insertions(+), 51 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java index ec785f5d2..ea5179552 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java @@ -180,4 +180,18 @@ public abstract class AbstractOfficeParser implements Parser { public boolean isIncludeHeadersAndFooters() { return defaultOfficeParserConfig.isIncludeHeadersAndFooters(); } + + /** + * If set to <code>true</code>, this will write the to/from/cc into the body content + * + * @param val + */ + @Field + public void setWriteSelectHeadersInBody(boolean val) { + defaultOfficeParserConfig.setWriteSelectHeadersInBody(val); + } + + public boolean isWriteSelectHeadersInBody() { + return defaultOfficeParserConfig.isWriteSelectHeadersInBody(); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index 21a771c86..8fe685686 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -248,7 +248,6 @@ public class OfficeParser extends AbstractOfficeParser { break; case OUTLOOK: OutlookExtractor extractor = new OutlookExtractor(root, metadata, context); - extractor.parse(xhtml); break; case ENCRYPTED: diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java index 680b63c9e..af69eefa1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java @@ -34,8 +34,9 @@ public class OfficeParserConfig implements Serializable { private boolean useSAXDocxExtractor = false; private boolean useSAXPptxExtractor = false; - private boolean extractAllAlternativesFromMSG; + private boolean extractAllAlternativesFromMSG = false; + private boolean writeSelectHeadersInBody = false; private String dateOverrideFormat = null; private int maxOverride = 0;//ignore @@ -201,6 +202,20 @@ public class OfficeParserConfig implements Serializable { this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG; } + public boolean isWriteSelectHeadersInBody() { + return writeSelectHeadersInBody; + } + + /** + * If set to <code>true</code>, this will add to/from/cc into the + * body content. + * + * @param val + */ + public void setWriteSelectHeadersInBody(boolean val) { + this.writeSelectHeadersInBody = val; + } + public boolean isIncludeMissingRows() { return includeMissingRows; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 451346745..a73adbaf6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -227,24 +227,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } - - xhtml.element("h1", subject); - - // Output the from and to details in text, as you - // often want them in text form for searching - xhtml.startElement("dl"); - if (from != null) { - header(xhtml, "From", from); - } - header(xhtml, "To", msg.getDisplayTo()); - header(xhtml, "Cc", msg.getDisplayCC()); - header(xhtml, "Bcc", msg.getDisplayBCC()); - try { - header(xhtml, "Recipients", msg.getRecipientEmailAddress()); - } catch (ChunkNotFoundException e) { - //swallow - } - xhtml.endElement("dl"); + writeSelectHeadersInBody(subject, from, msg, xhtml); // Get the message body. Preference order is: html, rtf, text Chunk htmlChunk = null; @@ -265,7 +248,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { // Process the attachments for (AttachmentChunks attachment : msg.getAttachmentFiles()) { - xhtml.startElement("div", "class", "attachment-entry"); String filename = null; if (attachment.getAttachLongFileName() != null) { @@ -273,9 +255,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } else if (attachment.getAttachFileName() != null) { filename = attachment.getAttachFileName().getValue(); } - if (filename != null && filename.length() > 0) { - xhtml.element("h1", filename); - } if (attachment.getAttachData() != null) { handleEmbeddedResource( @@ -286,8 +265,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), filename, xhtml, true); } - - xhtml.endElement("div"); } } catch (ChunkNotFoundException e) { throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", @@ -302,6 +279,31 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } + private void writeSelectHeadersInBody(String subject, String from, MAPIMessage msg, XHTMLContentHandler xhtml) + throws SAXException, ChunkNotFoundException { + if (! officeParserConfig.isWriteSelectHeadersInBody()) { + return; + } + xhtml.element("h1", subject); + + // Output the from and to details in text, as you + // often want them in text form for searching + xhtml.startElement("dl"); + if (from != null) { + header(xhtml, "From", from); + } + header(xhtml, "To", msg.getDisplayTo()); + header(xhtml, "Cc", msg.getDisplayCC()); + header(xhtml, "Bcc", msg.getDisplayBCC()); + try { + header(xhtml, "Recipients", msg.getRecipientEmailAddress()); + } catch (ChunkNotFoundException e) { + //swallow + } + xhtml.endElement("dl"); + + } + private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { @@ -310,9 +312,18 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml); return; } - + if (officeParserConfig.isWriteSelectHeadersInBody()) { + xhtml.startElement("div", "class", "message-body"); + _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml); + xhtml.endElement("div"); + } else { + _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml); + } + } + private void _handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, + XHTMLContentHandler xhtml) + throws SAXException, IOException, TikaException { boolean doneBody = false; - xhtml.startElement("div", "class", "message-body"); if (htmlChunk != null) { byte[] data = null; if (htmlChunk instanceof ByteChunk) { @@ -341,21 +352,19 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()); - Parser rtfParser = EmbeddedDocumentUtil + RTFParser rtfParser = (RTFParser) EmbeddedDocumentUtil .tryToFindExistingLeafParser(RTFParser.class, parseContext); if (rtfParser == null) { rtfParser = new RTFParser(); } - rtfParser.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(rtf.getData()).get(), - new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), - parseContext); + rtfParser.parseInline(UnsynchronizedByteArrayInputStream.builder().setByteArray(rtf.getData()).get(), + xhtml, new Metadata(), parseContext); doneBody = true; } } if (textChunk != null && (extractAllAlternatives || !doneBody)) { xhtml.element("p", ((StringChunk) textChunk).getValue()); } - xhtml.endElement("div"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java index faa808fa1..8b17575a7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java @@ -64,19 +64,38 @@ public class RTFParser implements Parser { ParseContext context) throws IOException, SAXException, TikaException { metadata.set(Metadata.CONTENT_TYPE, "application/rtf"); TaggedInputStream tagged = new TaggedInputStream(stream); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); try { - XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata); - RTFEmbObjHandler embObjHandler = - new RTFEmbObjHandler(xhtmlHandler, metadata, context, getMemoryLimitInKb()); - final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler); - ert.setIgnoreListMarkup(ignoreListMarkup); - ert.extract(stream); + parseInline(stream, xhtml, metadata, context); } catch (IOException e) { tagged.throwIfCauseOf(e); throw new TikaException("Error parsing an RTF document", e); + } finally { + xhtml.endDocument(); } } + /** + * This bypasses wrapping the handler for inline parsing (in at least the OutlookExtractor). + * + * @param is + * @param handler + * @param metadata + * @param context + * @throws TikaException + * @throws IOException + * @throws SAXException + */ + public void parseInline(InputStream is, ContentHandler handler, Metadata metadata, ParseContext context) + throws TikaException, IOException, SAXException { + RTFEmbObjHandler embObjHandler = + new RTFEmbObjHandler(handler, metadata, context, getMemoryLimitInKb()); + final TextExtractor ert = new TextExtractor(handler, metadata, embObjHandler); + ert.setIgnoreListMarkup(ignoreListMarkup); + ert.extract(is); + } + public int getMemoryLimitInKb() { //there's a race condition here, but it shouldn't matter. if (USE_STATIC) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java index 28ca76299..83abb1ae6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java @@ -17,6 +17,8 @@ package org.apache.tika.parser.microsoft.rtf; +import static org.apache.tika.sax.XHTMLContentHandler.XHTML; + import java.io.IOException; import java.io.InputStream; import java.io.PushbackInputStream; @@ -36,7 +38,9 @@ import java.util.Stack; import java.util.TimeZone; import org.apache.commons.io.IOUtils; +import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; @@ -47,7 +51,6 @@ import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.CharsetUtils; /* Tokenizes and performs a "shallow" parse of the RTF @@ -256,7 +259,7 @@ final class TextExtractor { // close the group, we restore it private final LinkedList<GroupState> groupStates = new LinkedList<>(); private final StringBuilder pendingBuffer = new StringBuilder(); - private final XHTMLContentHandler out; + private final ContentHandler out; private final Metadata metadata; private final RTFEmbObjHandler embObjHandler; // How many next ansi chars we should skip; this @@ -330,7 +333,7 @@ final class TextExtractor { //to defend against DoS with memory consumption private int maxStackSize = 1000; - public TextExtractor(XHTMLContentHandler out, Metadata metadata, + public TextExtractor(ContentHandler out, Metadata metadata, RTFEmbObjHandler embObjHandler) { this.metadata = metadata; this.out = out; @@ -464,7 +467,6 @@ final class TextExtractor { } private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException { - out.startDocument(); while (true) { final int b = in.read(); @@ -503,7 +505,6 @@ final class TextExtractor { while (paragraphStack.size() > 0) { end(paragraphStack.pop()); } - out.endDocument(); } private void parseControlToken(PushbackInputStream in) @@ -1084,11 +1085,11 @@ final class TextExtractor { } private void end(String tag) throws IOException, SAXException, TikaException { - out.endElement(tag); + out.endElement(XHTML, tag, tag); } private void start(String tag) throws IOException, SAXException, TikaException { - out.startElement(tag); + out.startElement(XHTML, tag, tag, new AttributesImpl()); } // Handle non-parameter control word: @@ -1357,7 +1358,9 @@ final class TextExtractor { } else if (equals("fldrslt") && fieldState == 2) { assert pendingURL != null; lazyStartParagraph(); - out.startElement("a", "href", pendingURL); + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute(XHTML, "href", "href", "CDATA", pendingURL); + out.startElement("", "a", "a", attrs); pendingURL = null; fieldState = 3; groupState.ignore = false; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index ffd4c0e5d..686a6657c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -43,6 +43,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; /** @@ -52,11 +53,19 @@ public class OutlookParserTest extends TikaTest { @Test public void testOutlookParsing() throws Exception { + + //test default behavior + List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg", AUTO_DETECT_PARSER, + BasicContentHandlerFactory.HANDLER_TYPE.BODY); + assertNotContained("Microsoft Outlook Express 6", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + + + //test legacy behavior ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); try (InputStream stream = getResourceAsStream("/test-documents/test-outlook.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); } assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Microsoft Outlook Express 6", metadata.get(TikaCoreProperties.TITLE)); @@ -98,7 +107,7 @@ public class OutlookParserTest extends TikaTest { Metadata metadata = new Metadata(); try (InputStream stream = getResourceAsStream("/test-documents/testMSG.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); } assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE)); @@ -176,7 +185,7 @@ public class OutlookParserTest extends TikaTest { handler.setResult(new StreamResult(sw)); try (InputStream stream = getResourceAsStream("/test-documents/testMSG_chinese.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); } // As the HTML version should have been processed, ensure @@ -233,6 +242,12 @@ public class OutlookParserTest extends TikaTest { @Test public void testOutlookHTMLfromRTF() throws Exception { + + //test default behavior + List<Metadata> metadataList = getRecursiveMetadata("test-outlook2003.msg"); + assertNotContained("<dd>New Outlook User</dd>", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + + //test legacy behavior with the configuration set Metadata metadata = new Metadata(); // Check the HTML version @@ -244,7 +259,7 @@ public class OutlookParserTest extends TikaTest { handler.setResult(new StreamResult(sw)); try (InputStream stream = getResourceAsStream("/test-documents/test-outlook2003.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); } // As the HTML version should have been processed, ensure @@ -267,6 +282,14 @@ public class OutlookParserTest extends TikaTest { assertEquals(2, content.split("<\\/body>").length); } + private ParseContext configureInjectHeaders() { + ParseContext parseContext = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setWriteSelectHeadersInBody(true); + parseContext.set(OfficeParserConfig.class, officeParserConfig); + return parseContext; + } + @Test public void testMAPIMessageClasses() throws Exception { @@ -319,4 +342,12 @@ public class OutlookParserTest extends TikaTest { } } + + @Test + public void testNewlinesInRTFBody() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg", AUTO_DETECT_PARSER, + BasicContentHandlerFactory.HANDLER_TYPE.BODY); + assertContains("annuaires\t \n" + " Synchronisation", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + } + }
