This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4345-v2 in repository https://gitbox.apache.org/repos/asf/tika.git
commit aed6b2a5c94fae76503e143d4502ee056f741d4c Author: tallison <[email protected]> AuthorDate: Thu May 8 10:43:27 2025 -0400 TIKA-4345 -- add back configurability for injecting headers into the body of emails (legacy pre-4.x behavior) --- .../tika/parser/microsoft/OfficeParserConfig.java | 10 ++++++++ .../tika/parser/microsoft/OutlookExtractor.java | 28 ++++++++++++++++++++++ .../tika/parser/microsoft/OutlookParserTest.java | 17 +++++++++++++ 3 files changed, 55 insertions(+) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java index 8e761efad..bfa2865e2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java @@ -35,6 +35,8 @@ public class OfficeParserConfig implements Serializable { private boolean useSAXDocxExtractor = false; private boolean useSAXPptxExtractor = false; + private boolean writeSelectHeadersInBody = false; + private boolean extractAllAlternativesFromMSG = false; private String dateOverrideFormat = null; private int maxOverride = 0;//ignore @@ -276,6 +278,14 @@ public class OfficeParserConfig implements Serializable { public int getMaxOverride() { return this.maxOverride; } + + public boolean isWriteSelectHeadersInBody() { + return writeSelectHeadersInBody; + } + + public void setWriteSelectHeadersInBody(boolean writeSelectHeadersInBody) { + this.writeSelectHeadersInBody = writeSelectHeadersInBody; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 0e219dac6..e13234d5c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -267,6 +267,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } handleGeneralDates(msg, headers, parentMetadata); + writeSelectHeadersInBody(parentMetadata, msg, xhtml); // Get the message body. Preference order is: html, rtf, text Chunk htmlChunk = null; @@ -859,6 +860,33 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { return false; } + private void writeSelectHeadersInBody(Metadata metadata, MAPIMessage msg, XHTMLContentHandler xhtml) + throws SAXException, ChunkNotFoundException { + if (! officeParserConfig.isWriteSelectHeadersInBody()) { + return; + } + String subject = metadata.get(TikaCoreProperties.SUBJECT); + subject = (subject == null) ? "" : subject; + xhtml.element("h1", subject); + + // Output the from and to details in text, as you + // often want them in text form for searching + xhtml.startElement("dl"); + String from = metadata.get(Message.MESSAGE_FROM); + if (from != null) { + header(xhtml, "From", from); + } + header(xhtml, "To", msg.getDisplayTo()); + header(xhtml, "Cc", msg.getDisplayCC()); + header(xhtml, "Bcc", msg.getDisplayBCC()); + try { + header(xhtml, "Recipients", msg.getRecipientEmailAddress()); + } catch (ChunkNotFoundException e) { + //swallow + } + xhtml.endElement("dl"); + } + private List<Recipient> buildRecipients() { RecipientChunks[] recipientChunks = msg.getRecipientDetailsChunks(); if (recipientChunks == null) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index ba8a6c64e..9da786e6e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -401,4 +401,21 @@ public class OutlookParserTest extends TikaTest { assertContains("annuaires\t \n" + " Synchronisation", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); } + @Test + public void testHeadersInBody() throws Exception { + //test default behavior -- no headers + ParseContext parseContext = new ParseContext(); + String xml = getText("testMSG.msg", new Metadata(), parseContext); + assertTrue(xml.startsWith("Hi,")); + + //test configurable behavior (legacy behavior up to Tika 4.x) + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setWriteSelectHeadersInBody(true); + parseContext.set(OfficeParserConfig.class, officeParserConfig); + xml = getText("testMSG.msg", new Metadata(), parseContext); + xml = xml.replaceAll("\\s+", " "); + assertTrue(xml.startsWith("MIME registry use cases")); + assertContains("From Jukka Zitting", xml); + } + }
