This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 67410849203b82d050e5bea5dfeb35d012db4bb6 Author: tallison <[email protected]> AuthorDate: Thu May 8 10:43:27 2025 -0400 TIKA-4345 -- add back configurability for injecting headers into the body of emails (legacy pre-4.x behavior) --- .../tika/parser/microsoft/OutlookExtractor.java | 28 ++++++++++++++++++++++ .../tika/parser/microsoft/OutlookParserTest.java | 17 +++++++++++++ 2 files changed, 45 insertions(+) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 0d46e6437..b965f6f4c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -272,6 +272,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } handleGeneralDates(msg, headers, parentMetadata); + writeSelectHeadersInBody(parentMetadata, msg, xhtml); // Get the message body. Preference order is: html, rtf, text Chunk htmlChunk = null; @@ -864,6 +865,33 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { return false; } + private void writeSelectHeadersInBody(Metadata metadata, MAPIMessage msg, XHTMLContentHandler xhtml) + throws SAXException, ChunkNotFoundException { + if (! officeParserConfig.isWriteSelectHeadersInBody()) { + return; + } + String subject = metadata.get(TikaCoreProperties.SUBJECT); + subject = (subject == null) ? 
"" : subject; + xhtml.element("h1", subject); + + // Output the from and to details in text, as you + // often want them in text form for searching + xhtml.startElement("dl"); + String from = metadata.get(Message.MESSAGE_FROM); + if (from != null) { + header(xhtml, "From", from); + } + header(xhtml, "To", msg.getDisplayTo()); + header(xhtml, "Cc", msg.getDisplayCC()); + header(xhtml, "Bcc", msg.getDisplayBCC()); + try { + header(xhtml, "Recipients", msg.getRecipientEmailAddress()); + } catch (ChunkNotFoundException e) { + //swallow + } + xhtml.endElement("dl"); + } + private List<Recipient> buildRecipients() { RecipientChunks[] recipientChunks = msg.getRecipientDetailsChunks(); if (recipientChunks == null) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index db68d241e..866d25887 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -420,4 +420,21 @@ public class OutlookParserTest extends TikaTest { assertContains("annuaires\t \n" + " Synchronisation", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); } + @Test + public void testHeadersInBody() throws Exception { + //test default behavior -- select headers are written into the body + ParseContext parseContext = new ParseContext(); + String xml = getText("testMSG.msg", new Metadata(), parseContext); + xml = xml.replaceAll("\\s+", " "); + assertTrue(xml.startsWith("MIME registry use cases")); + assertContains("From Jukka Zitting", xml); + + //test 
configurable behavior -- headers turned off (non-legacy behavior) + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setWriteSelectHeadersInBody(false); + parseContext.set(OfficeParserConfig.class, officeParserConfig); + xml = getText("testMSG.msg", new Metadata(), parseContext); + assertTrue(xml.startsWith("Hi,")); + } + }
