This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 67410849203b82d050e5bea5dfeb35d012db4bb6
Author: tallison <[email protected]>
AuthorDate: Thu May 8 10:43:27 2025 -0400

    TIKA-4345 -- add back configurability for injecting headers into the body 
of emails (legacy pre-4.x behavior)
---
 .../tika/parser/microsoft/OutlookExtractor.java    | 28 ++++++++++++++++++++++
 .../tika/parser/microsoft/OutlookParserTest.java   | 17 +++++++++++++
 2 files changed, 45 insertions(+)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 0d46e6437..b965f6f4c 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -272,6 +272,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         }
 
         handleGeneralDates(msg, headers, parentMetadata);
+        writeSelectHeadersInBody(parentMetadata, msg, xhtml);
 
         // Get the message body. Preference order is: html, rtf, text
         Chunk htmlChunk = null;
@@ -864,6 +865,33 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         return false;
     }
 
+    private void writeSelectHeadersInBody(Metadata metadata, MAPIMessage msg, 
XHTMLContentHandler xhtml)
+            throws SAXException, ChunkNotFoundException {
+        if (! officeParserConfig.isWriteSelectHeadersInBody()) {
+            return;
+        }
+        String subject = metadata.get(TikaCoreProperties.SUBJECT);
+        subject = (subject == null) ? "" : subject;
+        xhtml.element("h1", subject);
+
+        // Write the from/to/cc/bcc/recipient details into a <dl> in the body,
+        //  as you often want them in text form for searching
+        xhtml.startElement("dl");
+        String from = metadata.get(Message.MESSAGE_FROM);
+        if (from != null) {
+            header(xhtml, "From", from);
+        }
+        header(xhtml, "To", msg.getDisplayTo());
+        header(xhtml, "Cc", msg.getDisplayCC());
+        header(xhtml, "Bcc", msg.getDisplayBCC());
+        try {
+            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+        } catch (ChunkNotFoundException e) {
+            //best effort: leave out the "Recipients" entry if its chunk is missing
+        }
+        xhtml.endElement("dl");
+    }
+
     private List<Recipient> buildRecipients() {
         RecipientChunks[] recipientChunks = msg.getRecipientDetailsChunks();
         if (recipientChunks == null) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index db68d241e..866d25887 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -420,4 +420,21 @@ public class OutlookParserTest extends TikaTest {
         assertContains("annuaires\t \n" + " Synchronisation", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
     }
 
+    @Test
+    public void testHeadersInBody() throws Exception {
+        //test default behavior -- select headers appear in the extracted text
+        ParseContext parseContext = new ParseContext();
+        String xml = getText("testMSG.msg", new Metadata(), parseContext);
+        xml = xml.replaceAll("\\s+", " ");
+        assertTrue(xml.startsWith("MIME registry use cases"));
+        assertContains("From Jukka Zitting", xml);
+
+        //test with header writing turned off -- text no longer starts with the headers
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setWriteSelectHeadersInBody(false);
+        parseContext.set(OfficeParserConfig.class, officeParserConfig);
+        xml = getText("testMSG.msg", new Metadata(), parseContext);
+        assertTrue(xml.startsWith("Hi,"));
+    }
+
 }

Reply via email to