This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 55d3d788c TIKA-4345 -- stop injecting headers into the body of msg files (#2042)
55d3d788c is described below

commit 55d3d788c962c32763e068b368ed19da7118253c
Author: Tim Allison <[email protected]>
AuthorDate: Wed Nov 13 10:34:53 2024 -0500

    TIKA-4345 -- stop injecting headers into the body of msg files (#2042)
    
    * TIKA-4345 -- stop injecting headers into the body for msg files
---
 CHANGES.txt                                        |  7 +++++
 .../parser/microsoft/AbstractOfficeParser.java     | 13 --------
 .../tika/parser/microsoft/OfficeParserConfig.java  | 15 ---------
 .../tika/parser/microsoft/OutlookExtractor.java    | 36 ++--------------------
 .../tika/parser/microsoft/OutlookParserTest.java   | 33 ++++++--------------
 5 files changed, 18 insertions(+), 86 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 395f41b07..a5feb090f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,8 +1,15 @@
 Release 4.0.0-BETA1 - ???
   BREAKING CHANGES
 
+   * Headers are no longer injected into the body/content of MSG files (TIKA-4345). Please open
+     a ticket if you need this behavior across email formats.
+
+
 Release 3.1.0 - ??
 
+   * Allow users to turn off the injection of some headers into the content stream of MSG
+     files (TIKA-4345).
+
    * Add a wrapper for Google's magika detector (TIKA-4344).
 
    * Add support for MachO via Alexey Pelykh (TIKA-4309).
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index ea5179552..a44073d4e 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -181,17 +181,4 @@ public abstract class AbstractOfficeParser implements 
Parser {
         return defaultOfficeParserConfig.isIncludeHeadersAndFooters();
     }
 
-    /**
-     * If set to <code>true</code>, this will write the to/from/cc into the 
body content
-     *
-     * @param val
-     */
-    @Field
-    public void setWriteSelectHeadersInBody(boolean val) {
-        defaultOfficeParserConfig.setWriteSelectHeadersInBody(val);
-    }
-
-    public boolean isWriteSelectHeadersInBody() {
-        return defaultOfficeParserConfig.isWriteSelectHeadersInBody();
-    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index af69eefa1..8e761efad 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -36,7 +36,6 @@ public class OfficeParserConfig implements Serializable {
     private boolean useSAXPptxExtractor = false;
 
     private boolean extractAllAlternativesFromMSG = false;
-    private boolean writeSelectHeadersInBody = false;
     private String dateOverrideFormat = null;
     private int maxOverride = 0;//ignore
 
@@ -202,20 +201,6 @@ public class OfficeParserConfig implements Serializable {
         this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG;
     }
 
-    public boolean isWriteSelectHeadersInBody() {
-        return writeSelectHeadersInBody;
-    }
-
-    /**
-     * If set to <code>true</code>, this will add to/from/cc into the
-     * body content.
-     *
-     * @param val
-     */
-    public void setWriteSelectHeadersInBody(boolean val) {
-        this.writeSelectHeadersInBody = val;
-    }
-
     public boolean isIncludeMissingRows() {
         return includeMissingRows;
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index a73adbaf6..5a2dc996e 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -227,8 +227,6 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
                 }
             }
 
-            writeSelectHeadersInBody(subject, from, msg, xhtml);
-
             // Get the message body. Preference order is: html, rtf, text
             Chunk htmlChunk = null;
             Chunk rtfChunk = null;
@@ -279,31 +277,6 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         }
     }
 
-    private void writeSelectHeadersInBody(String subject, String from, 
MAPIMessage msg, XHTMLContentHandler xhtml)
-            throws SAXException, ChunkNotFoundException {
-        if (! officeParserConfig.isWriteSelectHeadersInBody()) {
-            return;
-        }
-        xhtml.element("h1", subject);
-
-        // Output the from and to details in text, as you
-        //  often want them in text form for searching
-        xhtml.startElement("dl");
-        if (from != null) {
-            header(xhtml, "From", from);
-        }
-        header(xhtml, "To", msg.getDisplayTo());
-        header(xhtml, "Cc", msg.getDisplayCC());
-        header(xhtml, "Bcc", msg.getDisplayBCC());
-        try {
-            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
-        } catch (ChunkNotFoundException e) {
-            //swallow
-        }
-        xhtml.endElement("dl");
-
-    }
-
     private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk 
textChunk,
                                   XHTMLContentHandler xhtml)
             throws SAXException, IOException, TikaException {
@@ -312,13 +285,8 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
             extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml);
             return;
         }
-        if (officeParserConfig.isWriteSelectHeadersInBody()) {
-            xhtml.startElement("div", "class", "message-body");
-            _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
-            xhtml.endElement("div");
-        } else {
-            _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
-        }
+        _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
+
     }
     private void _handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk 
textChunk,
                                   XHTMLContentHandler xhtml)
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 686a6657c..f10f4aa7c 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -54,18 +54,12 @@ public class OutlookParserTest extends TikaTest {
     @Test
     public void testOutlookParsing() throws Exception {
 
-        //test default behavior
-        List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg", 
AUTO_DETECT_PARSER,
-                BasicContentHandlerFactory.HANDLER_TYPE.BODY);
-        assertNotContained("Microsoft Outlook Express 6", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
 
-
-        //test legacy behavior
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/test-outlook.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
         }
         assertEquals("application/vnd.ms-outlook", 
metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("Microsoft Outlook Express 6", 
metadata.get(TikaCoreProperties.TITLE));
@@ -90,9 +84,9 @@ public class OutlookParserTest extends TikaTest {
         assertEquals("2007-04-05T16:26:06Z", 
metadata.get(TikaCoreProperties.CREATED));
 
         String content = handler.toString();
-        assertContains("Microsoft Outlook Express 6", content);
-        assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
-        assertContains("Nouvel utilisateur de Outlook Express", content);
+        assertNotContained("Microsoft Outlook Express 6", content);
+        assertNotContained("L'\u00C9quipe Microsoft Outlook Express", content);
+        assertNotContained("Nouvel utilisateur de Outlook Express", content);
         assertContains("Messagerie et groupes de discussion", content);
     }
 
@@ -107,7 +101,7 @@ public class OutlookParserTest extends TikaTest {
         Metadata metadata = new Metadata();
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/testMSG.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
         }
 
         assertEquals("application/vnd.ms-outlook", 
metadata.get(Metadata.CONTENT_TYPE));
@@ -115,7 +109,6 @@ public class OutlookParserTest extends TikaTest {
         String content = handler.toString();
         Pattern pattern = Pattern.compile("From");
         Matcher matcher = pattern.matcher(content);
-        assertTrue(matcher.find());
         assertFalse(matcher.find());
 
         //test that last header is added
@@ -185,13 +178,13 @@ public class OutlookParserTest extends TikaTest {
         handler.setResult(new StreamResult(sw));
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/testMSG_chinese.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
         }
 
         // As the HTML version should have been processed, ensure
         //  we got some of the links
         String content = sw.toString();
-        assertContains("<dd>[email protected]</dd>", content);
+        assertNotContained("<dd>[email protected]</dd>", content);
         assertContains("<p>Alfresco MSG format testing", content);
         assertContains("<li>1", content);
         assertContains("<li>2", content);
@@ -259,13 +252,13 @@ public class OutlookParserTest extends TikaTest {
         handler.setResult(new StreamResult(sw));
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/test-outlook2003.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
         }
 
         // As the HTML version should have been processed, ensure
         //  we got some of the links
         String content = sw.toString().replaceAll("[\\r\\n\\t]+", " ").replaceAll(" +", " ");
-        assertContains("<dd>New Outlook User</dd>", content);
+        assertNotContained("<dd>New Outlook User</dd>", content);
         assertContains("designed <i>to help you", content);
         assertContains(
                 "<p> <a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>",
@@ -282,14 +275,6 @@ public class OutlookParserTest extends TikaTest {
         assertEquals(2, content.split("<\\/body>").length);
     }
 
-    private ParseContext configureInjectHeaders() {
-        ParseContext parseContext = new ParseContext();
-        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
-        officeParserConfig.setWriteSelectHeadersInBody(true);
-        parseContext.set(OfficeParserConfig.class, officeParserConfig);
-        return parseContext;
-    }
-
     @Test
     public void testMAPIMessageClasses() throws Exception {
 

Reply via email to