This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 55d3d788c TIKA-4345 -- stop injecting headers into the body of msg files (#2042)
55d3d788c is described below
commit 55d3d788c962c32763e068b368ed19da7118253c
Author: Tim Allison <[email protected]>
AuthorDate: Wed Nov 13 10:34:53 2024 -0500
TIKA-4345 -- stop injecting headers into the body of msg files (#2042)
* TIKA-4345 -- stop injecting headers into the body for msg files
---
CHANGES.txt | 7 +++++
.../parser/microsoft/AbstractOfficeParser.java | 13 --------
.../tika/parser/microsoft/OfficeParserConfig.java | 15 ---------
.../tika/parser/microsoft/OutlookExtractor.java | 36 ++--------------------
.../tika/parser/microsoft/OutlookParserTest.java | 33 ++++++--------------
5 files changed, 18 insertions(+), 86 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 395f41b07..a5feb090f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,8 +1,15 @@
Release 4.0.0-BETA1 - ???
BREAKING CHANGES
+ * Headers are no longer injected into the body/content of MSG files (TIKA-4345). Please open
+ a ticket if you need this behavior across email formats.
+
+
Release 3.1.0 - ??
+ * Allow users to turn off the injection of some headers into the content stream of MSG
+ files (TIKA-4345).
+
* Add a wrapper for Google's magika detector (TIKA-4344).
* Add support for MachO via Alexey Pelykh (TIKA-4309).
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index ea5179552..a44073d4e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -181,17 +181,4 @@ public abstract class AbstractOfficeParser implements Parser {
return defaultOfficeParserConfig.isIncludeHeadersAndFooters();
}
- /**
- * If set to <code>true</code>, this will write the to/from/cc into the body content
- *
- * @param val
- */
- @Field
- public void setWriteSelectHeadersInBody(boolean val) {
- defaultOfficeParserConfig.setWriteSelectHeadersInBody(val);
- }
-
- public boolean isWriteSelectHeadersInBody() {
- return defaultOfficeParserConfig.isWriteSelectHeadersInBody();
- }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index af69eefa1..8e761efad 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -36,7 +36,6 @@ public class OfficeParserConfig implements Serializable {
private boolean useSAXPptxExtractor = false;
private boolean extractAllAlternativesFromMSG = false;
- private boolean writeSelectHeadersInBody = false;
private String dateOverrideFormat = null;
private int maxOverride = 0;//ignore
@@ -202,20 +201,6 @@ public class OfficeParserConfig implements Serializable {
this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG;
}
- public boolean isWriteSelectHeadersInBody() {
- return writeSelectHeadersInBody;
- }
-
- /**
- * If set to <code>true</code>, this will add to/from/cc into the
- * body content.
- *
- * @param val
- */
- public void setWriteSelectHeadersInBody(boolean val) {
- this.writeSelectHeadersInBody = val;
- }
-
public boolean isIncludeMissingRows() {
return includeMissingRows;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index a73adbaf6..5a2dc996e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -227,8 +227,6 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
}
- writeSelectHeadersInBody(subject, from, msg, xhtml);
-
// Get the message body. Preference order is: html, rtf, text
Chunk htmlChunk = null;
Chunk rtfChunk = null;
@@ -279,31 +277,6 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
}
- private void writeSelectHeadersInBody(String subject, String from,
MAPIMessage msg, XHTMLContentHandler xhtml)
- throws SAXException, ChunkNotFoundException {
- if (! officeParserConfig.isWriteSelectHeadersInBody()) {
- return;
- }
- xhtml.element("h1", subject);
-
- // Output the from and to details in text, as you
- // often want them in text form for searching
- xhtml.startElement("dl");
- if (from != null) {
- header(xhtml, "From", from);
- }
- header(xhtml, "To", msg.getDisplayTo());
- header(xhtml, "Cc", msg.getDisplayCC());
- header(xhtml, "Bcc", msg.getDisplayBCC());
- try {
- header(xhtml, "Recipients", msg.getRecipientEmailAddress());
- } catch (ChunkNotFoundException e) {
- //swallow
- }
- xhtml.endElement("dl");
-
- }
-
private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
XHTMLContentHandler xhtml)
throws SAXException, IOException, TikaException {
@@ -312,13 +285,8 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml);
return;
}
- if (officeParserConfig.isWriteSelectHeadersInBody()) {
- xhtml.startElement("div", "class", "message-body");
- _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
- xhtml.endElement("div");
- } else {
- _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
- }
+ _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
+
}
private void _handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
XHTMLContentHandler xhtml)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 686a6657c..f10f4aa7c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -54,18 +54,12 @@ public class OutlookParserTest extends TikaTest {
@Test
public void testOutlookParsing() throws Exception {
- //test default behavior
- List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg",
AUTO_DETECT_PARSER,
- BasicContentHandlerFactory.HANDLER_TYPE.BODY);
- assertNotContained("Microsoft Outlook Express 6",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
-
- //test legacy behavior
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
try (InputStream stream =
getResourceAsStream("/test-documents/test-outlook.msg")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders());
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext());
}
assertEquals("application/vnd.ms-outlook",
metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Microsoft Outlook Express 6",
metadata.get(TikaCoreProperties.TITLE));
@@ -90,9 +84,9 @@ public class OutlookParserTest extends TikaTest {
assertEquals("2007-04-05T16:26:06Z",
metadata.get(TikaCoreProperties.CREATED));
String content = handler.toString();
- assertContains("Microsoft Outlook Express 6", content);
- assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
- assertContains("Nouvel utilisateur de Outlook Express", content);
+ assertNotContained("Microsoft Outlook Express 6", content);
+ assertNotContained("L'\u00C9quipe Microsoft Outlook Express", content);
+ assertNotContained("Nouvel utilisateur de Outlook Express", content);
assertContains("Messagerie et groupes de discussion", content);
}
@@ -107,7 +101,7 @@ public class OutlookParserTest extends TikaTest {
Metadata metadata = new Metadata();
try (InputStream stream =
getResourceAsStream("/test-documents/testMSG.msg")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata,
configureInjectHeaders());
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata, new
ParseContext());
}
assertEquals("application/vnd.ms-outlook",
metadata.get(Metadata.CONTENT_TYPE));
@@ -115,7 +109,6 @@ public class OutlookParserTest extends TikaTest {
String content = handler.toString();
Pattern pattern = Pattern.compile("From");
Matcher matcher = pattern.matcher(content);
- assertTrue(matcher.find());
assertFalse(matcher.find());
//test that last header is added
@@ -185,13 +178,13 @@ public class OutlookParserTest extends TikaTest {
handler.setResult(new StreamResult(sw));
try (InputStream stream =
getResourceAsStream("/test-documents/testMSG_chinese.msg")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata,
configureInjectHeaders());
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata, new
ParseContext());
}
// As the HTML version should have been processed, ensure
// we got some of the links
String content = sw.toString();
- assertContains("<dd>[email protected]</dd>", content);
+ assertNotContained("<dd>[email protected]</dd>", content);
assertContains("<p>Alfresco MSG format testing", content);
assertContains("<li>1", content);
assertContains("<li>2", content);
@@ -259,13 +252,13 @@ public class OutlookParserTest extends TikaTest {
handler.setResult(new StreamResult(sw));
try (InputStream stream =
getResourceAsStream("/test-documents/test-outlook2003.msg")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata,
configureInjectHeaders());
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata, new
ParseContext());
}
// As the HTML version should have been processed, ensure
// we got some of the links
String content = sw.toString().replaceAll("[\\r\\n\\t]+", " ").replaceAll(" +", " ");
- assertContains("<dd>New Outlook User</dd>", content);
+ assertNotContained("<dd>New Outlook User</dd>", content);
assertContains("designed <i>to help you", content);
assertContains(
"<p> <a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>",
@@ -282,14 +275,6 @@ public class OutlookParserTest extends TikaTest {
assertEquals(2, content.split("<\\/body>").length);
}
- private ParseContext configureInjectHeaders() {
- ParseContext parseContext = new ParseContext();
- OfficeParserConfig officeParserConfig = new OfficeParserConfig();
- officeParserConfig.setWriteSelectHeadersInBody(true);
- parseContext.set(OfficeParserConfig.class, officeParserConfig);
- return parseContext;
- }
-
@Test
public void testMAPIMessageClasses() throws Exception {