This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6d7a70416 TIKA-4391 -- identify "inline" images within .msg files.
(#2195)
6d7a70416 is described below
commit 6d7a70416366952ce16824f59dc945bc401cfab6
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 8 09:23:46 2025 -0400
TIKA-4391 -- identify "inline" images within .msg files. (#2195)
---
.../main/java/org/apache/tika/metadata/MAPI.java | 2 +
.../java/org/apache/tika/metadata/RTFMetadata.java | 3 +
.../tika/parser/microsoft/OutlookExtractor.java | 136 ++++++++++++++++-----
.../tika/parser/microsoft/rtf/TextExtractor.java | 3 +
.../tika/parser/microsoft/OutlookParserTest.java | 8 ++
5 files changed, 121 insertions(+), 31 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
index 5f4ef12ae..d52b67351 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
@@ -65,6 +65,8 @@ public interface MAPI {
Property PRIORTY = Property.internalInteger(PREFIX_MAPI_META + "priority");
Property IS_FLAGGED = Property.internalBoolean(PREFIX_MAPI_META +
"is-flagged");
+ Property BODY_TYPES_PROCESSED = Property.internalTextBag(PREFIX_MAPI_META
+ "body-types-processed");
+
Property ATTACH_LONG_PATH_NAME =
Property.internalText(PREFIX_MAPI_ATTACH_META + "long-path-name");
Property ATTACH_LONG_FILE_NAME =
Property.internalText(PREFIX_MAPI_ATTACH_META + "long-file-name");
Property ATTACH_FILE_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META
+ "file-name");
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java
b/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java
index e4572e38f..22842391f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java
@@ -44,4 +44,7 @@ public interface RTFMetadata {
Property EMB_ITEM = Property.internalText(
PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"emb_item");
+ Property CONTAINS_ENCAPSULATED_HTML = Property.internalBoolean(
+ PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"contains_encapsulated_html");
+
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 30e1ca14a..0e219dac6 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -30,11 +30,13 @@ import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -68,6 +70,7 @@ import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -90,6 +93,9 @@ import org.apache.tika.utils.StringUtils;
*/
public class OutlookExtractor extends AbstractPOIFSExtractor {
static Logger LOGGER = LoggerFactory.getLogger(OutlookExtractor.class);
+ /**
+ * Body formats of a message. The name of a constant is added to
+ * MAPI.BODY_TYPES_PROCESSED on the parent metadata each time a body
+ * of that format is processed.
+ */
+ public enum BODY_TYPES_PROCESSED {
+ HTML, RTF, TEXT;
+ }
private static final Metadata EMPTY_METADATA = new Metadata();
private static final MAPIProperty[] LITERAL_TIME_MAPI_PROPERTIES = new
MAPIProperty[] {
@@ -116,6 +122,10 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
private static final Map<String, String> MESSAGE_CLASSES = new
LinkedHashMap<>();
+ private static final Pattern IMG_TAG_PATTERN = Pattern.compile("<img
([^>]{0,1000})>");
+ private static final Pattern SRC_ATTR_PATTERN =
Pattern.compile("src=\"cid:([^\"]{0,1000})\"");
+ private static final Pattern TEXT_CID_PATTERN =
Pattern.compile("\\[cid:([^]]{0,1000})]");
+
static {
for (MAPIProperty property : LITERAL_TIME_MAPI_PROPERTIES) {
String name = property.mapiProperty.toLowerCase(Locale.ROOT);
@@ -128,8 +138,6 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
loadMessageClasses();
}
-
-
private static void loadMessageClasses() {
String fName =
"/org/apache/tika/parser/microsoft/msg/mapi_message_classes.properties";
try (BufferedReader r = new BufferedReader(
@@ -275,41 +283,54 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
textChunk = chunk;
}
}
- handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
+ Set<String> contentIdNames = new HashSet<>();
+ handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml,
contentIdNames);
// Process the attachments
for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
- Metadata metadata = new Metadata();
- updateAttachmentMetadata(attachment, metadata);
+ Metadata attachMetadata = new Metadata();
+ updateAttachmentMetadata(attachment, attachMetadata,
contentIdNames);
String filename = null;
- if
(!StringUtils.isBlank(metadata.get(MAPI.ATTACH_LONG_FILE_NAME))) {
- filename = metadata.get(MAPI.ATTACH_LONG_FILE_NAME);
- } else if
(!StringUtils.isBlank(metadata.get(MAPI.ATTACH_DISPLAY_NAME))) {
- filename = metadata.get(MAPI.ATTACH_DISPLAY_NAME);
- } else if
(!StringUtils.isBlank(metadata.get(MAPI.ATTACH_FILE_NAME))) {
- filename = metadata.get(MAPI.ATTACH_FILE_NAME);
+ if
(!StringUtils.isBlank(attachMetadata.get(MAPI.ATTACH_LONG_FILE_NAME))) {
+ filename = attachMetadata.get(MAPI.ATTACH_LONG_FILE_NAME);
+ } else if
(!StringUtils.isBlank(attachMetadata.get(MAPI.ATTACH_DISPLAY_NAME))) {
+ filename = attachMetadata.get(MAPI.ATTACH_DISPLAY_NAME);
+ } else if
(!StringUtils.isBlank(attachMetadata.get(MAPI.ATTACH_FILE_NAME))) {
+ filename = attachMetadata.get(MAPI.ATTACH_FILE_NAME);
}
//this is allowed to be null;
- String mimeType = metadata.get(MAPI.ATTACH_MIME);
+ String mimeType = attachMetadata.get(MAPI.ATTACH_MIME);
if (attachment.getAttachData() != null) {
handleEmbeddedResource(TikaInputStream.get(attachment
.getAttachData()
- .getValue()), metadata, filename, null, null,
mimeType, xhtml, true);
+ .getValue()), attachMetadata, filename, null, null,
mimeType, xhtml, true);
}
if (attachment.getAttachmentDirectory() != null) {
handleEmbeddedOfficeDoc(attachment
.getAttachmentDirectory()
- .getDirectory(), metadata, filename, xhtml, true);
+ .getDirectory(), attachMetadata, filename, xhtml,
true);
}
}
}
- private void updateAttachmentMetadata(AttachmentChunks attachment,
Metadata metadata) {
+ private void updateAttachmentMetadata(AttachmentChunks attachment,
Metadata metadata,
+ Set<String> contentIdNames) {
+ StringChunk contentIdChunk = attachment.getAttachContentId();
+ if (contentIdChunk != null) {
+ String contentId = contentIdChunk.getValue();
+ if (! StringUtils.isBlank(contentId)) {
+ contentId = contentId.trim();
+ if (contentIdNames.contains(contentId)) {
+ metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY,
+
TikaCoreProperties.EmbeddedResourceType.INLINE.name());
+ }
+ metadata.set(MAPI.ATTACH_CONTENT_ID, contentId);
+ }
+ }
addStringChunkToMetadata(MAPI.ATTACH_LONG_PATH_NAME,
attachment.getAttachLongPathName(), metadata);
addStringChunkToMetadata(MAPI.ATTACH_LONG_FILE_NAME,
attachment.getAttachLongFileName(), metadata);
addStringChunkToMetadata(MAPI.ATTACH_FILE_NAME,
attachment.getAttachFileName(), metadata);
- addStringChunkToMetadata(MAPI.ATTACH_CONTENT_ID,
attachment.getAttachContentId(), metadata);
addStringChunkToMetadata(MAPI.ATTACH_CONTENT_LOCATION,
attachment.getAttachContentLocation(), metadata);
addStringChunkToMetadata(MAPI.ATTACH_DISPLAY_NAME,
attachment.getAttachDisplayName(), metadata);
addStringChunkToMetadata(MAPI.ATTACH_EXTENSION,
attachment.getAttachExtension(), metadata);
@@ -441,20 +462,20 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
- XHTMLContentHandler xhtml)
+ XHTMLContentHandler xhtml, Set<String>
contentIdNames)
throws SAXException, IOException, TikaException {
if (extractAllAlternatives) {
- extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml);
+ extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml,
contentIdNames);
return;
}
- _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
+ _handleBestBodyChunk(htmlChunk, rtfChunk, textChunk, xhtml,
contentIdNames);
}
- private void _handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
- XHTMLContentHandler xhtml)
+ private void _handleBestBodyChunk(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
+ XHTMLContentHandler xhtml, Set<String>
contentIdNames)
throws SAXException, IOException, TikaException {
- boolean doneBody = false;
+ //try html, then rtf, then text
if (htmlChunk != null) {
byte[] data = null;
if (htmlChunk instanceof ByteChunk) {
@@ -468,13 +489,16 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
if (htmlParser == null) {
htmlParser = new JSoupParser();
}
+ Metadata htmlMetadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(data)) {
- htmlParser.parse(tis, new EmbeddedContentHandler(new
BodyContentHandler(xhtml)), new Metadata(), parseContext);
+ htmlParser.parse(tis, new EmbeddedContentHandler(new
BodyContentHandler(xhtml)), htmlMetadata, parseContext);
}
- doneBody = true;
+ extractContentIdNamesFromHtml(data, htmlMetadata,
contentIdNames);
+ parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.HTML.name());
+ return;
}
}
- if (rtfChunk != null && (extractAllAlternatives || !doneBody)) {
+ if (rtfChunk != null) {
ByteChunk chunk = (ByteChunk) rtfChunk;
//avoid buffer underflow TIKA-2530
//TODO -- would be good to find an example triggering file and
@@ -488,26 +512,64 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
if (rtfParser == null) {
rtfParser = new RTFParser();
}
+ Metadata rtfMetadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(rtf.getData()))
{
- rtfParser.parseInline(tis, xhtml, new Metadata(),
parseContext);
+ rtfParser.parseInline(tis, xhtml, rtfMetadata,
parseContext);
}
- doneBody = true;
+ extractContentIdNamesFromRtf(rtf.getData(), rtfMetadata,
contentIdNames);
+ parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.RTF.name());
+ parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML,
+
rtfMetadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
+ return;
}
}
- if (textChunk != null && (extractAllAlternatives || !doneBody)) {
- xhtml.element("p", ((StringChunk) textChunk).getValue());
+ if (textChunk != null) {
+ String s = ((StringChunk) textChunk).getValue();
+ xhtml.element("p", s);
+ extractContentIdNamesFromText(s, contentIdNames);
+ parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.TEXT.name());
+ }
+
+ }
+
+ /**
+ * Collects the content ids referenced by an RTF body. The bytes are
+ * handed unchanged to extractContentIdNamesFromHtml, so ids are only
+ * found when the RTF carries html-style img/src markup.
+ *
+ * @param data bytes of the RTF body
+ * @param metadata passed through to extractContentIdNamesFromHtml
+ * @param contentIdNames set that receives the content ids found
+ */
+ private void extractContentIdNamesFromRtf(byte[] data, Metadata metadata,
Set<String> contentIdNames) {
+ //for now, hope that there's encapsulated html
+ //TODO: check for encapsulated html. If it doesn't exist, handle RTF
specifically
+ extractContentIdNamesFromHtml(data, metadata, contentIdNames);
+ }
+
+ /**
+ * Decodes the bytes as UTF-8, finds every match of IMG_TAG_PATTERN and,
+ * within each match, every match of SRC_ATTR_PATTERN. The captured
+ * content id of each src match is trimmed and added to the set.
+ *
+ * @param data bytes to scan
+ * @param metadata not read or modified by this method
+ * @param contentIdNames set that receives the content ids found
+ */
+ private void extractContentIdNamesFromHtml(byte[] data, Metadata metadata,
Set<String> contentIdNames) {
+ String html = new String(data, UTF_8);
+ Matcher imageMatcher = IMG_TAG_PATTERN.matcher(html);
+ //single src matcher, reset against the contents of each img match
+ Matcher cidSrcMatcher = SRC_ATTR_PATTERN.matcher("");
+ while (imageMatcher.find()) {
+ String imgElementContents = imageMatcher.group(1);
+ cidSrcMatcher.reset(imgElementContents);
+ while (cidSrcMatcher.find()) {
+ String cid = cidSrcMatcher.group(1);
+ //trimmed, as the attachment content ids are before lookup
+ cid = cid.trim();
+ contentIdNames.add(cid);
+ }
}
+ }
+ /**
+ * Records every content id that appears in the plain text body as a
+ * match of TEXT_CID_PATTERN. Ids are trimmed before they are stored,
+ * the same way extractContentIdNamesFromHtml and the attachment
+ * content id lookup trim theirs, so that surrounding whitespace in
+ * the marker cannot prevent a match.
+ *
+ * @param s text of the body; a null value is ignored
+ * @param contentIdNames set that receives the content ids found
+ */
+ private void extractContentIdNamesFromText(String s, Set<String>
contentIdNames) {
+ if (s == null) {
+ //nothing to scan; Pattern.matcher would throw on null
+ return;
+ }
+ Matcher m = TEXT_CID_PATTERN.matcher(s);
+ while (m.find()) {
+ contentIdNames.add(m.group(1).trim());
+ }
}
private void extractAllAlternatives(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
- XHTMLContentHandler xhtml)
+ XHTMLContentHandler xhtml, Set<String>
contentIdNames)
throws TikaException, SAXException, IOException {
if (htmlChunk != null) {
byte[] data = getValue(htmlChunk);
if (data != null) {
handleEmbeddedResource(TikaInputStream.get(data), "html-body",
null,
MediaType.TEXT_HTML.toString(), xhtml, true);
+ extractContentIdNamesFromHtml(data, new Metadata(),
contentIdNames);
+ parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.HTML.name());
}
}
if (rtfChunk != null) {
@@ -518,8 +580,16 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
byte[] data = rtf.getData();
if (data != null) {
- handleEmbeddedResource(TikaInputStream.get(data), "rtf-body",
null,
+ Metadata rtfMetadata = new Metadata();
+ handleEmbeddedResource(TikaInputStream.get(data), rtfMetadata,
+ "rtf-body", null, null,
"application/rtf", xhtml, true);
+ extractContentIdNamesFromRtf(data, rtfMetadata,
contentIdNames);
+ //copy this info into the parent...what else should we copy?
+ parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.RTF.name());
+ parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML,
+
rtfMetadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
+
}
}
if (textChunk != null) {
@@ -530,6 +600,10 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
MediaType.TEXT_PLAIN.toString());
handleEmbeddedResource(TikaInputStream.get(data),
chunkMetadata, null, "text-body",
null, MediaType.TEXT_PLAIN.toString(), xhtml, true);
+ if (textChunk instanceof StringChunk) {
+ extractContentIdNamesFromText(((StringChunk)
textChunk).getValue(), contentIdNames);
+ }
+ parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.TEXT.name());
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index 1fe173259..5612de219 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -50,6 +50,7 @@ import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.utils.CharsetUtils;
@@ -899,6 +900,8 @@ final class TextExtractor {
hour = param;
} else if (equals("min")) {
minute = param;
+ } else if (equals("fromhtml") && param == 1) {
+ metadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML, true);
}
if (fontTableState == 1) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 6c532a84e..ba8a6c64e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -39,6 +39,7 @@ import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@@ -240,6 +241,8 @@ public class OutlookParserTest extends TikaTest {
metadata.get(MAPI.INTERNET_REFERENCES));
assertEquals("<c8508767c15dbf40a21693142739ea8d564d18f...@exvmbx018-1.exch018.msoutlookonline.net>",
metadata.get(MAPI.IN_REPLY_TO_ID));
+
+ assertEquals("true",
metadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
}
@Test
@@ -247,6 +250,7 @@ public class OutlookParserTest extends TikaTest {
List<Metadata> metadataList =
getRecursiveMetadata("testMSG_att_msg.msg");
assertEquals("/Test Attachment.msg",
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
assertEquals("/smbprn.00009008.KdcPjl.pdf",
metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals("true",
metadataList.get(0).get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
}
@Test
@@ -289,6 +293,8 @@ public class OutlookParserTest extends TikaTest {
// Make sure we don't have nested html docs
assertEquals(2, content.split("<body>").length);
assertEquals(2, content.split("<\\/body>").length);
+
+ assertEquals("true",
metadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
}
@Test
@@ -323,6 +329,8 @@ public class OutlookParserTest extends TikaTest {
assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:42"));
assertEquals("0", m.get("mapi:property:PidLidAppointmentSequence"));
assertEquals("false", m.get("mapi:property:PidLidRecurring"));
+ assertEquals("true", m.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
+
}
@Test