This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6060d9a115 TIKA-4696 improve inline tagging (#2711)
6060d9a115 is described below
commit 6060d9a115205d175fc13c51e93bf03e0d3fe1cf
Author: Tim Allison <[email protected]>
AuthorDate: Mon Mar 23 16:13:05 2026 -0400
TIKA-4696 improve inline tagging (#2711)
---
.../main/java/org/apache/tika/metadata/MAPI.java | 14 +
.../tika/parser/microsoft/OutlookExtractor.java | 154 ++++++-
.../msg/RTFEncapsulatedHTMLExtractor.java | 453 +++++++++++++++++++++
.../tika/parser/microsoft/OutlookParserTest.java | 43 ++
.../msg/RTFEncapsulatedHTMLExtractorTest.java | 217 ++++++++++
5 files changed, 867 insertions(+), 14 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
index 613c3d3d9d..c8f81a980d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
@@ -77,4 +77,18 @@ public interface MAPI {
Property ATTACH_MIME = Property.internalText(PREFIX_MAPI_ATTACH_META +
"mime");
Property ATTACH_LANGUAGE = Property.internalText(PREFIX_MAPI_ATTACH_META +
"language");
+ /**
+ * PidTagAttachFlags (0x3714) — indicates which body formats might
reference this attachment.
+ * Bit 1 (0x1) = ATT_INVISIBLE_IN_HTML
+ * Bit 2 (0x2) = ATT_INVISIBLE_IN_RTF
+ * Bit 3 (0x4) = ATT_RENDERED_IN_BODY
+ */
+ Property ATTACH_FLAGS = Property.internalInteger(PREFIX_MAPI_ATTACH_META +
"flags");
+
+ /**
+ * PidTagAttachmentHidden (0x7FFE) — indicates whether this attachment is
hidden from the end
+ * user. Inline images typically have this set to true.
+ */
+ Property ATTACH_HIDDEN = Property.internalBoolean(PREFIX_MAPI_ATTACH_META
+ "hidden");
+
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 552c52889c..0b8db23f45 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -20,8 +20,11 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.BufferedReader;
import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
@@ -56,7 +59,10 @@ import org.apache.poi.hsmf.datatypes.RecipientChunks;
import org.apache.poi.hsmf.datatypes.StringChunk;
import org.apache.poi.hsmf.datatypes.Types;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.CodePageUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -79,6 +85,7 @@ import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.JSoupParser;
import org.apache.tika.parser.mailcommons.MailDateParser;
import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor;
+import org.apache.tika.parser.microsoft.msg.RTFEncapsulatedHTMLExtractor;
import org.apache.tika.parser.microsoft.rtf.RTFParser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
@@ -173,6 +180,7 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
private static Pattern HEADER_KEY_PAT =
Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");
+ private final DirectoryNode root;
private final MAPIMessage msg;
private final ParseContext parseContext;
private final boolean extractAllAlternatives;
@@ -181,6 +189,7 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
public OutlookExtractor(DirectoryNode root, Metadata metadata,
ParseContext context) throws TikaException {
super(context, metadata);
+ this.root = root;
this.parseContext = context;
this.extractAllAlternatives =
context.get(OfficeParserConfig.class).isExtractAllAlternativesFromMSG();
@@ -317,18 +326,7 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
private void updateAttachmentMetadata(AttachmentChunks attachment,
Metadata metadata,
Set<String> contentIdNames) {
- StringChunk contentIdChunk = attachment.getAttachContentId();
- if (contentIdChunk != null) {
- String contentId = contentIdChunk.getValue();
- if (! StringUtils.isBlank(contentId)) {
- contentId = contentId.trim();
- if (contentIdNames.contains(contentId)) {
- metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY,
-
TikaCoreProperties.EmbeddedResourceType.INLINE.name());
- }
- metadata.set(MAPI.ATTACH_CONTENT_ID, contentId);
- }
- }
+ // Extract string-based metadata from POI's named chunk getters
addStringChunkToMetadata(MAPI.ATTACH_LONG_PATH_NAME,
attachment.getAttachLongPathName(), metadata);
addStringChunkToMetadata(MAPI.ATTACH_LONG_FILE_NAME,
attachment.getAttachLongFileName(), metadata);
addStringChunkToMetadata(MAPI.ATTACH_FILE_NAME,
attachment.getAttachFileName(), metadata);
@@ -337,6 +335,129 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
addStringChunkToMetadata(MAPI.ATTACH_EXTENSION,
attachment.getAttachExtension(), metadata);
addStringChunkToMetadata(MAPI.ATTACH_MIME,
attachment.getAttachMimeTag(), metadata);
addStringChunkToMetadata(MAPI.ATTACH_LANGUAGE,
attachment.getAttachLanguage(), metadata);
+
+ // Extract fixed properties from the attachment's
__properties_version1.0 stream
+ // POI's AttachmentChunks doesn't parse this stream, so we read it
directly.
+ Map<Integer, Long> attachProps =
readAttachmentProperties(attachment.getPOIFSName());
+ Long attachFlags = attachProps.get(PID_TAG_ATTACH_FLAGS);
+ if (attachFlags != null) {
+ metadata.set(MAPI.ATTACH_FLAGS, attachFlags.intValue());
+ }
+ Long attachHidden = attachProps.get(PID_TAG_ATTACHMENT_HIDDEN);
+ if (attachHidden != null) {
+ metadata.set(MAPI.ATTACH_HIDDEN, attachHidden.intValue() != 0);
+ }
+
+ // Determine inline vs attachment
+ String contentId = null;
+ StringChunk contentIdChunk = attachment.getAttachContentId();
+ if (contentIdChunk != null) {
+ String rawCid = contentIdChunk.getValue();
+ if (!StringUtils.isBlank(rawCid)) {
+ contentId = rawCid.trim();
+ metadata.set(MAPI.ATTACH_CONTENT_ID, contentId);
+ }
+ }
+
+ if (contentId != null && contentIdNames.contains(contentId)) {
+ // Layer 1: CID referenced in the message body — high confidence
inline
+ metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY,
+ TikaCoreProperties.EmbeddedResourceType.INLINE.name());
+ } else if (contentId != null
+ && attachFlags != null
+ && (attachFlags & ATT_RENDERED_IN_BODY) != 0
+ && isInlineableMimeType(metadata.get(MAPI.ATTACH_MIME))) {
+ // Layer 2: MAPI says rendered in body + image MIME type — the CID
regex
+ // missed it (e.g. encapsulated RTF with stripped img tags)
+ metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY,
+ TikaCoreProperties.EmbeddedResourceType.INLINE.name());
+ }
+ }
+
+ private static final Set<String> INLINEABLE_MIME_TYPES = Set.of(
+ "application/x-ms-wmz",
+ "application/x-ms-emz",
+ "application/x-msmetafile",
+ "image/x-wmf",
+ "image/x-emf",
+ "image/wmf",
+ "image/emf"
+ );
+
+ /**
+ * Returns true for MIME types that are safe to label as INLINE.
+ * We gate on this to avoid marking PDFs, DOCX, etc. as inline — downstream
+ * consumers use INLINE to decide what to index separately.
+ */
+ private static boolean isInlineableMimeType(String mimeType) {
+ if (StringUtils.isBlank(mimeType)) {
+ return false;
+ }
+ String lower = mimeType.toLowerCase(Locale.ROOT).trim();
+ return lower.startsWith("image/") ||
INLINEABLE_MIME_TYPES.contains(lower);
+ }
+
+ // PidTagAttachFlags (0x3714) — bit flags indicating which body formats
reference this
+ private static final int PID_TAG_ATTACH_FLAGS = 0x3714;
+ // Bit 3 (0x4) = ATT_RENDERED_IN_BODY: this attachment is referenced by the body
+ private static final int ATT_RENDERED_IN_BODY = 0x4;
+ // PidTagAttachmentHidden (0x7FFE) — boolean, true if hidden from end user
(inline images)
+ private static final int PID_TAG_ATTACHMENT_HIDDEN = 0x7FFE;
+
+ /**
+ * Read fixed MAPI properties from the __properties_version1.0 stream
inside an
+ * attachment storage. POI's {@link AttachmentChunks} does not parse this
stream.
+ *
+ * <p>The stream format is: 8-byte header, followed by 16-byte property
entries.
+ * Each entry: 2 bytes property type, 2 bytes property ID, 4 bytes flags,
+ * 8 bytes value (inline for fixed-size types).</p>
+ *
+ * @param poifsName the OLE2 directory name for this attachment
+ * (e.g. "__attach_version1.0_#00000000")
+ * @return map of property ID to value for fixed-size integer/boolean
properties
+ */
+ private Map<Integer, Long> readAttachmentProperties(String poifsName) {
+ Map<Integer, Long> result = new HashMap<>();
+ try {
+ DirectoryEntry attachDir = (DirectoryEntry)
root.getEntry(poifsName);
+ DocumentEntry propsEntry =
+ (DocumentEntry)
attachDir.getEntry("__properties_version1.0");
+ byte[] data;
+ try (InputStream dis = new DocumentInputStream(propsEntry)) {
+ data = dis.readAllBytes();
+ }
+ if (data.length < 8) {
+ return result;
+ }
+ ByteBuffer buf =
ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
+ int offset = 8; // skip 8-byte header
+ while (offset + 16 <= data.length) {
+ int propType = buf.getShort(offset) & 0xFFFF;
+ int propId = buf.getShort(offset + 2) & 0xFFFF;
+ long value;
+ switch (propType) {
+ case 0x0003: // PtypInteger32
+ value = buf.getInt(offset + 8) & 0xFFFFFFFFL;
+ result.put(propId, value);
+ break;
+ case 0x000B: // PtypBoolean
+ value = buf.getShort(offset + 8) & 0xFFFF;
+ result.put(propId, value);
+ break;
+ case 0x0014: // PtypInteger64
+ value = buf.getLong(offset + 8);
+ result.put(propId, value);
+ break;
+ default:
+ // skip variable-length, binary, time and other types
+ break;
+ }
+ offset += 16;
+ }
+ } catch (Exception e) {
+ LOGGER.debug("Could not read attachment properties for {}",
poifsName, e);
+ }
+ return result;
}
private void addStringChunkToMetadata(Property property, StringChunk
stringChunk, Metadata metadata) {
@@ -534,8 +655,13 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
private void extractContentIdNamesFromRtf(byte[] data, Metadata metadata,
Set<String> contentIdNames) {
- //for now, hope that there's encapsulated html
- //TODO: check for encapsulated html. If it doesn't exist, handle RTF
specifically
+ // Try to de-encapsulate the HTML from the RTF first
+ String html = RTFEncapsulatedHTMLExtractor.extract(data);
+ if (html != null) {
+ extractContentIdNamesFromHtml(html.getBytes(UTF_8), metadata,
contentIdNames);
+ return;
+ }
+ // Fall back to scanning the raw RTF bytes for cid: references
extractContentIdNamesFromHtml(data, metadata, contentIdNames);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
new file mode 100644
index 0000000000..3ef453a48d
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
@@ -0,0 +1,453 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.msg;
+
+import java.io.ByteArrayOutputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Extracts the original HTML from an RTF document that contains encapsulated
HTML
+ * (as indicated by the {@code \fromhtml1} control word).
+ *
+ * <p>The encapsulated HTML format stores HTML in two places:</p>
+ * <ol>
+ * <li>{@code {\*\htmltag<N> ...}} groups — contain the HTML markup (tags,
style blocks, etc.)</li>
+ * <li>Text between htmltag groups — contains the actual text content,
provided it is NOT
+ * wrapped in {@code \htmlrtf ... \htmlrtf0} (which marks RTF-only
rendering hints)</li>
+ * </ol>
+ *
+ * <p>Within both htmltag groups and inter-tag text, the following RTF escapes
are decoded:</p>
+ * <ul>
+ * <li>{@code \par} (and {@code \pard}) → newline</li>
+ * <li>{@code \tab} → tab character</li>
+ * <li>{@code \line} → {@code <br>}</li>
+ * <li>{@code \'xx} → single byte (decoded using the document's ANSI code
page)</li>
+ * <li>{@code \\}, {@code \{}, {@code \}} → literal characters</li>
+ * </ul>
+ */
+public class RTFEncapsulatedHTMLExtractor {
+
+ private static final Logger LOGGER =
LoggerFactory.getLogger(RTFEncapsulatedHTMLExtractor.class);
+
+ private static final String HTMLTAG_PREFIX = "{\\*\\htmltag";
+ private static final String FROM_HTML_MARKER = "\\fromhtml";
+ private static final String ANSICPG_PREFIX = "\\ansicpg";
+
+ /**
+ * Extracts the HTML content from an encapsulated-HTML RTF document.
+ *
+ * @param rtfBytes the decompressed RTF bytes
+ * @return the extracted HTML string, or {@code null} if the RTF does not
contain
+ * encapsulated HTML
+ */
+ public static String extract(byte[] rtfBytes) {
+ if (rtfBytes == null || rtfBytes.length == 0) {
+ return null;
+ }
+ // Work with US-ASCII — RTF is 7-bit and non-ASCII bytes are escaped
as \'xx
+ String rtf = new String(rtfBytes, StandardCharsets.US_ASCII);
+
+ if (!rtf.contains(FROM_HTML_MARKER)) {
+ return null;
+ }
+
+ Charset codePage = detectCodePage(rtf);
+
+ // Find the start of the document body (after the RTF header).
+ // We skip past the initial {\rtf1... header by finding the first
+ // htmltag group — everything before that is RTF preamble.
+ int bodyStart = rtf.indexOf(HTMLTAG_PREFIX);
+ if (bodyStart < 0) {
+ return null;
+ }
+
+ StringBuilder html = new StringBuilder(rtf.length() / 2);
+ ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();
+ int pos = bodyStart;
+ int len = rtf.length();
+ boolean inHtmlRtfSkip = false;
+
+ while (pos < len) {
+ // Check if we're at an htmltag group
+ if (rtf.startsWith(HTMLTAG_PREFIX, pos)) {
+ flushPendingBytes(pendingBytes, html, codePage);
+
+ // Find matching close brace
+ int groupEnd = findMatchingBrace(rtf, pos);
+ if (groupEnd < 0) {
+ break;
+ }
+
+ // Skip {\*\htmltag prefix and digit(s)
+ int contentStart = pos + HTMLTAG_PREFIX.length();
+ while (contentStart < groupEnd &&
Character.isDigit(rtf.charAt(contentStart))) {
+ contentStart++;
+ }
+ // Skip optional space after tag number
+ if (contentStart < groupEnd && rtf.charAt(contentStart) == ' ') {
+ contentStart++;
+ }
+
+ // Decode the htmltag content
+ String inner = rtf.substring(contentStart, groupEnd);
+ decodeRtfEscapes(inner, html, codePage);
+
+ pos = groupEnd + 1;
+ continue;
+ }
+
+ // Check for \htmlrtf control word (start or end of RTF-only block)
+ if (rtf.startsWith("\\htmlrtf", pos)) {
+ flushPendingBytes(pendingBytes, html, codePage);
+ int afterWord = pos + "\\htmlrtf".length();
+
+ if (afterWord < len && rtf.charAt(afterWord) == '0') {
+ // \htmlrtf0 — end of skip block
+ inHtmlRtfSkip = false;
+ afterWord++;
+ if (afterWord < len && rtf.charAt(afterWord) == ' ') {
+ afterWord++;
+ }
+ } else {
+ // \htmlrtf — start of skip block
+ inHtmlRtfSkip = true;
+ if (afterWord < len && rtf.charAt(afterWord) == ' ') {
+ afterWord++;
+ }
+ }
+ pos = afterWord;
+ continue;
+ }
+
+ // If we're inside an \htmlrtf skip block, just advance past this
character.
+ // We don't skip nested groups wholesale because \htmlrtf0 may
appear inside them.
+ if (inHtmlRtfSkip) {
+ pos++;
+ continue;
+ }
+
+ // Check for other { groups (nested RTF groups that aren't htmltag)
+ if (rtf.charAt(pos) == '{') {
+ flushPendingBytes(pendingBytes, html, codePage);
+ int end = findMatchingBrace(rtf, pos);
+ if (end > 0) {
+ pos = end + 1;
+ } else {
+ pos++;
+ }
+ continue;
+ }
+
+ // Skip closing braces
+ if (rtf.charAt(pos) == '}') {
+ flushPendingBytes(pendingBytes, html, codePage);
+ pos++;
+ continue;
+ }
+
+ // Handle RTF escapes in inter-tag text
+ if (rtf.charAt(pos) == '\\' && pos + 1 < len) {
+ char next = rtf.charAt(pos + 1);
+
+ // \'xx hex escape
+ if (next == '\'' && pos + 3 < len) {
+ int hi = Character.digit(rtf.charAt(pos + 2), 16);
+ int lo = Character.digit(rtf.charAt(pos + 3), 16);
+ if (hi >= 0 && lo >= 0) {
+ pendingBytes.write((hi << 4) | lo);
+ }
+ pos += 4;
+ continue;
+ }
+
+ flushPendingBytes(pendingBytes, html, codePage);
+
+ // Escaped literals
+ if (next == '\\' || next == '{' || next == '}') {
+ html.append(next);
+ pos += 2;
+ continue;
+ }
+
+ // Control word
+ if (Character.isLetter(next)) {
+ int wordStart = pos + 1;
+ int wordEnd = wordStart;
+ while (wordEnd < len &&
Character.isLetter(rtf.charAt(wordEnd))) {
+ wordEnd++;
+ }
+ String word = rtf.substring(wordStart, wordEnd);
+
+ // Skip optional numeric parameter
+ int paramEnd = wordEnd;
+ if (paramEnd < len && (rtf.charAt(paramEnd) == '-'
+ || Character.isDigit(rtf.charAt(paramEnd)))) {
+ paramEnd++;
+ while (paramEnd < len &&
Character.isDigit(rtf.charAt(paramEnd))) {
+ paramEnd++;
+ }
+ }
+ // Skip optional space delimiter
+ int afterWord = paramEnd;
+ if (afterWord < len && rtf.charAt(afterWord) == ' ') {
+ afterWord++;
+ }
+
+ switch (word) {
+ case "par":
+ case "pard":
+ html.append('\n');
+ break;
+ case "tab":
+ html.append('\t');
+ break;
+ case "line":
+ html.append("<br>");
+ break;
+ default:
+ // Skip unknown control words
+ break;
+ }
+ pos = afterWord;
+ continue;
+ }
+
+ // Unknown escape — skip backslash
+ pos++;
+ continue;
+ }
+
+ // Newlines/carriage returns in RTF are whitespace, not content
+ if (rtf.charAt(pos) == '\r' || rtf.charAt(pos) == '\n') {
+ pos++;
+ continue;
+ }
+
+ // Regular text character between htmltag groups — this is HTML
content
+ flushPendingBytes(pendingBytes, html, codePage);
+ html.append(rtf.charAt(pos));
+ pos++;
+ }
+
+ flushPendingBytes(pendingBytes, html, codePage);
+
+ if (html.length() == 0) {
+ return null;
+ }
+ return html.toString();
+ }
+
+ /**
+ * Find the position of the closing brace that matches the opening brace at
+ * {@code openPos}. Handles nested groups and escaped braces.
+ *
+ * @return index of the closing '}', or -1 if not found
+ */
+ static int findMatchingBrace(String rtf, int openPos) {
+ int depth = 0;
+ int len = rtf.length();
+ for (int i = openPos; i < len; i++) {
+ char c = rtf.charAt(i);
+ if (c == '\\' && i + 1 < len) {
+ char next = rtf.charAt(i + 1);
+ if (next == '{' || next == '}' || next == '\\') {
+ i++;
+ continue;
+ }
+ }
+ if (c == '{') {
+ depth++;
+ } else if (c == '}') {
+ depth--;
+ if (depth == 0) {
+ return i;
+ }
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * Decode RTF escapes within an htmltag group's content.
+ */
+ static void decodeRtfEscapes(String content, StringBuilder out, Charset
codePage) {
+ int len = content.length();
+ int i = 0;
+ ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();
+
+ while (i < len) {
+ char c = content.charAt(i);
+
+ if (c == '\\') {
+ if (i + 1 >= len) {
+ break;
+ }
+ char next = content.charAt(i + 1);
+
+ // \'xx hex escape
+ if (next == '\'' && i + 3 < len) {
+ int hi = Character.digit(content.charAt(i + 2), 16);
+ int lo = Character.digit(content.charAt(i + 3), 16);
+ if (hi >= 0 && lo >= 0) {
+ pendingBytes.write((hi << 4) | lo);
+ }
+ i += 4;
+ continue;
+ }
+
+ flushPendingBytes(pendingBytes, out, codePage);
+
+ if (next == '\\' || next == '{' || next == '}') {
+ out.append(next);
+ i += 2;
+ continue;
+ }
+
+ // Control words
+ if (Character.isLetter(next)) {
+ int wordStart = i + 1;
+ int wordEnd = wordStart;
+ while (wordEnd < len &&
Character.isLetter(content.charAt(wordEnd))) {
+ wordEnd++;
+ }
+ String word = content.substring(wordStart, wordEnd);
+
+ int paramEnd = wordEnd;
+ if (paramEnd < len && (content.charAt(paramEnd) == '-'
+ || Character.isDigit(content.charAt(paramEnd)))) {
+ paramEnd++;
+ while (paramEnd < len &&
Character.isDigit(content.charAt(paramEnd))) {
+ paramEnd++;
+ }
+ }
+ int afterWord = paramEnd;
+ if (afterWord < len && content.charAt(afterWord) == ' ') {
+ afterWord++;
+ }
+
+ switch (word) {
+ case "par":
+ case "pard":
+ out.append('\n');
+ break;
+ case "tab":
+ out.append('\t');
+ break;
+ case "line":
+ out.append("<br>");
+ break;
+ case "htmlrtf":
+ // Skip \htmlrtf...\htmlrtf0 inside htmltag groups
+ i = skipHtmlRtfBlock(content, i);
+ continue;
+ default:
+ break;
+ }
+ i = afterWord;
+ continue;
+ }
+
+ i++;
+ continue;
+ }
+
+ if (c == '{' || c == '}') {
+ flushPendingBytes(pendingBytes, out, codePage);
+ i++;
+ continue;
+ }
+
+ flushPendingBytes(pendingBytes, out, codePage);
+ out.append(c);
+ i++;
+ }
+
+ flushPendingBytes(pendingBytes, out, codePage);
+ }
+
+ /**
+ * Skip a {@code \htmlrtf ... \htmlrtf0} block within an htmltag group.
+ *
+ * @param content the string being parsed
+ * @param pos position of the backslash starting {@code \htmlrtf}
+ * @return position after the matching {@code \htmlrtf0}, or {@code content.length()} if there is none
+ */
+ static int skipHtmlRtfBlock(String content, int pos) {
+ int afterWord = pos + "\\htmlrtf".length();
+ if (afterWord < content.length() && content.charAt(afterWord) == '0') {
+ // This is \htmlrtf0 (end marker) — just skip past it
+ afterWord++;
+ if (afterWord < content.length() && content.charAt(afterWord) == ' ') {
+ afterWord++;
+ }
+ return afterWord;
+ }
+
+ // Skip everything until \htmlrtf0
+ int endPos = content.indexOf("\\htmlrtf0", afterWord);
+ if (endPos < 0) {
+ return content.length();
+ }
+ int after = endPos + "\\htmlrtf0".length();
+ if (after < content.length() && content.charAt(after) == ' ') {
+ after++;
+ }
+ return after;
+ }
+
+ /**
+ * Detect the ANSI code page from the RTF header ({@code \ansicpgNNNN}).
+ * Falls back to windows-1252 if not found.
+ */
+ static Charset detectCodePage(String rtf) {
+ int idx = rtf.indexOf(ANSICPG_PREFIX);
+ if (idx < 0) {
+ return Charset.forName("windows-1252");
+ }
+ int numStart = idx + ANSICPG_PREFIX.length();
+ int numEnd = numStart;
+ while (numEnd < rtf.length() && Character.isDigit(rtf.charAt(numEnd)))
{
+ numEnd++;
+ }
+ if (numEnd == numStart) {
+ return Charset.forName("windows-1252");
+ }
+ String cpNum = rtf.substring(numStart, numEnd);
+ try {
+ return Charset.forName("windows-" + cpNum);
+ } catch (Exception e) {
+ try {
+ return Charset.forName("cp" + cpNum);
+ } catch (Exception e2) {
+ LOGGER.debug("Unknown code page {}, falling back to
windows-1252", cpNum);
+ return Charset.forName("windows-1252");
+ }
+ }
+ }
+
+ private static void flushPendingBytes(ByteArrayOutputStream pending,
StringBuilder out,
+ Charset codePage) {
+ if (pending.size() > 0) {
+ out.append(new String(pending.toByteArray(), codePage));
+ pending.reset();
+ }
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 20b010e7b7..eb92465dbe 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -407,4 +407,47 @@ public class OutlookParserTest extends TikaTest {
assertContains("annuaires\t \n" + " Synchronisation",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
}
+ @Test
+ public void testAttachFlagsExtracted() throws Exception {
+ // test-outlook2003.msg has 11 JPEG attachments with
PidTagAttachFlags=4
+ // (ATT_RENDERED_IN_BODY) but no Content-ID
+ List<Metadata> metadataList =
getRecursiveMetadata("test-outlook2003.msg");
+ // first entry is the message itself, rest are attachments
+ assertTrue(metadataList.size() > 1, "expected attachments");
+ for (int i = 1; i < metadataList.size(); i++) {
+ Metadata m = metadataList.get(i);
+ assertEquals("4", m.get(MAPI.ATTACH_FLAGS),
+ "attachment " + i + " should have flags=4");
+ }
+ }
+
+ @Test
+ public void testRegularAttachmentsNotMarkedInline() throws Exception {
+ // testMSG_att_doc.msg has regular document attachments with flags=0
+ // and no Content-ID — they must NOT be marked INLINE
+ List<Metadata> metadataList =
getRecursiveMetadata("testMSG_att_doc.msg");
+ assertTrue(metadataList.size() > 1, "expected attachments");
+ for (int i = 1; i < metadataList.size(); i++) {
+ Metadata m = metadataList.get(i);
+ String resourceType =
m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY);
+ assertFalse(
+
TikaCoreProperties.EmbeddedResourceType.INLINE.name().equals(resourceType),
+ "regular attachment " + i + " should not be INLINE");
+ }
+ }
+
+ @Test
+ public void testImageWithFlagsButNoCidNotInline() throws Exception {
+ // test-outlook2003.msg has image attachments with ATT_RENDERED_IN_BODY
+ // but NO Content-ID. Layer 2 requires CID, so these should NOT be
INLINE.
+ List<Metadata> metadataList =
getRecursiveMetadata("test-outlook2003.msg");
+ for (int i = 1; i < metadataList.size(); i++) {
+ Metadata m = metadataList.get(i);
+ String resourceType =
m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY);
+ assertFalse(
+
TikaCoreProperties.EmbeddedResourceType.INLINE.name().equals(resourceType),
+ "image attachment " + i + " without CID should not be
INLINE");
+ }
+ }
+
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
new file mode 100644
index 0000000000..0c1096f4f2
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.msg;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import org.junit.jupiter.api.Test;
+
+public class RTFEncapsulatedHTMLExtractorTest {
+
+ @Test
+ public void testNullAndEmpty() {
+ assertNull(RTFEncapsulatedHTMLExtractor.extract(null));
+ assertNull(RTFEncapsulatedHTMLExtractor.extract(new byte[0]));
+ }
+
+ @Test
+ public void testNonEncapsulatedRtf() {
+ String rtf = "{\\rtf1\\ansi\\deff0 Hello world}";
+
assertNull(RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)));
+ }
+
+ @Test
+ public void testSimpleEncapsulatedHtml() {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag19 <html>}\n" +
+ "{\\*\\htmltag34 <head>}\n" +
+ "{\\*\\htmltag41 </head>}\n" +
+ "{\\*\\htmltag50 <body>}\n" +
+ "\\htmlrtf {\\htmlrtf0\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "{\\*\\htmltag84 Hello world}\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "\\htmlrtf }\\htmlrtf0\n" +
+ "{\\*\\htmltag58 </body>}\n" +
+ "{\\*\\htmltag27 </html>}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("<html>"));
+ assertTrue(html.contains("<p>"));
+ assertTrue(html.contains("Hello world"));
+ assertTrue(html.contains("</html>"));
+ }
+
+ @Test
+ public void testImgCidExtraction() {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag19 <html>}\n" +
+ "{\\*\\htmltag50 <body>}\n" +
+ "{\\*\\htmltag84 <img
src=\"cid:[email protected]\">}\n" +
+ "{\\*\\htmltag58 </body>}\n" +
+ "{\\*\\htmltag27 </html>}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("cid:[email protected]"),
+ "CID reference should be preserved in extracted HTML");
+ }
+
+ @Test
+ public void testParAndTabDecoding() {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag241 <style>}\n" +
+ "{\\*\\htmltag241 body \\{\\par \\tab color: red;\\par \\}}\n"
+
+ "{\\*\\htmltag249 </style>}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("<style>"));
+ assertTrue(html.contains("body {"));
+ assertTrue(html.contains("\tcolor: red;"));
+ assertTrue(html.contains("</style>"));
+ }
+
+ @Test
+ public void testHexEscapeDecoding() {
+ // \'e9 = 0xE9 = 'é' in windows-1252
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag84 caf\\'e9}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("café", html);
+ }
+
+ @Test
+ public void testMultiByteHexEscape() {
+ // Consecutive single-byte escapes: \'fc = 'ü' and \'df = 'ß' in windows-1252.
+ // (In code page 65001 / UTF-8, 'ü' would instead be the two bytes 0xC3 0xBC.)
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag84 gr\\'fc\\'dfe}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("grüße", html);
+ }
+
+ @Test
+ public void testCodePage1254Turkish() {
+ // \'fe in windows-1254 = 'ş' (U+015F); in windows-1252 it would be 'þ' (U+00FE)
+ // \'fd in windows-1254 = 'ı' (U+0131, DOTLESS I); in windows-1252 it would be 'ý' (U+00FD)
+ String rtf = "{\\rtf1\\ansi\\ansicpg1254\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag84 Say\\'fdn}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("Sayın", html);
+ }
+
+ @Test
+ public void testHtmlrtfSkipping() {
+ // Content between \htmlrtf and \htmlrtf0 should be skipped
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag84 Hello}\n" +
+ "\\htmlrtf {\\b bold rtf only}\\htmlrtf0\n" +
+ "{\\*\\htmltag84 World}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("Hello World", html);
+ }
+
+ @Test
+ public void testEscapedBracesAndBackslash() {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag241 a \\{ b \\} c \\\\d}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("a { b } c \\d", html);
+ }
+
+ @Test
+ public void testEmptyHtmltag() {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag72}\n" +
+ "{\\*\\htmltag84 text}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("text", html);
+ }
+
+ @Test
+ public void testInterTagTextContent() {
+ // Realistic pattern: text content appears BETWEEN htmltag groups,
+ // with \htmlrtf blocks that should be skipped
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag19 <html>}\n" +
+ "{\\*\\htmltag50 <body>}\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "\\htmlrtf {\\htmlrtf0\n" +
+ "Hello from the message body\n" +
+ "\\htmlrtf\\par}\\htmlrtf0\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "\\htmlrtf {\\htmlrtf0\n" +
+ "Second paragraph\n" +
+ "\\htmlrtf\\par}\\htmlrtf0\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "{\\*\\htmltag58 </body>}\n" +
+ "{\\*\\htmltag27 </html>}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("<p>"), "should contain HTML tags");
+ assertTrue(html.contains("Hello from the message body"),
+ "should contain inter-tag text content");
+ assertTrue(html.contains("Second paragraph"),
+ "should contain second paragraph text");
+ assertTrue(html.contains("</html>"), "should contain closing tag");
+ }
+
+ @Test
+ public void testInterTagHexEscapes() {
+ // Text between htmltag groups can also have \'xx escapes
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "\\htmlrtf {\\htmlrtf0\n" +
+ "caf\\'e9\n" +
+ "\\htmlrtf }\\htmlrtf0\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("café"), "hex escapes in inter-tag text
should be decoded");
+ }
+
+ @Test
+ public void testLineControlWord() {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag84 line1\\line line2}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("line1<br>line2", html);
+ }
+}