(tika) branch main updated: TIKA-4696 improve inline tagging (#2711)

tallison Mon, 23 Mar 2026 13:13:20 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 6060d9a115 TIKA-4696 improve inline tagging (#2711)
6060d9a115 is described below

commit 6060d9a115205d175fc13c51e93bf03e0d3fe1cf
Author: Tim Allison <[email protected]>
AuthorDate: Mon Mar 23 16:13:05 2026 -0400

    TIKA-4696 improve inline tagging (#2711)
---
 .../main/java/org/apache/tika/metadata/MAPI.java   |  14 +
 .../tika/parser/microsoft/OutlookExtractor.java    | 154 ++++++-
 .../msg/RTFEncapsulatedHTMLExtractor.java          | 453 +++++++++++++++++++++
 .../tika/parser/microsoft/OutlookParserTest.java   |  43 ++
 .../msg/RTFEncapsulatedHTMLExtractorTest.java      | 217 ++++++++++
 5 files changed, 867 insertions(+), 14 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java 
b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
index 613c3d3d9d..c8f81a980d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
@@ -77,4 +77,18 @@ public interface MAPI {
     Property ATTACH_MIME = Property.internalText(PREFIX_MAPI_ATTACH_META + 
"mime");
     Property ATTACH_LANGUAGE = Property.internalText(PREFIX_MAPI_ATTACH_META + 
"language");
 
+    /**
+     * PidTagAttachFlags (0x3714) — indicates which body formats might 
reference this attachment.
+     * Bit 1 (0x1) = ATT_INVISIBLE_IN_HTML
+     * Bit 2 (0x2) = ATT_INVISIBLE_IN_RTF
+     * Bit 3 (0x4) = ATT_RENDERED_IN_BODY
+     */
+    Property ATTACH_FLAGS = Property.internalInteger(PREFIX_MAPI_ATTACH_META + 
"flags");
+
+    /**
+     * PidTagAttachmentHidden (0x7FFE) — indicates whether this attachment is 
hidden from the end
+     * user. Inline images typically have this set to true.
+     */
+    Property ATTACH_HIDDEN = Property.internalBoolean(PREFIX_MAPI_ATTACH_META 
+ "hidden");
+
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 552c52889c..0b8db23f45 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -20,8 +20,11 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 
 import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
@@ -56,7 +59,10 @@ import org.apache.poi.hsmf.datatypes.RecipientChunks;
 import org.apache.poi.hsmf.datatypes.StringChunk;
 import org.apache.poi.hsmf.datatypes.Types;
 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.util.CodePageUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -79,6 +85,7 @@ import org.apache.tika.parser.html.HtmlEncodingDetector;
 import org.apache.tika.parser.html.JSoupParser;
 import org.apache.tika.parser.mailcommons.MailDateParser;
 import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor;
+import org.apache.tika.parser.microsoft.msg.RTFEncapsulatedHTMLExtractor;
 import org.apache.tika.parser.microsoft.rtf.RTFParser;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
@@ -173,6 +180,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
     private static Pattern HEADER_KEY_PAT =
             Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");
 
+    private final DirectoryNode root;
     private final MAPIMessage msg;
     private final ParseContext parseContext;
     private final boolean extractAllAlternatives;
@@ -181,6 +189,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
 
     public OutlookExtractor(DirectoryNode root, Metadata metadata, 
ParseContext context) throws TikaException {
         super(context, metadata);
+        this.root = root;
         this.parseContext = context;
         this.extractAllAlternatives =
                 
context.get(OfficeParserConfig.class).isExtractAllAlternativesFromMSG();
@@ -317,18 +326,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
 
     private void updateAttachmentMetadata(AttachmentChunks attachment, 
Metadata metadata,
                                           Set<String> contentIdNames) {
-        StringChunk contentIdChunk = attachment.getAttachContentId();
-        if (contentIdChunk != null) {
-            String contentId = contentIdChunk.getValue();
-            if (! StringUtils.isBlank(contentId)) {
-                contentId = contentId.trim();
-                if (contentIdNames.contains(contentId)) {
-                    metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY,
-                            
TikaCoreProperties.EmbeddedResourceType.INLINE.name());
-                }
-                metadata.set(MAPI.ATTACH_CONTENT_ID, contentId);
-            }
-        }
+        // Extract string-based metadata from POI's named chunk getters
         addStringChunkToMetadata(MAPI.ATTACH_LONG_PATH_NAME, 
attachment.getAttachLongPathName(), metadata);
         addStringChunkToMetadata(MAPI.ATTACH_LONG_FILE_NAME, 
attachment.getAttachLongFileName(), metadata);
         addStringChunkToMetadata(MAPI.ATTACH_FILE_NAME, 
attachment.getAttachFileName(), metadata);
@@ -337,6 +335,129 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         addStringChunkToMetadata(MAPI.ATTACH_EXTENSION, 
attachment.getAttachExtension(), metadata);
         addStringChunkToMetadata(MAPI.ATTACH_MIME, 
attachment.getAttachMimeTag(), metadata);
         addStringChunkToMetadata(MAPI.ATTACH_LANGUAGE, 
attachment.getAttachLanguage(), metadata);
+
+        // Extract fixed properties from the attachment's 
__properties_version1.0 stream
+        // POI's AttachmentChunks doesn't parse this stream, so we read it 
directly.
+        Map<Integer, Long> attachProps = 
readAttachmentProperties(attachment.getPOIFSName());
+        Long attachFlags = attachProps.get(PID_TAG_ATTACH_FLAGS);
+        if (attachFlags != null) {
+            metadata.set(MAPI.ATTACH_FLAGS, attachFlags.intValue());
+        }
+        Long attachHidden = attachProps.get(PID_TAG_ATTACHMENT_HIDDEN);
+        if (attachHidden != null) {
+            metadata.set(MAPI.ATTACH_HIDDEN, attachHidden.intValue() != 0);
+        }
+
+        // Determine inline vs attachment
+        String contentId = null;
+        StringChunk contentIdChunk = attachment.getAttachContentId();
+        if (contentIdChunk != null) {
+            String rawCid = contentIdChunk.getValue();
+            if (!StringUtils.isBlank(rawCid)) {
+                contentId = rawCid.trim();
+                metadata.set(MAPI.ATTACH_CONTENT_ID, contentId);
+            }
+        }
+
+        if (contentId != null && contentIdNames.contains(contentId)) {
+            // Layer 1: CID referenced in the message body — high confidence 
inline
+            metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY,
+                    TikaCoreProperties.EmbeddedResourceType.INLINE.name());
+        } else if (contentId != null
+                && attachFlags != null
+                && (attachFlags & ATT_RENDERED_IN_BODY) != 0
+                && isInlineableMimeType(metadata.get(MAPI.ATTACH_MIME))) {
+            // Layer 2: MAPI says rendered in body + image MIME type — the CID 
regex
+            // missed it (e.g. encapsulated RTF with stripped img tags)
+            metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY,
+                    TikaCoreProperties.EmbeddedResourceType.INLINE.name());
+        }
+    }
+
+    private static final Set<String> INLINEABLE_MIME_TYPES = Set.of(
+            "application/x-ms-wmz",
+            "application/x-ms-emz",
+            "application/x-msmetafile",
+            "image/x-wmf",
+            "image/x-emf",
+            "image/wmf",
+            "image/emf"
+    );
+
+    /**
+     * Returns true for MIME types that are safe to label as INLINE.
+     * We gate on this to avoid marking PDFs, DOCX, etc. as inline — downstream
+     * consumers use INLINE to decide what to index separately.
+     */
+    private static boolean isInlineableMimeType(String mimeType) {
+        if (StringUtils.isBlank(mimeType)) {
+            return false;
+        }
+        String lower = mimeType.toLowerCase(Locale.ROOT).trim();
+        return lower.startsWith("image/") || 
INLINEABLE_MIME_TYPES.contains(lower);
+    }
+
+    // PidTagAttachFlags (0x3714) — bit flags: which body formats reference this attachment
+    private static final int PID_TAG_ATTACH_FLAGS = 0x3714;
+    // ATT_RENDERED_IN_BODY mask (0x4, i.e. zero-based bit 2): attachment is referenced by the body
+    private static final int ATT_RENDERED_IN_BODY = 0x4;
+    // PidTagAttachmentHidden (0x7FFE) — boolean, true if hidden from end user 
(inline images)
+    private static final int PID_TAG_ATTACHMENT_HIDDEN = 0x7FFE;
+
+    /**
+     * Read fixed MAPI properties from the __properties_version1.0 stream 
inside an
+     * attachment storage.  POI's {@link AttachmentChunks} does not parse this 
stream.
+     *
+     * <p>The stream format is: 8-byte header, followed by 16-byte property 
entries.
+     * Each entry: 2 bytes property type, 2 bytes property ID, 4 bytes flags,
+     * 8 bytes value (inline for fixed-size types).</p>
+     *
+     * @param poifsName the OLE2 directory name for this attachment
+     *                  (e.g. "__attach_version1.0_#00000000")
+     * @return map of property ID to value for fixed-size integer/boolean 
properties
+     */
+    private Map<Integer, Long> readAttachmentProperties(String poifsName) {
+        Map<Integer, Long> result = new HashMap<>();
+        try {
+            DirectoryEntry attachDir = (DirectoryEntry) 
root.getEntry(poifsName);
+            DocumentEntry propsEntry =
+                    (DocumentEntry) 
attachDir.getEntry("__properties_version1.0");
+            byte[] data;
+            try (InputStream dis = new DocumentInputStream(propsEntry)) {
+                data = dis.readAllBytes();
+            }
+            if (data.length < 8) {
+                return result;
+            }
+            ByteBuffer buf = 
ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
+            int offset = 8; // skip 8-byte header
+            while (offset + 16 <= data.length) {
+                int propType = buf.getShort(offset) & 0xFFFF;
+                int propId = buf.getShort(offset + 2) & 0xFFFF;
+                long value;
+                switch (propType) {
+                    case 0x0003: // PtypInteger32
+                        value = buf.getInt(offset + 8) & 0xFFFFFFFFL;
+                        result.put(propId, value);
+                        break;
+                    case 0x000B: // PtypBoolean
+                        value = buf.getShort(offset + 8) & 0xFFFF;
+                        result.put(propId, value);
+                        break;
+                    case 0x0014: // PtypInteger64
+                        value = buf.getLong(offset + 8);
+                        result.put(propId, value);
+                        break;
+                    default:
+                        // skip variable-length, binary, time and other types
+                        break;
+                }
+                offset += 16;
+            }
+        } catch (Exception e) {
+            LOGGER.debug("Could not read attachment properties for {}", 
poifsName, e);
+        }
+        return result;
     }
 
     private void addStringChunkToMetadata(Property property, StringChunk 
stringChunk, Metadata metadata) {
@@ -534,8 +655,13 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
     }
 
     private void extractContentIdNamesFromRtf(byte[] data, Metadata metadata, 
Set<String> contentIdNames) {
-        //for now, hope that there's encapsulated html
-        //TODO: check for encapsulated html. If it doesn't exist, handle RTF 
specifically
+        // Try to de-encapsulate the HTML from the RTF first
+        String html = RTFEncapsulatedHTMLExtractor.extract(data);
+        if (html != null) {
+            extractContentIdNamesFromHtml(html.getBytes(UTF_8), metadata, 
contentIdNames);
+            return;
+        }
+        // Fall back to scanning the raw RTF bytes for cid: references
         extractContentIdNamesFromHtml(data, metadata, contentIdNames);
     }
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
new file mode 100644
index 0000000000..3ef453a48d
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
@@ -0,0 +1,453 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.msg;
+
+import java.io.ByteArrayOutputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Extracts the original HTML from an RTF document that contains encapsulated 
HTML
+ * (as indicated by the {@code \fromhtml1} control word).
+ *
+ * <p>The encapsulated HTML format stores HTML in two places:</p>
+ * <ol>
+ *   <li>{@code {\*\htmltag<N> ...}} groups — contain the HTML markup (tags, 
style blocks, etc.)</li>
+ *   <li>Text between htmltag groups — contains the actual text content, 
provided it is NOT
+ *       wrapped in {@code \htmlrtf ... \htmlrtf0} (which marks RTF-only 
rendering hints)</li>
+ * </ol>
+ *
+ * <p>Within both htmltag groups and inter-tag text, the following RTF escapes 
are decoded:</p>
+ * <ul>
+ *   <li>{@code \par} and {@code \pard} → newline</li>
+ *   <li>{@code \tab} → tab character</li>
+ *   <li>{@code \line} → {@code <br>}</li>
+ *   <li>{@code \'xx} → single byte (decoded using the document's ANSI code 
page)</li>
+ *   <li>{@code \\}, {@code \{}, {@code \}} → literal characters</li>
+ * </ul>
+ */
+public class RTFEncapsulatedHTMLExtractor {
+
+    private static final Logger LOGGER = 
LoggerFactory.getLogger(RTFEncapsulatedHTMLExtractor.class);
+
+    private static final String HTMLTAG_PREFIX = "{\\*\\htmltag";
+    private static final String FROM_HTML_MARKER = "\\fromhtml";
+    private static final String ANSICPG_PREFIX = "\\ansicpg";
+
+    /**
+     * Extracts the HTML content from an encapsulated-HTML RTF document.
+     *
+     * @param rtfBytes the decompressed RTF bytes
+     * @return the extracted HTML string, or {@code null} if the RTF does not 
contain
+     *         encapsulated HTML
+     */
+    public static String extract(byte[] rtfBytes) {
+        if (rtfBytes == null || rtfBytes.length == 0) {
+            return null;
+        }
+        // Work with US-ASCII — RTF is 7-bit and non-ASCII bytes are escaped 
as \'xx
+        String rtf = new String(rtfBytes, StandardCharsets.US_ASCII);
+
+        if (!rtf.contains(FROM_HTML_MARKER)) {
+            return null;
+        }
+
+        Charset codePage = detectCodePage(rtf);
+
+        // Find the start of the document body (after the RTF header).
+        // We skip past the initial {\rtf1... header by finding the first
+        // htmltag group — everything before that is RTF preamble. If there is
+        // no htmltag group at all, there is nothing to de-encapsulate.
+        int bodyStart = rtf.indexOf(HTMLTAG_PREFIX);
+        if (bodyStart < 0) {
+            return null;
+        }
+
+        StringBuilder html = new StringBuilder(rtf.length() / 2);
+        ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();
+        int pos = bodyStart;
+        int len = rtf.length();
+        boolean inHtmlRtfSkip = false;
+
+        while (pos < len) {
+            // Check if we're at an htmltag group
+            if (rtf.startsWith(HTMLTAG_PREFIX, pos)) {
+                flushPendingBytes(pendingBytes, html, codePage);
+
+                // Find matching close brace
+                int groupEnd = findMatchingBrace(rtf, pos);
+                if (groupEnd < 0) {
+                    break;
+                }
+
+                // Skip {\*\htmltag prefix and digit(s)
+                int contentStart = pos + HTMLTAG_PREFIX.length();
+                while (contentStart < groupEnd && 
Character.isDigit(rtf.charAt(contentStart))) {
+                    contentStart++;
+                }
+                // Skip optional space after tag number
+                if (contentStart < groupEnd && rtf.charAt(contentStart) == ' 
') {
+                    contentStart++;
+                }
+
+                // Decode the htmltag content
+                String inner = rtf.substring(contentStart, groupEnd);
+                decodeRtfEscapes(inner, html, codePage);
+
+                pos = groupEnd + 1;
+                continue;
+            }
+
+            // Check for \htmlrtf control word (start or end of RTF-only block)
+            if (rtf.startsWith("\\htmlrtf", pos)) {
+                flushPendingBytes(pendingBytes, html, codePage);
+                int afterWord = pos + "\\htmlrtf".length();
+
+                if (afterWord < len && rtf.charAt(afterWord) == '0') {
+                    // \htmlrtf0 — end of skip block
+                    inHtmlRtfSkip = false;
+                    afterWord++;
+                    if (afterWord < len && rtf.charAt(afterWord) == ' ') {
+                        afterWord++;
+                    }
+                } else {
+                    // \htmlrtf — start of skip block
+                    inHtmlRtfSkip = true;
+                    if (afterWord < len && rtf.charAt(afterWord) == ' ') {
+                        afterWord++;
+                    }
+                }
+                pos = afterWord;
+                continue;
+            }
+
+            // If we're inside an \htmlrtf skip block, just advance past this 
character.
+            // We don't skip nested groups wholesale because \htmlrtf0 may 
appear inside them.
+            if (inHtmlRtfSkip) {
+                pos++;
+                continue;
+            }
+
+            // Check for other { groups (nested RTF groups that aren't htmltag)
+            if (rtf.charAt(pos) == '{') {
+                flushPendingBytes(pendingBytes, html, codePage);
+                int end = findMatchingBrace(rtf, pos);
+                if (end > 0) {
+                    pos = end + 1;
+                } else {
+                    pos++;
+                }
+                continue;
+            }
+
+            // Skip closing braces
+            if (rtf.charAt(pos) == '}') {
+                flushPendingBytes(pendingBytes, html, codePage);
+                pos++;
+                continue;
+            }
+
+            // Handle RTF escapes in inter-tag text
+            if (rtf.charAt(pos) == '\\' && pos + 1 < len) {
+                char next = rtf.charAt(pos + 1);
+
+                // \'xx hex escape
+                if (next == '\'' && pos + 3 < len) {
+                    int hi = Character.digit(rtf.charAt(pos + 2), 16);
+                    int lo = Character.digit(rtf.charAt(pos + 3), 16);
+                    if (hi >= 0 && lo >= 0) {
+                        pendingBytes.write((hi << 4) | lo);
+                    }
+                    pos += 4;
+                    continue;
+                }
+
+                flushPendingBytes(pendingBytes, html, codePage);
+
+                // Escaped literals
+                if (next == '\\' || next == '{' || next == '}') {
+                    html.append(next);
+                    pos += 2;
+                    continue;
+                }
+
+                // Control word
+                if (Character.isLetter(next)) {
+                    int wordStart = pos + 1;
+                    int wordEnd = wordStart;
+                    while (wordEnd < len && 
Character.isLetter(rtf.charAt(wordEnd))) {
+                        wordEnd++;
+                    }
+                    String word = rtf.substring(wordStart, wordEnd);
+
+                    // Skip optional numeric parameter
+                    int paramEnd = wordEnd;
+                    if (paramEnd < len && (rtf.charAt(paramEnd) == '-'
+                            || Character.isDigit(rtf.charAt(paramEnd)))) {
+                        paramEnd++;
+                        while (paramEnd < len && 
Character.isDigit(rtf.charAt(paramEnd))) {
+                            paramEnd++;
+                        }
+                    }
+                    // Skip optional space delimiter
+                    int afterWord = paramEnd;
+                    if (afterWord < len && rtf.charAt(afterWord) == ' ') {
+                        afterWord++;
+                    }
+
+                    switch (word) {
+                        case "par":
+                        case "pard":
+                            html.append('\n');
+                            break;
+                        case "tab":
+                            html.append('\t');
+                            break;
+                        case "line":
+                            html.append("<br>");
+                            break;
+                        default:
+                            // Skip unknown control words
+                            break;
+                    }
+                    pos = afterWord;
+                    continue;
+                }
+
+                // Unknown escape — skip backslash
+                pos++;
+                continue;
+            }
+
+            // Newlines/carriage returns in RTF are whitespace, not content
+            if (rtf.charAt(pos) == '\r' || rtf.charAt(pos) == '\n') {
+                pos++;
+                continue;
+            }
+
+            // Regular text character between htmltag groups — this is HTML 
content
+            flushPendingBytes(pendingBytes, html, codePage);
+            html.append(rtf.charAt(pos));
+            pos++;
+        }
+
+        flushPendingBytes(pendingBytes, html, codePage);
+
+        if (html.length() == 0) {
+            return null;
+        }
+        return html.toString();
+    }
+
+    /**
+     * Find the position of the closing brace that matches the opening brace at
+     * {@code openPos}.  Handles nested groups and escaped braces.
+     *
+     * @return index of the closing '}', or -1 if not found
+     */
+    static int findMatchingBrace(String rtf, int openPos) {
+        int depth = 0;
+        int len = rtf.length();
+        for (int i = openPos; i < len; i++) {
+            char c = rtf.charAt(i);
+            if (c == '\\' && i + 1 < len) {
+                char next = rtf.charAt(i + 1);
+                if (next == '{' || next == '}' || next == '\\') {
+                    i++;
+                    continue;
+                }
+            }
+            if (c == '{') {
+                depth++;
+            } else if (c == '}') {
+                depth--;
+                if (depth == 0) {
+                    return i;
+                }
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * Decode RTF escapes within an htmltag group's content.
+     */
+    static void decodeRtfEscapes(String content, StringBuilder out, Charset 
codePage) {
+        int len = content.length();
+        int i = 0;
+        ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();
+
+        while (i < len) {
+            char c = content.charAt(i);
+
+            if (c == '\\') {
+                if (i + 1 >= len) {
+                    break;
+                }
+                char next = content.charAt(i + 1);
+
+                // \'xx hex escape
+                if (next == '\'' && i + 3 < len) {
+                    int hi = Character.digit(content.charAt(i + 2), 16);
+                    int lo = Character.digit(content.charAt(i + 3), 16);
+                    if (hi >= 0 && lo >= 0) {
+                        pendingBytes.write((hi << 4) | lo);
+                    }
+                    i += 4;
+                    continue;
+                }
+
+                flushPendingBytes(pendingBytes, out, codePage);
+
+                if (next == '\\' || next == '{' || next == '}') {
+                    out.append(next);
+                    i += 2;
+                    continue;
+                }
+
+                // Control words
+                if (Character.isLetter(next)) {
+                    int wordStart = i + 1;
+                    int wordEnd = wordStart;
+                    while (wordEnd < len && 
Character.isLetter(content.charAt(wordEnd))) {
+                        wordEnd++;
+                    }
+                    String word = content.substring(wordStart, wordEnd);
+
+                    int paramEnd = wordEnd;
+                    if (paramEnd < len && (content.charAt(paramEnd) == '-'
+                            || Character.isDigit(content.charAt(paramEnd)))) {
+                        paramEnd++;
+                        while (paramEnd < len && 
Character.isDigit(content.charAt(paramEnd))) {
+                            paramEnd++;
+                        }
+                    }
+                    int afterWord = paramEnd;
+                    if (afterWord < len && content.charAt(afterWord) == ' ') {
+                        afterWord++;
+                    }
+
+                    switch (word) {
+                        case "par":
+                        case "pard":
+                            out.append('\n');
+                            break;
+                        case "tab":
+                            out.append('\t');
+                            break;
+                        case "line":
+                            out.append("<br>");
+                            break;
+                        case "htmlrtf":
+                            // Skip \htmlrtf...\htmlrtf0 inside htmltag groups
+                            i = skipHtmlRtfBlock(content, i);
+                            continue;
+                        default:
+                            break;
+                    }
+                    i = afterWord;
+                    continue;
+                }
+
+                i++;
+                continue;
+            }
+
+            if (c == '{' || c == '}') {
+                flushPendingBytes(pendingBytes, out, codePage);
+                i++;
+                continue;
+            }
+
+            flushPendingBytes(pendingBytes, out, codePage);
+            out.append(c);
+            i++;
+        }
+
+        flushPendingBytes(pendingBytes, out, codePage);
+    }
+
+    /**
+     * Skip a {@code \htmlrtf ... \htmlrtf0} block within an htmltag group.
+     *
+     * @param content the string being parsed
+     * @param pos     position of the backslash starting {@code \htmlrtf}
+     * @return position just past the closing {@code \htmlrtf0} (and one optional trailing
+     *         space), or {@code content.length()} if no closing marker exists
+     */
+    static int skipHtmlRtfBlock(String content, int pos) {
+        int afterWord = pos + "\\htmlrtf".length();
+        if (afterWord < content.length() && content.charAt(afterWord) == '0') {
+            // This is \htmlrtf0 (end marker) — just skip past it
+            afterWord++;
+            if (afterWord < content.length() && content.charAt(afterWord) == ' 
') {
+                afterWord++;
+            }
+            return afterWord;
+        }
+
+        // Skip everything until \htmlrtf0
+        int endPos = content.indexOf("\\htmlrtf0", afterWord);
+        if (endPos < 0) {
+            return content.length();
+        }
+        int after = endPos + "\\htmlrtf0".length();
+        if (after < content.length() && content.charAt(after) == ' ') {
+            after++;
+        }
+        return after;
+    }
+
+    /**
+     * Detect the ANSI code page from the RTF header ({@code \ansicpgNNNN}).
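+     * Falls back to windows-1252 if the control word is absent, has no digits, or names a
+     * code page for which neither {@code windows-N} nor {@code cpN} is a known charset.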
+     */
+    static Charset detectCodePage(String rtf) {
+        int idx = rtf.indexOf(ANSICPG_PREFIX);
+        if (idx < 0) {
+            return Charset.forName("windows-1252");
+        }
+        int numStart = idx + ANSICPG_PREFIX.length();
+        int numEnd = numStart;
+        while (numEnd < rtf.length() && Character.isDigit(rtf.charAt(numEnd))) 
{
+            numEnd++;
+        }
+        if (numEnd == numStart) {
+            return Charset.forName("windows-1252");
+        }
+        String cpNum = rtf.substring(numStart, numEnd);
+        try {
+            return Charset.forName("windows-" + cpNum);
+        } catch (Exception e) {
+            try {
+                return Charset.forName("cp" + cpNum);
+            } catch (Exception e2) {
+                LOGGER.debug("Unknown code page {}, falling back to 
windows-1252", cpNum);
+                return Charset.forName("windows-1252");
+            }
+        }
+    }
+
+    private static void flushPendingBytes(ByteArrayOutputStream pending, 
StringBuilder out,
+                                          Charset codePage) {
+        if (pending.size() > 0) {
+            out.append(new String(pending.toByteArray(), codePage));
+            pending.reset();
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 20b010e7b7..eb92465dbe 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -407,4 +407,47 @@ public class OutlookParserTest extends TikaTest {
         assertContains("annuaires\t \n" + " Synchronisation", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
     }
 
+    @Test
+    public void testAttachFlagsExtracted() throws Exception {
+        // Fixture: 11 JPEG attachments, each with PidTagAttachFlags=4
+        // (ATT_RENDERED_IN_BODY) and none with a Content-ID.
+        List<Metadata> all = getRecursiveMetadata("test-outlook2003.msg");
+        int total = all.size();
+        // Index 0 is the message itself; attachments start at index 1.
+        assertTrue(total > 1, "expected attachments");
+        for (int idx = 1; idx < total; idx++) {
+            assertEquals("4", all.get(idx).get(MAPI.ATTACH_FLAGS),
+                    "attachment " + idx + " should have flags=4");
+        }
+    }
+
+    @Test
+    public void testRegularAttachmentsNotMarkedInline() throws Exception {
+        // Fixture: ordinary document attachments (flags=0, no Content-ID);
+        // none of them may be tagged INLINE.
+        List<Metadata> all = getRecursiveMetadata("testMSG_att_doc.msg");
+        assertTrue(all.size() > 1, "expected attachments");
+        String inline =
+                TikaCoreProperties.EmbeddedResourceType.INLINE.name();
+        // Index 0 is the container message, so start at 1.
+        for (int idx = 1; idx < all.size(); idx++) {
+            String type = all.get(idx)
+                    .get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY);
+            assertFalse(inline.equals(type),
+                    "regular attachment " + idx + " should not be INLINE");
+        }
+    }
+
+    @Test
+    public void testImageWithFlagsButNoCidNotInline() throws Exception {
+        // test-outlook2003.msg has image attachments with
+        // ATT_RENDERED_IN_BODY but NO Content-ID. Layer 2 requires CID,
+        // so these should NOT be INLINE.
+        List<Metadata> metadataList =
+                getRecursiveMetadata("test-outlook2003.msg");
+        // Guard against a vacuous pass: if no attachments are extracted
+        // the loop below never runs and nothing would be checked.
+        assertTrue(metadataList.size() > 1, "expected attachments");
+        // Index 0 is the message itself; attachments start at index 1.
+        for (int i = 1; i < metadataList.size(); i++) {
+            Metadata m = metadataList.get(i);
+            String resourceType =
+                    m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY);
+            assertFalse(
+                    TikaCoreProperties.EmbeddedResourceType.INLINE.name()
+                            .equals(resourceType),
+                    "image attachment " + i
+                            + " without CID should not be INLINE");
+        }
+    }
+
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
new file mode 100644
index 0000000000..0c1096f4f2
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.msg;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import org.junit.jupiter.api.Test;
+
+public class RTFEncapsulatedHTMLExtractorTest {
+
+    @Test
+    public void testNullAndEmpty() {
+        assertNull(RTFEncapsulatedHTMLExtractor.extract(null));
+        assertNull(RTFEncapsulatedHTMLExtractor.extract(new byte[0]));
+    }
+
+    @Test
+    public void testNonEncapsulatedRtf() {
+        String rtf = "{\\rtf1\\ansi\\deff0 Hello world}";
+        
assertNull(RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)));
+    }
+
+    @Test
+    public void testSimpleEncapsulatedHtml() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag34 <head>}\n" +
+                "{\\*\\htmltag41 </head>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "{\\*\\htmltag84 Hello world}\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "\\htmlrtf }\\htmlrtf0\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<html>"));
+        assertTrue(html.contains("<p>"));
+        assertTrue(html.contains("Hello world"));
+        assertTrue(html.contains("</html>"));
+    }
+
+    @Test
+    public void testImgCidExtraction() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "{\\*\\htmltag84 <img 
src=\"cid:[email protected]\">}\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("cid:[email protected]"),
+                "CID reference should be preserved in extracted HTML");
+    }
+
+    @Test
+    public void testParAndTabDecoding() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag241 <style>}\n" +
+                "{\\*\\htmltag241 body \\{\\par \\tab color: red;\\par \\}}\n" 
+
+                "{\\*\\htmltag249 </style>}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<style>"));
+        assertTrue(html.contains("body {"));
+        assertTrue(html.contains("\tcolor: red;"));
+        assertTrue(html.contains("</style>"));
+    }
+
+    @Test
+    public void testHexEscapeDecoding() {
+        // \'e9 = 0xE9 = 'é' in windows-1252
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 caf\\'e9}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("café", html);
+    }
+
+    @Test
+    public void testMultiByteHexEscape() {
+        // Two consecutive single-byte escapes in windows-1252:
+        // \'fc = 'ü' and \'df = 'ß' (UTF-8 input would instead use \'c3\'bc)
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 gr\\'fc\\'dfe}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("grüße", html);
+    }
+
+    @Test
+    public void testCodePage1254Turkish() {
+        // \'fd in windows-1254 = 'ı' (U+0131, LATIN SMALL LETTER DOTLESS I);
+        // in windows-1252 the same byte would decode to 'ý' (U+00FD)
+        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 Say\\'fdn}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("Sayın", html);
+    }
+
+    @Test
+    public void testHtmlrtfSkipping() {
+        // Content between \htmlrtf and \htmlrtf0 should be skipped
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 Hello}\n" +
+                "\\htmlrtf {\\b bold rtf only}\\htmlrtf0\n" +
+                "{\\*\\htmltag84  World}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("Hello World", html);
+    }
+
+    @Test
+    public void testEscapedBracesAndBackslash() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag241 a \\{ b \\} c \\\\d}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("a { b } c \\d", html);
+    }
+
+    @Test
+    public void testEmptyHtmltag() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag72}\n" +
+                "{\\*\\htmltag84 text}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("text", html);
+    }
+
+    @Test
+    public void testInterTagTextContent() {
+        // Realistic pattern: text content appears BETWEEN htmltag groups,
+        // with \htmlrtf blocks that should be skipped
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "Hello from the message body\n" +
+                "\\htmlrtf\\par}\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "Second paragraph\n" +
+                "\\htmlrtf\\par}\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<p>"), "should contain HTML tags");
+        assertTrue(html.contains("Hello from the message body"),
+                "should contain inter-tag text content");
+        assertTrue(html.contains("Second paragraph"),
+                "should contain second paragraph text");
+        assertTrue(html.contains("</html>"), "should contain closing tag");
+    }
+
+    @Test
+    public void testInterTagHexEscapes() {
+        // Text between htmltag groups can also have \'xx escapes
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "caf\\'e9\n" +
+                "\\htmlrtf }\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("café"), "hex escapes in inter-tag text 
should be decoded");
+    }
+
+    @Test
+    public void testLineControlWord() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 line1\\line line2}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("line1<br>line2", html);
+    }
+}

(tika) branch main updated: TIKA-4696 improve inline tagging (#2711)

Reply via email to