(tika) 02/02: improve tagging of inline images

tallison Mon, 23 Mar 2026 06:27:24 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4696-improve-inline-tagging
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 509502fc349d3818cdcf2f7203ea684377f8130e
Author: tallison <[email protected]>
AuthorDate: Mon Mar 23 09:27:00 2026 -0400

    improve tagging of inline images
---
 .../tika/parser/microsoft/OutlookExtractor.java    |  22 +-
 .../msg/RTFEncapsulatedHTMLExtractor.java          | 453 +++++++++++++++++++++
 .../msg/RTFEncapsulatedHTMLExtractorTest.java      | 217 ++++++++++
 3 files changed, 689 insertions(+), 3 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index eb8fca4f47..0b8db23f45 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -85,6 +85,7 @@ import org.apache.tika.parser.html.HtmlEncodingDetector;
 import org.apache.tika.parser.html.JSoupParser;
 import org.apache.tika.parser.mailcommons.MailDateParser;
 import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor;
+import org.apache.tika.parser.microsoft.msg.RTFEncapsulatedHTMLExtractor;
 import org.apache.tika.parser.microsoft.rtf.RTFParser;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
@@ -373,6 +374,16 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         }
     }
 
+    private static final Set<String> INLINEABLE_MIME_TYPES = Set.of(
+            "application/x-ms-wmz",
+            "application/x-ms-emz",
+            "application/x-msmetafile",
+            "image/x-wmf",
+            "image/x-emf",
+            "image/wmf",
+            "image/emf"
+    );
+
     /**
      * Returns true for MIME types that are safe to label as INLINE.
      * We gate on this to avoid marking PDFs, DOCX, etc. as inline — downstream
@@ -383,7 +394,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
             return false;
         }
         String lower = mimeType.toLowerCase(Locale.ROOT).trim();
-        return lower.startsWith("image/");
+        return lower.startsWith("image/") || 
INLINEABLE_MIME_TYPES.contains(lower);
     }
 
     // PidTagAttachFlags (0x3714) — bit flags indicating which body formats 
reference this
@@ -644,8 +655,13 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
     }
 
     private void extractContentIdNamesFromRtf(byte[] data, Metadata metadata, 
Set<String> contentIdNames) {
-        //for now, hope that there's encapsulated html
-        //TODO: check for encapsulated html. If it doesn't exist, handle RTF 
specifically
+        // Try to de-encapsulate the HTML from the RTF first
+        String html = RTFEncapsulatedHTMLExtractor.extract(data);
+        if (html != null) {
+            extractContentIdNamesFromHtml(html.getBytes(UTF_8), metadata, 
contentIdNames);
+            return;
+        }
+        // Fall back to scanning the raw RTF bytes for cid: references
         extractContentIdNamesFromHtml(data, metadata, contentIdNames);
     }
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
new file mode 100644
index 0000000000..3ef453a48d
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
@@ -0,0 +1,453 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.msg;
+
+import java.io.ByteArrayOutputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Extracts the original HTML from an RTF document that contains encapsulated 
HTML
+ * (as indicated by the {@code \fromhtml1} control word).
+ *
+ * <p>The encapsulated HTML format stores HTML in two places:</p>
+ * <ol>
+ *   <li>{@code {\*\htmltag<N> ...}} groups — contain the HTML markup (tags, 
style blocks, etc.)</li>
+ *   <li>Text between htmltag groups — contains the actual text content, 
provided it is NOT
+ *       wrapped in {@code \htmlrtf ... \htmlrtf0} (which marks RTF-only 
rendering hints)</li>
+ * </ol>
+ *
+ * <p>Within both htmltag groups and inter-tag text, the following RTF escapes 
are decoded:</p>
+ * <ul>
+ *   <li>{@code \par} → newline</li>
+ *   <li>{@code \tab} → tab character</li>
+ *   <li>{@code \line} → {@code <br>}</li>
+ *   <li>{@code \'xx} → single byte (decoded using the document's ANSI code 
page)</li>
+ *   <li>{@code \\}, {@code \{}, {@code \}} → literal characters</li>
+ * </ul>
+ */
+public class RTFEncapsulatedHTMLExtractor {
+
+    private static final Logger LOGGER = 
LoggerFactory.getLogger(RTFEncapsulatedHTMLExtractor.class);
+
+    private static final String HTMLTAG_PREFIX = "{\\*\\htmltag";
+    private static final String FROM_HTML_MARKER = "\\fromhtml";
+    private static final String ANSICPG_PREFIX = "\\ansicpg";
+
+    /**
+     * Extracts the HTML content from an encapsulated-HTML RTF document.
+     *
+     * @param rtfBytes the decompressed RTF bytes
+     * @return the extracted HTML string, or {@code null} if the RTF does not 
contain
+     *         encapsulated HTML
+     */
+    public static String extract(byte[] rtfBytes) {
+        if (rtfBytes == null || rtfBytes.length == 0) {
+            return null;
+        }
+        // Work with US-ASCII — RTF is 7-bit and non-ASCII bytes are escaped 
as \'xx
+        String rtf = new String(rtfBytes, StandardCharsets.US_ASCII);
+
+        if (!rtf.contains(FROM_HTML_MARKER)) {
+            return null;
+        }
+
+        Charset codePage = detectCodePage(rtf);
+
+        // Find the start of the document body (after the RTF header).
+        // We skip past the initial {\rtf1... header by finding the first
+        // htmltag group — everything before that is treated as RTF preamble.
+        // If no htmltag group exists, there is nothing to extract and we return null.
+        int bodyStart = rtf.indexOf(HTMLTAG_PREFIX);
+        if (bodyStart < 0) {
+            return null;
+        }
+
+        StringBuilder html = new StringBuilder(rtf.length() / 2);
+        ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();
+        int pos = bodyStart;
+        int len = rtf.length();
+        boolean inHtmlRtfSkip = false;
+
+        while (pos < len) {
+            // Check if we're at an htmltag group
+            if (rtf.startsWith(HTMLTAG_PREFIX, pos)) {
+                flushPendingBytes(pendingBytes, html, codePage);
+
+                // Find matching close brace
+                int groupEnd = findMatchingBrace(rtf, pos);
+                if (groupEnd < 0) {
+                    break;
+                }
+
+                // Skip {\*\htmltag prefix and digit(s)
+                int contentStart = pos + HTMLTAG_PREFIX.length();
+                while (contentStart < groupEnd && 
Character.isDigit(rtf.charAt(contentStart))) {
+                    contentStart++;
+                }
+                // Skip optional space after tag number
+                if (contentStart < groupEnd && rtf.charAt(contentStart) == ' 
') {
+                    contentStart++;
+                }
+
+                // Decode the htmltag content
+                String inner = rtf.substring(contentStart, groupEnd);
+                decodeRtfEscapes(inner, html, codePage);
+
+                pos = groupEnd + 1;
+                continue;
+            }
+
+            // Check for \htmlrtf control word (start or end of RTF-only block)
+            if (rtf.startsWith("\\htmlrtf", pos)) {
+                flushPendingBytes(pendingBytes, html, codePage);
+                int afterWord = pos + "\\htmlrtf".length();
+
+                if (afterWord < len && rtf.charAt(afterWord) == '0') {
+                    // \htmlrtf0 — end of skip block
+                    inHtmlRtfSkip = false;
+                    afterWord++;
+                    if (afterWord < len && rtf.charAt(afterWord) == ' ') {
+                        afterWord++;
+                    }
+                } else {
+                    // \htmlrtf — start of skip block
+                    inHtmlRtfSkip = true;
+                    if (afterWord < len && rtf.charAt(afterWord) == ' ') {
+                        afterWord++;
+                    }
+                }
+                pos = afterWord;
+                continue;
+            }
+
+            // If we're inside an \htmlrtf skip block, just advance past this 
character.
+            // We don't skip nested groups wholesale because \htmlrtf0 may 
appear inside them.
+            if (inHtmlRtfSkip) {
+                pos++;
+                continue;
+            }
+
+            // Check for other { groups (nested RTF groups that aren't htmltag)
+            if (rtf.charAt(pos) == '{') {
+                flushPendingBytes(pendingBytes, html, codePage);
+                int end = findMatchingBrace(rtf, pos);
+                if (end > 0) {
+                    pos = end + 1;
+                } else {
+                    pos++;
+                }
+                continue;
+            }
+
+            // Skip closing braces
+            if (rtf.charAt(pos) == '}') {
+                flushPendingBytes(pendingBytes, html, codePage);
+                pos++;
+                continue;
+            }
+
+            // Handle RTF escapes in inter-tag text
+            if (rtf.charAt(pos) == '\\' && pos + 1 < len) {
+                char next = rtf.charAt(pos + 1);
+
+                // \'xx hex escape
+                if (next == '\'' && pos + 3 < len) {
+                    int hi = Character.digit(rtf.charAt(pos + 2), 16);
+                    int lo = Character.digit(rtf.charAt(pos + 3), 16);
+                    if (hi >= 0 && lo >= 0) {
+                        pendingBytes.write((hi << 4) | lo);
+                    }
+                    pos += 4;
+                    continue;
+                }
+
+                flushPendingBytes(pendingBytes, html, codePage);
+
+                // Escaped literals
+                if (next == '\\' || next == '{' || next == '}') {
+                    html.append(next);
+                    pos += 2;
+                    continue;
+                }
+
+                // Control word
+                if (Character.isLetter(next)) {
+                    int wordStart = pos + 1;
+                    int wordEnd = wordStart;
+                    while (wordEnd < len && 
Character.isLetter(rtf.charAt(wordEnd))) {
+                        wordEnd++;
+                    }
+                    String word = rtf.substring(wordStart, wordEnd);
+
+                    // Skip optional numeric parameter
+                    int paramEnd = wordEnd;
+                    if (paramEnd < len && (rtf.charAt(paramEnd) == '-'
+                            || Character.isDigit(rtf.charAt(paramEnd)))) {
+                        paramEnd++;
+                        while (paramEnd < len && 
Character.isDigit(rtf.charAt(paramEnd))) {
+                            paramEnd++;
+                        }
+                    }
+                    // Skip optional space delimiter
+                    int afterWord = paramEnd;
+                    if (afterWord < len && rtf.charAt(afterWord) == ' ') {
+                        afterWord++;
+                    }
+
+                    switch (word) {
+                        case "par":
+                        case "pard":
+                            html.append('\n');
+                            break;
+                        case "tab":
+                            html.append('\t');
+                            break;
+                        case "line":
+                            html.append("<br>");
+                            break;
+                        default:
+                            // Skip unknown control words
+                            break;
+                    }
+                    pos = afterWord;
+                    continue;
+                }
+
+                // Unknown escape — skip backslash
+                pos++;
+                continue;
+            }
+
+            // Newlines/carriage returns in RTF are whitespace, not content
+            if (rtf.charAt(pos) == '\r' || rtf.charAt(pos) == '\n') {
+                pos++;
+                continue;
+            }
+
+            // Regular text character between htmltag groups — this is HTML 
content
+            flushPendingBytes(pendingBytes, html, codePage);
+            html.append(rtf.charAt(pos));
+            pos++;
+        }
+
+        flushPendingBytes(pendingBytes, html, codePage);
+
+        if (html.length() == 0) {
+            return null;
+        }
+        return html.toString();
+    }
+
+    /**
+     * Find the position of the closing brace that matches the opening brace at
+     * {@code openPos}.  Handles nested groups and escaped braces.
+     *
+     * @return index of the closing '}', or -1 if not found
+     */
+    static int findMatchingBrace(String rtf, int openPos) {
+        int depth = 0;
+        int len = rtf.length();
+        for (int i = openPos; i < len; i++) {
+            char c = rtf.charAt(i);
+            if (c == '\\' && i + 1 < len) {
+                char next = rtf.charAt(i + 1);
+                if (next == '{' || next == '}' || next == '\\') {
+                    i++;
+                    continue;
+                }
+            }
+            if (c == '{') {
+                depth++;
+            } else if (c == '}') {
+                depth--;
+                if (depth == 0) {
+                    return i;
+                }
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * Decode RTF escapes within an htmltag group's content.
+     */
+    static void decodeRtfEscapes(String content, StringBuilder out, Charset 
codePage) {
+        int len = content.length();
+        int i = 0;
+        ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();
+
+        while (i < len) {
+            char c = content.charAt(i);
+
+            if (c == '\\') {
+                if (i + 1 >= len) {
+                    break;
+                }
+                char next = content.charAt(i + 1);
+
+                // \'xx hex escape
+                if (next == '\'' && i + 3 < len) {
+                    int hi = Character.digit(content.charAt(i + 2), 16);
+                    int lo = Character.digit(content.charAt(i + 3), 16);
+                    if (hi >= 0 && lo >= 0) {
+                        pendingBytes.write((hi << 4) | lo);
+                    }
+                    i += 4;
+                    continue;
+                }
+
+                flushPendingBytes(pendingBytes, out, codePage);
+
+                if (next == '\\' || next == '{' || next == '}') {
+                    out.append(next);
+                    i += 2;
+                    continue;
+                }
+
+                // Control words
+                if (Character.isLetter(next)) {
+                    int wordStart = i + 1;
+                    int wordEnd = wordStart;
+                    while (wordEnd < len && 
Character.isLetter(content.charAt(wordEnd))) {
+                        wordEnd++;
+                    }
+                    String word = content.substring(wordStart, wordEnd);
+
+                    int paramEnd = wordEnd;
+                    if (paramEnd < len && (content.charAt(paramEnd) == '-'
+                            || Character.isDigit(content.charAt(paramEnd)))) {
+                        paramEnd++;
+                        while (paramEnd < len && 
Character.isDigit(content.charAt(paramEnd))) {
+                            paramEnd++;
+                        }
+                    }
+                    int afterWord = paramEnd;
+                    if (afterWord < len && content.charAt(afterWord) == ' ') {
+                        afterWord++;
+                    }
+
+                    switch (word) {
+                        case "par":
+                        case "pard":
+                            out.append('\n');
+                            break;
+                        case "tab":
+                            out.append('\t');
+                            break;
+                        case "line":
+                            out.append("<br>");
+                            break;
+                        case "htmlrtf":
+                            // Skip \htmlrtf...\htmlrtf0 inside htmltag groups
+                            i = skipHtmlRtfBlock(content, i);
+                            continue;
+                        default:
+                            break;
+                    }
+                    i = afterWord;
+                    continue;
+                }
+
+                i++;
+                continue;
+            }
+
+            if (c == '{' || c == '}') {
+                flushPendingBytes(pendingBytes, out, codePage);
+                i++;
+                continue;
+            }
+
+            flushPendingBytes(pendingBytes, out, codePage);
+            out.append(c);
+            i++;
+        }
+
+        flushPendingBytes(pendingBytes, out, codePage);
+    }
+
+    /**
+     * Skip a {@code \htmlrtf ... \htmlrtf0} block within an htmltag group.
+     *
+     * @param content the string being parsed
+     * @param pos     position of the backslash starting {@code \htmlrtf}
+     * @return position after the matching {@code \htmlrtf0} (and one optional
+     *         trailing space), or {@code content.length()} if no end marker follows
+     */
+    static int skipHtmlRtfBlock(String content, int pos) {
+        int afterWord = pos + "\\htmlrtf".length();
+        if (afterWord < content.length() && content.charAt(afterWord) == '0') {
+            // This is \htmlrtf0 (end marker) — just skip past it
+            afterWord++;
+            if (afterWord < content.length() && content.charAt(afterWord) == ' 
') {
+                afterWord++;
+            }
+            return afterWord;
+        }
+
+        // Skip everything until \htmlrtf0
+        int endPos = content.indexOf("\\htmlrtf0", afterWord);
+        if (endPos < 0) {
+            return content.length();
+        }
+        int after = endPos + "\\htmlrtf0".length();
+        if (after < content.length() && content.charAt(after) == ' ') {
+            after++;
+        }
+        return after;
+    }
+
+    /**
+     * Detect the ANSI code page from the RTF header ({@code \ansicpgNNNN}).
+     * Falls back to windows-1252 if the control word is absent, has no digits,
+     * or names a code page for which no charset can be resolved.
+     */
+    static Charset detectCodePage(String rtf) {
+        int idx = rtf.indexOf(ANSICPG_PREFIX);
+        if (idx < 0) {
+            return Charset.forName("windows-1252");
+        }
+        int numStart = idx + ANSICPG_PREFIX.length();
+        int numEnd = numStart;
+        while (numEnd < rtf.length() && Character.isDigit(rtf.charAt(numEnd))) 
{
+            numEnd++;
+        }
+        if (numEnd == numStart) {
+            return Charset.forName("windows-1252");
+        }
+        String cpNum = rtf.substring(numStart, numEnd);
+        try {
+            return Charset.forName("windows-" + cpNum);
+        } catch (Exception e) {
+            try {
+                return Charset.forName("cp" + cpNum);
+            } catch (Exception e2) {
+                LOGGER.debug("Unknown code page {}, falling back to 
windows-1252", cpNum);
+                return Charset.forName("windows-1252");
+            }
+        }
+    }
+
+    private static void flushPendingBytes(ByteArrayOutputStream pending, 
StringBuilder out,
+                                          Charset codePage) {
+        if (pending.size() > 0) {
+            out.append(new String(pending.toByteArray(), codePage));
+            pending.reset();
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
new file mode 100644
index 0000000000..0c1096f4f2
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.msg;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import org.junit.jupiter.api.Test;
+
+public class RTFEncapsulatedHTMLExtractorTest {
+
+    @Test
+    public void testNullAndEmpty() {
+        assertNull(RTFEncapsulatedHTMLExtractor.extract(null));
+        assertNull(RTFEncapsulatedHTMLExtractor.extract(new byte[0]));
+    }
+
+    @Test
+    public void testNonEncapsulatedRtf() {
+        String rtf = "{\\rtf1\\ansi\\deff0 Hello world}";
+        
assertNull(RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)));
+    }
+
+    @Test
+    public void testSimpleEncapsulatedHtml() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag34 <head>}\n" +
+                "{\\*\\htmltag41 </head>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "{\\*\\htmltag84 Hello world}\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "\\htmlrtf }\\htmlrtf0\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<html>"));
+        assertTrue(html.contains("<p>"));
+        assertTrue(html.contains("Hello world"));
+        assertTrue(html.contains("</html>"));
+    }
+
+    @Test
+    public void testImgCidExtraction() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "{\\*\\htmltag84 <img 
src=\"cid:[email protected]\">}\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("cid:[email protected]"),
+                "CID reference should be preserved in extracted HTML");
+    }
+
+    @Test
+    public void testParAndTabDecoding() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag241 <style>}\n" +
+                "{\\*\\htmltag241 body \\{\\par \\tab color: red;\\par \\}}\n" 
+
+                "{\\*\\htmltag249 </style>}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<style>"));
+        assertTrue(html.contains("body {"));
+        assertTrue(html.contains("\tcolor: red;"));
+        assertTrue(html.contains("</style>"));
+    }
+
+    @Test
+    public void testHexEscapeDecoding() {
+        // \'e9 = 0xE9 = 'é' in windows-1252
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 caf\\'e9}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("café", html);
+    }
+
+    @Test
+    public void testMultiByteHexEscape() {
+        // Consecutive single-byte escapes: \'fc = 'ü' and \'df = 'ß' in windows-1252.
+        // (A real multi-byte form, 'ü' = 0xC3 0xBC in code page 65001/UTF-8, is not tested here.)
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 gr\\'fc\\'dfe}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("grüße", html);
+    }
+
+    @Test
+    public void testCodePage1254Turkish() {
+        // \'fe in windows-1254 = 'ş' (U+015F), not 'þ' (U+00FE) as in windows-1252
+        // \'fd in windows-1254 = 'ı' (U+0131), not 'ý' (U+00FD) as in windows-1252
+        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 Say\\'fdn}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("Sayın", html);
+    }
+
+    @Test
+    public void testHtmlrtfSkipping() {
+        // Content between \htmlrtf and \htmlrtf0 should be skipped
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 Hello}\n" +
+                "\\htmlrtf {\\b bold rtf only}\\htmlrtf0\n" +
+                "{\\*\\htmltag84  World}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("Hello World", html);
+    }
+
+    @Test
+    public void testEscapedBracesAndBackslash() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag241 a \\{ b \\} c \\\\d}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("a { b } c \\d", html);
+    }
+
+    @Test
+    public void testEmptyHtmltag() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag72}\n" +
+                "{\\*\\htmltag84 text}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("text", html);
+    }
+
+    @Test
+    public void testInterTagTextContent() {
+        // Realistic pattern: text content appears BETWEEN htmltag groups,
+        // with \htmlrtf blocks that should be skipped
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "Hello from the message body\n" +
+                "\\htmlrtf\\par}\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "Second paragraph\n" +
+                "\\htmlrtf\\par}\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<p>"), "should contain HTML tags");
+        assertTrue(html.contains("Hello from the message body"),
+                "should contain inter-tag text content");
+        assertTrue(html.contains("Second paragraph"),
+                "should contain second paragraph text");
+        assertTrue(html.contains("</html>"), "should contain closing tag");
+    }
+
+    @Test
+    public void testInterTagHexEscapes() {
+        // Text between htmltag groups can also have \'xx escapes
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "caf\\'e9\n" +
+                "\\htmlrtf }\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("café"), "hex escapes in inter-tag text 
should be decoded");
+    }
+
+    @Test
+    public void testLineControlWord() {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 line1\\line line2}\n" +
+                "}";
+        String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("line1<br>line2", html);
+    }
+}

(tika) 02/02: improve tagging of inline images

Reply via email to