This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4696-improve-inline-tagging in repository https://gitbox.apache.org/repos/asf/tika.git
commit 509502fc349d3818cdcf2f7203ea684377f8130e Author: tallison <[email protected]> AuthorDate: Mon Mar 23 09:27:00 2026 -0400 improve tagging of inline images --- .../tika/parser/microsoft/OutlookExtractor.java | 22 +- .../msg/RTFEncapsulatedHTMLExtractor.java | 453 +++++++++++++++++++++ .../msg/RTFEncapsulatedHTMLExtractorTest.java | 217 ++++++++++ 3 files changed, 689 insertions(+), 3 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index eb8fca4f47..0b8db23f45 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -85,6 +85,7 @@ import org.apache.tika.parser.html.HtmlEncodingDetector; import org.apache.tika.parser.html.JSoupParser; import org.apache.tika.parser.mailcommons.MailDateParser; import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor; +import org.apache.tika.parser.microsoft.msg.RTFEncapsulatedHTMLExtractor; import org.apache.tika.parser.microsoft.rtf.RTFParser; import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetMatch; @@ -373,6 +374,16 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } + private static final Set<String> INLINEABLE_MIME_TYPES = Set.of( + "application/x-ms-wmz", + "application/x-ms-emz", + "application/x-msmetafile", + "image/x-wmf", + "image/x-emf", + "image/wmf", + "image/emf" + ); + /** * Returns true for MIME types that are safe to label as INLINE. 
* We gate on this to avoid marking PDFs, DOCX, etc. as inline — downstream @@ -383,7 +394,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { return false; } String lower = mimeType.toLowerCase(Locale.ROOT).trim(); - return lower.startsWith("image/"); + return lower.startsWith("image/") || INLINEABLE_MIME_TYPES.contains(lower); } // PidTagAttachFlags (0x3714) — bit flags indicating which body formats reference this @@ -644,8 +655,13 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } private void extractContentIdNamesFromRtf(byte[] data, Metadata metadata, Set<String> contentIdNames) { - //for now, hope that there's encapsulated html - //TODO: check for encapsulated html. If it doesn't exist, handle RTF specifically + // Try to de-encapsulate the HTML from the RTF first + String html = RTFEncapsulatedHTMLExtractor.extract(data); + if (html != null) { + extractContentIdNamesFromHtml(html.getBytes(UTF_8), metadata, contentIdNames); + return; + } + // Fall back to scanning the raw RTF bytes for cid: references extractContentIdNamesFromHtml(data, metadata, contentIdNames); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java new file mode 100644 index 0000000000..3ef453a48d --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java @@ -0,0 +1,453 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.msg; + +import java.io.ByteArrayOutputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Extracts the original HTML from an RTF document that contains encapsulated HTML + * (as indicated by the {@code \fromhtml1} control word). + * + * <p>The encapsulated HTML format stores HTML in two places:</p> + * <ol> + * <li>{@code {\*\htmltag<N> ...}} groups — contain the HTML markup (tags, style blocks, etc.)</li> + * <li>Text between htmltag groups — contains the actual text content, provided it is NOT + * wrapped in {@code \htmlrtf ... 
\htmlrtf0} (which marks RTF-only rendering hints)</li> + * </ol> + * + * <p>Within both htmltag groups and inter-tag text, the following RTF escapes are decoded:</p> + * <ul> + * <li>{@code \par} → newline</li> + * <li>{@code \tab} → tab character</li> + * <li>{@code \line} → {@code <br>}</li> + * <li>{@code \'xx} → single byte (decoded using the document's ANSI code page)</li> + * <li>{@code \\}, {@code \{}, {@code \}} → literal characters</li> + * </ul> + */ +public class RTFEncapsulatedHTMLExtractor { + + private static final Logger LOGGER = LoggerFactory.getLogger(RTFEncapsulatedHTMLExtractor.class); + + private static final String HTMLTAG_PREFIX = "{\\*\\htmltag"; + private static final String FROM_HTML_MARKER = "\\fromhtml"; + private static final String ANSICPG_PREFIX = "\\ansicpg"; + + /** + * Extracts the HTML content from an encapsulated-HTML RTF document. + * + * @param rtfBytes the decompressed RTF bytes + * @return the extracted HTML string, or {@code null} if the RTF does not contain + * encapsulated HTML + */ + public static String extract(byte[] rtfBytes) { + if (rtfBytes == null || rtfBytes.length == 0) { + return null; + } + // Work with US-ASCII — RTF is 7-bit and non-ASCII bytes are escaped as \'xx + String rtf = new String(rtfBytes, StandardCharsets.US_ASCII); + + if (!rtf.contains(FROM_HTML_MARKER)) { + return null; + } + + Charset codePage = detectCodePage(rtf); + + // Find the start of the document body (after the RTF header). + // We skip past the initial {\rtf1... header by finding the first + // htmltag group — everything before that is RTF preamble. 
+ int bodyStart = rtf.indexOf(HTMLTAG_PREFIX); + if (bodyStart < 0) { + return null; + } + + StringBuilder html = new StringBuilder(rtf.length() / 2); + ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream(); + int pos = bodyStart; + int len = rtf.length(); + boolean inHtmlRtfSkip = false; + + while (pos < len) { + // Check if we're at an htmltag group + if (rtf.startsWith(HTMLTAG_PREFIX, pos)) { + flushPendingBytes(pendingBytes, html, codePage); + + // Find matching close brace + int groupEnd = findMatchingBrace(rtf, pos); + if (groupEnd < 0) { + break; + } + + // Skip {\*\htmltag prefix and digit(s) + int contentStart = pos + HTMLTAG_PREFIX.length(); + while (contentStart < groupEnd && Character.isDigit(rtf.charAt(contentStart))) { + contentStart++; + } + // Skip optional space after tag number + if (contentStart < groupEnd && rtf.charAt(contentStart) == ' ') { + contentStart++; + } + + // Decode the htmltag content + String inner = rtf.substring(contentStart, groupEnd); + decodeRtfEscapes(inner, html, codePage); + + pos = groupEnd + 1; + continue; + } + + // Check for \htmlrtf control word (start or end of RTF-only block) + if (rtf.startsWith("\\htmlrtf", pos)) { + flushPendingBytes(pendingBytes, html, codePage); + int afterWord = pos + "\\htmlrtf".length(); + + if (afterWord < len && rtf.charAt(afterWord) == '0') { + // \htmlrtf0 — end of skip block + inHtmlRtfSkip = false; + afterWord++; + if (afterWord < len && rtf.charAt(afterWord) == ' ') { + afterWord++; + } + } else { + // \htmlrtf — start of skip block + inHtmlRtfSkip = true; + if (afterWord < len && rtf.charAt(afterWord) == ' ') { + afterWord++; + } + } + pos = afterWord; + continue; + } + + // If we're inside an \htmlrtf skip block, just advance past this character. + // We don't skip nested groups wholesale because \htmlrtf0 may appear inside them. 
+ if (inHtmlRtfSkip) { + pos++; + continue; + } + + // Check for other { groups (nested RTF groups that aren't htmltag) + if (rtf.charAt(pos) == '{') { + flushPendingBytes(pendingBytes, html, codePage); + int end = findMatchingBrace(rtf, pos); + if (end > 0) { + pos = end + 1; + } else { + pos++; + } + continue; + } + + // Skip closing braces + if (rtf.charAt(pos) == '}') { + flushPendingBytes(pendingBytes, html, codePage); + pos++; + continue; + } + + // Handle RTF escapes in inter-tag text + if (rtf.charAt(pos) == '\\' && pos + 1 < len) { + char next = rtf.charAt(pos + 1); + + // \'xx hex escape + if (next == '\'' && pos + 3 < len) { + int hi = Character.digit(rtf.charAt(pos + 2), 16); + int lo = Character.digit(rtf.charAt(pos + 3), 16); + if (hi >= 0 && lo >= 0) { + pendingBytes.write((hi << 4) | lo); + } + pos += 4; + continue; + } + + flushPendingBytes(pendingBytes, html, codePage); + + // Escaped literals + if (next == '\\' || next == '{' || next == '}') { + html.append(next); + pos += 2; + continue; + } + + // Control word + if (Character.isLetter(next)) { + int wordStart = pos + 1; + int wordEnd = wordStart; + while (wordEnd < len && Character.isLetter(rtf.charAt(wordEnd))) { + wordEnd++; + } + String word = rtf.substring(wordStart, wordEnd); + + // Skip optional numeric parameter + int paramEnd = wordEnd; + if (paramEnd < len && (rtf.charAt(paramEnd) == '-' + || Character.isDigit(rtf.charAt(paramEnd)))) { + paramEnd++; + while (paramEnd < len && Character.isDigit(rtf.charAt(paramEnd))) { + paramEnd++; + } + } + // Skip optional space delimiter + int afterWord = paramEnd; + if (afterWord < len && rtf.charAt(afterWord) == ' ') { + afterWord++; + } + + switch (word) { + case "par": + case "pard": + html.append('\n'); + break; + case "tab": + html.append('\t'); + break; + case "line": + html.append("<br>"); + break; + default: + // Skip unknown control words + break; + } + pos = afterWord; + continue; + } + + // Unknown escape — skip backslash + pos++; + 
continue; + } + + // Newlines/carriage returns in RTF are whitespace, not content + if (rtf.charAt(pos) == '\r' || rtf.charAt(pos) == '\n') { + pos++; + continue; + } + + // Regular text character between htmltag groups — this is HTML content + flushPendingBytes(pendingBytes, html, codePage); + html.append(rtf.charAt(pos)); + pos++; + } + + flushPendingBytes(pendingBytes, html, codePage); + + if (html.length() == 0) { + return null; + } + return html.toString(); + } + + /** + * Find the position of the closing brace that matches the opening brace at + * {@code openPos}. Handles nested groups and escaped braces. + * + * @return index of the closing '}', or -1 if not found + */ + static int findMatchingBrace(String rtf, int openPos) { + int depth = 0; + int len = rtf.length(); + for (int i = openPos; i < len; i++) { + char c = rtf.charAt(i); + if (c == '\\' && i + 1 < len) { + char next = rtf.charAt(i + 1); + if (next == '{' || next == '}' || next == '\\') { + i++; + continue; + } + } + if (c == '{') { + depth++; + } else if (c == '}') { + depth--; + if (depth == 0) { + return i; + } + } + } + return -1; + } + + /** + * Decode RTF escapes within an htmltag group's content. 
+ */ + static void decodeRtfEscapes(String content, StringBuilder out, Charset codePage) { + int len = content.length(); + int i = 0; + ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream(); + + while (i < len) { + char c = content.charAt(i); + + if (c == '\\') { + if (i + 1 >= len) { + break; + } + char next = content.charAt(i + 1); + + // \'xx hex escape + if (next == '\'' && i + 3 < len) { + int hi = Character.digit(content.charAt(i + 2), 16); + int lo = Character.digit(content.charAt(i + 3), 16); + if (hi >= 0 && lo >= 0) { + pendingBytes.write((hi << 4) | lo); + } + i += 4; + continue; + } + + flushPendingBytes(pendingBytes, out, codePage); + + if (next == '\\' || next == '{' || next == '}') { + out.append(next); + i += 2; + continue; + } + + // Control words + if (Character.isLetter(next)) { + int wordStart = i + 1; + int wordEnd = wordStart; + while (wordEnd < len && Character.isLetter(content.charAt(wordEnd))) { + wordEnd++; + } + String word = content.substring(wordStart, wordEnd); + + int paramEnd = wordEnd; + if (paramEnd < len && (content.charAt(paramEnd) == '-' + || Character.isDigit(content.charAt(paramEnd)))) { + paramEnd++; + while (paramEnd < len && Character.isDigit(content.charAt(paramEnd))) { + paramEnd++; + } + } + int afterWord = paramEnd; + if (afterWord < len && content.charAt(afterWord) == ' ') { + afterWord++; + } + + switch (word) { + case "par": + case "pard": + out.append('\n'); + break; + case "tab": + out.append('\t'); + break; + case "line": + out.append("<br>"); + break; + case "htmlrtf": + // Skip \htmlrtf...\htmlrtf0 inside htmltag groups + i = skipHtmlRtfBlock(content, i); + continue; + default: + break; + } + i = afterWord; + continue; + } + + i++; + continue; + } + + if (c == '{' || c == '}') { + flushPendingBytes(pendingBytes, out, codePage); + i++; + continue; + } + + flushPendingBytes(pendingBytes, out, codePage); + out.append(c); + i++; + } + + flushPendingBytes(pendingBytes, out, codePage); + } + + /** + * Skip 
a {@code \htmlrtf ... \htmlrtf0} block within an htmltag group. + * + * @param content the string being parsed + * @param pos position of the backslash starting {@code \htmlrtf} + * @return position after the matching {@code \htmlrtf0} + */ + static int skipHtmlRtfBlock(String content, int pos) { + int afterWord = pos + "\\htmlrtf".length(); + if (afterWord < content.length() && content.charAt(afterWord) == '0') { + // This is \htmlrtf0 (end marker) — just skip past it + afterWord++; + if (afterWord < content.length() && content.charAt(afterWord) == ' ') { + afterWord++; + } + return afterWord; + } + + // Skip everything until \htmlrtf0 + int endPos = content.indexOf("\\htmlrtf0", afterWord); + if (endPos < 0) { + return content.length(); + } + int after = endPos + "\\htmlrtf0".length(); + if (after < content.length() && content.charAt(after) == ' ') { + after++; + } + return after; + } + + /** + * Detect the ANSI code page from the RTF header ({@code \ansicpgNNNN}). + * Falls back to windows-1252 if not found. 
+ */ + static Charset detectCodePage(String rtf) { + int idx = rtf.indexOf(ANSICPG_PREFIX); + if (idx < 0) { + return Charset.forName("windows-1252"); + } + int numStart = idx + ANSICPG_PREFIX.length(); + int numEnd = numStart; + while (numEnd < rtf.length() && Character.isDigit(rtf.charAt(numEnd))) { + numEnd++; + } + if (numEnd == numStart) { + return Charset.forName("windows-1252"); + } + String cpNum = rtf.substring(numStart, numEnd); + try { + return Charset.forName("windows-" + cpNum); + } catch (Exception e) { + try { + return Charset.forName("cp" + cpNum); + } catch (Exception e2) { + LOGGER.debug("Unknown code page {}, falling back to windows-1252", cpNum); + return Charset.forName("windows-1252"); + } + } + } + + private static void flushPendingBytes(ByteArrayOutputStream pending, StringBuilder out, + Charset codePage) { + if (pending.size() > 0) { + out.append(new String(pending.toByteArray(), codePage)); + pending.reset(); + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java new file mode 100644 index 0000000000..0c1096f4f2 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.msg; + +import static java.nio.charset.StandardCharsets.US_ASCII; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +public class RTFEncapsulatedHTMLExtractorTest { + + @Test + public void testNullAndEmpty() { + assertNull(RTFEncapsulatedHTMLExtractor.extract(null)); + assertNull(RTFEncapsulatedHTMLExtractor.extract(new byte[0])); + } + + @Test + public void testNonEncapsulatedRtf() { + String rtf = "{\\rtf1\\ansi\\deff0 Hello world}"; + assertNull(RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII))); + } + + @Test + public void testSimpleEncapsulatedHtml() { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag19 <html>}\n" + + "{\\*\\htmltag34 <head>}\n" + + "{\\*\\htmltag41 </head>}\n" + + "{\\*\\htmltag50 <body>}\n" + + "\\htmlrtf {\\htmlrtf0\n" + + "{\\*\\htmltag64 <p>}\n" + + "{\\*\\htmltag84 Hello world}\n" + + "{\\*\\htmltag72 </p>}\n" + + "\\htmlrtf }\\htmlrtf0\n" + + "{\\*\\htmltag58 </body>}\n" + + "{\\*\\htmltag27 </html>}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertTrue(html.contains("<html>")); + assertTrue(html.contains("<p>")); + assertTrue(html.contains("Hello world")); + assertTrue(html.contains("</html>")); + } + + @Test + public void 
testImgCidExtraction() { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag19 <html>}\n" + + "{\\*\\htmltag50 <body>}\n" + + "{\\*\\htmltag84 <img src=\"cid:[email protected]\">}\n" + + "{\\*\\htmltag58 </body>}\n" + + "{\\*\\htmltag27 </html>}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertTrue(html.contains("cid:[email protected]"), + "CID reference should be preserved in extracted HTML"); + } + + @Test + public void testParAndTabDecoding() { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag241 <style>}\n" + + "{\\*\\htmltag241 body \\{\\par \\tab color: red;\\par \\}}\n" + + "{\\*\\htmltag249 </style>}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertTrue(html.contains("<style>")); + assertTrue(html.contains("body {")); + assertTrue(html.contains("\tcolor: red;")); + assertTrue(html.contains("</style>")); + } + + @Test + public void testHexEscapeDecoding() { + // \'e9 = 0xE9 = 'é' in windows-1252 + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag84 caf\\'e9}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("café", html); + } + + @Test + public void testMultiByteHexEscape() { + // UTF-8 encoded 'ü' = 0xC3 0xBC in code page 65001 (UTF-8) + // But more commonly: \'fc in windows-1252 = 'ü' + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag84 gr\\'fc\\'dfe}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("grüße", html); + } + + @Test + public void testCodePage1254Turkish() { + // \'fd in windows-1254 = 'ı' (U+0131, LATIN SMALL LETTER DOTLESS I); + // the same byte in windows-1252 would decode to 'ý' (U+00FD) + String rtf = "{\\rtf1\\ansi\\ansicpg1254\\fromhtml1 
\\deff0\n" + + "{\\*\\htmltag84 Say\\'fdn}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("Sayın", html); + } + + @Test + public void testHtmlrtfSkipping() { + // Content between \htmlrtf and \htmlrtf0 should be skipped + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag84 Hello}\n" + + "\\htmlrtf {\\b bold rtf only}\\htmlrtf0\n" + + "{\\*\\htmltag84 World}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("Hello World", html); + } + + @Test + public void testEscapedBracesAndBackslash() { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag241 a \\{ b \\} c \\\\d}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("a { b } c \\d", html); + } + + @Test + public void testEmptyHtmltag() { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag72}\n" + + "{\\*\\htmltag84 text}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("text", html); + } + + @Test + public void testInterTagTextContent() { + // Realistic pattern: text content appears BETWEEN htmltag groups, + // with \htmlrtf blocks that should be skipped + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag19 <html>}\n" + + "{\\*\\htmltag50 <body>}\n" + + "{\\*\\htmltag64 <p>}\n" + + "\\htmlrtf {\\htmlrtf0\n" + + "Hello from the message body\n" + + "\\htmlrtf\\par}\\htmlrtf0\n" + + "{\\*\\htmltag72 </p>}\n" + + "{\\*\\htmltag64 <p>}\n" + + "\\htmlrtf {\\htmlrtf0\n" + + "Second paragraph\n" + + "\\htmlrtf\\par}\\htmlrtf0\n" + + "{\\*\\htmltag72 </p>}\n" + + "{\\*\\htmltag58 </body>}\n" + + "{\\*\\htmltag27 </html>}\n" + + "}"; + String html = 
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertTrue(html.contains("<p>"), "should contain HTML tags"); + assertTrue(html.contains("Hello from the message body"), + "should contain inter-tag text content"); + assertTrue(html.contains("Second paragraph"), + "should contain second paragraph text"); + assertTrue(html.contains("</html>"), "should contain closing tag"); + } + + @Test + public void testInterTagHexEscapes() { + // Text between htmltag groups can also have \'xx escapes + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag64 <p>}\n" + + "\\htmlrtf {\\htmlrtf0\n" + + "caf\\'e9\n" + + "\\htmlrtf }\\htmlrtf0\n" + + "{\\*\\htmltag72 </p>}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertTrue(html.contains("café"), "hex escapes in inter-tag text should be decoded"); + } + + @Test + public void testLineControlWord() { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag84 line1\\line line2}\n" + + "}"; + String html = RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("line1<br>line2", html); + } +}
