This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new fec0a997c8 decapsulate html from rtf within msgs...lol (#2713)
fec0a997c8 is described below
commit fec0a997c89c26e08c2700d00a2f99813cfa7730
Author: Tim Allison <[email protected]>
AuthorDate: Fri Apr 3 17:30:36 2026 -0400
decapsulate html from rtf within msgs...lol (#2713)
---
.../tika/parser/microsoft/OfficeParserConfig.java | 18 ---
.../tika/parser/microsoft/OutlookExtractor.java | 151 +++++++-----------
.../msg/RTFEncapsulatedHTMLExtractor.java | 177 ++++++++++++++++++---
.../tika/parser/microsoft/OutlookParserTest.java | 48 +-----
.../msg/RTFEncapsulatedHTMLExtractorTest.java | 139 ++++++++++++++++
5 files changed, 353 insertions(+), 180 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index db6d4e78e9..c8886e5fdf 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -39,7 +39,6 @@ public class OfficeParserConfig implements Serializable {
private boolean writeSelectHeadersInBody = false;
- private boolean extractAllAlternativesFromMSG = false;
private String dateOverrideFormat = null;
private int maxOverride = 0;//ignore
@@ -214,23 +213,6 @@ public class OfficeParserConfig implements Serializable {
this.concatenatePhoneticRuns = concatenatePhoneticRuns;
}
- public boolean isExtractAllAlternativesFromMSG() {
- return extractAllAlternativesFromMSG;
- }
-
- /**
- * Some .msg files can contain body content in html, rtf and/or text.
- * The default behavior is to pick the first non-null value and include
only that.
- * If you'd like to extract all non-null body content, which is likely
duplicative,
- * set this value to true.
- *
- * @param extractAllAlternativesFromMSG whether or not to extract all
alternative parts
- * @since 1.17
- */
- public void setExtractAllAlternativesFromMSG(boolean
extractAllAlternativesFromMSG) {
- this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG;
- }
-
public boolean isIncludeMissingRows() {
return includeMissingRows;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 0b8db23f45..a2ef6de04f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -78,7 +78,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlEncodingDetector;
@@ -183,7 +182,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
private final DirectoryNode root;
private final MAPIMessage msg;
private final ParseContext parseContext;
- private final boolean extractAllAlternatives;
HtmlEncodingDetector detector = new HtmlEncodingDetector();
@@ -191,8 +189,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
super(context, metadata);
this.root = root;
this.parseContext = context;
- this.extractAllAlternatives =
-
context.get(OfficeParserConfig.class).isExtractAllAlternativesFromMSG();
try {
this.msg = new MAPIMessage(root);
} catch (IOException e) {
@@ -296,6 +292,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
Set<String> contentIdNames = new HashSet<>();
handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml,
contentIdNames);
+
// Process the attachments
for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
Metadata attachMetadata = Metadata.newInstance(context);
@@ -586,36 +583,11 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
XHTMLContentHandler xhtml, Set<String>
contentIdNames)
throws SAXException, IOException, TikaException {
-
- if (extractAllAlternatives) {
- extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml,
contentIdNames);
- return;
- }
- _handleBestBodyChunk(htmlChunk, rtfChunk, textChunk, xhtml,
contentIdNames);
-
- }
- private void _handleBestBodyChunk(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
- XHTMLContentHandler xhtml, Set<String>
contentIdNames)
- throws SAXException, IOException, TikaException {
- //try html, then rtf, then text
+ // Priority: a) HTML chunk, b) HTML extracted from RTF, c) raw RTF, d)
text
if (htmlChunk != null) {
- byte[] data = null;
- if (htmlChunk instanceof ByteChunk) {
- data = ((ByteChunk) htmlChunk).getValue();
- } else if (htmlChunk instanceof StringChunk) {
- data = ((StringChunk) htmlChunk).getRawValue();
- }
+ byte[] data = getValue(htmlChunk);
if (data != null) {
- Parser htmlParser = EmbeddedDocumentUtil
- .tryToFindExistingLeafParser(JSoupParser.class,
parseContext);
- if (htmlParser == null) {
- htmlParser = new JSoupParser();
- }
- Metadata htmlMetadata = Metadata.newInstance(context);
- try (TikaInputStream tis = TikaInputStream.get(data)) {
- htmlParser.parse(tis, new EmbeddedContentHandler(new
BodyContentHandler(xhtml)), htmlMetadata, parseContext);
- }
- extractContentIdNamesFromHtml(data, htmlMetadata,
contentIdNames);
+ parseHtmlBody(data, xhtml, contentIdNames);
parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.HTML.name());
return;
}
@@ -623,25 +595,34 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
if (rtfChunk != null) {
ByteChunk chunk = (ByteChunk) rtfChunk;
//avoid buffer underflow TIKA-2530
- //TODO -- would be good to find an example triggering file and
- //figure out if this is a bug in POI or a genuine 0 length chunk
if (chunk.getValue() != null && chunk.getValue().length > 0) {
MAPIRtfAttribute rtf =
new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED,
Types.BINARY.getId(),
chunk.getValue());
+ byte[] rtfData = rtf.getData();
+ // Try to extract encapsulated HTML — returns null if not
present
+ String html = RTFEncapsulatedHTMLExtractor.extract(rtfData);
+ if (html != null) {
+ parseHtmlString(html, xhtml, contentIdNames);
+ parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
+ BODY_TYPES_PROCESSED.RTF.name());
+ parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML,
"true");
+ return;
+ }
+ // Fall back to parsing as raw RTF
RTFParser rtfParser = (RTFParser) EmbeddedDocumentUtil
.tryToFindExistingLeafParser(RTFParser.class,
parseContext);
if (rtfParser == null) {
rtfParser = new RTFParser();
}
Metadata rtfMetadata = Metadata.newInstance(context);
- try (TikaInputStream tis = TikaInputStream.get(rtf.getData()))
{
+ try (TikaInputStream tis = TikaInputStream.get(rtfData)) {
rtfParser.parseInline(tis, xhtml, rtfMetadata,
parseContext);
}
- extractContentIdNamesFromRtf(rtf.getData(), rtfMetadata,
contentIdNames);
+ // Scan raw RTF bytes for cid: references
+ extractContentIdNames(rtfData, contentIdNames);
parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.RTF.name());
- parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML,
-
rtfMetadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
+ parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML,
"false");
return;
}
}
@@ -651,21 +632,46 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
extractContentIdNamesFromText(s, contentIdNames);
parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.TEXT.name());
}
+ }
+ private void parseHtmlBody(byte[] htmlData, XHTMLContentHandler xhtml,
+ Set<String> contentIdNames)
+ throws SAXException, IOException, TikaException {
+ Parser htmlParser = EmbeddedDocumentUtil
+ .tryToFindExistingLeafParser(JSoupParser.class, parseContext);
+ if (htmlParser == null) {
+ htmlParser = new JSoupParser();
+ }
+ Metadata htmlMetadata = Metadata.newInstance(context);
+ try (TikaInputStream tis = TikaInputStream.get(htmlData)) {
+ htmlParser.parse(tis,
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ htmlMetadata, parseContext);
+ }
+ extractContentIdNames(htmlData, contentIdNames);
}
- private void extractContentIdNamesFromRtf(byte[] data, Metadata metadata,
Set<String> contentIdNames) {
- // Try to de-encapsulate the HTML from the RTF first
- String html = RTFEncapsulatedHTMLExtractor.extract(data);
- if (html != null) {
- extractContentIdNamesFromHtml(html.getBytes(UTF_8), metadata,
contentIdNames);
- return;
- }
- // Fall back to scanning the raw RTF bytes for cid: references
- extractContentIdNamesFromHtml(data, metadata, contentIdNames);
+ /**
+ * Parse an already-decoded HTML string using JSoupParser.parseString(),
+ * bypassing encoding detection entirely. Used for HTML de-encapsulated
+ * from RTF where the charset has already been handled.
+ */
+ private void parseHtmlString(String html, XHTMLContentHandler xhtml,
+ Set<String> contentIdNames)
+ throws SAXException, IOException, TikaException {
+ JSoupParser htmlParser = (JSoupParser) EmbeddedDocumentUtil
+ .tryToFindExistingLeafParser(JSoupParser.class, parseContext);
+ if (htmlParser == null) {
+ htmlParser = new JSoupParser();
+ }
+ Metadata htmlMetadata = Metadata.newInstance(context);
+ htmlParser.parseString(html,
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ htmlMetadata, parseContext);
+ extractContentIdNames(html.getBytes(UTF_8), contentIdNames);
}
- private void extractContentIdNamesFromHtml(byte[] data, Metadata metadata,
Set<String> contentIdNames) {
+ private void extractContentIdNames(byte[] data, Set<String>
contentIdNames) {
String html = new String(data, UTF_8);
Matcher imageMatcher = IMG_TAG_PATTERN.matcher(html);
Matcher cidSrcMatcher = SRC_ATTR_PATTERN.matcher("");
@@ -687,55 +693,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
}
}
- private void extractAllAlternatives(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
- XHTMLContentHandler xhtml, Set<String>
contentIdNames)
- throws TikaException, SAXException, IOException {
- if (htmlChunk != null) {
- byte[] data = getValue(htmlChunk);
- if (data != null) {
- handleEmbeddedResource(TikaInputStream.get(data), "html-body",
null,
- MediaType.TEXT_HTML.toString(), xhtml, true);
- extractContentIdNamesFromHtml(data,
Metadata.newInstance(context), contentIdNames);
- parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.HTML.name());
- }
- }
- if (rtfChunk != null) {
- ByteChunk chunk = (ByteChunk) rtfChunk;
- MAPIRtfAttribute rtf =
- new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED,
Types.BINARY.getId(),
- chunk.getValue());
-
- byte[] data = rtf.getData();
- if (data != null) {
- Metadata rtfMetadata = Metadata.newInstance(context);
- handleEmbeddedResource(TikaInputStream.get(data), rtfMetadata,
- "rtf-body", null, null,
- "application/rtf", xhtml, true);
- extractContentIdNamesFromRtf(data, rtfMetadata,
contentIdNames);
- //copy this info into the parent...what else should we copy?
- parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.RTF.name());
- parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML,
-
rtfMetadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
-
- }
- }
- if (textChunk != null) {
- byte[] data = getValue(textChunk);
- if (data != null) {
- Metadata chunkMetadata = Metadata.newInstance(context);
-
chunkMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
- MediaType.TEXT_PLAIN.toString());
- handleEmbeddedResource(TikaInputStream.get(data),
chunkMetadata, null, "text-body",
- null, MediaType.TEXT_PLAIN.toString(), xhtml, true);
- if (textChunk instanceof StringChunk) {
- extractContentIdNamesFromText(((StringChunk)
textChunk).getValue(), contentIdNames);
- }
- parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
BODY_TYPES_PROCESSED.TEXT.name());
- }
- }
-
- }
-
//can return null!
private byte[] getValue(Chunk chunk) {
byte[] data = null;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
index 3ef453a48d..e254dc4447 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractor.java
@@ -19,6 +19,10 @@ package org.apache.tika.parser.microsoft.msg;
import java.io.ByteArrayOutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -34,14 +38,10 @@ import org.slf4j.LoggerFactory;
* wrapped in {@code \htmlrtf ... \htmlrtf0} (which marks RTF-only
rendering hints)</li>
* </ol>
*
- * <p>Within both htmltag groups and inter-tag text, the following RTF escapes
are decoded:</p>
- * <ul>
- * <li>{@code \par} → newline</li>
- * <li>{@code \tab} → tab character</li>
- * <li>{@code \line} → {@code <br>}</li>
- * <li>{@code \'xx} → single byte (decoded using the document's ANSI code
page)</li>
- * <li>{@code \\}, {@code \{}, {@code \}} → literal characters</li>
- * </ul>
+ * <p>Per the MS-OXRTFEX specification, {@code \'xx} hex escapes in inter-tag
text are decoded
+ * using the code page of the currently selected font ({@code \fN}). The
font-to-charset mapping
+ * is built from the RTF font table's {@code \fcharsetN} declarations. Inside
+ * {@code {\*\htmltag}} groups, the document's default code page ({@code
\ansicpgN}) is used.</p>
*/
public class RTFEncapsulatedHTMLExtractor {
@@ -51,6 +51,29 @@ public class RTFEncapsulatedHTMLExtractor {
private static final String FROM_HTML_MARKER = "\\fromhtml";
private static final String ANSICPG_PREFIX = "\\ansicpg";
+ // Maps RTF \fcharset values to Java Charset objects.
+ // Based on the Windows CharacterSet enumeration and Tika's
TextExtractor.FCHARSET_MAP.
+ private static final Map<Integer, Charset> FCHARSET_MAP = new HashMap<>();
+
+ static {
+ FCHARSET_MAP.put(0, Charset.forName("windows-1252")); // ANSI
+ FCHARSET_MAP.put(77, Charset.forName("MacRoman")); // Mac Roman
+ FCHARSET_MAP.put(128, Charset.forName("MS932")); // Shift_JIS
(Japanese)
+ FCHARSET_MAP.put(129, Charset.forName("ms949")); // Hangul
(Korean)
+ FCHARSET_MAP.put(130, charsetOrNull("x-Johab")); // Johab
(Korean)
+ FCHARSET_MAP.put(134, Charset.forName("GBK")); // GB2312
(Simplified Chinese)
+ FCHARSET_MAP.put(136, Charset.forName("Big5")); // Big5
(Traditional Chinese)
+ FCHARSET_MAP.put(161, Charset.forName("windows-1253")); // Greek
+ FCHARSET_MAP.put(162, Charset.forName("windows-1254")); // Turkish
+ FCHARSET_MAP.put(163, Charset.forName("windows-1258")); // Vietnamese
+ FCHARSET_MAP.put(177, Charset.forName("windows-1255")); // Hebrew
+ FCHARSET_MAP.put(178, Charset.forName("windows-1256")); // Arabic
+ FCHARSET_MAP.put(186, Charset.forName("windows-1257")); // Baltic
+ FCHARSET_MAP.put(204, Charset.forName("windows-1251")); // Russian
+ FCHARSET_MAP.put(222, Charset.forName("ms874")); // Thai
+ FCHARSET_MAP.put(238, Charset.forName("windows-1250")); // Eastern
Europe
+ }
+
/**
* Extracts the HTML content from an encapsulated-HTML RTF document.
*
@@ -69,11 +92,15 @@ public class RTFEncapsulatedHTMLExtractor {
return null;
}
- Charset codePage = detectCodePage(rtf);
+ Charset defaultCodePage = detectCodePage(rtf);
+ Map<Integer, Charset> fontCharsets = parseFontTable(rtf);
+ // Track the current font's charset for inter-tag text decoding.
+ // The stack mirrors RTF brace nesting so that font switches inside
+ // groups (e.g. {\f3 ...}) are automatically unwound on '}'.
+ Charset currentFontCharset = defaultCodePage;
+ Deque<Charset> charsetStack = new ArrayDeque<>();
// Find the start of the document body (after the RTF header).
- // We skip past the initial {\rtf1... header by finding the first
- // htmltag group or \htmlrtf marker — everything before that is RTF
preamble.
int bodyStart = rtf.indexOf(HTMLTAG_PREFIX);
if (bodyStart < 0) {
return null;
@@ -88,7 +115,7 @@ public class RTFEncapsulatedHTMLExtractor {
while (pos < len) {
// Check if we're at an htmltag group
if (rtf.startsWith(HTMLTAG_PREFIX, pos)) {
- flushPendingBytes(pendingBytes, html, codePage);
+ flushPendingBytes(pendingBytes, html, currentFontCharset);
// Find matching close brace
int groupEnd = findMatchingBrace(rtf, pos);
@@ -106,9 +133,9 @@ public class RTFEncapsulatedHTMLExtractor {
contentStart++;
}
- // Decode the htmltag content
+ // Decode the htmltag content using default code page per
MS-OXRTFEX spec
String inner = rtf.substring(contentStart, groupEnd);
- decodeRtfEscapes(inner, html, codePage);
+ decodeRtfEscapes(inner, html, defaultCodePage);
pos = groupEnd + 1;
continue;
@@ -116,7 +143,7 @@ public class RTFEncapsulatedHTMLExtractor {
// Check for \htmlrtf control word (start or end of RTF-only block)
if (rtf.startsWith("\\htmlrtf", pos)) {
- flushPendingBytes(pendingBytes, html, codePage);
+ flushPendingBytes(pendingBytes, html, currentFontCharset);
int afterWord = pos + "\\htmlrtf".length();
if (afterWord < len && rtf.charAt(afterWord) == '0') {
@@ -137,16 +164,40 @@ public class RTFEncapsulatedHTMLExtractor {
continue;
}
- // If we're inside an \htmlrtf skip block, just advance past this
character.
- // We don't skip nested groups wholesale because \htmlrtf0 may
appear inside them.
+ // Inside \htmlrtf skip blocks: don't emit text, but track brace
+ // nesting so that font switches inside groups are properly scoped
+ // (pushed on '{', popped on '}') — just like the full RTF parser.
if (inHtmlRtfSkip) {
+ char sc = rtf.charAt(pos);
+ if (sc == '{') {
+ charsetStack.push(currentFontCharset);
+ } else if (sc == '}') {
+ if (!charsetStack.isEmpty()) {
+ currentFontCharset = charsetStack.pop();
+ }
+ } else if (sc == '\\' && pos + 1 < len && rtf.charAt(pos + 1)
== 'f'
+ && pos + 2 < len && Character.isDigit(rtf.charAt(pos +
2))) {
+ // Track \fN font switches within the current group
+ int numStart = pos + 2;
+ int numEnd = numStart;
+ while (numEnd < len &&
Character.isDigit(rtf.charAt(numEnd))) {
+ numEnd++;
+ }
+ if (numEnd >= len ||
!Character.isLetter(rtf.charAt(numEnd))) {
+ int fontId = Integer.parseInt(rtf.substring(numStart,
numEnd));
+ Charset fontCs = fontCharsets.get(fontId);
+ if (fontCs != null) {
+ currentFontCharset = fontCs;
+ }
+ }
+ }
pos++;
continue;
}
// Check for other { groups (nested RTF groups that aren't htmltag)
if (rtf.charAt(pos) == '{') {
- flushPendingBytes(pendingBytes, html, codePage);
+ flushPendingBytes(pendingBytes, html, currentFontCharset);
int end = findMatchingBrace(rtf, pos);
if (end > 0) {
pos = end + 1;
@@ -158,7 +209,7 @@ public class RTFEncapsulatedHTMLExtractor {
// Skip closing braces
if (rtf.charAt(pos) == '}') {
- flushPendingBytes(pendingBytes, html, codePage);
+ flushPendingBytes(pendingBytes, html, currentFontCharset);
pos++;
continue;
}
@@ -167,7 +218,7 @@ public class RTFEncapsulatedHTMLExtractor {
if (rtf.charAt(pos) == '\\' && pos + 1 < len) {
char next = rtf.charAt(pos + 1);
- // \'xx hex escape
+ // \'xx hex escape — decode using current font's charset
if (next == '\'' && pos + 3 < len) {
int hi = Character.digit(rtf.charAt(pos + 2), 16);
int lo = Character.digit(rtf.charAt(pos + 3), 16);
@@ -178,7 +229,7 @@ public class RTFEncapsulatedHTMLExtractor {
continue;
}
- flushPendingBytes(pendingBytes, html, codePage);
+ flushPendingBytes(pendingBytes, html, currentFontCharset);
// Escaped literals
if (next == '\\' || next == '{' || next == '}') {
@@ -196,7 +247,8 @@ public class RTFEncapsulatedHTMLExtractor {
}
String word = rtf.substring(wordStart, wordEnd);
- // Skip optional numeric parameter
+ // Parse optional numeric parameter
+ int paramStart = wordEnd;
int paramEnd = wordEnd;
if (paramEnd < len && (rtf.charAt(paramEnd) == '-'
|| Character.isDigit(rtf.charAt(paramEnd)))) {
@@ -222,6 +274,17 @@ public class RTFEncapsulatedHTMLExtractor {
case "line":
html.append("<br>");
break;
+ case "f":
+ // Font switch in inter-tag text — update current
charset
+ if (paramEnd > paramStart) {
+ int fontId = Integer.parseInt(
+ rtf.substring(paramStart, paramEnd));
+ Charset fontCs = fontCharsets.get(fontId);
+ if (fontCs != null) {
+ currentFontCharset = fontCs;
+ }
+ }
+ break;
default:
// Skip unknown control words
break;
@@ -242,12 +305,12 @@ public class RTFEncapsulatedHTMLExtractor {
}
// Regular text character between htmltag groups — this is HTML
content
- flushPendingBytes(pendingBytes, html, codePage);
+ flushPendingBytes(pendingBytes, html, currentFontCharset);
html.append(rtf.charAt(pos));
pos++;
}
- flushPendingBytes(pendingBytes, html, codePage);
+ flushPendingBytes(pendingBytes, html, currentFontCharset);
if (html.length() == 0) {
return null;
@@ -255,6 +318,64 @@ public class RTFEncapsulatedHTMLExtractor {
return html.toString();
}
+ /**
+ * Parse the RTF font table to build a mapping from font ID to charset.
+ */
+ static Map<Integer, Charset> parseFontTable(String rtf) {
+ Map<Integer, Charset> result = new HashMap<>();
+ int fontTblStart = rtf.indexOf("{\\fonttbl");
+ if (fontTblStart < 0) {
+ return result;
+ }
+ int fontTblEnd = findMatchingBrace(rtf, fontTblStart);
+ if (fontTblEnd < 0) {
+ return result;
+ }
+ String fontTable = rtf.substring(fontTblStart, fontTblEnd + 1);
+
+ int currentFontId = -1;
+ int pos = 0;
+ int ftLen = fontTable.length();
+
+ while (pos < ftLen) {
+ if (fontTable.charAt(pos) == '\\' && pos + 1 < ftLen
+ && Character.isLetter(fontTable.charAt(pos + 1))) {
+ int wordStart = pos + 1;
+ int wordEnd = wordStart;
+ while (wordEnd < ftLen &&
Character.isLetter(fontTable.charAt(wordEnd))) {
+ wordEnd++;
+ }
+ String word = fontTable.substring(wordStart, wordEnd);
+
+ // Parse numeric parameter
+ int paramStart = wordEnd;
+ int paramEnd = wordEnd;
+ if (paramEnd < ftLen && (fontTable.charAt(paramEnd) == '-'
+ || Character.isDigit(fontTable.charAt(paramEnd)))) {
+ paramEnd++;
+ while (paramEnd < ftLen &&
Character.isDigit(fontTable.charAt(paramEnd))) {
+ paramEnd++;
+ }
+ }
+
+ if ("f".equals(word) && paramEnd > paramStart) {
+ currentFontId =
Integer.parseInt(fontTable.substring(paramStart, paramEnd));
+ } else if ("fcharset".equals(word) && paramEnd > paramStart
+ && currentFontId >= 0) {
+ int fcharset =
Integer.parseInt(fontTable.substring(paramStart, paramEnd));
+ Charset cs = FCHARSET_MAP.get(fcharset);
+ if (cs != null) {
+ result.put(currentFontId, cs);
+ }
+ }
+ pos = paramEnd;
+ } else {
+ pos++;
+ }
+ }
+ return result;
+ }
+
/**
* Find the position of the closing brace that matches the opening brace at
* {@code openPos}. Handles nested groups and escaped braces.
@@ -443,6 +564,14 @@ public class RTFEncapsulatedHTMLExtractor {
}
}
+ private static Charset charsetOrNull(String name) {
+ try {
+ return Charset.forName(name);
+ } catch (Exception e) {
+ return null;
+ }
+ }
+
private static void flushPendingBytes(ByteArrayOutputStream pending,
StringBuilder out,
Charset codePage) {
if (pending.size() > 0) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index eb92465dbe..f4c4b50b73 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -34,7 +34,6 @@ import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
import org.apache.tika.TikaTest;
-import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
@@ -42,7 +41,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
@@ -282,13 +280,13 @@ public class OutlookParserTest extends TikaTest {
AUTO_DETECT_PARSER.parse(tis, handler, metadata, new
ParseContext());
}
- // As the HTML version should have been processed, ensure
- // we got some of the links
+ // The encapsulated HTML should have been extracted and parsed through
JSoupParser
String content = sw.toString().replaceAll("[\\r\\n\\t]+", "
").replaceAll(" +", " ");
assertNotContained("<dd>New Outlook User</dd>", content);
- assertContains("designed <i>to help you", content);
+ assertContains("designed to help you", content);
assertContains(
- "<p> <a
href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached
Exchange Mode</a>",
+ "<a
href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">" +
+ "Cached Exchange Mode</a>",
content);
// Link - check text around it, and the link itself
@@ -366,45 +364,13 @@ public class OutlookParserTest extends TikaTest {
}
- @Test
- public void testHandlingAllAlternativesBodies() throws Exception {
- //test that default only has one body
- List<Metadata> metadataList = getRecursiveMetadata("testMSG.msg");
- assertEquals(1, metadataList.size());
- assertContains("breaking your application",
- metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
- assertEquals("application/vnd.ms-outlook",
metadataList.get(0).get(Metadata.CONTENT_TYPE));
-
- //now try extracting all bodies
- //they should each appear as standalone attachments
- //with no content in the body of the msg level
- Parser p = TikaLoader.load(
- getConfigPath(OutlookParserTest.class,
"tika-config-extract-all-alternatives-msg.json"))
- .loadAutoDetectParser();
-
- metadataList = getRecursiveMetadata("testMSG.msg", p);
- assertEquals(3, metadataList.size());
-
- assertNotContained("breaking your application",
- metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
- assertEquals("application/vnd.ms-outlook",
- metadataList.get(0).get(Metadata.CONTENT_TYPE));
-
- assertContains("breaking your application",
- metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
- assertEquals("application/rtf",
metadataList.get(1).get(Metadata.CONTENT_TYPE));
-
- assertContains("breaking your application",
- metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
-
assertTrue(metadataList.get(2).get(Metadata.CONTENT_TYPE).startsWith("text/plain"));
-
- }
-
@Test
public void testNewlinesInRTFBody() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg",
AUTO_DETECT_PARSER,
BasicContentHandlerFactory.HANDLER_TYPE.BODY);
- assertContains("annuaires\t \n" + " Synchronisation",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+ String content =
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
+ assertContains("annuaires", content);
+ assertContains("Synchronisation", content);
}
@Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
index 0c1096f4f2..f09e6019fc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/msg/RTFEncapsulatedHTMLExtractorTest.java
@@ -18,10 +18,14 @@ package org.apache.tika.parser.microsoft.msg;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import java.nio.charset.Charset;
+import java.util.Map;
+
import org.junit.jupiter.api.Test;
public class RTFEncapsulatedHTMLExtractorTest {
@@ -214,4 +218,139 @@ public class RTFEncapsulatedHTMLExtractorTest {
assertNotNull(html);
assertEquals("line1<br>line2", html);
}
+
+ @Test
+ public void testParseFontTable() {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1
\\deff0{\\fonttbl\n" +
+ "{\\f0\\fswiss\\fcharset0 Arial;}\n" +
+ "{\\f1\\fmodern\\fcharset0 Courier New;}\n" +
+ "{\\f4\\fswiss\\fcharset134 Simsun;}\n" +
+ "{\\f5\\fswiss\\fcharset128 MS PGothic;}\n" +
+ "{\\f6\\fswiss\\fcharset162 Arial Tur;}\n" +
+ "}\n}";
+ Map<Integer, Charset> fonts =
RTFEncapsulatedHTMLExtractor.parseFontTable(rtf);
+ assertEquals(Charset.forName("windows-1252"), fonts.get(0));
+ assertEquals(Charset.forName("GBK"), fonts.get(4));
+ assertEquals(Charset.forName("MS932"), fonts.get(5));
+ assertEquals(Charset.forName("windows-1254"), fonts.get(6));
+ }
+
+ @Test
+ public void testParseFontTableEmpty() {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0 no font
table}";
+ Map<Integer, Charset> fonts =
RTFEncapsulatedHTMLExtractor.parseFontTable(rtf);
+ assertTrue(fonts.isEmpty());
+ }
+
+ @Test
+ public void testCjkFontCharsetTracking() {
+ // Simulates the real-world case: \ansicpg1252 but \fcharset134 (GBK)
font
+ // used for inter-tag CJK text. The \htmlrtf block switches to \f1
(GBK font)
+ // and the \'xx bytes after \htmlrtf0 should be decoded as GBK.
+ // \u53ef\u4ee5 = 可以, GBK bytes: BF C9 D2 D4
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1
\\deff0{\\fonttbl\n" +
+ "{\\f0\\fswiss\\fcharset0 Arial;}\n" +
+ "{\\f1\\fswiss\\fcharset134 Simsun;}\n" +
+ "}\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "\\htmlrtf {\\f1 \\htmlrtf0\n" +
+ "\\'bf\\'c9\\'d2\\'d4\n" +
+ "\\htmlrtf }\\htmlrtf0\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("\u53ef\u4ee5"),
+ "GBK bytes should be decoded as Chinese characters, got: " +
html);
+ }
+
+ @Test
+ public void testCjkFontSwitchBackToLatin() {
+ // After CJK text, font switches back to Latin font for ASCII content
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1
\\deff0{\\fonttbl\n" +
+ "{\\f0\\fswiss\\fcharset0 Arial;}\n" +
+ "{\\f1\\fswiss\\fcharset134 Simsun;}\n" +
+ "}\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "\\htmlrtf {\\f1 \\htmlrtf0\n" +
+ "\\'bf\\'c9\\'d2\\'d4\n" +
+ "\\htmlrtf\\f0 \\htmlrtf0\n" +
+ "Hello\n" +
+ "\\htmlrtf }\\htmlrtf0\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("\u53ef\u4ee5"),
+ "CJK should be decoded correctly, got: " + html);
+ assertTrue(html.contains("Hello"),
+ "Latin text after font switch should be preserved");
+ }
+
+ @Test
+ public void testHtmltagUsesDefaultCodePage() {
+ // Per MS-OXRTFEX spec, \'xx inside htmltag groups should use the
+ // default code page (\ansicpg), not the current font's charset.
+ // \'e9 in windows-1252 = é
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1
\\deff0{\\fonttbl\n" +
+ "{\\f0\\fswiss\\fcharset0 Arial;}\n" +
+ "{\\f1\\fswiss\\fcharset134 Simsun;}\n" +
+ "}\n" +
+ "\\htmlrtf {\\f1 \\htmlrtf0\n" +
+ "{\\*\\htmltag84 caf\\'e9}\n" +
+ "\\htmlrtf }\\htmlrtf0\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("café", html,
+ "htmltag content should use default code page, not font
charset");
+ }
+
+ @Test
+ public void testFontSwitchInBracedSkipBlockDoesNotPersist() {
+ // Reproduces the Hebrew/Chinese bug: a skip block contains {\f3\'a0}
+ // where \f3 is a Latin font (charset 0). The braces should scope the
+ // font switch so it doesn't affect subsequent inter-tag text.
+ // \u05d0\u05d2 = אג, windows-1255 bytes: E0 E2
+ String rtf = "{\\rtf1\\ansi\\ansicpg1255\\fromhtml1
\\deff0{\\fonttbl\n" +
+ "{\\f0\\fswiss\\fcharset177 David;}\n" +
+ "{\\f3\\fmodern\\fcharset0 Courier New;}\n" +
+ "}\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "\\htmlrtf {\\htmlrtf0\n" +
+ "\\'e0\\'e2\n" + // Hebrew: אג
+ "{\\*\\htmltag84 }" +
+ "\\htmlrtf {\\f3\\'a0}\\htmlrtf0\n" + // skip block with
braced \f3
+ "\\'e8\\'e5\\'e1\n" + // Hebrew: חוב
+ "\\htmlrtf }\\htmlrtf0\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("\u05d0\u05d2"),
+ "First Hebrew text should decode correctly, got: " + html);
+ // \xe8\xe5\xe1 in windows-1255 = טוב; in windows-1252 = èåá
+ assertTrue(html.contains("\u05d8\u05d5\u05d1"),
+ "Hebrew text after braced skip block should still use
windows-1255, got: " + html);
+ assertFalse(html.contains("\u00e8\u00e5\u00e1"),
+ "Should NOT decode as windows-1252 (mojibake), got: " + html);
+ }
+
+ @Test
+ public void testFontSwitchInInterTagText() {
+ // \f control word directly in inter-tag text (outside \htmlrtf blocks)
+ // should also update the current charset
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1
\\deff0{\\fonttbl\n" +
+ "{\\f0\\fswiss\\fcharset0 Arial;}\n" +
+ "{\\f1\\fswiss\\fcharset134 Simsun;}\n" +
+ "}\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "\\f1 \\'bf\\'c9\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "}";
+ String html =
RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("\u53ef"),
+ "Font switch in inter-tag text should affect charset, got: " +
html);
+ }
}