(tika) 01/02: jflex rtf parser - WIP

tallison Mon, 06 Apr 2026 06:34:53 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4710-rtf-attachments-in-html-decapsulation
in repository https://gitbox.apache.org/repos/asf/tika.git


commit c37df9d84ede2599ae3c96b00b6676f9b149f356
Author: tallison <[email protected]>
AuthorDate: Mon Apr 6 08:21:47 2026 -0400

    jflex rtf parser - WIP
---
 .../tika-parser-microsoft-module/pom.xml           |  17 +
 .../tika/parser/microsoft/OutlookExtractor.java    |   8 +-
 .../parser/microsoft/rtf/RTFObjDataParser.java     |   6 +-
 .../parser/microsoft/rtf/jflex/RTFCharsetMaps.java | 180 +++++++
 .../microsoft/rtf/jflex/RTFEmbeddedHandler.java    | 339 +++++++++++++
 .../parser/microsoft/rtf/jflex/RTFGroupState.java  |  76 +++
 .../microsoft/rtf/jflex/RTFHtmlDecapsulator.java   | 284 +++++++++++
 .../rtf/jflex/RTFObjDataStreamParser.java          | 534 +++++++++++++++++++++
 .../microsoft/rtf/jflex/RTFPictStreamParser.java   | 104 ++++
 .../tika/parser/microsoft/rtf/jflex/RTFState.java  | 336 +++++++++++++
 .../tika/parser/microsoft/rtf/jflex/RTFToken.java  |  96 ++++
 .../parser/microsoft/rtf/jflex/RTFTokenType.java   |  30 ++
 .../parser/microsoft/rtf/jflex/RTFTokenizer.jflex  | 129 +++++
 .../rtf/jflex/RTFEmbeddedHandlerTest.java          | 132 +++++
 .../rtf/jflex/RTFHtmlDecapsulatorTest.java         | 247 ++++++++++
 .../parser/microsoft/rtf/jflex/RTFStateTest.java   | 252 ++++++++++
 .../microsoft/rtf/jflex/RTFTokenizerTest.java      | 187 ++++++++
 17 files changed, 2951 insertions(+), 6 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
index 63cc9605cd..906914e132 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
@@ -125,6 +125,23 @@
   </dependencies>
   <build>
     <plugins>
+      <plugin>
+        <!-- Runs JFlex's "generate" goal over the lexer definitions found in src/main/jflex. -->
+        <groupId>de.jflex</groupId>
+        <artifactId>jflex-maven-plugin</artifactId>
+        <version>1.9.1</version>
+        <executions>
+          <execution>
+            <goals>
+              <goal>generate</goal>
+            </goals>
+            <configuration>
+              <lexDefinitions>
+                <lexDefinition>src/main/jflex</lexDefinition>
+              </lexDefinitions>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
       <plugin>
         <groupId>org.apache.rat</groupId>
         <artifactId>apache-rat-plugin</artifactId>
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index a2ef6de04f..be8c419cb7 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -84,8 +84,8 @@ import org.apache.tika.parser.html.HtmlEncodingDetector;
 import org.apache.tika.parser.html.JSoupParser;
 import org.apache.tika.parser.mailcommons.MailDateParser;
 import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor;
-import org.apache.tika.parser.microsoft.msg.RTFEncapsulatedHTMLExtractor;
 import org.apache.tika.parser.microsoft.rtf.RTFParser;
+import org.apache.tika.parser.microsoft.rtf.jflex.RTFHtmlDecapsulator;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.apache.tika.sax.BodyContentHandler;
@@ -600,8 +600,10 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
                         new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, 
Types.BINARY.getId(),
                                 chunk.getValue());
                 byte[] rtfData = rtf.getData();
-                // Try to extract encapsulated HTML — returns null if not 
present
-                String html = RTFEncapsulatedHTMLExtractor.extract(rtfData);
+                // Try to extract encapsulated HTML + embedded objects in one 
pass
+                RTFHtmlDecapsulator decapsulator =
+                        new RTFHtmlDecapsulator(xhtml, parseContext, 20 * 
1024);
+                String html = decapsulator.extract(rtfData);
                 if (html != null) {
                     parseHtmlString(html, xhtml, contentIdNames);
                     parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
index ff4c12061e..48f88e2f7f 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
@@ -51,12 +51,12 @@ import 
org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
  * 
http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf
  * and for granting permission to use his code in Tika.
  */
-class RTFObjDataParser {
+public class RTFObjDataParser {
 
     private final static String WIN_ASCII = "WINDOWS-1252";
     private final int memoryLimitInKb;
 
-    RTFObjDataParser(int memoryLimitInKb) {
+    public RTFObjDataParser(int memoryLimitInKb) {
         this.memoryLimitInKb = memoryLimitInKb;
     }
 
@@ -81,7 +81,7 @@ class RTFObjDataParser {
      * @return byte[] for contents of obj data
      * @throws IOException
      */
-    protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger 
unknownFilenameCount)
+    public byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger 
unknownFilenameCount)
             throws IOException, TikaException {
         UnsynchronizedByteArrayInputStream is = 
UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get();
         long version = readUInt(is);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java
new file mode 100644
index 0000000000..aaac2552ac
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.utils.CharsetUtils;
+
+/**
+ * Shared charset maps for RTF parsing. Maps RTF {@code \fcharsetN} and
+ * {@code \ansicpgN} values to Java {@link Charset} instances.
+ *
+ * <p>Extracted from the original {@code TextExtractor} so both the JFlex-based
+ * parser and decapsulator can reuse them.</p>
+ */
+public final class RTFCharsetMaps {
+
+    public static final Charset WINDOWS_1252 = Charset.forName("windows-1252");
+
+    /**
+     * Maps {@code \fcharsetN} values to Java charsets.
+     * The RTF font table uses these to declare per-font character encodings.
+     */
+    public static final Map<Integer, Charset> FCHARSET_MAP;
+
+    /**
+     * Maps {@code \ansicpgN} values to Java charsets.
+     * This is the global ANSI code page declared in the RTF header.
+     */
+    public static final Map<Integer, Charset> ANSICPG_MAP;
+
+    static {
+        // Populate a mutable map, then publish it as the unmodifiable FCHARSET_MAP.
+        Map<Integer, Charset> fcharset = new HashMap<>();
+
+        fcharset.put(0, WINDOWS_1252);                   // ANSI
+        // charset 1 = Default, charset 2 = Symbol (no entry is registered for either)
+
+        // NOTE(review): 88 and 89 are labelled as Mac charsets but map to Windows code
+        // pages, whereas ANSICPG_MAP uses x-MacCentralEurope / MacCyrillic for the
+        // corresponding Mac code pages -- confirm this is intentional.
+        fcharset.put(77, getCharset("MacRoman"));        // Mac Roman
+        fcharset.put(78, getCharset("Shift_JIS"));       // Mac Shift Jis
+        fcharset.put(79, getCharset("ms949"));            // Mac Hangul
+        fcharset.put(80, getCharset("GB2312"));           // Mac GB2312
+        fcharset.put(81, getCharset("Big5"));             // Mac Big5
+        fcharset.put(82, getCharset("johab"));            // Mac Johab (old)
+        fcharset.put(83, getCharset("MacHebrew"));        // Mac Hebrew
+        fcharset.put(84, getCharset("MacArabic"));        // Mac Arabic
+        fcharset.put(85, getCharset("MacGreek"));         // Mac Greek
+        fcharset.put(86, getCharset("MacTurkish"));       // Mac Turkish
+        fcharset.put(87, getCharset("MacThai"));          // Mac Thai
+        fcharset.put(88, getCharset("cp1250"));           // Mac East Europe
+        fcharset.put(89, getCharset("cp1251"));           // Mac Russian
+
+        fcharset.put(128, getCharset("MS932"));           // Shift JIS
+        fcharset.put(129, getCharset("ms949"));           // Hangul
+        fcharset.put(130, getCharset("ms1361"));          // Johab
+        fcharset.put(134, getCharset("ms936"));           // GB2312
+        fcharset.put(136, getCharset("ms950"));           // Big5
+        fcharset.put(161, getCharset("cp1253"));          // Greek
+        fcharset.put(162, getCharset("cp1254"));          // Turkish
+        fcharset.put(163, getCharset("cp1258"));          // Vietnamese
+        fcharset.put(177, getCharset("cp1255"));          // Hebrew
+        fcharset.put(178, getCharset("cp1256"));          // Arabic
+        fcharset.put(186, getCharset("cp1257"));          // Baltic
+
+        fcharset.put(204, getCharset("cp1251"));          // Russian
+        fcharset.put(222, getCharset("ms874"));           // Thai
+        fcharset.put(238, getCharset("cp1250"));          // Eastern European
+        fcharset.put(254, getCharset("cp437"));           // PC 437
+        fcharset.put(255, getCharset("cp850"));           // OEM
+
+        FCHARSET_MAP = Collections.unmodifiableMap(fcharset);
+    }
+
+    static {
+        // Populate a mutable map, then publish it as the unmodifiable ANSICPG_MAP.
+        // NOTE(review): names such as "windows-709" or "windows-57003" are probably not
+        // available on a stock JVM; getCharset() substitutes US-ASCII for any name it
+        // cannot resolve, so those code pages would decode as ASCII -- confirm.
+        Map<Integer, Charset> ansicpg = new HashMap<>();
+
+        ansicpg.put(437, getCharset("CP437"));            // US IBM
+        ansicpg.put(708, getCharset("ISO-8859-6"));       // Arabic (ASMO 708)
+        ansicpg.put(709, getCharset("windows-709"));      // Arabic (ASMO 449+)
+        ansicpg.put(710, getCharset("windows-710"));      // Arabic (transparent)
+        ansicpg.put(711, getCharset("windows-711"));      // Arabic (Nafitha)
+        ansicpg.put(720, getCharset("windows-720"));      // Arabic (transparent ASMO)
+        ansicpg.put(819, getCharset("CP819"));            // Windows 3.1 (US/Western)
+        ansicpg.put(850, getCharset("CP850"));            // IBM Multilingual
+        ansicpg.put(852, getCharset("CP852"));            // Eastern European
+        ansicpg.put(860, getCharset("CP860"));            // Portuguese
+        ansicpg.put(862, getCharset("CP862"));            // Hebrew
+        ansicpg.put(863, getCharset("CP863"));            // French Canadian
+        ansicpg.put(864, getCharset("CP864"));            // Arabic
+        ansicpg.put(865, getCharset("CP865"));            // Norwegian
+        ansicpg.put(866, getCharset("CP866"));            // Soviet Union
+        ansicpg.put(874, getCharset("MS874"));            // Thai
+        ansicpg.put(932, getCharset("MS932"));            // Japanese
+        ansicpg.put(936, getCharset("MS936"));            // Simplified Chinese
+        ansicpg.put(949, getCharset("CP949"));            // Korean
+        ansicpg.put(950, getCharset("CP950"));            // Traditional Chinese
+        ansicpg.put(1250, getCharset("CP1250"));          // Eastern European
+        ansicpg.put(1251, getCharset("CP1251"));          // Cyrillic
+        ansicpg.put(1252, getCharset("CP1252"));          // Western European
+        ansicpg.put(1253, getCharset("CP1253"));          // Greek
+        ansicpg.put(1254, getCharset("CP1254"));          // Turkish
+        ansicpg.put(1255, getCharset("CP1255"));          // Hebrew
+        ansicpg.put(1256, getCharset("CP1256"));          // Arabic
+        ansicpg.put(1257, getCharset("CP1257"));          // Baltic
+        ansicpg.put(1258, getCharset("CP1258"));          // Vietnamese
+        ansicpg.put(1361, getCharset("x-Johab"));         // Johab
+        ansicpg.put(10000, getCharset("MacRoman"));       // Mac Roman
+        ansicpg.put(10001, getCharset("Shift_JIS"));      // Mac Japan
+        ansicpg.put(10004, getCharset("MacArabic"));      // Mac Arabic
+        ansicpg.put(10005, getCharset("MacHebrew"));      // Mac Hebrew
+        ansicpg.put(10006, getCharset("MacGreek"));       // Mac Greek
+        ansicpg.put(10007, getCharset("MacCyrillic"));    // Mac Cyrillic
+        ansicpg.put(10029, getCharset("x-MacCentralEurope")); // Mac Latin2
+        ansicpg.put(10081, getCharset("MacTurkish"));     // Mac Turkish
+        ansicpg.put(57002, getCharset("x-ISCII91"));      // Devanagari
+        ansicpg.put(57003, getCharset("windows-57003"));  // Bengali
+        ansicpg.put(57004, getCharset("windows-57004"));  // Tamil
+        ansicpg.put(57005, getCharset("windows-57005"));  // Telugu
+        ansicpg.put(57006, getCharset("windows-57006"));  // Assamese
+        ansicpg.put(57007, getCharset("windows-57007"));  // Oriya
+        ansicpg.put(57008, getCharset("windows-57008"));  // Kannada
+        ansicpg.put(57009, getCharset("windows-57009"));  // Malayalam
+        ansicpg.put(57010, getCharset("windows-57010"));  // Gujarati
+        ansicpg.put(57011, getCharset("windows-57011"));  // Punjabi
+
+        ANSICPG_MAP = Collections.unmodifiableMap(ansicpg);
+    }
+
+    private RTFCharsetMaps() {
+    }
+
+    /**
+     * Resolve a charset by name, falling back to US-ASCII if unavailable.
+     *
+     * <p>Lookup is delegated to {@code CharsetUtils.forName}; an
+     * {@link IllegalArgumentException} from that call is swallowed and
+     * US-ASCII is returned instead, so this method never throws for a bad name.
+     * Note that this fallback differs from {@code resolveCodePage}, which falls
+     * back to windows-1252.</p>
+     *
+     * @param name charset name or alias to look up
+     * @return the resolved charset, or {@link StandardCharsets#US_ASCII} if lookup failed
+     */
+    static Charset getCharset(String name) {
+        try {
+            return CharsetUtils.forName(name);
+        } catch (IllegalArgumentException e) {
+            return StandardCharsets.US_ASCII;
+        }
+    }
+
+    /**
+     * Resolve an ANSI code page number to a Java Charset.
+     * Tries the ANSICPG_MAP first, then falls back to {@code windows-N} and {@code cpN}.
+     * Returns {@code WINDOWS_1252} if nothing matches.
+     *
+     * @param cpNumber the numeric code page (e.g. the argument of {@code \ansicpgN})
+     * @return the matching charset; never {@code null}
+     */
+    public static Charset resolveCodePage(int cpNumber) {
+        Charset cs = ANSICPG_MAP.get(cpNumber);
+        if (cs != null) {
+            return cs;
+        }
+        // Charset.forName signals both malformed and unsupported names with
+        // IllegalArgumentException subclasses, so that is all we need to catch.
+        for (String prefix : new String[] {"windows-", "cp"}) {
+            try {
+                return Charset.forName(prefix + cpNumber);
+            } catch (IllegalArgumentException ignored) {
+                // not known under this name; try the next candidate
+            }
+        }
+        return WINDOWS_1252;
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java
new file mode 100644
index 0000000000..dd8b052b9e
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java
@@ -0,0 +1,339 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.FilenameUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+
+/**
+ * Handles embedded objects and pictures within the JFlex-based RTF token 
stream.
+ *
+ * <p>Uses streaming parsers ({@link RTFObjDataStreamParser} and
+ * {@link RTFPictStreamParser}) so that large embedded objects are written
+ * to temp files rather than buffered entirely in memory.</p>
+ */
+public class RTFEmbeddedHandler {
+
+    private static final String EMPTY_STRING = "";
+
+    private final ContentHandler handler;
+    private final ParseContext context;
+    private final EmbeddedDocumentUtil embeddedDocumentUtil;
+    private final long maxBytes;
+
+    private boolean inObject = false;
+    private boolean isPictBitmap = false;
+    private int hi = -1;
+    private int thumbCount = 0;
+    private final AtomicInteger unknownFilenameCount = new AtomicInteger();
+
+    // Shape property metadata
+    private String sn = EMPTY_STRING;
+    private String sv = EMPTY_STRING;
+    private final StringBuilder metadataBuffer = new StringBuilder();
+
+    private Metadata metadata;
+    private EmbState state = EmbState.NADA;
+
+    // Streaming parsers — one active at a time
+    private RTFObjDataStreamParser objParser;
+    private RTFPictStreamParser pictParser;
+
+    /**
+     * Creates a handler that reports embedded documents to {@code handler}.
+     *
+     * @param handler         receives the output of embedded-document parsing
+     * @param context         parse context used to build the {@link EmbeddedDocumentUtil}
+     *                        and fresh {@link Metadata} instances
+     * @param memoryLimitInKb limit in kilobytes, converted to bytes and handed to the
+     *                        streaming parsers; values {@code <= 0} are passed on as
+     *                        {@code -1} (presumably "no limit" -- confirm in the parsers)
+     */
+    public RTFEmbeddedHandler(ContentHandler handler, ParseContext context,
+                              int memoryLimitInKb) {
+        this.handler = handler;
+        this.context = context;
+        this.embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+        // widen before multiplying so large KB limits cannot overflow int
+        this.maxBytes = memoryLimitInKb > 0 ? (long) memoryLimitInKb * 1024 : -1;
+        this.metadata = Metadata.newInstance(context);
+    }
+
+    /**
+     * Process a token for embedded object/pict handling.
+     * Call this AFTER {@link RTFState#processToken(RTFToken)} has run.
+     *
+     * <p>Group closes finish whatever the closed group was collecting (object data,
+     * picture, or shape property); control words start a new collection; text tokens
+     * are fed either to the hex decoder or to the shape-property buffer.</p>
+     *
+     * @param tok the current token
+     * @param rtfState the RTF state (already updated for this token)
+     * @param closingGroup for GROUP_CLOSE tokens, the group state that just closed.
+     *                     Null for other token types.
+     * @throws IOException if a streaming parser fails while starting or receiving data
+     * @throws SAXException if the content handler rejects embedded output
+     * @throws TikaException if embedded-object processing fails
+     */
+    public void processToken(RTFToken tok, RTFState rtfState, RTFGroupState closingGroup)
+            throws IOException, SAXException, TikaException {
+        RTFTokenType type = tok.getType();
+        RTFGroupState group = rtfState.getCurrentGroup();
+
+        switch (type) {
+            case GROUP_CLOSE:
+                if (closingGroup == null) {
+                    break;
+                }
+                // Only one of these completions fires per closed group.
+                if (closingGroup.objdata) {
+                    handleCompletedObjData();
+                } else if (closingGroup.pictDepth == 1) {
+                    handleCompletedPict();
+                } else if (closingGroup.sn) {
+                    endSN();
+                } else if (closingGroup.sv) {
+                    endSV();
+                } else if (closingGroup.sp) {
+                    endSP();
+                }
+                // Checked independently of the chain above.
+                if (closingGroup.object) {
+                    inObject = false;
+                }
+                break;
+
+            case CONTROL_WORD:
+                String name = tok.getName();
+                switch (name) {
+                    case "object":
+                        inObject = true;
+                        break;
+                    case "objdata":
+                        startObjData();
+                        break;
+                    case "pict":
+                        startPict();
+                        break;
+                    case "sn":
+                        startSN();
+                        break;
+                    case "sv":
+                        startSV();
+                        break;
+                    case "wbitmap":
+                        isPictBitmap = true;
+                        break;
+                    default:
+                        // every other control word is irrelevant to embedded handling
+                        break;
+                }
+                break;
+
+            case TEXT:
+                if (group.objdata || group.pictDepth == 1) {
+                    // Payload is hex text; decode it nibble by nibble.
+                    String text = tok.getName();
+                    for (int i = 0; i < text.length(); i++) {
+                        writeHexChar(text.charAt(i));
+                    }
+                } else if (group.sn || group.sv) {
+                    // Shape property name/value text is collected verbatim.
+                    metadataBuffer.append(tok.getName());
+                }
+                break;
+
+            case HEX_ESCAPE:
+                if (group.sn || group.sv) {
+                    metadataBuffer.append((char) tok.getHexValue());
+                }
+                break;
+
+            default:
+                break;
+        }
+    }
+
+    // --- Lifecycle for objdata ---
+
+    /**
+     * Begins collecting an {@code objdata} payload: switches to the OBJDATA state and
+     * creates fresh metadata and a new streaming parser.
+     *
+     * @throws IOException if the previous parser cannot be closed or the new one
+     *                     cannot be created
+     */
+    private void startObjData() throws IOException {
+        if (objParser != null) {
+            // A previous objdata never reached its group close (malformed or repeated
+            // control word). Release it instead of silently dropping the reference.
+            // NOTE(review): assumes close() declares at most IOException -- confirm.
+            objParser.close();
+        }
+        state = EmbState.OBJDATA;
+        metadata = Metadata.newInstance(context);
+        objParser = new RTFObjDataStreamParser(maxBytes);
+    }
+
+    /**
+     * Finishes the current {@code objdata} payload: asks the streaming parser for the
+     * completed stream, hands it to {@link #extractObj}, then closes the parser and
+     * resets per-object state. An {@link IOException} raised while completing or
+     * extracting is recorded on the metadata rather than propagated.
+     */
+    private void handleCompletedObjData() throws IOException, SAXException, TikaException {
+        if (objParser == null) {
+            reset();
+            return;
+        }
+        try {
+            // try-with-resources skips close() when the resource is null
+            try (TikaInputStream tis = objParser.onComplete(metadata, unknownFilenameCount)) {
+                if (tis != null) {
+                    extractObj(tis, metadata);
+                }
+            }
+        } catch (IOException e) {
+            EmbeddedDocumentUtil.recordException(e, metadata);
+        } finally {
+            try {
+                objParser.close();
+            } finally {
+                // always drop the parser and reset, even if close() itself fails
+                objParser = null;
+                reset();
+            }
+        }
+    }
+
+    // --- Lifecycle for pict ---
+
+    /**
+     * Begins collecting a {@code pict} payload: switches to the PICT state and
+     * creates fresh metadata and a new streaming parser.
+     *
+     * @throws IOException if the previous parser cannot be closed or the new one
+     *                     cannot be created
+     */
+    private void startPict() throws IOException {
+        if (pictParser != null) {
+            // A previous pict never reached its group close (malformed or nested
+            // control word). Release it instead of silently dropping the reference.
+            // NOTE(review): assumes close() declares at most IOException -- confirm.
+            pictParser.close();
+        }
+        state = EmbState.PICT;
+        metadata = Metadata.newInstance(context);
+        pictParser = new RTFPictStreamParser(maxBytes);
+    }
+
+    /**
+     * Finishes the current {@code pict} payload: obtains the completed picture file
+     * from the streaming parser, fills in name/thumbnail/bitmap metadata, hands the
+     * file to {@link #extractObj}, then closes the parser and resets per-object state.
+     * An {@link IOException} raised along the way is recorded on the metadata rather
+     * than propagated.
+     */
+    private void handleCompletedPict() throws IOException, SAXException, TikaException {
+        if (pictParser == null) {
+            reset();
+            return;
+        }
+        try {
+            Path pictFile = pictParser.onComplete();
+            if (pictFile != null) {
+                // The shape property "wzDescription", when present, supplies the name.
+                String filePath =
+                        metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription");
+                if (filePath != null && !filePath.isEmpty()) {
+                    metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filePath);
+                    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+                            FilenameUtils.getName(filePath));
+                    metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, filePath);
+                }
+                // A picture seen while inside an object group is flagged as a thumbnail.
+                metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
+                if (isPictBitmap) {
+                    metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+                            "image/x-rtf-raw-bitmap");
+                }
+
+                try (TikaInputStream tis = TikaInputStream.get(pictFile)) {
+                    extractObj(tis, metadata);
+                }
+            }
+        } catch (IOException e) {
+            EmbeddedDocumentUtil.recordException(e, metadata);
+        } finally {
+            try {
+                pictParser.close();
+            } finally {
+                // always drop the parser and reset, even if close() itself fails
+                pictParser = null;
+                reset();
+            }
+        }
+    }
+
+    // --- Shape property metadata ---
+
+    /** Starts a shape property name: clears the buffer and seeds it with the pict metadata prefix. */
+    private void startSN() {
+        metadataBuffer.setLength(0);
+        metadataBuffer.append(RTFMetadata.RTF_PICT_META_PREFIX);
+    }
+
+    /** Ends a shape property name: stores the buffered text (prefix included) as the pending key. */
+    private void endSN() {
+        sn = metadataBuffer.toString();
+    }
+
+    /** Starts a shape property value: clears the buffer. */
+    private void startSV() {
+        metadataBuffer.setLength(0);
+    }
+
+    /** Ends a shape property value: stores the buffered text as the pending value. */
+    private void endSV() {
+        sv = metadataBuffer.toString();
+    }
+
+    /** Ends a shape property group: adds the pending name/value pair to the current metadata. */
+    private void endSP() {
+        metadata.add(sn, sv);
+    }
+
+    // --- Hex pair decoding ---
+
+    /**
+     * Feeds one character of hex text into the pair decoder. Non-hex characters are
+     * ignored. The first digit of a pair is parked in {@code hi}; the second digit
+     * completes the byte, which is forwarded to whichever streaming parser is active
+     * (object data takes precedence over picture).
+     *
+     * @param b the character to consume
+     */
+    private void writeHexChar(int b) throws IOException, TikaException {
+        if (!isHexChar(b)) {
+            return;
+        }
+        if (hi == -1) {
+            // high nibble: remember it and wait for the low nibble
+            hi = 16 * hexValue(b);
+            return;
+        }
+        int decoded = hi + hexValue(b);
+        hi = -1;
+        if (objParser != null) {
+            objParser.onByte(decoded);
+        } else if (pictParser != null) {
+            pictParser.onByte(decoded);
+        }
+    }
+
+    // --- Common extraction ---
+
+    /**
+     * Hands a completed embedded stream to the embedded-document machinery.
+     * Records the stream length, synthesizes a resource name when none was set
+     * (thumbnail-prefixed for pictures inside an object, embedded-prefixed otherwise),
+     * and parses the stream. An {@link IOException} from the embedded parse is
+     * recorded on the metadata rather than propagated.
+     *
+     * @param tis  the embedded content
+     * @param meta metadata describing the embedded content; updated in place
+     */
+    private void extractObj(TikaInputStream tis, Metadata meta)
+            throws SAXException, IOException, TikaException {
+        meta.set(Metadata.CONTENT_LENGTH, Long.toString(tis.getLength()));
+
+        if (embeddedDocumentUtil.shouldParseEmbedded(meta)) {
+            if (meta.get(TikaCoreProperties.RESOURCE_NAME_KEY) == null) {
+                String extension = embeddedDocumentUtil.getExtension(tis, meta);
+                if (inObject && state == EmbState.PICT) {
+                    meta.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+                            EmbeddedDocumentUtil.EmbeddedResourcePrefix.THUMBNAIL.getPrefix()
+                                    + "-" + thumbCount++ + extension);
+                    meta.set(RTFMetadata.THUMBNAIL, "true");
+                } else {
+                    meta.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+                            EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED.getPrefix()
+                                    + "-" + unknownFilenameCount.getAndIncrement()
+                                    + extension);
+                }
+                // mark the name as synthesized rather than taken from the document
+                meta.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
+            }
+            try {
+                embeddedDocumentUtil.parseEmbedded(
+                        tis, new EmbeddedContentHandler(handler), meta, true);
+            } catch (IOException e) {
+                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, meta);
+            }
+        }
+    }
+
+    /**
+     * Returns the handler to its idle state after an object or picture has been
+     * processed: fresh metadata, no pending hex nibble, empty shape-property
+     * name/value and buffer, bitmap flag cleared. Does not touch {@code inObject},
+     * the counters, or the streaming parsers.
+     */
+    private void reset() {
+        state = EmbState.NADA;
+        metadata = Metadata.newInstance(context);
+        hi = -1;
+        sn = EMPTY_STRING;
+        sv = EMPTY_STRING;
+        metadataBuffer.setLength(0);
+        isPictBitmap = false;
+    }
+
+    /**
+     * Tells whether {@code ch} is an ASCII hexadecimal digit.
+     *
+     * @param ch the character to test
+     * @return true for {@code 0-9}, {@code a-f} and {@code A-F}
+     */
+    private static boolean isHexChar(int ch) {
+        return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
+    }
+
+    /**
+     * Converts a hexadecimal digit to its numeric value.
+     *
+     * <p>Callers are expected to have checked the character with {@code isHexChar}
+     * first; for such characters the result is in {@code 0..15}.</p>
+     *
+     * @param ch a hexadecimal digit
+     * @return the digit's value, or {@code -1} if {@code ch} is not a valid digit
+     */
+    private static int hexValue(int ch) {
+        // Character.digit covers 0-9, a-f and A-F and rejects everything past 'f',
+        // unlike a hand-rolled 'a'..'z' range check.
+        return Character.digit(ch, 16);
+    }
+
+    /** What kind of embedded payload, if any, is currently being collected. */
+    private enum EmbState {
+        PICT,    // set by startPict()
+        OBJDATA, // set by startObjData()
+        NADA     // idle; restored by reset()
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java
new file mode 100644
index 0000000000..c5f9f8c444
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.nio.charset.Charset;
+
+/**
+ * State associated with a single RTF group ({@code \{ ... \}}).
+ * <p>
+ * When a new group opens, the current state is pushed onto the stack and a
+ * child state is created that inherits the parent's properties. When the group
+ * closes, the state is popped.
+ * <p>
+ * Fields are package-private and mutable; they are read directly by
+ * {@code RTFEmbeddedHandler}.
+ */
+public class RTFGroupState {
+
+    /** Nesting depth (0 = root). */
+    int depth;
+
+    /** Current font charset, set by {@code \fN} if the font table maps it. May be null. */
+    Charset fontCharset;
+
+    /** Current font ID, set by {@code \fN}. -1 if unset. */
+    int fontId = -1;
+
+    /** Number of ANSI chars to skip after a unicode escape (ucN control word). Default 1. */
+    int ucSkip = 1;
+
+    /** True if this group's content should be ignored (e.g. {@code \*} destination). */
+    boolean ignore;
+
+    /** True if bold. */
+    boolean bold;
+
+    /** True if italic. */
+    boolean italic;
+
+    // Embedded object / picture state
+    /** True for an object-data group; RTFEmbeddedHandler finishes the object when it closes. */
+    boolean objdata;
+    /**
+     * Picture nesting counter: 0 when not inside a picture; children of a group with a
+     * positive value get parent + 1. RTFEmbeddedHandler treats the value 1 as the
+     * picture group itself.
+     */
+    int pictDepth;
+    /** True for a shape property group; its close commits the pending name/value pair. */
+    boolean sp;
+    /** True for a shape property name group. */
+    boolean sn;
+    /** True for a shape property value group. */
+    boolean sv;
+    /** True for an object group; its close clears the handler's in-object flag. */
+    boolean object;
+    /** Presumably marks an annotation group; not read in the code visible here -- confirm. */
+    boolean annotation;
+
+    /** Create a root group state with defaults. */
+    public RTFGroupState() {
+    }
+
+    /**
+     * Create a child group state inheriting from the parent.
+     *
+     * @param parent the enclosing group's state; must not be null
+     */
+    public RTFGroupState(RTFGroupState parent) {
+        this.depth = parent.depth + 1;
+        this.fontCharset = parent.fontCharset;
+        this.fontId = parent.fontId;
+        this.ucSkip = parent.ucSkip;
+        this.ignore = parent.ignore;
+        this.bold = parent.bold;
+        this.italic = parent.italic;
+        // keep counting depth only while inside a picture; otherwise stay at 0
+        this.pictDepth = parent.pictDepth > 0 ? parent.pictDepth + 1 : 0;
+        // objdata, sp, sn, sv, object, annotation are NOT inherited
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java
new file mode 100644
index 0000000000..9c5d0cee90
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Extracts the original HTML from an RTF document that contains encapsulated 
HTML
+ * (as indicated by the {@code \fromhtml1} control word), using a JFlex-based 
tokenizer
+ * and shared {@link RTFState} for font/codepage tracking.
+ *
+ * <p>Embedded objects and pictures are extracted in the same pass via
+ * {@link RTFEmbeddedHandler}.</p>
+ */
+public class RTFHtmlDecapsulator {
+
+    private static final int DEFAULT_MEMORY_LIMIT_KB = 20 * 1024; // 20 MB; used by the no-arg ctor
+
+    private final ContentHandler handler; // may be null; extract() then skips embedded extraction
+    private final ParseContext context; // may be null; extract() then skips embedded extraction
+    private final int memoryLimitInKb; // handed to RTFEmbeddedHandler by extract()
+
+    /**
+     * Creates a decapsulator that extracts embedded objects through the given handler.
+     *
+     * <p>If either {@code handler} or {@code context} is {@code null},
+     * {@link #extract(byte[])} does not create an {@link RTFEmbeddedHandler} and
+     * only the HTML is recovered.</p>
+     *
+     * @param handler the content handler for embedded document extraction
+     * @param context the parse context (provides EmbeddedDocumentExtractor, etc.)
+     * @param memoryLimitInKb max bytes per embedded object (in KB), or -1 for unlimited
+     */
+    public RTFHtmlDecapsulator(ContentHandler handler, ParseContext context,
+                               int memoryLimitInKb) {
+        this.handler = handler;
+        this.context = context;
+        this.memoryLimitInKb = memoryLimitInKb;
+    }
+
+    /**
+     * Creates a decapsulator with default memory limit and no embedded extraction:
+     * handler and context are both {@code null}, so {@link #extract(byte[])} only
+     * returns the HTML.
+     */
+    public RTFHtmlDecapsulator() {
+        this(null, null, DEFAULT_MEMORY_LIMIT_KB);
+    }
+
+    /**
+     * Extracts the HTML content from an encapsulated-HTML RTF document.
+     * Embedded objects and pictures are extracted as a side effect through
+     * the {@link ContentHandler} provided at construction time.
+     *
+     * <p>The RTF is tokenized once. Hex-escaped bytes are buffered and decoded with
+     * the charset that is current when they are flushed; they are flushed before
+     * any event that can change that charset (group close, font switch) and before
+     * any other output is appended.</p>
+     *
+     * @param rtfBytes the decompressed RTF bytes
+     * @return the extracted HTML string, or {@code null} if the input is null or
+     *         empty, the RTF does not contain encapsulated HTML, or no HTML text
+     *         was produced
+     * @throws IOException if the tokenizer encounters an I/O error
+     * @throws SAXException if token processing reports a SAX error; it is not
+     *         caught here
+     * @throws TikaException if token processing fails outside the embedded handler
+     *         (failures inside the embedded handler are swallowed, see below)
+     */
+    public String extract(byte[] rtfBytes) throws IOException, SAXException, TikaException {
+        if (rtfBytes == null || rtfBytes.length == 0) {
+            return null;
+        }
+
+        String rtf = new String(rtfBytes, StandardCharsets.US_ASCII);
+
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+        RTFEmbeddedHandler embHandler = (handler != null && context != null)
+                ? new RTFEmbeddedHandler(handler, context, memoryLimitInKb)
+                : null;
+
+        StringBuilder html = new StringBuilder(rtf.length() / 2);
+        ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();
+
+        boolean foundFromHtml = false;
+        boolean foundHtmlTag = false;
+        boolean inHtmlRtfSkip = false;
+
+        boolean sawIgnorable = false;
+        int htmlTagDepth = -1;
+        boolean inHtmlTag = false;
+
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            RTFTokenType type = tok.getType();
+
+            if (type == RTFTokenType.EOF) {
+                break;
+            }
+
+            // Flush pending bytes before charset-changing events
+            if (type == RTFTokenType.GROUP_CLOSE
+                    || (type == RTFTokenType.CONTROL_WORD && "f".equals(tok.getName())
+                        && tok.hasParameter())) {
+                flushPendingBytes(pendingBytes, html, state);
+            }
+
+            // Let RTFState handle group stack, font table, codepage, unicode skip
+            boolean consumed = state.processToken(tok);
+
+            // Let embedded handler process objdata/pict/sp in the same pass
+            if (embHandler != null && !consumed) {
+                RTFGroupState closingGroup =
+                        (type == RTFTokenType.GROUP_CLOSE) ? state.getLastClosedGroup() : null;
+                try {
+                    embHandler.processToken(tok, state, closingGroup);
+                } catch (TikaException | IOException ignored) {
+                    // Deliberately ignored (nothing is recorded): a bad embedded
+                    // object must not kill decapsulation of the HTML body.
+                }
+            }
+
+            RTFGroupState group = state.getCurrentGroup();
+
+            // Skip tokens that are part of objdata/pict hex streams
+            if (!consumed && (group.objdata || group.pictDepth > 0)) {
+                // Embedded handler already consumed these
+                continue;
+            }
+
+            switch (type) {
+                case GROUP_OPEN:
+                    sawIgnorable = false;
+                    break;
+
+                case GROUP_CLOSE:
+                    // The group that carried the htmltag destination has closed
+                    if (inHtmlTag && state.getDepth() < htmlTagDepth) {
+                        flushPendingBytes(pendingBytes, html, state);
+                        inHtmlTag = false;
+                        htmlTagDepth = -1;
+                    }
+                    break;
+
+                case CONTROL_SYMBOL:
+                    if ("*".equals(tok.getName())) {
+                        sawIgnorable = true;
+                    }
+                    if (!foundHtmlTag || inHtmlRtfSkip) {
+                        break;
+                    }
+                    if (inHtmlTag || isContentArea(htmlTagDepth)) {
+                        // Escaped RTF syntax characters stand for themselves
+                        String sym = tok.getName();
+                        if ("{".equals(sym) || "}".equals(sym) || "\\".equals(sym)) {
+                            flushPendingBytes(pendingBytes, html, state);
+                            html.append(sym);
+                        }
+                    }
+                    break;
+
+                case CONTROL_WORD:
+                    if (consumed) {
+                        break;
+                    }
+                    String name = tok.getName();
+
+                    if ("fromhtml".equals(name)) {
+                        foundFromHtml = true;
+                        break;
+                    }
+
+                    if ("htmltag".equals(name) && sawIgnorable) {
+                        if (!foundFromHtml) {
+                            break;
+                        }
+                        foundHtmlTag = true;
+                        flushPendingBytes(pendingBytes, html, state);
+                        inHtmlTag = true;
+                        htmlTagDepth = state.getDepth();
+                        break;
+                    }
+
+                    if ("htmlrtf".equals(name)) {
+                        flushPendingBytes(pendingBytes, html, state);
+                        // "htmlrtf0" ends the skipped region; any other form starts it
+                        inHtmlRtfSkip = !(tok.hasParameter() && tok.getParameter() == 0);
+                        break;
+                    }
+
+                    if (!foundHtmlTag || inHtmlRtfSkip) {
+                        break;
+                    }
+
+                    if (inHtmlTag || isContentArea(htmlTagDepth)) {
+                        flushPendingBytes(pendingBytes, html, state);
+                        switch (name) {
+                            case "par":
+                            case "pard":
+                                html.append('\n');
+                                break;
+                            case "tab":
+                                html.append('\t');
+                                break;
+                            case "line":
+                                html.append("<br>");
+                                break;
+                            default:
+                                break;
+                        }
+                    }
+                    break;
+
+                case HEX_ESCAPE:
+                    if (consumed) {
+                        break;
+                    }
+                    if (!foundHtmlTag || inHtmlRtfSkip) {
+                        break;
+                    }
+                    if (inHtmlTag || isContentArea(htmlTagDepth)) {
+                        // Buffered: multi-byte sequences must be decoded together
+                        pendingBytes.write(tok.getHexValue());
+                    }
+                    break;
+
+                case UNICODE_ESCAPE:
+                    if (!foundHtmlTag || inHtmlRtfSkip) {
+                        break;
+                    }
+                    if (inHtmlTag || isContentArea(htmlTagDepth)) {
+                        flushPendingBytes(pendingBytes, html, state);
+                        int cp = tok.getParameter();
+                        if (cp < 0) {
+                            // RTF writes the uN parameter as a signed 16-bit number, so
+                            // code units above 32767 arrive negative. Map them back to
+                            // the unsigned value instead of dropping the character.
+                            // Harmless if the tokenizer already normalizes the value.
+                            cp += 65536;
+                        }
+                        if (Character.isValidCodePoint(cp)) {
+                            html.appendCodePoint(cp);
+                        }
+                    }
+                    break;
+
+                case TEXT:
+                    if (consumed) {
+                        break;
+                    }
+                    if (!foundHtmlTag || inHtmlRtfSkip) {
+                        break;
+                    }
+                    if (inHtmlTag || isContentArea(htmlTagDepth)) {
+                        flushPendingBytes(pendingBytes, html, state);
+                        html.append(tok.getName());
+                    }
+                    break;
+
+                case CRLF:
+                case BIN:
+                default:
+                    break;
+            }
+        }
+
+        flushPendingBytes(pendingBytes, html, state);
+
+        if (!foundFromHtml || html.length() == 0) {
+            return null;
+        }
+        return html.toString();
+    }
+
+    /**
+     * Returns true when no htmltag group is currently open, i.e. the given depth
+     * is the -1 sentinel that {@code extract} uses for "not inside an htmltag".
+     */
+    private static boolean isContentArea(int htmlTagDepth) {
+        return htmlTagDepth == -1;
+    }
+
+    /**
+     * Decodes any buffered hex-escape bytes with the state's current charset,
+     * appends the result to {@code out} and empties the buffer. Does nothing
+     * when the buffer is empty.
+     */
+    private static void flushPendingBytes(ByteArrayOutputStream pending, StringBuilder out,
+                                          RTFState state) {
+        if (pending.size() == 0) {
+            return;
+        }
+        Charset charset = state.getCurrentCharset();
+        byte[] raw = pending.toByteArray();
+        String decoded = new String(raw, charset);
+        out.append(decoded);
+        pending.reset();
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java
new file mode 100644
index 0000000000..c45b0a3817
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java
@@ -0,0 +1,534 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Locale;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+
+/**
+ * Parses OLE objdata from an RTF stream inline, byte by byte.
+ *
+ * <p>The OLE objdata structure is:
+ * <pre>
+ *   [4 bytes version][4 bytes formatId]
+ *   [4 bytes classNameLen][classNameLen bytes className]
+ *   [4 bytes topicNameLen][topicNameLen bytes topicName]
+ *   [4 bytes itemNameLen][itemNameLen bytes itemName]
+ *   [4 bytes dataSz][dataSz bytes payload]
+ * </pre>
+ * The small header fields are parsed byte-by-byte via a state machine.
+ * Once the header is complete and {@code dataSz} is known, the payload
+ * bytes stream directly to a temp file — never buffered in memory.</p>
+ *
+ * <p>On {@link #onComplete(Metadata, AtomicInteger)}, the payload is
+ * interpreted based on {@code className} (Package, PBrush, POIFS, etc.)
+ * and the extracted content is returned as a {@link TikaInputStream}.</p>
+ */
+public class RTFObjDataStreamParser implements Closeable {
+
+    private static final String WIN_ASCII = "WINDOWS-1252"; // charset name used by decodeString
+
+    private final long maxBytes; // payload cap; only enforced when > 0
+
+    // State machine
+    private Field currentField = Field.VERSION; // field the next byte belongs to
+    private byte[] fieldBuf = new byte[4]; // accumulates the current little-endian uint32
+    private int fieldPos; // bytes of fieldBuf filled so far
+    private int fieldTarget = 4; // byte count that completes the current field
+
+    // Parsed header values
+    private long version;
+    private long formatId; // anything other than 2 makes onByte skip the rest of the object
+    private String className;
+    private String topicName;
+    private String itemName;
+    private long dataSz; // declared payload size in bytes
+
+    // String accumulator for length-prefixed ANSI strings
+    private byte[] stringBuf;
+    private int stringPos;
+
+    // Payload streaming
+    private Path tempFile; // created once dataSz is known; null before that and after cleanup()
+    private OutputStream dataOut; // open only while payload bytes are being written
+    private long dataWritten; // payload bytes written so far
+
+    /**
+     * @param maxBytes maximum payload bytes to accept (-1 for unlimited; the limit
+     *                 is only enforced when the value is greater than 0)
+     */
+    public RTFObjDataStreamParser(long maxBytes) {
+        this.maxBytes = maxBytes;
+    }
+
+    /**
+     * Receive a single decoded byte from the objdata hex stream and advance the
+     * header/payload state machine.
+     *
+     * <p>Header fields are accumulated in memory. Once the payload size is known,
+     * a temp file is opened and every further byte is written to it until
+     * {@code dataSz} bytes have arrived. Bytes received in the DONE or SKIP state
+     * are ignored.</p>
+     *
+     * @param b the decoded byte (only the low 8 bits are used)
+     * @throws IOException if the temp file cannot be created or written
+     * @throws TikaException if the payload would exceed {@code maxBytes}
+     */
+    public void onByte(int b) throws IOException, TikaException {
+        switch (currentField) {
+            case VERSION:
+                fieldBuf[fieldPos++] = (byte) b;
+                if (fieldPos >= fieldTarget) {
+                    version = readLE32(fieldBuf);
+                    initUint32Field(Field.FORMAT_ID);
+                }
+                break;
+
+            case FORMAT_ID:
+                fieldBuf[fieldPos++] = (byte) b;
+                if (fieldPos >= fieldTarget) {
+                    formatId = readLE32(fieldBuf);
+                    if (formatId != 2L) {
+                        // Not an embedded object (1 = link). Skip everything.
+                        currentField = Field.SKIP;
+                    } else {
+                        initUint32Field(Field.CLASS_LEN);
+                    }
+                }
+                break;
+
+            case CLASS_LEN:
+                fieldBuf[fieldPos++] = (byte) b;
+                if (fieldPos >= fieldTarget) {
+                    int len = (int) readLE32(fieldBuf);
+                    initStringField(Field.CLASS_NAME, len);
+                }
+                break;
+
+            case CLASS_NAME:
+                stringBuf[stringPos++] = (byte) b;
+                if (stringPos >= fieldTarget) {
+                    className = decodeString(stringBuf, fieldTarget);
+                    initUint32Field(Field.TOPIC_LEN);
+                }
+                break;
+
+            case TOPIC_LEN:
+                fieldBuf[fieldPos++] = (byte) b;
+                if (fieldPos >= fieldTarget) {
+                    int len = (int) readLE32(fieldBuf);
+                    initStringField(Field.TOPIC_NAME, len);
+                }
+                break;
+
+            case TOPIC_NAME:
+                stringBuf[stringPos++] = (byte) b;
+                if (stringPos >= fieldTarget) {
+                    topicName = decodeString(stringBuf, fieldTarget);
+                    initUint32Field(Field.ITEM_LEN);
+                }
+                break;
+
+            case ITEM_LEN:
+                fieldBuf[fieldPos++] = (byte) b;
+                if (fieldPos >= fieldTarget) {
+                    int len = (int) readLE32(fieldBuf);
+                    initStringField(Field.ITEM_NAME, len);
+                }
+                break;
+
+            case ITEM_NAME:
+                stringBuf[stringPos++] = (byte) b;
+                if (stringPos >= fieldTarget) {
+                    itemName = decodeString(stringBuf, fieldTarget);
+                    initUint32Field(Field.DATA_SIZE);
+                }
+                break;
+
+            case DATA_SIZE:
+                fieldBuf[fieldPos++] = (byte) b;
+                if (fieldPos >= fieldTarget) {
+                    dataSz = readLE32(fieldBuf);
+                    if (dataSz <= 0) {
+                        currentField = Field.DONE;
+                    } else {
+                        try {
+                            tempFile = Files.createTempFile("tika-rtf-obj-", ".bin");
+                            dataOut = new BufferedOutputStream(Files.newOutputStream(tempFile));
+                            // Enter DATA only once there is a stream to write to
+                            currentField = Field.DATA;
+                        } catch (IOException e) {
+                            // Without a payload file the rest of this object cannot be
+                            // stored: ignore its remaining bytes rather than failing
+                            // on a null stream for every one of them.
+                            currentField = Field.SKIP;
+                            cleanup();
+                            throw e;
+                        }
+                    }
+                }
+                break;
+
+            case DATA:
+                if (maxBytes > 0 && dataWritten >= maxBytes) {
+                    throw new TikaMemoryLimitException(dataWritten + 1, maxBytes);
+                }
+                dataOut.write(b);
+                dataWritten++;
+                if (dataWritten >= dataSz) {
+                    // Declared payload is complete: close so the file can be read back
+                    dataOut.close();
+                    dataOut = null;
+                    currentField = Field.DONE;
+                }
+                break;
+
+            case DONE:
+            case SKIP:
+                break;
+        }
+    }
+
+    /**
+     * Called when the objdata group closes. Populates metadata and returns
+     * a TikaInputStream with the extracted embedded content, or null if
+     * the object couldn't be parsed (it was skipped, or no payload file exists).
+     *
+     * <p>If the group closed before all declared payload bytes arrived, the
+     * bytes received so far are flushed and processed as-is.</p>
+     *
+     * <p>The caller is responsible for closing the returned TikaInputStream.
+     * NOTE(review): whether closing that stream also removes the backing temp
+     * file depends on {@code TikaInputStream.get(Path)}; confirm.</p>
+     *
+     * @param metadata receives the version/class/topic/item header values
+     * @param unknownFilenameCount passed through to the generic/POIFS handler
+     * @throws IOException if the payload file cannot be flushed or read
+     * @throws TikaException if interpreting the payload fails
+     */
+    public TikaInputStream onComplete(Metadata metadata, AtomicInteger unknownFilenameCount)
+            throws IOException, TikaException {
+        if (currentField == Field.SKIP || tempFile == null) {
+            return null;
+        }
+
+        if (dataOut != null) {
+            // Truncated payload: the stream is still open and may hold buffered
+            // bytes. Close it so the handlers below read everything received.
+            try {
+                dataOut.close();
+            } finally {
+                dataOut = null;
+            }
+        }
+
+        metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version));
+        if (className != null && !className.isEmpty()) {
+            metadata.add(RTFMetadata.EMB_CLASS, className);
+        }
+        if (topicName != null && !topicName.isEmpty()) {
+            metadata.add(RTFMetadata.EMB_TOPIC, topicName);
+        }
+        if (itemName != null && !itemName.isEmpty()) {
+            metadata.add(RTFMetadata.EMB_ITEM, itemName);
+        }
+
+        String cn = className != null ? className.toLowerCase(Locale.ROOT) : "";
+
+        if ("package".equals(cn)) {
+            return handlePackage(metadata);
+        } else if ("pbrush".equals(cn)) {
+            // Raw bitmap — the temp file IS the content
+            return TikaInputStream.get(tempFile);
+        } else {
+            return handleGenericOrPOIFS(metadata, unknownFilenameCount);
+        }
+    }
+
+    /**
+     * Returns true if the header has been fully parsed (regardless of whether
+     * all data bytes have arrived).
+     */
+    public boolean isHeaderParsed() {
+        return currentField == Field.DATA || currentField == Field.DONE;
+    }
+
+    /**
+     * Returns the parsed className, or null if the class-name field of the header
+     * has not been read yet. The value is the trimmed string; it may be empty.
+     */
+    public String getClassName() {
+        return className;
+    }
+
+    /**
+     * Closes the payload stream if it is still open and deletes the temp file
+     * this parser still owns. The temp file is removed even when closing the
+     * stream fails.
+     *
+     * @throws IOException if closing the payload stream fails
+     */
+    @Override
+    public void close() throws IOException {
+        try {
+            if (dataOut != null) {
+                try {
+                    dataOut.close();
+                } finally {
+                    dataOut = null;
+                }
+            }
+        } finally {
+            // Must run even if close() above threw, or the temp file leaks
+            cleanup();
+        }
+    }
+
+    // --- Package handling ---
+
+    /**
+     * Interprets the payload temp file as an OLE "Package" stream: reads the
+     * header strings, copies the embedded file's bytes to a new temp file and
+     * records the file name/path in {@code metadata}.
+     *
+     * <p>The payload temp file is always deleted on exit. The content temp file
+     * is deleted here only if this method fails before returning it.</p>
+     *
+     * @param metadata receives the original/resource name and relationship id
+     * @return a stream over the embedded file's bytes, or null if the package is
+     *         not of the embedded kind (second type field != 3)
+     * @throws IOException if the stream ends early or a file operation fails
+     */
+    private TikaInputStream handlePackage(Metadata metadata) throws IOException, TikaException {
+        try (InputStream is = new BufferedInputStream(Files.newInputStream(tempFile))) {
+            readUShortLE(is); // type1 (not used)
+
+            String displayName = readNullTerminatedString(is);
+            readNullTerminatedString(is); // iconFilePath
+            readUShortBE(is); // iconIndex
+            int type2 = readUShortLE(is);
+
+            if (type2 != 3) {
+                // type 1 = link, only handle type 3 = embedded
+                return null;
+            }
+
+            readUIntLE(is); // filePathLen
+            String ansiFilePath = readNullTerminatedString(is);
+            long bytesLen = readUIntLE(is);
+
+            // The next bytesLen bytes in the stream are the actual file content.
+            // Create a temp file for them.
+            Path contentFile = Files.createTempFile("tika-rtf-pkg-", ".bin");
+            boolean handedOff = false;
+            try {
+                try (OutputStream contentOut = new BufferedOutputStream(
+                        Files.newOutputStream(contentFile))) {
+                    copyBounded(is, contentOut, bytesLen);
+                }
+
+                // Try to read unicode file path (optional)
+                StringBuilder unicodePath = new StringBuilder();
+                try {
+                    long unicodeLen = readUIntLE(is);
+                    // long counter: unicodeLen is an unsigned 32-bit value
+                    for (long i = 0; i < unicodeLen; i++) {
+                        int lo = is.read();
+                        int hi = is.read();
+                        if (lo == -1 || hi == -1) {
+                            // stream ran out: discard the partial path
+                            unicodePath.setLength(0);
+                            break;
+                        }
+                        unicodePath.append((char) (lo + 256 * hi));
+                    }
+                } catch (IOException e) {
+                    // the unicode path is optional: fall back to the ANSI names
+                    unicodePath.setLength(0);
+                }
+
+                String fileNameToUse;
+                String pathToUse;
+                if (unicodePath.length() > 0) {
+                    fileNameToUse = unicodePath.toString();
+                    pathToUse = unicodePath.toString();
+                } else {
+                    fileNameToUse = displayName != null ? displayName : "";
+                    pathToUse = ansiFilePath != null ? ansiFilePath : "";
+                }
+                metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileNameToUse);
+                metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+                        FilenameUtils.getName(fileNameToUse));
+                metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, pathToUse);
+
+                TikaInputStream result = TikaInputStream.get(contentFile);
+                handedOff = true;
+                return result;
+            } finally {
+                if (!handedOff) {
+                    // Failed before the content file reached the caller: don't leak it
+                    try {
+                        Files.deleteIfExists(contentFile);
+                    } catch (IOException ignored) {
+                        // best effort
+                    }
+                }
+            }
+        } finally {
+            cleanup();
+        }
+    }
+
+    // --- Generic / POIFS handling ---
+
+    /**
+     * Handles a payload whose class is neither "Package" nor "PBrush".
+     *
+     * <p>If the payload is not an OLE2 container, it is returned as-is. Otherwise
+     * the container is opened and the embedded content is taken from its
+     * "Package" entry, its Ole10Native stream or its CONTENTS/Contents entry and
+     * written to a new temp file. A container of any other type is returned
+     * whole, with its detected content type set in {@code metadata}.</p>
+     *
+     * <p>The payload temp file is deleted on exit unless it is itself the
+     * returned content. In that case it stays in place until this parser is
+     * closed, as in the non-OLE2 branch.</p>
+     *
+     * @param metadata receives the content type for unrecognized POIFS containers
+     * @param unknownFilenameCount not used by this method
+     * @return a stream over the content, or null if nothing could be extracted
+     * @throws IOException if reading the payload or the container fails
+     * @throws TikaException if a "Package" entry exceeds {@code maxBytes}
+     */
+    private TikaInputStream handleGenericOrPOIFS(Metadata metadata,
+                                                  AtomicInteger unknownFilenameCount)
+            throws IOException, TikaException {
+        try (InputStream probe = new BufferedInputStream(Files.newInputStream(tempFile))) {
+            boolean isOLE2 = FileMagic.valueOf(probe) == FileMagic.OLE2;
+            if (!isOLE2) {
+                // Not POIFS — return raw bytes from temp file
+                return TikaInputStream.get(tempFile);
+            }
+        }
+
+        // Set when tempFile itself is returned, so the finally below must not
+        // delete the file out from under the stream just handed to the caller.
+        boolean tempFileHandedOff = false;
+
+        // It's POIFS — parse it
+        try (InputStream poifsIn = new BufferedInputStream(Files.newInputStream(tempFile));
+             POIFSFileSystem fs = new POIFSFileSystem(poifsIn)) {
+            DirectoryNode root = fs.getRoot();
+            if (root == null) {
+                return null;
+            }
+
+            byte[] content = null;
+
+            if (root.hasEntry("Package")) {
+                Entry pkg = root.getEntry("Package");
+                try (BoundedInputStream bis = new BoundedInputStream(
+                        maxBytes > 0 ? maxBytes : Long.MAX_VALUE,
+                        new DocumentInputStream((DocumentEntry) pkg))) {
+                    content = IOUtils.toByteArray(bis);
+                    if (bis.hasHitBound()) {
+                        throw new TikaMemoryLimitException(maxBytes + 1, maxBytes);
+                    }
+                }
+            } else {
+                POIFSDocumentType type = POIFSDocumentType.detectType(root);
+                if (type == POIFSDocumentType.OLE10_NATIVE) {
+                    try {
+                        Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
+                        content = ole.getDataBuffer();
+                    } catch (Ole10NativeException ex) {
+                        // Not valid OLE10Native; content stays null and null is returned
+                    }
+                } else if (type == POIFSDocumentType.COMP_OBJ) {
+                    DocumentEntry contentsEntry;
+                    try {
+                        contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
+                    } catch (FileNotFoundException e) {
+                        // fall back to the mixed-case entry name
+                        contentsEntry = (DocumentEntry) root.getEntry("Contents");
+                    }
+                    try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
+                        content = new byte[contentsEntry.getSize()];
+                        inp.readFully(content);
+                    }
+                } else {
+                    // Unknown POIFS type — return the whole thing
+                    metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+                    TikaInputStream whole = TikaInputStream.get(tempFile);
+                    tempFileHandedOff = true;
+                    return whole;
+                }
+            }
+
+            if (content != null) {
+                // Write extracted content to a new temp file
+                Path contentFile = Files.createTempFile("tika-rtf-poifs-", ".bin");
+                Files.write(contentFile, content);
+                return TikaInputStream.get(contentFile);
+            }
+        } finally {
+            if (!tempFileHandedOff) {
+                cleanup();
+            }
+        }
+        return null;
+    }
+
+    // --- Helper methods ---
+
+    /** Resets the 4-byte accumulator and makes {@code next} the field being read. */
+    private void initUint32Field(Field next) {
+        fieldTarget = 4;
+        fieldPos = 0;
+        currentField = next;
+    }
+
+    /**
+     * Prepares to read a length-prefixed header string of {@code len} bytes into
+     * a fresh buffer. A zero length stores an empty string and advances straight
+     * to the following length field.
+     *
+     * <p>The length comes from untrusted input, so it is checked before any
+     * buffer is allocated. A negative {@code len} means the unsigned 32-bit value
+     * did not fit in an int. A length that is negative or larger than
+     * {@code maxBytes} (when that limit is enabled) switches the parser to SKIP
+     * and is reported as a memory-limit violation.</p>
+     *
+     * @param next the string field to read (CLASS_NAME, TOPIC_NAME or ITEM_NAME)
+     * @param len the declared length, as the low 32 bits of the unsigned value
+     * @throws TikaException if the declared length exceeds the allowed maximum
+     */
+    private void initStringField(Field next, int len) throws TikaException {
+        if (len != 0) {
+            long declaredLen = len & 0xFFFFFFFFL;
+            long limit = maxBytes > 0 ? Math.min(maxBytes, Integer.MAX_VALUE) : Integer.MAX_VALUE;
+            if (declaredLen > limit) {
+                // Ignore the rest of the object; the header cannot be trusted
+                currentField = Field.SKIP;
+                throw new TikaMemoryLimitException(declaredLen, limit);
+            }
+        }
+        currentField = next;
+        if (len == 0) {
+            // Empty string — advance immediately
+            switch (next) {
+                case CLASS_NAME:
+                    className = "";
+                    initUint32Field(Field.TOPIC_LEN);
+                    break;
+                case TOPIC_NAME:
+                    topicName = "";
+                    initUint32Field(Field.ITEM_LEN);
+                    break;
+                case ITEM_NAME:
+                    itemName = "";
+                    initUint32Field(Field.DATA_SIZE);
+                    break;
+                default:
+                    break;
+            }
+            return;
+        }
+        stringBuf = new byte[len];
+        stringPos = 0;
+        fieldTarget = len;
+    }
+
+    /** Decodes the first four bytes of {@code buf} as an unsigned little-endian 32-bit value. */
+    private static long readLE32(byte[] buf) {
+        long value = 0L;
+        // Walk from the most significant byte (index 3) down to the least
+        for (int i = 3; i >= 0; i--) {
+            value = (value << 8) | (buf[i] & 0xFFL);
+        }
+        return value;
+    }
+
+    /**
+     * Decodes the first {@code len} bytes of {@code buf} as WINDOWS-1252 and trims
+     * the result. If that charset is unavailable, the platform default is used.
+     */
+    private static String decodeString(byte[] buf, int len) {
+        String decoded;
+        try {
+            decoded = new String(buf, 0, len, WIN_ASCII);
+        } catch (java.io.UnsupportedEncodingException e) {
+            decoded = new String(buf, 0, len);
+        }
+        return decoded.trim();
+    }
+
+    /**
+     * Reads two bytes and combines them as an unsigned little-endian 16-bit value.
+     *
+     * @throws IOException if the stream ends before both bytes are read
+     */
+    private static int readUShortLE(InputStream is) throws IOException {
+        int low = is.read();
+        int high = is.read();
+        boolean truncated = (low == -1) || (high == -1);
+        if (truncated) {
+            throw new IOException("unexpected end of stream");
+        }
+        return (high << 8) | low;
+    }
+
+    /**
+     * Reads two bytes and combines them as an unsigned big-endian 16-bit value.
+     *
+     * @throws IOException if the stream ends before both bytes are read
+     */
+    private static int readUShortBE(InputStream is) throws IOException {
+        int high = is.read();
+        int low = is.read();
+        boolean truncated = (high == -1) || (low == -1);
+        if (truncated) {
+            throw new IOException("unexpected end of stream");
+        }
+        return low | (high << 8);
+    }
+
+    /**
+     * Reads a value via {@code EndianUtils.readUIntLE}, converting its
+     * buffer-underrun exception into an {@link IOException} so callers only have
+     * to deal with one exception type.
+     */
+    private static long readUIntLE(InputStream is) throws IOException {
+        try {
+            return EndianUtils.readUIntLE(is);
+        } catch (EndianUtils.BufferUnderrunException e) {
+            throw new IOException(e);
+        }
+    }
+
+    /**
+     * Reads bytes up to (and consuming) the next zero byte and returns them as a
+     * string, one char per byte.
+     *
+     * @throws IOException if the stream ends before a zero byte is found
+     */
+    private static String readNullTerminatedString(InputStream is) throws IOException {
+        StringBuilder text = new StringBuilder();
+        while (true) {
+            int next = is.read();
+            if (next > 0) {
+                text.append((char) next);
+                continue;
+            }
+            if (next == -1) {
+                throw new IOException("hit end of stream before null terminator");
+            }
+            return text.toString();
+        }
+    }
+
+    /**
+     * Copies at most {@code maxLen} bytes from {@code in} to {@code out},
+     * stopping early if the input ends.
+     *
+     * @return the number of bytes actually copied
+     */
+    private static long copyBounded(InputStream in, OutputStream out, long maxLen)
+            throws IOException {
+        byte[] chunk = new byte[8192];
+        long copied = 0;
+        long remaining = maxLen;
+        while (remaining > 0) {
+            int want = (int) Math.min(chunk.length, remaining);
+            int got = in.read(chunk, 0, want);
+            if (got == -1) {
+                break;
+            }
+            out.write(chunk, 0, got);
+            copied += got;
+            remaining -= got;
+        }
+        return copied;
+    }
+
+    /**
+     * Deletes the payload temp file, if there is one, and forgets it. Deletion
+     * failures are ignored.
+     */
+    private void cleanup() {
+        if (tempFile == null) {
+            return;
+        }
+        try {
+            Files.deleteIfExists(tempFile);
+        } catch (IOException ignored) {
+            // best effort
+        }
+        tempFile = null;
+    }
+
+    /**
+     * States of the objdata state machine, named after the header field (or
+     * payload) the next incoming byte belongs to.
+     */
+    private enum Field {
+        VERSION, FORMAT_ID, // 4-byte values
+        CLASS_LEN, CLASS_NAME, // 4-byte length, then that many string bytes
+        TOPIC_LEN, TOPIC_NAME,
+        ITEM_LEN, ITEM_NAME,
+        DATA_SIZE, DATA, // 4-byte payload size, then the payload bytes
+        DONE, SKIP // terminal states: further bytes are ignored
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java
new file mode 100644
index 0000000000..8fe6c98989
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
+
+/**
+ * Streams decoded bytes from an RTF {@code \pict} group to a temp file.
+ *
+ * <p>Pict data is raw image bytes (after hex-pair decoding). There is no
+ * header to parse — bytes are written directly to a temp file. On
+ * {@link #onComplete()}, the caller retrieves the temp file path and
+ * hands it to the embedded document extractor.</p>
+ */
+public class RTFPictStreamParser implements Closeable {
+
+    /** Upper bound on accepted bytes; the check is only applied when this is positive. */
+    private final long maxBytes;
+    /** Temp file receiving the image bytes; null once it has been deleted. */
+    private Path tempFile;
+    /** Open stream onto {@link #tempFile}; null once closed. */
+    private OutputStream out;
+    private long bytesWritten;
+
+    /**
+     * Creates the temp file and opens it for writing.
+     *
+     * @param maxBytes maximum number of bytes to accept (-1 for unlimited; any
+     *                 non-positive value disables the limit)
+     * @throws IOException if the temp file cannot be created or opened
+     */
+    public RTFPictStreamParser(long maxBytes) throws IOException {
+        this.maxBytes = maxBytes;
+        this.tempFile = Files.createTempFile("tika-rtf-pict-", ".bin");
+        try {
+            this.out = new BufferedOutputStream(Files.newOutputStream(tempFile));
+        } catch (IOException | RuntimeException e) {
+            // don't leave an orphaned temp file behind if the stream can't be opened
+            cleanup();
+            throw e;
+        }
+    }
+
+    /**
+     * Receive a single decoded byte from the pict hex stream.
+     *
+     * @param b the byte to append (only the low 8 bits are written)
+     * @throws TikaMemoryLimitException if accepting this byte would exceed {@code maxBytes}
+     * @throws IOException if the parser was already completed/closed or the write fails
+     */
+    public void onByte(int b) throws IOException, TikaException {
+        if (out == null) {
+            throw new IOException("pict stream is already closed");
+        }
+        if (maxBytes > 0 && bytesWritten >= maxBytes) {
+            throw new TikaMemoryLimitException(bytesWritten + 1, maxBytes);
+        }
+        out.write(b);
+        bytesWritten++;
+    }
+
+    /**
+     * Called when the pict group closes. Flushes and closes the output stream.
+     * Calling this more than once is harmless; later calls return the same result.
+     *
+     * @return the path to the temp file containing the image data,
+     *         or null if no bytes were written (the empty file is deleted)
+     * @throws IOException if flushing/closing the temp file fails
+     */
+    public Path onComplete() throws IOException {
+        closeOutput();
+        if (bytesWritten == 0) {
+            cleanup();
+            return null;
+        }
+        return tempFile;
+    }
+
+    /** Returns the number of bytes written so far. */
+    public long getBytesWritten() {
+        return bytesWritten;
+    }
+
+    /**
+     * Closes the output stream if it is still open and deletes the temp file.
+     * The temp file is deleted even when closing the stream fails.
+     */
+    @Override
+    public void close() throws IOException {
+        try {
+            closeOutput();
+        } finally {
+            cleanup();
+        }
+    }
+
+    /** Closes {@link #out} at most once; the field is cleared before closing. */
+    private void closeOutput() throws IOException {
+        if (out != null) {
+            OutputStream toClose = out;
+            out = null;
+            toClose.close();
+        }
+    }
+
+    /** Best-effort removal of the temp file; forgets the path afterwards. */
+    private void cleanup() {
+        if (tempFile != null) {
+            try {
+                Files.deleteIfExists(tempFile);
+            } catch (IOException ignored) {
+                // best effort
+            }
+            tempFile = null;
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java
new file mode 100644
index 0000000000..a2e2553e44
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java
@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.nio.charset.Charset;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Shared RTF parsing state: group stack, font table, codepage tracking,
+ * and unicode skip handling.
+ *
+ * <p>Both the HTML decapsulator and the full RTF parser use this class
+ * to manage the stateful parts of RTF processing.</p>
+ *
+ * <p>Typical usage: feed every token to {@link #processToken(RTFToken)}
+ * and query the current charset via {@link #getCurrentCharset()}.</p>
+ */
+public class RTFState {
+
+    /** Global charset from {@code \ansicpgN} or charset family selectors. */
+    private Charset globalCharset = RTFCharsetMaps.WINDOWS_1252;
+
+    /** Default font ID from {@code \deffN}. */
+    private int globalDefaultFont = -1;
+
+    /** Font table: maps font number ({@code \fN}) to charset ({@code \fcharsetN}). */
+    private final Map<Integer, Charset> fontToCharset = new HashMap<>();
+
+    /** Group state stack. */
+    private final Deque<RTFGroupState> stack = new ArrayDeque<>();
+
+    /** Current (active) group state. */
+    private RTFGroupState current = new RTFGroupState();
+
+    /** Number of ANSI chars remaining to skip after a unicode escape. */
+    private int ansiSkip = 0;
+
+    /** The group state that was just closed (before popGroup). Set on GROUP_CLOSE. */
+    private RTFGroupState lastClosedGroup;
+
+    // Font table parsing state
+    // 0 = not yet seen, 1 = inside fonttbl, 2 = finished fonttbl
+    private int fontTableState = 0;
+    private int fontTableDepth = -1;
+    private int currentFontId = -1;
+
+    private boolean inHeader = true;
+
+    /**
+     * Process a single token to update internal state.
+     * <p>
+     * This handles: group open/close, charset selectors (ansi, ansicpg,
+     * deff), font table parsing (fonttbl, f, fcharset),
+     * unicode skip tracking (uc), and font changes (f in body).
+     * <p>
+     * A group boundary ends any pending ANSI fallback skip: if a unicode escape
+     * is followed by fewer fallback characters than the current {@code \ucN}
+     * count before a brace, the remainder is dropped rather than applied to
+     * text in another group.
+     *
+     * @param tok the token just produced by the tokenizer
+     * @return true if the token was consumed by state management (caller should skip it),
+     *         false if the caller should also process it
+     */
+    public boolean processToken(RTFToken tok) {
+        switch (tok.getType()) {
+            case GROUP_OPEN:
+                // a scope delimiter terminates the skippable fallback run
+                ansiSkip = 0;
+                pushGroup();
+                return false;
+
+            case GROUP_CLOSE:
+                // a scope delimiter terminates the skippable fallback run
+                ansiSkip = 0;
+                // remember the child state so callers can inspect its flags after the pop
+                lastClosedGroup = current;
+                popGroup();
+                // Check if we've exited the font table
+                if (fontTableState == 1 && current.depth < fontTableDepth) {
+                    fontTableState = 2;
+                }
+                return false;
+
+            case CONTROL_SYMBOL:
+                // "\*" marks the enclosing group as an ignorable destination
+                if ("*".equals(tok.getName())) {
+                    current.ignore = true;
+                }
+                return false;
+
+            case CONTROL_WORD:
+                return processControlWord(tok);
+
+            case UNICODE_ESCAPE:
+                // After a unicode escape, skip the next ucSkip ANSI chars
+                ansiSkip = current.ucSkip;
+                return false;
+
+            case HEX_ESCAPE:
+                // If we're in the ANSI shadow of a unicode escape, skip this byte
+                if (ansiSkip > 0) {
+                    ansiSkip--;
+                    return true; // consumed — caller should ignore
+                }
+                return false;
+
+            case TEXT:
+                // If we're in the ANSI shadow, skip text chars
+                if (ansiSkip > 0) {
+                    // Each TEXT token is one char
+                    ansiSkip--;
+                    return true;
+                }
+                return false;
+
+            default:
+                return false;
+        }
+    }
+
+    /**
+     * Applies one control word to the parsing state.
+     * <p>
+     * The checks run in a fixed order and several of them return early, so the
+     * order matters: global charset selectors, {@code fonttbl}, font-table
+     * entries, {@code uc}, body font changes, header-ending words, embedded
+     * object/picture flags, and finally header-only ignorable destinations.
+     *
+     * @param tok a CONTROL_WORD token
+     * @return true if the word was fully handled here, false if the caller
+     *         should also look at it
+     */
+    private boolean processControlWord(RTFToken tok) {
+        String name = tok.getName();
+        boolean hasParam = tok.hasParameter();
+        int param = tok.getParameter();
+
+        // Global charset selectors (header)
+        // These are always reported as consumed, even when a parameter is missing.
+        switch (name) {
+            case "ansi":
+                globalCharset = RTFCharsetMaps.WINDOWS_1252;
+                return true;
+            case "pca":
+                globalCharset = RTFCharsetMaps.getCharset("cp850");
+                return true;
+            case "pc":
+                globalCharset = RTFCharsetMaps.getCharset("cp437");
+                return true;
+            case "mac":
+                globalCharset = RTFCharsetMaps.getCharset("MacRoman");
+                return true;
+            case "ansicpg":
+                if (hasParam) {
+                    // prefer the explicit table; otherwise fall back to a generic lookup
+                    Charset cs = RTFCharsetMaps.ANSICPG_MAP.get(param);
+                    if (cs != null) {
+                        globalCharset = cs;
+                    } else {
+                        globalCharset = RTFCharsetMaps.resolveCodePage(param);
+                    }
+                }
+                return true;
+            case "deff":
+                if (hasParam) {
+                    globalDefaultFont = param;
+                }
+                return true;
+        }
+
+        // Font table management
+        // Record the depth of the fonttbl group so its end can be detected later.
+        if ("fonttbl".equals(name)) {
+            fontTableState = 1;
+            fontTableDepth = current.depth;
+            current.ignore = true;
+            return true;
+        }
+
+        if (fontTableState == 1) {
+            // Inside font table
+            if (current.depth < fontTableDepth) {
+                fontTableState = 2;
+            } else {
+                if ("f".equals(name) && hasParam) {
+                    // start of a font definition: remember which font is being described
+                    currentFontId = param;
+                    return true;
+                } else if ("fcharset".equals(name) && hasParam) {
+                    // unknown fcharset values leave the table untouched
+                    Charset cs = RTFCharsetMaps.FCHARSET_MAP.get(param);
+                    if (cs != null) {
+                        fontToCharset.put(currentFontId, cs);
+                    }
+                    return true;
+                }
+            }
+        }
+
+        // Unicode skip count
+        if ("uc".equals(name) && hasParam) {
+            current.ucSkip = param;
+            return true;
+        }
+
+        // Font change in body
+        if ("f".equals(name) && hasParam) {
+            current.fontId = param;
+            Charset fontCs = fontToCharset.get(param);
+            current.fontCharset = fontCs; // may be null
+            // If we've seen the font table and this is a body font change,
+            // we're out of the header
+            if (fontTableState == 2 && !current.ignore) {
+                inHeader = false;
+            }
+            return false; // caller may also want to know about font changes
+        }
+
+        // Header-ending control words
+        // Only counted outside ignored groups; the word itself is not consumed here.
+        if (inHeader && !current.ignore) {
+            switch (name) {
+                case "par":
+                case "pard":
+                case "sect":
+                case "sectd":
+                case "plain":
+                case "ltrch":
+                case "rtlch":
+                case "htmlrtf":
+                case "line":
+                    inHeader = false;
+                    break;
+            }
+        }
+
+        // Embedded object / picture control words
+        // Each sets a flag on the current group and is passed on to the caller.
+        switch (name) {
+            case "object":
+                current.object = true;
+                return false; // caller may want to know
+            case "objdata":
+                current.objdata = true;
+                return false;
+            case "pict":
+                current.pictDepth = 1;
+                return false;
+            case "sp":
+                current.sp = true;
+                return false;
+            case "sn":
+                current.sn = true;
+                return false;
+            case "sv":
+                current.sv = true;
+                return false;
+            case "wbitmap":
+                return false; // caller handles
+        }
+
+        // Ignorable destinations
+        // Only treated as destinations while still in the header.
+        if (inHeader) {
+            switch (name) {
+                case "colortbl":
+                case "stylesheet":
+                    current.ignore = true;
+                    return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Enters a new group: the current state is saved on the stack and a child
+     * state derived from it becomes current.
+     */
+    public void pushGroup() {
+        RTFGroupState parent = current;
+        stack.addFirst(parent);
+        current = new RTFGroupState(parent);
+    }
+
+    /**
+     * Leaves the current group by restoring the most recently saved parent state.
+     * If no parent is saved (unbalanced close), the current state is kept.
+     */
+    public void popGroup() {
+        RTFGroupState parent = stack.pollFirst();
+        if (parent != null) {
+            current = parent;
+        }
+    }
+
+    /**
+     * Picks the charset for decoding the current hex escape or text byte.
+     * Candidates are tried in this order and the first non-null one wins:
+     * <ol>
+     *   <li>the charset of the font selected in the current group
+     *       (from {@code \fN → \fcharsetN})</li>
+     *   <li>once the header has ended, the charset of the default font
+     *       (from {@code \deffN}), if that font has one</li>
+     *   <li>the global charset (from {@code \ansicpgN} or a family selector)</li>
+     * </ol>
+     */
+    public Charset getCurrentCharset() {
+        Charset fontSpecific = current.fontCharset;
+        if (fontSpecific != null) {
+            return fontSpecific;
+        }
+        boolean defaultFontUsable = !inHeader && globalDefaultFont != -1;
+        Charset defaultFontCharset =
+                defaultFontUsable ? fontToCharset.get(globalDefaultFont) : null;
+        return defaultFontCharset != null ? defaultFontCharset : globalCharset;
+    }
+
+    /**
+     * Returns the global charset: the value last set by {@code \ansicpgN} or a
+     * charset family selector, ignoring any font-specific charset.
+     */
+    public Charset getGlobalCharset() {
+        return globalCharset;
+    }
+
+    /** Returns the state object of the group currently being parsed (not a copy). */
+    public RTFGroupState getCurrentGroup() {
+        return current;
+    }
+
+    /** Returns true if we're still in the RTF header (before body content). */
+    public boolean isInHeader() {
+        return inHeader;
+    }
+
+    /** Returns the nesting depth recorded on the current group state. */
+    public int getDepth() {
+        return current.depth;
+    }
+
+    /**
+     * Returns the font-to-charset mapping table. This is the internal map
+     * itself, not a copy, so changes made by the caller affect this state.
+     */
+    public Map<Integer, Charset> getFontToCharset() {
+        return fontToCharset;
+    }
+
+    /** Returns the number of ANSI chars remaining to skip. */
+    public int getAnsiSkip() {
+        return ansiSkip;
+    }
+
+    /**
+     * Returns the group state that was just closed on the most recent GROUP_CLOSE.
+     * This is the child group's state before it was popped; it is null until
+     * the first GROUP_CLOSE token has been processed.
+     * Useful for checking flags like objdata, pictDepth, sn, sv, sp, object
+     * to trigger completion handlers.
+     */
+    public RTFGroupState getLastClosedGroup() {
+        return lastClosedGroup;
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java
new file mode 100644
index 0000000000..ec287f5c7e
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+/**
+ * A single token produced by the RTF tokenizer.
+ * <p>
+ * Mutable and reused by the tokenizer to avoid allocation in the hot loop.
+ * Consumers must copy any data they need before requesting the next token.
+ */
+public class RTFToken {
+
+    /** Kind of token; null until the first {@link #reset} or {@link #set} call. */
+    private RTFTokenType type;
+    /** Control word/symbol name or text content; null for token kinds without one. */
+    private String name;
+    /** Numeric payload; only meaningful when {@link #hasParameter} is true. */
+    private int parameter;
+    private boolean hasParameter;
+
+    /**
+     * Re-initializes this token as a parameterless, nameless token of the given type.
+     *
+     * @param type the new token type
+     */
+    public void reset(RTFTokenType type) {
+        this.type = type;
+        this.name = null;
+        this.parameter = -1;
+        this.hasParameter = false;
+    }
+
+    /**
+     * Overwrites every field of this token.
+     *
+     * @param type         the new token type
+     * @param name         control word/symbol name or text, may be null
+     * @param parameter    numeric payload (ignored by readers unless {@code hasParameter})
+     * @param hasParameter whether {@code parameter} carries a value
+     */
+    public void set(RTFTokenType type, String name, int parameter, boolean hasParameter) {
+        this.type = type;
+        this.name = name;
+        this.parameter = parameter;
+        this.hasParameter = hasParameter;
+    }
+
+    public RTFTokenType getType() {
+        return type;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public int getParameter() {
+        return parameter;
+    }
+
+    public boolean hasParameter() {
+        return hasParameter;
+    }
+
+    /**
+     * For HEX_ESCAPE tokens, returns the decoded byte value (0-255).
+     * For other token types this is simply the raw parameter.
+     */
+    public int getHexValue() {
+        return parameter;
+    }
+
+    /**
+     * Renders the token roughly as it would appear in RTF source, for debugging.
+     * Safe to call on a token that has not been populated yet.
+     */
+    @Override
+    public String toString() {
+        if (type == null) {
+            // freshly constructed token: switching on a null enum would throw
+            return "RTFToken[unset]";
+        }
+        switch (type) {
+            case GROUP_OPEN:
+                return "{";
+            case GROUP_CLOSE:
+                return "}";
+            case CONTROL_WORD:
+                return "\\" + name + (hasParameter ? String.valueOf(parameter) : "");
+            case CONTROL_SYMBOL:
+                return "\\" + name;
+            case HEX_ESCAPE:
+                return String.format("\\'%02x", parameter);
+            case UNICODE_ESCAPE:
+                return "\\u" + parameter;
+            case TEXT:
+                return "TEXT[" + name + "]";
+            case BIN:
+                return "\\bin" + parameter;
+            case CRLF:
+                return "CRLF";
+            case EOF:
+                return "EOF";
+            default:
+                return type.name();
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
new file mode 100644
index 0000000000..dcdcf511f9
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+/** Kinds of token produced by the RTF tokenizer. */
+public enum RTFTokenType {
+    /** An opening brace, <code>{</code>. */
+    GROUP_OPEN,
+    /** A closing brace, <code>}</code>. */
+    GROUP_CLOSE,
+    /** A backslash followed by letters and an optional (possibly negative) number. */
+    CONTROL_WORD,
+    /** A backslash followed by a single non-alphanumeric character. */
+    CONTROL_SYMBOL,
+    /** A {@code \'hh} escape carrying one byte value. */
+    HEX_ESCAPE,
+    /** A {@code \}{@code uN} escape carrying a character value. */
+    UNICODE_ESCAPE,
+    /** A single character that is not a backslash, brace, CR or LF. */
+    TEXT,
+    /** A {@code \binN} control carrying a count. */
+    BIN,
+    /** A line break in the source: CR LF, a lone CR, or a lone LF. */
+    CRLF,
+    /** End of input. */
+    EOF
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex
new file mode 100644
index 0000000000..237800effe
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+%%
+
+%public
+%class RTFTokenizer
+%unicode
+%type RTFToken
+%char
+
+%{
+    private final RTFToken token = new RTFToken();
+
+    /**
+     * Returns the reusable token instance. Callers must copy data
+     * before the next call to {@link #yylex()}.
+     */
+    public RTFToken getToken() {
+        return token;
+    }
+
+    /**
+     * Parses a decimal parameter that the lexer has already matched as
+     * {@code "-"? [0-9]+}. The only way parsing can fail is overflow (the
+     * pattern does not bound the number of digits), in which case the value
+     * saturates at the int range instead of throwing on untrusted input.
+     */
+    private static int parseParameter(String digits) {
+        try {
+            return Integer.parseInt(digits);
+        } catch (NumberFormatException e) {
+            return digits.startsWith("-") ? Integer.MIN_VALUE : Integer.MAX_VALUE;
+        }
+    }
+
+    /**
+     * Builds a CONTROL_WORD token from the full match.
+     *
+     * @param text the match including the leading backslash and optional trailing
+     *             delimiter space, e.g. "\\fonttbl", "\\f123 ", "\\ansi "
+     */
+    private RTFToken controlWord(String text) {
+        // strip leading backslash
+        String body = text.substring(1);
+        // strip trailing delimiter space if present
+        if (body.endsWith(" ")) {
+            body = body.substring(0, body.length() - 1);
+        }
+
+        // split into name and optional numeric parameter
+        int i = 0;
+        while (i < body.length() && Character.isLetter(body.charAt(i))) {
+            i++;
+        }
+        String name = body.substring(0, i);
+        if (i < body.length()) {
+            // there is a numeric parameter (possibly negative)
+            token.set(RTFTokenType.CONTROL_WORD, name, parseParameter(body.substring(i)), true);
+        } else {
+            token.set(RTFTokenType.CONTROL_WORD, name, -1, false);
+        }
+        return token;
+    }
+
+    /**
+     * Builds a HEX_ESCAPE token whose parameter is the decoded byte (0-255).
+     *
+     * @param text the match, e.g. "\\'ab"; the lexer guarantees two hex digits
+     */
+    private RTFToken hexEscape(String text) {
+        int hi = Character.digit(text.charAt(2), 16);
+        int lo = Character.digit(text.charAt(3), 16);
+        token.set(RTFTokenType.HEX_ESCAPE, null, (hi << 4) | lo, true);
+        return token;
+    }
+
+    /**
+     * Builds a UNICODE_ESCAPE token whose parameter is a non-negative code point.
+     *
+     * @param text the match, e.g. "\\u12345" or "\\u-4321 " (may have a trailing
+     *             delimiter space)
+     */
+    private RTFToken unicodeEscape(String text) {
+        int codePoint = parseParameter(text.substring(2).trim());
+        if (codePoint < 0) {
+            // RTF uses signed 16-bit: -N means 65536 - N. Masking gives the same
+            // result for -65536..-1 and keeps malformed smaller values non-negative.
+            codePoint &= 0xFFFF;
+        } else if (codePoint > Character.MAX_CODE_POINT) {
+            // not a valid Unicode code point; hand consumers U+FFFD instead
+            codePoint = 0xFFFD;
+        }
+        token.set(RTFTokenType.UNICODE_ESCAPE, null, codePoint, true);
+        return token;
+    }
+
+    /**
+     * Builds a BIN token whose parameter is the announced byte count.
+     *
+     * @param text the match, e.g. "\\bin12345 " (may have a trailing delimiter space)
+     */
+    private RTFToken binToken(String text) {
+        int count = parseParameter(text.substring(4).trim());
+        token.set(RTFTokenType.BIN, null, count, true);
+        return token;
+    }
+%}
+
+/* RTF is 7-bit ASCII; bytes above 127 are escaped. We read as Latin1/byte stream. */
+
+/* RTF spec: a control word's delimiter space is consumed and not part of the output.
+   We include the optional trailing space in each pattern so the tokenizer eats it. */
+ControlWordWithParam = "\\" [a-zA-Z]+ "-"? [0-9]+ " "?
+ControlWord = "\\" [a-zA-Z]+ " "?
+HexEscape = "\\'" [0-9a-fA-F]{2}
+UnicodeEscape = "\\u" "-"? [0-9]+ " "?
+BinControl = "\\bin" [0-9]+ " "?
+ControlSymbol = "\\" [^a-zA-Z0-9\r\n]
+GroupOpen = "{"
+GroupClose = "}"
+CrLf = \r\n | \r | \n
+
+%%
+
+/* Order matters: more specific rules first */
+
+{BinControl}             { return binToken(yytext()); }
+{UnicodeEscape}          { return unicodeEscape(yytext()); }
+{HexEscape}              { return hexEscape(yytext()); }
+{ControlWordWithParam}   { return controlWord(yytext()); }
+{ControlWord}            { return controlWord(yytext()); }
+{ControlSymbol}          {
+                             token.set(RTFTokenType.CONTROL_SYMBOL, yytext().substring(1), -1, false);
+                             return token;
+                         }
+
+/* The rules above leave three backslash cases unmatched, and an unmatched
+   input makes the generated scanner throw an Error. Handle them explicitly. */
+
+/* Backslash followed by a line break: the RTF spec treats this as \par. */
+"\\"{CrLf}               { token.set(RTFTokenType.CONTROL_WORD, "par", -1, false); return token; }
+
+/* Backslash followed by a digit: report it as a control symbol named by the digit. */
+"\\"[0-9]                {
+                             token.set(RTFTokenType.CONTROL_SYMBOL, yytext().substring(1), -1, false);
+                             return token;
+                         }
+
+/* Lone backslash: by longest-match this is only chosen at end of input.
+   Report it as a control symbol with an empty name. */
+"\\"                     { token.set(RTFTokenType.CONTROL_SYMBOL, "", -1, false); return token; }
+
+{GroupOpen}              { token.reset(RTFTokenType.GROUP_OPEN); return token; }
+{GroupClose}             { token.reset(RTFTokenType.GROUP_CLOSE); return token; }
+{CrLf}                   { token.reset(RTFTokenType.CRLF); return token; }
+
+/* Text: any character that isn't part of an RTF structure.
+   Match one char at a time to keep things simple. The consumer
+   can accumulate runs. Matching longer runs would be an optimization
+   for later. */
+[^\\\{\}\r\n]            { token.set(RTFTokenType.TEXT, yytext(), -1, false); return token; }
+
+<<EOF>>                  { token.reset(RTFTokenType.EOF); return token; }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java
new file mode 100644
index 0000000000..32b8ae58f9
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Tests for {@link RTFEmbeddedHandler} driven by the JFlex tokenizer,
+ * both standalone and integrated into the decapsulator.
+ */
+public class RTFEmbeddedHandlerTest {
+
+    /**
+     * Builds a context whose embedded-document extractor accepts everything and
+     * records a copy of each embedded document's metadata in {@code extracted}.
+     */
+    private static ParseContext buildContext(List<Metadata> extracted) {
+        ParseContext context = new ParseContext();
+        context.set(EmbeddedDocumentExtractor.class, new EmbeddedDocumentExtractor() {
+            @Override
+            public boolean shouldParseEmbedded(Metadata metadata) {
+                return true;
+            }
+
+            @Override
+            public void parseEmbedded(TikaInputStream stream, ContentHandler handler,
+                                      Metadata metadata, ParseContext parseContext,
+                                      boolean outputHtml) {
+                // copy: the handler may reuse or mutate the metadata object afterwards
+                Metadata copy = new Metadata();
+                for (String name : metadata.names()) {
+                    for (String val : metadata.getValues(name)) {
+                        copy.add(name, val);
+                    }
+                }
+                extracted.add(copy);
+            }
+        });
+        return context;
+    }
+
+    /**
+     * Process an RTF file through the tokenizer + state + embedded handler directly.
+     *
+     * @param resourceName file name under {@code /test-documents/} on the classpath
+     * @return metadata of every embedded document handed to the extractor
+     * @throws IOException if the resource is missing or cannot be read
+     */
+    private List<Metadata> extractEmbeddedDirect(String resourceName)
+            throws IOException, SAXException, TikaException {
+        List<Metadata> extracted = new ArrayList<>();
+        ParseContext context = buildContext(extracted);
+        ContentHandler handler = new DefaultHandler();
+        RTFEmbeddedHandler embHandler = new RTFEmbeddedHandler(handler, context, 20 * 1024);
+        RTFState state = new RTFState();
+
+        String path = "/test-documents/" + resourceName;
+        try (InputStream is = getClass().getResourceAsStream(path)) {
+            if (is == null) {
+                // fail with a readable message instead of an NPE inside InputStreamReader
+                throw new IOException("test resource not found on classpath: " + path);
+            }
+            try (Reader reader = new InputStreamReader(is, StandardCharsets.US_ASCII)) {
+                RTFTokenizer tokenizer = new RTFTokenizer(reader);
+                RTFToken tok;
+
+                while ((tok = tokenizer.yylex()) != null) {
+                    if (tok.getType() == RTFTokenType.EOF) {
+                        break;
+                    }
+                    boolean consumed = state.processToken(tok);
+                    if (!consumed) {
+                        // on a close brace, pass along the state of the group that just ended
+                        RTFGroupState closingGroup =
+                                (tok.getType() == RTFTokenType.GROUP_CLOSE)
+                                        ? state.getLastClosedGroup() : null;
+                        embHandler.processToken(tok, state, closingGroup);
+                    }
+                }
+            }
+        }
+        return extracted;
+    }
+
+    @Test
+    public void testEmbeddedFiles() throws Exception {
+        List<Metadata> embedded = extractEmbeddedDirect("testRTFEmbeddedFiles.rtf");
+        assertTrue(embedded.size() > 0,
+                "should extract at least one embedded object from testRTFEmbeddedFiles.rtf");
+    }
+
+    @Test
+    public void testPictExtraction() throws Exception {
+        // Verifies the handler doesn't crash on a typical RTF file
+        extractEmbeddedDirect("testRTF.rtf");
+    }
+
+    @Test
+    public void testEmbeddedObjectMetadata() throws Exception {
+        List<Metadata> embedded = extractEmbeddedDirect("testRTFEmbeddedFiles.rtf");
+        // without this, the name check below would pass vacuously on an empty result
+        assertTrue(embedded.size() > 0,
+                "should extract at least one embedded object from testRTFEmbeddedFiles.rtf");
+        boolean hasName = false;
+        for (Metadata m : embedded) {
+            String name = m.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+            if (name != null && !name.isEmpty()) {
+                hasName = true;
+                break;
+            }
+        }
+        assertTrue(hasName, "at least one embedded should have a resource name");
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java
new file mode 100644
index 0000000000..6d8df7534d
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java
@@ -0,0 +1,247 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for {@link RTFHtmlDecapsulator}, mirroring the original
+ * RTFEncapsulatedHTMLExtractorTest to verify parity.
+ */
+public class RTFHtmlDecapsulatorTest {
+
+    @Test
+    public void testNullAndEmpty() throws Exception {
+        assertNull(new RTFHtmlDecapsulator().extract(null));
+        assertNull(new RTFHtmlDecapsulator().extract(new byte[0]));
+    }
+
+    @Test
+    public void testNonEncapsulatedRtf() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\deff0 Hello world}";
+        assertNull(new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII)));
+    }
+
+    @Test
+    public void testSimpleEncapsulatedHtml() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag34 <head>}\n" +
+                "{\\*\\htmltag41 </head>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "{\\*\\htmltag84 Hello world}\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "\\htmlrtf }\\htmlrtf0\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<html>"));
+        assertTrue(html.contains("<p>"));
+        assertTrue(html.contains("Hello world"));
+        assertTrue(html.contains("</html>"));
+    }
+
+    @Test
+    public void testImgCidExtraction() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "{\\*\\htmltag84 <img src=\"cid:[email protected]\">}\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("cid:[email protected]"),
+                "CID reference should be preserved in extracted HTML");
+    }
+
+    @Test
+    public void testParAndTabDecoding() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag241 <style>}\n" +
+                "{\\*\\htmltag241 body \\{\\par \\tab color: red;\\par \\}}\n" +
+                "{\\*\\htmltag249 </style>}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<style>"));
+        assertTrue(html.contains("body {"));
+        assertTrue(html.contains("\tcolor: red;"));
+        assertTrue(html.contains("</style>"));
+    }
+
+    @Test
+    public void testHexEscapeDecoding() throws Exception {
+        // \'e9 = byte 0xE9 = e with acute (U+00E9) in windows-1252
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 caf\\'e9}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("caf\u00e9", html);
+    }
+
+    @Test
+    public void testMultiByteHexEscape() throws Exception {
+        // \'fc = u with diaeresis (U+00FC) and \'df = sharp s (U+00DF) in windows-1252
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 gr\\'fc\\'dfe}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("gr\u00fc\u00dfe", html);
+    }
+
+    @Test
+    public void testCodePage1254Turkish() throws Exception {
+        // \'fd is byte 0xFD; windows-1254 decodes it to dotless i (U+0131), unlike windows-1252 (U+00FD)
+        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 Say\\'fdn}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        // Verify the byte 0xFD is decoded through windows-1254
+        byte[] expected = new byte[] { 'S', 'a', 'y', (byte) 0xFD, 'n' };
+        assertEquals(new String(expected, java.nio.charset.Charset.forName("windows-1254")), html);
+    }
+
+    @Test
+    public void testHtmlrtfSkipping() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 Hello}\n" +
+                "\\htmlrtf {\\b bold rtf only}\\htmlrtf0\n" +
+                "{\\*\\htmltag84  World}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("Hello World", html);
+    }
+
+    @Test
+    public void testEscapedBracesAndBackslash() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag241 a \\{ b \\} c \\\\d}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("a { b } c \\d", html);
+    }
+
+    @Test
+    public void testEmptyHtmltag() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag72}\n" +
+                "{\\*\\htmltag84 text}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("text", html);
+    }
+
+    @Test
+    public void testInterTagTextContent() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "Hello from the message body\n" +
+                "\\htmlrtf\\par}\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "Second paragraph\n" +
+                "\\htmlrtf\\par}\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<p>"), "should contain HTML tags");
+        assertTrue(html.contains("Hello from the message body"),
+                "should contain inter-tag text content");
+        assertTrue(html.contains("Second paragraph"),
+                "should contain second paragraph text");
+        assertTrue(html.contains("</html>"), "should contain closing tag");
+    }
+
+    @Test
+    public void testInterTagHexEscapes() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "caf\\'e9\n" +
+                "\\htmlrtf }\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("caf\u00e9"), "hex escapes in inter-tag text should be decoded");
+    }
+
+    @Test
+    public void testLineControlWord() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 line1\\line line2}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("line1<br>line2", html);
+    }
+
+    @Test
+    public void testFontAwareCodePageDecoding() throws Exception {
+        // f0 = ANSI (fcharset 0 = windows-1252), f1 = Greek (fcharset 161 = cp1253)
+        // \'e1 in windows-1252 = U+00E1 (a with acute)
+        // \'e1 in cp1253 = U+03B1 (GREEK SMALL LETTER ALPHA)
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\fonttbl{\\f0\\fcharset0 Times;}{\\f1\\fcharset161 Greek;}}\n" +
+                "{\\*\\htmltag84 \\f0 caf\\'e9}\n" +
+                "{\\*\\htmltag84 \\f1 \\'e1}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        // f0: \'e9 in windows-1252 = e with acute
+        assertTrue(html.contains("caf\u00e9"), "f0 should decode as windows-1252");
+        // f1: \'e1 in cp1253 = Greek alpha
+        assertTrue(html.contains("\u03b1"), "f1 should decode as cp1253 (Greek)");
+    }
+
+    @Test
+    public void testUnicodeEscapeWithAnsiShadow() throws Exception {
+        // \u8212 is em dash (U+2014). The \'97 is the ANSI shadow and should be skipped.
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\fonttbl{\\f0\\fcharset0 Times;}}\n" +
+                "{\\*\\htmltag84 A\\u8212\\'97B}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("A\u2014B", html);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java
new file mode 100644
index 0000000000..97d6b0a7dc
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.StringReader;
+import java.nio.charset.Charset;
+
+import org.junit.jupiter.api.Test;
+
+public class RTFStateTest {
+
+    private RTFState processRtf(String rtf) throws Exception {
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            state.processToken(tok);
+        }
+        return state;
+    }
+
+    @Test
+    public void testGlobalCharsetFromAnsicpg() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\ansi\\ansicpg1251}");
+        assertEquals(Charset.forName("CP1251"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetDefaultWindows1252() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\ansi}");
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetPca() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\pca}");
+        assertEquals(Charset.forName("cp850"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetPc() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\pc}");
+        assertEquals(Charset.forName("cp437"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetMac() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\mac}");
+        assertEquals(Charset.forName("MacRoman"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testFontTableParsing() throws Exception {
+        // Realistic font table: f0=Times New Roman (ANSI), f1=MS Mincho (Shift_JIS)
+        String rtf = "{\\rtf1\\ansi\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times New Roman;}" +
+                "{\\f1\\fnil\\fcharset128 MS Mincho;}" +
+                "}" +
+                "\\f0 Hello}";
+        RTFState state = processRtf(rtf);
+
+        // fcharset 0 = ANSI = WINDOWS-1252
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getFontToCharset().get(0));
+        // fcharset 128 = Shift JIS = MS932
+        assertEquals(Charset.forName("MS932"), state.getFontToCharset().get(1));
+    }
+
+    @Test
+    public void testCurrentCharsetFollowsFont() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times;}" +
+                "{\\f1\\fnil\\fcharset161 Greek;}" +
+                "}" +
+                "\\f1 text}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new java.io.StringReader(rtf));
+        RTFState state = new RTFState();
+        Charset charsetAtText = null;
+
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            state.processToken(tok);
+            // Capture charset when we see the first body text char
+            if (tok.getType() == RTFTokenType.TEXT && "t".equals(tok.getName())
+                    && charsetAtText == null) {
+                charsetAtText = state.getCurrentCharset();
+            }
+        }
+
+        // Verify font table was populated
+        assertEquals(2, state.getFontToCharset().size());
+        assertEquals(Charset.forName("cp1253"), state.getFontToCharset().get(1));
+
+        // After \f1, charset should be cp1253 (Greek)
+        assertNotNull(charsetAtText);
+        assertEquals(Charset.forName("cp1253"), charsetAtText);
+    }
+
+    @Test
+    public void testCurrentCharsetFallsBackToGlobal() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times;}" +
+                "}" +
+                "\\f0 text}";
+        RTFState state = processRtf(rtf);
+
+        // \f0 is in effect: fcharset 0 = WINDOWS-1252 (ANSI), which is expected here
+        // rather than the \ansicpg1254 global charset
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getCurrentCharset());
+    }
+
+    @Test
+    public void testDefaultFontCharset() throws Exception {
+        // \deff1 sets default font to f1, which maps to fcharset 162 (Turkish = cp1254)
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff1" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times;}" +
+                "{\\f1\\fnil\\fcharset162 Arial;}" +
+                "}" +
+                "\\pard text}";
+        RTFState state = processRtf(rtf);
+
+        // No explicit \fN in body, so should fall back to deff1 -> fcharset 162 -> cp1254
+        assertEquals(Charset.forName("cp1254"), state.getCurrentCharset());
+    }
+
+    @Test
+    public void testUcSkipInherited() throws Exception {
+        // RTF uc control word sets skip count to 2, inherited by child groups
+        // We process token-by-token and check inside the inner group
+        String rtf = "{\\rtf1\\ansi\\uc2{inner}}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new java.io.StringReader(rtf));
+        RTFState state = new RTFState();
+
+        int ucSkipInInnerGroup = -1;
+        boolean seenInnerText = false;
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            state.processToken(tok);
+            // Check ucSkip when we see the first char of "inner"
+            if (tok.getType() == RTFTokenType.TEXT && "i".equals(tok.getName()) && !seenInnerText) {
+                ucSkipInInnerGroup = state.getCurrentGroup().ucSkip;
+                seenInnerText = true;
+            }
+        }
+        // Inside {inner}, ucSkip should be inherited as 2 from parent
+        assertEquals(2, ucSkipInInnerGroup);
+    }
+
+    @Test
+    public void testAnsiSkipAfterUnicode() throws Exception {
+        // After \u8212, the next ucSkip (default 1) ANSI chars should be skipped
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252" +
+                "{\\fonttbl{\\f0\\fcharset0 Times;}}" +
+                "\\f0 A\\u8212\\'97B}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+        StringBuilder textOutput = new StringBuilder();
+
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            boolean consumed = state.processToken(tok);
+            if (!consumed && !state.getCurrentGroup().ignore) {
+                if (tok.getType() == RTFTokenType.TEXT) {
+                    textOutput.append(tok.getName());
+                } else if (tok.getType() == RTFTokenType.UNICODE_ESCAPE) {
+                    int cp = tok.getParameter();
+                    if (Character.isValidCodePoint(cp)) {
+                        textOutput.appendCodePoint(cp);
+                    }
+                }
+            }
+        }
+        // A + \u8212 (em dash) + B.  The \'97 should be skipped as unicode shadow.
+        assertEquals("A\u2014B", textOutput.toString());
+    }
+
+    @Test
+    public void testGroupStateRestored() throws Exception {
+        // Font change inside a group should be reverted when group closes
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\fcharset0 Times;}" +
+                "{\\f1\\fcharset161 Greek;}" +
+                "}" +
+                "\\f0 {\\f1 greek}{back to times}}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+
+        Charset charsetInsideGroup = null;
+        Charset charsetAfterGroup = null;
+        boolean seenGreekGroup = false;
+        int bodyGroupDepth = 0;
+
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            state.processToken(tok);
+
+            if (tok.getType() == RTFTokenType.TEXT) {
+                String text = tok.getName();
+                if ("g".equals(text) && !seenGreekGroup) {
+                    // First char of "greek"
+                    charsetInsideGroup = state.getCurrentCharset();
+                    seenGreekGroup = true;
+                } else if ("b".equals(text)) {
+                    // First char of "back to times"
+                    charsetAfterGroup = state.getCurrentCharset();
+                }
+            }
+        }
+
+        assertNotNull(charsetInsideGroup);
+        assertNotNull(charsetAfterGroup);
+        // Inside the {\f1 ...} group, charset should be Greek (cp1253)
+        assertEquals(Charset.forName("cp1253"), charsetInsideGroup);
+        // After the group closes, should be back to f0 (WINDOWS-1252)
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, charsetAfterGroup);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java
new file mode 100644
index 0000000000..741fefb3e5
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+public class RTFTokenizerTest {
+
+    private List<RTFToken> tokenize(String input) throws Exception {
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(input));
+        List<RTFToken> tokens = new ArrayList<>();
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            // copy token since it's reused
+            RTFToken copy = new RTFToken();
+            copy.set(tok.getType(), tok.getName(), tok.getParameter(), tok.hasParameter());
+            tokens.add(copy);
+        }
+        return tokens;
+    }
+
+    @Test
+    public void testGroupOpenClose() throws Exception {
+        List<RTFToken> tokens = tokenize("{}");
+        assertEquals(2, tokens.size());
+        assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType());
+        assertEquals(RTFTokenType.GROUP_CLOSE, tokens.get(1).getType());
+    }
+
+    @Test
+    public void testControlWord() throws Exception {
+        List<RTFToken> tokens = tokenize("\\rtf1");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(0).getType());
+        assertEquals("rtf", tokens.get(0).getName());
+        assertEquals(1, tokens.get(0).getParameter());
+        assertTrue(tokens.get(0).hasParameter());
+    }
+
+    @Test
+    public void testControlWordNoParam() throws Exception {
+        List<RTFToken> tokens = tokenize("\\ansi");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(0).getType());
+        assertEquals("ansi", tokens.get(0).getName());
+        assertFalse(tokens.get(0).hasParameter());
+    }
+
+    @Test
+    public void testControlWordNegativeParam() throws Exception {
+        List<RTFToken> tokens = tokenize("\\u-4321");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.UNICODE_ESCAPE, tokens.get(0).getType());
+        // -4321 → 65536 - 4321 = 61215
+        assertEquals(61215, tokens.get(0).getParameter());
+    }
+
+    @Test
+    public void testHexEscape() throws Exception {
+        List<RTFToken> tokens = tokenize("\\'e9");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.HEX_ESCAPE, tokens.get(0).getType());
+        assertEquals(0xe9, tokens.get(0).getHexValue());
+    }
+
+    @Test
+    public void testUnicodeEscape() throws Exception {
+        List<RTFToken> tokens = tokenize("\\u8212");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.UNICODE_ESCAPE, tokens.get(0).getType());
+        assertEquals(8212, tokens.get(0).getParameter());
+    }
+
+    @Test
+    public void testBinControl() throws Exception {
+        List<RTFToken> tokens = tokenize("\\bin1024");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.BIN, tokens.get(0).getType());
+        assertEquals(1024, tokens.get(0).getParameter());
+    }
+
+    @Test
+    public void testControlSymbol() throws Exception {
+        List<RTFToken> tokens = tokenize("\\~");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(0).getType());
+        assertEquals("~", tokens.get(0).getName());
+    }
+
+    @Test
+    public void testEscapedBraces() throws Exception {
+        List<RTFToken> tokens = tokenize("\\{\\}\\\\");
+        assertEquals(3, tokens.size());
+        assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(0).getType());
+        assertEquals("{", tokens.get(0).getName());
+        assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(1).getType());
+        assertEquals("}", tokens.get(1).getName());
+        assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(2).getType());
+        assertEquals("\\", tokens.get(2).getName());
+    }
+
+    @Test
+    public void testText() throws Exception {
+        List<RTFToken> tokens = tokenize("Hello");
+        assertEquals(5, tokens.size()); // one char at a time
+        for (RTFToken t : tokens) {
+            assertEquals(RTFTokenType.TEXT, t.getType());
+        }
+        StringBuilder sb = new StringBuilder();
+        for (RTFToken t : tokens) {
+            sb.append(t.getName());
+        }
+        assertEquals("Hello", sb.toString());
+    }
+
+    @Test
+    public void testCrLf() throws Exception {
+        List<RTFToken> tokens = tokenize("a\r\nb");
+        assertEquals(3, tokens.size());
+        assertEquals(RTFTokenType.TEXT, tokens.get(0).getType());
+        assertEquals(RTFTokenType.CRLF, tokens.get(1).getType());
+        assertEquals(RTFTokenType.TEXT, tokens.get(2).getType());
+    }
+
+    @Test
+    public void testIgnorableDestination() throws Exception {
+        // {  \*  \htmltag84_  <  p  >  }
+        // The space after \htmltag84 is consumed as the control word delimiter
+        List<RTFToken> tokens = tokenize("{\\*\\htmltag84 <p>}");
+        assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType());
+        assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(1).getType());
+        assertEquals("*", tokens.get(1).getName());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(2).getType());
+        assertEquals("htmltag", tokens.get(2).getName());
+        assertEquals(84, tokens.get(2).getParameter());
+        // remaining tokens are < p > }
+        assertEquals(RTFTokenType.TEXT, tokens.get(3).getType());
+        assertEquals("<", tokens.get(3).getName());
+        assertEquals(RTFTokenType.TEXT, tokens.get(4).getType());
+        assertEquals("p", tokens.get(4).getName());
+        assertEquals(RTFTokenType.TEXT, tokens.get(5).getType());
+        assertEquals(">", tokens.get(5).getName());
+        assertEquals(RTFTokenType.GROUP_CLOSE, tokens.get(6).getType());
+        assertEquals(7, tokens.size());
+    }
+
+    @Test
+    public void testMixedRtf() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252 Hello}";
+        List<RTFToken> tokens = tokenize(rtf);
+        // { \rtf1 \ansi \ansicpg1252 H e l l o }
+        // (the space after \ansicpg1252 is consumed as the control word delimiter)
+        assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(1).getType());
+        assertEquals("rtf", tokens.get(1).getName());
+        assertEquals(1, tokens.get(1).getParameter());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(2).getType());
+        assertEquals("ansi", tokens.get(2).getName());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(3).getType());
+        assertEquals("ansicpg", tokens.get(3).getName());
+        assertEquals(1252, tokens.get(3).getParameter());
+    }
+}

(tika) 01/02: jflex rtf parser - WIP

Reply via email to