This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4710-rtf-attachments-in-html-decapsulation in repository https://gitbox.apache.org/repos/asf/tika.git
commit c37df9d84ede2599ae3c96b00b6676f9b149f356 Author: tallison <[email protected]> AuthorDate: Mon Apr 6 08:21:47 2026 -0400 jflex rtf parser - WIP --- .../tika-parser-microsoft-module/pom.xml | 17 + .../tika/parser/microsoft/OutlookExtractor.java | 8 +- .../parser/microsoft/rtf/RTFObjDataParser.java | 6 +- .../parser/microsoft/rtf/jflex/RTFCharsetMaps.java | 180 +++++++ .../microsoft/rtf/jflex/RTFEmbeddedHandler.java | 339 +++++++++++++ .../parser/microsoft/rtf/jflex/RTFGroupState.java | 76 +++ .../microsoft/rtf/jflex/RTFHtmlDecapsulator.java | 284 +++++++++++ .../rtf/jflex/RTFObjDataStreamParser.java | 534 +++++++++++++++++++++ .../microsoft/rtf/jflex/RTFPictStreamParser.java | 104 ++++ .../tika/parser/microsoft/rtf/jflex/RTFState.java | 336 +++++++++++++ .../tika/parser/microsoft/rtf/jflex/RTFToken.java | 96 ++++ .../parser/microsoft/rtf/jflex/RTFTokenType.java | 30 ++ .../parser/microsoft/rtf/jflex/RTFTokenizer.jflex | 129 +++++ .../rtf/jflex/RTFEmbeddedHandlerTest.java | 132 +++++ .../rtf/jflex/RTFHtmlDecapsulatorTest.java | 247 ++++++++++ .../parser/microsoft/rtf/jflex/RTFStateTest.java | 252 ++++++++++ .../microsoft/rtf/jflex/RTFTokenizerTest.java | 187 ++++++++ 17 files changed, 2951 insertions(+), 6 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml index 63cc9605cd..906914e132 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml @@ -125,6 +125,23 @@ </dependencies> <build> <plugins> + <plugin> + <groupId>de.jflex</groupId> + <artifactId>jflex-maven-plugin</artifactId> + <version>1.9.1</version> + <executions> + <execution> + <goals> + <goal>generate</goal> + </goals> + <configuration> + <lexDefinitions> + 
<lexDefinition>src/main/jflex</lexDefinition> + </lexDefinitions> + </configuration> + </execution> + </executions> + </plugin> <plugin> <groupId>org.apache.rat</groupId> <artifactId>apache-rat-plugin</artifactId> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index a2ef6de04f..be8c419cb7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -84,8 +84,8 @@ import org.apache.tika.parser.html.HtmlEncodingDetector; import org.apache.tika.parser.html.JSoupParser; import org.apache.tika.parser.mailcommons.MailDateParser; import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor; -import org.apache.tika.parser.microsoft.msg.RTFEncapsulatedHTMLExtractor; import org.apache.tika.parser.microsoft.rtf.RTFParser; +import org.apache.tika.parser.microsoft.rtf.jflex.RTFHtmlDecapsulator; import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetMatch; import org.apache.tika.sax.BodyContentHandler; @@ -600,8 +600,10 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()); byte[] rtfData = rtf.getData(); - // Try to extract encapsulated HTML — returns null if not present - String html = RTFEncapsulatedHTMLExtractor.extract(rtfData); + // Try to extract encapsulated HTML + embedded objects in one pass + RTFHtmlDecapsulator decapsulator = + new 
RTFHtmlDecapsulator(xhtml, parseContext, 20 * 1024); + String html = decapsulator.extract(rtfData); if (html != null) { parseHtmlString(html, xhtml, contentIdNames); parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java index ff4c12061e..48f88e2f7f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java @@ -51,12 +51,12 @@ import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType; * http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf * and for granting permission to use his code in Tika. 
*/ -class RTFObjDataParser { +public class RTFObjDataParser { private final static String WIN_ASCII = "WINDOWS-1252"; private final int memoryLimitInKb; - RTFObjDataParser(int memoryLimitInKb) { + public RTFObjDataParser(int memoryLimitInKb) { this.memoryLimitInKb = memoryLimitInKb; } @@ -81,7 +81,7 @@ class RTFObjDataParser { * @return byte[] for contents of obj data * @throws IOException */ - protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount) + public byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException, TikaException { UnsynchronizedByteArrayInputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get(); long version = readUInt(is); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java new file mode 100644 index 0000000000..aaac2552ac --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.tika.utils.CharsetUtils; + +/** + * Shared charset maps for RTF parsing. Maps RTF {@code \fcharsetN} and + * {@code \ansicpgN} values to Java {@link Charset} instances. + * + * <p>Extracted from the original {@code TextExtractor} so both the JFlex-based + * parser and decapsulator can reuse them.</p> + */ +public final class RTFCharsetMaps { + + public static final Charset WINDOWS_1252 = Charset.forName("windows-1252"); + + /** + * Maps {@code \fcharsetN} values to Java charsets. + * The RTF font table uses these to declare per-font character encodings. + */ + public static final Map<Integer, Charset> FCHARSET_MAP; + + /** + * Maps {@code \ansicpgN} values to Java charsets. + * This is the global ANSI code page declared in the RTF header. 
+ */ + public static final Map<Integer, Charset> ANSICPG_MAP; + + static { + Map<Integer, Charset> fcharset = new HashMap<>(); + + fcharset.put(0, WINDOWS_1252); // ANSI + // charset 1 = Default, charset 2 = Symbol + + fcharset.put(77, getCharset("MacRoman")); // Mac Roman + fcharset.put(78, getCharset("Shift_JIS")); // Mac Shift Jis + fcharset.put(79, getCharset("ms949")); // Mac Hangul + fcharset.put(80, getCharset("GB2312")); // Mac GB2312 + fcharset.put(81, getCharset("Big5")); // Mac Big5 + fcharset.put(82, getCharset("johab")); // Mac Johab (old) + fcharset.put(83, getCharset("MacHebrew")); // Mac Hebrew + fcharset.put(84, getCharset("MacArabic")); // Mac Arabic + fcharset.put(85, getCharset("MacGreek")); // Mac Greek + fcharset.put(86, getCharset("MacTurkish")); // Mac Turkish + fcharset.put(87, getCharset("MacThai")); // Mac Thai + fcharset.put(88, getCharset("cp1250")); // Mac East Europe + fcharset.put(89, getCharset("cp1251")); // Mac Russian + + fcharset.put(128, getCharset("MS932")); // Shift JIS + fcharset.put(129, getCharset("ms949")); // Hangul + fcharset.put(130, getCharset("ms1361")); // Johab + fcharset.put(134, getCharset("ms936")); // GB2312 + fcharset.put(136, getCharset("ms950")); // Big5 + fcharset.put(161, getCharset("cp1253")); // Greek + fcharset.put(162, getCharset("cp1254")); // Turkish + fcharset.put(163, getCharset("cp1258")); // Vietnamese + fcharset.put(177, getCharset("cp1255")); // Hebrew + fcharset.put(178, getCharset("cp1256")); // Arabic + fcharset.put(186, getCharset("cp1257")); // Baltic + + fcharset.put(204, getCharset("cp1251")); // Russian + fcharset.put(222, getCharset("ms874")); // Thai + fcharset.put(238, getCharset("cp1250")); // Eastern European + fcharset.put(254, getCharset("cp437")); // PC 437 + fcharset.put(255, getCharset("cp850")); // OEM + + FCHARSET_MAP = Collections.unmodifiableMap(fcharset); + } + + static { + Map<Integer, Charset> ansicpg = new HashMap<>(); + + ansicpg.put(437, getCharset("CP437")); // US 
IBM + ansicpg.put(708, getCharset("ISO-8859-6")); // Arabic (ASMO 708) + ansicpg.put(709, getCharset("windows-709")); // Arabic (ASMO 449+) + ansicpg.put(710, getCharset("windows-710")); // Arabic (transparent) + ansicpg.put(711, getCharset("windows-711")); // Arabic (Nafitha) + ansicpg.put(720, getCharset("windows-720")); // Arabic (transparent ASMO) + ansicpg.put(819, getCharset("CP819")); // Windows 3.1 (US/Western) + ansicpg.put(850, getCharset("CP850")); // IBM Multilingual + ansicpg.put(852, getCharset("CP852")); // Eastern European + ansicpg.put(860, getCharset("CP860")); // Portuguese + ansicpg.put(862, getCharset("CP862")); // Hebrew + ansicpg.put(863, getCharset("CP863")); // French Canadian + ansicpg.put(864, getCharset("CP864")); // Arabic + ansicpg.put(865, getCharset("CP865")); // Norwegian + ansicpg.put(866, getCharset("CP866")); // Soviet Union + ansicpg.put(874, getCharset("MS874")); // Thai + ansicpg.put(932, getCharset("MS932")); // Japanese + ansicpg.put(936, getCharset("MS936")); // Simplified Chinese + ansicpg.put(949, getCharset("CP949")); // Korean + ansicpg.put(950, getCharset("CP950")); // Traditional Chinese + ansicpg.put(1250, getCharset("CP1250")); // Eastern European + ansicpg.put(1251, getCharset("CP1251")); // Cyrillic + ansicpg.put(1252, getCharset("CP1252")); // Western European + ansicpg.put(1253, getCharset("CP1253")); // Greek + ansicpg.put(1254, getCharset("CP1254")); // Turkish + ansicpg.put(1255, getCharset("CP1255")); // Hebrew + ansicpg.put(1256, getCharset("CP1256")); // Arabic + ansicpg.put(1257, getCharset("CP1257")); // Baltic + ansicpg.put(1258, getCharset("CP1258")); // Vietnamese + ansicpg.put(1361, getCharset("x-Johab")); // Johab + ansicpg.put(10000, getCharset("MacRoman")); // Mac Roman + ansicpg.put(10001, getCharset("Shift_JIS")); // Mac Japan + ansicpg.put(10004, getCharset("MacArabic")); // Mac Arabic + ansicpg.put(10005, getCharset("MacHebrew")); // Mac Hebrew + ansicpg.put(10006, getCharset("MacGreek")); // 
Mac Greek + ansicpg.put(10007, getCharset("MacCyrillic")); // Mac Cyrillic + ansicpg.put(10029, getCharset("x-MacCentralEurope")); // Mac Latin2 + ansicpg.put(10081, getCharset("MacTurkish")); // Mac Turkish + ansicpg.put(57002, getCharset("x-ISCII91")); // Devanagari + ansicpg.put(57003, getCharset("windows-57003")); // Bengali + ansicpg.put(57004, getCharset("windows-57004")); // Tamil + ansicpg.put(57005, getCharset("windows-57005")); // Telugu + ansicpg.put(57006, getCharset("windows-57006")); // Assamese + ansicpg.put(57007, getCharset("windows-57007")); // Oriya + ansicpg.put(57008, getCharset("windows-57008")); // Kannada + ansicpg.put(57009, getCharset("windows-57009")); // Malayalam + ansicpg.put(57010, getCharset("windows-57010")); // Gujarati + ansicpg.put(57011, getCharset("windows-57011")); // Punjabi + + ANSICPG_MAP = Collections.unmodifiableMap(ansicpg); + } + + private RTFCharsetMaps() { + } + + /** + * Resolve a charset by name, falling back to US-ASCII if unavailable. + */ + static Charset getCharset(String name) { + try { + return CharsetUtils.forName(name); + } catch (IllegalArgumentException e) { + return StandardCharsets.US_ASCII; + } + } + + /** + * Resolve an ANSI code page number to a Java Charset. + * Tries the ANSICPG_MAP first, then falls back to {@code windows-N} and {@code cpN}. + * Returns {@code WINDOWS_1252} if nothing matches. 
+ */ + public static Charset resolveCodePage(int cpNumber) { + Charset cs = ANSICPG_MAP.get(cpNumber); + if (cs != null) { + return cs; + } + try { + return Charset.forName("windows-" + cpNumber); + } catch (Exception e) { + try { + return Charset.forName("cp" + cpNumber); + } catch (Exception e2) { + return WINDOWS_1252; + } + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java new file mode 100644 index 0000000000..dd8b052b9e --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java @@ -0,0 +1,339 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.io.FilenameUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.RTFMetadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; + +/** + * Handles embedded objects and pictures within the JFlex-based RTF token stream. + * + * <p>Uses streaming parsers ({@link RTFObjDataStreamParser} and + * {@link RTFPictStreamParser}) so that large embedded objects are written + * to temp files rather than buffered entirely in memory.</p> + */ +public class RTFEmbeddedHandler { + + private static final String EMPTY_STRING = ""; + + private final ContentHandler handler; + private final ParseContext context; + private final EmbeddedDocumentUtil embeddedDocumentUtil; + private final long maxBytes; + + private boolean inObject = false; + private boolean isPictBitmap = false; + private int hi = -1; + private int thumbCount = 0; + private final AtomicInteger unknownFilenameCount = new AtomicInteger(); + + // Shape property metadata + private String sn = EMPTY_STRING; + private String sv = EMPTY_STRING; + private final StringBuilder metadataBuffer = new StringBuilder(); + + private Metadata metadata; + private EmbState state = EmbState.NADA; + + // Streaming parsers — one active at a time + private RTFObjDataStreamParser objParser; + private RTFPictStreamParser pictParser; + + public RTFEmbeddedHandler(ContentHandler handler, ParseContext context, + int memoryLimitInKb) { + this.handler = handler; + this.context = context; + this.embeddedDocumentUtil 
= new EmbeddedDocumentUtil(context); + this.maxBytes = memoryLimitInKb > 0 ? (long) memoryLimitInKb * 1024 : -1; + this.metadata = Metadata.newInstance(context); + } + + /** + * Process a token for embedded object/pict handling. + * Call this AFTER {@link RTFState#processToken(RTFToken)} has run. + * + * @param tok the current token + * @param rtfState the RTF state (already updated for this token) + * @param closingGroup for GROUP_CLOSE tokens, the group state that just closed. + * Null for other token types. + */ + public void processToken(RTFToken tok, RTFState rtfState, RTFGroupState closingGroup) + throws IOException, SAXException, TikaException { + RTFTokenType type = tok.getType(); + RTFGroupState group = rtfState.getCurrentGroup(); + + switch (type) { + case GROUP_CLOSE: + if (closingGroup == null) { + break; + } + if (closingGroup.objdata) { + handleCompletedObjData(); + } else if (closingGroup.pictDepth == 1) { + handleCompletedPict(); + } else if (closingGroup.sn) { + endSN(); + } else if (closingGroup.sv) { + endSV(); + } else if (closingGroup.sp) { + endSP(); + } + if (closingGroup.object) { + inObject = false; + } + break; + + case CONTROL_WORD: + String name = tok.getName(); + switch (name) { + case "object": + inObject = true; + break; + case "objdata": + startObjData(); + break; + case "pict": + startPict(); + break; + case "sn": + startSN(); + break; + case "sv": + startSV(); + break; + case "wbitmap": + isPictBitmap = true; + break; + } + break; + + case TEXT: + if (group.objdata || group.pictDepth == 1) { + String text = tok.getName(); + for (int i = 0; i < text.length(); i++) { + writeHexChar(text.charAt(i)); + } + } else if (group.sn || group.sv) { + String text = tok.getName(); + for (int i = 0; i < text.length(); i++) { + metadataBuffer.append(text.charAt(i)); + } + } + break; + + case HEX_ESCAPE: + if (group.sn || group.sv) { + metadataBuffer.append((char) tok.getHexValue()); + } + break; + + default: + break; + } + } + + // --- Lifecycle 
for objdata --- + + private void startObjData() throws IOException { + state = EmbState.OBJDATA; + metadata = Metadata.newInstance(context); + objParser = new RTFObjDataStreamParser(maxBytes); + } + + private void handleCompletedObjData() throws IOException, SAXException, TikaException { + if (objParser == null) { + reset(); + return; + } + try { + TikaInputStream tis = objParser.onComplete(metadata, unknownFilenameCount); + if (tis != null) { + try { + extractObj(tis, metadata); + } finally { + tis.close(); + } + } + } catch (IOException e) { + EmbeddedDocumentUtil.recordException(e, metadata); + } finally { + objParser.close(); + objParser = null; + reset(); + } + } + + // --- Lifecycle for pict --- + + private void startPict() throws IOException { + state = EmbState.PICT; + metadata = Metadata.newInstance(context); + pictParser = new RTFPictStreamParser(maxBytes); + } + + private void handleCompletedPict() throws IOException, SAXException, TikaException { + if (pictParser == null) { + reset(); + return; + } + try { + Path pictFile = pictParser.onComplete(); + if (pictFile != null) { + String filePath = + metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription"); + if (filePath != null && !filePath.isEmpty()) { + metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filePath); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, + FilenameUtils.getName(filePath)); + metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, filePath); + } + metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject)); + if (isPictBitmap) { + metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, + "image/x-rtf-raw-bitmap"); + } + + try (TikaInputStream tis = TikaInputStream.get(pictFile)) { + extractObj(tis, metadata); + } + } + } catch (IOException e) { + EmbeddedDocumentUtil.recordException(e, metadata); + } finally { + pictParser.close(); + pictParser = null; + reset(); + } + } + + // --- Shape property metadata --- + + private void startSN() { + 
metadataBuffer.setLength(0); + metadataBuffer.append(RTFMetadata.RTF_PICT_META_PREFIX); + } + + private void endSN() { + sn = metadataBuffer.toString(); + } + + private void startSV() { + metadataBuffer.setLength(0); + } + + private void endSV() { + sv = metadataBuffer.toString(); + } + + private void endSP() { + metadata.add(sn, sv); + } + + // --- Hex pair decoding --- + + private void writeHexChar(int b) throws IOException, TikaException { + if (isHexChar(b)) { + if (hi == -1) { + hi = 16 * hexValue(b); + } else { + int decoded = hi + hexValue(b); + hi = -1; + // Route the decoded byte to the active streaming parser + if (objParser != null) { + objParser.onByte(decoded); + } else if (pictParser != null) { + pictParser.onByte(decoded); + } + } + } + } + + // --- Common extraction --- + + private void extractObj(TikaInputStream tis, Metadata meta) + throws SAXException, IOException, TikaException { + meta.set(Metadata.CONTENT_LENGTH, Long.toString(tis.getLength())); + + if (embeddedDocumentUtil.shouldParseEmbedded(meta)) { + if (meta.get(TikaCoreProperties.RESOURCE_NAME_KEY) == null) { + String extension = embeddedDocumentUtil.getExtension(tis, meta); + if (inObject && state == EmbState.PICT) { + meta.set(TikaCoreProperties.RESOURCE_NAME_KEY, + EmbeddedDocumentUtil.EmbeddedResourcePrefix.THUMBNAIL.getPrefix() + + "-" + thumbCount++ + extension); + meta.set(RTFMetadata.THUMBNAIL, "true"); + } else { + meta.set(TikaCoreProperties.RESOURCE_NAME_KEY, + EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED.getPrefix() + + "-" + unknownFilenameCount.getAndIncrement() + + extension); + } + meta.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true); + } + try { + embeddedDocumentUtil.parseEmbedded( + tis, new EmbeddedContentHandler(handler), meta, true); + } catch (IOException e) { + EmbeddedDocumentUtil.recordEmbeddedStreamException(e, meta); + } + } + } + + private void reset() { + state = EmbState.NADA; + metadata = Metadata.newInstance(context); + hi = -1; + 
sn = EMPTY_STRING; + sv = EMPTY_STRING; + metadataBuffer.setLength(0); + isPictBitmap = false; + } + + private static boolean isHexChar(int ch) { + return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'); + } + + private static int hexValue(int ch) { + if (ch >= '0' && ch <= '9') { + return ch - '0'; + } else if (ch >= 'a' && ch <= 'z') { + return 10 + (ch - 'a'); + } else { + return 10 + (ch - 'A'); + } + } + + private enum EmbState { + PICT, + OBJDATA, + NADA + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java new file mode 100644 index 0000000000..c5f9f8c444 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import java.nio.charset.Charset; + +/** + * State associated with a single RTF group ({@code \{ ... \}}). + * <p> + * When a new group opens, the current state is pushed onto the stack and a + * child state is created that inherits the parent's properties. When the group + * closes, the state is popped. + */ +public class RTFGroupState { + + /** Nesting depth (0 = root). */ + int depth; + + /** Current font charset, set by {@code \fN} if the font table maps it. May be null. */ + Charset fontCharset; + + /** Current font ID, set by {@code \fN}. -1 if unset. */ + int fontId = -1; + + /** Number of ANSI chars to skip after a unicode escape (ucN control word). Default 1. */ + int ucSkip = 1; + + /** True if this group's content should be ignored (e.g. {@code \*} destination). */ + boolean ignore; + + /** True if bold. */ + boolean bold; + + /** True if italic. */ + boolean italic; + + // Embedded object / picture state + boolean objdata; + int pictDepth; + boolean sp; + boolean sn; + boolean sv; + boolean object; + boolean annotation; + + /** Create a root group state with defaults. */ + public RTFGroupState() { + } + + /** Create a child group state inheriting from the parent. */ + public RTFGroupState(RTFGroupState parent) { + this.depth = parent.depth + 1; + this.fontCharset = parent.fontCharset; + this.fontId = parent.fontId; + this.ucSkip = parent.ucSkip; + this.ignore = parent.ignore; + this.bold = parent.bold; + this.italic = parent.italic; + this.pictDepth = parent.pictDepth > 0 ? 
parent.pictDepth + 1 : 0; + // objdata, sp, sn, sv, object, annotation are NOT inherited + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java new file mode 100644 index 0000000000..9c5d0cee90 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.StringReader; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; + +/** + * Extracts the original HTML from an RTF document that contains encapsulated HTML + * (as indicated by the {@code \fromhtml1} control word), using a JFlex-based tokenizer + * and shared {@link RTFState} for font/codepage tracking. + * + * <p>Embedded objects and pictures are extracted in the same pass via + * {@link RTFEmbeddedHandler}.</p> + */ +public class RTFHtmlDecapsulator { + + private static final int DEFAULT_MEMORY_LIMIT_KB = 20 * 1024; // 20 MB + + private final ContentHandler handler; + private final ParseContext context; + private final int memoryLimitInKb; + + /** + * Creates a decapsulator that extracts embedded objects through the given handler. + * + * @param handler the content handler for embedded document extraction + * @param context the parse context (provides EmbeddedDocumentExtractor, etc.) + * @param memoryLimitInKb max bytes per embedded object (in KB), or -1 for unlimited + */ + public RTFHtmlDecapsulator(ContentHandler handler, ParseContext context, + int memoryLimitInKb) { + this.handler = handler; + this.context = context; + this.memoryLimitInKb = memoryLimitInKb; + } + + /** + * Creates a decapsulator with default memory limit and no embedded extraction. + */ + public RTFHtmlDecapsulator() { + this(null, null, DEFAULT_MEMORY_LIMIT_KB); + } + + /** + * Extracts the HTML content from an encapsulated-HTML RTF document. + * Embedded objects and pictures are extracted as a side effect through + * the {@link ContentHandler} provided at construction time. 
+ * + * @param rtfBytes the decompressed RTF bytes + * @return the extracted HTML string, or {@code null} if the RTF does not contain + * encapsulated HTML + * @throws IOException if the tokenizer encounters an I/O error + */ + public String extract(byte[] rtfBytes) throws IOException, SAXException, TikaException { + if (rtfBytes == null || rtfBytes.length == 0) { + return null; + } + + String rtf = new String(rtfBytes, StandardCharsets.US_ASCII); + + RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf)); + RTFState state = new RTFState(); + RTFEmbeddedHandler embHandler = (handler != null && context != null) + ? new RTFEmbeddedHandler(handler, context, memoryLimitInKb) + : null; + + StringBuilder html = new StringBuilder(rtf.length() / 2); + ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream(); + + boolean foundFromHtml = false; + boolean foundHtmlTag = false; + boolean inHtmlRtfSkip = false; + + boolean sawIgnorable = false; + int htmlTagDepth = -1; + boolean inHtmlTag = false; + + RTFToken tok; + while ((tok = tokenizer.yylex()) != null) { + RTFTokenType type = tok.getType(); + + if (type == RTFTokenType.EOF) { + break; + } + + // Flush pending bytes before charset-changing events + if (type == RTFTokenType.GROUP_CLOSE + || (type == RTFTokenType.CONTROL_WORD && "f".equals(tok.getName()) + && tok.hasParameter())) { + flushPendingBytes(pendingBytes, html, state); + } + + // Let RTFState handle group stack, font table, codepage, unicode skip + boolean consumed = state.processToken(tok); + + // Let embedded handler process objdata/pict/sp in the same pass + if (embHandler != null && !consumed) { + RTFGroupState closingGroup = + (type == RTFTokenType.GROUP_CLOSE) ? 
state.getLastClosedGroup() : null; + try { + embHandler.processToken(tok, state, closingGroup); + } catch (TikaException | IOException e) { + // record and continue — don't let a bad embedded object kill decapsulation + } + } + + RTFGroupState group = state.getCurrentGroup(); + + // Skip tokens that are part of objdata/pict hex streams + if (!consumed && (group.objdata || group.pictDepth > 0)) { + // Embedded handler already consumed these + continue; + } + + switch (type) { + case GROUP_OPEN: + sawIgnorable = false; + break; + + case GROUP_CLOSE: + if (inHtmlTag && state.getDepth() < htmlTagDepth) { + flushPendingBytes(pendingBytes, html, state); + inHtmlTag = false; + htmlTagDepth = -1; + } + break; + + case CONTROL_SYMBOL: + if ("*".equals(tok.getName())) { + sawIgnorable = true; + } + if (!foundHtmlTag || inHtmlRtfSkip) { + break; + } + if (inHtmlTag || isContentArea(htmlTagDepth)) { + String sym = tok.getName(); + if ("{".equals(sym) || "}".equals(sym) || "\\".equals(sym)) { + flushPendingBytes(pendingBytes, html, state); + html.append(sym); + } + } + break; + + case CONTROL_WORD: + if (consumed) { + break; + } + String name = tok.getName(); + + if ("fromhtml".equals(name)) { + foundFromHtml = true; + break; + } + + if ("htmltag".equals(name) && sawIgnorable) { + if (!foundFromHtml) { + break; + } + foundHtmlTag = true; + flushPendingBytes(pendingBytes, html, state); + inHtmlTag = true; + htmlTagDepth = state.getDepth(); + break; + } + + if ("htmlrtf".equals(name)) { + flushPendingBytes(pendingBytes, html, state); + inHtmlRtfSkip = !(tok.hasParameter() && tok.getParameter() == 0); + break; + } + + if (!foundHtmlTag || inHtmlRtfSkip) { + break; + } + + if (inHtmlTag || isContentArea(htmlTagDepth)) { + flushPendingBytes(pendingBytes, html, state); + switch (name) { + case "par": + case "pard": + html.append('\n'); + break; + case "tab": + html.append('\t'); + break; + case "line": + html.append("<br>"); + break; + default: + break; + } + } + break; + + case 
HEX_ESCAPE: + if (consumed) { + break; + } + if (!foundHtmlTag || inHtmlRtfSkip) { + break; + } + if (inHtmlTag || isContentArea(htmlTagDepth)) { + pendingBytes.write(tok.getHexValue()); + } + break; + + case UNICODE_ESCAPE: + if (!foundHtmlTag || inHtmlRtfSkip) { + break; + } + if (inHtmlTag || isContentArea(htmlTagDepth)) { + flushPendingBytes(pendingBytes, html, state); + int cp = tok.getParameter(); + if (Character.isValidCodePoint(cp)) { + html.appendCodePoint(cp); + } + } + break; + + case TEXT: + if (consumed) { + break; + } + if (!foundHtmlTag || inHtmlRtfSkip) { + break; + } + if (inHtmlTag || isContentArea(htmlTagDepth)) { + flushPendingBytes(pendingBytes, html, state); + html.append(tok.getName()); + } + break; + + case CRLF: + case BIN: + default: + break; + } + } + + flushPendingBytes(pendingBytes, html, state); + + if (!foundFromHtml || html.length() == 0) { + return null; + } + return html.toString(); + } + + private static boolean isContentArea(int htmlTagDepth) { + return htmlTagDepth == -1; + } + + private static void flushPendingBytes(ByteArrayOutputStream pending, StringBuilder out, + RTFState state) { + if (pending.size() > 0) { + Charset cs = state.getCurrentCharset(); + out.append(new String(pending.toByteArray(), cs)); + pending.reset(); + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java new file mode 100644 index 0000000000..c45b0a3817 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java @@ -0,0 +1,534 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one 
or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.Closeable; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Locale; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.IOUtils; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.FileMagic; +import org.apache.poi.poifs.filesystem.Ole10Native; +import org.apache.poi.poifs.filesystem.Ole10NativeException; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.TikaMemoryLimitException; +import org.apache.tika.io.BoundedInputStream; +import org.apache.tika.io.EndianUtils; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import 
org.apache.tika.metadata.RTFMetadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType; + +/** + * Parses OLE objdata from an RTF stream inline, byte by byte. + * + * <p>The OLE objdata structure is: + * <pre> + * [4 bytes version][4 bytes formatId] + * [4 bytes classNameLen][classNameLen bytes className] + * [4 bytes topicNameLen][topicNameLen bytes topicName] + * [4 bytes itemNameLen][itemNameLen bytes itemName] + * [4 bytes dataSz][dataSz bytes payload] + * </pre> + * The small header fields are parsed byte-by-byte via a state machine. + * Once the header is complete and {@code dataSz} is known, the payload + * bytes stream directly to a temp file — never buffered in memory.</p> + * + * <p>On {@link #onComplete(Metadata, AtomicInteger)}, the payload is + * interpreted based on {@code className} (Package, PBrush, POIFS, etc.) + * and the extracted content is returned as a {@link TikaInputStream}.</p> + */ +public class RTFObjDataStreamParser implements Closeable { + + private static final String WIN_ASCII = "WINDOWS-1252"; + + private final long maxBytes; + + // State machine + private Field currentField = Field.VERSION; + private byte[] fieldBuf = new byte[4]; + private int fieldPos; + private int fieldTarget = 4; + + // Parsed header values + private long version; + private long formatId; + private String className; + private String topicName; + private String itemName; + private long dataSz; + + // String accumulator for length-prefixed ANSI strings + private byte[] stringBuf; + private int stringPos; + + // Payload streaming + private Path tempFile; + private OutputStream dataOut; + private long dataWritten; + + /** + * @param maxBytes maximum payload bytes to accept (-1 for unlimited) + */ + public RTFObjDataStreamParser(long maxBytes) { + this.maxBytes = maxBytes; + } + + /** + * Receive a single decoded byte from the objdata hex stream. 
+ */ + public void onByte(int b) throws IOException, TikaException { + switch (currentField) { + case VERSION: + fieldBuf[fieldPos++] = (byte) b; + if (fieldPos >= fieldTarget) { + version = readLE32(fieldBuf); + initUint32Field(Field.FORMAT_ID); + } + break; + + case FORMAT_ID: + fieldBuf[fieldPos++] = (byte) b; + if (fieldPos >= fieldTarget) { + formatId = readLE32(fieldBuf); + if (formatId != 2L) { + // Not an embedded object (1 = link). Skip everything. + currentField = Field.SKIP; + } else { + initUint32Field(Field.CLASS_LEN); + } + } + break; + + case CLASS_LEN: + fieldBuf[fieldPos++] = (byte) b; + if (fieldPos >= fieldTarget) { + int len = (int) readLE32(fieldBuf); + initStringField(Field.CLASS_NAME, len); + } + break; + + case CLASS_NAME: + stringBuf[stringPos++] = (byte) b; + if (stringPos >= fieldTarget) { + className = decodeString(stringBuf, fieldTarget); + initUint32Field(Field.TOPIC_LEN); + } + break; + + case TOPIC_LEN: + fieldBuf[fieldPos++] = (byte) b; + if (fieldPos >= fieldTarget) { + int len = (int) readLE32(fieldBuf); + initStringField(Field.TOPIC_NAME, len); + } + break; + + case TOPIC_NAME: + stringBuf[stringPos++] = (byte) b; + if (stringPos >= fieldTarget) { + topicName = decodeString(stringBuf, fieldTarget); + initUint32Field(Field.ITEM_LEN); + } + break; + + case ITEM_LEN: + fieldBuf[fieldPos++] = (byte) b; + if (fieldPos >= fieldTarget) { + int len = (int) readLE32(fieldBuf); + initStringField(Field.ITEM_NAME, len); + } + break; + + case ITEM_NAME: + stringBuf[stringPos++] = (byte) b; + if (stringPos >= fieldTarget) { + itemName = decodeString(stringBuf, fieldTarget); + initUint32Field(Field.DATA_SIZE); + } + break; + + case DATA_SIZE: + fieldBuf[fieldPos++] = (byte) b; + if (fieldPos >= fieldTarget) { + dataSz = readLE32(fieldBuf); + if (dataSz <= 0) { + currentField = Field.DONE; + } else { + currentField = Field.DATA; + tempFile = Files.createTempFile("tika-rtf-obj-", ".bin"); + dataOut = new 
BufferedOutputStream(Files.newOutputStream(tempFile)); + } + } + break; + + case DATA: + if (maxBytes > 0 && dataWritten >= maxBytes) { + throw new TikaMemoryLimitException(dataWritten + 1, maxBytes); + } + dataOut.write(b); + dataWritten++; + if (dataWritten >= dataSz) { + dataOut.close(); + dataOut = null; + currentField = Field.DONE; + } + break; + + case DONE: + case SKIP: + break; + } + } + + /** + * Called when the objdata group closes. Populates metadata and returns + * a TikaInputStream with the extracted embedded content, or null if + * the object couldn't be parsed. + * + * <p>The caller is responsible for closing the returned TikaInputStream + * (which will clean up the underlying temp file).</p> + */ + public TikaInputStream onComplete(Metadata metadata, AtomicInteger unknownFilenameCount) + throws IOException, TikaException { + if (currentField == Field.SKIP || tempFile == null) { + return null; + } + + metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version)); + if (className != null && !className.isEmpty()) { + metadata.add(RTFMetadata.EMB_CLASS, className); + } + if (topicName != null && !topicName.isEmpty()) { + metadata.add(RTFMetadata.EMB_TOPIC, topicName); + } + if (itemName != null && !itemName.isEmpty()) { + metadata.add(RTFMetadata.EMB_ITEM, itemName); + } + + String cn = className != null ? className.toLowerCase(Locale.ROOT) : ""; + + if ("package".equals(cn)) { + return handlePackage(metadata); + } else if ("pbrush".equals(cn)) { + // Raw bitmap — the temp file IS the content + return TikaInputStream.get(tempFile); + } else { + return handleGenericOrPOIFS(metadata, unknownFilenameCount); + } + } + + /** + * Returns true if the header has been fully parsed (regardless of whether + * all data bytes have arrived). + */ + public boolean isHeaderParsed() { + return currentField == Field.DATA || currentField == Field.DONE; + } + + /** Returns the parsed className, or null if header isn't complete yet. 
*/ + public String getClassName() { + return className; + } + + @Override + public void close() throws IOException { + if (dataOut != null) { + dataOut.close(); + dataOut = null; + } + cleanup(); + } + + // --- Package handling --- + + private TikaInputStream handlePackage(Metadata metadata) throws IOException, TikaException { + try (InputStream is = new BufferedInputStream(Files.newInputStream(tempFile))) { + int type1 = readUShortLE(is); + + String displayName = readNullTerminatedString(is); + readNullTerminatedString(is); // iconFilePath + readUShortBE(is); // iconIndex + int type2 = readUShortLE(is); + + if (type2 != 3) { + // type 1 = link, only handle type 3 = embedded + return null; + } + + readUIntLE(is); // filePathLen + String ansiFilePath = readNullTerminatedString(is); + long bytesLen = readUIntLE(is); + + // The remaining bytes in the stream are the actual file content. + // Create a temp file for them. + Path contentFile = Files.createTempFile("tika-rtf-pkg-", ".bin"); + try (OutputStream contentOut = new BufferedOutputStream( + Files.newOutputStream(contentFile))) { + long copied = copyBounded(is, contentOut, bytesLen); + } + + // Try to read unicode file path (optional) + StringBuilder unicodePath = new StringBuilder(); + try { + long unicodeLen = readUIntLE(is); + for (int i = 0; i < unicodeLen; i++) { + int lo = is.read(); + int hi = is.read(); + if (lo == -1 || hi == -1) { + unicodePath.setLength(0); + break; + } + unicodePath.append((char) (lo + 256 * hi)); + } + } catch (IOException e) { + unicodePath.setLength(0); + } + + String fileNameToUse; + String pathToUse; + if (unicodePath.length() > 0) { + fileNameToUse = unicodePath.toString(); + pathToUse = unicodePath.toString(); + } else { + fileNameToUse = displayName != null ? displayName : ""; + pathToUse = ansiFilePath != null ? 
ansiFilePath : ""; + } + metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileNameToUse); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, + FilenameUtils.getName(fileNameToUse)); + metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, pathToUse); + + return TikaInputStream.get(contentFile); + } finally { + cleanup(); + } + } + + // --- Generic / POIFS handling --- + + private TikaInputStream handleGenericOrPOIFS(Metadata metadata, + AtomicInteger unknownFilenameCount) + throws IOException, TikaException { + try (InputStream probe = new BufferedInputStream(Files.newInputStream(tempFile))) { + boolean isOLE2 = FileMagic.valueOf(probe) == FileMagic.OLE2; + if (!isOLE2) { + // Not POIFS — return raw bytes from temp file + return TikaInputStream.get(tempFile); + } + } + + // It's POIFS — parse it + try (InputStream poifsIn = new BufferedInputStream(Files.newInputStream(tempFile)); + POIFSFileSystem fs = new POIFSFileSystem(poifsIn)) { + DirectoryNode root = fs.getRoot(); + if (root == null) { + return null; + } + + byte[] content = null; + + if (root.hasEntry("Package")) { + Entry pkg = root.getEntry("Package"); + try (BoundedInputStream bis = new BoundedInputStream( + maxBytes > 0 ? 
maxBytes : Long.MAX_VALUE, + new DocumentInputStream((DocumentEntry) pkg))) { + content = IOUtils.toByteArray(bis); + if (bis.hasHitBound()) { + throw new TikaMemoryLimitException(maxBytes + 1, maxBytes); + } + } + } else { + POIFSDocumentType type = POIFSDocumentType.detectType(root); + if (type == POIFSDocumentType.OLE10_NATIVE) { + try { + Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root); + content = ole.getDataBuffer(); + } catch (Ole10NativeException ex) { + // Not valid OLE10Native + } + } else if (type == POIFSDocumentType.COMP_OBJ) { + DocumentEntry contentsEntry; + try { + contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); + } catch (FileNotFoundException e) { + contentsEntry = (DocumentEntry) root.getEntry("Contents"); + } + try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) { + content = new byte[contentsEntry.getSize()]; + inp.readFully(content); + } + } else { + // Unknown POIFS type — return the whole thing + metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); + return TikaInputStream.get(tempFile); + } + } + + if (content != null) { + // Write extracted content to a new temp file + Path contentFile = Files.createTempFile("tika-rtf-poifs-", ".bin"); + Files.write(contentFile, content); + return TikaInputStream.get(contentFile); + } + } finally { + cleanup(); + } + return null; + } + + // --- Helper methods --- + + private void initUint32Field(Field next) { + currentField = next; + fieldPos = 0; + fieldTarget = 4; + } + + private void initStringField(Field next, int len) { + currentField = next; + if (len <= 0) { + // Empty string — advance immediately + switch (next) { + case CLASS_NAME: + className = ""; + initUint32Field(Field.TOPIC_LEN); + break; + case TOPIC_NAME: + topicName = ""; + initUint32Field(Field.ITEM_LEN); + break; + case ITEM_NAME: + itemName = ""; + initUint32Field(Field.DATA_SIZE); + break; + default: + break; + } + return; + } + stringBuf = new byte[len]; + stringPos = 0; + 
fieldTarget = len; + } + + private static long readLE32(byte[] buf) { + return (buf[0] & 0xFFL) + | ((buf[1] & 0xFFL) << 8) + | ((buf[2] & 0xFFL) << 16) + | ((buf[3] & 0xFFL) << 24); + } + + private static String decodeString(byte[] buf, int len) { + try { + return new String(buf, 0, len, WIN_ASCII).trim(); + } catch (java.io.UnsupportedEncodingException e) { + return new String(buf, 0, len).trim(); + } + } + + private static int readUShortLE(InputStream is) throws IOException { + int lo = is.read(); + int hi = is.read(); + if (lo == -1 || hi == -1) { + throw new IOException("unexpected end of stream"); + } + return lo | (hi << 8); + } + + private static int readUShortBE(InputStream is) throws IOException { + int hi = is.read(); + int lo = is.read(); + if (lo == -1 || hi == -1) { + throw new IOException("unexpected end of stream"); + } + return (hi << 8) | lo; + } + + private static long readUIntLE(InputStream is) throws IOException { + try { + return EndianUtils.readUIntLE(is); + } catch (EndianUtils.BufferUnderrunException e) { + throw new IOException(e); + } + } + + private static String readNullTerminatedString(InputStream is) throws IOException { + StringBuilder sb = new StringBuilder(); + int c = is.read(); + while (c > 0) { + sb.append((char) c); + c = is.read(); + } + if (c == -1) { + throw new IOException("hit end of stream before null terminator"); + } + return sb.toString(); + } + + private static long copyBounded(InputStream in, OutputStream out, long maxLen) + throws IOException { + byte[] buf = new byte[8192]; + long total = 0; + while (total < maxLen) { + int toRead = (int) Math.min(buf.length, maxLen - total); + int read = in.read(buf, 0, toRead); + if (read == -1) { + break; + } + out.write(buf, 0, read); + total += read; + } + return total; + } + + private void cleanup() { + if (tempFile != null) { + try { + Files.deleteIfExists(tempFile); + } catch (IOException ignored) { + // best effort + } + tempFile = null; + } + } + + private enum Field { + 
VERSION, FORMAT_ID, + CLASS_LEN, CLASS_NAME, + TOPIC_LEN, TOPIC_NAME, + ITEM_LEN, ITEM_NAME, + DATA_SIZE, DATA, + DONE, SKIP + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java new file mode 100644 index 0000000000..8fe6c98989 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import java.io.BufferedOutputStream; +import java.io.Closeable; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.TikaMemoryLimitException; + +/** + * Streams decoded bytes from an RTF {@code \pict} group to a temp file. + * + * <p>Pict data is raw image bytes (after hex-pair decoding). There is no + * header to parse — bytes are written directly to a temp file. On + * {@link #onComplete()}, the caller retrieves the temp file path and + * hands it to the embedded document extractor.</p> + */ +public class RTFPictStreamParser implements Closeable { + + private final long maxBytes; + private Path tempFile; + private OutputStream out; + private long bytesWritten; + + /** + * @param maxBytes maximum number of bytes to accept (-1 for unlimited) + */ + public RTFPictStreamParser(long maxBytes) throws IOException { + this.maxBytes = maxBytes; + this.tempFile = Files.createTempFile("tika-rtf-pict-", ".bin"); + this.out = new BufferedOutputStream(Files.newOutputStream(tempFile)); + } + + /** + * Receive a single decoded byte from the pict hex stream. + */ + public void onByte(int b) throws IOException, TikaException { + if (maxBytes > 0 && bytesWritten >= maxBytes) { + throw new TikaMemoryLimitException(bytesWritten + 1, maxBytes); + } + out.write(b); + bytesWritten++; + } + + /** + * Called when the pict group closes. Flushes and closes the output stream. + * + * @return the path to the temp file containing the image data, + * or null if no bytes were written + */ + public Path onComplete() throws IOException { + out.close(); + out = null; + if (bytesWritten == 0) { + cleanup(); + return null; + } + return tempFile; + } + + /** Returns the number of bytes written so far. 
*/ + public long getBytesWritten() { + return bytesWritten; + } + + @Override + public void close() throws IOException { + if (out != null) { + out.close(); + out = null; + } + cleanup(); + } + + private void cleanup() { + if (tempFile != null) { + try { + Files.deleteIfExists(tempFile); + } catch (IOException ignored) { + // best effort + } + tempFile = null; + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java new file mode 100644 index 0000000000..a2e2553e44 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import java.nio.charset.Charset; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.HashMap; +import java.util.Map; + +/** + * Shared RTF parsing state: group stack, font table, codepage tracking, + * and unicode skip handling. + * + * <p>Both the HTML decapsulator and the full RTF parser use this class + * to manage the stateful parts of RTF processing.</p> + * + * <p>Typical usage: feed every token to {@link #processToken(RTFToken)} + * and query the current charset via {@link #getCurrentCharset()}.</p> + */ +public class RTFState { + + /** Global charset from {@code \ansicpgN} or charset family selectors. */ + private Charset globalCharset = RTFCharsetMaps.WINDOWS_1252; + + /** Default font ID from {@code \deffN}. */ + private int globalDefaultFont = -1; + + /** Font table: maps font number ({@code \fN}) to charset ({@code \fcharsetN}). */ + private final Map<Integer, Charset> fontToCharset = new HashMap<>(); + + /** Group state stack. */ + private final Deque<RTFGroupState> stack = new ArrayDeque<>(); + + /** Current (active) group state. */ + private RTFGroupState current = new RTFGroupState(); + + /** Number of ANSI chars remaining to skip after a unicode escape. */ + private int ansiSkip = 0; + + /** The group state that was just closed (before popGroup). Set on GROUP_CLOSE. */ + private RTFGroupState lastClosedGroup; + + // Font table parsing state + // 0 = not yet seen, 1 = inside fonttbl, 2 = finished fonttbl + private int fontTableState = 0; + private int fontTableDepth = -1; + private int currentFontId = -1; + + private boolean inHeader = true; + + /** + * Process a single token to update internal state. + * <p> + * This handles: group open/close, charset selectors (ansi, ansicpg, + * deff), font table parsing (fonttbl, f, fcharset), + * unicode skip tracking (uc), and font changes (f in body). 
+ * + * @return true if the token was consumed by state management (caller should skip it), + * false if the caller should also process it + */ + public boolean processToken(RTFToken tok) { + switch (tok.getType()) { + case GROUP_OPEN: + pushGroup(); + return false; + + case GROUP_CLOSE: + lastClosedGroup = current; + popGroup(); + // Check if we've exited the font table + if (fontTableState == 1 && current.depth < fontTableDepth) { + fontTableState = 2; + } + return false; + + case CONTROL_SYMBOL: + if ("*".equals(tok.getName())) { + current.ignore = true; + } + return false; + + case CONTROL_WORD: + return processControlWord(tok); + + case UNICODE_ESCAPE: + // After a unicode escape, skip the next ucSkip ANSI chars + ansiSkip = current.ucSkip; + return false; + + case HEX_ESCAPE: + // If we're in the ANSI shadow of a unicode escape, skip this byte + if (ansiSkip > 0) { + ansiSkip--; + return true; // consumed — caller should ignore + } + return false; + + case TEXT: + // If we're in the ANSI shadow, skip text chars + if (ansiSkip > 0) { + // Each TEXT token is one char + ansiSkip--; + return true; + } + return false; + + default: + return false; + } + } + + private boolean processControlWord(RTFToken tok) { + String name = tok.getName(); + boolean hasParam = tok.hasParameter(); + int param = tok.getParameter(); + + // Global charset selectors (header) + switch (name) { + case "ansi": + globalCharset = RTFCharsetMaps.WINDOWS_1252; + return true; + case "pca": + globalCharset = RTFCharsetMaps.getCharset("cp850"); + return true; + case "pc": + globalCharset = RTFCharsetMaps.getCharset("cp437"); + return true; + case "mac": + globalCharset = RTFCharsetMaps.getCharset("MacRoman"); + return true; + case "ansicpg": + if (hasParam) { + Charset cs = RTFCharsetMaps.ANSICPG_MAP.get(param); + if (cs != null) { + globalCharset = cs; + } else { + globalCharset = RTFCharsetMaps.resolveCodePage(param); + } + } + return true; + case "deff": + if (hasParam) { + globalDefaultFont = 
param; + } + return true; + } + + // Font table management + if ("fonttbl".equals(name)) { + fontTableState = 1; + fontTableDepth = current.depth; + current.ignore = true; + return true; + } + + if (fontTableState == 1) { + // Inside font table + if (current.depth < fontTableDepth) { + fontTableState = 2; + } else { + if ("f".equals(name) && hasParam) { + currentFontId = param; + return true; + } else if ("fcharset".equals(name) && hasParam) { + Charset cs = RTFCharsetMaps.FCHARSET_MAP.get(param); + if (cs != null) { + fontToCharset.put(currentFontId, cs); + } + return true; + } + } + } + + // Unicode skip count + if ("uc".equals(name) && hasParam) { + current.ucSkip = param; + return true; + } + + // Font change in body + if ("f".equals(name) && hasParam) { + current.fontId = param; + Charset fontCs = fontToCharset.get(param); + current.fontCharset = fontCs; // may be null + // If we've seen the font table and this is a body font change, + // we're out of the header + if (fontTableState == 2 && !current.ignore) { + inHeader = false; + } + return false; // caller may also want to know about font changes + } + + // Header-ending control words + if (inHeader && !current.ignore) { + switch (name) { + case "par": + case "pard": + case "sect": + case "sectd": + case "plain": + case "ltrch": + case "rtlch": + case "htmlrtf": + case "line": + inHeader = false; + break; + } + } + + // Embedded object / picture control words + switch (name) { + case "object": + current.object = true; + return false; // caller may want to know + case "objdata": + current.objdata = true; + return false; + case "pict": + current.pictDepth = 1; + return false; + case "sp": + current.sp = true; + return false; + case "sn": + current.sn = true; + return false; + case "sv": + current.sv = true; + return false; + case "wbitmap": + return false; // caller handles + } + + // Ignorable destinations + if (inHeader) { + switch (name) { + case "colortbl": + case "stylesheet": + current.ignore = true; + 
return true; + } + } + + return false; + } + + /** Open a new group: push current state and create a child. */ + public void pushGroup() { + stack.push(current); + current = new RTFGroupState(current); + } + + /** Close the current group: pop and restore the parent state. */ + public void popGroup() { + if (!stack.isEmpty()) { + current = stack.pop(); + } + } + + /** + * Returns the charset that should be used to decode the current hex escape + * or text byte. Priority: + * <ol> + * <li>Font-specific charset (from {@code \fN → \fcharsetN})</li> + * <li>Global default font's charset (from {@code \deffN})</li> + * <li>Global charset (from {@code \ansicpgN} or family selector)</li> + * </ol> + */ + public Charset getCurrentCharset() { + if (current.fontCharset != null) { + return current.fontCharset; + } + if (globalDefaultFont != -1 && !inHeader) { + Charset cs = fontToCharset.get(globalDefaultFont); + if (cs != null) { + return cs; + } + } + return globalCharset; + } + + /** Returns the global charset ({@code \ansicpgN}). */ + public Charset getGlobalCharset() { + return globalCharset; + } + + /** Returns the current group state. */ + public RTFGroupState getCurrentGroup() { + return current; + } + + /** Returns true if we're still in the RTF header (before body content). */ + public boolean isInHeader() { + return inHeader; + } + + /** Returns the current group nesting depth. */ + public int getDepth() { + return current.depth; + } + + /** Returns the font-to-charset mapping table. */ + public Map<Integer, Charset> getFontToCharset() { + return fontToCharset; + } + + /** Returns the number of ANSI chars remaining to skip. */ + public int getAnsiSkip() { + return ansiSkip; + } + + /** + * Returns the group state that was just closed on the most recent GROUP_CLOSE. + * This is the child group's state before it was popped. + * Useful for checking flags like objdata, pictDepth, sn, sv, sp, object + * to trigger completion handlers. 
+ */ + public RTFGroupState getLastClosedGroup() { + return lastClosedGroup; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java new file mode 100644 index 0000000000..ec287f5c7e --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +/** + * A single token produced by the RTF tokenizer. + * <p> + * Mutable and reused by the tokenizer to avoid allocation in the hot loop. + * Consumers must copy any data they need before requesting the next token. 
+ */
+public class RTFToken {
+
+    /** Token kind; {@code null} until {@link #reset} or {@link #set} has been called once. */
+    private RTFTokenType type;
+
+    /** Control word / control symbol name, or the matched text; {@link #reset} stores null. */
+    private String name;
+
+    /** Numeric payload of the token; {@link #reset} stores {@code -1}. */
+    private int parameter;
+
+    /** Whether {@link #parameter} carries a value that was present in the input. */
+    private boolean hasParameter;
+
+    /**
+     * Re-initialises this instance as a bare token of the given type: the name is
+     * cleared, the parameter becomes {@code -1} and {@code hasParameter} becomes false.
+     *
+     * @param type the new token type
+     */
+    public void reset(RTFTokenType type) {
+        this.type = type;
+        this.name = null;
+        this.parameter = -1;
+        this.hasParameter = false;
+    }
+
+    /**
+     * Overwrites all four fields of this instance in one call.
+     *
+     * @param type         the new token type
+     * @param name         the name or text to store; may be {@code null}
+     * @param parameter    the numeric payload to store
+     * @param hasParameter whether {@code parameter} is meaningful
+     */
+    public void set(RTFTokenType type, String name, int parameter, boolean hasParameter) {
+        this.type = type;
+        this.name = name;
+        this.parameter = parameter;
+        this.hasParameter = hasParameter;
+    }
+
+    public RTFTokenType getType() {
+        return type;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public int getParameter() {
+        return parameter;
+    }
+
+    public boolean hasParameter() {
+        return hasParameter;
+    }
+
+    /**
+     * For HEX_ESCAPE tokens, returns the decoded byte value (0-255).
+     */
+    public int getHexValue() {
+        return parameter;
+    }
+
+    /**
+     * Renders the token for debugging, roughly in the form it has in RTF source.
+     * Safe to call on an instance that has never been populated.
+     */
+    @Override
+    public String toString() {
+        if (type == null) {
+            // A freshly constructed token has no type yet, and switching on a null
+            // enum reference throws NullPointerException.
+            return "RTFToken[uninitialized]";
+        }
+        switch (type) {
+            case GROUP_OPEN:
+                return "{";
+            case GROUP_CLOSE:
+                return "}";
+            case CONTROL_WORD:
+                return "\\" + name + (hasParameter ? String.valueOf(parameter) : "");
+            case CONTROL_SYMBOL:
+                return "\\" + name;
+            case HEX_ESCAPE:
+                return String.format("\\'%02x", parameter);
+            case UNICODE_ESCAPE:
+                return "\\u" + parameter;
+            case TEXT:
+                return "TEXT[" + name + "]";
+            case BIN:
+                return "\\bin" + parameter;
+            case CRLF:
+                return "CRLF";
+            case EOF:
+                return "EOF";
+            default:
+                return type.name();
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
new file mode 100644
index 0000000000..dcdcf511f9
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +public enum RTFTokenType { + GROUP_OPEN, + GROUP_CLOSE, + CONTROL_WORD, + CONTROL_SYMBOL, + HEX_ESCAPE, + UNICODE_ESCAPE, + TEXT, + BIN, + CRLF, + EOF +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex new file mode 100644 index 0000000000..237800effe --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft.rtf.jflex; + +%% + +%public +%class RTFTokenizer +%unicode +%type RTFToken +%char + +%{ + private final RTFToken token = new RTFToken(); + + /** + * Returns the reusable token instance. Callers must copy data + * before the next call to {@link #yylex()}. 
+     */
+    public RTFToken getToken() {
+        return token;
+    }
+
+    /**
+     * Parses a run of ASCII digits with an optional leading minus sign without ever
+     * throwing. Values that do not fit in an {@code int} saturate at
+     * {@link Integer#MAX_VALUE} or {@link Integer#MIN_VALUE};
+     * {@link Integer#parseInt(String)} would instead throw
+     * {@link NumberFormatException} and abort tokenizing on input such as
+     * {@code \bin99999999999}.
+     *
+     * @param digits the parameter text, e.g. "123" or "-4321"
+     * @return the parsed value, clamped to the {@code int} range
+     */
+    private static int parseParam(String digits) {
+        boolean negative = digits.startsWith("-");
+        long magnitude = 0;
+        for (int i = negative ? 1 : 0; i < digits.length(); i++) {
+            magnitude = magnitude * 10 + (digits.charAt(i) - '0');
+            if (magnitude > Integer.MAX_VALUE) {
+                // Returning here also keeps the long accumulator from overflowing.
+                return negative ? Integer.MIN_VALUE : Integer.MAX_VALUE;
+            }
+        }
+        return (int) (negative ? -magnitude : magnitude);
+    }
+
+    /**
+     * Fills the reusable token as a CONTROL_WORD.
+     *
+     * @param text the full match including leading backslash and optional trailing
+     *             delimiter space, e.g. "\\fonttbl", "\\f123 ", "\\ansi "
+     * @return the reusable token
+     */
+    private RTFToken controlWord(String text) {
+        // drop the leading backslash and, when present, the single delimiter space
+        int end = text.endsWith(" ") ? text.length() - 1 : text.length();
+        String body = text.substring(1, end);
+
+        // the name is the leading run of letters; whatever follows is the parameter
+        int nameEnd = 0;
+        while (nameEnd < body.length() && Character.isLetter(body.charAt(nameEnd))) {
+            nameEnd++;
+        }
+        String name = body.substring(0, nameEnd);
+        if (nameEnd < body.length()) {
+            // there is a numeric parameter (possibly negative)
+            token.set(RTFTokenType.CONTROL_WORD, name, parseParam(body.substring(nameEnd)), true);
+        } else {
+            token.set(RTFTokenType.CONTROL_WORD, name, -1, false);
+        }
+        return token;
+    }
+
+    /**
+     * Fills the reusable token as a HEX_ESCAPE whose parameter is the byte value (0-255).
+     *
+     * @param text the full match, e.g. "\\'ab"
+     * @return the reusable token
+     */
+    private RTFToken hexEscape(String text) {
+        int hi = Character.digit(text.charAt(2), 16);
+        int lo = Character.digit(text.charAt(3), 16);
+        token.set(RTFTokenType.HEX_ESCAPE, null, (hi << 4) | lo, true);
+        return token;
+    }
+
+    /**
+     * Fills the reusable token as a UNICODE_ESCAPE.
+     *
+     * @param text the full match, e.g. "\\u12345" or "\\u-4321 " (may have a trailing
+     *             delimiter space)
+     * @return the reusable token
+     */
+    private RTFToken unicodeEscape(String text) {
+        int codeUnit = parseParam(text.substring(2).trim());
+        if (codeUnit < 0) {
+            // RTF writes the value as a signed 16-bit number. Reducing modulo 65536
+            // equals "65536 + value" for -65535..-1 and, unlike that sum, can never
+            // leave a negative result for values further out of range.
+            codeUnit &= 0xFFFF;
+        }
+        token.set(RTFTokenType.UNICODE_ESCAPE, null, codeUnit, true);
+        return token;
+    }
+
+    /**
+     * Fills the reusable token as a BIN token whose parameter is the announced count.
+     *
+     * @param text the full match, e.g. "\\bin12345 " (may have a trailing delimiter space)
+     * @return the reusable token
+     */
+    private RTFToken binToken(String text) {
+        token.set(RTFTokenType.BIN, null, parseParam(text.substring(4).trim()), true);
+        return token;
+    }
+%}
+
+/* RTF is 7-bit ASCII; bytes above 127 are escaped. We read as Latin1/byte stream.
*/
+
+/* RTF spec: a control word's delimiter space is consumed and not part of the output.
+   We include the optional trailing space in each pattern so the tokenizer eats it. */
+ControlWordWithParam = "\\" [a-zA-Z]+ "-"? [0-9]+ " "?
+ControlWord          = "\\" [a-zA-Z]+ " "?
+HexEscape            = "\\'" [0-9a-fA-F]{2}
+UnicodeEscape        = "\\u" "-"? [0-9]+ " "?
+BinControl           = "\\bin" [0-9]+ " "?
+/* RTF spec: a CR or LF that is preceded by a backslash is treated as a par control. */
+EscapedNewline       = "\\" (\r\n | \r | \n)
+/* Every other non-letter after a backslash, digits included, is a control symbol.
+   Together with EscapedNewline this leaves no two-character backslash sequence
+   without a rule; an unmatched sequence makes the generated scanner raise an Error. */
+ControlSymbol        = "\\" [^a-zA-Z\r\n]
+GroupOpen            = "{"
+GroupClose           = "}"
+CrLf                 = \r\n | \r | \n
+
+%%
+
+/* Order matters: more specific rules first */
+
+{BinControl}           { return binToken(yytext()); }
+{UnicodeEscape}        { return unicodeEscape(yytext()); }
+{HexEscape}            { return hexEscape(yytext()); }
+{ControlWordWithParam} { return controlWord(yytext()); }
+{ControlWord}          { return controlWord(yytext()); }
+{EscapedNewline}       { token.set(RTFTokenType.CONTROL_WORD, "par", -1, false); return token; }
+{ControlSymbol}        { token.set(RTFTokenType.CONTROL_SYMBOL, yytext().substring(1), -1, false); return token; }
+{GroupOpen}            { token.reset(RTFTokenType.GROUP_OPEN); return token; }
+{GroupClose}           { token.reset(RTFTokenType.GROUP_CLOSE); return token; }
+{CrLf}                 { token.reset(RTFTokenType.CRLF); return token; }
+
+/* A backslash that is the very last character of the input. Every rule above that
+   starts with a backslash needs at least one more character and therefore wins by
+   longest match everywhere else. There is nothing to escape, so no token is emitted;
+   scanning continues and the next match is EOF. */
+"\\"                   { /* dangling backslash at end of input: dropped */ }
+
+/* Text: any character that isn't part of an RTF structure.
+   Match one char at a time to keep things simple. The consumer
+   can accumulate runs. Matching longer runs would be an optimization
+   for later.
*/ +[^\\\{\}\r\n] { token.set(RTFTokenType.TEXT, yytext(), -1, false); return token; } + +<<EOF>> { token.reset(RTFTokenType.EOF); return token; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java new file mode 100644 index 0000000000..32b8ae58f9 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; + +/** + * Tests for {@link RTFEmbeddedHandler} driven by the JFlex tokenizer, + * both standalone and integrated into the decapsulator. + */ +public class RTFEmbeddedHandlerTest { + + private static ParseContext buildContext(List<Metadata> extracted) { + ParseContext context = new ParseContext(); + context.set(EmbeddedDocumentExtractor.class, new EmbeddedDocumentExtractor() { + @Override + public boolean shouldParseEmbedded(Metadata metadata) { + return true; + } + + @Override + public void parseEmbedded(TikaInputStream stream, ContentHandler handler, + Metadata metadata, ParseContext parseContext, + boolean outputHtml) { + Metadata copy = new Metadata(); + for (String name : metadata.names()) { + for (String val : metadata.getValues(name)) { + copy.add(name, val); + } + } + extracted.add(copy); + } + }); + return context; + } + + /** + * Process an RTF file through the tokenizer + state + embedded handler directly. 
+     */
+    private List<Metadata> extractEmbeddedDirect(String resourceName)
+            throws IOException, SAXException, TikaException {
+        List<Metadata> extracted = new ArrayList<>();
+        ParseContext context = buildContext(extracted);
+        ContentHandler handler = new DefaultHandler();
+        RTFEmbeddedHandler embHandler = new RTFEmbeddedHandler(handler, context, 20 * 1024);
+        RTFState state = new RTFState();
+
+        try (InputStream is =
+                     getClass().getResourceAsStream("/test-documents/" + resourceName)) {
+            // A missing resource would otherwise surface as a bare NullPointerException
+            // from the InputStreamReader constructor.
+            assertTrue(is != null, "test resource not found: " + resourceName);
+
+            // ISO-8859-1 maps every byte to exactly one char, matching the tokenizer's
+            // "read as Latin1/byte stream" expectation. US_ASCII would replace every
+            // byte >= 0x80 with U+FFFD and lose the original value.
+            try (Reader reader = new InputStreamReader(is, StandardCharsets.ISO_8859_1)) {
+                RTFTokenizer tokenizer = new RTFTokenizer(reader);
+                RTFToken tok;
+
+                while ((tok = tokenizer.yylex()) != null) {
+                    if (tok.getType() == RTFTokenType.EOF) {
+                        break;
+                    }
+                    boolean consumed = state.processToken(tok);
+                    if (!consumed) {
+                        // only a GROUP_CLOSE has a just-closed group to hand over
+                        RTFGroupState closingGroup =
+                                (tok.getType() == RTFTokenType.GROUP_CLOSE)
+                                        ? state.getLastClosedGroup() : null;
+                        embHandler.processToken(tok, state, closingGroup);
+                    }
+                }
+            }
+        }
+        return extracted;
+    }
+
+    @Test
+    public void testEmbeddedFiles() throws Exception {
+        List<Metadata> embedded = extractEmbeddedDirect("testRTFEmbeddedFiles.rtf");
+        assertTrue(embedded.size() > 0,
+                "should extract at least one embedded object from testRTFEmbeddedFiles.rtf");
+    }
+
+    @Test
+    public void testPictExtraction() throws Exception {
+        // Verifies the handler doesn't crash on a typical RTF file
+        extractEmbeddedDirect("testRTF.rtf");
+    }
+
+    @Test
+    public void testEmbeddedObjectMetadata() throws Exception {
+        List<Metadata> embedded = extractEmbeddedDirect("testRTFEmbeddedFiles.rtf");
+        // Fail loudly rather than pass vacuously when nothing was extracted.
+        assertTrue(embedded.size() > 0,
+                "should extract at least one embedded object from testRTFEmbeddedFiles.rtf");
+        boolean hasName = false;
+        for (Metadata m : embedded) {
+            String name = m.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+            if (name != null && !name.isEmpty()) {
+                hasName = true;
+                break;
+            }
+        }
+        assertTrue(hasName, "at least one embedded should have a resource name");
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java new file mode 100644 index 0000000000..6d8df7534d --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import static java.nio.charset.StandardCharsets.US_ASCII; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +/** + * Tests for {@link RTFHtmlDecapsulator}, mirroring the original + * RTFEncapsulatedHTMLExtractorTest to verify parity. 
+ */
+public class RTFHtmlDecapsulatorTest {
+
+    @Test
+    public void testNullAndEmpty() throws Exception {
+        assertNull(new RTFHtmlDecapsulator().extract(null));
+        assertNull(new RTFHtmlDecapsulator().extract(new byte[0]));
+    }
+
+    @Test
+    public void testNonEncapsulatedRtf() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\deff0 Hello world}";
+        assertNull(new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII)));
+    }
+
+    @Test
+    public void testSimpleEncapsulatedHtml() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag34 <head>}\n" +
+                "{\\*\\htmltag41 </head>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "{\\*\\htmltag84 Hello world}\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "\\htmlrtf }\\htmlrtf0\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<html>"));
+        assertTrue(html.contains("<p>"));
+        assertTrue(html.contains("Hello world"));
+        assertTrue(html.contains("</html>"));
+    }
+
+    @Test
+    public void testImgCidExtraction() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "{\\*\\htmltag84 <img src=\"cid:[email protected]\">}\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("cid:[email protected]"),
+                "CID reference should be preserved in extracted HTML");
+    }
+
+    @Test
+    public void testParAndTabDecoding() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag241 <style>}\n" +
+                "{\\*\\htmltag241 body \\{\\par \\tab color: red;\\par \\}}\n" +
+                "{\\*\\htmltag249 </style>}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertTrue(html.contains("<style>"));
+        assertTrue(html.contains("body {"));
+        assertTrue(html.contains("\tcolor: red;"));
+        assertTrue(html.contains("</style>"));
+    }
+
+    @Test
+    public void testHexEscapeDecoding() throws Exception {
+        // \'e9 = byte 0xE9 = e with acute accent (U+00E9) in windows-1252
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 caf\\'e9}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("caf\u00e9", html);
+    }
+
+    @Test
+    public void testMultiByteHexEscape() throws Exception {
+        // \'fc = u with diaeresis (U+00FC) and \'df = sharp s (U+00DF) in windows-1252
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 gr\\'fc\\'dfe}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("gr\u00fc\u00dfe", html);
+    }
+
+    @Test
+    public void testCodePage1254Turkish() throws Exception {
+        // \'fd = byte 0xFD, which windows-1254 maps to dotless i (U+0131); the expected
+        // value is produced with Java's own windows-1254 charset
+        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 Say\\'fdn}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        // Verify the byte 0xFD is decoded through windows-1254
+        byte[] expected = new byte[] { 'S', 'a', 'y', (byte) 0xFD, 'n' };
+        assertEquals(new String(expected, java.nio.charset.Charset.forName("windows-1254")), html);
+    }
+
+    @Test
+    public void testHtmlrtfSkipping() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 Hello}\n" +
+                "\\htmlrtf {\\b bold rtf only}\\htmlrtf0\n" +
+                "{\\*\\htmltag84 World}\n" +
+                "}";
+        String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII));
+        assertNotNull(html);
+        assertEquals("Hello World", html);
+    }
+
+    @Test
+    public void
testEscapedBracesAndBackslash() throws Exception { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag241 a \\{ b \\} c \\\\d}\n" + + "}"; + String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("a { b } c \\d", html); + } + + @Test + public void testEmptyHtmltag() throws Exception { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag72}\n" + + "{\\*\\htmltag84 text}\n" + + "}"; + String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("text", html); + } + + @Test + public void testInterTagTextContent() throws Exception { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag19 <html>}\n" + + "{\\*\\htmltag50 <body>}\n" + + "{\\*\\htmltag64 <p>}\n" + + "\\htmlrtf {\\htmlrtf0\n" + + "Hello from the message body\n" + + "\\htmlrtf\\par}\\htmlrtf0\n" + + "{\\*\\htmltag72 </p>}\n" + + "{\\*\\htmltag64 <p>}\n" + + "\\htmlrtf {\\htmlrtf0\n" + + "Second paragraph\n" + + "\\htmlrtf\\par}\\htmlrtf0\n" + + "{\\*\\htmltag72 </p>}\n" + + "{\\*\\htmltag58 </body>}\n" + + "{\\*\\htmltag27 </html>}\n" + + "}"; + String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertTrue(html.contains("<p>"), "should contain HTML tags"); + assertTrue(html.contains("Hello from the message body"), + "should contain inter-tag text content"); + assertTrue(html.contains("Second paragraph"), + "should contain second paragraph text"); + assertTrue(html.contains("</html>"), "should contain closing tag"); + } + + @Test + public void testInterTagHexEscapes() throws Exception { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag64 <p>}\n" + + "\\htmlrtf {\\htmlrtf0\n" + + "caf\\'e9\n" + + "\\htmlrtf }\\htmlrtf0\n" + + "{\\*\\htmltag72 </p>}\n" + + "}"; + String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII)); + 
assertNotNull(html); + assertTrue(html.contains("caf\u00e9"), "hex escapes in inter-tag text should be decoded"); + } + + @Test + public void testLineControlWord() throws Exception { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\*\\htmltag84 line1\\line line2}\n" + + "}"; + String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("line1<br>line2", html); + } + + @Test + public void testFontAwareCodePageDecoding() throws Exception { + // f0 = ANSI (fcharset 0 = windows-1252), f1 = Greek (fcharset 161 = cp1253) + // \'e1 in windows-1252 = U+00E1 (a with acute) + // \'e1 in cp1253 = U+03B1 (GREEK SMALL LETTER ALPHA) + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\fonttbl{\\f0\\fcharset0 Times;}{\\f1\\fcharset161 Greek;}}\n" + + "{\\*\\htmltag84 \\f0 caf\\'e9}\n" + + "{\\*\\htmltag84 \\f1 \\'e1}\n" + + "}"; + String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + // f0: \'e9 in windows-1252 = e with acute + assertTrue(html.contains("caf\u00e9"), "f0 should decode as windows-1252"); + // f1: \'e1 in cp1253 = Greek alpha + assertTrue(html.contains("\u03b1"), "f1 should decode as cp1253 (Greek)"); + } + + @Test + public void testUnicodeEscapeWithAnsiShadow() throws Exception { + // \u8212 is em dash (U+2014). The \'97 is the ANSI shadow and should be skipped. 
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" + + "{\\fonttbl{\\f0\\fcharset0 Times;}}\n" + + "{\\*\\htmltag84 A\\u8212\\'97B}\n" + + "}"; + String html = new RTFHtmlDecapsulator().extract(rtf.getBytes(US_ASCII)); + assertNotNull(html); + assertEquals("A\u2014B", html); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java new file mode 100644 index 0000000000..97d6b0a7dc --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.StringReader; +import java.nio.charset.Charset; + +import org.junit.jupiter.api.Test; + +public class RTFStateTest { + + private RTFState processRtf(String rtf) throws Exception { + RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf)); + RTFState state = new RTFState(); + RTFToken tok; + while ((tok = tokenizer.yylex()) != null) { + if (tok.getType() == RTFTokenType.EOF) { + break; + } + state.processToken(tok); + } + return state; + } + + @Test + public void testGlobalCharsetFromAnsicpg() throws Exception { + RTFState state = processRtf("{\\rtf1\\ansi\\ansicpg1251}"); + assertEquals(Charset.forName("CP1251"), state.getGlobalCharset()); + } + + @Test + public void testGlobalCharsetDefaultWindows1252() throws Exception { + RTFState state = processRtf("{\\rtf1\\ansi}"); + assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getGlobalCharset()); + } + + @Test + public void testGlobalCharsetPca() throws Exception { + RTFState state = processRtf("{\\rtf1\\pca}"); + assertEquals(Charset.forName("cp850"), state.getGlobalCharset()); + } + + @Test + public void testGlobalCharsetPc() throws Exception { + RTFState state = processRtf("{\\rtf1\\pc}"); + assertEquals(Charset.forName("cp437"), state.getGlobalCharset()); + } + + @Test + public void testGlobalCharsetMac() throws Exception { + RTFState state = processRtf("{\\rtf1\\mac}"); + assertEquals(Charset.forName("MacRoman"), state.getGlobalCharset()); + } + + @Test + public void testFontTableParsing() throws Exception { + // Realistic font table: f0=Times New Roman (ANSI), f1=MS Mincho (Shift_JIS) + String rtf = "{\\rtf1\\ansi\\deff0" + + "{\\fonttbl" + + "{\\f0\\froman\\fcharset0 Times New Roman;}" + + "{\\f1\\fnil\\fcharset128 MS Mincho;}" + + "}" + + "\\f0 Hello}"; + RTFState state = processRtf(rtf); + + // fcharset 
0 = ANSI = WINDOWS-1252 + assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getFontToCharset().get(0)); + // fcharset 128 = Shift JIS = MS932 + assertEquals(Charset.forName("MS932"), state.getFontToCharset().get(1)); + } + + @Test + public void testCurrentCharsetFollowsFont() throws Exception { + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff0" + + "{\\fonttbl" + + "{\\f0\\froman\\fcharset0 Times;}" + + "{\\f1\\fnil\\fcharset161 Greek;}" + + "}" + + "\\f1 text}"; + RTFTokenizer tokenizer = new RTFTokenizer(new java.io.StringReader(rtf)); + RTFState state = new RTFState(); + Charset charsetAtText = null; + + RTFToken tok; + while ((tok = tokenizer.yylex()) != null) { + if (tok.getType() == RTFTokenType.EOF) { + break; + } + state.processToken(tok); + // Capture charset when we see the first body text char + if (tok.getType() == RTFTokenType.TEXT && "t".equals(tok.getName()) + && charsetAtText == null) { + charsetAtText = state.getCurrentCharset(); + } + } + + // Verify font table was populated + assertEquals(2, state.getFontToCharset().size()); + assertEquals(Charset.forName("cp1253"), state.getFontToCharset().get(1)); + + // After \f1, charset should be cp1253 (Greek) + assertNotNull(charsetAtText); + assertEquals(Charset.forName("cp1253"), charsetAtText); + } + + @Test + public void testCurrentCharsetFallsBackToGlobal() throws Exception { + String rtf = "{\\rtf1\\ansi\\ansicpg1254\\deff0" + + "{\\fonttbl" + + "{\\f0\\froman\\fcharset0 Times;}" + + "}" + + "\\f0 text}"; + RTFState state = processRtf(rtf); + + // fcharset 0 = WINDOWS-1252 (ANSI) + assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getCurrentCharset()); + } + + @Test + public void testDefaultFontCharset() throws Exception { + // \deff1 sets default font to f1, which maps to fcharset 162 (Turkish = cp1254) + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff1" + + "{\\fonttbl" + + "{\\f0\\froman\\fcharset0 Times;}" + + "{\\f1\\fnil\\fcharset162 Arial;}" + + "}" + + "\\pard text}"; + RTFState state = 
processRtf(rtf); + + // No explicit \fN in body, so should fall back to deff1 -> fcharset 162 -> cp1254 + assertEquals(Charset.forName("cp1254"), state.getCurrentCharset()); + } + + @Test + public void testUcSkipInherited() throws Exception { + // RTF uc control word sets skip count to 2, inherited by child groups + // We process token-by-token and check inside the inner group + String rtf = "{\\rtf1\\ansi\\uc2{inner}}"; + RTFTokenizer tokenizer = new RTFTokenizer(new java.io.StringReader(rtf)); // NOTE(review): sibling tests use the short name StringReader, so the fully-qualified name here looks redundant + RTFState state = new RTFState(); + + int ucSkipInInnerGroup = -1; + boolean seenInnerText = false; + RTFToken tok; + while ((tok = tokenizer.yylex()) != null) { + if (tok.getType() == RTFTokenType.EOF) { + break; + } + state.processToken(tok); + // Check ucSkip when we see the first char of "inner" + if (tok.getType() == RTFTokenType.TEXT && "i".equals(tok.getName()) && !seenInnerText) { + ucSkipInInnerGroup = state.getCurrentGroup().ucSkip; + seenInnerText = true; + } + } + // Inside {inner}, ucSkip should be inherited as 2 from parent + assertEquals(2, ucSkipInInnerGroup); + } + + @Test + public void testAnsiSkipAfterUnicode() throws Exception { + // After RTF \\u8212 (decimal 8212 = U+2014; backslash doubled so javac does not read it as a Java Unicode escape), the next ucSkip (default 1) ANSI chars should be skipped + String rtf = "{\\rtf1\\ansi\\ansicpg1252" + + "{\\fonttbl{\\f0\\fcharset0 Times;}}" + + "\\f0 A\\u8212\\'97B}"; + RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf)); + RTFState state = new RTFState(); + StringBuilder textOutput = new StringBuilder(); + + RTFToken tok; + while ((tok = tokenizer.yylex()) != null) { + if (tok.getType() == RTFTokenType.EOF) { + break; + } + boolean consumed = state.processToken(tok); + if (!consumed && !state.getCurrentGroup().ignore) { + if (tok.getType() == RTFTokenType.TEXT) { + textOutput.append(tok.getName()); + } else if (tok.getType() == RTFTokenType.UNICODE_ESCAPE) { + int cp = tok.getParameter(); + if (Character.isValidCodePoint(cp)) { + textOutput.appendCodePoint(cp); + } + } + } + } + // A + \\u8212 
(em dash) + B. The \'97 should be skipped as unicode shadow. + assertEquals("A\u2014B", textOutput.toString()); + } + + @Test + public void testGroupStateRestored() throws Exception { + // Font change inside a group should be reverted when group closes + String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff0" + + "{\\fonttbl" + + "{\\f0\\fcharset0 Times;}" + + "{\\f1\\fcharset161 Greek;}" + + "}" + + "\\f0 {\\f1 greek}{back to times}}"; + RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf)); + RTFState state = new RTFState(); + + Charset charsetInsideGroup = null; + Charset charsetAfterGroup = null; + boolean seenGreekGroup = false; + int bodyGroupDepth = 0; // NOTE(review): never read in this test; candidate for removal + + RTFToken tok; + while ((tok = tokenizer.yylex()) != null) { + if (tok.getType() == RTFTokenType.EOF) { + break; + } + state.processToken(tok); + + if (tok.getType() == RTFTokenType.TEXT) { + String text = tok.getName(); + if ("g".equals(text) && !seenGreekGroup) { + // First char of "greek" + charsetInsideGroup = state.getCurrentCharset(); + seenGreekGroup = true; + } else if ("b".equals(text)) { + // First char of "back to times" + charsetAfterGroup = state.getCurrentCharset(); + } + } + } + + assertNotNull(charsetInsideGroup); + assertNotNull(charsetAfterGroup); + // Inside the {\f1 ...} group, charset should be Greek (cp1253) + assertEquals(Charset.forName("cp1253"), charsetInsideGroup); + // After the group closes, should be back to f0 (WINDOWS-1252) + assertEquals(RTFCharsetMaps.WINDOWS_1252, charsetAfterGroup); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java new file mode 100644 index 0000000000..741fefb3e5 --- /dev/null +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.rtf.jflex; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import org.junit.jupiter.api.Test; + +public class RTFTokenizerTest { + + private List<RTFToken> tokenize(String input) throws Exception { + RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(input)); + List<RTFToken> tokens = new ArrayList<>(); + RTFToken tok; + while ((tok = tokenizer.yylex()) != null) { + if (tok.getType() == RTFTokenType.EOF) { + break; + } + // copy token since it's reused (presumably the tokenizer hands back the same RTFToken instance on every yylex() call; confirm) + RTFToken copy = new RTFToken(); + copy.set(tok.getType(), tok.getName(), tok.getParameter(), tok.hasParameter()); + tokens.add(copy); + } + return tokens; + } + + @Test + public void testGroupOpenClose() throws Exception { + List<RTFToken> tokens = 
tokenize("{}"); + assertEquals(2, tokens.size()); + assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType()); + assertEquals(RTFTokenType.GROUP_CLOSE, tokens.get(1).getType()); + } + + @Test + public void testControlWord() throws Exception { + List<RTFToken> tokens = tokenize("\\rtf1"); + assertEquals(1, tokens.size()); + assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(0).getType()); + assertEquals("rtf", tokens.get(0).getName()); + assertEquals(1, tokens.get(0).getParameter()); + assertTrue(tokens.get(0).hasParameter()); + } + + @Test + public void testControlWordNoParam() throws Exception { + List<RTFToken> tokens = tokenize("\\ansi"); + assertEquals(1, tokens.size()); + assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(0).getType()); + assertEquals("ansi", tokens.get(0).getName()); + assertFalse(tokens.get(0).hasParameter()); + } + + @Test + public void testControlWordNegativeParam() throws Exception { + List<RTFToken> tokens = tokenize("\\u-4321"); + assertEquals(1, tokens.size()); + assertEquals(RTFTokenType.UNICODE_ESCAPE, tokens.get(0).getType()); + // -4321 → 65536 - 4321 = 61215, i.e. the negative parameter is expected back as its unsigned 16-bit value + assertEquals(61215, tokens.get(0).getParameter()); + } + + @Test + public void testHexEscape() throws Exception { + List<RTFToken> tokens = tokenize("\\'e9"); + assertEquals(1, tokens.size()); + assertEquals(RTFTokenType.HEX_ESCAPE, tokens.get(0).getType()); + assertEquals(0xe9, tokens.get(0).getHexValue()); + } + + @Test + public void testUnicodeEscape() throws Exception { + List<RTFToken> tokens = tokenize("\\u8212"); + assertEquals(1, tokens.size()); + assertEquals(RTFTokenType.UNICODE_ESCAPE, tokens.get(0).getType()); + assertEquals(8212, tokens.get(0).getParameter()); + } + + @Test + public void testBinControl() throws Exception { + List<RTFToken> tokens = tokenize("\\bin1024"); + assertEquals(1, tokens.size()); + assertEquals(RTFTokenType.BIN, tokens.get(0).getType()); + assertEquals(1024, tokens.get(0).getParameter()); + } + + @Test + public void 
testControlSymbol() throws Exception { + List<RTFToken> tokens = tokenize("\\~"); + assertEquals(1, tokens.size()); + assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(0).getType()); + assertEquals("~", tokens.get(0).getName()); + } + + @Test + public void testEscapedBraces() throws Exception { + List<RTFToken> tokens = tokenize("\\{\\}\\\\"); + assertEquals(3, tokens.size()); + assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(0).getType()); + assertEquals("{", tokens.get(0).getName()); + assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(1).getType()); + assertEquals("}", tokens.get(1).getName()); + assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(2).getType()); + assertEquals("\\", tokens.get(2).getName()); + } + + @Test + public void testText() throws Exception { + List<RTFToken> tokens = tokenize("Hello"); + assertEquals(5, tokens.size()); // one char at a time: every character arrives as its own TEXT token + for (RTFToken t : tokens) { + assertEquals(RTFTokenType.TEXT, t.getType()); + } + StringBuilder sb = new StringBuilder(); + for (RTFToken t : tokens) { + sb.append(t.getName()); + } + assertEquals("Hello", sb.toString()); + } + + @Test + public void testCrLf() throws Exception { + List<RTFToken> tokens = tokenize("a\r\nb"); + assertEquals(3, tokens.size()); + assertEquals(RTFTokenType.TEXT, tokens.get(0).getType()); + assertEquals(RTFTokenType.CRLF, tokens.get(1).getType()); + assertEquals(RTFTokenType.TEXT, tokens.get(2).getType()); + } + + @Test + public void testIgnorableDestination() throws Exception { + // { \* \htmltag84_ < p > } + // The space after \htmltag84 is consumed as the control word delimiter + List<RTFToken> tokens = tokenize("{\\*\\htmltag84 <p>}"); + assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType()); + assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(1).getType()); + assertEquals("*", tokens.get(1).getName()); + assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(2).getType()); + assertEquals("htmltag", tokens.get(2).getName()); + assertEquals(84, 
tokens.get(2).getParameter()); + // remaining tokens are < p > } + assertEquals(RTFTokenType.TEXT, tokens.get(3).getType()); + assertEquals("<", tokens.get(3).getName()); + assertEquals(RTFTokenType.TEXT, tokens.get(4).getType()); + assertEquals("p", tokens.get(4).getName()); + assertEquals(RTFTokenType.TEXT, tokens.get(5).getType()); + assertEquals(">", tokens.get(5).getName()); + assertEquals(RTFTokenType.GROUP_CLOSE, tokens.get(6).getType()); + assertEquals(7, tokens.size()); + } + + @Test + public void testMixedRtf() throws Exception { + String rtf = "{\\rtf1\\ansi\\ansicpg1252 Hello}"; + List<RTFToken> tokens = tokenize(rtf); + // { \rtf1 \ansi \ansicpg1252 H e l l o } -- NOTE(review): the space after \ansicpg1252 is presumably consumed as the control word delimiter (as testIgnorableDestination asserts for \htmltag84), so no SPACE token is expected; only the first four tokens are checked here + assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType()); + assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(1).getType()); + assertEquals("rtf", tokens.get(1).getName()); + assertEquals(1, tokens.get(1).getParameter()); + assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(2).getType()); + assertEquals("ansi", tokens.get(2).getName()); + assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(3).getType()); + assertEquals("ansicpg", tokens.get(3).getName()); + assertEquals(1252, tokens.get(3).getParameter()); + } +}
