This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4663-tika-app-3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6f5eda263700e5d1e6f73e2ec3c26b594f16fe88 Author: tallison <[email protected]> AuthorDate: Wed Feb 18 14:21:08 2026 -0500 TIKA-4663 -- add cli option for markdown in 3.x --- tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java | 16 ++++++++++++++-- .../src/test/java/org/apache/tika/cli/TikaCLITest.java | 12 ++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index bbdd83d11d..37544c59c0 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -101,6 +101,7 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.ExpandedTitleContentHandler; import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.apache.tika.sax.ToMarkdownContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler; import org.apache.tika.serialization.JsonMetadata; @@ -203,6 +204,12 @@ public class TikaCLI { private boolean pipeMode = true; private boolean fork = false; private boolean prettyPrint; + private final OutputType MARKDOWN = new OutputType() { + @Override + protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { + return new BodyContentHandler(new ToMarkdownContentHandler(getOutputWriter(output, encoding))); + } + }; private final OutputType XML = new OutputType() { @Override protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { @@ -405,6 +412,8 @@ public class TikaCLI { type = XML; } else if (arg.equals("-h") || arg.equals("--html")) { type = HTML; + } else if (arg.equals("--md")) { + type = MARKDOWN; } else if (arg.equals("-t") || arg.equals("--text")) { type = TEXT; } else if (arg.equals("-T") || arg.equals("--text-main")) { @@ -500,6 +509,8 @@ public class TikaCLI { handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; } else if (type.equals(TEXT_MAIN)) { handlerType = BasicContentHandlerFactory.HANDLER_TYPE.BODY; + } else if (type.equals(MARKDOWN)) { + handlerType = BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN; } else if (type.equals(METADATA)) { handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE; } @@ -530,12 +541,13 @@ public class TikaCLI { out.println(" -t or --text Output plain text content (body)"); out.println(" -T or --text-main Output plain text content (main content only via boilerpipe handler)"); out.println(" -A or --text-all Output all text content"); - out.println(" -m or --metadata Output only metadata"); + out.println(" --md Output Markdown content (body)"); + out.println(" -m or --metadata Output only metadata (no content)"); out.println(" -j or --json Output metadata in JSON"); out.println(" -y or --xmp Output metadata in XMP"); out.println(" -J or --jsonRecursive Output metadata and content from all"); out.println(" embedded files (choose content type"); - out.println(" with -x, -h, -t or -m; default is -x)"); + out.println(" with -x, -h, --md, -t or -m; default is -x)"); out.println(" -a or --async Run Tika in async mode; must specify details in a" + " tikaConfig file"); out.println(" -l or --language Output only language"); out.println(" -d or --detect Detect document type"); diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 33e311154a..8169ca7f89 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -170,6 +170,18 @@ public class TikaCLITest { assertTrue(content.contains("finished off the cake")); } + /** + * Tests --md option of the cli + * + * @throws Exception + */ + @Test + public void testMarkdownOutput() throws Exception { + String content = getParamOutContent("--md", resourcePrefix + "coffee.xls"); + assertTrue(content.contains("# Sheet1"), "Expected markdown heading"); + assertTrue(content.contains("| ---"), "Expected markdown table separator"); + } + /** * Tests -A option of the cli *
