This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 8dc960aa15 TIKA-4663 - add content handler type metadata and switch default to markdown (#2611)
8dc960aa15 is described below

commit 8dc960aa154a475901f3a880f258a815e12e0304
Author: Tim Allison <[email protected]>
AuthorDate: Wed Feb 18 16:06:28 2026 -0500

    TIKA-4663 - add content handler type metadata and switch default to markdown (#2611)
---
 .../java/org/apache/tika/metadata/TikaCoreProperties.java | 12 +++++++++++-
 .../org/apache/tika/sax/BasicContentHandlerFactory.java   |  7 ++++++-
 .../java/org/apache/tika/sax/ContentHandlerFactory.java   | 15 +++++++++++++++
 .../apache/tika/sax/RecursiveParserWrapperHandler.java    |  2 ++
 .../org/apache/tika/async/cli/AsyncProcessorTest.java     |  4 ++++
 .../org/apache/tika/pipes/core/server/ParseHandler.java   |  2 ++
 .../java/org/apache/tika/config/loader/TikaLoader.java    |  8 +++++---
 7 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index ef3575f6e3..fdd52259e3 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -102,9 +102,19 @@ public interface TikaCoreProperties {
 
     Property PARSE_TIME_MILLIS = Property.internalText(TIKA_META_PREFIX + "parse_time_millis");
     /**
-     * Simple class name of the content handler
+     * Simple class name of the content handler.
+     * @deprecated Use {@link #TIKA_CONTENT_HANDLER_TYPE} for the handler type enum value.
      */
+    @Deprecated
     Property TIKA_CONTENT_HANDLER = Property.internalText(TIKA_META_PREFIX + "content_handler");
+
+    /**
+     * The handler type used to produce {@link #TIKA_CONTENT}.
+     * Value is the {@link org.apache.tika.sax.BasicContentHandlerFactory.HANDLER_TYPE}
+     * enum name (e.g. {@code TEXT}, {@code MARKDOWN}, {@code HTML}, {@code XML}).
+     */
+    Property TIKA_CONTENT_HANDLER_TYPE =
+            Property.internalText(TIKA_META_PREFIX + "content_handler_type");
     Property TIKA_CONTENT = Property.internalText(TIKA_META_PREFIX + "content");
     /**
      * Use this to store parse exception information in the Metadata object.
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index ddef58d96e..337eba15ab 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -39,7 +39,7 @@ import org.apache.tika.parser.ParseContext;
 @TikaComponent(defaultFor = ContentHandlerFactory.class)
 public class BasicContentHandlerFactory implements StreamingContentHandlerFactory, WriteLimiter {
 
-    private HANDLER_TYPE type = HANDLER_TYPE.TEXT;
+    private HANDLER_TYPE type = HANDLER_TYPE.MARKDOWN;
     private int writeLimit = -1;
     private boolean throwOnWriteLimitReached = true;
     private transient ParseContext parseContext;
@@ -227,6 +227,11 @@ public class BasicContentHandlerFactory implements StreamingContentHandlerFactor
         return type;
     }
 
+    @Override
+    public String handlerTypeName() {
+        return type.name();
+    }
+
     /**
      * Sets the handler type.
      * @param type the handler type
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
index 4c7efd7231..2dfe49912d 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
@@ -39,4 +39,19 @@ public interface ContentHandlerFactory extends Serializable {
      * @return a new ContentHandler instance
      */
     ContentHandler createHandler();
+
+    /**
+     * Returns the name of the handler type produced by this factory
+     * (e.g. {@code TEXT}, {@code MARKDOWN}, {@code HTML}, {@code XML}).
+     * <p>
+     * This value is written to
+     * {@link org.apache.tika.metadata.TikaCoreProperties#TIKA_CONTENT_HANDLER_TYPE}
+     * so that downstream components (such as the inference pipeline) can
+     * determine what format {@code tika:content} is in without guessing.
+     *
+     * @return handler type name, never {@code null}
+     */
+    default String handlerTypeName() {
+        return "UNKNOWN";
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 3284020426..9294dcaf42 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -158,6 +158,8 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
                 metadata.add(TikaCoreProperties.TIKA_CONTENT, content);
                 metadata.add(TikaCoreProperties.TIKA_CONTENT_HANDLER,
                         handler.getClass().getSimpleName());
+                metadata.set(TikaCoreProperties.TIKA_CONTENT_HANDLER_TYPE,
+                        getContentHandlerFactory().handlerTypeName());
             }
         }
     }
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
index 0f31c1c1b8..585bf4b905 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
@@ -50,6 +50,8 @@ import org.apache.tika.pipes.api.pipesiterator.PipesIterator;
 import org.apache.tika.pipes.core.PipesException;
 import org.apache.tika.pipes.core.async.AsyncProcessor;
 import org.apache.tika.pipes.core.extractor.UnpackConfig;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.serialization.JsonMetadataList;
 
 /**
@@ -121,6 +123,8 @@ public class AsyncProcessorTest extends TikaTest {
         ParseContext parseContext = new ParseContext();
         parseContext.set(ParseMode.class, ParseMode.UNPACK);
         parseContext.set(UnpackConfig.class, unpackConfig);
+        parseContext.set(ContentHandlerFactory.class,
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
         FetchEmitTuple t =
                 new FetchEmitTuple("myId-1", new FetchKey("fsf", "mock.xml"),
                         new EmitKey("fse-json", "emit-1"), new Metadata(), parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
index 8385631ae4..79d233ba4e 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
@@ -251,6 +251,8 @@ class ParseHandler {
             LOG.warn("parse exception: " + fetchEmitTuple.getId(), e);
         } finally {
             metadata.add(TikaCoreProperties.TIKA_CONTENT, handler.toString());
+            metadata.set(TikaCoreProperties.TIKA_CONTENT_HANDLER_TYPE,
+                    contentHandlerFactory.handlerTypeName());
             if (containerException != null) {
                 metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION, containerException);
             }
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 36b0f69325..23bca1686e 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -286,7 +286,7 @@ public class TikaLoader {
     /**
      * Loads and returns the content handler factory.
      * If "content-handler-factory" section exists in config, uses that factory.
-     * If section missing, returns a default BasicContentHandlerFactory with TEXT handler.
+     * If section missing, returns a default BasicContentHandlerFactory with MARKDOWN handler.
      * Results are cached - subsequent calls return the same instance.
      *
      * <p>Example JSON:
@@ -315,10 +315,12 @@ public class TikaLoader {
                     throw new TikaConfigException("Failed to load content-handler-factory", e);
                 }
             }
-            // Default to BasicContentHandlerFactory with TEXT handler if not configured
+            // Default to BasicContentHandlerFactory with MARKDOWN handler if not configured.
+            // Markdown preserves structural boundaries (headings, lists, code blocks)
+            // which enables higher-quality chunking in the inference pipeline.
             if (contentHandlerFactory == null) {
                 contentHandlerFactory = new BasicContentHandlerFactory(
-                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
+                        BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, -1);
             }
         }
         return contentHandlerFactory;

Reply via email to