This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new fdeb82f179 simplify serialization, take 2 (#2651)
fdeb82f179 is described below
commit fdeb82f17937b1fb54bef8edd834c4fe2eb16e68
Author: Tim Allison <[email protected]>
AuthorDate: Thu Feb 26 18:04:06 2026 -0500
simplify serialization, take 2 (#2651)
---
.../tika-grpc/sample-configs/ner/tika-config.json | 15 +-
.../apache/tika/parser/ner/NamedEntityParser.java | 2 +
.../src/test/resources/configs/tika-config.json | 2 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 32 +--
tika-pipes/tika-pipes-api/pom.xml | 19 ++
.../java/org/apache/tika/pipes/api/ParseMode.java | 3 +
.../apache/tika/pipes/core/CrashingDetector.java | 2 +
.../apache/tika/pipes/core/PipesClientTest.java | 20 +-
.../configs/tika-config-crashing-detector.json | 2 +-
.../tika/config/loader/ComponentInstantiator.java | 157 +++++++++++++-
.../config/loader/TikaObjectMapperFactory.java | 29 +++
.../tika/serialization/ComponentNameResolver.java | 104 ++++++++-
.../tika/serialization/ParseContextUtils.java | 32 +--
.../org/apache/tika/serialization/TikaModule.java | 238 ++-------------------
.../serdes/ParseContextDeserializer.java | 47 +---
.../serdes/ParseContextSerializer.java | 42 +---
.../TestParseContextSerialization.java | 16 +-
17 files changed, 385 insertions(+), 377 deletions(-)
diff --git a/tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.json
b/tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.json
index 479df09d29..90520c90e7 100644
--- a/tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.json
+++ b/tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.json
@@ -13,13 +13,14 @@
},
"parsers": [
{
- "class": "org.apache.tika.parser.ner.NamedEntityParser",
- "supportedMimeTypes": [
- "application/pdf",
- "text/plain",
- "text/html",
- "application/xhtml+xml"
- ]
+ "named-entity-parser": {
+ "_mime-include": [
+ "application/pdf",
+ "text/plain",
+ "text/html",
+ "application/xhtml+xml"
+ ]
+ }
}
]
}
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
index 9aa75462e6..110f87a1e1 100644
---
a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
+++
b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
@@ -33,6 +33,7 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.Tika;
+import org.apache.tika.config.TikaComponent;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -55,6 +56,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
* @see OpenNLPNERecogniser
* @see NERecogniser
*/
+@TikaComponent(spi = false)
public class NamedEntityParser implements Parser {
public static final Logger LOG =
LoggerFactory.getLogger(NamedEntityParser.class);
public static final Set<MediaType> MEDIA_TYPES = new HashSet<>();
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/configs/tika-config.json
b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/configs/tika-config.json
index 3be95e7558..c960773c33 100644
---
a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/configs/tika-config.json
+++
b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/configs/tika-config.json
@@ -1,7 +1,7 @@
{
"parsers": [
{
- "org.apache.tika.parser.ner.NamedEntityParser": {
+ "named-entity-parser": {
"_mime-include": ["text/plain", "text/html", "application/xhtml+xml"]
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index d5ce0bde8a..969a2255ef 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -68,8 +68,6 @@ import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.xml.XMLProfiler;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
-import org.apache.tika.serialization.serdes.ParseContextDeserializer;
-import org.apache.tika.serialization.serdes.ParseContextSerializer;
import org.apache.tika.utils.StringUtils;
public class PDFParserTest extends TikaTest {
@@ -578,30 +576,22 @@ public class PDFParserTest extends TikaTest {
@Test
public void testPDFParserConfigSerialization() throws Exception {
- // Test that PDFParserConfig can be serialized and deserialized
through ParseContext
- PDFParserConfig config = new PDFParserConfig();
- config.setSortByPosition(true);
- config.setExtractInlineImages(true);
- config.setOcrStrategy(OcrConfig.Strategy.AUTO);
-
- ParseContext parseContext = new ParseContext();
- parseContext.set(PDFParserConfig.class, config);
+ // PDFParser is self-configuring: config goes via "pdf-parser" JSON
config path
+ String json = "{\"pdf-parser\": {\"sortByPosition\": true, " +
+ "\"extractInlineImages\": true, \"ocrStrategy\": \"AUTO\"}}";
- // Serialize using ParseContextSerializer
com.fasterxml.jackson.databind.ObjectMapper mapper =
TikaObjectMapperFactory.getMapper();
- com.fasterxml.jackson.databind.module.SimpleModule module = new
com.fasterxml.jackson.databind.module.SimpleModule();
- module.addSerializer(ParseContext.class, new ParseContextSerializer());
- module.addDeserializer(ParseContext.class, new
ParseContextDeserializer());
- mapper.registerModule(module);
-
- String json = mapper.writeValueAsString(parseContext);
- // Deserialize
ParseContext deserialized = mapper.readValue(json, ParseContext.class);
- // Verify PDFParserConfig was preserved - get it directly from
ParseContext
- PDFParserConfig deserializedConfig =
deserialized.get(PDFParserConfig.class);
+ // Verify config was stored as a JSON config entry
+ assertNotNull(deserialized.getJsonConfigs().get("pdf-parser"),
+ "pdf-parser config should be stored as JSON config");
+
+ // Verify the config can be deserialized to PDFParserConfig
+ String configJson =
deserialized.getJsonConfigs().get("pdf-parser").json();
+ PDFParserConfig deserializedConfig =
+ mapper.readValue(configJson, PDFParserConfig.class);
- assertNotNull(deserializedConfig, "PDFParserConfig should not be null
after deserialization");
assertTrue(deserializedConfig.isSortByPosition(),
"sortByPosition should be preserved");
assertTrue(deserializedConfig.isExtractInlineImages(),
diff --git a/tika-pipes/tika-pipes-api/pom.xml
b/tika-pipes/tika-pipes-api/pom.xml
index bf895ff91e..c37c5f9388 100644
--- a/tika-pipes/tika-pipes-api/pom.xml
+++ b/tika-pipes/tika-pipes-api/pom.xml
@@ -49,9 +49,28 @@
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-annotation-processor</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
</dependencies>
<build>
<plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <annotationProcessorPaths>
+ <path>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-annotation-processor</artifactId>
+ <version>${project.version}</version>
+ </path>
+ </annotationProcessorPaths>
+ </configuration>
+ </plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
diff --git
a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
index e6127d5005..f90ae7a29a 100644
---
a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
+++
b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
@@ -18,12 +18,15 @@ package org.apache.tika.pipes.api;
import java.util.Locale;
+import org.apache.tika.config.TikaComponent;
+
/**
* Controls how embedded documents are handled during parsing.
* <p>
* This can be set as a default in PipesConfig (loaded from tika-config.json)
* or overridden per-file via ParseContext.
*/
+@TikaComponent(name = "parse-mode")
public enum ParseMode {
/**
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/CrashingDetector.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/CrashingDetector.java
index 0304b65407..56270690ee 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/CrashingDetector.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/CrashingDetector.java
@@ -18,6 +18,7 @@ package org.apache.tika.pipes.core;
import java.io.IOException;
+import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -28,6 +29,7 @@ import org.apache.tika.parser.ParseContext;
* Detector that crashes with SystemExit
* Used for testing crash handling during pre-parse detection phase.
*/
+@TikaComponent(spi = false)
public class CrashingDetector implements Detector {
@Override
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
index c603bd12e5..7ab8a7f3a5 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
@@ -34,15 +34,13 @@ import org.apache.tika.config.TimeoutLimits;
import org.apache.tika.config.loader.TikaJsonConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.filter.CompositeMetadataFilter;
-import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.PipesResult;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
-import org.apache.tika.serialization.ParseContextUtils;
+
public class PipesClientTest {
String fetcherName = "fsf";
@@ -115,8 +113,7 @@ public class PipesClientTest {
@Test
public void testMetadataFilterFromJsonConfig(@TempDir Path tmp) throws
Exception {
// Test that metadata filters specified as JSON array in jsonConfigs
- // are properly resolved and applied during pipe processing.
- // This tests the full serialization/deserialization flow.
+ // survive serialization to the forked PipesServer and are applied.
ParseContext parseContext = new ParseContext();
parseContext.setJsonConfig("metadata-filters", """
[
@@ -124,14 +121,6 @@ public class PipesClientTest {
]
""");
- // Resolve the config to actual MetadataFilter instances
- ParseContextUtils.resolveAll(parseContext,
PipesClientTest.class.getClassLoader());
-
- // Verify the filter was resolved
- MetadataFilter resolvedFilter = parseContext.get(MetadataFilter.class);
- Assertions.assertNotNull(resolvedFilter, "MetadataFilter should be
resolved from jsonConfigs");
- assertEquals(CompositeMetadataFilter.class, resolvedFilter.getClass());
-
PipesClient pipesClient = init(tmp, testDoc);
PipesResult pipesResult = pipesClient.process(
new FetchEmitTuple(testDoc, new FetchKey(fetcherName, testDoc),
@@ -146,7 +135,7 @@ public class PipesClientTest {
@Test
public void testMultipleMetadataFiltersFromJsonConfig(@TempDir Path tmp)
throws Exception {
- // Test multiple filters specified as JSON array
+ // Test multiple filters specified as JSON array survive serialization
ParseContext parseContext = new ParseContext();
parseContext.setJsonConfig("metadata-filters", """
[
@@ -155,9 +144,6 @@ public class PipesClientTest {
]
""");
- // Resolve the config to actual MetadataFilter instances
- ParseContextUtils.resolveAll(parseContext,
PipesClientTest.class.getClassLoader());
-
String testFile = "mock-embedded.xml";
PipesClient pipesClient = init(tmp, testFile);
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-crashing-detector.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-crashing-detector.json
index 0fbf912a1d..a67b3ffb15 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-crashing-detector.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-crashing-detector.json
@@ -33,7 +33,7 @@
},
"detectors": [
{
- "org.apache.tika.pipes.core.CrashingDetector": {}
+ "crashing-detector": {}
}
],
"plugin-roots": "PLUGINS_PATHS"
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
index 39860928bd..f82daa581c 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
@@ -18,13 +18,22 @@ package org.apache.tika.config.loader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
+import java.util.HashSet;
+import java.util.Set;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.JsonConfig;
+import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.serialization.ComponentNameResolver;
import org.apache.tika.utils.ServiceLoaderUtils;
@@ -123,12 +132,13 @@ public class ComponentInstantiator {
// No JsonConfig constructor, fall back to other methods
}
- // Fall back to Jackson bean deserialization or zero-arg
constructor
+ // Fall back to no-arg constructor + Jackson bean deserialization
(readerForUpdating)
+ // Using readerForUpdating preserves defaults from the no-arg
constructor,
+ // unlike treeToValue which would null out unspecified fields.
T component;
- if (configNode == null || configNode.isEmpty()) {
- component = (T)
componentClass.getDeclaredConstructor().newInstance();
- } else {
- component = (T) objectMapper.treeToValue(configNode,
componentClass);
+ component = (T)
componentClass.getDeclaredConstructor().newInstance();
+ if (configNode != null && !configNode.isEmpty()) {
+
objectMapper.readerForUpdating(component).readValue(configNode);
}
// Call initialize() on Initializable components
@@ -170,6 +180,143 @@ public class ComponentInstantiator {
}
}
+ /**
+ * Instantiates a Tika component with full special-case handling.
+ * <p>
+ * This is the primary entry point for component instantiation from JSON
configuration.
+ * Handles:
+ * <ul>
+ * <li>Type resolution via {@link
ComponentNameResolver#resolveClass}</li>
+ * <li>Type compatibility validation against expectedType</li>
+ * <li>Special cases: DefaultParser/DefaultDetector rejection, MimeTypes
singleton</li>
+ * <li>{@code _mime-include}/{@code _mime-exclude} extraction and
stripping</li>
+ * <li>Instantiation: no-arg ctor when config is empty; otherwise JsonConfig ctor,
else no-arg ctor + readerForUpdating</li>
+ * <li>{@link Initializable#initialize()} callback</li>
+ * <li>Parser MIME filter wrapping</li>
+ * </ul>
+ *
+ * @param typeName the component type name (friendly name or FQCN)
+ * @param configNode the JSON configuration node (may be null)
+ * @param mapper the ObjectMapper for deserialization
+ * @param classLoader the class loader for name resolution
+ * @param expectedType the expected interface/base type (for validation),
or null to skip
+ * @return the instantiated component
+ * @throws TikaConfigException if instantiation fails
+ */
+ @SuppressWarnings("unchecked")
+ public static <T> T instantiateComponent(String typeName, JsonNode
configNode,
+ ObjectMapper mapper, ClassLoader
classLoader,
+ Class<?> expectedType)
+ throws TikaConfigException {
+ // Resolve the class using ComponentNameResolver
+ Class<?> clazz;
+ try {
+ clazz = ComponentNameResolver.resolveClass(typeName, classLoader);
+ } catch (ClassNotFoundException e) {
+ throw new TikaConfigException("Unknown type: " + typeName, e);
+ }
+
+ // Verify type compatibility
+ if (expectedType != null && !expectedType.isAssignableFrom(clazz)) {
+ throw new TikaConfigException("Type " + typeName + " (" +
clazz.getName() +
+ ") is not assignable to " + expectedType.getName());
+ }
+
+ // DefaultParser and DefaultDetector must be loaded via TikaLoader
+ if (clazz == DefaultParser.class) {
+ throw new TikaConfigException("DefaultParser must be loaded via
TikaLoader, not " +
+ "directly via Jackson deserialization. Use
TikaLoader.load() to load configuration.");
+ } else if (clazz == DefaultDetector.class) {
+ throw new TikaConfigException("DefaultDetector must be loaded via
TikaLoader, not " +
+ "directly via Jackson deserialization. Use
TikaLoader.load() to load configuration.");
+ }
+
+ // Extract mime filter fields before stripping them
+ Set<MediaType> includeTypes = extractMimeTypes(configNode,
"_mime-include");
+ Set<MediaType> excludeTypes = extractMimeTypes(configNode,
"_mime-exclude");
+
+ // Strip decorator fields before passing to component
+ JsonNode cleanedConfig = stripDecoratorFields(configNode);
+
+ try {
+ Object instance;
+
+ if (clazz == MimeTypes.class) {
+ // MimeTypes must use the singleton to have all type
definitions loaded
+ instance = MimeTypes.getDefaultMimeTypes();
+ } else if (cleanedConfig == null || cleanedConfig.isEmpty()) {
+ // If no config, use default constructor
+ instance = clazz.getDeclaredConstructor().newInstance();
+ } else {
+ // Try JsonConfig constructor first
+ Constructor<?> jsonConfigCtor =
findJsonConfigConstructor(clazz);
+ if (jsonConfigCtor != null) {
+ // Use plain JSON mapper since the main mapper may be
binary (Smile)
+ String json = TikaObjectMapperFactory.getPlainMapper()
+ .writeValueAsString(cleanedConfig);
+ instance = jsonConfigCtor.newInstance((JsonConfig) () ->
json);
+ } else {
+ // Fall back to no-arg constructor + Jackson bean
deserialization
+ instance = clazz.getDeclaredConstructor().newInstance();
+
mapper.readerForUpdating(instance).readValue(cleanedConfig);
+ }
+ }
+
+ // Call initialize() on Initializable components
+ initializeIfNeeded(instance);
+
+ // Wrap parser with mime filtering if include/exclude types
specified
+ if (instance instanceof Parser && (!includeTypes.isEmpty() ||
!excludeTypes.isEmpty())) {
+ instance = ParserDecorator.withMimeFilters(
+ (Parser) instance, includeTypes, excludeTypes);
+ }
+
+ return (T) instance;
+
+ } catch (TikaConfigException e) {
+ throw e;
+ } catch (Exception e) {
+ throw new TikaConfigException("Failed to instantiate: " +
typeName, e);
+ }
+ }
+
+ private static Set<MediaType> extractMimeTypes(JsonNode configNode, String
fieldName) {
+ Set<MediaType> types = new HashSet<>();
+ if (configNode == null || !configNode.has(fieldName)) {
+ return types;
+ }
+ JsonNode arrayNode = configNode.get(fieldName);
+ if (arrayNode.isArray()) {
+ for (JsonNode typeNode : arrayNode) {
+ types.add(MediaType.parse(typeNode.asText()));
+ }
+ }
+ return types;
+ }
+
+ private static Constructor<?> findJsonConfigConstructor(Class<?> clazz) {
+ try {
+ return clazz.getConstructor(JsonConfig.class);
+ } catch (NoSuchMethodException e) {
+ return null;
+ }
+ }
+
+ /**
+ * Strips decorator fields (_mime-include, _mime-exclude) from config node.
+ * These fields are handled by the loader (see instantiateComponent) for wrapping, not by the
component itself.
+ * Note: _exclude is NOT stripped as it's used by DefaultParser for SPI
exclusions.
+ */
+ private static JsonNode stripDecoratorFields(JsonNode configNode) {
+ if (configNode == null || !configNode.isObject()) {
+ return configNode;
+ }
+ ObjectNode cleaned = configNode.deepCopy();
+ cleaned.remove("_mime-include");
+ cleaned.remove("_mime-exclude");
+ return cleaned;
+ }
+
/**
* Checks if the JsonConfig contains actual configuration (non-empty JSON
object with fields).
*
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
index e832dc8d4b..0a24e71705 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
@@ -55,6 +55,35 @@ public class TikaObjectMapperFactory {
private static ObjectMapper MAPPER = null;
+ // Shared plain ObjectMapper (no TikaModule) for converting JsonNodes to
JSON strings.
+ // Needed because the main mapper may use a binary format (e.g., Smile)
+ // which doesn't support writeValueAsString().
+ private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper();
+
+ static {
+ // Components with no bean properties (e.g., parsers with no
configuration)
+ // need to serialize as empty objects rather than throwing.
+ PLAIN_MAPPER.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS);
+ }
+
+ /**
+ * Returns a shared plain ObjectMapper without TikaModule registration.
+ * <p>
+ * This mapper is suitable for:
+ * <ul>
+ * <li>Converting JsonNodes to JSON strings</li>
+ * <li>Serializing component properties without compact format
wrapping</li>
+ * <li>Avoiding infinite recursion when serializing inside
TikaModule</li>
+ * </ul>
+ * <p>
+ * Has {@code FAIL_ON_EMPTY_BEANS} disabled to allow serialization of
classes with no properties.
+ *
+ * @return the shared plain ObjectMapper
+ */
+ public static ObjectMapper getPlainMapper() {
+ return PLAIN_MAPPER;
+ }
+
public static synchronized ObjectMapper getMapper() {
if (MAPPER == null) {
MAPPER = createMapper();
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
index 195cfd6df0..b1e1d6673a 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
@@ -17,6 +17,7 @@
package org.apache.tika.serialization;
import java.util.Collections;
+import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
@@ -24,7 +25,19 @@ import java.util.concurrent.ConcurrentHashMap;
import org.apache.tika.config.loader.ComponentInfo;
import org.apache.tika.config.loader.ComponentRegistry;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
+import org.apache.tika.extractor.UnpackSelector;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.sax.ContentHandlerDecoratorFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
/**
* Utility class that resolves friendly component names to classes using
ComponentRegistry.
@@ -37,6 +50,29 @@ import org.apache.tika.exception.TikaConfigException;
*/
public final class ComponentNameResolver {
+ /**
+ * Interfaces that use compact format serialization and serve as
ParseContext keys.
+ * Types implementing these interfaces will be serialized as:
+ * - "type-name" for defaults
+ * - {"type-name": {...}} for configured instances
+ */
+ private static final Set<Class<?>> CONTEXT_KEY_INTERFACES = new
HashSet<>();
+
+ static {
+ CONTEXT_KEY_INTERFACES.add(Parser.class);
+ CONTEXT_KEY_INTERFACES.add(Detector.class);
+ CONTEXT_KEY_INTERFACES.add(EncodingDetector.class);
+ CONTEXT_KEY_INTERFACES.add(MetadataFilter.class);
+ CONTEXT_KEY_INTERFACES.add(Translator.class);
+ CONTEXT_KEY_INTERFACES.add(Renderer.class);
+ CONTEXT_KEY_INTERFACES.add(DigesterFactory.class);
+ CONTEXT_KEY_INTERFACES.add(EmbeddedDocumentExtractorFactory.class);
+ CONTEXT_KEY_INTERFACES.add(MetadataWriteLimiterFactory.class);
+ CONTEXT_KEY_INTERFACES.add(ContentHandlerDecoratorFactory.class);
+ CONTEXT_KEY_INTERFACES.add(ContentHandlerFactory.class);
+ CONTEXT_KEY_INTERFACES.add(UnpackSelector.class);
+ }
+
private static final Map<String, ComponentRegistry> REGISTRIES = new
ConcurrentHashMap<>();
// Component configuration storage (keyed by JSON field name and by
component class)
@@ -77,7 +113,10 @@ public final class ComponentNameResolver {
}
}
}
- return Class.forName(name, false, classLoader);
+ throw new ClassNotFoundException(
+ "Component '" + name + "' is not registered. " +
+ "Components must be registered via @TikaComponent annotation
or .idx file. " +
+ "Arbitrary class names are not allowed for security reasons.");
}
/**
@@ -204,6 +243,69 @@ public final class ComponentNameResolver {
return Collections.unmodifiableSet(FIELD_TO_CONFIG.keySet());
}
+ // ==================== Context Key Resolution Methods ====================
+
+ /**
+ * Returns the set of interfaces that use compact format serialization.
+ *
+ * @return unmodifiable set of context key interfaces
+ */
+ public static Set<Class<?>> getContextKeyInterfaces() {
+ return Collections.unmodifiableSet(CONTEXT_KEY_INTERFACES);
+ }
+
+ /**
+ * Finds the appropriate context key interface for a given type.
+ * This is used to determine which interface should be used as the
ParseContext key
+ * when storing instances of this type.
+ *
+ * @param type the type to find the context key for
+ * @return the interface to use as context key, or null if none found
+ */
+ public static Class<?> findContextKeyInterface(Class<?> type) {
+ for (Class<?> iface : CONTEXT_KEY_INTERFACES) {
+ if (iface.isAssignableFrom(type)) {
+ return iface;
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Checks if a type should use compact format serialization.
+ * Returns true if the type implements any of the registered context key
interfaces.
+ *
+ * @param type the type to check
+ * @return true if the type uses compact format
+ */
+ public static boolean usesCompactFormat(Class<?> type) {
+ return findContextKeyInterface(type) != null;
+ }
+
+ /**
+ * Determines the ParseContext key for a component.
+ * <p>
+ * Resolution order:
+ * <ol>
+ * <li>Explicit contextKey from .idx file (via @TikaComponent
annotation)</li>
+ * <li>Auto-detect from implemented interfaces (using
CONTEXT_KEY_INTERFACES)</li>
+ * <li>Fall back to the component class itself</li>
+ * </ol>
+ *
+ * @param info the component info
+ * @return the class to use as ParseContext key
+ */
+ public static Class<?> determineContextKey(ComponentInfo info) {
+ if (info.contextKey() != null) {
+ return info.contextKey();
+ }
+ Class<?> interfaceKey = findContextKeyInterface(info.componentClass());
+ if (interfaceKey != null) {
+ return interfaceKey;
+ }
+ return info.componentClass();
+ }
+
/**
* Gets the contextKey for a class from the component registry.
* The contextKey is recorded in the .idx file by the annotation processor.
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
index 626b69dfe7..252af2396b 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
@@ -140,7 +140,7 @@ public class ParseContextUtils {
}
// Determine the context key
- Class<?> contextKey = determineContextKey(info);
+ Class<?> contextKey =
ComponentNameResolver.determineContextKey(info);
try {
// Deserialize and cache in resolvedConfigs, also add to
context
@@ -157,36 +157,6 @@ public class ParseContextUtils {
}
}
- /**
- * Determines the ParseContext key for a component.
- * <p>
- * Resolution order:
- * <ol>
- * <li>Explicit contextKey from .idx file (via @TikaComponent
annotation)</li>
- * <li>Auto-detect from implemented interfaces (using
TikaModule.COMPACT_FORMAT_INTERFACES)</li>
- * <li>Fall back to the component class itself</li>
- * </ol>
- * <p>
- * Security note: This only determines the context key - it does NOT
affect which
- * classes can be instantiated. Classes must still be registered via
@TikaComponent.
- *
- * @param info the component info
- * @return the class to use as ParseContext key
- */
- private static Class<?> determineContextKey(ComponentInfo info) {
- // Use explicit contextKey from .idx file if specified
- if (info.contextKey() != null) {
- return info.contextKey();
- }
- // Auto-detect from implemented interfaces at runtime
- Class<?> contextKeyInterface =
TikaModule.findContextKeyInterface(info.componentClass());
- if (contextKeyInterface != null) {
- return contextKeyInterface;
- }
- // Fall back to the component class itself
- return info.componentClass();
- }
-
/**
* Resolves an array config entry (e.g., "metadata-filters") to a
composite component.
* <p>
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
index 8277632830..63ea711796 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
@@ -17,10 +17,8 @@
package org.apache.tika.serialization;
import java.io.IOException;
-import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
-import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@@ -36,36 +34,21 @@ import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationConfig;
-import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.deser.Deserializers;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.ser.Serializers;
-import org.apache.tika.config.Initializable;
-import org.apache.tika.config.JsonConfig;
-import org.apache.tika.config.SelfConfiguring;
+import org.apache.tika.config.loader.ComponentInstantiator;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.detect.EncodingDetector;
-import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
-import org.apache.tika.extractor.UnpackSelector;
-import org.apache.tika.language.translate.Translator;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.filter.MetadataFilter;
-import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
-import org.apache.tika.renderer.Renderer;
-import org.apache.tika.sax.ContentHandlerDecoratorFactory;
-import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.serialization.serdes.DefaultDetectorSerializer;
import org.apache.tika.serialization.serdes.DefaultParserSerializer;
import org.apache.tika.serialization.serdes.MetadataDeserializer;
@@ -91,64 +74,6 @@ public class TikaModule extends SimpleModule {
private static ObjectMapper sharedMapper;
- // Plain JSON mapper for converting JsonNodes to JSON strings.
- // This is needed because the main mapper may use a binary format (e.g.,
Smile)
- // which doesn't support writeValueAsString().
- private static final ObjectMapper JSON_MAPPER = new ObjectMapper();
-
- /**
- * Interfaces that use compact format serialization.
- * Types implementing these interfaces will be serialized as:
- * - "type-name" for defaults
- * - {"type-name": {...}} for configured instances
- */
- private static final Set<Class<?>> COMPACT_FORMAT_INTERFACES = new
HashSet<>();
-
- static {
- // Core component interfaces that use compact format
- COMPACT_FORMAT_INTERFACES.add(Parser.class);
- COMPACT_FORMAT_INTERFACES.add(Detector.class);
- COMPACT_FORMAT_INTERFACES.add(EncodingDetector.class);
- COMPACT_FORMAT_INTERFACES.add(MetadataFilter.class);
- COMPACT_FORMAT_INTERFACES.add(Translator.class);
- COMPACT_FORMAT_INTERFACES.add(Renderer.class);
- COMPACT_FORMAT_INTERFACES.add(DigesterFactory.class);
- COMPACT_FORMAT_INTERFACES.add(EmbeddedDocumentExtractorFactory.class);
- COMPACT_FORMAT_INTERFACES.add(MetadataWriteLimiterFactory.class);
- COMPACT_FORMAT_INTERFACES.add(ContentHandlerDecoratorFactory.class);
- COMPACT_FORMAT_INTERFACES.add(ContentHandlerFactory.class);
- COMPACT_FORMAT_INTERFACES.add(UnpackSelector.class);
- }
-
- /**
- * Checks if a type should use compact format serialization.
- * Returns true if the type implements any of the registered compact
format interfaces.
- */
- private static boolean usesCompactFormat(Class<?> type) {
- return findContextKeyInterface(type) != null;
- }
-
- /**
- * Finds the appropriate context key interface for a given type.
- * This is used to determine which interface should be used as the
ParseContext key
- * when storing instances of this type.
- * <p>
- * Security note: This method only helps determine the context key - it
does NOT
- * affect which classes can be instantiated. Classes must still be
registered
- * via @TikaComponent to be deserializable.
- *
- * @param type the type to find the context key for
- * @return the interface to use as context key, or null if none found
- */
- public static Class<?> findContextKeyInterface(Class<?> type) {
- for (Class<?> iface : COMPACT_FORMAT_INTERFACES) {
- if (iface.isAssignableFrom(type)) {
- return iface;
- }
- }
- return null;
- }
-
public TikaModule() {
super("TikaModule");
@@ -220,7 +145,8 @@ public class TikaModule extends SimpleModule {
// Concrete implementations (like ExternalParser, HtmlParser)
should use normal
// Jackson bean deserialization for their properties.
if (rawClass.isInterface() ||
Modifier.isAbstract(rawClass.getModifiers())) {
- if (COMPACT_FORMAT_INTERFACES.contains(rawClass) ||
usesCompactFormat(rawClass)) {
+ if
(ComponentNameResolver.getContextKeyInterfaces().contains(rawClass) ||
+ ComponentNameResolver.usesCompactFormat(rawClass)) {
return new TikaComponentDeserializer(rawClass);
}
}
@@ -253,7 +179,8 @@ public class TikaModule extends SimpleModule {
// Only serialize with compact format if type implements a compact
format interface
// AND has a registered friendly name
- if (usesCompactFormat(rawClass) &&
ComponentNameResolver.getFriendlyName(rawClass) != null) {
+ if (ComponentNameResolver.usesCompactFormat(rawClass) &&
+ ComponentNameResolver.getFriendlyName(rawClass) != null) {
return new TikaComponentSerializer();
}
@@ -263,6 +190,7 @@ public class TikaModule extends SimpleModule {
/**
* Deserializer that handles both string and object formats for Tika
components.
+ * Delegates to {@link ComponentInstantiator#instantiateComponent} for
instantiation.
*/
private static class TikaComponentDeserializer extends
JsonDeserializer<Object> {
private final Class<?> expectedType;
@@ -281,14 +209,15 @@ public class TikaModule extends SimpleModule {
"Call TikaModule.setSharedMapper() before
deserializing.");
}
+ String typeName;
+ JsonNode configNode;
+
if (node.isTextual()) {
- // Simple string format: "pdf-parser"
- String typeName = node.asText();
- return instantiate(typeName, null, mapper);
+ typeName = node.asText();
+ configNode = null;
} else if (node.isObject()) {
Iterator<Map.Entry<String, JsonNode>> fields = node.fields();
if (!fields.hasNext()) {
- // Empty object {} - try to create default instance if
expectedType is concrete
try {
return
expectedType.getDeclaredConstructor().newInstance();
} catch (ReflectiveOperationException e) {
@@ -297,136 +226,19 @@ public class TikaModule extends SimpleModule {
}
}
Map.Entry<String, JsonNode> entry = fields.next();
- return instantiate(entry.getKey(), entry.getValue(), mapper);
+ typeName = entry.getKey();
+ configNode = entry.getValue();
} else {
throw new IOException("Expected string or object for " +
expectedType.getSimpleName() + ", got: " +
node.getNodeType());
}
- }
-
- private Object instantiate(String typeName, JsonNode configNode,
ObjectMapper mapper) throws IOException {
- // Resolve the class using ComponentNameResolver
- Class<?> clazz;
- try {
- clazz = ComponentNameResolver.resolveClass(typeName,
- Thread.currentThread().getContextClassLoader());
- } catch (ClassNotFoundException e) {
- throw new IOException("Unknown type: " + typeName, e);
- }
-
- // Verify type compatibility
- if (!expectedType.isAssignableFrom(clazz)) {
- throw new IOException("Type " + typeName + " (" +
clazz.getName() +
- ") is not assignable to " + expectedType.getName());
- }
-
- // Extract mime filter fields before stripping them
- Set<MediaType> includeTypes = extractMimeTypes(configNode,
"_mime-include");
- Set<MediaType> excludeTypes = extractMimeTypes(configNode,
"_mime-exclude");
-
- // Strip decorator fields before passing to component
- JsonNode cleanedConfig = stripDecoratorFields(configNode);
try {
- Object instance;
-
- // DefaultParser and DefaultDetector must be loaded via
TikaLoader for proper dependency injection
- if (clazz == DefaultParser.class) {
- throw new IOException("DefaultParser must be loaded via
TikaLoader, not directly " +
- "via Jackson deserialization. Use
TikaLoader.load() to load configuration.");
- } else if (clazz == DefaultDetector.class) {
- throw new IOException("DefaultDetector must be loaded via
TikaLoader, not directly " +
- "via Jackson deserialization. Use
TikaLoader.load() to load configuration.");
- } else if (clazz == MimeTypes.class) {
- // MimeTypes must use the singleton to have all type
definitions loaded
- instance = MimeTypes.getDefaultMimeTypes();
- } else if (cleanedConfig == null || cleanedConfig.isEmpty()) {
- // If no config, use default constructor
- instance = clazz.getDeclaredConstructor().newInstance();
- } else {
- // Try JsonConfig constructor first (works for any
component)
- Constructor<?> jsonConfigCtor =
findJsonConfigConstructor(clazz);
- if (jsonConfigCtor != null) {
- // Use plain JSON mapper since the main mapper may be
binary (Smile)
- String json =
JSON_MAPPER.writeValueAsString(cleanedConfig);
- instance = jsonConfigCtor.newInstance((JsonConfig) ()
-> json);
- } else {
- // Fall back to no-arg constructor + Jackson bean
deserialization
- instance =
clazz.getDeclaredConstructor().newInstance();
-
mapper.readerForUpdating(instance).readValue(cleanedConfig);
- }
- }
-
- // Call initialize() on Initializable components
- if (instance instanceof Initializable) {
- try {
- ((Initializable) instance).initialize();
- } catch (TikaConfigException e) {
- throw new IOException("Failed to initialize " +
typeName, e);
- }
- }
-
- // Wrap parser with mime filtering if include/exclude types
specified
- if (instance instanceof Parser && (!includeTypes.isEmpty() ||
!excludeTypes.isEmpty())) {
- instance = ParserDecorator.withMimeFilters((Parser)
instance, includeTypes, excludeTypes);
- }
-
- return instance;
-
- } catch (ReflectiveOperationException e) {
- throw new IOException("Failed to instantiate: " + typeName, e);
- }
- }
-
- private Set<MediaType> extractMimeTypes(JsonNode configNode, String
fieldName) {
- Set<MediaType> types = new HashSet<>();
- if (configNode == null || !configNode.has(fieldName)) {
- return types;
- }
- JsonNode arrayNode = configNode.get(fieldName);
- if (arrayNode.isArray()) {
- for (JsonNode typeNode : arrayNode) {
- types.add(MediaType.parse(typeNode.asText()));
- }
- }
- return types;
- }
-
- private Constructor<?> findJsonConfigConstructor(Class<?> clazz) {
- try {
- return clazz.getConstructor(JsonConfig.class);
- } catch (NoSuchMethodException e) {
- return null;
- }
- }
-
- /**
- * Deserializes a JsonNode using a dedicated deserializer.
- */
- private <T> T deserializeWithNode(JsonDeserializer<T> deserializer,
JsonNode node,
- ObjectMapper mapper) throws
IOException {
- if (node == null) {
- node = mapper.createObjectNode();
- }
- try (JsonParser p = mapper.treeAsTokens(node)) {
- p.nextToken();
- return deserializer.deserialize(p,
mapper.getDeserializationContext());
- }
- }
-
- /**
- * Strips decorator fields (_mime-include, _mime-exclude) from config
node.
- * These fields are handled by TikaLoader for wrapping, not by the
component itself.
- * Note: _exclude is NOT stripped as it's used by DefaultParser for
SPI exclusions.
- */
- private JsonNode stripDecoratorFields(JsonNode configNode) {
- if (configNode == null || !configNode.isObject()) {
- return configNode;
+ return ComponentInstantiator.instantiateComponent(typeName,
configNode,
+ mapper,
Thread.currentThread().getContextClassLoader(), expectedType);
+ } catch (TikaConfigException e) {
+ throw new IOException(e.getMessage(), e);
}
- ObjectNode cleaned = configNode.deepCopy();
- cleaned.remove("_mime-include");
- cleaned.remove("_mime-exclude");
- return cleaned;
}
}
@@ -435,12 +247,8 @@ public class TikaModule extends SimpleModule {
* Outputs simple string if using defaults, object with type key if
configured.
*/
private static class TikaComponentSerializer extends
JsonSerializer<Object> {
- // Plain mapper for serializing without TikaModule (avoids infinite
recursion)
- private final ObjectMapper plainMapper;
TikaComponentSerializer() {
- this.plainMapper = new ObjectMapper();
- this.plainMapper.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS);
}
@Override
@@ -507,8 +315,8 @@ public class TikaModule extends SimpleModule {
// Create default config to compare against
Object defaultConfig =
config.getClass().getDeclaredConstructor().newInstance();
- ObjectNode configNode = plainMapper.valueToTree(config);
- ObjectNode defaultNode =
plainMapper.valueToTree(defaultConfig);
+ ObjectNode configNode =
TikaObjectMapperFactory.getPlainMapper().valueToTree(config);
+ ObjectNode defaultNode =
TikaObjectMapperFactory.getPlainMapper().valueToTree(defaultConfig);
// Only keep properties that differ from defaults
ObjectNode result = mapper.createObjectNode();
@@ -525,10 +333,10 @@ public class TikaModule extends SimpleModule {
// No config object - serialize the component directly
Object defaultInstance =
value.getClass().getDeclaredConstructor().newInstance();
- ObjectNode valueNode = plainMapper.valueToTree(value);
- ObjectNode defaultNode =
plainMapper.valueToTree(defaultInstance);
+ ObjectNode valueNode =
TikaObjectMapperFactory.getPlainMapper().valueToTree(value);
+ ObjectNode defaultNode =
TikaObjectMapperFactory.getPlainMapper().valueToTree(defaultInstance);
- ObjectNode result = plainMapper.createObjectNode();
+ ObjectNode result =
TikaObjectMapperFactory.getPlainMapper().createObjectNode();
Iterator<Map.Entry<String, JsonNode>> fields =
valueNode.fields();
while (fields.hasNext()) {
Map.Entry<String, JsonNode> field = fields.next();
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
index c8141c47d9..3e526f7b88 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
@@ -34,9 +34,9 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.config.loader.ComponentInfo;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.serialization.ComponentNameResolver;
-import org.apache.tika.serialization.TikaModule;
/**
* Deserializes ParseContext from JSON.
@@ -61,10 +61,9 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
private static final Logger LOG =
LoggerFactory.getLogger(ParseContextDeserializer.class);
- // Plain JSON mapper for converting JsonNodes to JSON strings.
- // This is needed because the main mapper may use a binary format (e.g.,
Smile)
- // which doesn't support writeValueAsString().
- private static final ObjectMapper JSON_MAPPER = new ObjectMapper();
+ private static ObjectMapper plainMapper() {
+ return TikaObjectMapperFactory.getPlainMapper();
+ }
@Override
public ParseContext deserialize(JsonParser jsonParser,
DeserializationContext ctxt)
@@ -120,7 +119,7 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
// Store as JSON config for lazy resolution
// Use plain JSON mapper since the main mapper may be binary
(Smile)
- String json = JSON_MAPPER.writeValueAsString(value);
+ String json = plainMapper().writeValueAsString(value);
parseContext.setJsonConfig(name, json);
}
}
@@ -128,21 +127,6 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
return parseContext;
}
- /**
- * Determines the context key for a component.
- * Uses explicit contextKey if available, otherwise auto-detects from
interfaces.
- */
- private static Class<?> determineContextKey(ComponentInfo info) {
- if (info.contextKey() != null) {
- return info.contextKey();
- }
- Class<?> interfaceKey =
TikaModule.findContextKeyInterface(info.componentClass());
- if (interfaceKey != null) {
- return interfaceKey;
- }
- return info.componentClass();
- }
-
/**
* Checks if a JSON config entry would create a duplicate context key.
* <p>
@@ -172,7 +156,7 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
return;
}
- Class<?> contextKey = determineContextKey(info);
+ Class<?> contextKey = ComponentNameResolver.determineContextKey(info);
String existingName = seenContextKeys.get(contextKey);
if (existingName != null) {
@@ -215,25 +199,16 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
contextKeyClass = info.contextKey();
}
- // If not found in registry, try as fully qualified class name
+ // If not found in registry, reject — components must be registered
if (configClass == null) {
- try {
- configClass = Class.forName(componentName);
- // Check if the class has a contextKey via its annotation
- contextKeyClass =
ComponentNameResolver.getContextKey(configClass);
- } catch (ClassNotFoundException e) {
- LOG.warn("Could not find class for typed component '{}',
storing as JSON config",
- componentName);
- // Fall back to storing as JSON config (use plain JSON
mapper)
- parseContext.setJsonConfig(componentName,
JSON_MAPPER.writeValueAsString(configNode));
- continue;
- }
+ throw new IOException("Unknown typed component '" +
componentName + "'. " +
+ "Components must be registered via @TikaComponent
annotation or .idx file.");
}
// Determine context key: explicit > interface detection > class
itself
Class<?> parseContextKey = contextKeyClass;
if (parseContextKey == null) {
- parseContextKey =
TikaModule.findContextKeyInterface(configClass);
+ parseContextKey =
ComponentNameResolver.findContextKeyInterface(configClass);
}
if (parseContextKey == null) {
parseContextKey = configClass;
@@ -257,7 +232,7 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
LOG.warn("Failed to deserialize typed component '{}' as {},
storing as JSON config",
componentName, configClass.getName(), e);
// Use plain JSON mapper since main mapper may be binary
(Smile)
- parseContext.setJsonConfig(componentName,
JSON_MAPPER.writeValueAsString(configNode));
+ parseContext.setJsonConfig(componentName,
plainMapper().writeValueAsString(configNode));
}
}
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
index d884f93553..3168b4834b 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
@@ -27,6 +27,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializerProvider;
import org.apache.tika.config.JsonConfig;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.serialization.ComponentNameResolver;
@@ -51,12 +52,8 @@ public class ParseContextSerializer extends
JsonSerializer<ParseContext> {
public static final String PARSE_CONTEXT = "parse-context";
public static final String TYPED = "typed";
- // Plain mapper for serializing values without TikaModule's component
wrapping
- private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper();
-
- static {
- // Allow serialization of classes with no properties
-
PLAIN_MAPPER.disable(com.fasterxml.jackson.databind.SerializationFeature.FAIL_ON_EMPTY_BEANS);
+ private static ObjectMapper plainMapper() {
+ return TikaObjectMapperFactory.getPlainMapper();
}
@Override
@@ -81,14 +78,13 @@ public class ParseContextSerializer extends
JsonSerializer<ParseContext> {
continue;
}
- // Use the actual value's class for serialization, not the key
class (which may be an interface)
- // This ensures we can deserialize back to the concrete class
- String valueClassName = value.getClass().getName();
-
- // Try to find a friendly component name for the value's class,
otherwise use FQCN
- String keyName = findComponentName(valueClassName);
+ // Find the friendly component name — all serializable components
must be registered
+ String keyName =
ComponentNameResolver.getFriendlyName(value.getClass());
if (keyName == null) {
- keyName = valueClassName;
+ throw new IOException(
+ "Cannot serialize ParseContext entry: " +
value.getClass().getName() +
+ " is not registered. Components must be registered via
" +
+ "@TikaComponent annotation or .idx file to be
serializable.");
}
if (!hasTypedObjects) {
@@ -99,7 +95,7 @@ public class ParseContextSerializer extends
JsonSerializer<ParseContext> {
gen.writeFieldName(keyName);
// Use writeTree instead of writeRawValue for binary format
support (e.g., Smile)
// and stricter validation (fails early if value can't be
serialized)
- gen.writeTree(PLAIN_MAPPER.valueToTree(value));
+ gen.writeTree(plainMapper().valueToTree(value));
// Track this name so we skip it in jsonConfigs
serializedNames.add(keyName);
@@ -119,26 +115,10 @@ public class ParseContextSerializer extends
JsonSerializer<ParseContext> {
}
gen.writeFieldName(entry.getKey());
// Parse the JSON string into a tree for binary format support
- gen.writeTree(PLAIN_MAPPER.readTree(entry.getValue().json()));
+ gen.writeTree(plainMapper().readTree(entry.getValue().json()));
}
gen.writeEndObject();
}
- /**
- * Finds the component name for a class.
- * Uses ComponentNameResolver for registry lookup. Only classes registered
- * in a component registry will be serialized.
- *
- * @param className the fully qualified class name
- * @return the component name, or null if not registered
- */
- private String findComponentName(String className) {
- try {
- Class<?> clazz = Class.forName(className);
- return ComponentNameResolver.getFriendlyName(clazz);
- } catch (ClassNotFoundException e) {
- return null;
- }
- }
}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
index e4dcd731e3..05669826e2 100644
---
a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
+++
b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
@@ -22,6 +22,7 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
@@ -256,20 +257,13 @@ public class TestParseContextSerialization {
}
@Test
- public void testProgrammaticObjectsNotSerialized() throws Exception {
- // Typed objects set via context.set() are NOT serialized
- // Only jsonConfigs are serialized for clean round-trip
+ public void testUnregisteredObjectFailsSerialization() throws Exception {
+ // Unregistered objects must fail serialization with a clear error
ParseContext pc = new ParseContext();
-
- // String doesn't have a @TikaComponent annotation
pc.set(String.class, "test-value");
- String json = serializeParseContext(pc);
-
- // Should be empty - typed objects are not serialized
- ObjectMapper mapper = createMapper();
- JsonNode root = mapper.readTree(json);
- assertEquals(1, root.size(), "Typed objects should be serialized");
+ assertThrows(IOException.class, () -> serializeParseContext(pc),
+ "Unregistered components should fail serialization");
}
@Test