This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 33e1f6a5c8 rm asciidoc plugin and resources
33e1f6a5c8 is described below
commit 33e1f6a5c879da3219665842b6a7eaffe5c732b4
Author: tallison <[email protected]>
AuthorDate: Fri Jan 23 17:35:06 2026 -0500
rm asciidoc plugin and resources
---
docs/pom.xml | 33 ---
docs/src/assembly/docs.xml | 2 +-
docs/src/main/asciidoc/advanced/index.adoc | 26 --
docs/src/main/asciidoc/advanced/robustness.adoc | 137 -----------
docs/src/main/asciidoc/advanced/spooling.adoc | 229 -----------------
docs/src/main/asciidoc/configuration/index.adoc | 40 ---
.../asciidoc/configuration/parsers/pdf-parser.adoc | 43 ----
.../parsers/tesseract-ocr-parser.adoc | 67 -----
docs/src/main/asciidoc/faq.adoc | 28 ---
docs/src/main/asciidoc/index.adoc | 72 ------
docs/src/main/asciidoc/maintainers/index.adoc | 29 ---
.../maintainers/release-guides/docker.adoc | 133 ----------
.../asciidoc/maintainers/release-guides/grpc.adoc | 32 ---
.../asciidoc/maintainers/release-guides/helm.adoc | 138 -----------
.../asciidoc/maintainers/release-guides/index.adoc | 32 ---
.../asciidoc/maintainers/release-guides/tika.adoc | 271 ---------------------
.../asciidoc/migration-to-4x/design-notes-4x.adoc | 127 ----------
docs/src/main/asciidoc/migration-to-4x/index.adoc | 32 ---
.../migration-to-4x/metadata-changes-4x.adoc | 121 ---------
.../asciidoc/migration-to-4x/migrating-to-4x.adoc | 157 ------------
.../asciidoc/migration-to-4x/serialization-4x.adoc | 101 --------
docs/src/main/asciidoc/pipes/index.adoc | 37 ---
docs/src/main/asciidoc/roadmap.adoc | 96 --------
docs/src/main/asciidoc/security.adoc | 34 ---
docs/src/main/asciidoc/using-tika/cli/index.adoc | 39 ---
docs/src/main/asciidoc/using-tika/grpc/index.adoc | 32 ---
docs/src/main/asciidoc/using-tika/index.adoc | 71 ------
.../using-tika/java-api/getting-started.adoc | 130 ----------
.../main/asciidoc/using-tika/java-api/index.adoc | 35 ---
.../src/main/asciidoc/using-tika/server/index.adoc | 42 ----
30 files changed, 1 insertion(+), 2365 deletions(-)
diff --git a/docs/pom.xml b/docs/pom.xml
index 00315f0686..8fdcc1c84e 100644
--- a/docs/pom.xml
+++ b/docs/pom.xml
@@ -50,39 +50,6 @@ under the License.
</filesets>
</configuration>
</plugin>
- <plugin>
- <groupId>org.asciidoctor</groupId>
- <artifactId>asciidoctor-maven-plugin</artifactId>
- <version>3.2.0</version>
- <executions>
- <execution>
- <id>output-html</id>
- <phase>generate-resources</phase>
- <goals>
- <goal>process-asciidoc</goal>
- </goals>
- <configuration>
- <doctype>article</doctype>
- <attributes>
- <source-highlighter>coderay</source-highlighter>
- <toc />
- <linkcss>false</linkcss>
- <icons>font</icons>
- <tika-stable-version>${tika.stable.version}</tika-stable-version>
- <!-- Paths to config examples for include directives -->
- <parser-examples>${project.basedir}/../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples</parser-examples>
- <server-examples>${project.basedir}/../tika-server/tika-server-core/src/test/resources/config-examples</server-examples>
- <pipes-fs-examples>${project.basedir}/../tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples</pipes-fs-examples>
- </attributes>
- </configuration>
- </execution>
- </executions>
- <configuration>
- <sourceDirectory>src/main/asciidoc</sourceDirectory>
- <preserveDirectories>true</preserveDirectories>
- </configuration>
- </plugin>
-
<!-- Maven Assembly plugin to create tar.gz -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
diff --git a/docs/src/assembly/docs.xml b/docs/src/assembly/docs.xml
index 5a4b5c5746..9a52a6000f 100644
--- a/docs/src/assembly/docs.xml
+++ b/docs/src/assembly/docs.xml
@@ -27,7 +27,7 @@ under the License.
<includeBaseDirectory>false</includeBaseDirectory>
<fileSets>
<fileSet>
- <directory>${project.build.directory}/generated-docs</directory>
+ <directory>${project.build.directory}/site</directory>
<outputDirectory>/</outputDirectory>
<includes>
<include>**/*</include>
diff --git a/docs/src/main/asciidoc/advanced/index.adoc b/docs/src/main/asciidoc/advanced/index.adoc
deleted file mode 100644
index 6fd0125c1a..0000000000
--- a/docs/src/main/asciidoc/advanced/index.adoc
+++ /dev/null
@@ -1,26 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Advanced Topics
-
-This section covers advanced usage and internals of Apache Tika.
-
-== Topics
-
-* xref:robustness.adoc[Robustness] - Process isolation and fault tolerance
when parsing untrusted content
-* xref:spooling.adoc[TikaInputStream and Spooling] - Understanding how
TikaInputStream handles buffering, caching, and spooling to disk
-* xref:metadata/index.adoc[Metadata Reference] - Documentation for Tika's
metadata fields
diff --git a/docs/src/main/asciidoc/advanced/robustness.adoc b/docs/src/main/asciidoc/advanced/robustness.adoc
deleted file mode 100644
index 7547cf8eb2..0000000000
--- a/docs/src/main/asciidoc/advanced/robustness.adoc
+++ /dev/null
@@ -1,137 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= The Robustness of Apache Tika
-
-Running parsers on untrusted data carries inherent risks. In rare cases, Tika
can
-encounter infinite loops or allocate unexpected amounts of memory
(OutOfMemoryErrors).
-When processing documents at scale, you must implement protective measures.
-
-IMPORTANT: Avoid running Tika in the same process as critical infrastructure
like
-indexers or search systems.
-
-== Process Isolation
-
-The primary defense against parser failures is process isolation. By running
parsers
-in separate processes, you protect your main application from:
-
-* OutOfMemoryErrors
-* Infinite loops
-* Native code crashes
-* Resource exhaustion
-
-=== Tika 4.x
-
-**In Tika 4.x, xref:../pipes/index.adoc[Tika Pipes] is the recommended
approach for
-robust document processing.** It provides:
-
-* Automatic process isolation
-* Fault tolerance and recovery
-* Scalable parallel processing
-* Unified architecture for all deployment scenarios
-
-Pipes can be used in multiple ways:
-
-* **Programmatically** - Via `PipesForkParser` in the `tika-pipes-fork-parser`
module
- (see xref:../using-tika/java-api/getting-started.adoc[Java API Getting
Started])
-* **Via tika-server** - REST endpoints for pipes-based processing
-* **Via tika-grpc** - gRPC interface with pipes backend
-
-In Tika 4.x, the approach to robustness has been simplified. Previous versions
offered
-four different forking mechanisms:
-
-[cols="1,2,1"]
-|===
-|Mechanism |Description |Status in 4.x
-
-|ForkParser
-|Spawned child processes for individual parse operations
-|Deprecated
-
-|tika-batch
-|Desktop/VM-scale batch processing
-|Deprecated
-
-|tika-server (forked mode)
-|REST server with forked parsing processes
-|Available, but Pipes recommended
-
-|tika-pipes
-|Scalable, fault-tolerant pipeline processing
-|*Recommended approach*
-|===
-
-=== Tika 3.x and Earlier
-
-If you are using Tika 3.x or earlier, you have several options for process
isolation:
-
-ForkParser::
-Spawns child processes to protect against out-of-memory errors and infinite
loops.
-Suitable for programmatic use in Java applications.
-
-tika-batch::
-For desktop/VM-scale processing (not cloud-scale):
-+
-[source,bash]
-----
-java -jar tika-app.jar -i <input_dir> -o <output_dir>
-----
-
-tika-server::
-In version 2.x and later, parsing defaults to forked processes. Clients must
handle
-tika-server restarts gracefully.
-
-tika-pipes::
-Available through programmatic use, tika-app `-a` option, or tika-server's
`/async`
-and `/pipes` endpoints.
-
-== Security Testing and Prevention
-
-The Apache Tika team implements several measures to identify and prevent
vulnerabilities:
-
-* **Regression testing** against ~2 million files from Common Crawl before
releases
-* **Code reviews** of dependencies to identify vulnerability patterns
-* **Fuzzing modules** for automated vulnerability discovery
-* **Collaboration** with security researchers
-* **Maintained forks** of parsers with critical fixes (released independently
when needed)
-* **Public documentation** of vulnerabilities at
xref:../security.adoc[security page]
-
-== MockParser for Testing
-
-Tika provides a `MockParser` tool for testing your system's robustness. You can
-configure it to simulate various failure modes:
-
-* Infinite loops
-* OutOfMemoryErrors
-* Excessive runtime
-* Large output generation
-
-This allows you to verify that your integration handles parser failures
gracefully.
-
-== Recommendations
-
-1. **Use Tika Pipes** (4.x) for production workloads with untrusted content
-2. **Isolate Tika** from critical systems - never run in the same JVM as your
indexer
-3. **Set timeouts** for all parsing operations
-4. **Monitor memory usage** and set appropriate limits
-5. **Plan for failures** - your system should handle parser crashes gracefully
-6. **Stay updated** - apply security updates promptly
-
-== Further Reading
-
-* xref:../pipes/index.adoc[Tika Pipes] - Recommended approach for robust
processing
-* xref:../security.adoc[Security] - Known vulnerabilities and security model
diff --git a/docs/src/main/asciidoc/advanced/spooling.adoc b/docs/src/main/asciidoc/advanced/spooling.adoc
deleted file mode 100644
index 81d3bb18e4..0000000000
--- a/docs/src/main/asciidoc/advanced/spooling.adoc
+++ /dev/null
@@ -1,229 +0,0 @@
-////
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
-////
-= Spooling in Apache Tika
-:toc:
-:toclevels: 3
-:sectnums:
-
-== Background
-
-=== What is Spooling?
-
-Spooling refers to the process of writing an input stream to a temporary file
on disk.
-This benefits certain file formats that can be processed more efficiently with
random access
-to the underlying bytes during detection or parsing.
-
-=== Why Some Formats Benefit from Random Access
-
-Several file formats are most efficiently processed with random access vs
streaming:
-
-* **OLE2 (Microsoft Office legacy formats)**: The POI library benefits from
reading the file
- as a random-access structure to navigate the OLE2 container.
-* **ZIP-based formats**: Container detection benefits from reading the ZIP
central directory,
- which is located at the end of the file. Parsing also benefits from random
access.
-* **Binary Property Lists (bplist)**: Apple's binary plist format benefits
from random access
- for efficient parsing.
-* **PDF**: While detection works via magic bytes, parsing benefits from random
access for
- the PDF cross-reference table.
-
-=== Architectural Decision: Decentralized Spooling
-
-==== The Solution: Let Components Self-Spool
-
-The current architecture follows a simple principle: **each component that
needs random
-access is responsible for obtaining it**.
-
-When a detector or parser needs random access, it calls:
-
-[source,java]
-----
-Path path = TikaInputStream.get(inputStream).getPath();
-// or
-File file = TikaInputStream.get(inputStream).getFile();
-----
-
-`TikaInputStream` handles the spooling transparently based on how it was
initialized:
-
-* **Initialized with `Path`**: The file is used directly for random access. No
spooling needed.
-* **Initialized with `byte[]`**: The bytes are kept in memory. Spooling only
on demand.
-* **Initialized with `InputStream`**: When `getPath()` or `getFile()` is
called, the stream
- is dynamically buffered to memory first, then spills to a temporary file
after a threshold.
- The temporary file is automatically cleaned up when the stream is closed.
-
-==== Benefits of Decentralized Spooling
-
-1. **Efficiency**: Spooling happens only when actually needed, not
preemptively.
-2. **Simplicity**: No central configuration of "which types need spooling."
-3. **Correctness**: Each component knows its own requirements.
-4. **Flexibility**: New formats can be added without modifying central
spooling logic.
-
-=== TikaInputStream Backing Strategies
-
-`TikaInputStream` uses configurable backing strategies that handle caching and
temporary
-file management. This means:
-
-* Repeated calls to `getFile()` return the same temporary file (no
re-spooling).
-* The `rewind()` method efficiently resets the stream for re-reading.
-* Memory-mapped and disk-backed strategies can be selected based on use case.
-
-== User Guide
-
-=== Default Behavior
-
-By default, Tika handles spooling automatically. You don't need to configure
anything
-for most use cases. When a detector or parser benefits from random access to a
file, it will
-spool the input stream to a temporary file if necessary.
-
-=== SpoolingStrategy for Fine-Grained Control
-
-For advanced use cases, you can use `SpoolingStrategy` to control spooling
behavior.
-This is useful when you want to:
-
-* Restrict which file types are allowed to spool (e.g., for performance
reasons)
-* Customize spooling behavior based on metadata or stream properties
-
-==== Programmatic Configuration
-
-[source,java]
-----
-import org.apache.tika.io.SpoolingStrategy;
-import org.apache.tika.parser.ParseContext;
-
-// Create a custom spooling strategy
-SpoolingStrategy strategy = new SpoolingStrategy();
-strategy.setSpoolTypes(Set.of(
- MediaType.application("zip"),
- MediaType.application("pdf")
-));
-
-// Add to parse context
-ParseContext context = new ParseContext();
-context.set(SpoolingStrategy.class, strategy);
-
-// Parse with the custom context
-parser.parse(inputStream, handler, metadata, context);
-----
-
-==== SpoolingStrategy Methods
-
-[source,java]
-----
-// Check if spooling should occur for a given type
-boolean shouldSpool(TikaInputStream tis, Metadata metadata, MediaType mediaType)
-
-// Configure which types should be spooled
-void setSpoolTypes(Set<MediaType> types)
-
-// Set the media type registry for specialization checking
-void setMediaTypeRegistry(MediaTypeRegistry registry)
-----
-
-==== How Type Matching Works
-
-The `shouldSpool()` method returns `true` if:
-
-1. The stream doesn't already have a backing file (`tis.hasFile()` is false),
AND
-2. The media type matches one of the configured spool types
-
-Type matching considers:
-
-* Exact matches (e.g., `application/zip`)
-* Base type matches (e.g., `application/zip` matches `application/zip;
charset=utf-8`)
-* Specializations (e.g., `application/vnd.oasis.opendocument.text` is a
specialization of `application/zip`)
-
-==== Default Spool Types
-
-The default spool types are:
-
-* `application/zip` - ZIP archives and ZIP-based formats (OOXML, ODF, EPUB,
etc.)
-* `application/x-tika-msoffice` - OLE2 Microsoft Office formats
-* `application/x-bplist` - Apple binary property lists
-* `application/pdf` - PDF documents
-
-=== JSON Configuration
-
-SpoolingStrategy can be configured via JSON in your `tika-config.json` file.
-Place the configuration in the `other-configs` section:
-
-[source,json]
-----
-{
- "other-configs": {
- "spooling-strategy": {
- "spoolTypes": [
- "application/zip",
- "application/x-tika-msoffice",
- "application/pdf"
- ]
- }
- }
-}
-----
-
-Load the configuration using `TikaLoader`:
-
-[source,java]
-----
-TikaLoader loader = TikaLoader.load(Path.of("tika-config.json"));
-SpoolingStrategy strategy = loader.configs().load(SpoolingStrategy.class);
-
-// Add to parse context
-ParseContext context = new ParseContext();
-context.set(SpoolingStrategy.class, strategy);
-----
-
-=== Best Practices
-
-1. **Let Tika handle it**: For most applications, the default behavior is
optimal.
- Don't configure spooling unless you have a specific need.
-
-2. **Use TikaInputStream with Path or byte[]**: When you have a file, pass the
`Path`
- directly to `TikaInputStream.get(Path)` rather than wrapping a
`FileInputStream`.
- Similarly, pass `byte[]` directly rather than wrapping a
`ByteArrayInputStream`.
- This allows TikaInputStream to use efficient backing strategies that avoid
unnecessary
- copying or spooling:
-+
-[source,java]
-----
-// Good: TikaInputStream knows it has a file, can use random access directly
-TikaInputStream tis = TikaInputStream.get(path);
-
-// Bad: TikaInputStream sees an opaque stream, may spool unnecessarily
-TikaInputStream tis = TikaInputStream.get(new FileInputStream(file));
-
-// Good: TikaInputStream knows it has bytes in memory
-TikaInputStream tis = TikaInputStream.get(bytes);
-
-// Bad: TikaInputStream sees an opaque stream
-TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(bytes));
-----
-
-3. **Close streams properly**: Use try-with-resources to ensure temporary files
- are cleaned up:
-+
-[source,java]
-----
-try (TikaInputStream tis = TikaInputStream.get(inputStream)) {
- parser.parse(tis, handler, metadata, context);
-}
-----
-
-4. **Consider memory vs. disk tradeoffs**: For very large files, spooling to
disk
- may be needed. For small files processed in bulk, keeping data in memory
may be
- faster. `TikaInputStream` backing strategies can be tuned for your workload.
diff --git a/docs/src/main/asciidoc/configuration/index.adoc b/docs/src/main/asciidoc/configuration/index.adoc
deleted file mode 100644
index 215e1f4c71..0000000000
--- a/docs/src/main/asciidoc/configuration/index.adoc
+++ /dev/null
@@ -1,40 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Configuration
-
-This section covers configuring Apache Tika.
-
-== Overview
-
-Tika 4.x uses JSON configuration files. Configuration controls parsers,
detectors,
-content handlers, and other components.
-
-NOTE: Tika 3.x and earlier used XML configuration (`tika-config.xml`). See the
-xref:../migration-to-4x/index.adoc[Migration Guide] for details on converting
to JSON.
-
-== Topics
-
-=== Parser Configuration
-
-* xref:parsers/pdf-parser.adoc[PDFParser] - PDF parsing options
-* xref:parsers/tesseract-ocr-parser.adoc[TesseractOCRParser] - OCR options for
image-based text extraction
-
-// Add links to specific topics as they are created
-// * xref:json-config.adoc[JSON Configuration Reference]
-// * xref:detectors.adoc[Configuring Detectors]
-// * xref:mime-types.adoc[MIME Type Configuration]
diff --git a/docs/src/main/asciidoc/configuration/parsers/pdf-parser.adoc b/docs/src/main/asciidoc/configuration/parsers/pdf-parser.adoc
deleted file mode 100644
index cee58a3b70..0000000000
--- a/docs/src/main/asciidoc/configuration/parsers/pdf-parser.adoc
+++ /dev/null
@@ -1,43 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= PDFParser Configuration
-
-This page documents the configuration options for `PDFParser` in Tika 4.x.
-
-== Basic Configuration
-
-[source,json]
-----
-include::{parser-examples}/pdf-parser-basic.json[]
-----
-icon:github[] https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json[View source on GitHub]
-
-== Full Configuration
-
-The following example shows all available configuration options with their
default values.
-Comments indicate the available options for enum fields.
-
-[source,json]
-----
-include::{parser-examples}/pdf-parser-full.json[]
-----
-icon:github[] https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json[View source on GitHub]
-
-== Changes from 3.x
-
-See xref:../../migration-to-4x/migrating-to-4x.adoc[Migrating to 4.x] for
general migration guidance.
diff --git a/docs/src/main/asciidoc/configuration/parsers/tesseract-ocr-parser.adoc b/docs/src/main/asciidoc/configuration/parsers/tesseract-ocr-parser.adoc
deleted file mode 100644
index 5b1b2b67e6..0000000000
--- a/docs/src/main/asciidoc/configuration/parsers/tesseract-ocr-parser.adoc
+++ /dev/null
@@ -1,67 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= TesseractOCRParser Configuration
-
-This page documents the configuration options for `TesseractOCRParser` in Tika
4.x.
-
-== Basic Configuration
-
-[source,json]
-----
-include::{parser-examples}/tesseract-basic.json[]
-----
-icon:github[] https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json[View source on GitHub]
-
-== Full Configuration
-
-The following example shows all available configuration options with their
default values.
-Comments indicate the available options for enum fields.
-
-[source,json]
-----
-include::{parser-examples}/tesseract-full.json[]
-----
-icon:github[] https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json[View source on GitHub]
-
-== Changes from 3.x
-
-In Tika 3.x, the `otherTesseractSettings` was a list of space-delimited
key-value strings:
-
-[source,xml]
-----
-<!-- 3.x XML format -->
-<param name="otherTesseractSettings" type="list">
- <string>textord_initialx_ile 0.75</string>
- <string>textord_noise_hfract 0.15625</string>
-</param>
-----
-
-In Tika 4.x, this is replaced with `otherTesseractConfig` as a proper map:
-
-[source,json]
-----
-// 4.x JSON format
-"otherTesseractConfig": {
- "textord_initialx_ile": "0.75",
- "textord_noise_hfract": "0.15625"
-}
-----
-
-The automatic converter handles this transformation.
-
-See xref:../../migration-to-4x/migrating-to-4x.adoc[Migrating to 4.x] for
general migration guidance.
diff --git a/docs/src/main/asciidoc/faq.adoc b/docs/src/main/asciidoc/faq.adoc
deleted file mode 100644
index 168c9a9547..0000000000
--- a/docs/src/main/asciidoc/faq.adoc
+++ /dev/null
@@ -1,28 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= FAQ and Troubleshooting
-
-This page covers frequently asked questions and common issues when using
Apache Tika.
-
-== Frequently Asked Questions
-
-// TODO: Add FAQs
-
-== Troubleshooting
-
-// TODO: Add common issues and solutions
diff --git a/docs/src/main/asciidoc/index.adoc b/docs/src/main/asciidoc/index.adoc
deleted file mode 100644
index 5edc9e54ee..0000000000
--- a/docs/src/main/asciidoc/index.adoc
+++ /dev/null
@@ -1,72 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Apache Tika Documentation
-
-WARNING: This reference guide was generated with the assistance of AI and
requires
-human review before it can be fully trusted. This documentation serves as an
example
-and a starting point, but more work remains. Contributions and corrections are
welcome.
-
-== Overview
-
-Apache Tika is a content detection and extraction framework written in Java.
-
-== Using Tika
-
-* xref:using-tika/index.adoc[Getting Started] - Choose your integration method
-* xref:pipes/index.adoc[Pipes] - Scalable, fault-tolerant document processing
-
-== Configuration
-
-* xref:configuration/index.adoc[Configuration] - JSON configuration options
-
-== Migration
-
-* xref:migration-to-4x/index.adoc[Migrating to 4.x] - Guides and background
for upgrading to Tika 4.x
-
-== Advanced
-
-* xref:advanced/index.adoc[Advanced Topics] - Custom parsers, performance
tuning, internals
-
-== FAQ
-
-* xref:faq.adoc[FAQ and Troubleshooting] - Common questions and issues
-
-== Security
-
-* xref:security.adoc[Security] - Security considerations and reporting
vulnerabilities
-
-== Roadmap
-
-* xref:roadmap.adoc[Roadmap] - Planned features and improvements for upcoming
releases
-
-== For Maintainers
-
-* xref:maintainers/index.adoc[Maintainer Documentation] - Release guides and
project maintenance
-
-== Links
-
-* https://tika.apache.org/[Apache Tika Website] - Official project website
-* https://tika.apache.org/{tika-stable-version}/formats.html[Supported
Formats] - File formats Tika can parse
-* https://tika.apache.org/{tika-stable-version}/api/[API Documentation] -
Javadoc
-* https://issues.apache.org/jira/projects/TIKA[JIRA] - Issue tracker
-* https://repository.apache.org/content/repositories/snapshots/org/apache/tika/[Maven Snapshots] - SNAPSHOT builds in Apache's Maven repository
-* https://ci-builds.apache.org/job/Tika/[CI Builds] - Continuous integration
builds
-* https://cwiki.apache.org/confluence/display/TIKA/[Confluence Wiki] - Legacy
wiki documentation
-+
-NOTE: As of Tika 4.x, we are migrating content from Confluence to these
AsciiDoc pages.
-The Confluence wiki will eventually be retired.
diff --git a/docs/src/main/asciidoc/maintainers/index.adoc b/docs/src/main/asciidoc/maintainers/index.adoc
deleted file mode 100644
index bab767b707..0000000000
--- a/docs/src/main/asciidoc/maintainers/index.adoc
+++ /dev/null
@@ -1,29 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= For Maintainers
-
-This section contains documentation for Apache Tika project maintainers and
committers.
-
-== Topics
-
-* xref:release-guides/index.adoc[Release Guides] - How to release Apache Tika
-
-// Add links to specific topics as they are created
-// * link:voting.html[Voting Procedures]
-// * link:ci.html[Continuous Integration]
-// * link:website.html[Website Maintenance]
diff --git a/docs/src/main/asciidoc/maintainers/release-guides/docker.adoc b/docs/src/main/asciidoc/maintainers/release-guides/docker.adoc
deleted file mode 100644
index a8f2f8cbc7..0000000000
--- a/docs/src/main/asciidoc/maintainers/release-guides/docker.adoc
+++ /dev/null
@@ -1,133 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Releasing Tika Docker Images
-
-This guide covers the process for releasing Apache Tika Docker images.
-
-== Prerequisites
-
-=== DockerHub Access
-
-You need permissions on the `apache/tika` repository on DockerHub. To obtain
access,
-create an INFRA JIRA ticket with the "Docker" label.
-
-=== Repository Access
-
-Clone the tika-docker repository:
-
-[source,bash]
-----
-git clone https://github.com/apache/tika-docker
-cd tika-docker
-----
-
-== Image Types
-
-The tika-docker repository produces two types of images:
-
-Minimal::
-Apache Tika with base dependencies (Java only)
-
-Full::
-Apache Tika plus Tesseract OCR and GDAL
-
-== Helper Tools
-
-docker-tool.sh::
-Automates building, testing, and publishing Docker images
-
-republish-images.sh::
-Legacy script for batch republishing images
-
-NOTE: The repository also contains Docker Compose files for advanced scenarios
-(Vision, Grobid, OCR, NER), but these are not used for official releases.
-
-== Release Process
-
-=== Step 1: Update README
-
-Update the "Available Tags" section in `README.md` to include the new version.
-
-=== Step 2: Update Version
-
-Increment the TAG version in the `.env` file.
-
-=== Step 3: Update Changelog
-
-Update `CHANGES.md` with release information and date.
-
-=== Step 4: Test Locally
-
-Test the release locally before publishing:
-
-[source,bash]
-----
-./docker-tool.sh build <docker-version> <tika-version>
-./docker-tool.sh test <docker-version>
-----
-
-=== Step 5: Commit Changes
-
-Commit all changes:
-
-[source,bash]
-----
-git add README.md .env CHANGES.md
-git commit -m "Prepare for Docker release <docker-version>"
-git push
-----
-
-=== Step 6: Build and Publish
-
-Build and publish the images using the docker-tool script.
-
-Example for version 3.1.0.0 based on Tika 3.1.0:
-
-[source,bash]
-----
-# Build the images
-./docker-tool.sh build 3.1.0.0 3.1.0
-
-# Test the images
-./docker-tool.sh test 3.1.0.0
-
-# Publish to DockerHub
-./docker-tool.sh publish 3.1.0.0 3.1.0
-----
-
-NOTE: Multi-architecture building takes time. The publish step automatically
-updates the `-latest` tag on DockerHub.
-
-=== Step 7: Tag the Release
-
-Create and push a git tag for the release:
-
-[source,bash]
-----
-git tag -a 3.1.0.0 -m "New release for 3.1.0.0"
-git push --tags
-----
-
-== Post-Release
-
-After publishing the Docker images:
-
-* Verify the images are available on DockerHub at
https://hub.docker.com/r/apache/tika
-* Test pulling and running the new images
-* Update the main Tika website if needed
-* Proceed to release the link:helm.html[Helm charts] if applicable
diff --git a/docs/src/main/asciidoc/maintainers/release-guides/grpc.adoc
b/docs/src/main/asciidoc/maintainers/release-guides/grpc.adoc
deleted file mode 100644
index 0576d23bb8..0000000000
--- a/docs/src/main/asciidoc/maintainers/release-guides/grpc.adoc
+++ /dev/null
@@ -1,32 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Releasing Tika gRPC
-
-This guide covers the process for releasing Apache Tika gRPC components.
-
-== Prerequisites
-
-// TODO: Add prerequisites
-
-== Release Process
-
-// TODO: Add release steps
-
-== Post-Release
-
-// TODO: Add post-release steps
diff --git a/docs/src/main/asciidoc/maintainers/release-guides/helm.adoc
b/docs/src/main/asciidoc/maintainers/release-guides/helm.adoc
deleted file mode 100644
index aa80120c6f..0000000000
--- a/docs/src/main/asciidoc/maintainers/release-guides/helm.adoc
+++ /dev/null
@@ -1,138 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Releasing Tika Helm Charts
-
-This guide covers the process for releasing Apache Tika Helm charts.
-
-== Prerequisites
-
-=== Apache JFrog Artifactory Access
-
-You need permissions to release the Apache Tika Helm chart to the Apache Infra
-Artifactory instance. Access is controlled by ASF Infra and can be requested
-via a JIRA ticket with the "Artifactory" label.
-
-=== Repository Access
-
-Clone the tika-helm repository:
-
-[source,bash]
-----
-git clone https://github.com/apache/tika-helm
-cd tika-helm
-----
-
-Apache Tika committers should have existing access to this repository.
-
-=== Install Helm and Plugins
-
-Install Helm and the Artifactory plugin:
-
-[source,bash]
-----
-# Install Helm (macOS)
-brew install helm
-
-# Install the Artifactory push plugin
-helm plugin install https://github.com/belitre/helm-push-artifactory-plugin
--version 1.0.2
-----
-
-== Docker Image Types
-
-The Helm chart deploys one of two upstream Docker image types:
-
-Minimal::
-Contains Apache Tika and base dependencies (Java only)
-
-Full::
-Includes Tika, dependencies, Tesseract OCR, GDAL, etc.
-
-The Helm Chart uses the *Full* image by default, though either can be specified
-during Kubernetes deployment.
-
-== Versioning
-
-tika-helm Charts follow the https://semver.org/spec/v2.0.0.html[Semantic
Versioning 2.0.0]
-specification, regardless of upstream container image versioning.
-
-== Release Process
-
-=== Step 1: Update Chart Configuration
-
-For each new upstream tika-docker FULL release, update the following files:
-
-Chart.yaml::
-* Line 22: Update `version` (chart version)
-* Line 23: Update `appVersion` (must match upstream tika-docker FULL release
tag)
-
-values.yaml::
-* Line 26: Update the default image tag
-
-=== Step 2: Commit and Tag
-
-Commit the changes and create a release tag:
-
-[source,bash]
-----
-export RELEASE_VERSION=v3.2.2
-
-git add -A
-git commit -m "Release tika-helm $RELEASE_VERSION"
-git push origin main
-
-git tag -a $RELEASE_VERSION -m "Release tika-helm $RELEASE_VERSION"
-git push --tags
-----
-
-=== Step 3: Create GitHub Release
-
-. Navigate to the pushed tag on GitHub
-. Click the three-dot menu
-. Select "Create release"
-. Add release notes and publish
-
-=== Step 4: Publish to Apache JFrog Artifactory
-
-Add the Tika Helm repository and push the chart:
-
-[source,bash]
-----
-# Add the Tika Helm repository
-helm repo add tika https://apache.jfrog.io/artifactory/tika
-
-# Set your credentials
-export HELM_REPO_USERNAME="your-apache-id"
-export HELM_REPO_PASSWORD="your-password"
-
-# Push the chart to Artifactory
-helm push-artifactory . https://apache.jfrog.io/artifactory/tika
-----
-
-== Post-Release
-
-After publishing the Helm chart:
-
-* Verify the chart is available at https://apache.jfrog.io/artifactory/tika
-* Test installing the chart in a Kubernetes cluster
-* Update any documentation referencing the chart version
-
-== Questions
-
-For questions about the Helm release process, contact:
-
-* [email protected] mailing list
diff --git a/docs/src/main/asciidoc/maintainers/release-guides/index.adoc
b/docs/src/main/asciidoc/maintainers/release-guides/index.adoc
deleted file mode 100644
index 1f618e9892..0000000000
--- a/docs/src/main/asciidoc/maintainers/release-guides/index.adoc
+++ /dev/null
@@ -1,32 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Release Guides
-
-This section contains guides for releasing Apache Tika components.
-
-== Overview
-
-Apache Tika follows the standard Apache release process. This section provides
-step-by-step guides for releasing the various Tika components.
-
-== Topics
-
-* xref:tika.adoc[Releasing Apache Tika] - Main Tika project release process
-* xref:docker.adoc[Releasing Tika Docker Images] - Docker image release process
-* xref:helm.adoc[Releasing Tika Helm Charts] - Helm chart release process
-* xref:grpc.adoc[Releasing Tika gRPC] - gRPC component release process
diff --git a/docs/src/main/asciidoc/maintainers/release-guides/tika.adoc
b/docs/src/main/asciidoc/maintainers/release-guides/tika.adoc
deleted file mode 100644
index a967c80421..0000000000
--- a/docs/src/main/asciidoc/maintainers/release-guides/tika.adoc
+++ /dev/null
@@ -1,271 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Releasing Apache Tika
-
-This guide covers the process for releasing the main Apache Tika project.
-
-== Prerequisites
-
-Before starting the release process, ensure you have:
-
-* Commit access to the Apache Tika repository
-* A valid GPG key published to a public keyserver
-* Maven credentials configured in `~/.m2/settings.xml`
-* Access to Apache's Nexus repository manager
-
-== Pre-Release Checks
-
-Before starting the release, run vulnerability and dependency audits:
-
-[source,bash]
-----
-# Identify vulnerable dependencies
-mvn ossindex:audit -Dossindex.fail=true
-
-# Check for outdated plugins
-mvn versions:display-plugin-updates
-
-# Check for outdated dependencies
-mvn versions:display-dependency-updates
-
-# Run full regression tests
-mvn -Prelease-profile clean verify
-----
-
-== Release Process
-
-=== Step 1: Clone the Repository
-
-Clone the repository if you haven't already:
-
-[source,bash]
-----
-git clone https://github.com/apache/tika.git
-cd tika
-----
-
-=== Step 2: Update Documentation
-
-Update `CHANGES.txt` with the release date:
-
-[source]
-----
-Release X.Y.Z - MM/dd/yyyy
-----
-
-Add any changelog entries as needed.
-
-=== Step 3: JIRA Management
-
-. Create versions X.Y.Z, X.(Y+1), and X.(Y+2) in JIRA if they don't exist
-. Reassign any unresolved X.Y.Z issues to X.(Y+1) via bulk change
-
-=== Step 4: Verify License Headers
-
-Run the Apache RAT plugin to verify all files have proper license headers:
-
-[source,bash]
-----
-mvn apache-rat:check
-----
-
-=== Step 5: Commit Changes
-
-Commit the CHANGES.txt updates:
-
-[source,bash]
-----
-git add CHANGES.txt
-git commit -m "Prepare for X.Y.Z release"
-git push
-----
-
-=== Step 6: Set Maven Memory
-
-Configure Maven memory settings:
-
-[source,bash]
-----
-export MAVEN_OPTS="-Xms128m -Xmx256m"
-----
-
-=== Step 7: Prepare the Release
-
-Execute the Maven release prepare goal:
-
-[source,bash]
-----
-mvn release:prepare
-----
-
-This will prompt you to confirm:
-
-* The release version (X.Y.Z)
-* The SCM tag name
-* The next development version
-
-=== Step 8: Perform the Release
-
-Execute the Maven release perform goal:
-
-[source,bash]
-----
-mvn release:perform
-----
-
-Ensure you have valid Maven credentials in `~/.m2/settings.xml`:
-
-[source,xml]
-----
-<servers>
- <server>
- <id>apache.releases.https</id>
- <username>your-apache-id</username>
- <password>your-password</password>
- </server>
-</servers>
-----
-
-=== Step 9: Verify Staging Repository
-
-. Access Apache's Nexus at https://repository.apache.org
-. Log in with your Apache credentials
-. Navigate to "Staging Repositories"
-. Find the org.apache.tika staging repository
-. Verify it contains all expected artifacts
-. Click "Close" with an appropriate message
-
-=== Step 10: Upload Distribution Artifacts
-
-Upload artifacts to `dist.apache.org`:
-
-[source,bash]
-----
-svn co https://dist.apache.org/repos/dist/dev/tika tika-dist-dev
-cd tika-dist-dev
-----
-
-Upload the following files with their signatures (.asc) and checksums
(.sha512):
-
-* `tika-X.Y.Z-src.zip`
-* `tika-app-X.Y.Z.jar`
-* `tika-server-standard-X.Y.Z.jar`
-
-Also:
-
-* Rename `CHANGES.txt` to `CHANGES-X.Y.Z.txt`
-* Ensure the `KEYS` file contains the public key of every release signer
-
-=== Step 11: Call the Vote
-
-Send a vote request to the [email protected] mailing list:
-
-[source]
-----
-Subject: [VOTE] Release Apache Tika X.Y.Z
-
-Hi all,
-
-I have created a candidate build for Apache Tika X.Y.Z.
-
-The release candidate artifacts can be found at:
-https://dist.apache.org/repos/dist/dev/tika/
-
-The staging repository is:
-https://repository.apache.org/content/repositories/orgapachetika-XXXX
-
-The Git tag is:
-https://github.com/apache/tika/tree/X.Y.Z
-
-Please vote:
-[ ] +1 Release this package
-[ ] +0 No opinion
-[ ] -1 Do not release (please provide reason)
-
-This vote will remain open for at least 72 hours.
-----
-
-=== Step 12: Release the Artifacts
-
-After the vote passes (three or more binding +1s, and more +1s than -1s):
-
-. Release the Nexus staging repository (click "Release" button)
-. Move artifacts from dev to release distribution:
-
-[source,bash]
-----
-svn mv https://dist.apache.org/repos/dist/dev/tika/X.Y.Z \
- https://dist.apache.org/repos/dist/release/tika/X.Y.Z \
- -m "Release Apache Tika X.Y.Z"
-----
-
-== Post-Release
-
-=== Update Unreleased Modules
-
-Update any modules that weren't part of the release to the next SNAPSHOT
version.
-
-=== Update Website
-
-Refresh the website documentation to reflect the new release:
-
-* Update download links
-* Update version numbers in documentation
-* Add release notes
-
-=== Release Docker Images and Helm Charts
-
-Follow the separate guides for releasing:
-
-* link:docker.html[Docker images]
-* link:helm.html[Helm charts]
-
-=== Send Announcements
-
-Send release announcements to:
-
-* [email protected]
-* [email protected]
-* [email protected]
-
-[source]
-----
-Subject: [ANNOUNCE] Apache Tika X.Y.Z Released
-
-The Apache Tika team is pleased to announce the release of Apache Tika X.Y.Z.
-
-Apache Tika is a toolkit for detecting and extracting metadata and text
-from various types of files.
-
-This release includes:
-[List major changes/features]
-
-For a complete list of changes, see:
-https://tika.apache.org/X.Y.Z/changes.html
-
-Download:
-https://tika.apache.org/download.html
-
-Thanks to everyone who contributed to this release!
-
-The Apache Tika Team
-----
-
-=== Register the Release
-
-Register the release at https://reporter.apache.org
diff --git a/docs/src/main/asciidoc/migration-to-4x/design-notes-4x.adoc
b/docs/src/main/asciidoc/migration-to-4x/design-notes-4x.adoc
deleted file mode 100644
index 006c4775f9..0000000000
--- a/docs/src/main/asciidoc/migration-to-4x/design-notes-4x.adoc
+++ /dev/null
@@ -1,127 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Design Notes for Tika 4.x
-
-This document captures the design decisions and architectural changes in
Apache Tika 4.x.
-
-== Metadata Keys
-
-The design addresses security concerns by implementing namespaced metadata
keys. This prevents
-user-controlled data from potentially overwriting existing metadata values in
the Metadata object.
-
-See link:migrating-to-4x.html[Migrating to Tika 4.x] for details on specific
-metadata key changes.
-
-== Fat Jars and Maven Shade Strategy
-
-Tika 4.x moves away from fat jar/shaded artifacts. The `tika-app` and
`tika-server` now use
-separate `lib` and `plugins` directories alongside the jar file, enabling
standard `java -jar`
-execution.
-
-== Plugins and PF4J Framework
-
-=== Plugin Packaging
-
-PF4J plugins are packaged exclusively as zips (not jars) to align with the
move away from fat
-jars. Custom code addresses race conditions during the unzipping process
across threads and
-processes.
-
-=== Classloader Management
-
-The team disabled PF4J's default classpath loading to avoid complexity in unit
tests. A
-configured plugins directory is now required.
-
-This strict boundary prevents issues when components are loaded separately.
For example, JSON
-strings replace `JsonNode` objects to avoid problems with independent Jackson
loading in plugins.
-
-IMPORTANT: We tried to have as few Tika dependencies in the plugins as
possible.
-
-== Serialization Architecture
-
-=== Design Principles
-
-* Maximize Jackson usage while minimizing custom serialization code
-* Exclude Jackson from `tika-core` and `tika-parsers-standard-modules`
dependencies
-* Enable runtime configuration updates via Jackson's `readerForUpdating`
-
-=== Security Model
-
-Configuration files at initialization are treated as trusted sources. Runtime
-serialization/deserialization uses an allowlist of permitted packages via
-`PolymorphicObjectMapperFactory`.
-
-Custom components can add patterns to
`META-INF/tika-serialization-allowlist.txt`.
-
-=== Implementation Challenges
-
-* Converted code to true Java beans with matching getters/setters
-* Used `ObjectMapper.DefaultTyping.OBJECT_AND_NON_CONCRETE` for polymorphic
typing
-* Replaced generic collections (`List`, `Set`) with concrete types
(`ArrayList`, `HashSet`)
-* Converted `Path` fields to `String` due to Jackson constraints
-* Avoided Java records to enable `readerForUpdating` functionality
-
-== Annotations System
-
-The `@TikaComponent` annotation handles:
-
-* Automatic service file generation at build time
-* Creation of `META-INF/tika/*.idx` mapping files
-* Kebab-case conversion of class names to friendly identifiers (e.g.,
`PDFParser` → `pdf-parser`)
-* Manual name overrides via `name` attribute
-* Optional `spi=false` setting for non-service-file registration
-
-== Migration Strategy
-
-The plan is to stabilize 4.x structures before backporting capabilities to 3.x
and deprecating
-`TikaConfig` and `tika-config.xml`.
-
-A converter tool for transforming `tika-config.xml` to `tika-config.json` is
planned, with
-support focused on components in `tika-parsers-standard-modules`.
-
-== Development Tips
-
-=== Common Issues
-
-* Plugin directories and `@TikaComponent` annotations becoming out of sync
across modules
-* IntelliJ conflicts with command-line builds
-* Checkstyle running before Spotless, causing preventable failures
-
-=== Recommended Build Commands
-
-For faster builds during development:
-
-[source,bash]
-----
-mvn clean install -am -pl :tika-app -Pfast
-----
-
-To apply formatting and build:
-
-[source,bash]
-----
-mvn clean spotless:apply install
-----
-
-== Outstanding Tasks
-
-* Implement flexible component loading without `@TikaComponent` requirements
-* Enable friendly name usage throughout the codebase
-* Resolve gRPC issues
-* Fix mutool renderer byte-passing in open containers
-* Simplify and strengthen serialization code
-* Consider relocating `TikaConfig` and `ForkParser` to legacy module
diff --git a/docs/src/main/asciidoc/migration-to-4x/index.adoc
b/docs/src/main/asciidoc/migration-to-4x/index.adoc
deleted file mode 100644
index c8d5be9f5d..0000000000
--- a/docs/src/main/asciidoc/migration-to-4x/index.adoc
+++ /dev/null
@@ -1,32 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Migrating to Tika 4.x
-
-This section provides guides and background documentation for migrating to
Apache Tika 4.x.
-
-See the xref:../roadmap.adoc[Roadmap] for version timelines and support
schedules.
-
-== Migration Guides
-
-* xref:migrating-to-4x.adoc[Migration Guide] - Step-by-step guide for
upgrading from Tika 3.x to 4.x
-* xref:metadata-changes-4x.adoc[Metadata Changes] - Detailed metadata key
changes and migration examples
-
-== Background Documentation
-
-* xref:design-notes-4x.adoc[Design Notes] - Architectural decisions and design
rationale
-* xref:serialization-4x.adoc[Serialization] - JSON serialization design and
implementation details
diff --git a/docs/src/main/asciidoc/migration-to-4x/metadata-changes-4x.adoc
b/docs/src/main/asciidoc/migration-to-4x/metadata-changes-4x.adoc
deleted file mode 100644
index e129d33008..0000000000
--- a/docs/src/main/asciidoc/migration-to-4x/metadata-changes-4x.adoc
+++ /dev/null
@@ -1,121 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Metadata Changes in Tika 4.x
-
-This document details the metadata key changes in Apache Tika 4.x.
-
-== Overview
-
-Tika 4.x prefixes all "user generated" metadata keys to prevent overwrites and
improve
-namespace clarity. This is a security-focused change that prevents
user-controlled data
-from potentially overwriting existing metadata values in the Metadata object.
-
-== Metadata Key Changes
-
-[cols="2,2,3"]
-|===
-|Category |Change |Details
-
-|HTML custom metadata
-|Prefixed with `html:`
-|Custom metadata from HTML documents now uses the `html:` prefix
-
-|MAPI metadata
-|Prefix changed to `mapi:`
-|Microsoft MAPI properties now use the `mapi:` prefix
-
-|Resource name
-|Renamed
-|`resourceName` changed to `X-TIKA:resourceName`
-
-|Unrecognized image metadata
-|Prefixed with `img:`
-|Unrecognized image metadata keys now use the `img:` prefix
-
-|Office metadata
-|Prefix changed
-|Changed from `meta` prefix to `office` prefix
-|===
-
-== Migration Steps
-
-When upgrading to Tika 4.x, you will need to update any code that references
metadata keys
-directly:
-
-=== HTML Metadata
-
-[source,java]
-----
-// Before (3.x)
-String value = metadata.get("custom-key");
-
-// After (4.x)
-String value = metadata.get("html:custom-key");
-----
-
-=== MAPI Metadata
-
-[source,java]
-----
-// Before (3.x)
-String value = metadata.get("mapi:some-property");
-
-// After (4.x) - prefix remains mapi: but verify specific keys
-String value = metadata.get("mapi:some-property");
-----
-
-=== Resource Name
-
-[source,java]
-----
-// Before (3.x)
-String name = metadata.get("resourceName");
-
-// After (4.x)
-String name = metadata.get("X-TIKA:resourceName");
-----
-
-=== Image Metadata
-
-[source,java]
-----
-// Before (3.x)
-String value = metadata.get("unknown-image-key");
-
-// After (4.x)
-String value = metadata.get("img:unknown-image-key");
-----
-
-=== Office Metadata
-
-[source,java]
-----
-// Before (3.x)
-String value = metadata.get("meta:some-property");
-
-// After (4.x)
-String value = metadata.get("office:some-property");
-----
-
-== Rationale
-
-The namespacing of metadata keys provides several benefits:
-
-* *Security*: Prevents user-controlled content from overwriting internal
metadata
-* *Clarity*: Makes it clear which parser or source generated a metadata key
-* *Consistency*: Provides a uniform approach to metadata naming across all
parsers
diff --git a/docs/src/main/asciidoc/migration-to-4x/migrating-to-4x.adoc
b/docs/src/main/asciidoc/migration-to-4x/migrating-to-4x.adoc
deleted file mode 100644
index ba26d25acc..0000000000
--- a/docs/src/main/asciidoc/migration-to-4x/migrating-to-4x.adoc
+++ /dev/null
@@ -1,157 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Migrating to Tika 4.x
-
-This guide covers the changes required when upgrading from Apache Tika 3.x to
4.x.
-
-See the xref:../roadmap.adoc[Roadmap] for version timelines and support
schedules.
-
-== Requirements
-
-* Java 17 or later (upgraded from Java 11 in 3.x)
-
-== Configuration: XML to JSON
-
-Tika 4.x uses JSON configuration files instead of XML. The legacy
`tika-config.xml` format
-is no longer supported.
-
-=== Automatic Conversion
-
-Tika provides a conversion tool in `tika-app` to help migrate your XML
configuration:
-
-[source,bash]
-----
-java -jar tika-app.jar
--convert-config-xml-to-json=tika-config.xml,tika-config.json
-----
-
-The converter currently supports:
-
-* **Parsers section** - parser declarations with parameters and exclusions
-* **Parameter types** - bool, int, long, double, float, string, list, and map
-* **Special handling** - TesseractOCR's `otherTesseractSettings` list is
automatically
- converted to the `otherTesseractConfig` map format
-
-=== Example Conversion
-
-**XML Format (3.x):**
-[source,xml]
-----
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="sortByPosition" type="bool">true</param>
- <param name="maxMainMemoryBytes" type="long">1000000</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- </parser>
- </parsers>
-</properties>
-----
-
-**JSON Format (4.x):**
-[source,json]
-----
-{
- "parsers": [
- {
- "pdf-parser": {
- "sortByPosition": true,
- "maxMainMemoryBytes": 1000000
- }
- },
- {
- "default-parser": {
- "_exclude": ["pdf-parser"]
- }
- }
- ]
-}
-----
-
-=== Key Differences
-
-[cols="1,1,2"]
-|===
-|Aspect |XML (3.x) |JSON (4.x)
-
-|Class references
-|Full class name (`org.apache.tika.parser.pdf.PDFParser`)
-|Kebab-case component name (`pdf-parser`)
-
-|Parameters
-|`<param name="..." type="...">value</param>`
-|Direct key-value pairs
-
-|Exclusions
-|`<parser-exclude class="..."/>`
-|`"_exclude": ["component-name"]`
-|===
-
-NOTE: When you configure a parser with specific settings in JSON, the loader
automatically
-excludes it from SPI loading. Explicit exclusions are only needed when you
want to disable
-a parser entirely without providing custom configuration.
-
-=== Limitations
-
-The automatic converter has some limitations:
-
-* Only the `parsers` section is currently converted
-* Detectors and other sections require manual migration
-* Custom or third-party parsers not in the registry will use kebab-case name
conversion
-
-=== Parser Configuration Changes
-
-WARNING: The configuration options for `PDFParser` and `TesseractOCRParser`
have changed
-significantly in 4.x. The automatic converter will migrate your parameter
names, but you
-should review the updated documentation to ensure your configuration is
optimal.
-
-See:
-
-* xref:../configuration/parsers/pdf-parser.adoc[PDFParser Configuration] -
Updated options for PDF parsing
-* xref:../configuration/parsers/tesseract-ocr-parser.adoc[TesseractOCRParser
Configuration] - Updated OCR options
-
-=== Full Configuration Example
-
-Below is a complete example of a Tika 4.x JSON configuration file with
commonly configured parsers:
-
-[source,json]
-----
-include::{parser-examples}/migration-full-example.json[]
-----
-
-NOTE: This example shows common options. See the individual parser
configuration pages for
-complete documentation of all available options.
-
-== Metadata Key Changes
-
-Tika 4.x prefixes all "user generated" metadata keys to prevent overwrites and
improve
-namespace clarity.
-
-See xref:metadata-changes-4x.adoc[Metadata Changes in 4.x] for complete
details, including
-a full table of changes and code migration examples.
-
-== API Changes
-
-// TODO: Document API changes
-
-== Deprecations and Removals
-
-// TODO: Document deprecated and removed features
diff --git a/docs/src/main/asciidoc/migration-to-4x/serialization-4x.adoc
b/docs/src/main/asciidoc/migration-to-4x/serialization-4x.adoc
deleted file mode 100644
index e11bdc4959..0000000000
--- a/docs/src/main/asciidoc/migration-to-4x/serialization-4x.adoc
+++ /dev/null
@@ -1,101 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Serialization in Tika 4.x
-
-This document describes the JSON serialization design and implementation
details for Apache Tika 4.x.
-
-== High-Level Goals
-
-=== Jackson Framework Integration
-
-Use Jackson as much as possible with as few custom serializers and as few
annotations as possible.
-Jackson dependencies are kept out of core modules to maintain flexibility.
-
-=== Friendly Naming Conventions
-
-The implementation uses friendly names like `pdf-parser` rather than full class
-names. These friendly names are applied to configured items rather than to
-configuration class names.
-
-=== Custom Class Support
-
-The design permits users to add custom classes through Jackson's polymorphic
handling:
-
-* `org.apache.tika` patterns are allowed by default
-* Users can define additional inclusion patterns for security
-
-=== Configuration Consistency
-
-The approach seeks to make initialization and runtime configuration look
exactly the same and use
-the same underlying code where possible. However, security constraints may
require differences in
-which fields are modifiable at runtime.
-
-=== Configuration Objects Over Annotations
-
-Config objects are preferred over field annotations to support multithreading.
-Parsers retrieve settings from `ParseContext` at runtime.
-
-=== Cross-System Configuration Flow
-
-Configuration must pass seamlessly from:
-
-. User clients
-. Through tika-server REST APIs
-. Into tika-pipes infrastructure
-
-== Initialization Structure
-
-=== Tier 1 Objects
-
-ID Objects::
-Fetchers, emitters - components with unique identifiers
-
-Composite Objects::
-Parsers, detectors - components that aggregate other components
-
-Single Objects::
-Pipes, gRPC, server configurations
-
-=== Tier 2 Objects
-
-Components that can be read via friendly names using `@TikaComponent`
annotations in an
-`other-config` section.
-
-== Runtime Patterns
-
-=== Backwards Compatibility
-
-The design maintains backwards compatibility by allowing `ParseContext`
additions where the
-interface serves as the key.
-
-=== Partial Configuration Updates
-
-Users can supply partial JSON objects that contain only the updates to the
-initialization configuration, rather than complete configuration documents.
-
-=== Self-Configuring Components in Pipes
-
-In the pipes infrastructure, objects should configure themselves to avoid
classloading
-dependencies on components like `PDFParser`.
-
-== Security Considerations
-
-* Configuration files at initialization are treated as trusted sources
-* Runtime serialization/deserialization uses an allowlist of permitted packages
-* Custom components can register patterns in
`META-INF/tika-serialization-allowlist.txt`
-
-See link:design-notes-4x.html[Design Notes for 4.x] for additional
architectural context.
diff --git a/docs/src/main/asciidoc/pipes/index.adoc
b/docs/src/main/asciidoc/pipes/index.adoc
deleted file mode 100644
index e7b49ebc3c..0000000000
--- a/docs/src/main/asciidoc/pipes/index.adoc
+++ /dev/null
@@ -1,37 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Tika Pipes
-
-This section covers Tika Pipes for scalable, fault-tolerant document
processing.
-
-== Overview
-
-Tika Pipes provides a framework for processing large volumes of documents with:
-
-* **Fetchers** - Retrieve documents from various sources (filesystem, S3,
HTTP, etc.)
-* **Emitters** - Send parsed results to various destinations (filesystem,
OpenSearch, Solr, etc.)
-* **Pipelines** - Configure processing workflows
-
-== Topics
-
-// Add links to specific topics as they are created
-// * link:getting-started.html[Getting Started]
-// * link:fetchers.html[Fetchers]
-// * link:emitters.html[Emitters]
-// * link:configuration.html[Configuration]
-// * link:async.html[Async Processing]
diff --git a/docs/src/main/asciidoc/roadmap.adoc
b/docs/src/main/asciidoc/roadmap.adoc
deleted file mode 100644
index 3e28829a43..0000000000
--- a/docs/src/main/asciidoc/roadmap.adoc
+++ /dev/null
@@ -1,96 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Apache Tika Roadmap
-
-This page outlines the planned features and improvements for Apache Tika
releases.
-
-NOTE: All dates are in Open Source Standard Time, which does not always neatly
-align with traditional calendars.
-
-== Release Timeline
-
-[cols="1,3"]
-|===
-|Date |Milestone
-
-|October 2024
-|Release 3.0.0
-
-|October 2024
-|Move main branch to 4.x (Java 17) after 3.0.0 release
-
-|April 2025
-|End support for 2.x (and Java 8)
-
-|January 2026
-|Release 4.0.0
-
-|June 2026
-|End support for 3.x (and Java 11)
-|===
-
-== Version Support Matrix
-
-[cols="1,1,1,2,2"]
-|===
-|Version |Java |Jakarta/javax |Availability |Planned EOL
-
-|2.x
-|8
-|javax
-|Now
-|April 2025
-
-|3.x
-|11
-|jakarta
-|October 2024
-|June 2026 or 6 months after 4.0.0 release
-
-|4.x
-|17
-|jakarta
-|January 2026
-|TBD
-
-|5.x
-|21
-|jakarta
-|TBD
-|TBD
-
-|6.x
-|25
-|jakarta
-|TBD
-|TBD
-|===
-
-== Metadata Changes in 4.x
-
-Tika 4.x implements namespaced metadata keys to prevent overwrites and improve
namespace clarity.
-
-See xref:migration-to-4x/metadata-changes-4x.adoc[Metadata Changes in 4.x] for
complete details and
-migration examples.
-
-== Long-term Goals
-
-// Add long-term goals as they are defined
-// * Improved streaming support
-// * Enhanced language detection
-// * Better support for modern document formats
diff --git a/docs/src/main/asciidoc/security.adoc
b/docs/src/main/asciidoc/security.adoc
deleted file mode 100644
index ddc09b7215..0000000000
--- a/docs/src/main/asciidoc/security.adoc
+++ /dev/null
@@ -1,34 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Security
-
-This page covers security considerations when using Apache Tika.
-
-== Security Model
-
-Apache Tika's security model describes the trust boundaries and assumptions
that govern
-how Tika processes content. Understanding this model is essential for
deploying Tika securely.
-
-* https://tika.apache.org/security-model.html[Apache Tika Security Model]
-
-== Known Vulnerabilities
-
-For information about known security vulnerabilities (CVEs) in Apache Tika and
their
-remediation, please see:
-
-* https://tika.apache.org/security.html[Apache Tika Security Vulnerabilities]
diff --git a/docs/src/main/asciidoc/using-tika/cli/index.adoc
b/docs/src/main/asciidoc/using-tika/cli/index.adoc
deleted file mode 100644
index 56105528d7..0000000000
--- a/docs/src/main/asciidoc/using-tika/cli/index.adoc
+++ /dev/null
@@ -1,39 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Tika Command Line Interface
-
-This section covers using Apache Tika from the command line via `tika-app`.
-
-== Overview
-
-The Tika application (`tika-app.jar`) provides a command-line interface for
-parsing documents, detecting content types, and extracting metadata.
-
-== Basic Usage
-
-[source,bash]
-----
-java -jar tika-app.jar [options] <file>
-----
-
-== Topics
-
-// Add links to specific topics as they are created
-// * link:installation.html[Installation]
-// * link:options.html[Command Line Options]
-// * link:batch.html[Batch Processing]
diff --git a/docs/src/main/asciidoc/using-tika/grpc/index.adoc
b/docs/src/main/asciidoc/using-tika/grpc/index.adoc
deleted file mode 100644
index 2f1eb24adb..0000000000
--- a/docs/src/main/asciidoc/using-tika/grpc/index.adoc
+++ /dev/null
@@ -1,32 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Tika gRPC
-
-This section covers using Apache Tika via gRPC.
-
-== Overview
-
-Tika gRPC provides a high-performance gRPC interface for parsing documents.
-This is useful for microservices architectures and polyglot environments.
-
-== Topics
-
-// Add links to specific topics as they are created
-// * link:getting-started.html[Getting Started]
-// * link:api.html[gRPC API]
-// * link:clients.html[Client Libraries]
diff --git a/docs/src/main/asciidoc/using-tika/index.adoc
b/docs/src/main/asciidoc/using-tika/index.adoc
deleted file mode 100644
index 2f13102e82..0000000000
--- a/docs/src/main/asciidoc/using-tika/index.adoc
+++ /dev/null
@@ -1,71 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Getting Started with Apache Tika
-
-Apache Tika can be used in several ways depending on your needs. Choose the
approach
-that best fits your use case.
-
-== Choose Your Integration Method
-
-xref:java-api/index.adoc[Java API]::
-Use Tika directly in your Java application. Best for tight integration and
full control
-over parsing behavior.
-
-xref:cli/index.adoc[Command Line (tika-app)]::
-Run Tika from the command line. Best for quick extraction, scripting, and
one-off tasks.
-
-xref:server/index.adoc[Server (REST API)]::
-Run Tika as a standalone server with a REST API. Best for language-agnostic
integration
-and microservice architectures.
-
-xref:grpc/index.adoc[gRPC]::
-Use Tika via the gRPC protocol. Best for high-performance, cross-language
-communication.
-
-== Which Should I Use?
-
-[cols="1,3"]
-|===
-|Use Case |Recommended Approach
-
-|Java application needing content extraction
-|Java API
-
-|Shell scripts or batch processing
-|Command Line
-
-|Non-Java application (Python, Node.js, etc.)
-|Server (REST) or gRPC
-
-|High-throughput processing pipeline
-|Server or gRPC with xref:../pipes/index.adoc[Pipes]
-
-|Quick one-time extraction
-|Command Line
-|===
-
-== Scalable Processing
-
-For processing large volumes of documents, see xref:../pipes/index.adoc[Tika
Pipes],
-which provides fault-tolerant, scalable document processing and works with all
of the
-above integration methods.
-
-== Understanding the Output
-
-xref:../advanced/metadata/embedded-documents.adoc[Embedded Document Metadata]::
-Learn how Tika tracks and reports metadata for embedded documents
(attachments, images,
-and other resources contained within files).
diff --git a/docs/src/main/asciidoc/using-tika/java-api/getting-started.adoc
b/docs/src/main/asciidoc/using-tika/java-api/getting-started.adoc
deleted file mode 100644
index ff8df846d4..0000000000
--- a/docs/src/main/asciidoc/using-tika/java-api/getting-started.adoc
+++ /dev/null
@@ -1,130 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Getting Started with the Java API
-
-== Before You Start
-
-Before embedding Tika directly in your Java application, consider whether a
-client-server architecture would better suit your needs.
-
-=== Recommended: Use tika-server or tika-grpc
-
-For most use cases, we recommend running Tika as a separate service rather than
-embedding it directly:
-
-* **xref:../server/index.adoc[tika-server]** - REST API, language-agnostic
-* **xref:../grpc/index.adoc[tika-grpc]** - High-performance gRPC protocol
-
-**Why?**
-
-* **Process isolation** - Parser crashes don't affect your application
-* **Easier deployment** - Use official Docker images
-* **Language flexibility** - Call from any language, not just Java
-* **Simpler upgrades** - Update Tika independently of your application
-
-Docker images are available at https://hub.docker.com/r/apache/tika[Docker
Hub].
-
-=== When to Use the Java API
-
-The Java API is appropriate when you:
-
-* Need tight integration with Tika internals
-* Cannot use a network service
-* Have specific customization requirements
-
-== Using PipesForkParser (Recommended)
-
-If you must use Tika as a library, use `PipesForkParser` from the
-`tika-pipes-fork-parser` module. It provides process isolation to protect your
-application from parser crashes, memory leaks, and infinite loops.
-
-=== Maven Dependency
-
-[source,xml]
-----
-<dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-pipes-fork-parser</artifactId>
- <version>${tika.version}</version>
-</dependency>
-----
-
-=== Basic Example
-
-[source,java]
-----
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.pipes.fork.PipesForkParser;
-import org.apache.tika.pipes.fork.PipesForkResult;
-
-try (PipesForkParser parser = new PipesForkParser();
- TikaInputStream tis = TikaInputStream.get(filePath)) {
-
- PipesForkResult result = parser.parse(tis);
-
- if (result.isSuccess()) {
- String content = result.getContent();
- // process content...
- } else {
- // handle failure
- }
-}
-----
-
-=== Key Features
-
-* **Process isolation** - Parsing runs in a separate JVM
-* **Automatic restart** - If the forked process crashes, it restarts
automatically
-* **Configurable timeouts** - Prevent infinite loops
-* **Thread-safe** - Reuse across multiple threads
-
-=== Complete Examples
-
-See
-https://github.com/apache/tika/blob/main/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java[PipesForkParserExample.java]
-in the `tika-example` module for comprehensive examples including:
-
-* Basic parsing
-* Handling embedded documents
-* Custom configuration
-* Error handling
-* Batch processing
-
-== Without Pipes: Understanding the Risks
-
-If you choose not to use `PipesForkParser` and instead use Tika's parsers
directly
-(e.g., `AutoDetectParser`), you are responsible for handling the risks of
parsing
-untrusted content.
-
-WARNING: Running parsers directly on untrusted data can cause
OutOfMemoryErrors,
-infinite loops, and crashes that will affect your entire application.
-
-Before proceeding without process isolation, read:
-
-* xref:../../advanced/robustness.adoc[The Robustness of Apache Tika] -
Understanding parser risks and mitigations
-* https://tika.apache.org/security-model.html[Apache Tika Security Model] -
Trust boundaries and assumptions
-
-If you still need to use parsers directly, your application is responsible for
-implementing its own process isolation so that you can:
-
-* Set parse timeouts (Tika cannot enforce timeouts without process isolation)
-* Configure memory limits (requires separate JVM)
-* Kill runaway processes
-* Recover from crashes
-
-Never run Tika in the same JVM as critical infrastructure.
diff --git a/docs/src/main/asciidoc/using-tika/java-api/index.adoc
b/docs/src/main/asciidoc/using-tika/java-api/index.adoc
deleted file mode 100644
index 8ab2b22291..0000000000
--- a/docs/src/main/asciidoc/using-tika/java-api/index.adoc
+++ /dev/null
@@ -1,35 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Using Tika as a Library (Java API)
-
-This section covers using Apache Tika programmatically in your Java
applications.
-
-== Overview
-
-Tika can be embedded directly into your Java applications as a library. This
gives you
-full control over parsing, detection, and configuration.
-
-However, for most use cases we recommend using
xref:../server/index.adoc[tika-server]
-or xref:../grpc/index.adoc[tika-grpc] instead. See
-xref:getting-started.adoc[Getting Started] for guidance on choosing the right
approach.
-
-== Topics
-
-* xref:getting-started.adoc[Getting Started] - Recommendations and
PipesForkParser usage
-* xref:../../advanced/metadata/embedded-documents.adoc[Embedded Document
Metadata] -
- Understanding attachment and embedded resource tracking
diff --git a/docs/src/main/asciidoc/using-tika/server/index.adoc
b/docs/src/main/asciidoc/using-tika/server/index.adoc
deleted file mode 100644
index accfc02700..0000000000
--- a/docs/src/main/asciidoc/using-tika/server/index.adoc
+++ /dev/null
@@ -1,42 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements. See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Tika Server
-
-This section covers running Apache Tika as a REST server via `tika-server`.
-
-== Overview
-
-Tika Server provides a RESTful HTTP interface for parsing documents and
extracting
-content. It can be deployed as a standalone service or in a containerized
environment.
-
-== Basic Usage
-
-[source,bash]
-----
-java -jar tika-server-standard.jar
-----
-
-The server starts on port 9998 by default.
-
-== Topics
-
-// Add links to specific topics as they are created
-// * link:installation.html[Installation]
-// * link:endpoints.html[REST Endpoints]
-// * link:configuration.html[Configuration]
-// * link:docker.html[Docker Deployment]