This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 55bd3e7dd6 TIKA-4667 - add Tess4J in-process OCR parser and docs
(#2615)
55bd3e7dd6 is described below
commit 55bd3e7dd68e1279af460608aa7e357b6556f2e3
Author: Tim Allison <[email protected]>
AuthorDate: Wed Feb 18 17:01:03 2026 -0500
TIKA-4667 - add Tess4J in-process OCR parser and docs (#2615)
---
docs/modules/ROOT/examples/tess4j-basic.json | 10 +
docs/modules/ROOT/examples/tess4j-full.json | 18 +
docs/modules/ROOT/nav.adoc | 1 +
.../pages/configuration/parsers/tess4j-parser.adoc | 282 +++++++++++
tika-parsers/tika-parsers-ml/pom.xml | 1 +
.../tika-parser-tess4j-module/pom.xml | 86 ++++
.../tika/parser/ocr/tess4j/Tess4JConfig.java | 355 ++++++++++++++
.../tika/parser/ocr/tess4j/Tess4JParser.java | 516 +++++++++++++++++++++
.../tika/parser/ocr/tess4j/Tess4JConfigTest.java | 140 ++++++
.../tika/parser/ocr/tess4j/Tess4JParserTest.java | 255 ++++++++++
.../src/test/resources/test-documents/testOCR.jpg | Bin 0 -> 3408 bytes
11 files changed, 1664 insertions(+)
diff --git a/docs/modules/ROOT/examples/tess4j-basic.json
b/docs/modules/ROOT/examples/tess4j-basic.json
new file mode 100644
index 0000000000..3fc74587be
--- /dev/null
+++ b/docs/modules/ROOT/examples/tess4j-basic.json
@@ -0,0 +1,10 @@
+{
+ "parsers": [
+ {
+ "name": "tess4j-parser",
+ "dataPath": "/usr/share/tesseract-ocr/5/tessdata",
+ "nativeLibPath": "/usr/lib/x86_64-linux-gnu",
+ "poolSize": 4
+ }
+ ]
+}
diff --git a/docs/modules/ROOT/examples/tess4j-full.json
b/docs/modules/ROOT/examples/tess4j-full.json
new file mode 100644
index 0000000000..c2d5170ecf
--- /dev/null
+++ b/docs/modules/ROOT/examples/tess4j-full.json
@@ -0,0 +1,18 @@
+{
+ "parsers": [
+ {
+ "name": "tess4j-parser",
+ "dataPath": "/usr/share/tesseract-ocr/5/tessdata",
+ "nativeLibPath": "/usr/lib/x86_64-linux-gnu",
+ "language": "eng",
+ "pageSegMode": 1,
+ "ocrEngineMode": 3,
+ "poolSize": 4,
+ "timeoutSeconds": 120,
+ "dpi": 300,
+ "minFileSizeToOcr": 0,
+ "maxFileSizeToOcr": 2147483647,
+ "skipOcr": false
+ }
+ ]
+}
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 1864c77a42..d4bf3cb857 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -27,6 +27,7 @@
** xref:configuration/parsers/pdf-parser.adoc[PDF Parser]
** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR]
** xref:configuration/parsers/vlm-parsers.adoc[VLM Parsers (Claude, Gemini,
OpenAI)]
+** xref:configuration/parsers/tess4j-parser.adoc[Tess4J OCR (In-Process)]
* xref:migration-to-4x/index.adoc[Migration to 4.x]
** xref:migration-to-4x/migrating-to-4x.adoc[Migration Guide]
** xref:migration-to-4x/migrating-tika-server-4x.adoc[Tika Server Migration]
diff --git a/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc
b/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc
new file mode 100644
index 0000000000..fb52b1d6e0
--- /dev/null
+++ b/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc
@@ -0,0 +1,282 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Tess4J OCR Parser
+
+The `Tess4JParser` is an OCR parser that calls the Tesseract native library
+in-process via https://github.com/nguyenq/tess4j[Tess4J] and JNA, rather
+than spawning a `tesseract` child process for every image. This eliminates
+per-file process-spawn overhead and can be significantly faster when
+processing large batches of images.
+
+Because the native Tesseract handle is *not thread-safe*, the parser
+maintains a configurable pool of `Tesseract` instances. Multiple threads
+borrow from the pool and return instances when done, so the parser is safe
+for concurrent use.
+
+[WARNING]
+====
+This parser loads native C/{cpp} libraries (Tesseract, Leptonica) into
+the JVM via JNA. A segfault or memory leak in the native code *will*
+crash your entire JVM.
+
+*You should run this parser in a forked child process using tika-pipes,
+ideally inside a Docker container.* Do not load it into a long-lived
+application server process unless you are comfortable with the risk.
+====
+
+== Module dependency
+
+The parser lives in the `tika-parser-tess4j-module` artifact:
+
+[source,xml]
+----
+<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-tess4j-module</artifactId>
+ <version>${tika.version}</version>
+</dependency>
+----
+
+== Prerequisites
+
+You *must* have the Tesseract and Leptonica shared libraries installed on
+the machine where the parser runs. The tess4j jar bundles Windows DLLs
+only -- on macOS and Linux you are responsible for installing the native
+libraries yourself.
+
+* *Debian / Ubuntu:* `apt-get install libtesseract-dev libleptonica-dev tesseract-ocr-eng`
+* *RHEL / Fedora:* `dnf install tesseract-devel leptonica-devel tesseract-langpack-eng`
+* *macOS (Homebrew):* `brew install tesseract`
+
+You also need the tessdata language files. The `dataPath` configuration
+option must point to the directory containing them (e.g.,
+`/usr/share/tesseract-ocr/5/tessdata`).
+
+== Native library path (`jna.library.path`)
+
+[CAUTION]
+====
+JNA must be able to find `libtesseract` and `libleptonica` at runtime.
+The tess4j jar does *not* bundle these libraries for macOS or Linux.
+If JNA cannot find them on the default library search path, the parser
+will silently disable itself.
+
+You have several options:
+
+1. Set `nativeLibPath` in the parser configuration (recommended). The
+ parser will prepend this to the `jna.library.path` system property at
+ initialization time.
+2. Set the `jna.library.path` JVM system property yourself, e.g.,
+ `-Djna.library.path=/opt/homebrew/lib`.
+3. Install the libraries into a directory that is already on the default
+ search path (e.g., `/usr/lib`).
+
+*You are on your own here.* The correct path depends entirely on your OS,
+distribution, and how you installed Tesseract. Common values:
+
+[cols="1,2"]
+|===
+|Platform |Typical `nativeLibPath`
+
+|Debian / Ubuntu
+|`/usr/lib/x86_64-linux-gnu`
+
+|RHEL / Fedora
+|`/usr/lib64`
+
+|macOS (Homebrew, Apple Silicon)
+|`/opt/homebrew/lib`
+
+|macOS (Homebrew, Intel)
+|`/usr/local/lib`
+
+|Docker (see below)
+|`/usr/lib/x86_64-linux-gnu`
+|===
+====
+
+== Basic Configuration
+
+[source,json]
+----
+include::example$tess4j-basic.json[]
+----
+
+== Full Configuration
+
+[source,json]
+----
+include::example$tess4j-full.json[]
+----
+
+== Configuration options reference
+
+[cols="2,1,3"]
+|===
+|Property |Default |Description
+
+|`dataPath`
+|`""` (empty)
+|Path to the tessdata directory containing language data files. *Required*
+on macOS and Linux.
+
+|`nativeLibPath`
+|`""` (empty)
+|Path to the directory containing `libtesseract` and `libleptonica` shared
+libraries. Prepended to `jna.library.path` at initialization time.
+
+|`language`
+|`"eng"`
+|Tesseract language(s). Multiple languages separated by `+`
+(e.g., `eng+fra`).
+
+|`pageSegMode`
+|`1`
+|Page segmentation mode (0-13). 1 = automatic with OSD.
+
+|`ocrEngineMode`
+|`3`
+|OCR engine mode. 0 = legacy, 1 = LSTM only, 2 = legacy + LSTM,
+3 = default (whatever is available).
+
+|`poolSize`
+|`2`
+|Number of `Tesseract` instances in the pool. Set this to the number of
+threads that will call the parser concurrently. Each instance consumes
+native memory.
+
+|`timeoutSeconds`
+|`120`
+|Maximum time (seconds) to wait for a pooled `Tesseract` instance before
+throwing an exception.
+
+|`dpi`
+|`300`
+|DPI for image rendering.
+
+|`minFileSizeToOcr`
+|`0`
+|Minimum input file size in bytes. Smaller files are skipped.
+
+|`maxFileSizeToOcr`
+|`52428800` (50 MB)
+|Maximum input file size in bytes. Larger files are skipped.
+
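+|`maxImagePixels`
+|`100000000` (100 megapixels)
+|Maximum total pixels (width × height) an image may have before OCR is
+skipped. Set to `-1` for no limit (not recommended). Cannot be overridden
+per request.
+
+|`skipOcr`
+|`false`
+|Runtime kill-switch to disable the parser entirely.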
+|===
+
+== Recommended: Docker + tika-pipes
+
+Because this parser loads native code into the JVM, the safest deployment
+is a Docker container running tika-pipes with forked child processes.
+If the native code crashes, only the child process dies -- tika-pipes will
+respawn it automatically.
+
+A minimal `Dockerfile`:
+
+[source,dockerfile]
+----
+FROM eclipse-temurin:21-jre
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ libtesseract-dev \
+ libleptonica-dev \
+ tesseract-ocr-eng && \
+ rm -rf /var/lib/apt/lists/*
+
+# Copy your tika-pipes application and config
+COPY target/tika-pipes-app.jar /app/tika-pipes-app.jar
+COPY tika-config.json /app/tika-config.json
+
+WORKDIR /app
+ENTRYPOINT ["java", "-jar", "tika-pipes-app.jar"]
+----
+
+With the following parser configuration:
+
+[source,json]
+----
+{
+ "parsers": [
+ {
+ "name": "tess4j-parser",
+ "dataPath": "/usr/share/tesseract-ocr/5/tessdata",
+ "nativeLibPath": "/usr/lib/x86_64-linux-gnu",
+ "poolSize": 4
+ }
+ ]
+}
+----
+
+TIP: Set `poolSize` equal to the number of forked parser threads to
+maximize throughput without over-allocating native memory.
+
+== Tess4J vs. TesseractOCRParser
+
+[cols="1,2,2"]
+|===
+|Aspect |`TesseractOCRParser` |`Tess4JParser`
+
+|How it calls Tesseract
+|Spawns a new `tesseract` child process per image
+|Calls the native library in-process via JNA
+
+|Startup overhead
+|Process fork + exec per file
+|One-time JNA initialization; pooled thereafter
+
+|Thread safety
+|Naturally safe (separate processes)
+|Safe via pooled instances
+
+|Crash isolation
+|Child process crashes do not affect the JVM
+|A native crash *will* take down the JVM
+
+|Dependencies
+|`tesseract` binary on `PATH`
+|`libtesseract` + `libleptonica` shared libraries + JNA
+
+|Best for
+|Safety-first deployments, light OCR workloads
+|High-throughput batch processing in Docker / tika-pipes
+|===
+
+== Per-request configuration
+
+Override configuration for a single parse call by placing a `Tess4JConfig`
+on the `ParseContext`:
+
+[source,java]
+----
+Tess4JConfig override = new Tess4JConfig();
+override.setLanguage("fra");
+override.setPageSegMode(6);
+
+ParseContext context = new ParseContext();
+context.set(Tess4JConfig.class, override);
+----
+
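+Note: `dataPath`, `nativeLibPath`, `poolSize`, and `maxImagePixels` cannot
+be changed at parse time (they are locked at parser initialization).
+Attempting to set `dataPath` or `nativeLibPath` to a non-blank value in a
+runtime config will throw `TikaConfigException`; attempting to set
+`poolSize` or `maxImagePixels` will throw `IllegalStateException`.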
+
+@since Apache Tika 4.0
diff --git a/tika-parsers/tika-parsers-ml/pom.xml
b/tika-parsers/tika-parsers-ml/pom.xml
index d9f982b337..5a508fe2a7 100644
--- a/tika-parsers/tika-parsers-ml/pom.xml
+++ b/tika-parsers/tika-parsers-ml/pom.xml
@@ -37,6 +37,7 @@
<module>tika-parser-nlp-package</module>
<module>tika-inference</module>
<module>tika-parser-vlm-ocr-module</module>
+ <module>tika-parser-tess4j-module</module>
<module>tika-transcribe-aws</module>
</modules>
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml
new file mode 100644
index 0000000000..32dddef369
--- /dev/null
+++ b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
https://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <artifactId>tika-parsers-ml</artifactId>
+ <groupId>org.apache.tika</groupId>
+ <version>${revision}</version>
+ </parent>
+
+ <artifactId>tika-parser-tess4j-module</artifactId>
+ <name>Apache Tika Tess4J OCR parser module</name>
+
+ <properties>
+ <tess4j.version>5.16.0</tess4j.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>net.sourceforge.tess4j</groupId>
+ <artifactId>tess4j</artifactId>
+ <version>${tess4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+ <!-- test -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+
<Automatic-Module-Name>org.apache.tika.parser.ocr.tess4j</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <inputExcludes>
+ <inputExclude>src/test/resources/test-documents/**</inputExclude>
+ </inputExcludes>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <scm>
+ <tag>3.0.0-rc1</tag>
+ </scm>
+</project>
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfig.java
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfig.java
new file mode 100644
index 0000000000..8e8d8f8e40
--- /dev/null
+++
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfig.java
@@ -0,0 +1,355 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import java.io.Serializable;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Configuration for {@link Tess4JParser}.
+ * <p>
+ * This class is not thread-safe and must be synchronized externally.
+ * </p>
+ * <p>
+ * Several setters validate their argument and throw
+ * {@link IllegalArgumentException} on out-of-range values. The nested
+ * {@link RuntimeConfig} subclass additionally rejects changes to
+ * {@code dataPath}, {@code nativeLibPath}, {@code poolSize} and
+ * {@code maxImagePixels}.
+ * </p>
+ */
+public class Tess4JConfig implements Serializable {
+
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * Language dictionary to be used. Default is "eng".
+ */
+ private String language = "eng";
+
+ /**
+ * Path to the tessdata directory containing language data files.
+ * If empty, tess4j will try to find the tessdata directory automatically.
+ */
+ private String dataPath = "";
+
+ /**
+ * Tesseract page segmentation mode. Default is 1.
+ * <ul>
+ * <li>0 = Orientation and script detection (OSD) only.</li>
+ * <li>1 = Automatic page segmentation with OSD.</li>
+ * <li>3 = Fully automatic page segmentation, but no OSD. (Default for Tesseract)</li>
+ * <li>6 = Assume a single uniform block of text.</li>
+ * </ul>
+ */
+ private int pageSegMode = 1;
+
+ /**
+ * Tesseract OCR Engine mode. Default is 3 (Default, based on what is available).
+ * <ul>
+ * <li>0 = Original Tesseract only.</li>
+ * <li>1 = Neural nets LSTM only.</li>
+ * <li>2 = Tesseract + LSTM.</li>
+ * <li>3 = Default, based on what is available.</li>
+ * </ul>
+ */
+ private int ocrEngineMode = 3;
+
+ /**
+ * Maximum file size (in bytes) to submit to OCR. Default is 50 MB.
+ */
+ private long maxFileSizeToOcr = 50 * 1024 * 1024;
+
+ /**
+ * Minimum file size (in bytes) to submit to OCR. Default is 0.
+ */
+ private long minFileSizeToOcr = 0;
+
+ /**
+ * Number of Tesseract instances to keep in the pool. Default is 2.
+ */
+ private int poolSize = 2;
+
+ /**
+ * Maximum time (in seconds) to wait for a Tesseract instance from the pool.
+ * Default is 120.
+ */
+ private int timeoutSeconds = 120;
+
+ /**
+ * Runtime switch to turn off OCR.
+ */
+ private boolean skipOcr = false;
+
+ /**
+ * DPI for image rendering. Default is 300.
+ */
+ private int dpi = 300;
+
+ /**
+ * Maximum total pixels (width × height) allowed for an image
+ * before OCR is skipped. This prevents OOM from decompressing
+ * pathologically large images (e.g., a 30,000 × 30,000 image
+ * would require ~3.6 GB of heap as a BufferedImage).
+ * <p>
+ * Default is 100,000,000 (100 megapixels, ~10,000 × 10,000).
+ * Set to {@code -1} for no limit (not recommended).
+ */
+ private long maxImagePixels = 100_000_000L;
+
+ /**
+ * Path to the directory containing native Tesseract and Leptonica shared libraries
+ * (e.g., {@code libtesseract.dylib}, {@code libtesseract.so}).
+ * <p>
+ * On macOS with Homebrew, this is typically {@code /opt/homebrew/lib}.
+ * On Linux, it may be {@code /usr/lib} or {@code /usr/local/lib}.
+ * <p>
+ * If empty, JNA will search the default system library paths.
+ */
+ private String nativeLibPath = "";
+
+ public String getLanguage() {
+ return language;
+ }
+
+ /**
+ * Set tesseract language dictionary to be used. Default is "eng".
+ * Multiple languages may be specified, separated by plus characters.
+ * e.g. "eng+fra"
+ * <p>
+ * A value that {@code StringUtils.isBlank} reports as blank skips
+ * validation and is stored as-is.
+ *
+ * @param language language code(s) to use
+ * @throws IllegalArgumentException if any code is syntactically invalid,
+ * or the string starts or ends with {@code +}
+ */
+ public void setLanguage(String language) {
+ Set<String> invalidCodes = new HashSet<>();
+ Set<String> validCodes = new HashSet<>();
+ validateLangs(language, validCodes, invalidCodes);
+ if (!invalidCodes.isEmpty()) {
+ throw new IllegalArgumentException("Invalid language code(s): " +
invalidCodes);
+ }
+ this.language = language;
+ }
+
+ public String getDataPath() {
+ return dataPath;
+ }
+
+ /**
+ * Set the path to the tessdata directory.
+ * <p>
+ * This base implementation stores the value without validation and
+ * never throws; the checked exception is declared so that
+ * {@link RuntimeConfig} can reject the call.
+ *
+ * @param dataPath path to the tessdata directory
+ * @throws TikaConfigException declared for overriding subclasses
+ */
+ public void setDataPath(String dataPath) throws TikaConfigException {
+ this.dataPath = dataPath;
+ }
+
+ public int getPageSegMode() {
+ return pageSegMode;
+ }
+
+ /**
+ * Set tesseract page segmentation mode.
+ * Default is 1.
+ *
+ * @param pageSegMode mode in the range 0 to 13 inclusive
+ * @throws IllegalArgumentException if the value is outside 0 to 13
+ */
+ public void setPageSegMode(int pageSegMode) {
+ if (pageSegMode < 0 || pageSegMode > 13) {
+ throw new IllegalArgumentException(
+ "Invalid page segmentation mode: " + pageSegMode +
+ ". Must be between 0 and 13.");
+ }
+ this.pageSegMode = pageSegMode;
+ }
+
+ public int getOcrEngineMode() {
+ return ocrEngineMode;
+ }
+
+ /**
+ * Set OCR Engine Mode.
+ * Default is 3.
+ *
+ * @param ocrEngineMode mode in the range 0 to 3 inclusive
+ * @throws IllegalArgumentException if the value is outside 0 to 3
+ */
+ public void setOcrEngineMode(int ocrEngineMode) {
+ if (ocrEngineMode < 0 || ocrEngineMode > 3) {
+ throw new IllegalArgumentException(
+ "Invalid OCR Engine Mode: " + ocrEngineMode +
+ ". Must be between 0 and 3.");
+ }
+ this.ocrEngineMode = ocrEngineMode;
+ }
+
+ public long getMaxFileSizeToOcr() {
+ return maxFileSizeToOcr;
+ }
+
+ public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
+ this.maxFileSizeToOcr = maxFileSizeToOcr;
+ }
+
+ public long getMinFileSizeToOcr() {
+ return minFileSizeToOcr;
+ }
+
+ public void setMinFileSizeToOcr(long minFileSizeToOcr) {
+ this.minFileSizeToOcr = minFileSizeToOcr;
+ }
+
+ public int getPoolSize() {
+ return poolSize;
+ }
+
+ /**
+ * Set the number of Tesseract instances to keep in the pool.
+ * Default is 2. Must be at least 1.
+ *
+ * @param poolSize number of pooled instances
+ * @throws IllegalArgumentException if the value is less than 1
+ */
+ public void setPoolSize(int poolSize) {
+ if (poolSize < 1) {
+ throw new IllegalArgumentException("Pool size must be at least 1,
got: " + poolSize);
+ }
+ this.poolSize = poolSize;
+ }
+
+ public int getTimeoutSeconds() {
+ return timeoutSeconds;
+ }
+
+ /**
+ * Set maximum time (seconds) to wait for a pooled Tesseract instance.
+ * Default is 120.
+ * <p>
+ * The value is stored without any range check.
+ *
+ * @param timeoutSeconds wait time in seconds
+ */
+ public void setTimeoutSeconds(int timeoutSeconds) {
+ this.timeoutSeconds = timeoutSeconds;
+ }
+
+ public boolean isSkipOcr() {
+ return skipOcr;
+ }
+
+ public void setSkipOcr(boolean skipOcr) {
+ this.skipOcr = skipOcr;
+ }
+
+ public int getDpi() {
+ return dpi;
+ }
+
+ /**
+ * Set the DPI for image rendering. Default is 300.
+ *
+ * @param dpi value in the range 72 to 1200 inclusive
+ * @throws IllegalArgumentException if the value is outside 72 to 1200
+ */
+ public void setDpi(int dpi) {
+ if (dpi < 72 || dpi > 1200) {
+ throw new IllegalArgumentException("DPI must be between 72 and
1200, got: " + dpi);
+ }
+ this.dpi = dpi;
+ }
+
+ public long getMaxImagePixels() {
+ return maxImagePixels;
+ }
+
+ /**
+ * Set the maximum total pixels (width × height) allowed for
+ * an image before OCR is skipped. Default is 100,000,000
+ * (100 megapixels). Set to {@code -1} for no limit (not recommended).
+ *
+ * @param maxImagePixels {@code -1} for no limit, otherwise at least 1
+ * @throws IllegalArgumentException if the value is less than 1 and not {@code -1}
+ */
+ public void setMaxImagePixels(long maxImagePixels) {
+ if (maxImagePixels < 1 && maxImagePixels != -1) {
+ throw new IllegalArgumentException(
+ "maxImagePixels must be -1 (no limit) or at least 1, got: "
+ + maxImagePixels);
+ }
+ this.maxImagePixels = maxImagePixels;
+ }
+
+ public String getNativeLibPath() {
+ return nativeLibPath;
+ }
+
+ /**
+ * Set the path to the directory containing native Tesseract/Leptonica shared libraries.
+ * On macOS with Homebrew this is typically {@code /opt/homebrew/lib}.
+ * <p>
+ * This base implementation stores the value without validation and
+ * never throws; the checked exception is declared so that
+ * {@link RuntimeConfig} can reject the call.
+ *
+ * @param nativeLibPath directory holding the native libraries
+ * @throws TikaConfigException declared for overriding subclasses
+ */
+ public void setNativeLibPath(String nativeLibPath) throws
TikaConfigException {
+ this.nativeLibPath = nativeLibPath;
+ }
+
+ /**
+ * Validates language strings. Languages should conform to tesseract's expected format.
+ * <p>
+ * Whitespace is stripped before validation. A blank input is accepted and
+ * leaves both sets untouched. Each {@code +}-separated code is added to
+ * {@code validLangs} if it matches the expected pattern; otherwise it is
+ * added to {@code invalidLangs} with an " (invalid syntax)" suffix.
+ *
+ * @param language language string to check, e.g. "eng+fra"
+ * @param validLangs receives the codes that passed the syntax check
+ * @param invalidLangs receives the codes that failed the syntax check
+ * @throws IllegalArgumentException if the string starts or ends with {@code +}
+ */
+ static void validateLangs(String language, Set<String> validLangs,
Set<String> invalidLangs) {
+ if (StringUtils.isBlank(language)) {
+ return;
+ }
+ language = language.replaceAll("\\s", "");
+ if (language.matches("\\+.*|.*\\+")) {
+ throw new IllegalArgumentException(
+ "Invalid syntax - Can't start or end with +: " + language);
+ }
+ final String[] langs = language.split("\\+");
+ for (String lang : langs) {
+ if (!lang.matches(
+
"([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
+ invalidLangs.add(lang + " (invalid syntax)");
+ } else {
+ validLangs.add(lang);
+ }
+ }
+ }
+
+ /**
+ * Runtime-only Tess4JConfig that prevents modification of paths,
+ * pool settings and image size limits during parse-time configuration.
+ * <p>
+ * <b>Always blocked:</b> {@code dataPath}, {@code nativeLibPath},
+ * {@code poolSize}, {@code maxImagePixels}.
+ * <p>
+ * Paths are blocked to prevent file-system access attacks.
+ * Pool size is blocked because the pool is built at init time and
+ * cannot be resized at runtime.
+ * <p>
+ * The path setters throw {@link TikaConfigException} for any non-blank
+ * value and silently ignore blank values. The {@code poolSize} and
+ * {@code maxImagePixels} setters throw {@link IllegalStateException}
+ * on every call, whatever the argument.
+ */
+ public static class RuntimeConfig extends Tess4JConfig {
+
+ public RuntimeConfig() {
+ super();
+ }
+
+ @Override
+ public void setDataPath(String dataPath) throws TikaConfigException {
+ if (!StringUtils.isBlank(dataPath)) {
+ throw new TikaConfigException(
+ "Cannot modify dataPath at runtime. " +
+ "Paths must be configured at parser
initialization time.");
+ }
+ }
+
+ @Override
+ public void setNativeLibPath(String nativeLibPath) throws
TikaConfigException {
+ if (!StringUtils.isBlank(nativeLibPath)) {
+ throw new TikaConfigException(
+ "Cannot modify nativeLibPath at runtime. " +
+ "Paths must be configured at parser
initialization time.");
+ }
+ }
+
+ @Override
+ public void setPoolSize(int poolSize) {
+ throw new IllegalStateException(
+ "Cannot modify poolSize at runtime. " +
+ "The pool is created at initialization time " +
+ "and cannot be resized.");
+ }
+
+ @Override
+ public void setMaxImagePixels(long maxImagePixels) {
+ throw new IllegalStateException(
+ "Cannot modify maxImagePixels at runtime. " +
+ "Image size limits must be configured at " +
+ "initialization time.");
+ }
+ }
+}
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JParser.java
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JParser.java
new file mode 100644
index 0000000000..8796736251
--- /dev/null
+++
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JParser.java
@@ -0,0 +1,516 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.awt.image.BufferedImage;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.TimeUnit;
+import javax.imageio.ImageIO;
+
+import net.sourceforge.tess4j.Tesseract;
+import net.sourceforge.tess4j.TesseractException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import org.apache.tika.config.ConfigDeserializer;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.JsonConfig;
+import org.apache.tika.config.ParseContextConfig;
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.config.TikaTaskTimeout;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * OCR parser using <a href="https://github.com/nguyenq/tess4j">Tess4J</a>,
+ * which provides a Java JNA wrapper around the native Tesseract library.
+ * <p>
+ * Unlike the command-line {@code TesseractOCRParser}, this parser calls
+ * Tesseract in-process via JNA, eliminating the per-file process-spawn
+ * overhead.
+ * <p>
+ * Because the native Tesseract handle is <b>not thread-safe</b>, this parser
+ * maintains a configurable pool of {@link Tesseract} instances. The pool size
+ * is controlled by {@link Tess4JConfig#setPoolSize(int)}.
+ * <p>
+ * Configuration key: {@code "tess4j-parser"}
+ *
+ * @since Apache Tika 4.0
+ */
+@TikaComponent(name = "tess4j-parser")
+public class Tess4JParser implements Parser, Initializable {
+
+ private static final long serialVersionUID = 1L;
+
+ private static final Logger LOG =
LoggerFactory.getLogger(Tess4JParser.class);
+
+ // Prefix shared by the "ocr-" image subtypes registered below.
+ private static final String OCR = "ocr-";
+
+ // Immutable set of media types returned by getSupportedTypes() when OCR is enabled.
+ // jp2, jpx and x-portable-pixmap are registered both with and without the "ocr-" prefix.
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+ MediaType.image(OCR + "png"),
+ MediaType.image(OCR + "jpeg"),
+ MediaType.image(OCR + "tiff"),
+ MediaType.image(OCR + "bmp"),
+ MediaType.image(OCR + "gif"),
+ MediaType.image("jp2"),
+ MediaType.image("jpx"),
+ MediaType.image("x-portable-pixmap"),
+ MediaType.image(OCR + "jp2"),
+ MediaType.image(OCR + "jpx"),
+ MediaType.image(OCR + "x-portable-pixmap")
+ )));
+
+ // HAS_WARNED and LOCK guard the one-time log message emitted by warnOnFirstParse().
+ // Both are static, so the message is logged once per class, not once per parser instance.
+ private static volatile boolean HAS_WARNED = false;
+ private static final Object[] LOCK = new Object[0];
+
+ // Configuration used when a parse request supplies no configuration of its own.
+ private Tess4JConfig defaultConfig;
+ // Pool of Tesseract instances, created by initPool(). Declared transient,
+ // so it is excluded from Java serialization.
+ private transient BlockingQueue<Tesseract> pool;
+ // Set to true only when initialize() completes without error.
+ private volatile boolean initialized = false;
+
+ /**
+ * Creates a parser with a default {@link Tess4JConfig} and immediately
+ * calls {@link #initialize()}.
+ */
+ public Tess4JParser() throws TikaConfigException {
+ this.defaultConfig = new Tess4JConfig();
+ initialize();
+ }
+
+ /**
+ * Creates a parser that uses the given config as its default and
+ * immediately calls {@link #initialize()}.
+ *
+ * @param config the default configuration; the reference is stored as-is
+ * (not copied) and is dereferenced by {@code initialize()}, so it must
+ * not be null
+ */
+ public Tess4JParser(Tess4JConfig config) throws TikaConfigException {
+ this.defaultConfig = config;
+ initialize();
+ }
+
+ /**
+ * Creates a parser from a JSON configuration by building a
+ * {@link Tess4JConfig} with {@code ConfigDeserializer} and delegating to
+ * {@link #Tess4JParser(Tess4JConfig)}.
+ */
+ public Tess4JParser(JsonConfig jsonConfig) throws TikaConfigException {
+ this(ConfigDeserializer.buildConfig(jsonConfig, Tess4JConfig.class));
+ }
+
+    /**
+     * Reports the media types this parser handles.
+     *
+     * @return an empty set when the parser is not initialized, or when OCR is
+     * skipped by either the {@link Tess4JConfig} in the context or the
+     * default config; otherwise the fixed set of supported image types
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        if (!initialized) {
+            return Collections.emptySet();
+        }
+        Tess4JConfig requestConfig = context.get(Tess4JConfig.class);
+        boolean skippedByRequest = requestConfig != null && requestConfig.isSkipOcr();
+        boolean skip = skippedByRequest || defaultConfig.isSkipOcr();
+        return skip ? Collections.<MediaType>emptySet() : SUPPORTED_TYPES;
+    }
+
+    /**
+     * Runs OCR on the image in {@code tis} and writes the recognized text to
+     * {@code handler} inside a {@code <div class="ocr">} element.
+     * <p>
+     * Nothing is emitted when the parser is not initialized, when OCR is
+     * skipped by the effective config, or when the stream length is known
+     * and lies outside the configured min/max file size. An image that
+     * exceeds {@code maxImagePixels}, or that cannot be decoded, produces an
+     * empty document.
+     *
+     * @throws TikaException if no pooled Tesseract instance becomes available
+     * within the timeout, if the wait is interrupted, or if Tesseract fails
+     */
+    @Override
+    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
+                      ParseContext parseContext)
+            throws IOException, SAXException, TikaException {
+
+        Tess4JConfig config = getConfig(parseContext);
+
+        if (!initialized || config.isSkipOcr()) {
+            return;
+        }
+
+        warnOnFirstParse();
+
+        // A negative length is treated as "unknown" and is not filtered.
+        long size = tis.getLength();
+        if (size >= 0 && (size < config.getMinFileSizeToOcr() ||
+                size > config.getMaxFileSizeToOcr())) {
+            return;
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, parseContext);
+        xhtml.startDocument();
+
+        Tesseract tesseract = null;
+        long timeoutMillis = TikaTaskTimeout.getTimeoutMillis(
+                parseContext, config.getTimeoutSeconds() * 1000L);
+        try {
+            tesseract = borrowTesseract(timeoutMillis);
+            if (tesseract == null) {
+                throw new TikaException("Timed out waiting for a Tesseract instance from the pool");
+            }
+
+            // Pooled instances keep whatever the previous request set, so the
+            // effective config is (re)applied on every parse.
+            applyConfig(tesseract, config);
+
+            // Check image dimensions before full decode to prevent OOM
+            long maxPixels = config.getMaxImagePixels();
+            if (maxPixels > 0) {
+                // The header probe consumes bytes, so mark first and rewind after.
+                // When the length is unknown (negative), size + 1 would yield a
+                // read limit of 0; fall back to the 1 MiB cap in that case.
+                final int maxHeaderBytes = 1024 * 1024;
+                int readLimit = size >= 0
+                        ? (int) Math.min(size + 1, maxHeaderBytes)
+                        : maxHeaderBytes;
+                tis.mark(readLimit);
+                try {
+                    long pixels = getImagePixels(tis);
+                    if (pixels > maxPixels) {
+                        LOG.warn("Image has {} pixels, exceeding maxImagePixels={}. "
+                                + "Skipping OCR.", pixels, maxPixels);
+                        xhtml.endDocument();
+                        return;
+                    }
+                } finally {
+                    tis.reset();
+                }
+            }
+
+            BufferedImage image = readImage(tis);
+            if (image == null) {
+                LOG.warn("Could not read image from stream");
+                xhtml.endDocument();
+                return;
+            }
+
+            String ocrResult = tesseract.doOCR(image);
+
+            // Emit the text as XHTML
+            AttributesImpl attrs = new AttributesImpl();
+            attrs.addAttribute("", "class", "class", "CDATA", "ocr");
+            xhtml.startElement(XHTML, "div", "div", attrs);
+            if (ocrResult != null && !ocrResult.isEmpty()) {
+                xhtml.characters(ocrResult.toCharArray(), 0, ocrResult.length());
+            }
+            xhtml.endElement(XHTML, "div", "div");
+
+        } catch (TesseractException e) {
+            throw new TikaException("Tess4J OCR failed", e);
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag before translating the exception.
+            Thread.currentThread().interrupt();
+            throw new TikaException("Interrupted while waiting for Tesseract instance", e);
+        } finally {
+            // Always hand the instance back, including on the early returns above.
+            if (tesseract != null) {
+                returnTesseract(tesseract);
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Prepares the parser for use: registers the native library path and
+     * builds the Tesseract pool. When {@code skipOcr} is set on the default
+     * config, nothing is built and the parser stays uninitialized.
+     * <p>
+     * Failures are logged rather than propagated; the parser is then left
+     * disabled ({@link #isInitialized()} returns {@code false}).
+     */
+    @Override
+    public void initialize() throws TikaConfigException {
+        if (defaultConfig.isSkipOcr()) {
+            initialized = false;
+            return;
+        }
+        try {
+            configureNativeLibPath();
+            initPool();
+            initialized = true;
+            LOG.info("Tess4J parser initialized with pool size {}", defaultConfig.getPoolSize());
+        } catch (UnsatisfiedLinkError | NoClassDefFoundError e) {
+            // Expected when the native library is not installed; message only.
+            LOG.warn("Tess4J native library not available: {}. " +
+                    "Tess4JParser will be disabled.", e.getMessage());
+            initialized = false;
+        } catch (Exception e) {
+            // Unexpected failure: pass the throwable as the trailing argument so
+            // the stack trace is logged; getMessage() alone can be null.
+            LOG.warn("Failed to initialize Tess4J: {}. " +
+                    "Tess4JParser will be disabled.", e.getMessage(), e);
+            initialized = false;
+        }
+    }
+
+    /**
+     * If a native library path is configured, prepend it to the JNA library search path
+     * so that JNA can find libtesseract and libleptonica on non-Windows platforms.
+     * <p>
+     * The path is added only if it is not already one of the entries of
+     * {@code jna.library.path}. Entries are compared whole, so {@code /usr/lib}
+     * is not mistaken for being present when only {@code /usr/lib64} is listed.
+     */
+    private void configureNativeLibPath() {
+        String nativeLibPath = defaultConfig.getNativeLibPath();
+        if (StringUtils.isBlank(nativeLibPath)) {
+            return;
+        }
+        String separator = System.getProperty("path.separator");
+        String existing = System.getProperty("jna.library.path", "");
+        if (existing.isEmpty()) {
+            System.setProperty("jna.library.path", nativeLibPath);
+        } else {
+            // Bracket both sides with the separator so that only a complete
+            // path entry can match, never a prefix or substring of one.
+            String bracketedExisting = separator + existing + separator;
+            String bracketedEntry = separator + nativeLibPath + separator;
+            if (!bracketedExisting.contains(bracketedEntry)) {
+                System.setProperty("jna.library.path",
+                        nativeLibPath + separator + existing);
+            }
+        }
+        LOG.debug("jna.library.path set to: {}", System.getProperty("jna.library.path"));
+    }
+
+    /**
+     * Builds the pool, filling it with as many {@link Tesseract} instances as
+     * the default config's pool size, then runs a throwaway OCR call so the
+     * native library is loaded eagerly.
+     */
+    private void initPool() {
+        final int capacity = defaultConfig.getPoolSize();
+        pool = new ArrayBlockingQueue<>(capacity);
+        int created = 0;
+        while (created < capacity) {
+            pool.add(createTesseract(defaultConfig));
+            created++;
+        }
+        // Tess4J loads the native library lazily on first doOCR call.
+        // Force it now so UnsatisfiedLinkError is caught by initialize().
+        Tesseract first = pool.peek();
+        if (first == null) {
+            return;
+        }
+        try {
+            first.doOCR(new BufferedImage(1, 1, BufferedImage.TYPE_BYTE_GRAY));
+        } catch (TesseractException ignored) {
+            // Expected — OCR on a 1x1 image may fail,
+            // but the native library loaded successfully
+        }
+    }
+
+    /**
+     * Instantiates a {@link Tesseract} and applies the given config to it.
+     *
+     * @param config settings to apply to the new instance
+     * @return the configured instance
+     */
+    private Tesseract createTesseract(Tess4JConfig config) {
+        final Tesseract instance = new Tesseract();
+        applyConfig(instance, config);
+        return instance;
+    }
+
+    /**
+     * Copies data path (when not blank), language, page segmentation mode and
+     * OCR engine mode from {@code config} onto {@code tesseract}.
+     */
+    private void applyConfig(Tesseract tesseract, Tess4JConfig config) {
+        String dataPath = config.getDataPath();
+        boolean hasDataPath = !StringUtils.isBlank(dataPath);
+        if (hasDataPath) {
+            tesseract.setDatapath(dataPath);
+        }
+        String language = config.getLanguage();
+        tesseract.setLanguage(language);
+        int pageSegMode = config.getPageSegMode();
+        tesseract.setPageSegMode(pageSegMode);
+        int engineMode = config.getOcrEngineMode();
+        tesseract.setOcrEngineMode(engineMode);
+    }
+
+    /**
+     * Takes a {@link Tesseract} instance out of the pool, blocking for at
+     * most the given time if none is currently available.
+     *
+     * @param timeoutMillis maximum time to wait in milliseconds
+     * @return a Tesseract instance, or null if the timeout elapsed
+     * @throws InterruptedException if the thread was interrupted while waiting
+     */
+    private Tesseract borrowTesseract(long timeoutMillis) throws InterruptedException {
+        Tesseract borrowed = pool.poll(timeoutMillis, TimeUnit.MILLISECONDS);
+        return borrowed;
+    }
+
+    /**
+     * Puts a {@link Tesseract} instance back into the pool without blocking.
+     * If the pool has no free slot the instance is dropped and a warning is logged.
+     */
+    private void returnTesseract(Tesseract tesseract) {
+        boolean accepted = pool.offer(tesseract);
+        if (accepted) {
+            return;
+        }
+        // pool is full (shouldn't happen in normal operation) - just discard
+        LOG.warn("Tesseract pool is full; discarding instance");
+    }
+
+ /**
+ * Reads a {@link BufferedImage} from the input stream via
+ * {@code ImageIO.read}.
+ *
+ * @param is the stream positioned at the start of the image data
+ * @return the decoded image, or {@code null} when {@code ImageIO.read}
+ * returns null (the caller treats that as an unreadable image)
+ * @throws IOException if reading fails
+ */
+ private BufferedImage readImage(InputStream is) throws IOException {
+ return ImageIO.read(is);
+ }
+
+    /**
+     * Asks the first matching {@code ImageReader} for the width and height of
+     * the first image in the stream, so the pixel count can be checked before
+     * the image is decoded.
+     *
+     * @param is stream to probe; bytes are consumed from it
+     * @return width times height, or {@code -1} if no image input stream or
+     * no reader is available for the content
+     * @throws IOException if the dimensions cannot be read
+     */
+    private long getImagePixels(InputStream is) throws IOException {
+        try (javax.imageio.stream.ImageInputStream imageStream =
+                     ImageIO.createImageInputStream(is)) {
+            if (imageStream == null) {
+                return -1;
+            }
+            java.util.Iterator<javax.imageio.ImageReader> candidates =
+                    ImageIO.getImageReaders(imageStream);
+            if (candidates.hasNext()) {
+                javax.imageio.ImageReader headerReader = candidates.next();
+                try {
+                    headerReader.setInput(imageStream);
+                    long width = headerReader.getWidth(0);
+                    long height = headerReader.getHeight(0);
+                    return width * height;
+                } finally {
+                    headerReader.dispose();
+                }
+            }
+            return -1;
+        }
+    }
+
+ /**
+ * Resolves the effective config: JSON config > ParseContext config > default.
+ * <p>
+ * When the context carries a JSON config under {@code "tess4j-parser"} it
+ * is processed twice: first into a {@link Tess4JConfig.RuntimeConfig},
+ * whose setters reject init-only settings, and then into a plain
+ * {@link Tess4JConfig} with {@code defaultConfig} passed as the default.
+ * Without a JSON config, a {@link Tess4JConfig} set on the context is
+ * returned as-is; failing that, the parser's default config is returned.
+ *
+ * @param parseContext the per-request context
+ * @return the configuration to use for this parse
+ * @throws TikaConfigException if the runtime JSON config is rejected
+ */
+ private Tess4JConfig getConfig(ParseContext parseContext)
+ throws TikaConfigException, IOException {
+
+ if (parseContext.hasJsonConfig("tess4j-parser")) {
+ // Validate no paths in runtime config
+ // (this first pass is used only for validation and for the skipOcr check below)
+ Tess4JConfig.RuntimeConfig runtimeConfig =
ParseContextConfig.getConfig(
+ parseContext,
+ "tess4j-parser",
+ Tess4JConfig.RuntimeConfig.class,
+ new Tess4JConfig.RuntimeConfig());
+
+ // When OCR is skipped the caller only reads isSkipOcr(), so the
+ // validation object itself can be returned.
+ if (runtimeConfig.isSkipOcr()) {
+ return runtimeConfig;
+ }
+
+ // NOTE(review): presumably this overlays the JSON values on defaultConfig
+ // without mutating it — confirm against ParseContextConfig.
+ return ParseContextConfig.getConfig(
+ parseContext,
+ "tess4j-parser",
+ Tess4JConfig.class,
+ defaultConfig);
+ }
+
+ Tess4JConfig userConfig = parseContext.get(Tess4JConfig.class);
+ if (userConfig != null) {
+ return userConfig;
+ }
+ return defaultConfig;
+ }
+
+    /**
+     * Logs a one-time notice that OCR is being invoked. The static flag is
+     * checked once without the lock and again while holding it, so the
+     * message is emitted a single time.
+     */
+    private void warnOnFirstParse() {
+        if (HAS_WARNED) {
+            return;
+        }
+        synchronized (LOCK) {
+            if (HAS_WARNED) {
+                return;
+            }
+            LOG.info("Tess4J OCR is being invoked. " +
+                    "This can add greatly to processing time. " +
+                    "If you do not want OCR to be applied to your files, " +
+                    "configure skipOcr=true.");
+            HAS_WARNED = true;
+        }
+    }
+
+ // -- Delegating getters/setters for parser-level configuration --
+ // Each accessor below forwards to the parser's default config. Language,
+ // data path, page segmentation mode and OCR engine mode are applied to a
+ // pooled instance on every parse (see applyConfig), so changes to those
+ // affect later parses that use the default config.
+
+ public String getLanguage() {
+ return defaultConfig.getLanguage();
+ }
+
+ public void setLanguage(String language) {
+ defaultConfig.setLanguage(language);
+ }
+
+ public String getDataPath() {
+ return defaultConfig.getDataPath();
+ }
+
+ public void setDataPath(String dataPath) throws TikaConfigException {
+ defaultConfig.setDataPath(dataPath);
+ }
+
+ public int getPageSegMode() {
+ return defaultConfig.getPageSegMode();
+ }
+
+ public void setPageSegMode(int pageSegMode) {
+ defaultConfig.setPageSegMode(pageSegMode);
+ }
+
+ public int getOcrEngineMode() {
+ return defaultConfig.getOcrEngineMode();
+ }
+
+ public void setOcrEngineMode(int ocrEngineMode) {
+ defaultConfig.setOcrEngineMode(ocrEngineMode);
+ }
+
+ public long getMaxFileSizeToOcr() {
+ return defaultConfig.getMaxFileSizeToOcr();
+ }
+
+ public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
+ defaultConfig.setMaxFileSizeToOcr(maxFileSizeToOcr);
+ }
+
+ public long getMinFileSizeToOcr() {
+ return defaultConfig.getMinFileSizeToOcr();
+ }
+
+ public void setMinFileSizeToOcr(long minFileSizeToOcr) {
+ defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr);
+ }
+
+ public int getPoolSize() {
+ return defaultConfig.getPoolSize();
+ }
+
+ // The pool is sized inside initPool(), which runs from initialize(). Changing
+ // the value here only updates the config; an already-built pool is not resized.
+ public void setPoolSize(int poolSize) {
+ defaultConfig.setPoolSize(poolSize);
+ }
+
+ public int getTimeoutSeconds() {
+ return defaultConfig.getTimeoutSeconds();
+ }
+
+ public void setTimeoutSeconds(int timeoutSeconds) {
+ defaultConfig.setTimeoutSeconds(timeoutSeconds);
+ }
+
+ public boolean isSkipOcr() {
+ return defaultConfig.isSkipOcr();
+ }
+
+ // If skipOcr was true when initialize() ran, no pool was built and the parser
+ // is not initialized; clearing the flag here does not by itself enable OCR.
+ public void setSkipOcr(boolean skipOcr) {
+ defaultConfig.setSkipOcr(skipOcr);
+ }
+
+ public int getDpi() {
+ return defaultConfig.getDpi();
+ }
+
+ // NOTE(review): dpi is stored on the config, but applyConfig() in this class
+ // does not pass it to Tesseract — confirm whether that is intended.
+ public void setDpi(int dpi) {
+ defaultConfig.setDpi(dpi);
+ }
+
+ public String getNativeLibPath() {
+ return defaultConfig.getNativeLibPath();
+ }
+
+ // The native library path is read by configureNativeLibPath() during
+ // initialize(); setting it afterwards only updates the config value.
+ public void setNativeLibPath(String nativeLibPath) throws
TikaConfigException {
+ defaultConfig.setNativeLibPath(nativeLibPath);
+ }
+
+ public long getMaxImagePixels() {
+ return defaultConfig.getMaxImagePixels();
+ }
+
+ public void setMaxImagePixels(long maxImagePixels) {
+ defaultConfig.setMaxImagePixels(maxImagePixels);
+ }
+
+ /**
+ * Returns whether the parser has been successfully initialized
+ * (i.e., Tess4J native library is available).
+ */
+ public boolean isInitialized() {
+ return initialized;
+ }
+
+ /**
+ * Returns the default configuration. Visible for testing.
+ * The live object is returned, not a copy.
+ */
+ Tess4JConfig getDefaultConfig() {
+ return defaultConfig;
+ }
+}
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigTest.java
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigTest.java
new file mode 100644
index 0000000000..f92ab8b53a
--- /dev/null
+++
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigTest.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.exception.TikaConfigException;
+
+/**
+ * Unit tests for {@link Tess4JConfig}: default values, setter validation,
+ * and the restrictions enforced by {@link Tess4JConfig.RuntimeConfig}.
+ */
+public class Tess4JConfigTest {
+
+    @Test
+    public void testDefaults() {
+        Tess4JConfig config = new Tess4JConfig();
+        assertEquals("eng", config.getLanguage());
+        assertEquals("", config.getDataPath());
+        assertEquals(1, config.getPageSegMode());
+        assertEquals(3, config.getOcrEngineMode());
+        assertEquals(50 * 1024 * 1024, config.getMaxFileSizeToOcr());
+        assertEquals(0, config.getMinFileSizeToOcr());
+        assertEquals(2, config.getPoolSize());
+        assertEquals(120, config.getTimeoutSeconds());
+        assertFalse(config.isSkipOcr());
+        assertEquals(300, config.getDpi());
+    }
+
+    @Test
+    public void testSetLanguageValid() {
+        Tess4JConfig config = new Tess4JConfig();
+        config.setLanguage("eng+fra");
+        assertEquals("eng+fra", config.getLanguage());
+    }
+
+    @Test
+    public void testSetLanguageInvalid() {
+        Tess4JConfig config = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> config.setLanguage("xy"));
+    }
+
+    @Test
+    public void testSetLanguageLeadingPlus() {
+        Tess4JConfig config = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> config.setLanguage("+eng"));
+    }
+
+    @Test
+    public void testSetPageSegModeValid() {
+        Tess4JConfig config = new Tess4JConfig();
+        config.setPageSegMode(6);
+        assertEquals(6, config.getPageSegMode());
+    }
+
+    @Test
+    public void testSetPageSegModeInvalid() {
+        Tess4JConfig config = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> config.setPageSegMode(14));
+        assertThrows(IllegalArgumentException.class, () -> config.setPageSegMode(-1));
+    }
+
+    @Test
+    public void testSetOcrEngineModeValid() {
+        Tess4JConfig config = new Tess4JConfig();
+        config.setOcrEngineMode(1);
+        assertEquals(1, config.getOcrEngineMode());
+    }
+
+    @Test
+    public void testSetOcrEngineModeInvalid() {
+        Tess4JConfig config = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> config.setOcrEngineMode(4));
+        assertThrows(IllegalArgumentException.class, () -> config.setOcrEngineMode(-1));
+    }
+
+    @Test
+    public void testSetPoolSizeValid() {
+        Tess4JConfig config = new Tess4JConfig();
+        config.setPoolSize(4);
+        assertEquals(4, config.getPoolSize());
+    }
+
+    @Test
+    public void testSetPoolSizeInvalid() {
+        Tess4JConfig config = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> config.setPoolSize(0));
+        assertThrows(IllegalArgumentException.class, () -> config.setPoolSize(-1));
+    }
+
+    @Test
+    public void testSetDpiValid() {
+        Tess4JConfig config = new Tess4JConfig();
+        config.setDpi(150);
+        assertEquals(150, config.getDpi());
+    }
+
+    @Test
+    public void testSetDpiInvalid() {
+        Tess4JConfig config = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> config.setDpi(50));
+        assertThrows(IllegalArgumentException.class, () -> config.setDpi(1500));
+    }
+
+    @Test
+    public void testSkipOcr() {
+        Tess4JConfig config = new Tess4JConfig();
+        config.setSkipOcr(true);
+        assertTrue(config.isSkipOcr());
+    }
+
+    @Test
+    public void testRuntimeConfigBlocksDataPath() {
+        Tess4JConfig.RuntimeConfig config = new Tess4JConfig.RuntimeConfig();
+        assertThrows(TikaConfigException.class,
+                () -> config.setDataPath("/some/path"));
+    }
+
+    @Test
+    public void testRuntimeConfigAllowsEmptyDataPath() throws TikaConfigException {
+        Tess4JConfig.RuntimeConfig config = new Tess4JConfig.RuntimeConfig();
+        config.setDataPath("");
+        assertEquals("", config.getDataPath());
+    }
+
+    // The remaining RuntimeConfig restrictions, one test per blocked setter.
+
+    @Test
+    public void testRuntimeConfigBlocksNativeLibPath() {
+        Tess4JConfig.RuntimeConfig config = new Tess4JConfig.RuntimeConfig();
+        assertThrows(TikaConfigException.class,
+                () -> config.setNativeLibPath("/some/lib/path"));
+    }
+
+    @Test
+    public void testRuntimeConfigBlocksPoolSize() {
+        Tess4JConfig.RuntimeConfig config = new Tess4JConfig.RuntimeConfig();
+        assertThrows(IllegalStateException.class, () -> config.setPoolSize(4));
+    }
+
+    @Test
+    public void testRuntimeConfigBlocksMaxImagePixels() {
+        Tess4JConfig.RuntimeConfig config = new Tess4JConfig.RuntimeConfig();
+        assertThrows(IllegalStateException.class,
+                () -> config.setMaxImagePixels(1_000_000L));
+    }
+}
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JParserTest.java
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JParserTest.java
new file mode 100644
index 0000000000..e347007293
--- /dev/null
+++
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JParserTest.java
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+
+/**
+ * Tests for {@link Tess4JParser}. Tests that need a working native Tesseract
+ * are skipped via {@code assumeTrue} when the parser could not initialize.
+ */
+public class Tess4JParserTest {
+
+    // Shared parser; built once in setUp() with a pool of two instances.
+    private static Tess4JParser parser;
+    // Whether the shared parser initialized, i.e. Tess4J is usable here.
+    private static boolean tess4jAvailable;
+
+    @BeforeAll
+    static void setUp() throws Exception {
+        Tess4JConfig config = new Tess4JConfig();
+        config.setPoolSize(2);
+        config.setDataPath(getTessDataPath());
+        config.setNativeLibPath(getNativeLibPath());
+        parser = new Tess4JParser(config);
+        tess4jAvailable = parser.isInitialized();
+    }
+
+    /**
+     * Returns the tessdata path, checking system property first, then common locations.
+     */
+    private static String getTessDataPath() {
+        String prop = System.getProperty("tess4j.datapath");
+        if (prop != null && !prop.isEmpty()) {
+            return prop;
+        }
+        // Common Homebrew location on macOS
+        java.io.File homebrew = new java.io.File("/opt/homebrew/share/tessdata");
+        if (homebrew.isDirectory()) {
+            return homebrew.getAbsolutePath();
+        }
+        // Common Linux locations
+        java.io.File usrShare = new java.io.File("/usr/share/tesseract-ocr/5/tessdata");
+        if (usrShare.isDirectory()) {
+            return usrShare.getAbsolutePath();
+        }
+        java.io.File usrShareAlt = new java.io.File("/usr/share/tessdata");
+        if (usrShareAlt.isDirectory()) {
+            return usrShareAlt.getAbsolutePath();
+        }
+        return "";
+    }
+
+    /**
+     * Returns the native library path, checking system property first, then common locations.
+     */
+    private static String getNativeLibPath() {
+        String prop = System.getProperty("tess4j.native.lib.path");
+        if (prop != null && !prop.isEmpty()) {
+            return prop;
+        }
+        // Common Homebrew location on macOS
+        java.io.File homebrewLib = new java.io.File("/opt/homebrew/lib");
+        if (homebrewLib.isDirectory()) {
+            return homebrewLib.getAbsolutePath();
+        }
+        return "";
+    }
+
+    @Test
+    public void testDelegatingGettersSetters() throws Exception {
+        // skipOcr=true keeps the constructor from building a pool, so this
+        // test does not need the native library.
+        Tess4JConfig config = new Tess4JConfig();
+        config.setPoolSize(1);
+        config.setSkipOcr(true);
+        Tess4JParser p = new Tess4JParser(config);
+
+        assertEquals("eng", p.getLanguage());
+        p.setLanguage("fra");
+        assertEquals("fra", p.getLanguage());
+
+        assertEquals(1, p.getPageSegMode());
+        p.setPageSegMode(3);
+        assertEquals(3, p.getPageSegMode());
+
+        assertEquals(3, p.getOcrEngineMode());
+        p.setOcrEngineMode(1);
+        assertEquals(1, p.getOcrEngineMode());
+
+        assertEquals(120, p.getTimeoutSeconds());
+        p.setTimeoutSeconds(60);
+        assertEquals(60, p.getTimeoutSeconds());
+
+        assertEquals(300, p.getDpi());
+        p.setDpi(150);
+        assertEquals(150, p.getDpi());
+
+        assertTrue(p.isSkipOcr());
+    }
+
+    @Test
+    public void testSkipOcrReturnEmptyTypes() throws Exception {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        ParseContext context = new ParseContext();
+        Tess4JConfig ctxConfig = new Tess4JConfig();
+        ctxConfig.setSkipOcr(true);
+        context.set(Tess4JConfig.class, ctxConfig);
+        assertEquals(Collections.emptySet(), parser.getSupportedTypes(context));
+    }
+
+    @Test
+    public void testSupportedTypesWhenInitialized() {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        ParseContext context = new ParseContext();
+        assertFalse(parser.getSupportedTypes(context).isEmpty());
+        assertTrue(parser.getSupportedTypes(context)
+                .contains(MediaType.image("ocr-png")));
+        assertTrue(parser.getSupportedTypes(context)
+                .contains(MediaType.image("ocr-jpeg")));
+    }
+
+    @Test
+    public void testOcrJpeg() throws Exception {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        BodyContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+
+        try (InputStream is = getClass().getResourceAsStream("/test-documents/testOCR.jpg");
+             TikaInputStream tis = TikaInputStream.get(is)) {
+            parser.parse(tis, handler, metadata, context);
+        }
+
+        String content = handler.toString();
+        assertTrue(content.contains("OCR") || content.contains("Testing"),
+                "Expected OCR output to contain recognizable text, got: " + content);
+    }
+
+    @Test
+    public void testSkipOcrReturnsNoContent() throws Exception {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        Tess4JConfig skipConfig = new Tess4JConfig();
+        skipConfig.setPoolSize(1);
+        skipConfig.setDataPath(getTessDataPath());
+        skipConfig.setNativeLibPath(getNativeLibPath());
+        skipConfig.setSkipOcr(true);
+        Tess4JParser skipParser = new Tess4JParser(skipConfig);
+
+        BodyContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+
+        try (InputStream is = getClass().getResourceAsStream("/test-documents/testOCR.jpg");
+             TikaInputStream tis = TikaInputStream.get(is)) {
+            skipParser.parse(tis, handler, metadata, context);
+        }
+
+        assertEquals("", handler.toString().trim());
+    }
+
+    @Test
+    public void testFileSizeFilter() throws Exception {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        // Set maxFileSizeToOcr to 1 byte so the image is skipped
+        ParseContext context = new ParseContext();
+        Tess4JConfig smallConfig = new Tess4JConfig();
+        smallConfig.setMaxFileSizeToOcr(1);
+        context.set(Tess4JConfig.class, smallConfig);
+
+        BodyContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream is = getClass().getResourceAsStream("/test-documents/testOCR.jpg");
+             TikaInputStream tis = TikaInputStream.get(is)) {
+            parser.parse(tis, handler, metadata, context);
+        }
+
+        assertEquals("", handler.toString().trim());
+    }
+
+    @Test
+    public void testConcurrentOcr() throws Exception {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        // Four workers against a pool of two, so some must wait for an instance.
+        int numThreads = 4;
+        ExecutorService executor = Executors.newFixedThreadPool(numThreads);
+        CountDownLatch latch = new CountDownLatch(numThreads);
+        AtomicInteger successCount = new AtomicInteger(0);
+        AtomicInteger failCount = new AtomicInteger(0);
+
+        try {
+            for (int i = 0; i < numThreads; i++) {
+                executor.submit(() -> {
+                    try {
+                        BodyContentHandler handler = new BodyContentHandler();
+                        Metadata metadata = new Metadata();
+                        ParseContext context = new ParseContext();
+
+                        try (InputStream is = getClass()
+                                .getResourceAsStream("/test-documents/testOCR.jpg");
+                             TikaInputStream tis = TikaInputStream.get(is)) {
+                            parser.parse(tis, handler, metadata, context);
+                        }
+
+                        String content = handler.toString();
+                        if (content != null && !content.trim().isEmpty()) {
+                            successCount.incrementAndGet();
+                        } else {
+                            failCount.incrementAndGet();
+                        }
+                    } catch (Exception e) {
+                        failCount.incrementAndGet();
+                    } finally {
+                        latch.countDown();
+                    }
+                });
+            }
+
+            assertTrue(latch.await(120, TimeUnit.SECONDS), "Timed out waiting for threads");
+        } finally {
+            // Stop the workers even when the await assertion fails, so a
+            // timed-out run does not leave pool threads behind.
+            executor.shutdownNow();
+        }
+
+        assertEquals(numThreads, successCount.get(),
+                "All threads should have succeeded; failures=" + failCount.get());
+    }
+}
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/test-documents/testOCR.jpg
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/test-documents/testOCR.jpg
new file mode 100644
index 0000000000..b3f1df3636
Binary files /dev/null and
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/test-documents/testOCR.jpg
differ