(tika) branch main updated: TIKA-4667 - add Tess4J in-process OCR parser and docs (#2615)

tallison Wed, 18 Feb 2026 14:02:13 -0800

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 55bd3e7dd6 TIKA-4667 - add Tess4J in-process OCR parser and docs 
(#2615)
55bd3e7dd6 is described below

commit 55bd3e7dd68e1279af460608aa7e357b6556f2e3
Author: Tim Allison <[email protected]>
AuthorDate: Wed Feb 18 17:01:03 2026 -0500

    TIKA-4667 - add Tess4J in-process OCR parser and docs (#2615)
---
 docs/modules/ROOT/examples/tess4j-basic.json       |  10 +
 docs/modules/ROOT/examples/tess4j-full.json        |  18 +
 docs/modules/ROOT/nav.adoc                         |   1 +
 .../pages/configuration/parsers/tess4j-parser.adoc | 282 +++++++++++
 tika-parsers/tika-parsers-ml/pom.xml               |   1 +
 .../tika-parser-tess4j-module/pom.xml              |  86 ++++
 .../tika/parser/ocr/tess4j/Tess4JConfig.java       | 355 ++++++++++++++
 .../tika/parser/ocr/tess4j/Tess4JParser.java       | 516 +++++++++++++++++++++
 .../tika/parser/ocr/tess4j/Tess4JConfigTest.java   | 140 ++++++
 .../tika/parser/ocr/tess4j/Tess4JParserTest.java   | 255 ++++++++++
 .../src/test/resources/test-documents/testOCR.jpg  | Bin 0 -> 3408 bytes
 11 files changed, 1664 insertions(+)

diff --git a/docs/modules/ROOT/examples/tess4j-basic.json 
b/docs/modules/ROOT/examples/tess4j-basic.json
new file mode 100644
index 0000000000..3fc74587be
--- /dev/null
+++ b/docs/modules/ROOT/examples/tess4j-basic.json
@@ -0,0 +1,10 @@
+{
+  "parsers": [
+    {
+      "name": "tess4j-parser",
+      "dataPath": "/usr/share/tesseract-ocr/5/tessdata",
+      "nativeLibPath": "/usr/lib/x86_64-linux-gnu",
+      "poolSize": 4
+    }
+  ]
+}
diff --git a/docs/modules/ROOT/examples/tess4j-full.json 
b/docs/modules/ROOT/examples/tess4j-full.json
new file mode 100644
index 0000000000..c2d5170ecf
--- /dev/null
+++ b/docs/modules/ROOT/examples/tess4j-full.json
@@ -0,0 +1,18 @@
+{
+  "parsers": [
+    {
+      "name": "tess4j-parser",
+      "dataPath": "/usr/share/tesseract-ocr/5/tessdata",
+      "nativeLibPath": "/usr/lib/x86_64-linux-gnu",
+      "language": "eng",
+      "pageSegMode": 1,
+      "ocrEngineMode": 3,
+      "poolSize": 4,
+      "timeoutSeconds": 120,
+      "dpi": 300,
+      "minFileSizeToOcr": 0,
+      "maxFileSizeToOcr": 2147483647,
+      "skipOcr": false
+    }
+  ]
+}
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 1864c77a42..d4bf3cb857 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -27,6 +27,7 @@
 ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser]
 ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR]
 ** xref:configuration/parsers/vlm-parsers.adoc[VLM Parsers (Claude, Gemini, 
OpenAI)]
+** xref:configuration/parsers/tess4j-parser.adoc[Tess4J OCR (In-Process)]
 * xref:migration-to-4x/index.adoc[Migration to 4.x]
 ** xref:migration-to-4x/migrating-to-4x.adoc[Migration Guide]
 ** xref:migration-to-4x/migrating-tika-server-4x.adoc[Tika Server Migration]
diff --git a/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc 
b/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc
new file mode 100644
index 0000000000..fb52b1d6e0
--- /dev/null
+++ b/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc
@@ -0,0 +1,282 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Tess4J OCR Parser
+
+The `Tess4JParser` is an OCR parser that calls the Tesseract native library
+in-process via https://github.com/nguyenq/tess4j[Tess4J] and JNA, rather
+than spawning a `tesseract` child process for every image. This eliminates
+per-file process-spawn overhead and can be significantly faster when
+processing large batches of images.
+
+Because the native Tesseract handle is *not thread-safe*, the parser
+maintains a configurable pool of `Tesseract` instances. Multiple threads
+borrow from the pool and return instances when done, so the parser is safe
+for concurrent use.
+
+[WARNING]
+====
+This parser loads native C/{cpp} libraries (Tesseract, Leptonica) into
+the JVM via JNA. A segfault or memory leak in the native code *will*
+crash your entire JVM.
+
+*You should run this parser in a forked child process using tika-pipes,
+ideally inside a Docker container.* Do not load it into a long-lived
+application server process unless you are comfortable with the risk.
+====
+
+== Module dependency
+
+The parser lives in the `tika-parser-tess4j-module` artifact:
+
+[source,xml]
+----
+<dependency>
+  <groupId>org.apache.tika</groupId>
+  <artifactId>tika-parser-tess4j-module</artifactId>
+  <version>${tika.version}</version>
+</dependency>
+----
+
+== Prerequisites
+
+You *must* have the Tesseract and Leptonica shared libraries installed on
+the machine where the parser runs. The tess4j jar bundles Windows DLLs
+only -- on macOS and Linux you are responsible for installing the native
+libraries yourself.
+
+* *Debian / Ubuntu:* `apt-get install libtesseract-dev libleptonica-dev tesseract-ocr-eng`
+* *RHEL / Fedora:* `dnf install tesseract-devel leptonica-devel tesseract-langpack-eng`
+* *macOS (Homebrew):* `brew install tesseract`
+
+You also need the tessdata language files. The `dataPath` configuration
+option must point to the directory containing them (e.g.,
+`/usr/share/tesseract-ocr/5/tessdata`).
+
+== Native library path (`jna.library.path`)
+
+[CAUTION]
+====
+JNA must be able to find `libtesseract` and `libleptonica` at runtime.
+The tess4j jar does *not* bundle these libraries for macOS or Linux.
+If JNA cannot find them on the default library search path, the parser
+will silently disable itself.
+
+You have several options:
+
+1. Set `nativeLibPath` in the parser configuration (recommended). The
+   parser will prepend this to the `jna.library.path` system property at
+   initialization time.
+2. Set the `jna.library.path` JVM system property yourself, e.g.,
+   `-Djna.library.path=/opt/homebrew/lib`.
+3. Install the libraries into a directory that is already on the default
+   search path (e.g., `/usr/lib`).
+
+*You are on your own here.* The correct path depends entirely on your OS,
+distribution, and how you installed Tesseract. Common values:
+
+[cols="1,2"]
+|===
+|Platform |Typical `nativeLibPath`
+
+|Debian / Ubuntu
+|`/usr/lib/x86_64-linux-gnu`
+
+|RHEL / Fedora
+|`/usr/lib64`
+
+|macOS (Homebrew, Apple Silicon)
+|`/opt/homebrew/lib`
+
+|macOS (Homebrew, Intel)
+|`/usr/local/lib`
+
+|Docker (see below)
+|`/usr/lib/x86_64-linux-gnu`
+|===
+====
+
+== Basic Configuration
+
+[source,json]
+----
+include::example$tess4j-basic.json[]
+----
+
+== Full Configuration
+
+[source,json]
+----
+include::example$tess4j-full.json[]
+----
+
+== Configuration options reference
+
+[cols="2,1,3"]
+|===
+|Property |Default |Description
+
+|`dataPath`
+|`""` (empty)
+|Path to the tessdata directory containing language data files. *Required*
+on macOS and Linux.
+
+|`nativeLibPath`
+|`""` (empty)
+|Path to the directory containing `libtesseract` and `libleptonica` shared
+libraries. Prepended to `jna.library.path` at initialization time.
+
+|`language`
+|`"eng"`
+|Tesseract language(s). Multiple languages separated by `+`
+(e.g., `eng+fra`).
+
+|`pageSegMode`
+|`1`
+|Page segmentation mode (0-13). 1 = automatic with OSD.
+
+|`ocrEngineMode`
+|`3`
+|OCR engine mode. 0 = legacy, 1 = LSTM only, 2 = legacy + LSTM,
+3 = default (whatever is available).
+
+|`poolSize`
+|`2`
+|Number of `Tesseract` instances in the pool. Set this to the number of
+threads that will call the parser concurrently. Each instance consumes
+native memory.
+
+|`timeoutSeconds`
+|`120`
+|Maximum time (seconds) to wait for a pooled `Tesseract` instance before
+throwing an exception.
+
+|`dpi`
+|`300`
+|DPI for image rendering.
+
+|`minFileSizeToOcr`
+|`0`
+|Minimum input file size in bytes. Smaller files are skipped.
+
+|`maxFileSizeToOcr`
+|`52428800` (50 MB)
+|Maximum input file size in bytes. Larger files are skipped.
+
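+|`maxImagePixels`
+|`100000000` (100 megapixels)
+|Maximum total pixels (width x height) allowed for an image. OCR is skipped
+for larger images to avoid running out of memory while decoding them.
+`-1` disables the limit (not recommended).
+
+|`skipOcr`
+|`false`
+|Runtime kill-switch to disable the parser entirely.
+|===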
+
+== Recommended: Docker + tika-pipes
+
+Because this parser loads native code into the JVM, the safest deployment
+is a Docker container running tika-pipes with forked child processes.
+If the native code crashes, only the child process dies -- tika-pipes will
+respawn it automatically.
+
+A minimal `Dockerfile`:
+
+[source,dockerfile]
+----
+FROM eclipse-temurin:21-jre
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libtesseract-dev \
+        libleptonica-dev \
+        tesseract-ocr-eng && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy your tika-pipes application and config
+COPY target/tika-pipes-app.jar /app/tika-pipes-app.jar
+COPY tika-config.json /app/tika-config.json
+
+WORKDIR /app
+ENTRYPOINT ["java", "-jar", "tika-pipes-app.jar"]
+----
+
+With the following parser configuration:
+
+[source,json]
+----
+{
+  "parsers": [
+    {
+      "name": "tess4j-parser",
+      "dataPath": "/usr/share/tesseract-ocr/5/tessdata",
+      "nativeLibPath": "/usr/lib/x86_64-linux-gnu",
+      "poolSize": 4
+    }
+  ]
+}
+----
+
+TIP: Set `poolSize` equal to the number of forked parser threads to
+maximize throughput without over-allocating native memory.
+
+== Tess4J vs. TesseractOCRParser
+
+[cols="1,2,2"]
+|===
+|Aspect |`TesseractOCRParser` |`Tess4JParser`
+
+|How it calls Tesseract
+|Spawns a new `tesseract` child process per image
+|Calls the native library in-process via JNA
+
+|Startup overhead
+|Process fork + exec per file
+|One-time JNA initialization; pooled thereafter
+
+|Thread safety
+|Naturally safe (separate processes)
+|Safe via pooled instances
+
+|Crash isolation
+|Child process crashes do not affect the JVM
+|A native crash *will* take down the JVM
+
+|Dependencies
+|`tesseract` binary on `PATH`
+|`libtesseract` + `libleptonica` shared libraries + JNA
+
+|Best for
+|Safety-first deployments, light OCR workloads
+|High-throughput batch processing in Docker / tika-pipes
+|===
+
+== Per-request configuration
+
+Override configuration for a single parse call by placing a `Tess4JConfig`
+on the `ParseContext`:
+
+[source,java]
+----
+Tess4JConfig override = new Tess4JConfig();
+override.setLanguage("fra");
+override.setPageSegMode(6);
+
+ParseContext context = new ParseContext();
+context.set(Tess4JConfig.class, override);
+----
+
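+Note: `dataPath` and `nativeLibPath` cannot be changed at parse time
+(they are locked at parser initialization). Attempting to set them in a
+runtime config will throw `TikaConfigException`. `poolSize` and
+`maxImagePixels` are likewise fixed at initialization; attempting to set
+either in a runtime config will throw `IllegalStateException`.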
+
+@since Apache Tika 4.0
diff --git a/tika-parsers/tika-parsers-ml/pom.xml 
b/tika-parsers/tika-parsers-ml/pom.xml
index d9f982b337..5a508fe2a7 100644
--- a/tika-parsers/tika-parsers-ml/pom.xml
+++ b/tika-parsers/tika-parsers-ml/pom.xml
@@ -37,6 +37,7 @@
     <module>tika-parser-nlp-package</module>
     <module>tika-inference</module>
     <module>tika-parser-vlm-ocr-module</module>
+    <module>tika-parser-tess4j-module</module>
     <module>tika-transcribe-aws</module>
   </modules>
 
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml
new file mode 100644
index 0000000000..32dddef369
--- /dev/null
+++ b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>tika-parsers-ml</artifactId>
+    <groupId>org.apache.tika</groupId>
+    <version>${revision}</version>
+  </parent>
+
+  <artifactId>tika-parser-tess4j-module</artifactId>
+  <name>Apache Tika Tess4J OCR parser module</name>
+
+  <properties>
+    <tess4j.version>5.16.0</tess4j.version>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>net.sourceforge.tess4j</groupId>
+      <artifactId>tess4j</artifactId>
+      <version>${tess4j.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+    </dependency>
+    <!-- test -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifestEntries>
+              
<Automatic-Module-Name>org.apache.tika.parser.ocr.tess4j</Automatic-Module-Name>
+            </manifestEntries>
+          </archive>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+        <configuration>
+          <inputExcludes>
+            <inputExclude>src/test/resources/test-documents/**</inputExclude>
+          </inputExcludes>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <scm>
+    <tag>3.0.0-rc1</tag>
+  </scm>
+</project>
diff --git 
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfig.java
 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfig.java
new file mode 100644
index 0000000000..8e8d8f8e40
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfig.java
@@ -0,0 +1,355 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import java.io.Serializable;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Configuration for {@link Tess4JParser}.
+ * <p>
+ * This class is not thread-safe and must be synchronized externally.
+ * </p>
+ */
+public class Tess4JConfig implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Language dictionary to be used. Default is "eng".
+     */
+    private String language = "eng";
+
+    /**
+     * Path to the tessdata directory containing language data files.
+     * If empty, tess4j will try to find the tessdata directory automatically.
+     */
+    private String dataPath = "";
+
+    /**
+     * Tesseract page segmentation mode. Default is 1.
+     * <ul>
+     *   <li>0 = Orientation and script detection (OSD) only.</li>
+     *   <li>1 = Automatic page segmentation with OSD.</li>
+     *   <li>3 = Fully automatic page segmentation, but no OSD. (Default for 
Tesseract)</li>
+     *   <li>6 = Assume a single uniform block of text.</li>
+     * </ul>
+     */
+    private int pageSegMode = 1;
+
+    /**
+     * Tesseract OCR Engine mode. Default is 3 (Default, based on what is 
available).
+     * <ul>
+     *   <li>0 = Original Tesseract only.</li>
+     *   <li>1 = Neural nets LSTM only.</li>
+     *   <li>2 = Tesseract + LSTM.</li>
+     *   <li>3 = Default, based on what is available.</li>
+     * </ul>
+     */
+    private int ocrEngineMode = 3;
+
+    /**
+     * Maximum file size (in bytes) to submit to OCR. Default is 50 MB.
+     */
+    private long maxFileSizeToOcr = 50 * 1024 * 1024;
+
+    /**
+     * Minimum file size (in bytes) to submit to OCR. Default is 0.
+     */
+    private long minFileSizeToOcr = 0;
+
+    /**
+     * Number of Tesseract instances to keep in the pool. Default is 2.
+     */
+    private int poolSize = 2;
+
+    /**
+     * Maximum time (in seconds) to wait for a Tesseract instance from the 
pool.
+     * Default is 120.
+     */
+    private int timeoutSeconds = 120;
+
+    /**
+     * Runtime switch to turn off OCR.
+     */
+    private boolean skipOcr = false;
+
+    /**
+     * DPI for image rendering. Default is 300.
+     */
+    private int dpi = 300;
+
+    /**
+     * Maximum total pixels (width &times; height) allowed for an image
+     * before OCR is skipped.  This prevents OOM from decompressing
+     * pathologically large images (e.g., a 30,000 &times; 30,000 image
+     * would require ~3.6 GB of heap as a BufferedImage).
+     * <p>
+     * Default is 100,000,000 (100 megapixels, ~10,000 &times; 10,000).
+     * Set to {@code -1} for no limit (not recommended).
+     */
+    private long maxImagePixels = 100_000_000L;
+
+    /**
+     * Path to the directory containing native Tesseract and Leptonica shared 
libraries
+     * (e.g., {@code libtesseract.dylib}, {@code libtesseract.so}).
+     * <p>
+     * On macOS with Homebrew, this is typically {@code /opt/homebrew/lib}.
+     * On Linux, it may be {@code /usr/lib} or {@code /usr/local/lib}.
+     * <p>
+     * If empty, JNA will search the default system library paths.
+     */
+    private String nativeLibPath = "";
+
+    public String getLanguage() {
+        return language;
+    }
+
+    /**
+     * Set tesseract language dictionary to be used. Default is "eng".
+     * Multiple languages may be specified, separated by plus characters.
+     * e.g. "eng+fra"
+     * <p>
+     * Whitespace is ignored during validation and removed from the stored
+     * value, so {@code "eng + fra"} is stored as {@code "eng+fra"}.
+     *
+     * @param language one or more language codes joined with {@code +};
+     *                 a {@code null} value is stored as-is and a blank value
+     *                 is stored as an empty string, neither being validated
+     * @throws IllegalArgumentException if the value starts or ends with
+     *                                  {@code +} or contains a malformed code
+     */
+    public void setLanguage(String language) {
+        Set<String> invalidCodes = new HashSet<>();
+        Set<String> validCodes = new HashSet<>();
+        validateLangs(language, validCodes, invalidCodes);
+        if (!invalidCodes.isEmpty()) {
+            throw new IllegalArgumentException("Invalid language code(s): " + invalidCodes);
+        }
+        // validateLangs() checks a whitespace-stripped copy of the argument;
+        // store that same form so the retained value is the one that was
+        // actually validated rather than the raw input.
+        this.language = language == null ? null : language.replaceAll("\\s", "");
+    }
+
+    public String getDataPath() {
+        return dataPath;
+    }
+
+    /**
+     * Set the path to the tessdata directory.
+     */
+    public void setDataPath(String dataPath) throws TikaConfigException {
+        this.dataPath = dataPath;
+    }
+
+    public int getPageSegMode() {
+        return pageSegMode;
+    }
+
+    /**
+     * Set tesseract page segmentation mode.
+     * Default is 1.
+     */
+    public void setPageSegMode(int pageSegMode) {
+        if (pageSegMode < 0 || pageSegMode > 13) {
+            throw new IllegalArgumentException(
+                    "Invalid page segmentation mode: " + pageSegMode +
+                            ". Must be between 0 and 13.");
+        }
+        this.pageSegMode = pageSegMode;
+    }
+
+    public int getOcrEngineMode() {
+        return ocrEngineMode;
+    }
+
+    /**
+     * Set OCR Engine Mode.
+     * Default is 3.
+     */
+    public void setOcrEngineMode(int ocrEngineMode) {
+        if (ocrEngineMode < 0 || ocrEngineMode > 3) {
+            throw new IllegalArgumentException(
+                    "Invalid OCR Engine Mode: " + ocrEngineMode +
+                            ". Must be between 0 and 3.");
+        }
+        this.ocrEngineMode = ocrEngineMode;
+    }
+
+    public long getMaxFileSizeToOcr() {
+        return maxFileSizeToOcr;
+    }
+
+    public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
+        this.maxFileSizeToOcr = maxFileSizeToOcr;
+    }
+
+    public long getMinFileSizeToOcr() {
+        return minFileSizeToOcr;
+    }
+
+    public void setMinFileSizeToOcr(long minFileSizeToOcr) {
+        this.minFileSizeToOcr = minFileSizeToOcr;
+    }
+
+    public int getPoolSize() {
+        return poolSize;
+    }
+
+    /**
+     * Set the number of Tesseract instances to keep in the pool.
+     * Default is 2. Must be at least 1.
+     */
+    public void setPoolSize(int poolSize) {
+        if (poolSize < 1) {
+            throw new IllegalArgumentException("Pool size must be at least 1, 
got: " + poolSize);
+        }
+        this.poolSize = poolSize;
+    }
+
+    public int getTimeoutSeconds() {
+        return timeoutSeconds;
+    }
+
+    /**
+     * Set maximum time (seconds) to wait for a pooled Tesseract instance.
+     * Default is 120.
+     */
+    public void setTimeoutSeconds(int timeoutSeconds) {
+        this.timeoutSeconds = timeoutSeconds;
+    }
+
+    public boolean isSkipOcr() {
+        return skipOcr;
+    }
+
+    public void setSkipOcr(boolean skipOcr) {
+        this.skipOcr = skipOcr;
+    }
+
+    public int getDpi() {
+        return dpi;
+    }
+
+    /**
+     * Set the DPI for image rendering. Default is 300.
+     */
+    public void setDpi(int dpi) {
+        if (dpi < 72 || dpi > 1200) {
+            throw new IllegalArgumentException("DPI must be between 72 and 
1200, got: " + dpi);
+        }
+        this.dpi = dpi;
+    }
+
+    public long getMaxImagePixels() {
+        return maxImagePixels;
+    }
+
+    /**
+     * Set the maximum total pixels (width &times; height) allowed for
+     * an image before OCR is skipped. Default is 100,000,000
+     * (100 megapixels). Set to {@code -1} for no limit (not recommended).
+     */
+    public void setMaxImagePixels(long maxImagePixels) {
+        if (maxImagePixels < 1 && maxImagePixels != -1) {
+            throw new IllegalArgumentException(
+                    "maxImagePixels must be -1 (no limit) or at least 1, got: "
+                            + maxImagePixels);
+        }
+        this.maxImagePixels = maxImagePixels;
+    }
+
+    public String getNativeLibPath() {
+        return nativeLibPath;
+    }
+
+    /**
+     * Set the path to the directory containing native Tesseract/Leptonica 
shared libraries.
+     * On macOS with Homebrew this is typically {@code /opt/homebrew/lib}.
+     */
+    public void setNativeLibPath(String nativeLibPath) throws 
TikaConfigException {
+        this.nativeLibPath = nativeLibPath;
+    }
+
+    /**
+     * Validates language strings. Languages should conform to tesseract's 
expected format.
+     */
+    static void validateLangs(String language, Set<String> validLangs, 
Set<String> invalidLangs) {
+        if (StringUtils.isBlank(language)) {
+            return;
+        }
+        language = language.replaceAll("\\s", "");
+        if (language.matches("\\+.*|.*\\+")) {
+            throw new IllegalArgumentException(
+                    "Invalid syntax - Can't start or end with +: " + language);
+        }
+        final String[] langs = language.split("\\+");
+        for (String lang : langs) {
+            if (!lang.matches(
+                    
"([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
+                invalidLangs.add(lang + " (invalid syntax)");
+            } else {
+                validLangs.add(lang);
+            }
+        }
+    }
+
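+    /**
+     * Runtime-only Tess4JConfig that prevents modification of paths,
+     * pool settings and image size limits during parse-time configuration.
+     * <p>
+     * <b>Always blocked:</b> {@code dataPath}, {@code nativeLibPath},
+     * {@code poolSize}, {@code maxImagePixels}.
+     * <p>
+     * Paths are blocked to prevent file-system access attacks.
+     * Pool size is blocked because the pool is built at init time and
+     * cannot be resized at runtime.
+     * Image size limits are blocked because they must be configured at
+     * initialization time.
+     * <p>
+     * Blank path values are accepted and ignored; non-blank paths throw
+     * {@link TikaConfigException}, while any attempt to set {@code poolSize}
+     * or {@code maxImagePixels} throws {@link IllegalStateException}.
+     */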
+    public static class RuntimeConfig extends Tess4JConfig {
+
+        public RuntimeConfig() {
+            super();
+        }
+
+        @Override
+        public void setDataPath(String dataPath) throws TikaConfigException {
+            if (!StringUtils.isBlank(dataPath)) {
+                throw new TikaConfigException(
+                        "Cannot modify dataPath at runtime. " +
+                                "Paths must be configured at parser 
initialization time.");
+            }
+        }
+
+        @Override
+        public void setNativeLibPath(String nativeLibPath) throws 
TikaConfigException {
+            if (!StringUtils.isBlank(nativeLibPath)) {
+                throw new TikaConfigException(
+                        "Cannot modify nativeLibPath at runtime. " +
+                                "Paths must be configured at parser 
initialization time.");
+            }
+        }
+
+        @Override
+        public void setPoolSize(int poolSize) {
+            throw new IllegalStateException(
+                    "Cannot modify poolSize at runtime. " +
+                            "The pool is created at initialization time " +
+                            "and cannot be resized.");
+        }
+
+        @Override
+        public void setMaxImagePixels(long maxImagePixels) {
+            throw new IllegalStateException(
+                    "Cannot modify maxImagePixels at runtime. " +
+                            "Image size limits must be configured at " +
+                            "initialization time.");
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JParser.java
 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JParser.java
new file mode 100644
index 0000000000..8796736251
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/main/java/org/apache/tika/parser/ocr/tess4j/Tess4JParser.java
@@ -0,0 +1,516 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.awt.image.BufferedImage;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.TimeUnit;
+import javax.imageio.ImageIO;
+
+import net.sourceforge.tess4j.Tesseract;
+import net.sourceforge.tess4j.TesseractException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import org.apache.tika.config.ConfigDeserializer;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.JsonConfig;
+import org.apache.tika.config.ParseContextConfig;
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.config.TikaTaskTimeout;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * OCR parser using <a href="https://github.com/nguyenq/tess4j";>Tess4J</a>,
+ * which provides a Java JNA wrapper around the native Tesseract library.
+ * <p>
+ * Unlike the command-line {@code TesseractOCRParser}, this parser calls 
Tesseract
+ * in-process via JNA, eliminating the per-file process-spawn overhead.
+ * <p>
+ * Because the native Tesseract handle is <b>not thread-safe</b>, this parser
+ * maintains a configurable pool of {@link Tesseract} instances.  The pool size
+ * is controlled by {@link Tess4JConfig#setPoolSize(int)}.
+ * <p>
+ * Configuration key: {@code "tess4j-parser"}
+ *
+ * @since Apache Tika 4.0
+ */
+@TikaComponent(name = "tess4j-parser")
+public class Tess4JParser implements Parser, Initializable {
+
+    private static final long serialVersionUID = 1L;
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(Tess4JParser.class);
+
+    private static final String OCR = "ocr-";
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    MediaType.image(OCR + "png"),
+                    MediaType.image(OCR + "jpeg"),
+                    MediaType.image(OCR + "tiff"),
+                    MediaType.image(OCR + "bmp"),
+                    MediaType.image(OCR + "gif"),
+                    MediaType.image("jp2"),
+                    MediaType.image("jpx"),
+                    MediaType.image("x-portable-pixmap"),
+                    MediaType.image(OCR + "jp2"),
+                    MediaType.image(OCR + "jpx"),
+                    MediaType.image(OCR + "x-portable-pixmap")
+            )));
+
+    private static volatile boolean HAS_WARNED = false;
+    private static final Object[] LOCK = new Object[0];
+
+    private Tess4JConfig defaultConfig;
+    private transient BlockingQueue<Tesseract> pool;
+    private volatile boolean initialized = false;
+
+    /**
+     * Creates a parser backed by a fresh {@link Tess4JConfig} and initializes it.
+     */
+    public Tess4JParser() throws TikaConfigException {
+        this(new Tess4JConfig());
+    }
+
+    /**
+     * Creates a parser that uses {@code config} as its default configuration
+     * and initializes it immediately.
+     */
+    public Tess4JParser(Tess4JConfig config) throws TikaConfigException {
+        this.defaultConfig = config;
+        initialize();
+    }
+
+    /**
+     * Creates a parser from a JSON configuration by deserializing it into a
+     * {@link Tess4JConfig}.
+     */
+    public Tess4JParser(JsonConfig jsonConfig) throws TikaConfigException {
+        this(ConfigDeserializer.buildConfig(jsonConfig, Tess4JConfig.class));
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        if (!initialized) {
+            return Collections.emptySet();
+        }
+        Tess4JConfig config = context.get(Tess4JConfig.class);
+        if (config != null && config.isSkipOcr()) {
+            return Collections.emptySet();
+        }
+        if (defaultConfig.isSkipOcr()) {
+            return Collections.emptySet();
+        }
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(TikaInputStream tis, ContentHandler handler, Metadata 
metadata,
+                      ParseContext parseContext)
+            throws IOException, SAXException, TikaException {
+
+        Tess4JConfig config = getConfig(parseContext);
+
+        if (!initialized || config.isSkipOcr()) {
+            return;
+        }
+
+        warnOnFirstParse();
+
+        long size = tis.getLength();
+        if (size >= 0 && (size < config.getMinFileSizeToOcr() ||
+                size > config.getMaxFileSizeToOcr())) {
+            return;
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, 
parseContext);
+        xhtml.startDocument();
+
+        Tesseract tesseract = null;
+        long timeoutMillis = TikaTaskTimeout.getTimeoutMillis(
+                parseContext, config.getTimeoutSeconds() * 1000L);
+        try {
+            tesseract = borrowTesseract(timeoutMillis);
+            if (tesseract == null) {
+                throw new TikaException("Timed out waiting for a Tesseract 
instance from the pool");
+            }
+
+            // Apply per-request config if different from defaults
+            applyConfig(tesseract, config);
+
+            // Check image dimensions before full decode to prevent OOM
+            long maxPixels = config.getMaxImagePixels();
+            if (maxPixels > 0) {
+                tis.mark((int) Math.min(tis.getLength() + 1, 1024 * 1024));
+                try {
+                    long pixels = getImagePixels(tis);
+                    if (pixels > maxPixels) {
+                        LOG.warn("Image has {} pixels, exceeding 
maxImagePixels={}. "
+                                + "Skipping OCR.", pixels, maxPixels);
+                        xhtml.endDocument();
+                        return;
+                    }
+                } finally {
+                    tis.reset();
+                }
+            }
+
+            BufferedImage image = readImage(tis);
+            if (image == null) {
+                LOG.warn("Could not read image from stream");
+                xhtml.endDocument();
+                return;
+            }
+
+            String ocrResult = tesseract.doOCR(image);
+
+            // Emit the text as XHTML
+            AttributesImpl attrs = new AttributesImpl();
+            attrs.addAttribute("", "class", "class", "CDATA", "ocr");
+            xhtml.startElement(XHTML, "div", "div", attrs);
+            if (ocrResult != null && !ocrResult.isEmpty()) {
+                xhtml.characters(ocrResult.toCharArray(), 0, 
ocrResult.length());
+            }
+            xhtml.endElement(XHTML, "div", "div");
+
+        } catch (TesseractException e) {
+            throw new TikaException("Tess4J OCR failed", e);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new TikaException("Interrupted while waiting for Tesseract 
instance", e);
+        } finally {
+            if (tesseract != null) {
+                returnTesseract(tesseract);
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Prepares the parser for use by configuring the JNA search path and
+     * filling the instance pool.
+     * <p>
+     * When the default config has {@code skipOcr} set, nothing is loaded and the
+     * parser stays uninitialized. Failures are logged and leave the parser
+     * disabled rather than being propagated to the caller.
+     */
+    @Override
+    public void initialize() throws TikaConfigException {
+        if (defaultConfig.isSkipOcr()) {
+            initialized = false;
+            return;
+        }
+        try {
+            configureNativeLibPath();
+            initPool();
+            initialized = true;
+            LOG.info("Tess4J parser initialized with pool size {}",
+                    defaultConfig.getPoolSize());
+        } catch (UnsatisfiedLinkError | NoClassDefFoundError e) {
+            LOG.warn("Tess4J native library not available: {}. " +
+                    "Tess4JParser will be disabled.", e.getMessage());
+            // Keep the WARN line short; the full trace is available at DEBUG
+            LOG.debug("Tess4J native library load failure", e);
+            initialized = false;
+        } catch (Exception e) {
+            // The trailing throwable makes SLF4J log the stack trace and cause,
+            // which the message alone (possibly null) does not convey
+            LOG.warn("Failed to initialize Tess4J: {}. " +
+                    "Tess4JParser will be disabled.", e.getMessage(), e);
+            initialized = false;
+        }
+    }
+
+    /**
+     * If a native library path is configured, prepend it to the
+     * {@code jna.library.path} system property so that JNA can find
+     * libtesseract and libleptonica on non-Windows platforms.
+     * <p>
+     * The property is left alone when every configured entry is already listed.
+     * Entries are compared whole, so {@code /usr/lib} is not mistaken for being
+     * present just because {@code /usr/lib64} is.
+     */
+    private void configureNativeLibPath() {
+        String nativeLibPath = defaultConfig.getNativeLibPath();
+        if (StringUtils.isBlank(nativeLibPath)) {
+            return;
+        }
+        String separator = System.getProperty("path.separator");
+        String existing = System.getProperty("jna.library.path", "");
+        if (existing.isEmpty()) {
+            System.setProperty("jna.library.path", nativeLibPath);
+        } else {
+            // Quote the separator: String.split takes a regular expression
+            String splitter = java.util.regex.Pattern.quote(separator);
+            boolean alreadyListed = Arrays.asList(existing.split(splitter))
+                    .containsAll(Arrays.asList(nativeLibPath.split(splitter)));
+            if (!alreadyListed) {
+                System.setProperty("jna.library.path",
+                        nativeLibPath + separator + existing);
+            }
+        }
+        LOG.debug("jna.library.path set to: {}", System.getProperty("jna.library.path"));
+    }
+
+    /**
+     * Creates the pool of {@link Tesseract} instances based on the default 
config.
+     */
+    private void initPool() {
+        int size = defaultConfig.getPoolSize();
+        pool = new ArrayBlockingQueue<>(size);
+        for (int i = 0; i < size; i++) {
+            Tesseract tesseract = createTesseract(defaultConfig);
+            pool.add(tesseract);
+        }
+        // Tess4J loads the native library lazily on first doOCR call.
+        // Force it now so UnsatisfiedLinkError is caught by initialize().
+        Tesseract probe = pool.peek();
+        if (probe != null) {
+            try {
+                BufferedImage tiny = new BufferedImage(1, 1,
+                        BufferedImage.TYPE_BYTE_GRAY);
+                probe.doOCR(tiny);
+            } catch (TesseractException e) {
+                // Expected — OCR on a 1x1 image may fail,
+                // but the native library loaded successfully
+            }
+        }
+    }
+
+    /**
+     * Builds a {@link Tesseract} instance with {@code config} already applied.
+     */
+    private Tesseract createTesseract(Tess4JConfig config) {
+        Tesseract instance = new Tesseract();
+        applyConfig(instance, config);
+        return instance;
+    }
+
+    /**
+     * Copies data path (when not blank), language, page segmentation mode and
+     * OCR engine mode from {@code config} onto {@code tesseract}.
+     */
+    private void applyConfig(Tesseract tesseract, Tess4JConfig config) {
+        String dataPath = config.getDataPath();
+        boolean hasDataPath = !StringUtils.isBlank(dataPath);
+        if (hasDataPath) {
+            tesseract.setDatapath(dataPath);
+        }
+        // NOTE(review): config.getDpi() is not forwarded here; confirm whether
+        // it is meant to reach Tesseract.
+        tesseract.setLanguage(config.getLanguage());
+        tesseract.setPageSegMode(config.getPageSegMode());
+        tesseract.setOcrEngineMode(config.getOcrEngineMode());
+    }
+
+    /**
+     * Takes a {@link Tesseract} instance out of the pool, blocking for at most
+     * {@code timeoutMillis}.
+     *
+     * @param timeoutMillis upper bound on the wait, in milliseconds
+     * @return the borrowed instance, or {@code null} when none became available in time
+     * @throws InterruptedException if the waiting thread is interrupted
+     */
+    private Tesseract borrowTesseract(long timeoutMillis) throws InterruptedException {
+        Tesseract available = pool.poll(timeoutMillis, TimeUnit.MILLISECONDS);
+        return available;
+    }
+
+    /**
+     * Hands a borrowed {@link Tesseract} instance back to the pool; if the pool
+     * refuses it, the instance is dropped and a warning is logged.
+     */
+    private void returnTesseract(Tesseract tesseract) {
+        boolean accepted = pool.offer(tesseract);
+        if (accepted) {
+            return;
+        }
+        // A full pool is not expected in normal operation, so just let the instance go
+        LOG.warn("Tesseract pool is full; discarding instance");
+    }
+
+    /**
+     * Decodes the stream into a {@link BufferedImage} via {@link ImageIO#read}.
+     *
+     * @return the decoded image, or whatever {@code ImageIO.read} yields for
+     *         content it cannot decode
+     */
+    private BufferedImage readImage(InputStream is) throws IOException {
+        BufferedImage decoded = ImageIO.read(is);
+        return decoded;
+    }
+
+    /**
+     * Computes width &times; height of the first image in the stream by asking
+     * an {@code ImageReader} for its dimensions.
+     *
+     * @return the pixel count, or {@code -1} when no image input stream or no
+     *         matching reader is available
+     */
+    private long getImagePixels(InputStream is) throws IOException {
+        try (javax.imageio.stream.ImageInputStream imageStream =
+                     ImageIO.createImageInputStream(is)) {
+            if (imageStream != null) {
+                java.util.Iterator<javax.imageio.ImageReader> candidates =
+                        ImageIO.getImageReaders(imageStream);
+                if (candidates.hasNext()) {
+                    javax.imageio.ImageReader headerReader = candidates.next();
+                    try {
+                        headerReader.setInput(imageStream);
+                        // Widen to long before multiplying so large images cannot overflow
+                        long width = headerReader.getWidth(0);
+                        long height = headerReader.getHeight(0);
+                        return width * height;
+                    } finally {
+                        headerReader.dispose();
+                    }
+                }
+            }
+            return -1;
+        }
+    }
+
+    /**
+     * Picks the config for this request. A {@code "tess4j-parser"} JSON config
+     * in the parse context wins, then a {@link Tess4JConfig} object stored in
+     * the context, then the parser's default config.
+     * <p>
+     * The JSON config is first loaded as a {@link Tess4JConfig.RuntimeConfig},
+     * whose setters reject init-only settings, and then loaded again on top of
+     * the defaults.
+     */
+    private Tess4JConfig getConfig(ParseContext parseContext)
+            throws TikaConfigException, IOException {
+
+        if (!parseContext.hasJsonConfig("tess4j-parser")) {
+            Tess4JConfig fromContext = parseContext.get(Tess4JConfig.class);
+            return fromContext != null ? fromContext : defaultConfig;
+        }
+
+        // Loading into RuntimeConfig is what rejects paths and other init-only settings
+        Tess4JConfig.RuntimeConfig validated = ParseContextConfig.getConfig(
+                parseContext,
+                "tess4j-parser",
+                Tess4JConfig.RuntimeConfig.class,
+                new Tess4JConfig.RuntimeConfig());
+        if (validated.isSkipOcr()) {
+            return validated;
+        }
+
+        return ParseContextConfig.getConfig(
+                parseContext,
+                "tess4j-parser",
+                Tess4JConfig.class,
+                defaultConfig);
+    }
+
+    /**
+     * Logs a one-time notice that OCR is running; later calls return without
+     * logging once {@code HAS_WARNED} is set.
+     */
+    private void warnOnFirstParse() {
+        if (HAS_WARNED) {
+            return;
+        }
+        synchronized (LOCK) {
+            // Re-check under the lock: another thread may have logged in the meantime
+            if (HAS_WARNED) {
+                return;
+            }
+            LOG.info("Tess4J OCR is being invoked. " +
+                    "This can add greatly to processing time. " +
+                    "If you do not want OCR to be applied to your files, " +
+                    "configure skipOcr=true.");
+            HAS_WARNED = true;
+        }
+    }
+
+    // -- Delegating getters/setters for parser-level configuration --
+
+    /** Returns the language of the default config. */
+    public String getLanguage() {
+        return defaultConfig.getLanguage();
+    }
+
+    /** Sets the language on the default config. */
+    public void setLanguage(String language) {
+        defaultConfig.setLanguage(language);
+    }
+
+    /** Returns the data path of the default config. */
+    public String getDataPath() {
+        return defaultConfig.getDataPath();
+    }
+
+    /** Sets the data path on the default config. */
+    public void setDataPath(String dataPath) throws TikaConfigException {
+        defaultConfig.setDataPath(dataPath);
+    }
+
+    /** Returns the page segmentation mode of the default config. */
+    public int getPageSegMode() {
+        return defaultConfig.getPageSegMode();
+    }
+
+    /** Sets the page segmentation mode on the default config. */
+    public void setPageSegMode(int pageSegMode) {
+        defaultConfig.setPageSegMode(pageSegMode);
+    }
+
+    /** Returns the OCR engine mode of the default config. */
+    public int getOcrEngineMode() {
+        return defaultConfig.getOcrEngineMode();
+    }
+
+    /** Sets the OCR engine mode on the default config. */
+    public void setOcrEngineMode(int ocrEngineMode) {
+        defaultConfig.setOcrEngineMode(ocrEngineMode);
+    }
+
+    /** Returns the maximum file size to OCR from the default config. */
+    public long getMaxFileSizeToOcr() {
+        return defaultConfig.getMaxFileSizeToOcr();
+    }
+
+    /** Sets the maximum file size to OCR on the default config. */
+    public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
+        defaultConfig.setMaxFileSizeToOcr(maxFileSizeToOcr);
+    }
+
+    /** Returns the minimum file size to OCR from the default config. */
+    public long getMinFileSizeToOcr() {
+        return defaultConfig.getMinFileSizeToOcr();
+    }
+
+    /** Sets the minimum file size to OCR on the default config. */
+    public void setMinFileSizeToOcr(long minFileSizeToOcr) {
+        defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr);
+    }
+
+    /** Returns the pool size of the default config. */
+    public int getPoolSize() {
+        return defaultConfig.getPoolSize();
+    }
+
+    /**
+     * Sets the pool size on the default config. This setter only updates the
+     * config object; it does not touch the existing pool.
+     */
+    public void setPoolSize(int poolSize) {
+        defaultConfig.setPoolSize(poolSize);
+    }
+
+    /** Returns the timeout, in seconds, of the default config. */
+    public int getTimeoutSeconds() {
+        return defaultConfig.getTimeoutSeconds();
+    }
+
+    /** Sets the timeout, in seconds, on the default config. */
+    public void setTimeoutSeconds(int timeoutSeconds) {
+        defaultConfig.setTimeoutSeconds(timeoutSeconds);
+    }
+
+    /** Returns whether the default config asks for OCR to be skipped. */
+    public boolean isSkipOcr() {
+        return defaultConfig.isSkipOcr();
+    }
+
+    /** Sets the skip-OCR flag on the default config. */
+    public void setSkipOcr(boolean skipOcr) {
+        defaultConfig.setSkipOcr(skipOcr);
+    }
+
+    /** Returns the DPI value of the default config. */
+    public int getDpi() {
+        return defaultConfig.getDpi();
+    }
+
+    /** Sets the DPI value on the default config. */
+    public void setDpi(int dpi) {
+        defaultConfig.setDpi(dpi);
+    }
+
+    /** Returns the native library path of the default config. */
+    public String getNativeLibPath() {
+        return defaultConfig.getNativeLibPath();
+    }
+
+    /** Sets the native library path on the default config. */
+    public void setNativeLibPath(String nativeLibPath) throws TikaConfigException {
+        defaultConfig.setNativeLibPath(nativeLibPath);
+    }
+
+    /** Returns the maximum image pixel count of the default config. */
+    public long getMaxImagePixels() {
+        return defaultConfig.getMaxImagePixels();
+    }
+
+    /** Sets the maximum image pixel count on the default config. */
+    public void setMaxImagePixels(long maxImagePixels) {
+        defaultConfig.setMaxImagePixels(maxImagePixels);
+    }
+
+    /**
+     * Tells whether {@link #initialize()} completed successfully, i.e. the
+     * pool was created and the Tess4J native library could be used.
+     */
+    public boolean isInitialized() {
+        return initialized;
+    }
+
+    /**
+     * Exposes the default configuration object. Package-private for tests.
+     */
+    Tess4JConfig getDefaultConfig() {
+        return defaultConfig;
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigTest.java
 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigTest.java
new file mode 100644
index 0000000000..f92ab8b53a
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigTest.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.exception.TikaConfigException;
+
+/**
+ * Unit tests for {@link Tess4JConfig}: default values, setter validation and
+ * the data-path restriction of {@link Tess4JConfig.RuntimeConfig}.
+ */
+public class Tess4JConfigTest {
+
+    @Test
+    public void testDefaults() {
+        Tess4JConfig defaults = new Tess4JConfig();
+        assertEquals("eng", defaults.getLanguage());
+        assertEquals("", defaults.getDataPath());
+        assertEquals(1, defaults.getPageSegMode());
+        assertEquals(3, defaults.getOcrEngineMode());
+        assertEquals(50 * 1024 * 1024, defaults.getMaxFileSizeToOcr());
+        assertEquals(0, defaults.getMinFileSizeToOcr());
+        assertEquals(2, defaults.getPoolSize());
+        assertEquals(120, defaults.getTimeoutSeconds());
+        assertFalse(defaults.isSkipOcr());
+        assertEquals(300, defaults.getDpi());
+    }
+
+    @Test
+    public void testSetLanguageValid() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        cfg.setLanguage("eng+fra");
+        assertEquals("eng+fra", cfg.getLanguage());
+    }
+
+    @Test
+    public void testSetLanguageInvalid() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> cfg.setLanguage("xy"));
+    }
+
+    @Test
+    public void testSetLanguageLeadingPlus() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> cfg.setLanguage("+eng"));
+    }
+
+    @Test
+    public void testSetPageSegModeValid() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        cfg.setPageSegMode(6);
+        assertEquals(6, cfg.getPageSegMode());
+    }
+
+    @Test
+    public void testSetPageSegModeInvalid() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        // One value above and one below the accepted range
+        assertThrows(IllegalArgumentException.class, () -> cfg.setPageSegMode(14));
+        assertThrows(IllegalArgumentException.class, () -> cfg.setPageSegMode(-1));
+    }
+
+    @Test
+    public void testSetOcrEngineModeValid() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        cfg.setOcrEngineMode(1);
+        assertEquals(1, cfg.getOcrEngineMode());
+    }
+
+    @Test
+    public void testSetOcrEngineModeInvalid() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> cfg.setOcrEngineMode(4));
+        assertThrows(IllegalArgumentException.class, () -> cfg.setOcrEngineMode(-1));
+    }
+
+    @Test
+    public void testSetPoolSizeValid() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        cfg.setPoolSize(4);
+        assertEquals(4, cfg.getPoolSize());
+    }
+
+    @Test
+    public void testSetPoolSizeInvalid() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> cfg.setPoolSize(0));
+        assertThrows(IllegalArgumentException.class, () -> cfg.setPoolSize(-1));
+    }
+
+    @Test
+    public void testSetDpiValid() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        cfg.setDpi(150);
+        assertEquals(150, cfg.getDpi());
+    }
+
+    @Test
+    public void testSetDpiInvalid() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        assertThrows(IllegalArgumentException.class, () -> cfg.setDpi(50));
+        assertThrows(IllegalArgumentException.class, () -> cfg.setDpi(1500));
+    }
+
+    @Test
+    public void testSkipOcr() {
+        Tess4JConfig cfg = new Tess4JConfig();
+        cfg.setSkipOcr(true);
+        assertTrue(cfg.isSkipOcr());
+    }
+
+    @Test
+    public void testRuntimeConfigBlocksDataPath() {
+        Tess4JConfig.RuntimeConfig runtime = new Tess4JConfig.RuntimeConfig();
+        assertThrows(TikaConfigException.class,
+                () -> runtime.setDataPath("/some/path"));
+    }
+
+    @Test
+    public void testRuntimeConfigAllowsEmptyDataPath() throws TikaConfigException {
+        Tess4JConfig.RuntimeConfig runtime = new Tess4JConfig.RuntimeConfig();
+        // An empty value must be accepted and leave the default in place
+        runtime.setDataPath("");
+        assertEquals("", runtime.getDataPath());
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JParserTest.java
 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JParserTest.java
new file mode 100644
index 0000000000..e347007293
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JParserTest.java
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+
+public class Tess4JParserTest {
+
+    private static Tess4JParser parser;
+    private static boolean tess4jAvailable;
+
+    /**
+     * Builds the shared parser once and records whether Tess4J initialized,
+     * so individual tests can be skipped when it is unavailable.
+     */
+    @BeforeAll
+    static void setUp() throws Exception {
+        Tess4JConfig sharedConfig = new Tess4JConfig();
+        sharedConfig.setPoolSize(2);
+        sharedConfig.setDataPath(getTessDataPath());
+        sharedConfig.setNativeLibPath(getNativeLibPath());
+
+        parser = new Tess4JParser(sharedConfig);
+        tess4jAvailable = parser.isInitialized();
+    }
+
+    /**
+     * Locates tessdata: the {@code tess4j.datapath} system property wins,
+     * otherwise the first existing directory among a few well-known locations.
+     *
+     * @return the path found, or an empty string when none exists
+     */
+    private static String getTessDataPath() {
+        String configured = System.getProperty("tess4j.datapath");
+        if (configured != null && !configured.isEmpty()) {
+            return configured;
+        }
+        // Probe order: Homebrew on macOS first, then the common Linux locations
+        String[] candidates = {
+                "/opt/homebrew/share/tessdata",
+                "/usr/share/tesseract-ocr/5/tessdata",
+                "/usr/share/tessdata"
+        };
+        for (String candidate : candidates) {
+            java.io.File dir = new java.io.File(candidate);
+            if (dir.isDirectory()) {
+                return dir.getAbsolutePath();
+            }
+        }
+        return "";
+    }
+
+    /**
+     * Locates the native library directory: the {@code tess4j.native.lib.path}
+     * system property wins, otherwise the Homebrew lib directory if present.
+     *
+     * @return the path found, or an empty string when none exists
+     */
+    private static String getNativeLibPath() {
+        String configured = System.getProperty("tess4j.native.lib.path");
+        boolean hasProperty = configured != null && !configured.isEmpty();
+        if (hasProperty) {
+            return configured;
+        }
+        // Common Homebrew location on macOS
+        java.io.File homebrewDir = new java.io.File("/opt/homebrew/lib");
+        return homebrewDir.isDirectory() ? homebrewDir.getAbsolutePath() : "";
+    }
+
+    /**
+     * Checks that the parser-level getters and setters read and write the
+     * underlying config. Uses skipOcr so no native library is needed.
+     */
+    @Test
+    public void testDelegatingGettersSetters() throws Exception {
+        Tess4JConfig skipping = new Tess4JConfig();
+        skipping.setPoolSize(1);
+        skipping.setSkipOcr(true);
+        Tess4JParser delegating = new Tess4JParser(skipping);
+
+        assertEquals("eng", delegating.getLanguage());
+        delegating.setLanguage("fra");
+        assertEquals("fra", delegating.getLanguage());
+
+        assertEquals(1, delegating.getPageSegMode());
+        delegating.setPageSegMode(3);
+        assertEquals(3, delegating.getPageSegMode());
+
+        assertEquals(3, delegating.getOcrEngineMode());
+        delegating.setOcrEngineMode(1);
+        assertEquals(1, delegating.getOcrEngineMode());
+
+        assertEquals(120, delegating.getTimeoutSeconds());
+        delegating.setTimeoutSeconds(60);
+        assertEquals(60, delegating.getTimeoutSeconds());
+
+        assertEquals(300, delegating.getDpi());
+        delegating.setDpi(150);
+        assertEquals(150, delegating.getDpi());
+
+        assertTrue(delegating.isSkipOcr());
+    }
+
+    /**
+     * A skipOcr config placed in the parse context must make the parser
+     * report no supported types.
+     */
+    @Test
+    public void testSkipOcrReturnEmptyTypes() throws Exception {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        Tess4JConfig skipping = new Tess4JConfig();
+        skipping.setSkipOcr(true);
+        ParseContext skippingContext = new ParseContext();
+        skippingContext.set(Tess4JConfig.class, skipping);
+
+        assertEquals(Collections.emptySet(), parser.getSupportedTypes(skippingContext));
+    }
+
+    /**
+     * With an empty parse context, an initialized parser must advertise the
+     * OCR media types, including PNG and JPEG.
+     */
+    @Test
+    public void testSupportedTypesWhenInitialized() {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        ParseContext emptyContext = new ParseContext();
+        assertFalse(parser.getSupportedTypes(emptyContext).isEmpty());
+        assertTrue(parser.getSupportedTypes(emptyContext)
+                .contains(MediaType.image("ocr-png")));
+        assertTrue(parser.getSupportedTypes(emptyContext)
+                .contains(MediaType.image("ocr-jpeg")));
+    }
+
+    /**
+     * Parses the bundled JPEG and expects recognizable text in the output.
+     */
+    @Test
+    public void testOcrJpeg() throws Exception {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+        BodyContentHandler body = new BodyContentHandler();
+
+        try (InputStream raw = getClass().getResourceAsStream("/test-documents/testOCR.jpg");
+             TikaInputStream tis = TikaInputStream.get(raw)) {
+            parser.parse(tis, body, metadata, context);
+        }
+
+        String text = body.toString();
+        assertTrue(text.contains("OCR") || text.contains("Testing"),
+                "Expected OCR output to contain recognizable text, got: " + text);
+    }
+
+    /**
+     * A parser built with skipOcr must produce no text for the bundled JPEG.
+     */
+    @Test
+    public void testSkipOcrReturnsNoContent() throws Exception {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        Tess4JConfig skipping = new Tess4JConfig();
+        skipping.setPoolSize(1);
+        skipping.setDataPath(getTessDataPath());
+        skipping.setNativeLibPath(getNativeLibPath());
+        skipping.setSkipOcr(true);
+        Tess4JParser skippingParser = new Tess4JParser(skipping);
+
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+        BodyContentHandler body = new BodyContentHandler();
+
+        try (InputStream raw = getClass().getResourceAsStream("/test-documents/testOCR.jpg");
+             TikaInputStream tis = TikaInputStream.get(raw)) {
+            skippingParser.parse(tis, body, metadata, context);
+        }
+
+        assertEquals("", body.toString().trim());
+    }
+
+    /**
+     * A per-request maxFileSizeToOcr of one byte must cause the bundled JPEG
+     * to be skipped, leaving the output empty.
+     */
+    @Test
+    public void testFileSizeFilter() throws Exception {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        // A one-byte ceiling is smaller than the test image
+        Tess4JConfig tinyLimit = new Tess4JConfig();
+        tinyLimit.setMaxFileSizeToOcr(1);
+        ParseContext context = new ParseContext();
+        context.set(Tess4JConfig.class, tinyLimit);
+
+        Metadata metadata = new Metadata();
+        BodyContentHandler body = new BodyContentHandler();
+
+        try (InputStream raw = getClass().getResourceAsStream("/test-documents/testOCR.jpg");
+             TikaInputStream tis = TikaInputStream.get(raw)) {
+            parser.parse(tis, body, metadata, context);
+        }
+
+        assertEquals("", body.toString().trim());
+    }
+
+    @Test
+    public void testConcurrentOcr() throws Exception {
+        assumeTrue(tess4jAvailable, "Tess4J not available");
+
+        int numThreads = 4;
+        ExecutorService executor = Executors.newFixedThreadPool(numThreads);
+        CountDownLatch latch = new CountDownLatch(numThreads);
+        AtomicInteger successCount = new AtomicInteger(0);
+        AtomicInteger failCount = new AtomicInteger(0);
+
+        for (int i = 0; i < numThreads; i++) {
+            executor.submit(() -> {
+                try {
+                    BodyContentHandler handler = new BodyContentHandler();
+                    Metadata metadata = new Metadata();
+                    ParseContext context = new ParseContext();
+
+                    try (InputStream is = getClass()
+                            
.getResourceAsStream("/test-documents/testOCR.jpg");
+                         TikaInputStream tis = TikaInputStream.get(is)) {
+                        parser.parse(tis, handler, metadata, context);
+                    }
+
+                    String content = handler.toString();
+                    if (content != null && !content.trim().isEmpty()) {
+                        successCount.incrementAndGet();
+                    } else {
+                        failCount.incrementAndGet();
+                    }
+                } catch (Exception e) {
+                    failCount.incrementAndGet();
+                } finally {
+                    latch.countDown();
+                }
+            });
+        }
+
+        assertTrue(latch.await(120, TimeUnit.SECONDS), "Timed out waiting for 
threads");
+        executor.shutdown();
+
+        assertEquals(numThreads, successCount.get(),
+                "All threads should have succeeded; failures=" + 
failCount.get());
+    }
+}
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/test-documents/testOCR.jpg b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/test-documents/testOCR.jpg
new file mode 100644
index 0000000000..b3f1df3636
Binary files /dev/null and b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/test-documents/testOCR.jpg differ

(tika) branch main updated: TIKA-4667 - add Tess4J in-process OCR parser and docs (#2615)

Reply via email to