This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8c0329132a TIKA-4676 -- refactor inference and fix endian bug in ESEmitter (#2653)
8c0329132a is described below
commit 8c0329132a75ad749e2e8644eed4dff455144d79
Author: Tim Allison <[email protected]>
AuthorDate: Thu Feb 26 19:36:05 2026 -0500
TIKA-4676 -- refactor inference and fix endian bug in ESEmitter (#2653)
---
tika-app/pom.xml | 12 +
tika-parsers/pom.xml | 2 +
.../tika-inference => tika-http-jdk}/pom.xml | 74 ++----
.../java/org/apache/tika/http/TikaHttpClient.java | 150 ++++++++++++
.../org/apache/tika/http/TikaTestHttpServer.java | 268 +++++++++++++++++++++
tika-parsers/tika-parsers-ml/pom.xml | 2 +-
.../tika-parsers-ml/tika-inference/pom.xml | 48 +---
.../tika/inference/OpenAIEmbeddingFilter.java | 64 +----
.../tika/inference/OpenAIImageEmbeddingParser.java | 75 +-----
.../apache/tika/inference/VectorSerializer.java | 18 +-
.../tika/inference/OpenAIEmbeddingFilterTest.java | 52 ++--
.../inference/OpenAIImageEmbeddingParserTest.java | 110 +++------
.../pom.xml | 51 ++--
.../apache/tika/parser/vlm/AbstractVLMParser.java | 101 ++++----
.../apache/tika/parser/vlm/ClaudeVLMParser.java | 21 +-
.../apache/tika/parser/vlm/GeminiVLMParser.java | 14 +-
.../tika/parser/vlm/MarkdownToXHTMLEmitter.java | 0
.../apache/tika/parser/vlm/OpenAIVLMParser.java | 19 +-
.../org/apache/tika/parser/vlm/VLMOCRConfig.java | 0
.../tika/parser/vlm/ClaudeVLMParserTest.java | 62 +++--
.../tika/parser/vlm/GeminiVLMParserTest.java | 64 +++--
.../parser/vlm/MarkdownToXHTMLEmitterTest.java | 0
.../tika/parser/vlm/OpenAIVLMParserTest.java | 65 +++--
tika-server/tika-server-standard/pom.xml | 12 +
24 files changed, 732 insertions(+), 552 deletions(-)
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 3d117bc17a..5fad464dc3 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -40,6 +40,18 @@
<artifactId>tika-parsers-standard-package</artifactId>
<version>${project.version}</version>
</dependency>
+ <!-- inference: text embeddings and CLIP image embeddings -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-inference</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <!-- VLM: OpenAI-compatible, Gemini, and Claude vision parsers -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-vlm</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-handler-boilerpipe</artifactId>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 47ee4a1a32..9c63396976 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -32,6 +32,8 @@
<packaging>pom</packaging>
<modules>
+ <!-- shared JDK HTTP client for parser modules (zero runtime deps) -->
+ <module>tika-http-jdk</module>
<!-- basic parsers - avoid network dependent parsers, native code -->
<module>tika-parsers-standard</module>
<!-- allow network dependent parsers and native code -->
diff --git a/tika-parsers/tika-parsers-ml/tika-inference/pom.xml
b/tika-parsers/tika-http-jdk/pom.xml
similarity index 52%
copy from tika-parsers/tika-parsers-ml/tika-inference/pom.xml
copy to tika-parsers/tika-http-jdk/pom.xml
index f56dcc19ad..1b3fa2e405 100644
--- a/tika-parsers/tika-parsers-ml/tika-inference/pom.xml
+++ b/tika-parsers/tika-http-jdk/pom.xml
@@ -23,64 +23,31 @@
<modelVersion>4.0.0</modelVersion>
<parent>
- <artifactId>tika-parsers-ml</artifactId>
+ <artifactId>tika-parsers</artifactId>
<groupId>org.apache.tika</groupId>
<version>${revision}</version>
</parent>
- <artifactId>tika-inference</artifactId>
- <name>Apache Tika inference module</name>
-
- <properties>
- <okhttp.version>5.3.2</okhttp.version>
- </properties>
-
- <dependencyManagement>
- <dependencies>
- <!-- align Kotlin stdlib versions pulled by OkHttp and Okio -->
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib-jdk8</artifactId>
- <version>2.3.10</version>
- </dependency>
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib</artifactId>
- <version>2.3.10</version>
- </dependency>
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib-jdk7</artifactId>
- <version>2.3.10</version>
- </dependency>
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib-common</artifactId>
- <version>2.3.10</version>
- </dependency>
- </dependencies>
- </dependencyManagement>
+ <artifactId>tika-http-jdk</artifactId>
+ <name>Apache Tika JDK HTTP client</name>
+ <description>
+ Thin wrapper around java.net.http.HttpClient for use by Tika parser
+ modules that make outbound REST calls (embedding APIs, VLM endpoints,
+ etc.). Has zero runtime dependencies beyond tika-core and the JDK.
+ </description>
<dependencies>
<dependency>
- <groupId>com.squareup.okhttp3</groupId>
- <artifactId>okhttp-jvm</artifactId>
- <version>${okhttp.version}</version>
- </dependency>
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-databind</artifactId>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
</dependency>
<!-- test -->
<dependency>
- <groupId>com.squareup.okhttp3</groupId>
- <artifactId>mockwebserver</artifactId>
- <version>${okhttp.version}</version>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
@@ -93,17 +60,24 @@
<configuration>
<archive>
<manifestEntries>
-
<Automatic-Module-Name>org.apache.tika.inference</Automatic-Module-Name>
+
<Automatic-Module-Name>org.apache.tika.http</Automatic-Module-Name>
</manifestEntries>
</archive>
</configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
</plugin>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
<configuration>
<inputExcludes>
- <inputExclude>src/test/resources/test-documents/**</inputExclude>
+ <inputExclude>src/test/resources/**</inputExclude>
</inputExcludes>
</configuration>
</plugin>
@@ -111,6 +85,6 @@
</build>
<scm>
- <tag>3.0.0-rc1</tag>
+ <tag>HEAD</tag>
</scm>
</project>
diff --git
a/tika-parsers/tika-http-jdk/src/main/java/org/apache/tika/http/TikaHttpClient.java
b/tika-parsers/tika-http-jdk/src/main/java/org/apache/tika/http/TikaHttpClient.java
new file mode 100644
index 0000000000..ce5418a4f4
--- /dev/null
+++
b/tika-parsers/tika-http-jdk/src/main/java/org/apache/tika/http/TikaHttpClient.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.http;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.nio.charset.StandardCharsets;
+import java.time.Duration;
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Lightweight HTTP client for Tika parser modules that call external REST
+ * endpoints (embedding APIs, VLM services, etc.).
+ * <p>
+ * Built on {@link java.net.http.HttpClient} with a daemon thread executor
+ * so the JVM — including forked {@code PipesServer} processes — shuts down
+ * cleanly without waiting for idle HTTP threads.
+ * <p>
+ * This class has no runtime dependencies beyond the JDK and {@code tika-core}.
+ * Obtain an instance via {@link #build(int)} and close it when done to release
+ * the underlying executor.
+ *
+ * @since Apache Tika 4.0
+ */
+public class TikaHttpClient implements Closeable {
+
+ private static final String JSON_CONTENT_TYPE = "application/json;
charset=utf-8";
+
+ private final HttpClient httpClient;
+ private final ExecutorService executor;
+ private final int defaultTimeoutSeconds;
+
+ private TikaHttpClient(HttpClient httpClient, ExecutorService executor,
+ int defaultTimeoutSeconds) {
+ this.httpClient = httpClient;
+ this.executor = executor;
+ this.defaultTimeoutSeconds = defaultTimeoutSeconds;
+ }
+
+ /**
+ * Create a new {@code TikaHttpClient} with a daemon-thread executor.
+ *
+ * @param connectTimeoutSeconds TCP connection timeout in seconds; also used as the default request timeout
+ */
+ public static TikaHttpClient build(int connectTimeoutSeconds) {
+ ExecutorService executor = Executors.newCachedThreadPool(r -> {
+ Thread t = new Thread(r, "tika-http-jdk");
+ t.setDaemon(true);
+ return t;
+ });
+ HttpClient client = HttpClient.newBuilder()
+ .executor(executor)
+ .connectTimeout(Duration.ofSeconds(connectTimeoutSeconds))
+ .followRedirects(HttpClient.Redirect.NORMAL)
+ .version(HttpClient.Version.HTTP_1_1)
+ .build();
+ return new TikaHttpClient(client, executor, connectTimeoutSeconds);
+ }
+
+ /**
+ * POST a JSON body to {@code url} and return the response body as a string.
+ *
+ * @param url target URL
+ * @param jsonBody request body (UTF-8 JSON)
+ * @param headers additional HTTP headers (e.g. {@code Authorization})
+ * @param timeoutSeconds request timeout in seconds; {@code 0} or less uses the default (the connect timeout given to {@code build})
+ * @return response body string
+ * @throws IOException on network error
+ * @throws TikaException on non-2xx HTTP status
+ */
+ public String postJson(String url, String jsonBody, Map<String, String>
headers,
+ int timeoutSeconds) throws IOException,
TikaException {
+ HttpRequest.Builder builder = HttpRequest.newBuilder()
+ .uri(URI.create(url))
+ .timeout(Duration.ofSeconds(timeoutSeconds > 0
+ ? timeoutSeconds : defaultTimeoutSeconds))
+ .header("Content-Type", JSON_CONTENT_TYPE)
+ .POST(HttpRequest.BodyPublishers.ofString(jsonBody,
StandardCharsets.UTF_8));
+
+ headers.forEach(builder::header);
+
+ return send(builder.build());
+ }
+
+ /**
+ * GET {@code url} and return the response body as a string.
+ * Useful for health-check probes at init time.
+ *
+ * @param url target URL
+ * @param headers additional HTTP headers
+ * @param timeoutSeconds request timeout in seconds; {@code 0} or less uses the default (the connect timeout given to {@code build})
+ * @return response body string
+ * @throws IOException on network error
+ * @throws TikaException on non-2xx HTTP status
+ */
+ public String get(String url, Map<String, String> headers,
+ int timeoutSeconds) throws IOException, TikaException {
+ HttpRequest.Builder builder = HttpRequest.newBuilder()
+ .uri(URI.create(url))
+ .timeout(Duration.ofSeconds(timeoutSeconds > 0
+ ? timeoutSeconds : defaultTimeoutSeconds))
+ .GET();
+
+ headers.forEach(builder::header);
+
+ return send(builder.build());
+ }
+
+ private String send(HttpRequest request) throws IOException, TikaException
{
+ try {
+ HttpResponse<String> response = httpClient.send(
+ request,
HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8));
+ if (response.statusCode() < 200 || response.statusCode() >= 300) {
+ throw new TikaException("HTTP " + response.statusCode()
+ + " from " + request.uri() + ": " + response.body());
+ }
+ return response.body();
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ throw new IOException("HTTP request interrupted: " +
request.uri(), e);
+ }
+ }
+
+ @Override
+ public void close() {
+ executor.shutdown();
+ }
+}
diff --git
a/tika-parsers/tika-http-jdk/src/test/java/org/apache/tika/http/TikaTestHttpServer.java
b/tika-parsers/tika-http-jdk/src/test/java/org/apache/tika/http/TikaTestHttpServer.java
new file mode 100644
index 0000000000..2debc2a753
--- /dev/null
+++
b/tika-parsers/tika-http-jdk/src/test/java/org/apache/tika/http/TikaTestHttpServer.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.http;
+
+import java.io.BufferedReader;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Minimal mock HTTP/1.1 server for unit tests, backed by a plain
+ * {@link ServerSocket}. Has no dependencies outside the JDK.
+ * <p>
+ * Drop-in replacement for OkHttp's {@code MockWebServer} in Tika unit tests.
+ * <p>
+ * Usage:
+ * <pre>{@code
+ * try (TikaTestHttpServer server = new TikaTestHttpServer()) {
+ * server.enqueue(new MockResponse(200, "{\"data\":[]}"));
+ * // configure code under test to use server.url()
+ * RecordedRequest req = server.takeRequest();
+ * assertEquals("POST", req.method());
+ * }
+ * }</pre>
+ *
+ * @since Apache Tika 4.0
+ */
+public class TikaTestHttpServer implements Closeable {
+
+ /** A pre-programmed response to return for the next incoming request. */
+ public record MockResponse(int status, String body) {}
+
+ /** A captured incoming HTTP request. */
+ public record RecordedRequest(String method, String path,
+ Map<String, String> headers, String body) {
+ /** Returns the header value for {@code name} (case-insensitive), or {@code null}. */
+ public String header(String name) {
+ return headers.get(name.toLowerCase(java.util.Locale.ROOT));
+ }
+ }
+
+ private final ServerSocket serverSocket;
+ private final ExecutorService executor;
+ private final BlockingQueue<MockResponse> responses = new
LinkedBlockingQueue<>();
+ private final BlockingQueue<RecordedRequest> requests = new
LinkedBlockingQueue<>();
+ private final AtomicInteger requestCount = new AtomicInteger(0);
+ private volatile boolean running = true;
+
+ public TikaTestHttpServer() throws IOException {
+ serverSocket = new ServerSocket(0);
+ executor = Executors.newCachedThreadPool(r -> {
+ Thread t = new Thread(r, "tika-test-http");
+ t.setDaemon(true);
+ return t;
+ });
+ executor.submit(this::acceptLoop);
+ }
+
+ private void acceptLoop() {
+ while (running) {
+ try {
+ Socket socket = serverSocket.accept();
+ executor.submit(() -> handleConnection(socket));
+ } catch (IOException e) {
+ if (running) {
+ // unexpected error while accepting
+ Thread.currentThread().interrupt();
+ }
+ }
+ }
+ }
+
+ private void handleConnection(Socket socket) {
+ try (socket) {
+ // Use a single BufferedReader for the entire connection — reading
+ // body via the same reader avoids the buffered-read-ahead pitfall
+ // where a raw InputStream read would miss bytes already buffered.
+ // Body is expected to be JSON. The reader decodes ISO-8859-1 (one char per byte), so Content-Length can be counted in chars, but non-ASCII UTF-8 is not re-decoded.
+ BufferedReader reader = new BufferedReader(
+ new InputStreamReader(socket.getInputStream(),
+ StandardCharsets.ISO_8859_1));
+ OutputStream out = socket.getOutputStream();
+
+ // Parse request line: METHOD path HTTP/1.x
+ String requestLine = reader.readLine();
+ if (requestLine == null || requestLine.isBlank()) {
+ return;
+ }
+ String[] parts = requestLine.split(" ", 3);
+ String method = parts[0];
+ String path = parts.length > 1 ? parts[1] : "/";
+
+ // Parse headers
+ Map<String, String> headers = new HashMap<>();
+ int contentLength = 0;
+ String line;
+ while ((line = reader.readLine()) != null && !line.isEmpty()) {
+ int colon = line.indexOf(':');
+ if (colon > 0) {
+ String name = line.substring(0, colon).trim()
+ .toLowerCase(java.util.Locale.ROOT);
+ String value = line.substring(colon + 1).trim();
+ headers.put(name, value);
+ if ("content-length".equals(name)) {
+ try {
+ contentLength = Integer.parseInt(value);
+ } catch (NumberFormatException ignored) {
+ // ignore
+ }
+ }
+ }
+ }
+
+ // Read body through the same BufferedReader to avoid consuming bytes
+ // from the underlying stream that are already buffered in the reader.
+ String body = "";
+ String transferEncoding =
headers.getOrDefault("transfer-encoding", "");
+ if
(transferEncoding.toLowerCase(java.util.Locale.ROOT).contains("chunked")) {
+ body = readChunkedFromReader(reader);
+ } else if (contentLength > 0) {
+ char[] bodyChars = new char[contentLength];
+ int read = 0;
+ while (read < contentLength) {
+ int n = reader.read(bodyChars, read, contentLength - read);
+ if (n < 0) {
+ break;
+ }
+ read += n;
+ }
+ body = new String(bodyChars, 0, read);
+ }
+
+ requests.add(new RecordedRequest(method, path, headers, body));
+ requestCount.incrementAndGet();
+
+ // Send response
+ MockResponse resp = responses.poll();
+ if (resp == null) {
+ resp = new MockResponse(500, "{\"error\":\"no response
queued\"}");
+ }
+
+ byte[] responseBytes =
resp.body().getBytes(StandardCharsets.UTF_8);
+ String statusText = resp.status() == 200 ? "OK"
+ : resp.status() == 500 ? "Internal Server Error"
+ : String.valueOf(resp.status());
+ String responseHeaders =
+ "HTTP/1.1 " + resp.status() + " " + statusText + "\r\n"
+ + "Content-Type: application/json\r\n"
+ + "Content-Length: " + responseBytes.length + "\r\n"
+ + "Connection: close\r\n"
+ + "\r\n";
+ out.write(responseHeaders.getBytes(StandardCharsets.US_ASCII));
+ out.write(responseBytes);
+ out.flush();
+ } catch (IOException e) {
+ // connection closed or error; ignore in test context
+ }
+ }
+
+ private static String readChunkedFromReader(BufferedReader reader) throws
IOException {
+ StringBuilder sb = new StringBuilder();
+ String sizeLine;
+ while ((sizeLine = reader.readLine()) != null) {
+ // strip any chunk extensions (e.g. "4;ext=val")
+ int semicolon = sizeLine.indexOf(';');
+ String hexSize = semicolon >= 0 ? sizeLine.substring(0, semicolon)
: sizeLine;
+ int chunkSize = Integer.parseInt(hexSize.trim(), 16);
+ if (chunkSize == 0) {
+ reader.readLine(); // consume trailing empty line
+ break;
+ }
+ char[] chunk = new char[chunkSize];
+ int read = 0;
+ while (read < chunkSize) {
+ int n = reader.read(chunk, read, chunkSize - read);
+ if (n < 0) {
+ break;
+ }
+ read += n;
+ }
+ sb.append(chunk, 0, read);
+ reader.readLine(); // consume CRLF after chunk data
+ }
+ return sb.toString();
+ }
+
+ /** Queue a response to return for the next request. */
+ public void enqueue(MockResponse response) {
+ responses.add(response);
+ }
+
+ /**
+ * Retrieves and removes the earliest recorded request, waiting up to
+ * 5 seconds if necessary.
+ *
+ * @return the recorded request, or {@code null} if no request arrived
+ * within the timeout
+ */
+ public RecordedRequest takeRequest() throws InterruptedException {
+ return requests.poll(5, TimeUnit.SECONDS);
+ }
+
+ /**
+ * Returns the total number of requests received so far
+ * (including those already consumed by {@link #takeRequest()}).
+ */
+ public int getRequestCount() {
+ return requestCount.get();
+ }
+
+ /**
+ * Clears all recorded requests and resets the request counter to zero.
+ * <p>
+ * Call this in test {@code setUp()} after invoking {@code initialize()} on a
+ * parser under test so that health-check probes made during initialization do
+ * not pollute per-test assertions about request count or request content.
+ */
+ public void clearRequests() {
+ requests.clear();
+ requestCount.set(0);
+ }
+
+ /** Returns the base URL (e.g. {@code http://localhost:54321}) with no trailing slash. */
+ public String url() {
+ return "http://localhost:" + serverSocket.getLocalPort();
+ }
+
+ public void shutdown() {
+ running = false;
+ try {
+ serverSocket.close();
+ } catch (IOException ignored) {
+ // ignore
+ }
+ executor.shutdownNow();
+ }
+
+ @Override
+ public void close() {
+ shutdown();
+ }
+}
diff --git a/tika-parsers/tika-parsers-ml/pom.xml
b/tika-parsers/tika-parsers-ml/pom.xml
index 5a508fe2a7..028346e226 100644
--- a/tika-parsers/tika-parsers-ml/pom.xml
+++ b/tika-parsers/tika-parsers-ml/pom.xml
@@ -36,7 +36,7 @@
<module>tika-parser-nlp-module</module>
<module>tika-parser-nlp-package</module>
<module>tika-inference</module>
- <module>tika-parser-vlm-ocr-module</module>
+ <module>tika-vlm</module>
<module>tika-parser-tess4j-module</module>
<module>tika-transcribe-aws</module>
</modules>
diff --git a/tika-parsers/tika-parsers-ml/tika-inference/pom.xml
b/tika-parsers/tika-parsers-ml/tika-inference/pom.xml
index f56dcc19ad..17db66625b 100644
--- a/tika-parsers/tika-parsers-ml/tika-inference/pom.xml
+++ b/tika-parsers/tika-parsers-ml/tika-inference/pom.xml
@@ -31,41 +31,11 @@
<artifactId>tika-inference</artifactId>
<name>Apache Tika inference module</name>
- <properties>
- <okhttp.version>5.3.2</okhttp.version>
- </properties>
-
- <dependencyManagement>
- <dependencies>
- <!-- align Kotlin stdlib versions pulled by OkHttp and Okio -->
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib-jdk8</artifactId>
- <version>2.3.10</version>
- </dependency>
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib</artifactId>
- <version>2.3.10</version>
- </dependency>
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib-jdk7</artifactId>
- <version>2.3.10</version>
- </dependency>
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib-common</artifactId>
- <version>2.3.10</version>
- </dependency>
- </dependencies>
- </dependencyManagement>
-
<dependencies>
<dependency>
- <groupId>com.squareup.okhttp3</groupId>
- <artifactId>okhttp-jvm</artifactId>
- <version>${okhttp.version}</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-http-jdk</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
@@ -78,9 +48,15 @@
<!-- test -->
<dependency>
- <groupId>com.squareup.okhttp3</groupId>
- <artifactId>mockwebserver</artifactId>
- <version>${okhttp.version}</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-http-jdk</artifactId>
+ <version>${project.version}</version>
+ <classifier>tests</classifier>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
diff --git
a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIEmbeddingFilter.java
b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIEmbeddingFilter.java
index 7a841ea904..398e75e5a6 100644
---
a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIEmbeddingFilter.java
+++
b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIEmbeddingFilter.java
@@ -17,21 +17,18 @@
package org.apache.tika.inference;
import java.io.IOException;
+import java.util.HashMap;
import java.util.List;
-import java.util.concurrent.TimeUnit;
+import java.util.Map;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
-import okhttp3.MediaType;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
-import okhttp3.RequestBody;
-import okhttp3.Response;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.http.TikaHttpClient;
import org.apache.tika.utils.StringUtils;
/**
@@ -54,10 +51,7 @@ public class OpenAIEmbeddingFilter extends
AbstractEmbeddingFilter {
private static final ObjectMapper MAPPER = new ObjectMapper();
- private static final MediaType JSON_MEDIA_TYPE =
- MediaType.parse("application/json; charset=utf-8");
-
- private transient OkHttpClient httpClient;
+ private transient TikaHttpClient httpClient;
/**
* URL path appended to {@code baseUrl} for embeddings requests.
@@ -74,12 +68,12 @@ public class OpenAIEmbeddingFilter extends
AbstractEmbeddingFilter {
public OpenAIEmbeddingFilter() {
super();
- buildHttpClient();
+ this.httpClient = TikaHttpClient.build(30);
}
public OpenAIEmbeddingFilter(InferenceConfig config) {
super(config);
- buildHttpClient();
+ this.httpClient = TikaHttpClient.build(30);
}
@Override
@@ -90,33 +84,17 @@ public class OpenAIEmbeddingFilter extends
AbstractEmbeddingFilter {
return;
}
- // Build the request with all chunk texts in one batch
String requestJson = buildRequest(chunks, config);
String url = config.getBaseUrl().replaceAll("/+$", "") +
embeddingsPath;
- Request.Builder builder = new Request.Builder()
- .url(url)
- .post(RequestBody.create(requestJson, JSON_MEDIA_TYPE));
-
+ Map<String, String> headers = new HashMap<>();
if (!StringUtils.isBlank(config.getApiKey())) {
- builder.header(apiKeyHeaderName, apiKeyPrefix +
config.getApiKey());
+ headers.put(apiKeyHeaderName, apiKeyPrefix + config.getApiKey());
}
- OkHttpClient client = getClientWithTimeout(config);
-
- try (Response response = client.newCall(builder.build()).execute()) {
- if (!response.isSuccessful()) {
- String body = response.body() != null
- ? response.body().string() : "";
- throw new TikaException(
- "Embedding request failed with HTTP "
- + response.code() + ": " + body);
- }
-
- String responseBody = response.body() != null
- ? response.body().string() : "";
- parseResponse(responseBody, chunks);
- }
+ String responseBody = httpClient.postJson(url, requestJson, headers,
+ config.getTimeoutSeconds());
+ parseResponse(responseBody, chunks);
}
String buildRequest(List<Chunk> chunks, InferenceConfig config) {
@@ -164,26 +142,6 @@ public class OpenAIEmbeddingFilter extends
AbstractEmbeddingFilter {
}
}
- private void buildHttpClient() {
- int timeout = getDefaultConfig().getTimeoutSeconds();
- httpClient = new OkHttpClient.Builder()
- .connectTimeout(30, TimeUnit.SECONDS)
- .readTimeout(timeout, TimeUnit.SECONDS)
- .writeTimeout(60, TimeUnit.SECONDS)
- .build();
- }
-
- private OkHttpClient getClientWithTimeout(InferenceConfig config) {
- long defaultMs = getDefaultConfig().getTimeoutSeconds() * 1000L;
- long requestMs = config.getTimeoutSeconds() * 1000L;
- if (requestMs == defaultMs) {
- return httpClient;
- }
- return httpClient.newBuilder()
- .readTimeout(requestMs, TimeUnit.MILLISECONDS)
- .build();
- }
-
// ---- Azure / endpoint config getters/setters
----------------------------
public String getEmbeddingsPath() {
diff --git
a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
index b88c54d1bf..3a3af456ff 100644
---
a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
+++
b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
@@ -20,19 +20,16 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Set;
-import java.util.concurrent.TimeUnit;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
-import okhttp3.RequestBody;
-import okhttp3.Response;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
@@ -47,6 +44,7 @@ import org.apache.tika.config.TikaProgressTracker;
import org.apache.tika.config.TimeoutLimits;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.http.TikaHttpClient;
import org.apache.tika.inference.locator.Locators;
import org.apache.tika.inference.locator.PaginatedLocator;
import org.apache.tika.io.TikaInputStream;
@@ -106,11 +104,8 @@ public class OpenAIImageEmbeddingParser implements Parser,
Initializable {
private static final ObjectMapper MAPPER = new ObjectMapper();
- private static final okhttp3.MediaType JSON_MEDIA_TYPE =
- okhttp3.MediaType.parse("application/json; charset=utf-8");
-
private ImageEmbeddingConfig defaultConfig;
- private transient OkHttpClient httpClient;
+ private transient TikaHttpClient httpClient;
/** URL path for embeddings requests. Default: {@code /v1/embeddings}. */
private String embeddingsPath = "/v1/embeddings";
@@ -127,7 +122,7 @@ public class OpenAIImageEmbeddingParser implements Parser,
Initializable {
public OpenAIImageEmbeddingParser(ImageEmbeddingConfig config) {
this.defaultConfig = config;
- buildHttpClient();
+ this.httpClient = TikaHttpClient.build(30);
}
public OpenAIImageEmbeddingParser(JsonConfig jsonConfig) {
@@ -168,21 +163,17 @@ public class OpenAIImageEmbeddingParser implements
Parser, Initializable {
long timeoutMillis = TimeoutLimits.getProcessTimeoutMillis(
parseContext, config.getTimeoutSeconds() * 1000L);
+ int timeoutSeconds = (int) (timeoutMillis / 1000L);
- float[] vector = callEmbeddingEndpoint(config, mimeType, base64Data,
- timeoutMillis);
+ float[] vector = callEmbeddingEndpoint(config, mimeType, base64Data,
timeoutSeconds);
TikaProgressTracker.update(parseContext);
- // Build a Chunk with the vector and locators
Locators locators = buildLocators(metadata);
Chunk chunk = new Chunk(null, locators);
chunk.setVector(vector);
- // Merge into the canonical chunks field so image embeddings
- // coexist with text chunks in a single array
ChunkSerializer.mergeInto(metadata, List.of(chunk));
- // Emit an empty document -- this parser produces vectors, not text
XHTMLContentHandler xhtml = new XHTMLContentHandler(
handler, metadata, parseContext);
xhtml.startDocument();
@@ -193,42 +184,26 @@ public class OpenAIImageEmbeddingParser implements
Parser, Initializable {
@Override
public void initialize() throws TikaConfigException {
- buildHttpClient();
+ this.httpClient = TikaHttpClient.build(30);
}
// ---- internals --------------------------------------------------------
float[] callEmbeddingEndpoint(ImageEmbeddingConfig config,
String mimeType, String base64Data,
- long timeoutMillis)
+ int timeoutSeconds)
throws IOException, TikaException {
String requestJson = buildRequest(config, mimeType, base64Data);
String url = config.getBaseUrl().replaceAll("/+$", "") +
embeddingsPath;
- Request.Builder builder = new Request.Builder()
- .url(url)
- .post(RequestBody.create(requestJson, JSON_MEDIA_TYPE));
-
+ Map<String, String> headers = new HashMap<>();
if (!StringUtils.isBlank(config.getApiKey())) {
- builder.header(apiKeyHeaderName, apiKeyPrefix + config.getApiKey());
+ headers.put(apiKeyHeaderName, apiKeyPrefix + config.getApiKey());
}
- OkHttpClient client = getClientWithTimeout(timeoutMillis);
-
- try (Response response = client.newCall(builder.build()).execute()) {
- if (!response.isSuccessful()) {
- String body = response.body() != null
- ? response.body().string() : "";
- throw new TikaException(
- "Image embedding request failed with HTTP "
- + response.code() + ": " + body);
- }
-
- String responseBody = response.body() != null
- ? response.body().string() : "";
- return parseResponse(responseBody);
- }
+ String responseBody = httpClient.postJson(url, requestJson, headers, timeoutSeconds);
+ return parseResponse(responseBody);
}
String buildRequest(ImageEmbeddingConfig config, String mimeType,
@@ -275,8 +250,6 @@ public class OpenAIImageEmbeddingParser implements Parser,
Initializable {
Locators buildLocators(Metadata metadata) {
Locators locators = new Locators();
- // If we have page number metadata (from PDF rendering), create
- // a PaginatedLocator
String pageStr = metadata.get(TikaPagedText.PAGE_NUMBER);
if (pageStr != null) {
try {
@@ -305,8 +278,6 @@ public class OpenAIImageEmbeddingParser implements Parser,
Initializable {
throws TikaConfigException, IOException {
String key = "openai-image-embedding-parser";
if (parseContext.hasJsonConfig(key)) {
- // Deserialize into RuntimeConfig which prevents overriding
- // security-sensitive fields (baseUrl, apiKey) at parse time
ImageEmbeddingConfig.RuntimeConfig runtimeConfig =
ParseContextConfig.getConfig(
parseContext, key,
@@ -317,7 +288,6 @@ public class OpenAIImageEmbeddingParser implements Parser,
Initializable {
return runtimeConfig;
}
- // Merge runtime overrides with the init-time defaults
return ParseContextConfig.getConfig(
parseContext, key, ImageEmbeddingConfig.class,
defaultConfig);
@@ -325,25 +295,6 @@ public class OpenAIImageEmbeddingParser implements Parser,
Initializable {
return defaultConfig;
}
- private void buildHttpClient() {
- httpClient = new OkHttpClient.Builder()
- .connectTimeout(30, TimeUnit.SECONDS)
- .readTimeout(defaultConfig.getTimeoutSeconds(),
- TimeUnit.SECONDS)
- .writeTimeout(60, TimeUnit.SECONDS)
- .build();
- }
-
- private OkHttpClient getClientWithTimeout(long timeoutMillis) {
- long defaultMs = defaultConfig.getTimeoutSeconds() * 1000L;
- if (timeoutMillis == defaultMs) {
- return httpClient;
- }
- return httpClient.newBuilder()
- .readTimeout(timeoutMillis, TimeUnit.MILLISECONDS)
- .build();
- }
-
// ---- delegating config getters/setters --------------------------------
public String getBaseUrl() {
diff --git
a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/VectorSerializer.java
b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/VectorSerializer.java
index f350fee123..6964365f0f 100644
---
a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/VectorSerializer.java
+++
b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/VectorSerializer.java
@@ -17,14 +17,13 @@
package org.apache.tika.inference;
import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import java.util.Base64;
/**
- * Serializes and deserializes float vectors as base64-encoded little-endian
- * float32 byte arrays. Little-endian matches numpy/PyTorch convention so
- * vectors from Python inference servers round-trip cleanly.
+ * Serializes and deserializes float vectors as base64-encoded big-endian
+ * float32 byte arrays. Big-endian (network byte order) matches what
+ * Elasticsearch expects when ingesting base64-encoded dense vectors.
*/
public final class VectorSerializer {
@@ -32,23 +31,20 @@ public final class VectorSerializer {
}
/**
- * Encode a float array as a base64 string (little-endian float32).
+ * Encode a float array as a base64 string (big-endian float32).
*/
public static String encode(float[] vector) {
- ByteBuffer buf = ByteBuffer.allocate(vector.length * Float.BYTES)
- .order(ByteOrder.LITTLE_ENDIAN);
+ ByteBuffer buf = ByteBuffer.allocate(vector.length * Float.BYTES);
buf.asFloatBuffer().put(vector);
return Base64.getEncoder().encodeToString(buf.array());
}
/**
- * Decode a base64 string back to a float array (little-endian float32).
+ * Decode a base64 string back to a float array (big-endian float32).
*/
public static float[] decode(String base64) {
byte[] bytes = Base64.getDecoder().decode(base64);
- FloatBuffer fb = ByteBuffer.wrap(bytes)
- .order(ByteOrder.LITTLE_ENDIAN)
- .asFloatBuffer();
+ FloatBuffer fb = ByteBuffer.wrap(bytes).asFloatBuffer();
float[] vector = new float[fb.remaining()];
fb.get(vector);
return vector;
diff --git
a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIEmbeddingFilterTest.java
b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIEmbeddingFilterTest.java
index b90e70b0c0..c576438f4b 100644
---
a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIEmbeddingFilterTest.java
+++
b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIEmbeddingFilterTest.java
@@ -27,31 +27,28 @@ import java.util.List;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
-import okhttp3.mockwebserver.MockResponse;
-import okhttp3.mockwebserver.MockWebServer;
-import okhttp3.mockwebserver.RecordedRequest;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.http.TikaTestHttpServer;
import org.apache.tika.metadata.Metadata;
public class OpenAIEmbeddingFilterTest {
private static final ObjectMapper MAPPER = new ObjectMapper();
- private MockWebServer server;
+ private TikaTestHttpServer server;
private OpenAIEmbeddingFilter filter;
private InferenceConfig config;
@BeforeEach
void setUp() throws Exception {
- server = new MockWebServer();
- server.start();
+ server = new TikaTestHttpServer();
config = new InferenceConfig();
- config.setBaseUrl(server.url("").toString().replaceAll("/+$", ""));
+ config.setBaseUrl(server.url());
config.setModel("text-embedding-3-small");
config.setMaxChunkChars(500);
config.setOverlapChars(0);
@@ -61,7 +58,7 @@ public class OpenAIEmbeddingFilterTest {
}
@AfterEach
- void tearDown() throws Exception {
+ void tearDown() {
server.shutdown();
}
@@ -70,10 +67,8 @@ public class OpenAIEmbeddingFilterTest {
String content = "# Section A\n\nSome text about section A.\n\n"
+ "# Section B\n\nSome text about section B.";
- // Mock embeddings response with 2 vectors (3 dims each)
- server.enqueue(new MockResponse()
- .setBody(buildEmbeddingResponse(2, 3))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(2, 3)));
Metadata metadata = new Metadata();
metadata.set("tika:content", content);
@@ -94,10 +89,9 @@ public class OpenAIEmbeddingFilterTest {
assertNotNull(chunks.get(1).getVector());
assertEquals(3, chunks.get(0).getVector().length);
- // Verify the request
- RecordedRequest request = server.takeRequest();
- assertEquals("/v1/embeddings", request.getPath());
- JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ assertEquals("/v1/embeddings", request.path());
+ JsonNode body = MAPPER.readTree(request.body());
assertEquals("text-embedding-3-small", body.get("model").asText());
assertEquals(2, body.get("input").size());
}
@@ -107,9 +101,8 @@ public class OpenAIEmbeddingFilterTest {
config.setApiKey("sk-test-key");
filter = new OpenAIEmbeddingFilter(config);
- server.enqueue(new MockResponse()
- .setBody(buildEmbeddingResponse(1, 3))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(1, 3)));
Metadata metadata = new Metadata();
metadata.set("tika:content", "Some text.");
@@ -118,7 +111,7 @@ public class OpenAIEmbeddingFilterTest {
filter.filter(list);
assertEquals("Bearer sk-test-key",
- server.takeRequest().getHeader("Authorization"));
+ server.takeRequest().header("authorization"));
}
@Test
@@ -147,8 +140,7 @@ public class OpenAIEmbeddingFilterTest {
@Test
void testServerError() {
- server.enqueue(new MockResponse().setResponseCode(500)
- .setBody("{\"error\":\"boom\"}"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(500, "{\"error\":\"boom\"}"));
Metadata metadata = new Metadata();
metadata.set("tika:content", "Some text.");
@@ -185,9 +177,8 @@ public class OpenAIEmbeddingFilterTest {
@Test
void testVectorSerialization() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildEmbeddingResponse(1, 3))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(1, 3)));
Metadata metadata = new Metadata();
metadata.set("tika:content", "Single chunk of text.");
@@ -202,16 +193,14 @@ public class OpenAIEmbeddingFilterTest {
// Vector should be base64, not a JSON array
String vectorField = array.get(0).get("vector").asText();
assertNotNull(vectorField);
- // Should be decodable
float[] decoded = VectorSerializer.decode(vectorField);
assertEquals(3, decoded.length);
}
@Test
void testMergeWithExistingChunks() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildEmbeddingResponse(1, 3))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(1, 3)));
Metadata metadata = new Metadata();
metadata.set("tika:content", "Some text.");
@@ -234,9 +223,7 @@ public class OpenAIEmbeddingFilterTest {
metadata.get(ChunkSerializer.CHUNKS_FIELD));
// Should have the pre-existing image chunk + the new text chunk
assertEquals(2, merged.size());
- // First is the image chunk (no text)
assertNull(merged.get(0).getText());
- // Second is the text chunk
assertNotNull(merged.get(1).getText());
assertNotNull(merged.get(1).getVector());
}
@@ -257,7 +244,8 @@ public class OpenAIEmbeddingFilterTest {
if (d > 0) {
sb.append(",");
}
- sb.append(String.format(java.util.Locale.ROOT, "%.6f", (i + 1) * 0.1 + d * 0.01));
+ sb.append(String.format(java.util.Locale.ROOT,
+ "%.6f", (i + 1) * 0.1 + d * 0.01));
}
sb.append("]}");
}
diff --git
a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIImageEmbeddingParserTest.java
b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIImageEmbeddingParserTest.java
index bd6ca03131..7d48a7c437 100644
---
a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIImageEmbeddingParserTest.java
+++
b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIImageEmbeddingParserTest.java
@@ -26,15 +26,13 @@ import java.util.List;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
-import okhttp3.mockwebserver.MockResponse;
-import okhttp3.mockwebserver.MockWebServer;
-import okhttp3.mockwebserver.RecordedRequest;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.http.TikaTestHttpServer;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaPagedText;
@@ -44,17 +42,16 @@ public class OpenAIImageEmbeddingParserTest {
private static final ObjectMapper MAPPER = new ObjectMapper();
- private MockWebServer server;
+ private TikaTestHttpServer server;
private OpenAIImageEmbeddingParser parser;
private ImageEmbeddingConfig config;
@BeforeEach
void setUp() throws Exception {
- server = new MockWebServer();
- server.start();
+ server = new TikaTestHttpServer();
config = new ImageEmbeddingConfig();
- config.setBaseUrl(server.url("").toString().replaceAll("/+$", ""));
+ config.setBaseUrl(server.url());
config.setModel("jina-clip-v2");
config.setTimeoutSeconds(10);
@@ -62,25 +59,22 @@ public class OpenAIImageEmbeddingParserTest {
}
@AfterEach
- void tearDown() throws Exception {
+ void tearDown() {
server.shutdown();
}
@Test
void testEndToEnd() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildEmbeddingResponse(3))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(3)));
- // Fake 1x1 PNG bytes (just needs to be non-empty)
byte[] fakeImage = new byte[]{(byte) 0x89, 'P', 'N', 'G'};
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/ocr-png");
try (TikaInputStream tis = TikaInputStream.get(fakeImage)) {
- parser.parse(tis, new DefaultHandler(), metadata,
- new ParseContext());
+ parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
String output = metadata.get(ChunkSerializer.CHUNKS_FIELD);
@@ -92,10 +86,9 @@ public class OpenAIImageEmbeddingParserTest {
assertNotNull(chunks.get(0).getVector());
assertEquals(3, chunks.get(0).getVector().length);
- // Verify request format
- RecordedRequest request = server.takeRequest();
- assertEquals("/v1/embeddings", request.getPath());
- JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ assertEquals("/v1/embeddings", request.path());
+ JsonNode body = MAPPER.readTree(request.body());
assertEquals("jina-clip-v2", body.get("model").asText());
assertTrue(body.get("input").get(0).get("image").asText()
.startsWith("data:image/png;base64,"));
@@ -103,9 +96,8 @@ public class OpenAIImageEmbeddingParserTest {
@Test
void testPageNumberLocator() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildEmbeddingResponse(2))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(2)));
byte[] fakeImage = new byte[]{1, 2, 3};
@@ -114,8 +106,7 @@ public class OpenAIImageEmbeddingParserTest {
metadata.set(TikaPagedText.PAGE_NUMBER, 7);
try (TikaInputStream tis = TikaInputStream.get(fakeImage)) {
- parser.parse(tis, new DefaultHandler(), metadata,
- new ParseContext());
+ parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
List<Chunk> chunks = ChunkSerializer.fromJson(
@@ -124,15 +115,13 @@ public class OpenAIImageEmbeddingParserTest {
assertNotNull(chunks.get(0).getLocators().getPaginated());
assertEquals(1, chunks.get(0).getLocators().getPaginated().size());
- assertEquals(7,
- chunks.get(0).getLocators().getPaginated().get(0).getPage());
+ assertEquals(7, chunks.get(0).getLocators().getPaginated().get(0).getPage());
}
@Test
void testOcrPrefixStripped() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildEmbeddingResponse(2))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(2)));
byte[] fakeImage = new byte[]{1, 2, 3};
@@ -140,12 +129,11 @@ public class OpenAIImageEmbeddingParserTest {
metadata.set(Metadata.CONTENT_TYPE, "image/ocr-jpeg");
try (TikaInputStream tis = TikaInputStream.get(fakeImage)) {
- parser.parse(tis, new DefaultHandler(), metadata,
- new ParseContext());
+ parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
- RecordedRequest request = server.takeRequest();
- JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ JsonNode body = MAPPER.readTree(request.body());
// Should strip "ocr-" prefix: image/ocr-jpeg -> image/jpeg
assertTrue(body.get("input").get(0).get("image").asText()
.startsWith("data:image/jpeg;base64,"));
@@ -156,21 +144,19 @@ public class OpenAIImageEmbeddingParserTest {
config.setApiKey("sk-test-clip-key");
parser = new OpenAIImageEmbeddingParser(config);
- server.enqueue(new MockResponse()
- .setBody(buildEmbeddingResponse(2))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(2)));
byte[] fakeImage = new byte[]{1};
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/png");
try (TikaInputStream tis = TikaInputStream.get(fakeImage)) {
- parser.parse(tis, new DefaultHandler(), metadata,
- new ParseContext());
+ parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
assertEquals("Bearer sk-test-clip-key",
- server.takeRequest().getHeader("Authorization"));
+ server.takeRequest().header("authorization"));
}
@Test
@@ -183,8 +169,7 @@ public class OpenAIImageEmbeddingParserTest {
metadata.set(Metadata.CONTENT_TYPE, "image/png");
try (TikaInputStream tis = TikaInputStream.get(fakeImage)) {
- parser.parse(tis, new DefaultHandler(), metadata,
- new ParseContext());
+ parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
assertNull(metadata.get(ChunkSerializer.CHUNKS_FIELD));
@@ -196,14 +181,12 @@ public class OpenAIImageEmbeddingParserTest {
config.setMinFileSizeToEmbed(100);
parser = new OpenAIImageEmbeddingParser(config);
- // 4 bytes -- below minimum
byte[] tinyImage = new byte[]{1, 2, 3, 4};
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/png");
try (TikaInputStream tis = TikaInputStream.get(tinyImage)) {
- parser.parse(tis, new DefaultHandler(), metadata,
- new ParseContext());
+ parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
assertNull(metadata.get(ChunkSerializer.CHUNKS_FIELD));
@@ -212,9 +195,8 @@ public class OpenAIImageEmbeddingParserTest {
@Test
void testServerError() {
- server.enqueue(new MockResponse()
- .setResponseCode(500)
- .setBody("{\"error\":\"internal error\"}"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(500,
+ "{\"error\":\"internal error\"}"));
byte[] fakeImage = new byte[]{1};
Metadata metadata = new Metadata();
@@ -222,17 +204,15 @@ public class OpenAIImageEmbeddingParserTest {
assertThrows(TikaException.class, () -> {
try (TikaInputStream tis = TikaInputStream.get(fakeImage)) {
- parser.parse(tis, new DefaultHandler(), metadata,
- new ParseContext());
+ parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
});
}
@Test
void testMergeWithExistingChunks() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildEmbeddingResponse(4))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(4)));
byte[] fakeImage = new byte[]{1};
Metadata metadata = new Metadata();
@@ -245,17 +225,14 @@ public class OpenAIImageEmbeddingParserTest {
ChunkSerializer.toJson(List.of(textChunk)));
try (TikaInputStream tis = TikaInputStream.get(fakeImage)) {
- parser.parse(tis, new DefaultHandler(), metadata,
- new ParseContext());
+ parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
List<Chunk> merged = ChunkSerializer.fromJson(
metadata.get(ChunkSerializer.CHUNKS_FIELD));
assertEquals(2, merged.size());
- // First chunk is the pre-existing text chunk
assertEquals("existing text", merged.get(0).getText());
assertNotNull(merged.get(0).getVector());
- // Second chunk is the image embedding (no text)
assertNull(merged.get(1).getText());
assertNotNull(merged.get(1).getVector());
assertEquals(4, merged.get(1).getVector().length);
@@ -264,11 +241,11 @@ public class OpenAIImageEmbeddingParserTest {
@Test
void testSupportedTypes() {
assertTrue(parser.getSupportedTypes(new ParseContext())
- .contains(MediaType.image("ocr-png")));
+ .contains(org.apache.tika.mime.MediaType.image("ocr-png")));
assertTrue(parser.getSupportedTypes(new ParseContext())
- .contains(MediaType.image("ocr-jpeg")));
+ .contains(org.apache.tika.mime.MediaType.image("ocr-jpeg")));
assertTrue(parser.getSupportedTypes(new ParseContext())
- .contains(MediaType.image("webp")));
+ .contains(org.apache.tika.mime.MediaType.image("webp")));
}
@Test
@@ -293,22 +270,19 @@ public class OpenAIImageEmbeddingParserTest {
@Test
void testVectorSerializedAsBase64() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildEmbeddingResponse(3))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildEmbeddingResponse(3)));
byte[] fakeImage = new byte[]{1};
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/png");
try (TikaInputStream tis = TikaInputStream.get(fakeImage)) {
- parser.parse(tis, new DefaultHandler(), metadata,
- new ParseContext());
+ parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
String output = metadata.get(ChunkSerializer.CHUNKS_FIELD);
JsonNode array = MAPPER.readTree(output);
- // Vector should be base64-encoded string, not a JSON array
String vectorField = array.get(0).get("vector").asText();
assertNotNull(vectorField);
float[] decoded = VectorSerializer.decode(vectorField);
@@ -329,12 +303,4 @@ public class OpenAIImageEmbeddingParserTest {
sb.append("\"usage\":{\"prompt_tokens\":10,\"total_tokens\":10}}");
return sb.toString();
}
-
- // Local MediaType helper since we don't want to import tika-core's
- // MediaType in the static context of these asserts
- private static class MediaType {
- static org.apache.tika.mime.MediaType image(String subtype) {
- return org.apache.tika.mime.MediaType.image(subtype);
- }
- }
}
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml
b/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml
similarity index 68%
rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml
rename to tika-parsers/tika-parsers-ml/tika-vlm/pom.xml
index 6de2e96345..4aa0097050 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml
+++ b/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml
@@ -28,45 +28,18 @@
<version>${revision}</version>
</parent>
- <artifactId>tika-parser-vlm-ocr-module</artifactId>
- <name>Apache Tika VLM OCR parser module</name>
+ <artifactId>tika-vlm</artifactId>
+ <name>Apache Tika VLM module</name>
<properties>
- <okhttp.version>5.3.2</okhttp.version>
<commonmark.version>0.27.1</commonmark.version>
</properties>
- <dependencyManagement>
- <dependencies>
- <!-- align Kotlin stdlib versions pulled by OkHttp and Okio -->
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib-jdk8</artifactId>
- <version>2.3.10</version>
- </dependency>
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib</artifactId>
- <version>2.3.10</version>
- </dependency>
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib-jdk7</artifactId>
- <version>2.3.10</version>
- </dependency>
- <dependency>
- <groupId>org.jetbrains.kotlin</groupId>
- <artifactId>kotlin-stdlib-common</artifactId>
- <version>2.3.10</version>
- </dependency>
- </dependencies>
- </dependencyManagement>
-
<dependencies>
<dependency>
- <groupId>com.squareup.okhttp3</groupId>
- <artifactId>okhttp-jvm</artifactId>
- <version>${okhttp.version}</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-http-jdk</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
@@ -94,9 +67,15 @@
<!-- test -->
<dependency>
- <groupId>com.squareup.okhttp3</groupId>
- <artifactId>mockwebserver</artifactId>
- <version>${okhttp.version}</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-http-jdk</artifactId>
+ <version>${project.version}</version>
+ <classifier>tests</classifier>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
@@ -109,7 +88,7 @@
<configuration>
<archive>
<manifestEntries>
- <Automatic-Module-Name>org.apache.tika.parser.vlm</Automatic-Module-Name>
+ <Automatic-Module-Name>org.apache.tika.vlm</Automatic-Module-Name>
</manifestEntries>
</archive>
</configuration>
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java
b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java
similarity index 80%
rename from
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java
rename to
tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java
index c135cea99f..6cdae6f6dd 100644
---
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java
+++
b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java
@@ -22,12 +22,9 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Base64;
import java.util.Collections;
+import java.util.Map;
import java.util.Set;
-import java.util.concurrent.TimeUnit;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
-import okhttp3.Response;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
@@ -41,6 +38,7 @@ import org.apache.tika.config.TimeoutLimits;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ParentContentHandler;
+import org.apache.tika.http.TikaHttpClient;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
@@ -79,34 +77,36 @@ public abstract class AbstractVLMParser implements Parser,
Initializable {
public static final Property VLM_COMPLETION_TOKENS =
Property.externalInteger(VLM_META + "completion_tokens");
- static final okhttp3.MediaType JSON_MEDIA_TYPE =
- okhttp3.MediaType.parse("application/json; charset=utf-8");
+ /**
+ * Encapsulates a fully built HTTP request for a VLM API call.
+ *
+ * @param url full request URL (base + path)
+ * @param json serialized JSON request body
+ * @param headers additional HTTP headers (e.g. Authorization)
+ */
+ protected record HttpCall(String url, String json, Map<String, String> headers) {}
private VLMOCRConfig defaultConfig;
- private transient OkHttpClient httpClient;
+ private transient TikaHttpClient httpClient;
private boolean serverAvailable = false;
protected AbstractVLMParser(VLMOCRConfig config) {
this.defaultConfig = config;
- buildHttpClient();
+ this.httpClient = buildHttpClient();
}
// ---- abstract contract for subclasses ---------------------------------
/**
- * Build a fully formed {@link Request} for the target API.
+ * Build a fully formed {@link HttpCall} for the target API.
*
- * @param config resolved config for this parse
- * @param fileBytes raw bytes of the input (image or document)
- * @param mimeType the MIME type of the input (e.g. {@code image/png},
- * {@code application/pdf})
- * @param base64Data base64-encoded version of {@code fileBytes}
- * @param client the OkHttp client (for timeout-aware request building)
- * @return a ready-to-execute OkHttp {@link Request}
+ * @param config resolved config for this parse
+ * @param base64Data base64-encoded version of the file bytes
+ * @param mimeType the MIME type of the input (e.g. {@code image/png})
+ * @return a ready-to-execute {@link HttpCall}
*/
- protected abstract Request buildHttpRequest(VLMOCRConfig config, byte[] fileBytes,
- String mimeType, String base64Data,
- OkHttpClient client);
+ protected abstract HttpCall buildHttpCall(VLMOCRConfig config,
+ String base64Data, String mimeType);
/**
* Parse the API response body and extract the model's text output.
@@ -142,6 +142,9 @@ public abstract class AbstractVLMParser implements Parser,
Initializable {
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
+ if (!serverAvailable) {
+ return Collections.emptySet();
+ }
VLMOCRConfig config = context.get(VLMOCRConfig.class);
if (config != null && config.isSkipOcr()) {
return Collections.emptySet();
@@ -171,7 +174,6 @@ public abstract class AbstractVLMParser implements Parser,
Initializable {
String mimeType = detectMimeType(metadata);
- // Check image pixel dimensions before reading fully (skip for PDFs)
long maxPixels = config.getMaxImagePixels();
if (maxPixels > 0 && mimeType.startsWith("image/")) {
tis.mark((int) Math.min(tis.getLength() + 1, 1024 * 1024));
@@ -195,20 +197,18 @@ public abstract class AbstractVLMParser implements
Parser, Initializable {
long timeoutMillis = TimeoutLimits.getProcessTimeoutMillis(
parseContext, config.getTimeoutSeconds() * 1000L);
- OkHttpClient client = getClientWithTimeout(timeoutMillis);
+ int timeoutSeconds = (int) (timeoutMillis / 1000L);
- Request httpRequest = buildHttpRequest(config, fileBytes, mimeType, base64Data, client);
+ HttpCall call = buildHttpCall(config, base64Data, mimeType);
String responseText;
- try (Response response = client.newCall(httpRequest).execute()) {
- if (!response.isSuccessful()) {
- String body = response.body() != null ? response.body().string() : "";
- throw new TikaException(
- "VLM request failed with HTTP " + response.code() + ": " + body);
- }
- String responseBody = response.body() != null ? response.body().string() : "";
+ try {
+ String responseBody = httpClient.postJson(
+ call.url(), call.json(), call.headers(), timeoutSeconds);
responseText = extractResponseText(responseBody, metadata);
TikaProgressTracker.update(parseContext);
+ } catch (TikaException e) {
+ throw e;
} catch (IOException e) {
throw new TikaException("VLM request failed: " + e.getMessage(),
e);
}
@@ -227,22 +227,21 @@ public abstract class AbstractVLMParser implements
Parser, Initializable {
@Override
public void initialize() throws TikaConfigException {
- buildHttpClient();
+ this.httpClient = buildHttpClient();
String healthUrl = getHealthCheckUrl(defaultConfig);
if (healthUrl == null) {
+ // No health check configured (e.g. Claude) — assume available
+ serverAvailable = true;
return;
}
try {
- Request request = new Request.Builder().url(healthUrl).get().build();
- try (Response response = httpClient.newCall(request).execute()) {
- serverAvailable = response.isSuccessful();
- if (serverAvailable) {
- LOG.info("VLM server is available at {}", defaultConfig.getBaseUrl());
- } else {
- LOG.warn("VLM server returned HTTP {} at {}",
- response.code(), defaultConfig.getBaseUrl());
- }
- }
+ httpClient.get(healthUrl, Map.of(), defaultConfig.getTimeoutSeconds());
+ serverAvailable = true;
+ LOG.info("VLM server is available at {}", defaultConfig.getBaseUrl());
+ } catch (TikaException e) {
+ LOG.warn("VLM server returned error at {}: {}",
+ defaultConfig.getBaseUrl(), e.getMessage());
+ serverAvailable = false;
} catch (IOException e) {
LOG.warn("VLM server is not available at {}: {}",
defaultConfig.getBaseUrl(), e.getMessage());
@@ -256,9 +255,6 @@ public abstract class AbstractVLMParser implements Parser,
Initializable {
throws TikaConfigException, IOException {
String key = configKey();
if (parseContext.hasJsonConfig(key)) {
- // Deserialize into RuntimeConfig which prevents overriding
- // security-sensitive fields (baseUrl, apiKey, prompt) at parse time.
- // Pass the init-time config so that allowRuntimePrompt is inherited.
VLMOCRConfig.RuntimeConfig runtimeConfig =
ParseContextConfig.getConfig(
parseContext, key, VLMOCRConfig.RuntimeConfig.class,
new VLMOCRConfig.RuntimeConfig(defaultConfig));
@@ -267,7 +263,6 @@ public abstract class AbstractVLMParser implements Parser,
Initializable {
return runtimeConfig;
}
- // Merge runtime overrides with the init-time defaults
return ParseContextConfig.getConfig(
parseContext, key, VLMOCRConfig.class, defaultConfig);
}
@@ -356,22 +351,8 @@ public abstract class AbstractVLMParser implements Parser,
Initializable {
}
}
- private void buildHttpClient() {
- httpClient = new OkHttpClient.Builder()
- .connectTimeout(30, TimeUnit.SECONDS)
- .readTimeout(defaultConfig.getTimeoutSeconds(),
TimeUnit.SECONDS)
- .writeTimeout(60, TimeUnit.SECONDS)
- .build();
- }
-
- OkHttpClient getClientWithTimeout(long timeoutMillis) {
- long defaultTimeoutMillis = defaultConfig.getTimeoutSeconds() * 1000L;
- if (timeoutMillis == defaultTimeoutMillis) {
- return httpClient;
- }
- return httpClient.newBuilder()
- .readTimeout(timeoutMillis, TimeUnit.MILLISECONDS)
- .build();
+ private TikaHttpClient buildHttpClient() {
+ return TikaHttpClient.build(30);
}
// ---- delegating config getters/setters --------------------------------
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java
b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java
similarity index 92%
rename from
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java
rename to
tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java
index 9f20ccdd2a..051b6b6867 100644
---
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java
+++
b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java
@@ -19,16 +19,15 @@ package org.apache.tika.parser.vlm;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
-import okhttp3.RequestBody;
import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
@@ -105,22 +104,18 @@ public class ClaudeVLMParser extends AbstractVLMParser {
}
@Override
- protected Request buildHttpRequest(VLMOCRConfig config, byte[] fileBytes,
- String mimeType, String base64Data,
- OkHttpClient client) {
+ protected HttpCall buildHttpCall(VLMOCRConfig config,
+ String base64Data, String mimeType) {
String json = buildRequestJson(config, base64Data, mimeType);
String url = stripTrailingSlash(config.getBaseUrl()) + "/v1/messages";
- Request.Builder builder = new Request.Builder()
- .url(url)
- .post(RequestBody.create(json, JSON_MEDIA_TYPE))
- .header("anthropic-version", ANTHROPIC_VERSION);
-
+ Map<String, String> headers = new HashMap<>();
+ headers.put("anthropic-version", ANTHROPIC_VERSION);
if (!StringUtils.isBlank(config.getApiKey())) {
- builder.header("x-api-key", config.getApiKey());
+ headers.put("x-api-key", config.getApiKey());
}
- return builder.build();
+ return new HttpCall(url, json, headers);
}
@Override
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java
b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java
similarity index 94%
rename from
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java
rename to
tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java
index 6997b30b1b..c6e8d4a9f2 100644
---
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java
+++
b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java
@@ -20,15 +20,13 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
-import okhttp3.RequestBody;
import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
@@ -115,9 +113,8 @@ public class GeminiVLMParser extends AbstractVLMParser {
}
@Override
- protected Request buildHttpRequest(VLMOCRConfig config, byte[] fileBytes,
- String mimeType, String base64Data,
- OkHttpClient client) {
+ protected HttpCall buildHttpCall(VLMOCRConfig config,
+ String base64Data, String mimeType) {
String json = buildRequestJson(config, base64Data, mimeType);
String baseUrl = stripTrailingSlash(config.getBaseUrl());
@@ -127,10 +124,7 @@ public class GeminiVLMParser extends AbstractVLMParser {
url += "?key=" + config.getApiKey();
}
- return new Request.Builder()
- .url(url)
- .post(RequestBody.create(json, JSON_MEDIA_TYPE))
- .build();
+ return new HttpCall(url, json, Map.of());
}
@Override
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java
b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java
similarity index 100%
rename from
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java
rename to
tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java
b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java
similarity index 94%
rename from
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java
rename to
tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java
index 3109651ba0..0ead21c492 100644
---
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java
+++
b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java
@@ -19,16 +19,15 @@ package org.apache.tika.parser.vlm;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
-import okhttp3.RequestBody;
import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
@@ -124,20 +123,16 @@ public class OpenAIVLMParser extends AbstractVLMParser {
}
@Override
- protected Request buildHttpRequest(VLMOCRConfig config, byte[] fileBytes,
- String mimeType, String base64Data,
- OkHttpClient client) {
+ protected HttpCall buildHttpCall(VLMOCRConfig config,
+ String base64Data, String mimeType) {
String json = buildRequestJson(config, base64Data, mimeType);
String url = stripTrailingSlash(config.getBaseUrl()) + completionsPath;
- Request.Builder builder = new Request.Builder()
- .url(url)
- .post(RequestBody.create(json, JSON_MEDIA_TYPE));
-
+ Map<String, String> headers = new HashMap<>();
if (!StringUtils.isBlank(config.getApiKey())) {
- builder.header(apiKeyHeaderName, apiKeyPrefix +
config.getApiKey());
+ headers.put(apiKeyHeaderName, apiKeyPrefix + config.getApiKey());
}
- return builder.build();
+ return new HttpCall(url, json, headers);
}
@Override
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java
b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java
similarity index 100%
rename from
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java
rename to
tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java
similarity index 82%
rename from
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java
rename to
tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java
index ba8241e7c0..5d0b558dc9 100644
---
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java
+++
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java
@@ -27,15 +27,13 @@ import java.io.ByteArrayInputStream;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
-import okhttp3.mockwebserver.MockResponse;
-import okhttp3.mockwebserver.MockWebServer;
-import okhttp3.mockwebserver.RecordedRequest;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.http.TikaTestHttpServer;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -46,17 +44,16 @@ public class ClaudeVLMParserTest {
private static final ObjectMapper MAPPER = new ObjectMapper();
- private MockWebServer server;
+ private TikaTestHttpServer server;
private ClaudeVLMParser parser;
private VLMOCRConfig config;
@BeforeEach
void setUp() throws Exception {
- server = new MockWebServer();
- server.start();
+ server = new TikaTestHttpServer();
config = new VLMOCRConfig();
- config.setBaseUrl(server.url("").toString().replaceAll("/+$", ""));
+ config.setBaseUrl(server.url());
config.setModel("claude-sonnet-4-20250514");
config.setPrompt("Extract all text.");
config.setMaxTokens(4096);
@@ -64,18 +61,18 @@ public class ClaudeVLMParserTest {
config.setApiKey("sk-ant-test-key");
parser = new ClaudeVLMParser(config);
+ parser.initialize();
}
@AfterEach
- void tearDown() throws Exception {
+ void tearDown() {
server.shutdown();
}
@Test
void testSuccessfulImageOcr() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildClaudeResponse("Hello from Claude!", 200, 30))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildClaudeResponse("Hello from Claude!", 200, 30)));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/png");
@@ -92,15 +89,15 @@ public class ClaudeVLMParserTest {
assertEquals("200", metadata.get(AbstractVLMParser.VLM_PROMPT_TOKENS));
assertEquals("30",
metadata.get(AbstractVLMParser.VLM_COMPLETION_TOKENS));
- RecordedRequest request = server.takeRequest();
- assertEquals("/v1/messages", request.getPath());
- assertEquals("POST", request.getMethod());
- assertEquals("2023-06-01", request.getHeader("anthropic-version"));
- assertEquals("sk-ant-test-key", request.getHeader("x-api-key"));
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ assertEquals("/v1/messages", request.path());
+ assertEquals("POST", request.method());
+ assertEquals("2023-06-01", request.header("anthropic-version"));
+ assertEquals("sk-ant-test-key", request.header("x-api-key"));
// Claude does NOT use Bearer auth
- assertNull(request.getHeader("Authorization"));
+ assertNull(request.header("authorization"));
- JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+ JsonNode body = MAPPER.readTree(request.body());
assertEquals("claude-sonnet-4-20250514", body.get("model").asText());
assertEquals(4096, body.get("max_tokens").asInt());
@@ -111,7 +108,6 @@ public class ClaudeVLMParserTest {
JsonNode parts = messages.get(0).get("content");
assertEquals(2, parts.size());
- // First part: image
JsonNode imagePart = parts.get(0);
assertEquals("image", imagePart.get("type").asText());
JsonNode source = imagePart.get("source");
@@ -119,16 +115,14 @@ public class ClaudeVLMParserTest {
assertEquals("image/png", source.get("media_type").asText());
assertNotNull(source.get("data").asText());
- // Second part: text prompt
assertEquals("text", parts.get(1).get("type").asText());
assertEquals("Extract all text.", parts.get(1).get("text").asText());
}
@Test
void testPdfSupport() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildClaudeResponse("PDF text extracted by Claude", 500, 60))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildClaudeResponse("PDF text extracted by Claude", 500, 60)));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
@@ -141,11 +135,10 @@ public class ClaudeVLMParserTest {
assertTrue(handler.toString().contains("PDF text extracted by Claude"));
- RecordedRequest request = server.takeRequest();
- JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ JsonNode body = MAPPER.readTree(request.body());
JsonNode parts = body.get("messages").get(0).get("content");
- // For PDFs, the content type should be "document" not "image"
assertEquals("document", parts.get(0).get("type").asText());
assertEquals("application/pdf",
parts.get(0).get("source").get("media_type").asText());
@@ -172,9 +165,8 @@ public class ClaudeVLMParserTest {
@Test
void testApiKeyAsXApiKeyHeader() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildClaudeResponse("ok", 10, 5))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildClaudeResponse("ok", 10, 5)));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
@@ -184,16 +176,16 @@ public class ClaudeVLMParserTest {
parser.parse(tis, new BodyContentHandler(), metadata, new
ParseContext());
}
- RecordedRequest request = server.takeRequest();
- assertEquals("sk-ant-test-key", request.getHeader("x-api-key"));
- assertNull(request.getHeader("Authorization"));
- assertEquals("2023-06-01", request.getHeader("anthropic-version"));
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ assertEquals("sk-ant-test-key", request.header("x-api-key"));
+ assertNull(request.header("authorization"));
+ assertEquals("2023-06-01", request.header("anthropic-version"));
}
@Test
void testServerError() throws Exception {
- server.enqueue(new MockResponse().setResponseCode(500)
- .setBody("{\"error\":{\"type\":\"server_error\",\"message\":\"boom\"}}"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(500,
+ "{\"error\":{\"type\":\"server_error\",\"message\":\"boom\"}}"));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/png");
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java
similarity index 81%
rename from
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java
rename to
tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java
index 05d19b747a..c54b25e1d1 100644
---
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java
+++
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java
@@ -25,15 +25,13 @@ import java.io.ByteArrayInputStream;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
-import okhttp3.mockwebserver.MockResponse;
-import okhttp3.mockwebserver.MockWebServer;
-import okhttp3.mockwebserver.RecordedRequest;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.http.TikaTestHttpServer;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -44,36 +42,38 @@ public class GeminiVLMParserTest {
private static final ObjectMapper MAPPER = new ObjectMapper();
- private MockWebServer server;
+ private TikaTestHttpServer server;
private GeminiVLMParser parser;
private VLMOCRConfig config;
@BeforeEach
void setUp() throws Exception {
- server = new MockWebServer();
- server.start();
+ server = new TikaTestHttpServer();
config = new VLMOCRConfig();
- config.setBaseUrl(server.url("").toString().replaceAll("/+$", ""));
+ config.setBaseUrl(server.url());
config.setModel("gemini-2.5-flash");
config.setPrompt("Extract all text from this document.");
config.setMaxTokens(4096);
config.setTimeoutSeconds(10);
config.setApiKey("test-gemini-key");
+ // Queue 200 for the GET /v1beta/models health check in initialize()
+ server.enqueue(new TikaTestHttpServer.MockResponse(200, "{\"models\":[]}"));
parser = new GeminiVLMParser(config);
+ parser.initialize();
+ server.clearRequests(); // discard the health-check request from the log
}
@AfterEach
- void tearDown() throws Exception {
+ void tearDown() {
server.shutdown();
}
@Test
void testSuccessfulImageOcr() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildGeminiResponse("Hello from Gemini!", 80, 15))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildGeminiResponse("Hello from Gemini!", 80, 15)));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/png");
@@ -89,13 +89,12 @@ public class GeminiVLMParserTest {
assertEquals("80", metadata.get(AbstractVLMParser.VLM_PROMPT_TOKENS));
assertEquals("15",
metadata.get(AbstractVLMParser.VLM_COMPLETION_TOKENS));
- RecordedRequest request = server.takeRequest();
-
assertTrue(request.getPath().contains("/v1beta/models/gemini-2.5-flash:generateContent"));
- assertTrue(request.getPath().contains("key=test-gemini-key"));
- assertEquals("POST", request.getMethod());
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+
assertTrue(request.path().contains("/v1beta/models/gemini-2.5-flash:generateContent"));
+ assertTrue(request.path().contains("key=test-gemini-key"));
+ assertEquals("POST", request.method());
- // Verify Gemini request format
- JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+ JsonNode body = MAPPER.readTree(request.body());
JsonNode contents = body.get("contents");
assertNotNull(contents);
assertEquals(1, contents.size());
@@ -109,21 +108,18 @@ public class GeminiVLMParserTest {
assertEquals("image/png", inlineData.get("mime_type").asText());
assertNotNull(inlineData.get("data").asText());
- // Verify generation config
assertEquals(4096,
body.get("generationConfig").get("maxOutputTokens").asInt());
}
@Test
void testPdfSupport() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildGeminiResponse("PDF content extracted", 200, 50))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildGeminiResponse("PDF content extracted", 200, 50)));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
BodyContentHandler handler = new BodyContentHandler();
- // Fake PDF bytes (starts with %PDF)
byte[] fakePdf = "%PDF-1.4 fake content".getBytes(java.nio.charset.StandardCharsets.UTF_8);
try (TikaInputStream tis = TikaInputStream.get(new
ByteArrayInputStream(fakePdf))) {
@@ -132,9 +128,10 @@ public class GeminiVLMParserTest {
assertTrue(handler.toString().contains("PDF content extracted"));
- RecordedRequest request = server.takeRequest();
- JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
- JsonNode inlineData =
body.get("contents").get(0).get("parts").get(1).get("inline_data");
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ JsonNode body = MAPPER.readTree(request.body());
+ JsonNode inlineData =
+
body.get("contents").get(0).get("parts").get(1).get("inline_data");
assertEquals("application/pdf", inlineData.get("mime_type").asText());
}
@@ -155,9 +152,8 @@ public class GeminiVLMParserTest {
@Test
void testApiKeyAsQueryParam() throws Exception {
- server.enqueue(new MockResponse()
- .setBody(buildGeminiResponse("ok", 10, 5))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildGeminiResponse("ok", 10, 5)));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
@@ -167,17 +163,17 @@ public class GeminiVLMParserTest {
parser.parse(tis, new BodyContentHandler(), metadata, new
ParseContext());
}
- RecordedRequest request = server.takeRequest();
- assertTrue(request.getPath().contains("key=test-gemini-key"),
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ assertTrue(request.path().contains("key=test-gemini-key"),
"API key should be in query params, not header");
// Gemini does NOT use Bearer auth
- assertEquals(null, request.getHeader("Authorization"));
+ assertEquals(null, request.header("authorization"));
}
@Test
void testServerError() throws Exception {
- server.enqueue(new MockResponse().setResponseCode(500)
- .setBody("{\"error\":{\"message\":\"internal\"}}"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(500,
+ "{\"error\":{\"message\":\"internal\"}}"));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/png");
@@ -199,7 +195,6 @@ public class GeminiVLMParserTest {
@Test
void testExtractResponseTextMultipleParts() throws Exception {
- // Gemini can return multiple text parts
String json = "{\"candidates\":[{\"content\":{\"parts\":["
+ "{\"text\":\"Part one\"},"
+ "{\"text\":\"Part two\"}"
@@ -220,7 +215,6 @@ public class GeminiVLMParserTest {
assertTrue(json.contains("\"data\":\"AAAA\""));
assertTrue(json.contains("\"maxOutputTokens\":4096"));
assertTrue(json.contains("Extract all text from this document."));
- // Should NOT contain OpenAI-style fields
assertTrue(!json.contains("\"messages\""));
assertTrue(!json.contains("\"max_tokens\""));
}
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java
similarity index 100%
rename from
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java
rename to
tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java
similarity index 82%
rename from
tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java
rename to
tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java
index ed573c8bab..7a00ed7176 100644
---
a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java
+++
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java
@@ -26,15 +26,13 @@ import java.io.ByteArrayInputStream;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
-import okhttp3.mockwebserver.MockResponse;
-import okhttp3.mockwebserver.MockWebServer;
-import okhttp3.mockwebserver.RecordedRequest;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.http.TikaTestHttpServer;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -44,27 +42,30 @@ public class OpenAIVLMParserTest {
private static final ObjectMapper MAPPER = new ObjectMapper();
- private MockWebServer server;
+ private TikaTestHttpServer server;
private OpenAIVLMParser parser;
private VLMOCRConfig config;
@BeforeEach
void setUp() throws Exception {
- server = new MockWebServer();
- server.start();
+ server = new TikaTestHttpServer();
config = new VLMOCRConfig();
- config.setBaseUrl(server.url("").toString().replaceAll("/+$", ""));
+ config.setBaseUrl(server.url());
config.setModel("test-model");
config.setPrompt("Extract text from this image.");
config.setMaxTokens(1024);
config.setTimeoutSeconds(10);
+ // Queue 200 for the GET /v1/models health check in initialize()
+ server.enqueue(new TikaTestHttpServer.MockResponse(200, "{\"object\":\"list\"}"));
parser = new OpenAIVLMParser(config);
+ parser.initialize();
+ server.clearRequests(); // discard the health-check request from the log
}
@AfterEach
- void tearDown() throws Exception {
+ void tearDown() {
server.shutdown();
}
@@ -72,9 +73,8 @@ public class OpenAIVLMParserTest {
void testSuccessfulOcr() throws Exception {
String ocrText = "Hello, World!\nThis is extracted text.";
- server.enqueue(new MockResponse()
- .setBody(buildChatResponse(ocrText, 100, 20))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildChatResponse(ocrText, 100, 20)));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/png");
@@ -91,11 +91,11 @@ public class OpenAIVLMParserTest {
assertEquals("100", metadata.get(AbstractVLMParser.VLM_PROMPT_TOKENS));
assertEquals("20",
metadata.get(AbstractVLMParser.VLM_COMPLETION_TOKENS));
- RecordedRequest request = server.takeRequest();
- assertEquals("/v1/chat/completions", request.getPath());
- assertEquals("POST", request.getMethod());
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ assertEquals("/v1/chat/completions", request.path());
+ assertEquals("POST", request.method());
- JsonNode body = MAPPER.readTree(request.getBody().readUtf8());
+ JsonNode body = MAPPER.readTree(request.body());
assertEquals("test-model", body.get("model").asText());
assertEquals(1024, body.get("max_tokens").asInt());
@@ -112,8 +112,7 @@ public class OpenAIVLMParserTest {
@Test
void testServerError() throws Exception {
- server.enqueue(new MockResponse().setResponseCode(500)
- .setBody("{\"error\":\"boom\"}"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(500, "{\"error\":\"boom\"}"));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/png");
@@ -164,9 +163,8 @@ public class OpenAIVLMParserTest {
config.setApiKey("sk-test-key");
parser = new OpenAIVLMParser(config);
- server.enqueue(new MockResponse()
- .setBody(buildChatResponse("text", 10, 5))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildChatResponse("text", 10, 5)));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
@@ -176,7 +174,7 @@ public class OpenAIVLMParserTest {
parser.parse(tis, new BodyContentHandler(), metadata, new
ParseContext());
}
- assertEquals("Bearer sk-test-key",
server.takeRequest().getHeader("Authorization"));
+ assertEquals("Bearer sk-test-key",
server.takeRequest().header("authorization"));
}
@Test
@@ -185,11 +183,11 @@ public class OpenAIVLMParserTest {
parser = new OpenAIVLMParser(config);
parser.setApiKeyHeaderName("api-key");
parser.setApiKeyPrefix("");
-
parser.setCompletionsPath("/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01");
+ parser.setCompletionsPath(
+
"/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01");
- server.enqueue(new MockResponse()
- .setBody(buildChatResponse("text", 10, 5))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildChatResponse("text", 10, 5)));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
@@ -199,10 +197,10 @@ public class OpenAIVLMParserTest {
parser.parse(tis, new BodyContentHandler(), metadata, new
ParseContext());
}
- var request = server.takeRequest();
- assertEquals("azure-key-123", request.getHeader("api-key"));
- assertNull(request.getHeader("Authorization"));
- assertTrue(request.getPath().startsWith(
+ TikaTestHttpServer.RecordedRequest request = server.takeRequest();
+ assertEquals("azure-key-123", request.header("api-key"));
+ assertNull(request.header("authorization"));
+ assertTrue(request.path().startsWith(
"/openai/deployments/gpt-4o/chat/completions"));
}
@@ -220,15 +218,14 @@ public class OpenAIVLMParserTest {
@Test
void testPerRequestConfigOverride() throws Exception {
VLMOCRConfig override = new VLMOCRConfig();
- override.setBaseUrl(server.url("").toString().replaceAll("/+$", ""));
+ override.setBaseUrl(server.url());
override.setModel("override-model");
override.setPrompt("Custom.");
override.setMaxTokens(2048);
override.setTimeoutSeconds(10);
- server.enqueue(new MockResponse()
- .setBody(buildChatResponse("ok", 10, 5))
- .setHeader("Content-Type", "application/json"));
+ server.enqueue(new TikaTestHttpServer.MockResponse(200,
+ buildChatResponse("ok", 10, 5)));
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "image/png");
@@ -240,7 +237,7 @@ public class OpenAIVLMParserTest {
parser.parse(tis, new BodyContentHandler(), metadata, ctx);
}
- JsonNode body =
MAPPER.readTree(server.takeRequest().getBody().readUtf8());
+ JsonNode body = MAPPER.readTree(server.takeRequest().body());
assertEquals("override-model", body.get("model").asText());
assertEquals(2048, body.get("max_tokens").asInt());
}
diff --git a/tika-server/tika-server-standard/pom.xml
b/tika-server/tika-server-standard/pom.xml
index 143f16d251..4c6a92227b 100644
--- a/tika-server/tika-server-standard/pom.xml
+++ b/tika-server/tika-server-standard/pom.xml
@@ -49,6 +49,18 @@
</exclusion>
</exclusions>
</dependency>
+ <!-- inference: text embeddings and CLIP image embeddings -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-inference</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <!-- VLM: OpenAI-compatible, Gemini, and Claude vision parsers -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-vlm</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-handler-boilerpipe</artifactId>