This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch refactor-inference in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3a91082f950d273d2170dc26c631226cf5a57252 Author: tballison <[email protected]> AuthorDate: Thu Feb 26 15:09:32 2026 -0500 refactor inference and fix bug in ESEmitter --- tika-app/pom.xml | 12 + tika-parsers/pom.xml | 2 + .../tika-inference => tika-http-jdk}/pom.xml | 74 ++---- .../java/org/apache/tika/http/TikaHttpClient.java | 150 ++++++++++++ .../org/apache/tika/http/TikaTestHttpServer.java | 256 +++++++++++++++++++++ tika-parsers/tika-parsers-ml/pom.xml | 2 +- .../tika-parsers-ml/tika-inference/pom.xml | 48 +--- .../tika/inference/OpenAIEmbeddingFilter.java | 64 +----- .../tika/inference/OpenAIImageEmbeddingParser.java | 75 ++---- .../apache/tika/inference/VectorSerializer.java | 18 +- .../tika/inference/OpenAIEmbeddingFilterTest.java | 52 ++--- .../inference/OpenAIImageEmbeddingParserTest.java | 110 +++------ .../pom.xml | 51 ++-- .../apache/tika/parser/vlm/AbstractVLMParser.java | 96 +++----- .../apache/tika/parser/vlm/ClaudeVLMParser.java | 21 +- .../apache/tika/parser/vlm/GeminiVLMParser.java | 14 +- .../tika/parser/vlm/MarkdownToXHTMLEmitter.java | 0 .../apache/tika/parser/vlm/OpenAIVLMParser.java | 19 +- .../org/apache/tika/parser/vlm/VLMOCRConfig.java | 0 .../tika/parser/vlm/ClaudeVLMParserTest.java | 61 +++-- .../tika/parser/vlm/GeminiVLMParserTest.java | 60 ++--- .../parser/vlm/MarkdownToXHTMLEmitterTest.java | 0 .../tika/parser/vlm/OpenAIVLMParserTest.java | 61 +++-- tika-server/tika-server-standard/pom.xml | 12 + 24 files changed, 706 insertions(+), 552 deletions(-) diff --git a/tika-app/pom.xml b/tika-app/pom.xml index 3d117bc17a..5fad464dc3 100644 --- a/tika-app/pom.xml +++ b/tika-app/pom.xml @@ -40,6 +40,18 @@ <artifactId>tika-parsers-standard-package</artifactId> <version>${project.version}</version> </dependency> + <!-- inference: text embeddings and CLIP image embeddings --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-inference</artifactId> + <version>${project.version}</version> + </dependency> + <!-- VLM: OpenAI-compatible, Gemini, and Claude vision parsers --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-vlm</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-handler-boilerpipe</artifactId> diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 47ee4a1a32..9c63396976 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -32,6 +32,8 @@ <packaging>pom</packaging> <modules> + <!-- shared JDK HTTP client for parser modules (zero runtime deps) --> + <module>tika-http-jdk</module> <!-- basic parsers - avoid network dependent parsers, native code --> <module>tika-parsers-standard</module> <!-- allow network dependent parsers and native code --> diff --git a/tika-parsers/tika-parsers-ml/tika-inference/pom.xml b/tika-parsers/tika-http-jdk/pom.xml similarity index 52% copy from tika-parsers/tika-parsers-ml/tika-inference/pom.xml copy to tika-parsers/tika-http-jdk/pom.xml index f56dcc19ad..1b3fa2e405 100644 --- a/tika-parsers/tika-parsers-ml/tika-inference/pom.xml +++ b/tika-parsers/tika-http-jdk/pom.xml @@ -23,64 +23,31 @@ <modelVersion>4.0.0</modelVersion> <parent> - <artifactId>tika-parsers-ml</artifactId> + <artifactId>tika-parsers</artifactId> <groupId>org.apache.tika</groupId> <version>${revision}</version> </parent> - <artifactId>tika-inference</artifactId> - <name>Apache Tika inference module</name> - - <properties> - <okhttp.version>5.3.2</okhttp.version> - </properties> - - <dependencyManagement> - <dependencies> - <!-- align Kotlin stdlib versions pulled by OkHttp and Okio --> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-jdk8</artifactId> - <version>2.3.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib</artifactId> - <version>2.3.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-jdk7</artifactId> - <version>2.3.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-common</artifactId> - <version>2.3.10</version> - </dependency> - </dependencies> - </dependencyManagement> + <artifactId>tika-http-jdk</artifactId> + <name>Apache Tika JDK HTTP client</name> + <description> + Thin wrapper around java.net.http.HttpClient for use by Tika parser + modules that make outbound REST calls (embedding APIs, VLM endpoints, + etc.). Has zero runtime dependencies beyond tika-core and the JDK. + </description> <dependencies> <dependency> - <groupId>com.squareup.okhttp3</groupId> - <artifactId>okhttp-jvm</artifactId> - <version>${okhttp.version}</version> - </dependency> - <dependency> - <groupId>com.fasterxml.jackson.core</groupId> - <artifactId>jackson-databind</artifactId> - </dependency> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <scope>provided</scope> </dependency> <!-- test --> <dependency> - <groupId>com.squareup.okhttp3</groupId> - <artifactId>mockwebserver</artifactId> - <version>${okhttp.version}</version> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter</artifactId> <scope>test</scope> </dependency> </dependencies> @@ -93,17 +60,24 @@ <configuration> <archive> <manifestEntries> - <Automatic-Module-Name>org.apache.tika.inference</Automatic-Module-Name> + <Automatic-Module-Name>org.apache.tika.http</Automatic-Module-Name> </manifestEntries> </archive> </configuration> + <executions> + <execution> + <goals> + <goal>test-jar</goal> + </goals> + </execution> + </executions> </plugin> <plugin> <groupId>org.apache.rat</groupId> <artifactId>apache-rat-plugin</artifactId> <configuration> <inputExcludes> - <inputExclude>src/test/resources/test-documents/**</inputExclude> + <inputExclude>src/test/resources/**</inputExclude> </inputExcludes> </configuration> </plugin> @@ -111,6 +85,6 @@ </build> <scm> - <tag>3.0.0-rc1</tag> + <tag>HEAD</tag> </scm> </project> diff --git a/tika-parsers/tika-http-jdk/src/main/java/org/apache/tika/http/TikaHttpClient.java b/tika-parsers/tika-http-jdk/src/main/java/org/apache/tika/http/TikaHttpClient.java new file mode 100644 index 0000000000..ce5418a4f4 --- /dev/null +++ b/tika-parsers/tika-http-jdk/src/main/java/org/apache/tika/http/TikaHttpClient.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.http; + +import java.io.Closeable; +import java.io.IOException; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import org.apache.tika.exception.TikaException; + +/** + * Lightweight HTTP client for Tika parser modules that call external REST + * endpoints (embedding APIs, VLM services, etc.). + * <p> + * Built on {@link java.net.http.HttpClient} with a daemon thread executor + * so the JVM — including forked {@code PipesServer} processes — shuts down + * cleanly without waiting for idle HTTP threads. + * <p> + * This class has no runtime dependencies beyond the JDK and {@code tika-core}. + * Obtain an instance via {@link #build(int)} and close it when done to release + * the underlying executor. + * + * @since Apache Tika 4.0 + */ +public class TikaHttpClient implements Closeable { + + private static final String JSON_CONTENT_TYPE = "application/json; charset=utf-8"; + + private final HttpClient httpClient; + private final ExecutorService executor; + private final int defaultTimeoutSeconds; + + private TikaHttpClient(HttpClient httpClient, ExecutorService executor, + int defaultTimeoutSeconds) { + this.httpClient = httpClient; + this.executor = executor; + this.defaultTimeoutSeconds = defaultTimeoutSeconds; + } + + /** + * Create a new {@code TikaHttpClient} with a daemon-thread executor. + * + * @param connectTimeoutSeconds TCP connection timeout in seconds + */ + public static TikaHttpClient build(int connectTimeoutSeconds) { + ExecutorService executor = Executors.newCachedThreadPool(r -> { + Thread t = new Thread(r, "tika-http-jdk"); + t.setDaemon(true); + return t; + }); + HttpClient client = HttpClient.newBuilder() + .executor(executor) + .connectTimeout(Duration.ofSeconds(connectTimeoutSeconds)) + .followRedirects(HttpClient.Redirect.NORMAL) + .version(HttpClient.Version.HTTP_1_1) + .build(); + return new TikaHttpClient(client, executor, connectTimeoutSeconds); + } + + /** + * POST a JSON body to {@code url} and return the response body as a string. + * + * @param url target URL + * @param jsonBody request body (UTF-8 JSON) + * @param headers additional HTTP headers (e.g. {@code Authorization}) + * @param timeoutSeconds read timeout; {@code 0} uses the default timeout + * @return response body string + * @throws IOException on network error + * @throws TikaException on non-2xx HTTP status + */ + public String postJson(String url, String jsonBody, Map<String, String> headers, + int timeoutSeconds) throws IOException, TikaException { + HttpRequest.Builder builder = HttpRequest.newBuilder() + .uri(URI.create(url)) + .timeout(Duration.ofSeconds(timeoutSeconds > 0 + ? timeoutSeconds : defaultTimeoutSeconds)) + .header("Content-Type", JSON_CONTENT_TYPE) + .POST(HttpRequest.BodyPublishers.ofString(jsonBody, StandardCharsets.UTF_8)); + + headers.forEach(builder::header); + + return send(builder.build()); + } + + /** + * GET {@code url} and return the response body as a string. + * Useful for health-check probes at init time. + * + * @param url target URL + * @param headers additional HTTP headers + * @param timeoutSeconds read timeout; {@code 0} uses the default timeout + * @return response body string + * @throws IOException on network error + * @throws TikaException on non-2xx HTTP status + */ + public String get(String url, Map<String, String> headers, + int timeoutSeconds) throws IOException, TikaException { + HttpRequest.Builder builder = HttpRequest.newBuilder() + .uri(URI.create(url)) + .timeout(Duration.ofSeconds(timeoutSeconds > 0 + ? timeoutSeconds : defaultTimeoutSeconds)) + .GET(); + + headers.forEach(builder::header); + + return send(builder.build()); + } + + private String send(HttpRequest request) throws IOException, TikaException { + try { + HttpResponse<String> response = httpClient.send( + request, HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + if (response.statusCode() < 200 || response.statusCode() >= 300) { + throw new TikaException("HTTP " + response.statusCode() + + " from " + request.uri() + ": " + response.body()); + } + return response.body(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException("HTTP request interrupted: " + request.uri(), e); + } + } + + @Override + public void close() { + executor.shutdown(); + } +} diff --git a/tika-parsers/tika-http-jdk/src/test/java/org/apache/tika/http/TikaTestHttpServer.java b/tika-parsers/tika-http-jdk/src/test/java/org/apache/tika/http/TikaTestHttpServer.java new file mode 100644 index 0000000000..b42920cecd --- /dev/null +++ b/tika-parsers/tika-http-jdk/src/test/java/org/apache/tika/http/TikaTestHttpServer.java @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.http; + +import java.io.BufferedReader; +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.net.ServerSocket; +import java.net.Socket; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Minimal mock HTTP/1.1 server for unit tests, backed by a plain + * {@link ServerSocket}. Has no dependencies outside the JDK. + * <p> + * Drop-in replacement for OkHttp's {@code MockWebServer} in Tika unit tests. + * <p> + * Usage: + * <pre>{@code + * try (TikaTestHttpServer server = new TikaTestHttpServer()) { + * server.enqueue(new MockResponse(200, "{\"data\":[]}")); + * // configure code under test to use server.url() + * RecordedRequest req = server.takeRequest(); + * assertEquals("POST", req.method()); + * } + * }</pre> + * + * @since Apache Tika 4.0 + */ +public class TikaTestHttpServer implements Closeable { + + /** A pre-programmed response to return for the next incoming request. */ + public record MockResponse(int status, String body) {} + + /** A captured incoming HTTP request. */ + public record RecordedRequest(String method, String path, + Map<String, String> headers, String body) { + /** Returns the header value for {@code name} (case-insensitive), or {@code null}. */ + public String header(String name) { + return headers.get(name.toLowerCase(java.util.Locale.ROOT)); + } + } + + private final ServerSocket serverSocket; + private final ExecutorService executor; + private final BlockingQueue<MockResponse> responses = new LinkedBlockingQueue<>(); + private final BlockingQueue<RecordedRequest> requests = new LinkedBlockingQueue<>(); + private final AtomicInteger requestCount = new AtomicInteger(0); + private volatile boolean running = true; + + public TikaTestHttpServer() throws IOException { + serverSocket = new ServerSocket(0); + executor = Executors.newCachedThreadPool(r -> { + Thread t = new Thread(r, "tika-test-http"); + t.setDaemon(true); + return t; + }); + executor.submit(this::acceptLoop); + } + + private void acceptLoop() { + while (running) { + try { + Socket socket = serverSocket.accept(); + executor.submit(() -> handleConnection(socket)); + } catch (IOException e) { + if (running) { + // unexpected error while accepting + Thread.currentThread().interrupt(); + } + } + } + } + + private void handleConnection(Socket socket) { + try (socket) { + // Use a single BufferedReader for the entire connection — reading + // body via the same reader avoids the buffered-read-ahead pitfall + // where a raw InputStream read would miss bytes already buffered. + // Body content is JSON (UTF-8/ASCII), so char-level reading is safe. + BufferedReader reader = new BufferedReader( + new InputStreamReader(socket.getInputStream(), + StandardCharsets.ISO_8859_1)); + OutputStream out = socket.getOutputStream(); + + // Parse request line: METHOD path HTTP/1.x + String requestLine = reader.readLine(); + if (requestLine == null || requestLine.isBlank()) { + return; + } + String[] parts = requestLine.split(" ", 3); + String method = parts[0]; + String path = parts.length > 1 ? parts[1] : "/"; + + // Parse headers + Map<String, String> headers = new HashMap<>(); + int contentLength = 0; + String line; + while ((line = reader.readLine()) != null && !line.isEmpty()) { + int colon = line.indexOf(':'); + if (colon > 0) { + String name = line.substring(0, colon).trim() + .toLowerCase(java.util.Locale.ROOT); + String value = line.substring(colon + 1).trim(); + headers.put(name, value); + if ("content-length".equals(name)) { + try { + contentLength = Integer.parseInt(value); + } catch (NumberFormatException ignored) { + // ignore + } + } + } + } + + // Read body through the same BufferedReader to avoid consuming bytes + // from the underlying stream that are already buffered in the reader. + String body = ""; + String transferEncoding = headers.getOrDefault("transfer-encoding", ""); + if (transferEncoding.toLowerCase(java.util.Locale.ROOT).contains("chunked")) { + body = readChunkedFromReader(reader); + } else if (contentLength > 0) { + char[] bodyChars = new char[contentLength]; + int read = 0; + while (read < contentLength) { + int n = reader.read(bodyChars, read, contentLength - read); + if (n < 0) { + break; + } + read += n; + } + body = new String(bodyChars, 0, read); + } + + requests.add(new RecordedRequest(method, path, headers, body)); + requestCount.incrementAndGet(); + + // Send response + MockResponse resp = responses.poll(); + if (resp == null) { + resp = new MockResponse(500, "{\"error\":\"no response queued\"}"); + } + + byte[] responseBytes = resp.body().getBytes(StandardCharsets.UTF_8); + String statusText = resp.status() == 200 ? "OK" + : resp.status() == 500 ? "Internal Server Error" + : String.valueOf(resp.status()); + String responseHeaders = + "HTTP/1.1 " + resp.status() + " " + statusText + "\r\n" + + "Content-Type: application/json\r\n" + + "Content-Length: " + responseBytes.length + "\r\n" + + "Connection: close\r\n" + + "\r\n"; + out.write(responseHeaders.getBytes(StandardCharsets.US_ASCII)); + out.write(responseBytes); + out.flush(); + } catch (IOException e) { + // connection closed or error; ignore in test context + } + } + + private static String readChunkedFromReader(BufferedReader reader) throws IOException { + StringBuilder sb = new StringBuilder(); + String sizeLine; + while ((sizeLine = reader.readLine()) != null) { + // strip any chunk extensions (e.g. "4;ext=val") + int semicolon = sizeLine.indexOf(';'); + String hexSize = semicolon >= 0 ? sizeLine.substring(0, semicolon) : sizeLine; + int chunkSize = Integer.parseInt(hexSize.trim(), 16); + if (chunkSize == 0) { + reader.readLine(); // consume trailing empty line + break; + } + char[] chunk = new char[chunkSize]; + int read = 0; + while (read < chunkSize) { + int n = reader.read(chunk, read, chunkSize - read); + if (n < 0) { + break; + } + read += n; + } + sb.append(chunk, 0, read); + reader.readLine(); // consume CRLF after chunk data + } + return sb.toString(); + } + + /** Queue a response to return for the next request. */ + public void enqueue(MockResponse response) { + responses.add(response); + } + + /** + * Retrieves and removes the earliest recorded request, waiting up to + * 5 seconds if necessary. + * + * @return the recorded request, or {@code null} if no request arrived + * within the timeout + */ + public RecordedRequest takeRequest() throws InterruptedException { + return requests.poll(5, TimeUnit.SECONDS); + } + + /** + * Returns the total number of requests received so far + * (including those already consumed by {@link #takeRequest()}). + */ + public int getRequestCount() { + return requestCount.get(); + } + + /** Returns the base URL (e.g. {@code http://localhost:54321}) with no trailing slash. */ + public String url() { + return "http://localhost:" + serverSocket.getLocalPort(); + } + + public void shutdown() { + running = false; + try { + serverSocket.close(); + } catch (IOException ignored) { + // ignore + } + executor.shutdownNow(); + } + + @Override + public void close() { + shutdown(); + } +} diff --git a/tika-parsers/tika-parsers-ml/pom.xml b/tika-parsers/tika-parsers-ml/pom.xml index 5a508fe2a7..028346e226 100644 --- a/tika-parsers/tika-parsers-ml/pom.xml +++ b/tika-parsers/tika-parsers-ml/pom.xml @@ -36,7 +36,7 @@ <module>tika-parser-nlp-module</module> <module>tika-parser-nlp-package</module> <module>tika-inference</module> - <module>tika-parser-vlm-ocr-module</module> + <module>tika-vlm</module> <module>tika-parser-tess4j-module</module> <module>tika-transcribe-aws</module> </modules> diff --git a/tika-parsers/tika-parsers-ml/tika-inference/pom.xml b/tika-parsers/tika-parsers-ml/tika-inference/pom.xml index f56dcc19ad..17db66625b 100644 --- a/tika-parsers/tika-parsers-ml/tika-inference/pom.xml +++ b/tika-parsers/tika-parsers-ml/tika-inference/pom.xml @@ -31,41 +31,11 @@ <artifactId>tika-inference</artifactId> <name>Apache Tika inference module</name> - <properties> - <okhttp.version>5.3.2</okhttp.version> - </properties> - - <dependencyManagement> - <dependencies> - <!-- align Kotlin stdlib versions pulled by OkHttp and Okio --> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-jdk8</artifactId> - <version>2.3.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib</artifactId> - <version>2.3.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-jdk7</artifactId> - <version>2.3.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-common</artifactId> - <version>2.3.10</version> - </dependency> - </dependencies> - </dependencyManagement> - <dependencies> <dependency> - <groupId>com.squareup.okhttp3</groupId> - <artifactId>okhttp-jvm</artifactId> - <version>${okhttp.version}</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-http-jdk</artifactId> + <version>${project.version}</version> </dependency> <dependency> <groupId>com.fasterxml.jackson.core</groupId> @@ -78,9 +48,15 @@ <!-- test --> <dependency> - <groupId>com.squareup.okhttp3</groupId> - <artifactId>mockwebserver</artifactId> - <version>${okhttp.version}</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-http-jdk</artifactId> + <version>${project.version}</version> + <classifier>tests</classifier> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter</artifactId> <scope>test</scope> </dependency> </dependencies> diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIEmbeddingFilter.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIEmbeddingFilter.java index 7a841ea904..398e75e5a6 100644 --- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIEmbeddingFilter.java +++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIEmbeddingFilter.java @@ -17,21 +17,18 @@ package org.apache.tika.inference; import java.io.IOException; +import java.util.HashMap; import java.util.List; -import java.util.concurrent.TimeUnit; +import java.util.Map; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; -import okhttp3.MediaType; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import okhttp3.RequestBody; -import okhttp3.Response; import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; +import org.apache.tika.http.TikaHttpClient; import org.apache.tika.utils.StringUtils; /** @@ -54,10 +51,7 @@ public class OpenAIEmbeddingFilter extends AbstractEmbeddingFilter { private static final ObjectMapper MAPPER = new ObjectMapper(); - private static final MediaType JSON_MEDIA_TYPE = - MediaType.parse("application/json; charset=utf-8"); - - private transient OkHttpClient httpClient; + private transient TikaHttpClient httpClient; /** * URL path appended to {@code baseUrl} for embeddings requests. @@ -74,12 +68,12 @@ public class OpenAIEmbeddingFilter extends AbstractEmbeddingFilter { public OpenAIEmbeddingFilter() { super(); - buildHttpClient(); + this.httpClient = TikaHttpClient.build(30); } public OpenAIEmbeddingFilter(InferenceConfig config) { super(config); - buildHttpClient(); + this.httpClient = TikaHttpClient.build(30); } @Override @@ -90,33 +84,17 @@ public class OpenAIEmbeddingFilter extends AbstractEmbeddingFilter { return; } - // Build the request with all chunk texts in one batch String requestJson = buildRequest(chunks, config); String url = config.getBaseUrl().replaceAll("/+$", "") + embeddingsPath; - Request.Builder builder = new Request.Builder() - .url(url) - .post(RequestBody.create(requestJson, JSON_MEDIA_TYPE)); - + Map<String, String> headers = new HashMap<>(); if (!StringUtils.isBlank(config.getApiKey())) { - builder.header(apiKeyHeaderName, apiKeyPrefix + config.getApiKey()); + headers.put(apiKeyHeaderName, apiKeyPrefix + config.getApiKey()); } - OkHttpClient client = getClientWithTimeout(config); - - try (Response response = client.newCall(builder.build()).execute()) { - if (!response.isSuccessful()) { - String body = response.body() != null - ? response.body().string() : ""; - throw new TikaException( - "Embedding request failed with HTTP " - + response.code() + ": " + body); - } - - String responseBody = response.body() != null - ? response.body().string() : ""; - parseResponse(responseBody, chunks); - } + String responseBody = httpClient.postJson(url, requestJson, headers, + config.getTimeoutSeconds()); + parseResponse(responseBody, chunks); } String buildRequest(List<Chunk> chunks, InferenceConfig config) { @@ -164,26 +142,6 @@ public class OpenAIEmbeddingFilter extends AbstractEmbeddingFilter { } } - private void buildHttpClient() { - int timeout = getDefaultConfig().getTimeoutSeconds(); - httpClient = new OkHttpClient.Builder() - .connectTimeout(30, TimeUnit.SECONDS) - .readTimeout(timeout, TimeUnit.SECONDS) - .writeTimeout(60, TimeUnit.SECONDS) - .build(); - } - - private OkHttpClient getClientWithTimeout(InferenceConfig config) { - long defaultMs = getDefaultConfig().getTimeoutSeconds() * 1000L; - long requestMs = config.getTimeoutSeconds() * 1000L; - if (requestMs == defaultMs) { - return httpClient; - } - return httpClient.newBuilder() - .readTimeout(requestMs, TimeUnit.MILLISECONDS) - .build(); - } - // ---- Azure / endpoint config getters/setters ---------------------------- public String getEmbeddingsPath() { diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java index 44d3808136..2ab6b70d5c 100644 --- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java +++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java @@ -20,19 +20,16 @@ import java.io.IOException; import java.util.Arrays; import java.util.Base64; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; -import java.util.concurrent.TimeUnit; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import okhttp3.RequestBody; -import okhttp3.Response; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; @@ -46,6 +43,7 @@ import org.apache.tika.config.TikaComponent; import org.apache.tika.config.TikaTaskTimeout; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; +import org.apache.tika.http.TikaHttpClient; import org.apache.tika.inference.locator.Locators; import org.apache.tika.inference.locator.PaginatedLocator; import org.apache.tika.io.TikaInputStream; @@ -105,11 +103,8 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable { private static final ObjectMapper MAPPER = new ObjectMapper(); - private static final okhttp3.MediaType JSON_MEDIA_TYPE = - okhttp3.MediaType.parse("application/json; charset=utf-8"); - private ImageEmbeddingConfig defaultConfig; - private transient OkHttpClient httpClient; + private transient TikaHttpClient httpClient; /** URL path for embeddings requests. Default: {@code /v1/embeddings}. */ private String embeddingsPath = "/v1/embeddings"; @@ -126,7 +121,7 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable { public OpenAIImageEmbeddingParser(ImageEmbeddingConfig config) { this.defaultConfig = config; - buildHttpClient(); + this.httpClient = TikaHttpClient.build(30); } public OpenAIImageEmbeddingParser(JsonConfig jsonConfig) { @@ -167,20 +162,16 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable { long timeoutMillis = TikaTaskTimeout.getTimeoutMillis( parseContext, config.getTimeoutSeconds() * 1000L); + int timeoutSeconds = (int) (timeoutMillis / 1000L); - float[] vector = callEmbeddingEndpoint(config, mimeType, base64Data, - timeoutMillis); + float[] vector = callEmbeddingEndpoint(config, mimeType, base64Data, timeoutSeconds); - // Build a Chunk with the vector and locators Locators locators = buildLocators(metadata); Chunk chunk = new Chunk(null, locators); chunk.setVector(vector); - // Merge into the canonical chunks field so image embeddings - // coexist with text chunks in a single array ChunkSerializer.mergeInto(metadata, List.of(chunk)); - // Emit an empty document -- this parser produces vectors, not text XHTMLContentHandler xhtml = new XHTMLContentHandler( handler, metadata, parseContext); xhtml.startDocument(); @@ -191,42 +182,26 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable { @Override public void initialize() throws TikaConfigException { - buildHttpClient(); + this.httpClient = TikaHttpClient.build(30); } // ---- internals -------------------------------------------------------- float[] callEmbeddingEndpoint(ImageEmbeddingConfig config, String mimeType, String base64Data, - long timeoutMillis) + int timeoutSeconds) throws IOException, TikaException { String requestJson = buildRequest(config, mimeType, base64Data); String url = config.getBaseUrl().replaceAll("/+$", "") + embeddingsPath; - Request.Builder builder = new Request.Builder() - .url(url) - .post(RequestBody.create(requestJson, JSON_MEDIA_TYPE)); - + Map<String, String> headers = new HashMap<>(); if (!StringUtils.isBlank(config.getApiKey())) { - builder.header(apiKeyHeaderName, apiKeyPrefix + config.getApiKey()); + headers.put(apiKeyHeaderName, apiKeyPrefix + config.getApiKey()); } - OkHttpClient client = getClientWithTimeout(timeoutMillis); - - try (Response response = client.newCall(builder.build()).execute()) { - if (!response.isSuccessful()) { - String body = response.body() != null - ? response.body().string() : ""; - throw new TikaException( - "Image embedding request failed with HTTP " - + response.code() + ": " + body); - } - - String responseBody = response.body() != null - ? response.body().string() : ""; - return parseResponse(responseBody); - } + String responseBody = httpClient.postJson(url, requestJson, headers, timeoutSeconds); + return parseResponse(responseBody); } String buildRequest(ImageEmbeddingConfig config, String mimeType, @@ -273,8 +248,6 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable { Locators buildLocators(Metadata metadata) { Locators locators = new Locators(); - // If we have page number metadata (from PDF rendering), create - // a PaginatedLocator String pageStr = metadata.get(TikaPagedText.PAGE_NUMBER); if (pageStr != null) { try { @@ -303,8 +276,6 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable { throws TikaConfigException, IOException { String key = "openai-image-embedding-parser"; if (parseContext.hasJsonConfig(key)) { - // Deserialize into RuntimeConfig which prevents overriding - // security-sensitive fields (baseUrl, apiKey) at parse time ImageEmbeddingConfig.RuntimeConfig runtimeConfig = ParseContextConfig.getConfig( parseContext, key, @@ -315,7 +286,6 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable { return runtimeConfig; } - // Merge runtime overrides with the init-time defaults return ParseContextConfig.getConfig( parseContext, key, ImageEmbeddingConfig.class, defaultConfig); @@ -323,25 +293,6 @@ public class OpenAIImageEmbeddingParser implements Parser, Initializable { return defaultConfig; } - private void buildHttpClient() { - httpClient = new OkHttpClient.Builder() - .connectTimeout(30, TimeUnit.SECONDS) - .readTimeout(defaultConfig.getTimeoutSeconds(), - TimeUnit.SECONDS) - .writeTimeout(60, TimeUnit.SECONDS) - .build(); - } - - private OkHttpClient getClientWithTimeout(long timeoutMillis) { - long defaultMs = defaultConfig.getTimeoutSeconds() * 1000L; - if (timeoutMillis == defaultMs) { - return httpClient; - } - return httpClient.newBuilder() - .readTimeout(timeoutMillis, TimeUnit.MILLISECONDS) - .build(); - } - // ---- delegating config getters/setters -------------------------------- public String getBaseUrl() { diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/VectorSerializer.java b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/VectorSerializer.java index f350fee123..6964365f0f 100644 --- a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/VectorSerializer.java +++ b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/VectorSerializer.java @@ -17,14 +17,13 @@ package org.apache.tika.inference; import java.nio.ByteBuffer; -import java.nio.ByteOrder; import java.nio.FloatBuffer; import java.util.Base64; /** - * Serializes and deserializes float vectors as base64-encoded little-endian - * float32 byte arrays. Little-endian matches numpy/PyTorch convention so - * vectors from Python inference servers round-trip cleanly. + * Serializes and deserializes float vectors as base64-encoded big-endian + * float32 byte arrays. Big-endian (network byte order) matches what + * Elasticsearch expects when ingesting base64-encoded dense vectors. */ public final class VectorSerializer { @@ -32,23 +31,20 @@ public final class VectorSerializer { } /** - * Encode a float array as a base64 string (little-endian float32). + * Encode a float array as a base64 string (big-endian float32). */ public static String encode(float[] vector) { - ByteBuffer buf = ByteBuffer.allocate(vector.length * Float.BYTES) - .order(ByteOrder.LITTLE_ENDIAN); + ByteBuffer buf = ByteBuffer.allocate(vector.length * Float.BYTES); buf.asFloatBuffer().put(vector); return Base64.getEncoder().encodeToString(buf.array()); } /** - * Decode a base64 string back to a float array (little-endian float32). + * Decode a base64 string back to a float array (big-endian float32). */ public static float[] decode(String base64) { byte[] bytes = Base64.getDecoder().decode(base64); - FloatBuffer fb = ByteBuffer.wrap(bytes) - .order(ByteOrder.LITTLE_ENDIAN) - .asFloatBuffer(); + FloatBuffer fb = ByteBuffer.wrap(bytes).asFloatBuffer(); float[] vector = new float[fb.remaining()]; fb.get(vector); return vector; diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIEmbeddingFilterTest.java b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIEmbeddingFilterTest.java index b90e70b0c0..c576438f4b 100644 --- a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIEmbeddingFilterTest.java +++ b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIEmbeddingFilterTest.java @@ -27,31 +27,28 @@ import java.util.List; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import okhttp3.mockwebserver.MockResponse; -import okhttp3.mockwebserver.MockWebServer; -import okhttp3.mockwebserver.RecordedRequest; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.apache.tika.exception.TikaException; +import org.apache.tika.http.TikaTestHttpServer; import org.apache.tika.metadata.Metadata; public class OpenAIEmbeddingFilterTest { private static final ObjectMapper MAPPER = new ObjectMapper(); - private MockWebServer server; + private TikaTestHttpServer server; private OpenAIEmbeddingFilter filter; private InferenceConfig config; @BeforeEach void setUp() throws Exception { - server = new MockWebServer(); - server.start(); + server = new TikaTestHttpServer(); config = new InferenceConfig(); - config.setBaseUrl(server.url("").toString().replaceAll("/+$", "")); + config.setBaseUrl(server.url()); config.setModel("text-embedding-3-small"); config.setMaxChunkChars(500); config.setOverlapChars(0); @@ -61,7 +58,7 @@ public class OpenAIEmbeddingFilterTest { } @AfterEach - void tearDown() throws Exception { + void tearDown() { server.shutdown(); } @@ -70,10 +67,8 @@ public class OpenAIEmbeddingFilterTest { String content = "# Section A\n\nSome text about section A.\n\n" + "# Section B\n\nSome text about section B."; - // Mock embeddings response with 2 vectors (3 dims each) - server.enqueue(new MockResponse() - .setBody(buildEmbeddingResponse(2, 3)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(2, 3))); Metadata metadata = new Metadata(); metadata.set("tika:content", content); @@ -94,10 +89,9 @@ public class OpenAIEmbeddingFilterTest { assertNotNull(chunks.get(1).getVector()); assertEquals(3, chunks.get(0).getVector().length); - // Verify the request - RecordedRequest request = server.takeRequest(); - assertEquals("/v1/embeddings", request.getPath()); - JsonNode body = MAPPER.readTree(request.getBody().readUtf8()); + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + assertEquals("/v1/embeddings", request.path()); + JsonNode body = MAPPER.readTree(request.body()); assertEquals("text-embedding-3-small", body.get("model").asText()); assertEquals(2, body.get("input").size()); } @@ -107,9 +101,8 @@ public class OpenAIEmbeddingFilterTest { config.setApiKey("sk-test-key"); filter = new OpenAIEmbeddingFilter(config); - server.enqueue(new MockResponse() - .setBody(buildEmbeddingResponse(1, 3)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(1, 3))); Metadata metadata = new Metadata(); metadata.set("tika:content", "Some text."); @@ -118,7 +111,7 @@ public class OpenAIEmbeddingFilterTest { filter.filter(list); assertEquals("Bearer sk-test-key", - server.takeRequest().getHeader("Authorization")); + server.takeRequest().header("authorization")); } @Test @@ -147,8 +140,7 @@ public class OpenAIEmbeddingFilterTest { @Test void testServerError() { - server.enqueue(new MockResponse().setResponseCode(500) - .setBody("{\"error\":\"boom\"}")); + server.enqueue(new TikaTestHttpServer.MockResponse(500, "{\"error\":\"boom\"}")); Metadata metadata = new Metadata(); metadata.set("tika:content", "Some text."); @@ -185,9 +177,8 @@ public class OpenAIEmbeddingFilterTest { @Test void testVectorSerialization() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildEmbeddingResponse(1, 3)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(1, 3))); Metadata metadata = new Metadata(); metadata.set("tika:content", "Single chunk of text."); @@ -202,16 +193,14 @@ public class OpenAIEmbeddingFilterTest { // Vector should be base64, not a JSON array String vectorField = array.get(0).get("vector").asText(); assertNotNull(vectorField); - // Should be decodable float[] decoded = VectorSerializer.decode(vectorField); assertEquals(3, decoded.length); } @Test void testMergeWithExistingChunks() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildEmbeddingResponse(1, 3)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(1, 3))); Metadata metadata = new Metadata(); metadata.set("tika:content", "Some text."); @@ -234,9 +223,7 @@ public class OpenAIEmbeddingFilterTest { metadata.get(ChunkSerializer.CHUNKS_FIELD)); // Should have the pre-existing image chunk + the new text chunk assertEquals(2, merged.size()); - // First is the image chunk (no text) assertNull(merged.get(0).getText()); - // Second is the text chunk assertNotNull(merged.get(1).getText()); assertNotNull(merged.get(1).getVector()); } @@ -257,7 +244,8 @@ public class OpenAIEmbeddingFilterTest { if (d > 0) { sb.append(","); } - sb.append(String.format(java.util.Locale.ROOT, "%.6f", (i + 1) * 0.1 + d * 0.01)); + sb.append(String.format(java.util.Locale.ROOT, + "%.6f", (i + 1) * 0.1 + d * 0.01)); } sb.append("]}"); } diff --git a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIImageEmbeddingParserTest.java b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIImageEmbeddingParserTest.java index bd6ca03131..7d48a7c437 100644 --- a/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIImageEmbeddingParserTest.java +++ b/tika-parsers/tika-parsers-ml/tika-inference/src/test/java/org/apache/tika/inference/OpenAIImageEmbeddingParserTest.java @@ -26,15 +26,13 @@ import java.util.List; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import okhttp3.mockwebserver.MockResponse; -import okhttp3.mockwebserver.MockWebServer; -import okhttp3.mockwebserver.RecordedRequest; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.TikaException; +import org.apache.tika.http.TikaTestHttpServer; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaPagedText; @@ -44,17 +42,16 @@ public class OpenAIImageEmbeddingParserTest { private static final ObjectMapper MAPPER = new ObjectMapper(); - private MockWebServer server; + private TikaTestHttpServer server; private OpenAIImageEmbeddingParser parser; private ImageEmbeddingConfig config; @BeforeEach void setUp() throws Exception { - server = new MockWebServer(); - server.start(); + server = new TikaTestHttpServer(); config = new ImageEmbeddingConfig(); - config.setBaseUrl(server.url("").toString().replaceAll("/+$", "")); + config.setBaseUrl(server.url()); config.setModel("jina-clip-v2"); config.setTimeoutSeconds(10); @@ -62,25 +59,22 @@ public class OpenAIImageEmbeddingParserTest { } @AfterEach - void tearDown() throws Exception { + void tearDown() { server.shutdown(); } @Test void testEndToEnd() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildEmbeddingResponse(3)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(3))); - // Fake 1x1 PNG bytes (just needs to be non-empty) byte[] fakeImage = new byte[]{(byte) 0x89, 'P', 'N', 'G'}; Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/ocr-png"); try (TikaInputStream tis = TikaInputStream.get(fakeImage)) { - parser.parse(tis, new DefaultHandler(), metadata, - new ParseContext()); + parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } String output = metadata.get(ChunkSerializer.CHUNKS_FIELD); @@ -92,10 +86,9 @@ public class OpenAIImageEmbeddingParserTest { assertNotNull(chunks.get(0).getVector()); assertEquals(3, chunks.get(0).getVector().length); - // Verify request format - RecordedRequest request = server.takeRequest(); - assertEquals("/v1/embeddings", request.getPath()); - JsonNode body = MAPPER.readTree(request.getBody().readUtf8()); + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + assertEquals("/v1/embeddings", request.path()); + JsonNode body = MAPPER.readTree(request.body()); assertEquals("jina-clip-v2", body.get("model").asText()); assertTrue(body.get("input").get(0).get("image").asText() .startsWith("data:image/png;base64,")); @@ -103,9 +96,8 @@ public class OpenAIImageEmbeddingParserTest { @Test void testPageNumberLocator() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildEmbeddingResponse(2)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(2))); byte[] fakeImage = new byte[]{1, 2, 3}; @@ -114,8 +106,7 @@ public class OpenAIImageEmbeddingParserTest { metadata.set(TikaPagedText.PAGE_NUMBER, 7); try (TikaInputStream tis = TikaInputStream.get(fakeImage)) { - parser.parse(tis, new DefaultHandler(), metadata, - new ParseContext()); + parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } List<Chunk> chunks = ChunkSerializer.fromJson( @@ -124,15 +115,13 @@ public class OpenAIImageEmbeddingParserTest { assertNotNull(chunks.get(0).getLocators().getPaginated()); assertEquals(1, chunks.get(0).getLocators().getPaginated().size()); - assertEquals(7, - chunks.get(0).getLocators().getPaginated().get(0).getPage()); + assertEquals(7, chunks.get(0).getLocators().getPaginated().get(0).getPage()); } @Test void testOcrPrefixStripped() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildEmbeddingResponse(2)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(2))); byte[] fakeImage = new byte[]{1, 2, 3}; @@ -140,12 +129,11 @@ public class OpenAIImageEmbeddingParserTest { metadata.set(Metadata.CONTENT_TYPE, "image/ocr-jpeg"); try (TikaInputStream tis = TikaInputStream.get(fakeImage)) { - parser.parse(tis, new DefaultHandler(), metadata, - new ParseContext()); + parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } - RecordedRequest request = server.takeRequest(); - JsonNode body = MAPPER.readTree(request.getBody().readUtf8()); + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + JsonNode body = MAPPER.readTree(request.body()); // Should strip "ocr-" prefix: image/ocr-jpeg -> image/jpeg assertTrue(body.get("input").get(0).get("image").asText() .startsWith("data:image/jpeg;base64,")); @@ -156,21 +144,19 @@ public class OpenAIImageEmbeddingParserTest { config.setApiKey("sk-test-clip-key"); parser = new OpenAIImageEmbeddingParser(config); - server.enqueue(new MockResponse() - .setBody(buildEmbeddingResponse(2)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(2))); byte[] fakeImage = new byte[]{1}; Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); try (TikaInputStream tis = TikaInputStream.get(fakeImage)) { - parser.parse(tis, new DefaultHandler(), metadata, - new ParseContext()); + parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } assertEquals("Bearer sk-test-clip-key", - server.takeRequest().getHeader("Authorization")); + server.takeRequest().header("authorization")); } @Test @@ -183,8 +169,7 @@ public class OpenAIImageEmbeddingParserTest { metadata.set(Metadata.CONTENT_TYPE, "image/png"); try (TikaInputStream tis = TikaInputStream.get(fakeImage)) { - parser.parse(tis, new DefaultHandler(), metadata, - new ParseContext()); + parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } assertNull(metadata.get(ChunkSerializer.CHUNKS_FIELD)); @@ -196,14 +181,12 @@ public class OpenAIImageEmbeddingParserTest { config.setMinFileSizeToEmbed(100); parser = new OpenAIImageEmbeddingParser(config); - // 4 bytes -- below minimum byte[] tinyImage = new byte[]{1, 2, 3, 4}; Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); try (TikaInputStream tis = TikaInputStream.get(tinyImage)) { - parser.parse(tis, new DefaultHandler(), metadata, - new ParseContext()); + parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } assertNull(metadata.get(ChunkSerializer.CHUNKS_FIELD)); @@ -212,9 +195,8 @@ public class OpenAIImageEmbeddingParserTest { @Test void testServerError() { - server.enqueue(new MockResponse() - .setResponseCode(500) - .setBody("{\"error\":\"internal error\"}")); + server.enqueue(new TikaTestHttpServer.MockResponse(500, + "{\"error\":\"internal error\"}")); byte[] fakeImage = new byte[]{1}; Metadata metadata = new Metadata(); @@ -222,17 +204,15 @@ public class OpenAIImageEmbeddingParserTest { assertThrows(TikaException.class, () -> { try (TikaInputStream tis = TikaInputStream.get(fakeImage)) { - parser.parse(tis, new DefaultHandler(), metadata, - new ParseContext()); + parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } }); } @Test void testMergeWithExistingChunks() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildEmbeddingResponse(4)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(4))); byte[] fakeImage = new byte[]{1}; Metadata metadata = new Metadata(); @@ -245,17 +225,14 @@ public class OpenAIImageEmbeddingParserTest { ChunkSerializer.toJson(List.of(textChunk))); try (TikaInputStream tis = TikaInputStream.get(fakeImage)) { - parser.parse(tis, new DefaultHandler(), metadata, - new ParseContext()); + parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } List<Chunk> merged = ChunkSerializer.fromJson( metadata.get(ChunkSerializer.CHUNKS_FIELD)); assertEquals(2, merged.size()); - // First chunk is the pre-existing text chunk assertEquals("existing text", merged.get(0).getText()); assertNotNull(merged.get(0).getVector()); - // Second chunk is the image embedding (no text) assertNull(merged.get(1).getText()); assertNotNull(merged.get(1).getVector()); assertEquals(4, merged.get(1).getVector().length); @@ -264,11 +241,11 @@ public class OpenAIImageEmbeddingParserTest { @Test void testSupportedTypes() { assertTrue(parser.getSupportedTypes(new ParseContext()) - .contains(MediaType.image("ocr-png"))); + .contains(org.apache.tika.mime.MediaType.image("ocr-png"))); assertTrue(parser.getSupportedTypes(new ParseContext()) - .contains(MediaType.image("ocr-jpeg"))); + .contains(org.apache.tika.mime.MediaType.image("ocr-jpeg"))); assertTrue(parser.getSupportedTypes(new ParseContext()) - .contains(MediaType.image("webp"))); + .contains(org.apache.tika.mime.MediaType.image("webp"))); } @Test @@ -293,22 +270,19 @@ public class OpenAIImageEmbeddingParserTest { @Test void testVectorSerializedAsBase64() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildEmbeddingResponse(3)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildEmbeddingResponse(3))); byte[] fakeImage = new byte[]{1}; Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); try (TikaInputStream tis = TikaInputStream.get(fakeImage)) { - parser.parse(tis, new DefaultHandler(), metadata, - new ParseContext()); + parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } String output = metadata.get(ChunkSerializer.CHUNKS_FIELD); JsonNode array = MAPPER.readTree(output); - // Vector should be base64-encoded string, not a JSON array String vectorField = array.get(0).get("vector").asText(); assertNotNull(vectorField); float[] decoded = VectorSerializer.decode(vectorField); @@ -329,12 +303,4 @@ public class OpenAIImageEmbeddingParserTest { sb.append("\"usage\":{\"prompt_tokens\":10,\"total_tokens\":10}}"); return sb.toString(); } - - // Local MediaType helper since we don't want to import tika-core's - // MediaType in the static context of these asserts - private static class MediaType { - static org.apache.tika.mime.MediaType image(String subtype) { - return org.apache.tika.mime.MediaType.image(subtype); - } - } } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml b/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml similarity index 68% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml rename to tika-parsers/tika-parsers-ml/tika-vlm/pom.xml index 6de2e96345..4aa0097050 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml +++ b/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml @@ -28,45 +28,18 @@ <version>${revision}</version> </parent> - <artifactId>tika-parser-vlm-ocr-module</artifactId> - <name>Apache Tika VLM OCR parser module</name> + <artifactId>tika-vlm</artifactId> + <name>Apache Tika VLM module</name> <properties> - <okhttp.version>5.3.2</okhttp.version> <commonmark.version>0.27.1</commonmark.version> </properties> - <dependencyManagement> - <dependencies> - <!-- align Kotlin stdlib versions pulled by OkHttp and Okio --> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-jdk8</artifactId> - <version>2.3.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib</artifactId> - <version>2.3.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-jdk7</artifactId> - <version>2.3.10</version> - </dependency> - <dependency> - <groupId>org.jetbrains.kotlin</groupId> - <artifactId>kotlin-stdlib-common</artifactId> - <version>2.3.10</version> - </dependency> - </dependencies> - </dependencyManagement> - <dependencies> <dependency> - <groupId>com.squareup.okhttp3</groupId> - <artifactId>okhttp-jvm</artifactId> - <version>${okhttp.version}</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-http-jdk</artifactId> + <version>${project.version}</version> </dependency> <dependency> <groupId>com.fasterxml.jackson.core</groupId> @@ -94,9 +67,15 @@ <!-- test --> <dependency> - <groupId>com.squareup.okhttp3</groupId> - <artifactId>mockwebserver</artifactId> - <version>${okhttp.version}</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-http-jdk</artifactId> + <version>${project.version}</version> + <classifier>tests</classifier> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter</artifactId> <scope>test</scope> </dependency> </dependencies> @@ -109,7 +88,7 @@ <configuration> <archive> <manifestEntries> - <Automatic-Module-Name>org.apache.tika.parser.vlm</Automatic-Module-Name> + <Automatic-Module-Name>org.apache.tika.vlm</Automatic-Module-Name> </manifestEntries> </archive> </configuration> diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java similarity index 80% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java rename to tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java index 5b16f2c90e..2ce0d598f4 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java @@ -22,12 +22,9 @@ import java.io.IOException; import java.io.InputStream; import java.util.Base64; import java.util.Collections; +import java.util.Map; import java.util.Set; -import java.util.concurrent.TimeUnit; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import okhttp3.Response; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; @@ -40,6 +37,7 @@ import org.apache.tika.config.TikaTaskTimeout; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.ParentContentHandler; +import org.apache.tika.http.TikaHttpClient; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; @@ -78,34 +76,36 @@ public abstract class AbstractVLMParser implements Parser, Initializable { public static final Property VLM_COMPLETION_TOKENS = Property.externalInteger(VLM_META + "completion_tokens"); - static final okhttp3.MediaType JSON_MEDIA_TYPE = - okhttp3.MediaType.parse("application/json; charset=utf-8"); + /** + * Encapsulates a fully built HTTP request for a VLM API call. + * + * @param url full request URL (base + path) + * @param json serialized JSON request body + * @param headers additional HTTP headers (e.g. Authorization) + */ + protected record HttpCall(String url, String json, Map<String, String> headers) {} private VLMOCRConfig defaultConfig; - private transient OkHttpClient httpClient; + private transient TikaHttpClient httpClient; private boolean serverAvailable = false; protected AbstractVLMParser(VLMOCRConfig config) { this.defaultConfig = config; - buildHttpClient(); + this.httpClient = buildHttpClient(); } // ---- abstract contract for subclasses --------------------------------- /** - * Build a fully formed {@link Request} for the target API. + * Build a fully formed {@link HttpCall} for the target API. * - * @param config resolved config for this parse - * @param fileBytes raw bytes of the input (image or document) - * @param mimeType the MIME type of the input (e.g. {@code image/png}, - * {@code application/pdf}) - * @param base64Data base64-encoded version of {@code fileBytes} - * @param client the OkHttp client (for timeout-aware request building) - * @return a ready-to-execute OkHttp {@link Request} + * @param config resolved config for this parse + * @param base64Data base64-encoded version of the file bytes + * @param mimeType the MIME type of the input (e.g. {@code image/png}) + * @return a ready-to-execute {@link HttpCall} */ - protected abstract Request buildHttpRequest(VLMOCRConfig config, byte[] fileBytes, - String mimeType, String base64Data, - OkHttpClient client); + protected abstract HttpCall buildHttpCall(VLMOCRConfig config, + String base64Data, String mimeType); /** * Parse the API response body and extract the model's text output. @@ -170,7 +170,6 @@ public abstract class AbstractVLMParser implements Parser, Initializable { String mimeType = detectMimeType(metadata); - // Check image pixel dimensions before reading fully (skip for PDFs) long maxPixels = config.getMaxImagePixels(); if (maxPixels > 0 && mimeType.startsWith("image/")) { tis.mark((int) Math.min(tis.getLength() + 1, 1024 * 1024)); @@ -194,19 +193,17 @@ public abstract class AbstractVLMParser implements Parser, Initializable { long timeoutMillis = TikaTaskTimeout.getTimeoutMillis( parseContext, config.getTimeoutSeconds() * 1000L); - OkHttpClient client = getClientWithTimeout(timeoutMillis); + int timeoutSeconds = (int) (timeoutMillis / 1000L); - Request httpRequest = buildHttpRequest(config, fileBytes, mimeType, base64Data, client); + HttpCall call = buildHttpCall(config, base64Data, mimeType); String responseText; - try (Response response = client.newCall(httpRequest).execute()) { - if (!response.isSuccessful()) { - String body = response.body() != null ? response.body().string() : ""; - throw new TikaException( - "VLM request failed with HTTP " + response.code() + ": " + body); - } - String responseBody = response.body() != null ? response.body().string() : ""; + try { + String responseBody = httpClient.postJson( + call.url(), call.json(), call.headers(), timeoutSeconds); responseText = extractResponseText(responseBody, metadata); + } catch (TikaException e) { + throw e; } catch (IOException e) { throw new TikaException("VLM request failed: " + e.getMessage(), e); } @@ -225,22 +222,19 @@ public abstract class AbstractVLMParser implements Parser, Initializable { @Override public void initialize() throws TikaConfigException { - buildHttpClient(); + this.httpClient = buildHttpClient(); String healthUrl = getHealthCheckUrl(defaultConfig); if (healthUrl == null) { return; } try { - Request request = new Request.Builder().url(healthUrl).get().build(); - try (Response response = httpClient.newCall(request).execute()) { - serverAvailable = response.isSuccessful(); - if (serverAvailable) { - LOG.info("VLM server is available at {}", defaultConfig.getBaseUrl()); - } else { - LOG.warn("VLM server returned HTTP {} at {}", - response.code(), defaultConfig.getBaseUrl()); - } - } + httpClient.get(healthUrl, Map.of(), defaultConfig.getTimeoutSeconds()); + serverAvailable = true; + LOG.info("VLM server is available at {}", defaultConfig.getBaseUrl()); + } catch (TikaException e) { + LOG.warn("VLM server returned error at {}: {}", + defaultConfig.getBaseUrl(), e.getMessage()); + serverAvailable = false; } catch (IOException e) { LOG.warn("VLM server is not available at {}: {}", defaultConfig.getBaseUrl(), e.getMessage()); @@ -254,9 +248,6 @@ public abstract class AbstractVLMParser implements Parser, Initializable { throws TikaConfigException, IOException { String key = configKey(); if (parseContext.hasJsonConfig(key)) { - // Deserialize into RuntimeConfig which prevents overriding - // security-sensitive fields (baseUrl, apiKey, prompt) at parse time. - // Pass the init-time config so that allowRuntimePrompt is inherited. VLMOCRConfig.RuntimeConfig runtimeConfig = ParseContextConfig.getConfig( parseContext, key, VLMOCRConfig.RuntimeConfig.class, new VLMOCRConfig.RuntimeConfig(defaultConfig)); @@ -265,7 +256,6 @@ public abstract class AbstractVLMParser implements Parser, Initializable { return runtimeConfig; } - // Merge runtime overrides with the init-time defaults return ParseContextConfig.getConfig( parseContext, key, VLMOCRConfig.class, defaultConfig); } @@ -354,22 +344,8 @@ public abstract class AbstractVLMParser implements Parser, Initializable { } } - private void buildHttpClient() { - httpClient = new OkHttpClient.Builder() - .connectTimeout(30, TimeUnit.SECONDS) - .readTimeout(defaultConfig.getTimeoutSeconds(), TimeUnit.SECONDS) - .writeTimeout(60, TimeUnit.SECONDS) - .build(); - } - - OkHttpClient getClientWithTimeout(long timeoutMillis) { - long defaultTimeoutMillis = defaultConfig.getTimeoutSeconds() * 1000L; - if (timeoutMillis == defaultTimeoutMillis) { - return httpClient; - } - return httpClient.newBuilder() - .readTimeout(timeoutMillis, TimeUnit.MILLISECONDS) - .build(); + private TikaHttpClient buildHttpClient() { + return TikaHttpClient.build(30); } // ---- delegating config getters/setters -------------------------------- diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java similarity index 92% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java rename to tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java index 9f20ccdd2a..051b6b6867 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java @@ -19,16 +19,15 @@ package org.apache.tika.parser.vlm; import java.io.IOException; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; +import java.util.Map; import java.util.Set; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import okhttp3.RequestBody; import org.apache.tika.config.ConfigDeserializer; import org.apache.tika.config.JsonConfig; @@ -105,22 +104,18 @@ public class ClaudeVLMParser extends AbstractVLMParser { } @Override - protected Request buildHttpRequest(VLMOCRConfig config, byte[] fileBytes, - String mimeType, String base64Data, - OkHttpClient client) { + protected HttpCall buildHttpCall(VLMOCRConfig config, + String base64Data, String mimeType) { String json = buildRequestJson(config, base64Data, mimeType); String url = stripTrailingSlash(config.getBaseUrl()) + "/v1/messages"; - Request.Builder builder = new Request.Builder() - .url(url) - .post(RequestBody.create(json, JSON_MEDIA_TYPE)) - .header("anthropic-version", ANTHROPIC_VERSION); - + Map<String, String> headers = new HashMap<>(); + headers.put("anthropic-version", ANTHROPIC_VERSION); if (!StringUtils.isBlank(config.getApiKey())) { - builder.header("x-api-key", config.getApiKey()); + headers.put("x-api-key", config.getApiKey()); } - return builder.build(); + return new HttpCall(url, json, headers); } @Override diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java similarity index 94% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java rename to tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java index 6997b30b1b..c6e8d4a9f2 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java @@ -20,15 +20,13 @@ import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; +import java.util.Map; import java.util.Set; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import okhttp3.RequestBody; import org.apache.tika.config.ConfigDeserializer; import org.apache.tika.config.JsonConfig; @@ -115,9 +113,8 @@ public class GeminiVLMParser extends AbstractVLMParser { } @Override - protected Request buildHttpRequest(VLMOCRConfig config, byte[] fileBytes, - String mimeType, String base64Data, - OkHttpClient client) { + protected HttpCall buildHttpCall(VLMOCRConfig config, + String base64Data, String mimeType) { String json = buildRequestJson(config, base64Data, mimeType); String baseUrl = stripTrailingSlash(config.getBaseUrl()); @@ -127,10 +124,7 @@ public class GeminiVLMParser extends AbstractVLMParser { url += "?key=" + config.getApiKey(); } - return new Request.Builder() - .url(url) - .post(RequestBody.create(json, JSON_MEDIA_TYPE)) - .build(); + return new HttpCall(url, json, Map.of()); } @Override diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java rename to tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java similarity index 94% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java rename to tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java index 3109651ba0..0ead21c492 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java @@ -19,16 +19,15 @@ package org.apache.tika.parser.vlm; import java.io.IOException; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; +import java.util.Map; import java.util.Set; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import okhttp3.RequestBody; import org.apache.tika.config.ConfigDeserializer; import org.apache.tika.config.JsonConfig; @@ -124,20 +123,16 @@ public class OpenAIVLMParser extends AbstractVLMParser { } @Override - protected Request buildHttpRequest(VLMOCRConfig config, byte[] fileBytes, - String mimeType, String base64Data, - OkHttpClient client) { + protected HttpCall buildHttpCall(VLMOCRConfig config, + String base64Data, String mimeType) { String json = buildRequestJson(config, base64Data, mimeType); String url = stripTrailingSlash(config.getBaseUrl()) + completionsPath; - Request.Builder builder = new Request.Builder() - .url(url) - .post(RequestBody.create(json, JSON_MEDIA_TYPE)); - + Map<String, String> headers = new HashMap<>(); if (!StringUtils.isBlank(config.getApiKey())) { - builder.header(apiKeyHeaderName, apiKeyPrefix + config.getApiKey()); + headers.put(apiKeyHeaderName, apiKeyPrefix + config.getApiKey()); } - return builder.build(); + return new HttpCall(url, json, headers); } @Override diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java rename to tika-parsers/tika-parsers-ml/tika-vlm/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java similarity index 82% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java rename to tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java index ba8241e7c0..f04a65c42f 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java @@ -27,15 +27,13 @@ import java.io.ByteArrayInputStream; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import okhttp3.mockwebserver.MockResponse; -import okhttp3.mockwebserver.MockWebServer; -import okhttp3.mockwebserver.RecordedRequest; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.TikaException; +import org.apache.tika.http.TikaTestHttpServer; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -46,17 +44,16 @@ public class ClaudeVLMParserTest { private static final ObjectMapper MAPPER = new ObjectMapper(); - private MockWebServer server; + private TikaTestHttpServer server; private ClaudeVLMParser parser; private VLMOCRConfig config; @BeforeEach void setUp() throws Exception { - server = new MockWebServer(); - server.start(); + server = new TikaTestHttpServer(); config = new VLMOCRConfig(); - config.setBaseUrl(server.url("").toString().replaceAll("/+$", "")); + config.setBaseUrl(server.url()); config.setModel("claude-sonnet-4-20250514"); config.setPrompt("Extract all text."); config.setMaxTokens(4096); @@ -67,15 +64,14 @@ public class ClaudeVLMParserTest { } @AfterEach - void tearDown() throws Exception { + void tearDown() { server.shutdown(); } @Test void testSuccessfulImageOcr() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildClaudeResponse("Hello from Claude!", 200, 30)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildClaudeResponse("Hello from Claude!", 200, 30))); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); @@ -92,15 +88,15 @@ public class ClaudeVLMParserTest { assertEquals("200", metadata.get(AbstractVLMParser.VLM_PROMPT_TOKENS)); assertEquals("30", metadata.get(AbstractVLMParser.VLM_COMPLETION_TOKENS)); - RecordedRequest request = server.takeRequest(); - assertEquals("/v1/messages", request.getPath()); - assertEquals("POST", request.getMethod()); - assertEquals("2023-06-01", request.getHeader("anthropic-version")); - assertEquals("sk-ant-test-key", request.getHeader("x-api-key")); + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + assertEquals("/v1/messages", request.path()); + assertEquals("POST", request.method()); + assertEquals("2023-06-01", request.header("anthropic-version")); + assertEquals("sk-ant-test-key", request.header("x-api-key")); // Claude does NOT use Bearer auth - assertNull(request.getHeader("Authorization")); + assertNull(request.header("authorization")); - JsonNode body = MAPPER.readTree(request.getBody().readUtf8()); + JsonNode body = MAPPER.readTree(request.body()); assertEquals("claude-sonnet-4-20250514", body.get("model").asText()); assertEquals(4096, body.get("max_tokens").asInt()); @@ -111,7 +107,6 @@ public class ClaudeVLMParserTest { JsonNode parts = messages.get(0).get("content"); assertEquals(2, parts.size()); - // First part: image JsonNode imagePart = parts.get(0); assertEquals("image", imagePart.get("type").asText()); JsonNode source = imagePart.get("source"); @@ -119,16 +114,14 @@ public class ClaudeVLMParserTest { assertEquals("image/png", source.get("media_type").asText()); assertNotNull(source.get("data").asText()); - // Second part: text prompt assertEquals("text", parts.get(1).get("type").asText()); assertEquals("Extract all text.", parts.get(1).get("text").asText()); } @Test void testPdfSupport() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildClaudeResponse("PDF text extracted by Claude", 500, 60)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildClaudeResponse("PDF text extracted by Claude", 500, 60))); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); @@ -141,11 +134,10 @@ public class ClaudeVLMParserTest { assertTrue(handler.toString().contains("PDF text extracted by Claude")); - RecordedRequest request = server.takeRequest(); - JsonNode body = MAPPER.readTree(request.getBody().readUtf8()); + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + JsonNode body = MAPPER.readTree(request.body()); JsonNode parts = body.get("messages").get(0).get("content"); - // For PDFs, the content type should be "document" not "image" assertEquals("document", parts.get(0).get("type").asText()); assertEquals("application/pdf", parts.get(0).get("source").get("media_type").asText()); @@ -172,9 +164,8 @@ public class ClaudeVLMParserTest { @Test void testApiKeyAsXApiKeyHeader() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildClaudeResponse("ok", 10, 5)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildClaudeResponse("ok", 10, 5))); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); @@ -184,16 +175,16 @@ public class ClaudeVLMParserTest { parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext()); } - RecordedRequest request = server.takeRequest(); - assertEquals("sk-ant-test-key", request.getHeader("x-api-key")); - assertNull(request.getHeader("Authorization")); - assertEquals("2023-06-01", request.getHeader("anthropic-version")); + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + assertEquals("sk-ant-test-key", request.header("x-api-key")); + assertNull(request.header("authorization")); + assertEquals("2023-06-01", request.header("anthropic-version")); } @Test void testServerError() throws Exception { - server.enqueue(new MockResponse().setResponseCode(500) - .setBody("{\"error\":{\"type\":\"server_error\",\"message\":\"boom\"}}")); + server.enqueue(new TikaTestHttpServer.MockResponse(500, + "{\"error\":{\"type\":\"server_error\",\"message\":\"boom\"}}")); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java similarity index 81% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java rename to tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java index 05d19b747a..ac87dfcd4f 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java @@ -25,15 +25,13 @@ import java.io.ByteArrayInputStream; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import okhttp3.mockwebserver.MockResponse; -import okhttp3.mockwebserver.MockWebServer; -import okhttp3.mockwebserver.RecordedRequest; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.TikaException; +import org.apache.tika.http.TikaTestHttpServer; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -44,17 +42,16 @@ public class GeminiVLMParserTest { private static final ObjectMapper MAPPER = new ObjectMapper(); - private MockWebServer server; + private TikaTestHttpServer server; private GeminiVLMParser parser; private VLMOCRConfig config; @BeforeEach void setUp() throws Exception { - server = new MockWebServer(); - server.start(); + server = new TikaTestHttpServer(); config = new VLMOCRConfig(); - config.setBaseUrl(server.url("").toString().replaceAll("/+$", "")); + config.setBaseUrl(server.url()); config.setModel("gemini-2.5-flash"); config.setPrompt("Extract all text from this document."); config.setMaxTokens(4096); @@ -65,15 +62,14 @@ public class GeminiVLMParserTest { } @AfterEach - void tearDown() throws Exception { + void tearDown() { server.shutdown(); } @Test void testSuccessfulImageOcr() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildGeminiResponse("Hello from Gemini!", 80, 15)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildGeminiResponse("Hello from Gemini!", 80, 15))); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); @@ -89,13 +85,12 @@ public class GeminiVLMParserTest { assertEquals("80", metadata.get(AbstractVLMParser.VLM_PROMPT_TOKENS)); assertEquals("15", metadata.get(AbstractVLMParser.VLM_COMPLETION_TOKENS)); - RecordedRequest request = server.takeRequest(); - assertTrue(request.getPath().contains("/v1beta/models/gemini-2.5-flash:generateContent")); - assertTrue(request.getPath().contains("key=test-gemini-key")); - assertEquals("POST", request.getMethod()); + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + assertTrue(request.path().contains("/v1beta/models/gemini-2.5-flash:generateContent")); + assertTrue(request.path().contains("key=test-gemini-key")); + assertEquals("POST", request.method()); - // Verify Gemini request format - JsonNode body = MAPPER.readTree(request.getBody().readUtf8()); + JsonNode body = MAPPER.readTree(request.body()); JsonNode contents = body.get("contents"); assertNotNull(contents); assertEquals(1, contents.size()); @@ -109,21 +104,18 @@ public class GeminiVLMParserTest { assertEquals("image/png", inlineData.get("mime_type").asText()); assertNotNull(inlineData.get("data").asText()); - // Verify generation config assertEquals(4096, body.get("generationConfig").get("maxOutputTokens").asInt()); } @Test void testPdfSupport() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildGeminiResponse("PDF content extracted", 200, 50)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildGeminiResponse("PDF content extracted", 200, 50))); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); BodyContentHandler handler = new BodyContentHandler(); - // Fake PDF bytes (starts with %PDF) byte[] fakePdf = "%PDF-1.4 fake content".getBytes(java.nio.charset.StandardCharsets.UTF_8); try (TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(fakePdf))) { @@ -132,9 +124,10 @@ public class GeminiVLMParserTest { assertTrue(handler.toString().contains("PDF content extracted")); - RecordedRequest request = server.takeRequest(); - JsonNode body = MAPPER.readTree(request.getBody().readUtf8()); - JsonNode inlineData = body.get("contents").get(0).get("parts").get(1).get("inline_data"); + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + JsonNode body = MAPPER.readTree(request.body()); + JsonNode inlineData = + body.get("contents").get(0).get("parts").get(1).get("inline_data"); assertEquals("application/pdf", inlineData.get("mime_type").asText()); } @@ -155,9 +148,8 @@ public class GeminiVLMParserTest { @Test void testApiKeyAsQueryParam() throws Exception { - server.enqueue(new MockResponse() - .setBody(buildGeminiResponse("ok", 10, 5)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildGeminiResponse("ok", 10, 5))); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); @@ -167,17 +159,17 @@ public class GeminiVLMParserTest { parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext()); } - RecordedRequest request = server.takeRequest(); - assertTrue(request.getPath().contains("key=test-gemini-key"), + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + assertTrue(request.path().contains("key=test-gemini-key"), "API key should be in query params, not header"); // Gemini does NOT use Bearer auth - assertEquals(null, request.getHeader("Authorization")); + assertEquals(null, request.header("authorization")); } @Test void testServerError() throws Exception { - server.enqueue(new MockResponse().setResponseCode(500) - .setBody("{\"error\":{\"message\":\"internal\"}}")); + server.enqueue(new TikaTestHttpServer.MockResponse(500, + "{\"error\":{\"message\":\"internal\"}}")); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); @@ -199,7 +191,6 @@ public class GeminiVLMParserTest { @Test void testExtractResponseTextMultipleParts() throws Exception { - // Gemini can return multiple text parts String json = "{\"candidates\":[{\"content\":{\"parts\":[" + "{\"text\":\"Part one\"}," + "{\"text\":\"Part two\"}" @@ -220,7 +211,6 @@ public class GeminiVLMParserTest { assertTrue(json.contains("\"data\":\"AAAA\"")); assertTrue(json.contains("\"maxOutputTokens\":4096")); assertTrue(json.contains("Extract all text from this document.")); - // Should NOT contain OpenAI-style fields assertTrue(!json.contains("\"messages\"")); assertTrue(!json.contains("\"max_tokens\"")); } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java similarity index 100% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java rename to tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java diff --git a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java similarity index 82% rename from tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java rename to tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java index ed573c8bab..68f040f604 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java @@ -26,15 +26,13 @@ import java.io.ByteArrayInputStream; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import okhttp3.mockwebserver.MockResponse; -import okhttp3.mockwebserver.MockWebServer; -import okhttp3.mockwebserver.RecordedRequest; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.TikaException; +import org.apache.tika.http.TikaTestHttpServer; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -44,17 +42,16 @@ public class OpenAIVLMParserTest { private static final ObjectMapper MAPPER = new ObjectMapper(); - private MockWebServer server; + private TikaTestHttpServer server; private OpenAIVLMParser parser; private VLMOCRConfig config; @BeforeEach void setUp() throws Exception { - server = new MockWebServer(); - server.start(); + server = new TikaTestHttpServer(); config = new VLMOCRConfig(); - config.setBaseUrl(server.url("").toString().replaceAll("/+$", "")); + config.setBaseUrl(server.url()); config.setModel("test-model"); config.setPrompt("Extract text from this image."); config.setMaxTokens(1024); @@ -64,7 +61,7 @@ public class OpenAIVLMParserTest { } @AfterEach - void tearDown() throws Exception { + void tearDown() { server.shutdown(); } @@ -72,9 +69,8 @@ public class OpenAIVLMParserTest { void testSuccessfulOcr() throws Exception { String ocrText = "Hello, World!\nThis is extracted text."; - server.enqueue(new MockResponse() - .setBody(buildChatResponse(ocrText, 100, 20)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildChatResponse(ocrText, 100, 20))); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); @@ -91,11 +87,11 @@ public class OpenAIVLMParserTest { assertEquals("100", metadata.get(AbstractVLMParser.VLM_PROMPT_TOKENS)); assertEquals("20", metadata.get(AbstractVLMParser.VLM_COMPLETION_TOKENS)); - RecordedRequest request = server.takeRequest(); - assertEquals("/v1/chat/completions", request.getPath()); - assertEquals("POST", request.getMethod()); + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + assertEquals("/v1/chat/completions", request.path()); + assertEquals("POST", request.method()); - JsonNode body = MAPPER.readTree(request.getBody().readUtf8()); + JsonNode body = MAPPER.readTree(request.body()); assertEquals("test-model", body.get("model").asText()); assertEquals(1024, body.get("max_tokens").asInt()); @@ -112,8 +108,7 @@ public class OpenAIVLMParserTest { @Test void testServerError() throws Exception { - server.enqueue(new MockResponse().setResponseCode(500) - .setBody("{\"error\":\"boom\"}")); + server.enqueue(new TikaTestHttpServer.MockResponse(500, "{\"error\":\"boom\"}")); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); @@ -164,9 +159,8 @@ public class OpenAIVLMParserTest { config.setApiKey("sk-test-key"); parser = new OpenAIVLMParser(config); - server.enqueue(new MockResponse() - .setBody(buildChatResponse("text", 10, 5)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildChatResponse("text", 10, 5))); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); @@ -176,7 +170,7 @@ public class OpenAIVLMParserTest { parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext()); } - assertEquals("Bearer sk-test-key", server.takeRequest().getHeader("Authorization")); + assertEquals("Bearer sk-test-key", server.takeRequest().header("authorization")); } @Test @@ -185,11 +179,11 @@ public class OpenAIVLMParserTest { parser = new OpenAIVLMParser(config); parser.setApiKeyHeaderName("api-key"); parser.setApiKeyPrefix(""); - parser.setCompletionsPath("/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01"); + parser.setCompletionsPath( + "/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01"); - server.enqueue(new MockResponse() - .setBody(buildChatResponse("text", 10, 5)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildChatResponse("text", 10, 5))); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); @@ -199,10 +193,10 @@ public class OpenAIVLMParserTest { parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext()); } - var request = server.takeRequest(); - assertEquals("azure-key-123", request.getHeader("api-key")); - assertNull(request.getHeader("Authorization")); - assertTrue(request.getPath().startsWith( + TikaTestHttpServer.RecordedRequest request = server.takeRequest(); + assertEquals("azure-key-123", request.header("api-key")); + assertNull(request.header("authorization")); + assertTrue(request.path().startsWith( "/openai/deployments/gpt-4o/chat/completions")); } @@ -220,15 +214,14 @@ public class OpenAIVLMParserTest { @Test void testPerRequestConfigOverride() throws Exception { VLMOCRConfig override = new VLMOCRConfig(); - override.setBaseUrl(server.url("").toString().replaceAll("/+$", "")); + override.setBaseUrl(server.url()); override.setModel("override-model"); override.setPrompt("Custom."); override.setMaxTokens(2048); override.setTimeoutSeconds(10); - server.enqueue(new MockResponse() - .setBody(buildChatResponse("ok", 10, 5)) - .setHeader("Content-Type", "application/json")); + server.enqueue(new TikaTestHttpServer.MockResponse(200, + buildChatResponse("ok", 10, 5))); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); @@ -240,7 +233,7 @@ public class OpenAIVLMParserTest { parser.parse(tis, new BodyContentHandler(), metadata, ctx); } - JsonNode body = MAPPER.readTree(server.takeRequest().getBody().readUtf8()); + JsonNode body = MAPPER.readTree(server.takeRequest().body()); assertEquals("override-model", body.get("model").asText()); assertEquals(2048, body.get("max_tokens").asInt()); } diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml index 143f16d251..4c6a92227b 100644 --- a/tika-server/tika-server-standard/pom.xml +++ b/tika-server/tika-server-standard/pom.xml @@ -49,6 +49,18 @@ </exclusion> </exclusions> </dependency> + <!-- inference: text embeddings and CLIP image embeddings --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-inference</artifactId> + <version>${project.version}</version> + </dependency> + <!-- VLM: OpenAI-compatible, Gemini, and Claude vision parsers --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-vlm</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-handler-boilerpipe</artifactId>
