This is an automated email from the ASF dual-hosted git repository. ndipiazza pushed a commit to branch TIKA-4703-docker-ci in repository https://gitbox.apache.org/repos/asf/tika.git
commit 02d061bf89da22cfc7cc9695c5527d099efe34d9 Author: Nicholas DiPiazza <[email protected]> AuthorDate: Fri Mar 27 12:32:33 2026 -0500 TIKA-4703: Fix Docker builds found during local testing - tika-server snapshot Dockerfiles: use assembly tgz (thin JAR + lib/) instead of the thin JAR alone, matching the 4.x packaging model - tika-grpc: bundle default-tika-config.json so the server starts without requiring a config volume mount - tika-grpc: pass -c, -p, and --plugin-roots as CLI args instead of system properties so TikaGrpcServer actually picks them up - tika-grpc: default port is now 9090 (configurable via TIKA_GRPC_PORT) Tested locally: all three images (minimal, full, grpc) build and start successfully. Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]> --- .github/workflows/docker-release.yml | 1 + .github/workflows/docker-snapshot.yml | 9 ++++--- tika-grpc/docker-build/default-tika-config.json | 20 +++++++++++++++ tika-grpc/docker-build/docker-build.sh | 1 + tika-grpc/docker-build/start-tika-grpc.sh | 29 +++++++++++----------- tika-server/docker-build/full/Dockerfile.snapshot | 6 ++--- .../docker-build/minimal/Dockerfile.snapshot | 6 ++--- 7 files changed, 48 insertions(+), 24 deletions(-) diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index a412c2a061..d2a3403e47 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -139,6 +139,7 @@ jobs: done cp "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin/" + cp "tika-grpc/docker-build/default-tika-config.json" "${OUT_DIR}/config/" cp "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile" - name: Build and push tika-grpc diff --git a/.github/workflows/docker-snapshot.yml b/.github/workflows/docker-snapshot.yml index 315cba2da4..28998e8f5f 100644 --- a/.github/workflows/docker-snapshot.yml +++ b/.github/workflows/docker-snapshot.yml @@ -65,8 +65,8 @@ jobs: run: | TIKA_VERSION="${{ steps.version.outputs.tika_version }}" OUT_DIR=target/tika-server-minimal-docker - mkdir -p "${OUT_DIR}" - cp "tika-server/tika-server-standard/target/tika-server-standard-${TIKA_VERSION}.jar" "${OUT_DIR}/" + mkdir -p "${OUT_DIR}/tika-server" + tar xzf "tika-server/tika-server-standard/target/tika-server-standard-${TIKA_VERSION}-bin.tgz" -C "${OUT_DIR}/tika-server" cp "tika-server/docker-build/minimal/Dockerfile.snapshot" "${OUT_DIR}/Dockerfile" - name: Build and push tika-server minimal snapshot @@ -85,8 +85,8 @@ jobs: run: | TIKA_VERSION="${{ steps.version.outputs.tika_version }}" OUT_DIR=target/tika-server-full-docker - mkdir -p "${OUT_DIR}" - cp "tika-server/tika-server-standard/target/tika-server-standard-${TIKA_VERSION}.jar" "${OUT_DIR}/" + mkdir -p "${OUT_DIR}/tika-server" + tar xzf "tika-server/tika-server-standard/target/tika-server-standard-${TIKA_VERSION}-bin.tgz" -C "${OUT_DIR}/tika-server" cp "tika-server/docker-build/full/Dockerfile.snapshot" "${OUT_DIR}/Dockerfile" - name: Build and push tika-server full snapshot @@ -133,6 +133,7 @@ jobs: done cp "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin/" + cp "tika-grpc/docker-build/default-tika-config.json" "${OUT_DIR}/config/" cp "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile" - name: Build and push tika-grpc snapshot diff --git a/tika-grpc/docker-build/default-tika-config.json b/tika-grpc/docker-build/default-tika-config.json new file mode 100644 index 0000000000..000bb01812 --- /dev/null +++ b/tika-grpc/docker-build/default-tika-config.json @@ -0,0 +1,20 @@ +{ + "fetchers": [ + { + "fs": { + "defaultFetcher": { + "basePath": "/data/input" + } + } + } + ], + "emitters": [ + { + "fs": { + "defaultEmitter": { + "basePath": "/data/output" + } + } + } + ] +} diff --git a/tika-grpc/docker-build/docker-build.sh b/tika-grpc/docker-build/docker-build.sh index c522ec04fa..9ce5daa928 100755 --- a/tika-grpc/docker-build/docker-build.sh +++ b/tika-grpc/docker-build/docker-build.sh @@ -81,6 +81,7 @@ for parser_package in "${parser_packages[@]}"; do done cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin" +cp -v "tika-grpc/docker-build/default-tika-config.json" "${OUT_DIR}/config" cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile" cd "${OUT_DIR}" || exit diff --git a/tika-grpc/docker-build/start-tika-grpc.sh b/tika-grpc/docker-build/start-tika-grpc.sh index c42c953d7b..919a51afcc 100755 --- a/tika-grpc/docker-build/start-tika-grpc.sh +++ b/tika-grpc/docker-build/start-tika-grpc.sh @@ -12,21 +12,19 @@ # License for the specific language governing permissions and limitations under # the License. -echo "Tika Version:" -echo "${TIKA_VERSION}" +# Use user-provided config or fall back to the bundled default +TIKA_CONFIG="${TIKA_CONFIG:-/tika/config/default-tika-config.json}" + +echo "Tika Version: ${TIKA_VERSION}" +echo "Tika Config: ${TIKA_CONFIG}" echo "Tika Plugins:" ls "/tika/plugins" -echo "Tika gRPC Max Inbound Message Size:" -echo "${TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE}" -echo "Tika gRPC Max Outbound Message Size:" -echo "${TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE}" -echo "Tika gRPC Num Threads:" -echo "${TIKA_GRPC_NUM_THREADS}" +echo "Tika gRPC Max Inbound Message Size: ${TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE}" +echo "Tika gRPC Max Outbound Message Size: ${TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE}" +echo "Tika gRPC Num Threads: ${TIKA_GRPC_NUM_THREADS}" +TIKA_GRPC_PORT="${TIKA_GRPC_PORT:-9090}" + exec java \ - -Dgrpc.server.port=9090 \ - "-Dgrpc.server.max-inbound-message-size=${TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE}" \ - "-Dgrpc.server.max-outbound-message-size=${TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE}" \ - "-Dgrpc.server.numThreads=${TIKA_GRPC_NUM_THREADS}" \ --add-opens=jdk.management/com.sun.management.internal=ALL-UNNAMED \ --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED \ --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \ @@ -38,5 +36,8 @@ exec java \ --add-opens=java.base/java.util=ALL-UNNAMED \ --add-opens=java.base/java.lang=ALL-UNNAMED \ -Djava.net.preferIPv4Stack=true \ - "-Dplugins.pluginDirs=/tika/plugins" \ - -jar "/tika/libs/tika-grpc-${TIKA_VERSION}.jar" + -jar "/tika/libs/tika-grpc-${TIKA_VERSION}.jar" \ + -c "${TIKA_CONFIG}" \ + -p "${TIKA_GRPC_PORT}" \ + --plugin-roots "/tika/plugins" \ + "$@" diff --git a/tika-server/docker-build/full/Dockerfile.snapshot b/tika-server/docker-build/full/Dockerfile.snapshot index 8882dc5b90..4f655005e6 100644 --- a/tika-server/docker-build/full/Dockerfile.snapshot +++ b/tika-server/docker-build/full/Dockerfile.snapshot @@ -10,7 +10,7 @@ # License for the specific language governing permissions and limitations under # the License. -# Snapshot variant: copies the JAR from the Maven build output rather than +# Snapshot variant: copies the assembly from the Maven build output rather than # downloading from Apache mirrors. Used for nightly/snapshot Docker builds. ARG UID_GID="35002:35002" @@ -44,9 +44,9 @@ RUN set -eux \ && apt-get clean -y \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ENV TIKA_VERSION=$TIKA_VERSION -COPY tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +COPY tika-server/ /tika-server/ USER $UID_GID EXPOSE 9998 -ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server/tika-server.jar:/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] LABEL maintainer="Apache Tika Developers [email protected]" diff --git a/tika-server/docker-build/minimal/Dockerfile.snapshot b/tika-server/docker-build/minimal/Dockerfile.snapshot index ac6644f345..d701dfee68 100644 --- a/tika-server/docker-build/minimal/Dockerfile.snapshot +++ b/tika-server/docker-build/minimal/Dockerfile.snapshot @@ -10,7 +10,7 @@ # License for the specific language governing permissions and limitations under # the License. -# Snapshot variant: copies the JAR from the Maven build output rather than +# Snapshot variant: copies the assembly from the Maven build output rather than # downloading from Apache mirrors. Used for nightly/snapshot Docker builds. ARG UID_GID="35002:35002" @@ -26,9 +26,9 @@ RUN set -eux \ ca-certificates \ && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ENV TIKA_VERSION=$TIKA_VERSION -COPY tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +COPY tika-server/ /tika-server/ USER $UID_GID EXPOSE 9998 -ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server/tika-server.jar:/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] LABEL maintainer="Apache Tika Developers [email protected]"
