This is an automated email from the ASF dual-hosted git repository. ndipiazza pushed a commit to branch TIKA-4703-docker-ci in repository https://gitbox.apache.org/repos/asf/tika.git
commit 041ea74b3d6c6a2975aa22e2b8915ba46af8497b Author: Nicholas DiPiazza <[email protected]> AuthorDate: Fri Mar 27 09:02:45 2026 -0500 TIKA-4703: Add Docker CI pipelines for tika-server and tika-grpc Move Docker build infrastructure into the main tika repo so that Docker image releases are tied directly to Tika releases rather than requiring cross-repo coordination with tika-docker/tika-grpc-docker. Snapshot workflow (main branch push): - Builds tika-server minimal and full images from Maven output - Builds tika-grpc image from Maven output - Pushes snapshot tags to Docker Hub (e.g. 4.0.0-SNAPSHOT) Release workflow (version tag push): - Builds tika-server minimal/full from Apache mirror JARs with GPG verification (multi-arch: amd64, arm64, arm/v7, s390x) - Builds tika-grpc from Maven output (multi-arch: amd64, arm64) - Pushes versioned + latest tags to Docker Hub Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]> --- .github/workflows/docker-release.yml | 154 +++++++++++++++++++++ .github/workflows/docker-snapshot.yml | 147 ++++++++++++++++++++ tika-grpc/docker-build/Dockerfile | 53 +++++++ tika-grpc/docker-build/docker-build.sh | 118 ++++++++++++++++ tika-grpc/docker-build/start-tika-grpc.sh | 42 ++++++ tika-server/docker-build/docker-tool.sh | 131 ++++++++++++++++++ tika-server/docker-build/full/Dockerfile | 82 +++++++++++ tika-server/docker-build/full/Dockerfile.snapshot | 52 +++++++ tika-server/docker-build/minimal/Dockerfile | 70 ++++++++++ .../docker-build/minimal/Dockerfile.snapshot | 34 +++++ .../tika/parser/ocr/TesseractOCRConfig.properties | 25 ++++ .../customocr/tika-config-inline.xml | 31 +++++ .../customocr/tika-config-rendered.xml | 38 +++++ .../tika/parser/journal/GrobidExtractor.properties | 16 +++ .../sample-configs/grobid/tika-config.xml | 24 ++++ .../sample-configs/ner/run_tika_server.sh | 62 +++++++++ .../sample-configs/ner/tika-config.xml | 28 ++++ .../vision/inception-rest-caption.xml | 32 +++++ 
.../sample-configs/vision/inception-rest-video.xml | 32 +++++ .../sample-configs/vision/inception-rest.xml | 32 +++++ 20 files changed, 1203 insertions(+) diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml new file mode 100644 index 0000000000..a412c2a061 --- /dev/null +++ b/.github/workflows/docker-release.yml @@ -0,0 +1,154 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: Docker release - tika-server and tika-grpc + +on: + push: + tags: + - '[0-9]+.[0-9]+.[0-9]+*' + +jobs: + release-tika-server: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - uses: actions/checkout@v4 + + - name: Extract version from tag + id: version + run: | + TAG_NAME="${GITHUB_REF#refs/tags/}" + echo "tag=${TAG_NAME}" >> "$GITHUB_OUTPUT" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Set up QEMU for multi-arch + uses: docker/setup-qemu-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push tika-server minimal + uses: docker/build-push-action@v6 + with: + file: tika-server/docker-build/minimal/Dockerfile + platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + push: true + build-args: | + TIKA_VERSION=${{ steps.version.outputs.tag }} + tags: | + apache/tika:${{ steps.version.outputs.tag }} + apache/tika:latest + + - name: Build and push tika-server full + uses: docker/build-push-action@v6 + with: + file: tika-server/docker-build/full/Dockerfile + platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + push: true + build-args: | + TIKA_VERSION=${{ steps.version.outputs.tag }} + tags: | + apache/tika:${{ steps.version.outputs.tag }}-full + apache/tika:latest-full + + release-tika-grpc: + runs-on: ubuntu-latest + timeout-minutes: 120 + + steps: + - uses: actions/checkout@v4 + + - name: Extract version from tag + id: version + run: | + TAG_NAME="${GITHUB_REF#refs/tags/}" + echo "tag=${TAG_NAME}" >> "$GITHUB_OUTPUT" + + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '17' + cache: 'maven' + + - name: Build with Maven (skip tests) + run: mvn clean install -DskipTests -B "-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn" + + - name: Set up Docker Buildx + uses: 
docker/setup-buildx-action@v3 + + - name: Set up QEMU for multi-arch + uses: docker/setup-qemu-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Prepare tika-grpc Docker build context + run: | + TIKA_VERSION="${{ steps.version.outputs.tag }}" + OUT_DIR=target/tika-grpc-docker + + mkdir -p "${OUT_DIR}/libs" "${OUT_DIR}/plugins" "${OUT_DIR}/config" "${OUT_DIR}/bin" + + cp "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs/" + + # Copy tika-pipes plugin zip files + for dir in tika-pipes/tika-pipes-plugins/*/; do + plugin_name=$(basename "$dir") + zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip" + if [ -f "$zip_file" ]; then + cp "$zip_file" "${OUT_DIR}/plugins/" + fi + done + + # Copy parser packages + for parser_package in \ + "tika-parsers/tika-parsers-standard/tika-parsers-standard-package" \ + "tika-parsers/tika-parsers-extended/tika-parser-scientific-package" \ + "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package" \ + "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"; do + package_name=$(basename "$parser_package") + jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar" + if [ -f "$jar_file" ]; then + cp "$jar_file" "${OUT_DIR}/plugins/" + fi + done + + cp "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin/" + cp "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile" + + - name: Build and push tika-grpc + uses: docker/build-push-action@v6 + with: + context: target/tika-grpc-docker + platforms: linux/amd64,linux/arm64 + push: true + build-args: | + VERSION=${{ steps.version.outputs.tag }} + tags: | + apache/tika-grpc:${{ steps.version.outputs.tag }} + apache/tika-grpc:latest diff --git a/.github/workflows/docker-snapshot.yml b/.github/workflows/docker-snapshot.yml new file mode 100644 index 0000000000..315cba2da4 --- /dev/null +++ 
b/.github/workflows/docker-snapshot.yml @@ -0,0 +1,147 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Docker snapshot - tika-server and tika-grpc + +on: + push: + branches: [ main ] + paths-ignore: + - 'docs/**' + - '*.md' + +jobs: + build: + runs-on: ubuntu-latest + timeout-minutes: 120 + + steps: + - uses: actions/checkout@v4 + + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '17' + cache: 'maven' + + - name: Extract version from pom + id: version + run: | + TIKA_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout) + echo "tika_version=${TIKA_VERSION}" >> "$GITHUB_OUTPUT" + + - name: Build with Maven (skip tests) + run: mvn clean install -DskipTests -B "-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Set up QEMU for multi-arch + uses: docker/setup-qemu-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + # --- tika-server (minimal) --- + - name: Prepare tika-server minimal build context + run: | + 
TIKA_VERSION="${{ steps.version.outputs.tika_version }}" + OUT_DIR=target/tika-server-minimal-docker + mkdir -p "${OUT_DIR}" + cp "tika-server/tika-server-standard/target/tika-server-standard-${TIKA_VERSION}.jar" "${OUT_DIR}/" + cp "tika-server/docker-build/minimal/Dockerfile.snapshot" "${OUT_DIR}/Dockerfile" + + - name: Build and push tika-server minimal snapshot + uses: docker/build-push-action@v6 + with: + context: target/tika-server-minimal-docker + platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + push: true + build-args: | + TIKA_VERSION=${{ steps.version.outputs.tika_version }} + tags: | + apache/tika:${{ steps.version.outputs.tika_version }} + + # --- tika-server (full) --- + - name: Prepare tika-server full build context + run: | + TIKA_VERSION="${{ steps.version.outputs.tika_version }}" + OUT_DIR=target/tika-server-full-docker + mkdir -p "${OUT_DIR}" + cp "tika-server/tika-server-standard/target/tika-server-standard-${TIKA_VERSION}.jar" "${OUT_DIR}/" + cp "tika-server/docker-build/full/Dockerfile.snapshot" "${OUT_DIR}/Dockerfile" + + - name: Build and push tika-server full snapshot + uses: docker/build-push-action@v6 + with: + context: target/tika-server-full-docker + platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + push: true + build-args: | + TIKA_VERSION=${{ steps.version.outputs.tika_version }} + tags: | + apache/tika:${{ steps.version.outputs.tika_version }}-full + + # --- tika-grpc --- + - name: Prepare tika-grpc Docker build context + run: | + TIKA_VERSION="${{ steps.version.outputs.tika_version }}" + OUT_DIR=target/tika-grpc-docker + + mkdir -p "${OUT_DIR}/libs" "${OUT_DIR}/plugins" "${OUT_DIR}/config" "${OUT_DIR}/bin" + + cp "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs/" + + # Copy tika-pipes plugin zip files + for dir in tika-pipes/tika-pipes-plugins/*/; do + plugin_name=$(basename "$dir") + zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip" + if [ -f "$zip_file" ]; then + cp "$zip_file" 
"${OUT_DIR}/plugins/" + fi + done + + # Copy parser packages + for parser_package in \ + "tika-parsers/tika-parsers-standard/tika-parsers-standard-package" \ + "tika-parsers/tika-parsers-extended/tika-parser-scientific-package" \ + "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package" \ + "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"; do + package_name=$(basename "$parser_package") + jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar" + if [ -f "$jar_file" ]; then + cp "$jar_file" "${OUT_DIR}/plugins/" + fi + done + + cp "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin/" + cp "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile" + + - name: Build and push tika-grpc snapshot + uses: docker/build-push-action@v6 + with: + context: target/tika-grpc-docker + platforms: linux/amd64,linux/arm64 + push: true + build-args: | + VERSION=${{ steps.version.outputs.tika_version }} + tags: | + apache/tika-grpc:${{ steps.version.outputs.tika_version }} diff --git a/tika-grpc/docker-build/Dockerfile b/tika-grpc/docker-build/Dockerfile new file mode 100644 index 0000000000..ccb77c088d --- /dev/null +++ b/tika-grpc/docker-build/Dockerfile @@ -0,0 +1,53 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+ +FROM ubuntu:plucky +COPY libs/ /tika/libs/ +COPY plugins/ /tika/plugins/ +COPY config/ /tika/config/ +COPY bin/ /tika/bin +ARG JRE='openjdk-17-jre-headless' +ARG VERSION +ARG TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE=104857600 +ARG TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE=104857600 +ARG TIKA_GRPC_NUM_THREADS=4 +RUN set -eux \ + && apt-get update \ + && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends $JRE \ + gdal-bin \ + tesseract-ocr \ + tesseract-ocr-eng \ + tesseract-ocr-ita \ + tesseract-ocr-fra \ + tesseract-ocr-spa \ + tesseract-ocr-deu \ + && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + xfonts-utils \ + fonts-freefont-ttf \ + fonts-liberation \ + ttf-mscorefonts-installer \ + wget \ + cabextract \ + && apt-get clean -y \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +EXPOSE 9090 +ENV TIKA_VERSION=$VERSION +ENV TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE=$TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE +ENV TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE=$TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE +ENV TIKA_GRPC_NUM_THREADS=$TIKA_GRPC_NUM_THREADS +RUN chmod +x "/tika/bin/start-tika-grpc.sh" +ENTRYPOINT ["/tika/bin/start-tika-grpc.sh"] + +LABEL maintainer="Apache Tika Developers [email protected]" diff --git a/tika-grpc/docker-build/docker-build.sh b/tika-grpc/docker-build/docker-build.sh new file mode 100755 index 0000000000..c522ec04fa --- /dev/null +++ b/tika-grpc/docker-build/docker-build.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script assembles the Docker build context for tika-grpc and builds the image. +# It is intended to be run from the root of the tika repository after a Maven build. + +set -euo pipefail + +if [ -z "${TIKA_VERSION:-}" ]; then + echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika" + exit 1 +fi + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +REPO_ROOT="${SCRIPT_DIR}/../../" + +cd "${REPO_ROOT}" || exit + +OUT_DIR=target/tika-grpc-docker + +MULTI_ARCH=${MULTI_ARCH:-false} +DOCKER_ID=${DOCKER_ID:-} +PROJECT_NAME=${PROJECT_NAME:-tika-grpc} + +# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION +if [[ -z "${RELEASE_IMAGE_TAG:-}" ]]; then + RELEASE_IMAGE_TAG="${TIKA_VERSION}" + # Remove '-SNAPSHOT' from the version string + RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}" +fi + +mkdir -p "${OUT_DIR}/libs" +mkdir -p "${OUT_DIR}/plugins" +mkdir -p "${OUT_DIR}/config" +mkdir -p "${OUT_DIR}/bin" +cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs" + +# Copy all tika-pipes plugin zip files +for dir in tika-pipes/tika-pipes-plugins/*/; do + plugin_name=$(basename "$dir") + zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip" + if [ -f "$zip_file" ]; then + cp -v -r "$zip_file" "${OUT_DIR}/plugins" + else + echo "WARNING: Plugin file $zip_file does not exist, skipping." 
+ fi +done + +# Copy parser package jars as plugins +parser_packages=( + "tika-parsers/tika-parsers-standard/tika-parsers-standard-package" + "tika-parsers/tika-parsers-extended/tika-parser-scientific-package" + "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package" + "tika-parsers/tika-parsers-ml/tika-parser-nlp-package" +) + +for parser_package in "${parser_packages[@]}"; do + package_name=$(basename "$parser_package") + jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar" + if [ -f "$jar_file" ]; then + cp -v -r "$jar_file" "${OUT_DIR}/plugins" + else + echo "Parser package file $jar_file does not exist, skipping." + fi +done + +cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin" +cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile" + +cd "${OUT_DIR}" || exit + +echo "Running docker build from directory: $(pwd)" + +IMAGE_TAGS=() +if [[ -n "${DOCKER_ID}" ]]; then + IMAGE_TAGS+=("-t" "${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}") +fi + +if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then + echo "No image tags specified. Set DOCKER_ID environment variable to enable Docker build." + exit 0 +fi + +if [ "${MULTI_ARCH}" == "true" ]; then + echo "Building multi-arch image" + docker buildx create --name tikabuilder --use || true + docker buildx build \ + --builder=tikabuilder . \ + "${IMAGE_TAGS[@]}" \ + --build-arg VERSION="${TIKA_VERSION}" \ + --platform linux/amd64,linux/arm64 \ + --push + docker buildx stop tikabuilder + docker buildx rm tikabuilder +else + echo "Building single-arch image" + docker build . 
"${IMAGE_TAGS[@]}" --build-arg VERSION="${TIKA_VERSION}" +fi + +echo "===================================================================================================" +echo "Done running docker build with tags: ${IMAGE_TAGS[*]}" +echo "===================================================================================================" diff --git a/tika-grpc/docker-build/start-tika-grpc.sh b/tika-grpc/docker-build/start-tika-grpc.sh new file mode 100755 index 0000000000..c42c953d7b --- /dev/null +++ b/tika-grpc/docker-build/start-tika-grpc.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+ +echo "Tika Version:" +echo "${TIKA_VERSION}" +echo "Tika Plugins:" +ls "/tika/plugins" +echo "Tika gRPC Max Inbound Message Size:" +echo "${TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE}" +echo "Tika gRPC Max Outbound Message Size:" +echo "${TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE}" +echo "Tika gRPC Num Threads:" +echo "${TIKA_GRPC_NUM_THREADS}" +exec java \ + -Dgrpc.server.port=9090 \ + "-Dgrpc.server.max-inbound-message-size=${TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE}" \ + "-Dgrpc.server.max-outbound-message-size=${TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE}" \ + "-Dgrpc.server.numThreads=${TIKA_GRPC_NUM_THREADS}" \ + --add-opens=jdk.management/com.sun.management.internal=ALL-UNNAMED \ + --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED \ + --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \ + --add-opens=java.management/com.sun.jmx.mbeanserver=ALL-UNNAMED \ + --add-opens=jdk.internal.jvmstat/sun.jvmstat.monitor=ALL-UNNAMED \ + --add-opens=java.base/sun.reflect.generics.reflectiveObjects=ALL-UNNAMED \ + --add-opens=java.base/java.io=ALL-UNNAMED \ + --add-opens=java.base/java.nio=ALL-UNNAMED \ + --add-opens=java.base/java.util=ALL-UNNAMED \ + --add-opens=java.base/java.lang=ALL-UNNAMED \ + -Djava.net.preferIPv4Stack=true \ + "-Dplugins.pluginDirs=/tika/plugins" \ + -jar "/tika/libs/tika-grpc-${TIKA_VERSION}.jar" diff --git a/tika-server/docker-build/docker-tool.sh b/tika-server/docker-build/docker-tool.sh new file mode 100755 index 0000000000..2a82b5fa34 --- /dev/null +++ b/tika-server/docker-build/docker-tool.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +image_name=apache/tika + +stop_and_die() { + docker buildx rm tika-builder || die "couldn't stop builder -- make sure to stop the builder manually! " + die "$*" +} + +die() { + echo "$*" >&2 + exit 1 +} + +while getopts ":h" opt; do + case ${opt} in + h ) + echo "Usage:" + echo " docker-tool.sh -h Display this help message." + echo " docker-tool.sh build <TIKA_DOCKER_VERSION> <TIKA_VERSION> Builds <TIKA_DOCKER_VERSION> images for <TIKA_VERSION>." + echo " docker-tool.sh test <TIKA_DOCKER_VERSION> Tests images for <TIKA_DOCKER_VERSION>." + echo " docker-tool.sh publish <TIKA_DOCKER_VERSION> <TIKA_VERSION> Builds multi-arch images for <TIKA_DOCKER_VERSION> and pushes to Docker Hub." + exit 0 + ;; + \? 
) + echo "Invalid Option: -$OPTARG" 1>&2 + exit 1 + ;; + esac +done + +stop_test_container() { + container_name=$1 + docker kill "$container_name" + docker rm "$container_name" +} + +test_docker_image() { + container_name=$1 + image=$image_name:$1 + full=$2 + + docker run -d --name "$container_name" -p 127.0.0.1:9998:9998 "$image" + sleep 10 + url=http://localhost:9998/ + status=$(curl --head --location --connect-timeout 5 --write-out %{http_code} --silent --output /dev/null ${url}) + user=$(docker inspect "$container_name" --format '{{.Config.User}}') + + if [[ $status == '200' ]] + then + echo "$(tput setaf 2)Image: $image - Basic test passed$(tput sgr0)" + else + echo "$(tput setaf 1)Image: $image - Basic test failed$(tput sgr0)" + stop_test_container "$container_name" + exit 1 + fi + + #now test that the user is correctly set + if [[ $user == '35002:35002' ]] + then + echo "$(tput setaf 2)Image: $image - User passed$(tput sgr0)" + else + echo "$(tput setaf 1)Image: $image - User failed$(tput sgr0)" + stop_test_container "$container_name" + exit 1 + fi + + if [ $full == true ] + then + # Test ImageMagick is installed and runnable + if docker exec "$1" /usr/bin/convert -version >/dev/null + then + echo "$(tput setaf 2)Image: $image - ImageMagick passed$(tput sgr0)" + else + echo "$(tput setaf 1)Image: $image - ImageMagick failed$(tput sgr0)" + stop_test_container "$container_name" + exit 1 + fi + fi + + stop_test_container "$container_name" +} + +shift $((OPTIND -1)) +subcommand=$1; shift +tika_docker_version=$1; shift +tika_version=$1; shift + + +case "$subcommand" in + build) + # Build slim tika- with minimal dependencies + docker build -t ${image_name}:${tika_docker_version} --build-arg TIKA_VERSION=${tika_version} - < minimal/Dockerfile --no-cache || die "couldn't build minimal" + # Build full tika- with OCR, Fonts and GDAL + docker build -t ${image_name}:${tika_docker_version}-full --build-arg TIKA_VERSION=${tika_version} - < full/Dockerfile --no-cache || 
die "couldn't build full" + ;; + + test) + # Test the images + test_docker_image ${tika_docker_version} false + test_docker_image "${tika_docker_version}-full" true + ;; + + publish) + docker buildx create --use --name tika-builder || die "couldn't create builder" + # Build multi-arch with buildx and push + docker buildx build --platform linux/arm/v7,linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ + --tag ${image_name}:latest --tag ${image_name}:${tika_docker_version} --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder minimal || stop_and_die "couldn't build multi-arch minimal" + docker buildx build --platform linux/arm/v7,linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ + --tag ${image_name}:latest-full --tag ${image_name}:${tika_docker_version}-full --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder full || stop_and_die "couldn't build multi-arch full" + docker buildx rm tika-builder || die "couldn't stop builder -- make sure to stop the builder manually! " + ;; + +esac diff --git a/tika-server/docker-build/full/Dockerfile b/tika-server/docker-build/full/Dockerfile new file mode 100644 index 0000000000..1b918390f6 --- /dev/null +++ b/tika-server/docker-build/full/Dockerfile @@ -0,0 +1,82 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+ +# "random" uid/gid hopefully not used anywhere else +# This needs to be set globally and then referenced in +# the subsequent stages -- see TIKA-3912 +ARG UID_GID="35002:35002" + +FROM ubuntu:plucky AS base + +FROM base AS fetch_tika + +ARG TIKA_VERSION +ARG CHECK_SIG=true + +ENV NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ + ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ + BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ + DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ + ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ + TIKA_VERSION=$TIKA_VERSION + +RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get -y install gnupg2 wget ca-certificates \ + && wget -t 10 --max-redirect 1 --retry-connrefused -qO- https://downloads.apache.org/tika/KEYS | gpg --import \ + && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $BACKUP_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || exit 1 \ + && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ + && sh -c "[ -f 
/tika-server-standard-${TIKA_VERSION}.jar.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1 \ + && gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar + +#RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi + +FROM base AS runtime +ARG UID_GID +ARG JRE='openjdk-21-jre-headless' +RUN set -eux \ + && apt-get update \ + && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends $JRE \ + gdal-bin \ + imagemagick \ + tesseract-ocr \ + tesseract-ocr-eng \ + tesseract-ocr-ita \ + tesseract-ocr-fra \ + tesseract-ocr-spa \ + tesseract-ocr-deu \ + tesseract-ocr-jpn \ + && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + xfonts-utils \ + fonts-freefont-ttf \ + fonts-liberation \ + ttf-mscorefonts-installer \ + wget \ + cabextract \ + && apt-get clean -y \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +ARG TIKA_VERSION +ENV TIKA_VERSION=$TIKA_VERSION + +COPY --from=fetch_tika /tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +USER $UID_GID + +EXPOSE 9998 +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] + +LABEL maintainer="Apache Tika Developers [email protected]" + diff --git a/tika-server/docker-build/full/Dockerfile.snapshot b/tika-server/docker-build/full/Dockerfile.snapshot new file mode 100644 index 0000000000..8882dc5b90 --- /dev/null 
+++ b/tika-server/docker-build/full/Dockerfile.snapshot @@ -0,0 +1,52 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +# Snapshot variant: copies the JAR from the Maven build output rather than +# downloading from Apache mirrors. Used for nightly/snapshot Docker builds. + +ARG UID_GID="35002:35002" + +FROM ubuntu:plucky AS runtime +ARG UID_GID +ARG TIKA_VERSION +ARG JRE='openjdk-21-jre-headless' +RUN set -eux \ + && apt-get update \ + && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends $JRE \ + gdal-bin \ + imagemagick \ + tesseract-ocr \ + tesseract-ocr-eng \ + tesseract-ocr-ita \ + tesseract-ocr-fra \ + tesseract-ocr-spa \ + tesseract-ocr-deu \ + tesseract-ocr-jpn \ + && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + xfonts-utils \ + fonts-freefont-ttf \ + fonts-liberation \ + ttf-mscorefonts-installer \ + wget \ + cabextract \ + && apt-get clean -y \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +ENV TIKA_VERSION=$TIKA_VERSION +COPY tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +USER $UID_GID +EXPOSE 9998 +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" 
org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] + +LABEL maintainer="Apache Tika Developers [email protected]" diff --git a/tika-server/docker-build/minimal/Dockerfile b/tika-server/docker-build/minimal/Dockerfile new file mode 100644 index 0000000000..1c5195920a --- /dev/null +++ b/tika-server/docker-build/minimal/Dockerfile @@ -0,0 +1,70 @@ + +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +# "random" uid/gid hopefully not used anywhere else +# This needs to be set globally and then referenced in +# the subsequent stages -- see TIKA-3912 +ARG UID_GID="35002:35002" + +FROM ubuntu:plucky AS base + +FROM base AS fetch_tika + +ARG TIKA_VERSION +ARG CHECK_SIG=true + +ENV NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ + ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ + BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ + DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ + ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ + TIKA_VERSION=$TIKA_VERSION + +RUN set -eux \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + gnupg2 \ + wget \ + 
ca-certificates \ + && wget -t 10 --max-redirect 1 --retry-connrefused -qO- https://downloads.apache.org/tika/KEYS | gpg --import \ + && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $BACKUP_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || exit 1 \ + && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1 \ + && gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar + +# this used to work, but I'm getting "ERROR: failed to solve: failed to prepare $data as $data2: invalid argument" +# when trying to build 2.9.2.0 +#RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi + +FROM base AS runtime +# must reference uid_gid +ARG UID_GID +ARG JRE='openjdk-21-jre-headless' +RUN set -eux \ + && apt-get update \ + && apt-get install --yes --no-install-recommends \ + ${JRE} \ + ca-certificates \ + && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +ARG TIKA_VERSION +ENV TIKA_VERSION=$TIKA_VERSION +COPY --from=fetch_tika 
/tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +USER $UID_GID +EXPOSE 9998 +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] + +LABEL maintainer="Apache Tika Developers [email protected]" diff --git a/tika-server/docker-build/minimal/Dockerfile.snapshot b/tika-server/docker-build/minimal/Dockerfile.snapshot new file mode 100644 index 0000000000..ac6644f345 --- /dev/null +++ b/tika-server/docker-build/minimal/Dockerfile.snapshot @@ -0,0 +1,34 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +# Snapshot variant: copies the JAR from the Maven build output rather than +# downloading from Apache mirrors. Used for nightly/snapshot Docker builds. 
+ +ARG UID_GID="35002:35002" + +FROM ubuntu:plucky AS runtime +ARG UID_GID +ARG TIKA_VERSION +ARG JRE='openjdk-21-jre-headless' +RUN set -eux \ + && apt-get update \ + && apt-get install --yes --no-install-recommends \ + ${JRE} \ + ca-certificates \ + && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +ENV TIKA_VERSION=$TIKA_VERSION +COPY tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +USER $UID_GID +EXPOSE 9998 +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] + +LABEL maintainer="Apache Tika Developers [email protected]" diff --git a/tika-server/docker-build/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-server/docker-build/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties new file mode 100644 index 0000000000..b4b787ffc6 --- /dev/null +++ b/tika-server/docker-build/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# You customise or add the settings you want here +language=eng+spa+fra+deu+ita +timeout=240 +minFileSizeToOcr=1 +enableImageProcessing=0 +density=200 +depth=8 +filter=box +resize=300 +applyRotation=true \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml b/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml new file mode 100644 index 0000000000..1c9b613033 --- /dev/null +++ b/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml @@ -0,0 +1,31 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> +<properties> + <parsers> + <!-- Load TesseractOCRParser (could use DefaultParser if you want others too) --> + <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/> + + <!-- Extract and OCR Inline Images in PDF --> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <param name="extractInlineImages" type="bool">true</param> + </params> + </parser> + + </parsers> +</properties> diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml new file mode 100644 index 0000000000..bcd8666996 --- /dev/null +++ b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml @@ -0,0 +1,38 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> +<properties> + <parsers> + <!-- Load TesseractOCRParser (could use DefaultParser if you want others too) --> + <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/> + + <!-- OCR on Rendered Pages --> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <!-- no_ocr - extract text only + ocr_only - don't extract text and just attempt OCR + ocr_and_text - extract text and attempt OCR (from Tika 1.24) + auto - extract text but if < 10 characters try OCR + --> + <param name="ocrStrategy" type="string">ocr_only</param> + <param name="ocrImageType" type="string">rgb</param> + <param name="ocrDPI" type="int">100</param> + </params> + </parser> + + </parsers> +</properties> diff --git a/tika-server/docker-build/sample-configs/grobid/org/apache/tika/parser/journal/GrobidExtractor.properties b/tika-server/docker-build/sample-configs/grobid/org/apache/tika/parser/journal/GrobidExtractor.properties new file mode 100644 index 0000000000..44689a2bb3 --- /dev/null +++ b/tika-server/docker-build/sample-configs/grobid/org/apache/tika/parser/journal/GrobidExtractor.properties @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +grobid.server.url=http://grobid:8070 \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/grobid/tika-config.xml b/tika-server/docker-build/sample-configs/grobid/tika-config.xml new file mode 100644 index 0000000000..5b4aad9c72 --- /dev/null +++ b/tika-server/docker-build/sample-configs/grobid/tika-config.xml @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> +<properties> + <parsers> + <parser class="org.apache.tika.parser.journal.JournalParser"> + <mime>application/pdf</mime> + </parser> + </parsers> +</properties> diff --git a/tika-server/docker-build/sample-configs/ner/run_tika_server.sh b/tika-server/docker-build/sample-configs/ner/run_tika_server.sh new file mode 100755 index 0000000000..fb447be4cf --- /dev/null +++ b/tika-server/docker-build/sample-configs/ner/run_tika_server.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +############################################################################# +# See https://cwiki.apache.org/confluence/display/TIKA/TikaAndNER for details +# on how to configure additional NER libraries +############################################################################# + +# ------------------------------------ +# Download OpenNLP Models to classpath +# ------------------------------------ + +OPENNLP_LOCATION="/ner/org/apache/tika/parser/ner/opennlp" +URL="http://opennlp.sourceforge.net/models-1.5" + +mkdir -p "$OPENNLP_LOCATION" +if [ "$(ls -A "$OPENNLP_LOCATION"/*.bin 2>/dev/null)" ]; then # an unmatched glob only means no models yet, so keep ls quiet + echo "OpenNLP models directory has files, so skipping fetch"; +else + echo "No OpenNLP models found, so fetching them" + wget "$URL/en-ner-person.bin" -O "$OPENNLP_LOCATION/ner-person.bin" + wget "$URL/en-ner-location.bin" -O "$OPENNLP_LOCATION/ner-location.bin" + wget "$URL/en-ner-organization.bin" -O "$OPENNLP_LOCATION/ner-organization.bin" + wget "$URL/en-ner-date.bin" -O "$OPENNLP_LOCATION/ner-date.bin" + wget "$URL/en-ner-time.bin" -O "$OPENNLP_LOCATION/ner-time.bin" + wget "$URL/en-ner-percentage.bin" -O "$OPENNLP_LOCATION/ner-percentage.bin" + wget "$URL/en-ner-money.bin" -O "$OPENNLP_LOCATION/ner-money.bin" +fi + +# -------------------------------------------- +# Create RegExp Example for Email on classpath +# -------------------------------------------- +REGEXP_LOCATION="/ner/org/apache/tika/parser/ner/regex" 
+mkdir -p "$REGEXP_LOCATION" +echo "EMAIL=(?:[a-z0-9!#$%&'*+/=?^_\`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_\`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])" > "$REGEXP_LOCATION/ner-regex.txt" + + +# ------------------- +# Now run Tika Server +# ------------------- + +# Can be a single implementation or comma separated list for multiple for "ner.impl.class" property +RECOGNISERS=org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser,org.apache.tika.parser.ner.regex.RegexNERecogniser +# Set classpath to the Tika Server JAR and the /ner folder so it has the configuration and models from above +CLASSPATH="/ner:/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*" +# Run the server with the custom configuration ner.impl.class property and custom /ner/tika-config.xml (the classpath stays quoted so that java, not the shell, expands /tika-extras/*) +exec java -Dner.impl.class="$RECOGNISERS" -cp "$CLASSPATH" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 -c /ner/tika-config.xml \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/ner/tika-config.xml b/tika-server/docker-build/sample-configs/ner/tika-config.xml new file mode 100644 index 0000000000..65d5774c22 --- /dev/null +++ b/tika-server/docker-build/sample-configs/ner/tika-config.xml @@ -0,0 +1,28 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> +<properties> + <parsers> + <parser class="org.apache.tika.parser.ner.NamedEntityParser"> + <mime>application/pdf</mime> + <mime>text/plain</mime> + <mime>text/html</mime> + <mime>application/xhtml+xml</mime> + </parser> + </parsers> +</properties> + diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml b/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml new file mode 100644 index 0000000000..c70c207b28 --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> +<properties> + <parsers> + <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser"> + <mime>image/jpeg</mime> + <mime>image/png</mime> + <mime>image/gif</mime> + <params> + <param name="apiBaseUri" type="uri">http://inception-caption:8764/inception/v3</param> + <param name="captions" type="int">5</param> + <param name="maxCaptionLength" type="int">15</param> + <param name="class" type="string">org.apache.tika.parser.captioning.tf.TensorflowRESTCaptioner</param> + </params> + </parser> + </parsers> +</properties> \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml b/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml new file mode 100644 index 0000000000..f6a4e6a938 --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> +<properties> + <parsers> + <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser"> + <mime>video/mp4</mime> + <mime>video/quicktime</mime> + <params> + <param name="apiBaseUri" type="uri">http://inception-video:8764/inception/v4</param> + <param name="topN" type="int">4</param> + <param name="minConfidence" type="double">0.015</param> + <param name="mode" type="string">fixed</param> + <param name="class" type="string">org.apache.tika.parser.recognition.tf.TensorflowRESTVideoRecogniser</param> + </params> + </parser> + </parsers> +</properties> \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest.xml b/tika-server/docker-build/sample-configs/vision/inception-rest.xml new file mode 100644 index 0000000000..caa6468595 --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/inception-rest.xml @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> +<properties> + <parsers> + <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser"> + <mime>image/jpeg</mime> + <mime>image/png</mime> + <mime>image/gif</mime> + <params> + <param name="apiBaseUri" type="uri">http://inception-rest:8764/inception/v4</param> + <param name="topN" type="int">2</param> + <param name="minConfidence" type="double">0.015</param> + <param name="class" type="string">org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser</param> + </params> + </parser> + </parsers> +</properties>
