This is an automated email from the ASF dual-hosted git repository. ndipiazza pushed a commit to branch TIKA-4272-docker in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9fbaadfad02344567fd9689af9439d39fc22f43c Author: Nicholas DiPiazza <ndipia...@apache.org> AuthorDate: Fri Aug 16 14:14:08 2024 -0500 TIKA-4272: start of some fixes for tika grpc so that it works in tika-docker. plugins were stepping on each other's toes with classpath before. --- tika-core/pom.xml | 6 + .../org/apache/tika/pipes/fetcher/Fetcher.java | 4 +- tika-fuzzing/pom.xml | 8 +- tika-grpc/pom.xml | 6 + .../org/apache/tika/pipes/grpc/TikaGrpcServer.java | 29 +++- .../apache/tika/pipes/grpc/TikaGrpcServerImpl.java | 26 ++- tika-grpc/src/main/proto/tika.proto | 12 ++ .../apache/tika/pipes/grpc/TikaGrpcServerTest.java | 5 +- .../src/test/resources/tika-pipes-test-config.xml | 8 +- tika-integration-tests/pom.xml | 6 + tika-parent/pom.xml | 7 + tika-pipes/tika-fetchers/pom.xml | 68 +++++++- .../tika-fetchers/tika-fetcher-az-blob/pom.xml | 21 +-- .../pipes/fetcher/azblob/AZBlobFetcherPlugin.java | 38 +++-- .../src/main/resources/plugin.properties | 21 +++ .../{tika-fetcher-gcs => tika-fetcher-fs}/pom.xml | 19 +-- .../tika/pipes/fetcher/fs/FileSystemFetcher.java | 182 +++++++++++++++++++++ .../pipes/fetcher/fs/FileSystemFetcherPlugin.java | 38 +++-- .../fetcher/fs/config/FileSystemFetcherConfig.java | 36 ++-- .../src/main/resources/plugin.properties | 21 +++ tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml | 8 +- .../tika/pipes/fetcher/gcs/GCSFetcherPlugin.java | 38 +++-- .../src/main/resources/plugin.properties | 21 +++ tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml | 23 --- .../tika/pipes/fetcher/http/HttpFetcherPlugin.java | 38 +++-- .../src/main/resources/plugin.properties | 21 +++ .../tika-fetcher-microsoft-graph/pom.xml | 20 +-- .../microsoftgraph/MicrosoftGraphFetcher.java | 2 + .../microsoftgraph/MicrosoftGraphPlugin.java | 38 +++-- .../src/main/resources/plugin.properties | 21 +++ tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml | 13 +- .../tika/pipes/fetcher/s3/S3FetcherPlugin.java | 38 +++-- .../src/main/resources/plugin.properties | 21 +++ 
.../{tika-fetcher-gcs => tika-fetcher-url}/pom.xml | 22 +-- .../apache/tika/pipes/fetcher/url/UrlFetcher.java | 53 ++++++ .../pipes/fetcher/url/config/UrlFetcherConfig.java | 23 +-- .../src/main/resources/plugin.properties | 21 +++ tika-server/tika-server-core/pom.xml | 7 +- 38 files changed, 728 insertions(+), 261 deletions(-) diff --git a/tika-core/pom.xml b/tika-core/pom.xml index 7e163c061..172552a17 100644 --- a/tika-core/pom.xml +++ b/tika-core/pom.xml @@ -40,6 +40,12 @@ <groupId>org.slf4j</groupId> <artifactId>slf4j-api</artifactId> </dependency> + <dependency> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + <!-- !!! VERY IMPORTANT --> + <scope>provided</scope> + </dependency> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java index 8f7a186fd..c7e1b3d43 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java @@ -19,6 +19,8 @@ package org.apache.tika.pipes.fetcher; import java.io.IOException; import java.io.InputStream; +import org.pf4j.ExtensionPoint; + import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -30,7 +32,7 @@ import org.apache.tika.parser.ParseContext; * <p> * Implementations of Fetcher must be thread safe. 
*/ -public interface Fetcher { +public interface Fetcher extends ExtensionPoint { String getName(); diff --git a/tika-fuzzing/pom.xml b/tika-fuzzing/pom.xml index 2faa23ce7..fdf94626c 100644 --- a/tika-fuzzing/pom.xml +++ b/tika-fuzzing/pom.xml @@ -87,6 +87,12 @@ <type>test-jar</type> <scope>test</scope> </dependency> + + <dependency> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + <scope>provided</scope> + </dependency> </dependencies> <build> <plugins> @@ -133,4 +139,4 @@ </plugin> </plugins> </build> -</project> \ No newline at end of file +</project> diff --git a/tika-grpc/pom.xml b/tika-grpc/pom.xml index d7e40d748..6cd06e9f3 100644 --- a/tika-grpc/pom.xml +++ b/tika-grpc/pom.xml @@ -41,6 +41,7 @@ <j2objc-annotations.version>3.0.0</j2objc-annotations.version> <!-- javadocs doesn't build for generated code --> <maven.javadoc.skip>true</maven.javadoc.skip> + <pf4j.version>3.12.0</pf4j.version> </properties> <dependencyManagement> @@ -226,6 +227,11 @@ <groupId>com.fasterxml.jackson.module</groupId> <artifactId>jackson-module-jsonSchema</artifactId> </dependency> + <dependency> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + <version>${pf4j.version}</version> + </dependency> <dependency> <groupId>com.asarkar.grpc</groupId> <artifactId>grpc-test</artifactId> diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java index 506522c74..2ef816d3a 100644 --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java @@ -21,6 +21,8 @@ import static io.grpc.health.v1.HealthCheckResponse.ServingStatus; import java.io.File; import java.io.FileWriter; import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.List; import java.util.concurrent.TimeUnit; import com.beust.jcommander.JCommander; @@ -32,11 +34,15 @@ import 
io.grpc.ServerCredentials; import io.grpc.TlsServerCredentials; import io.grpc.protobuf.services.HealthStatusManager; import io.grpc.protobuf.services.ProtoReflectionService; +import org.pf4j.DefaultPluginManager; +import org.pf4j.PluginManager; +import org.pf4j.PluginWrapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfigSerializer; +import org.apache.tika.pipes.fetcher.Fetcher; /** * Server that manages startup/shutdown of the GRPC Tika server. @@ -45,12 +51,17 @@ public class TikaGrpcServer { private static final Logger LOGGER = LoggerFactory.getLogger(TikaGrpcServer.class); public static final int TIKA_SERVER_GRPC_DEFAULT_PORT = 50052; private Server server; + // create the plugin manager + private PluginManager pluginManager; @Parameter(names = {"-p", "--port"}, description = "The grpc server port", help = true) private Integer port = TIKA_SERVER_GRPC_DEFAULT_PORT; - @Parameter(names = {"-c", "--config"}, description = "The grpc server port", help = true) + @Parameter(names = {"-c", "--config"}, description = "The grpc server configuration XML file", help = true) private File tikaConfigXml; + @Parameter(names = {"-d", "--plugins-dir"}, description = "Tika pipes plugin root directories", help = true) + private List<Path> pluginDirs; + @Parameter(names = {"-s", "--secure"}, description = "Enable credentials required to access this grpc server") private boolean secure; @@ -95,11 +106,25 @@ public class TikaGrpcServer { TikaConfigSerializer.serialize(new TikaConfig(), TikaConfigSerializer.Mode.STATIC_FULL, fw, StandardCharsets.UTF_8); } } + pluginManager = pluginDirs == null ? 
new DefaultPluginManager() : new DefaultPluginManager(pluginDirs); + pluginManager.loadPlugins(); + LOGGER.info("Loaded {} plugins", pluginManager.getPlugins().size()); + pluginManager.startPlugins(); + for (PluginWrapper plugin : pluginManager.getStartedPlugins()) { + LOGGER.info("Add-in " + plugin.getPluginId() + " : " + plugin.getDescriptor() + " has started."); + for (Class<?> extension : pluginManager.getExtensionClasses(plugin.getPluginId())) { + LOGGER.info(" Extension " + extension + " has been registered -- {}", extension.isAssignableFrom(Fetcher.class)); + LOGGER.info(" or -- {}", Fetcher.class.isAssignableFrom(extension)); + } + } + for (PluginWrapper plugin : pluginManager.getUnresolvedPlugins()) { + LOGGER.warn("Add-in " + plugin.getPluginId() + " : " + plugin.getDescriptor() + " is unresolved."); + } File tikaConfigFile = new File(tikaConfigXml.getAbsolutePath()); healthStatusManager.setStatus(TikaGrpcServer.class.getSimpleName(), ServingStatus.SERVING); server = Grpc .newServerBuilderForPort(port, creds) - .addService(new TikaGrpcServerImpl(tikaConfigFile.getAbsolutePath())) + .addService(new TikaGrpcServerImpl(tikaConfigFile.getAbsolutePath(), pluginManager)) .addService(healthStatusManager.getHealthService()) .addService(ProtoReflectionService.newInstance()) .build() diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java index 4eb5f0b01..d65178e8c 100644 --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java @@ -45,6 +45,7 @@ import io.grpc.protobuf.StatusProto; import io.grpc.stub.StreamObserver; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; +import org.pf4j.PluginManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -59,6 +60,8 @@ import 
org.apache.tika.GetFetcherConfigJsonSchemaReply; import org.apache.tika.GetFetcherConfigJsonSchemaRequest; import org.apache.tika.GetFetcherReply; import org.apache.tika.GetFetcherRequest; +import org.apache.tika.ListFetcherPluginsReply; +import org.apache.tika.ListFetcherPluginsRequest; import org.apache.tika.ListFetchersReply; import org.apache.tika.ListFetchersRequest; import org.apache.tika.SaveFetcherReply; @@ -76,6 +79,7 @@ import org.apache.tika.pipes.PipesResult; import org.apache.tika.pipes.emitter.EmitKey; import org.apache.tika.pipes.fetcher.AbstractFetcher; import org.apache.tika.pipes.fetcher.FetchKey; +import org.apache.tika.pipes.fetcher.Fetcher; import org.apache.tika.pipes.fetcher.config.AbstractConfig; import org.apache.tika.pipes.fetcher.config.FetcherConfigContainer; @@ -87,6 +91,8 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { } public static final JsonSchemaGenerator JSON_SCHEMA_GENERATOR = new JsonSchemaGenerator(OBJECT_MAPPER); + private final PluginManager pluginManager; + /** * FetcherID is key, The pair is the Fetcher object and the Metadata */ @@ -96,9 +102,8 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { String tikaConfigPath; - TikaGrpcServerImpl(String tikaConfigPath) - throws TikaConfigException, IOException, ParserConfigurationException, - TransformerException, SAXException { + TikaGrpcServerImpl(String tikaConfigPath, PluginManager pluginManager) throws TikaConfigException, IOException, + ParserConfigurationException, TransformerException, SAXException { File tikaConfigFile = new File(tikaConfigPath); if (!tikaConfigFile.canWrite()) { File tmpTikaConfigFile = File.createTempFile("configCopy", tikaConfigFile.getName()); @@ -114,8 +119,15 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { expiringFetcherStore = new ExpiringFetcherStore(pipesConfig.getStaleFetcherTimeoutSeconds(), pipesConfig.getStaleFetcherDelaySeconds()); + this.tikaConfigPath = tikaConfigPath; updateTikaConfig(); + + 
this.pluginManager = pluginManager; + List<Fetcher> fetchers = pluginManager.getExtensions(Fetcher.class); + for (Fetcher fetcher : fetchers) { + + } } private void updateTikaConfig() @@ -409,6 +421,14 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { responseObserver.onCompleted(); } + @Override + public void listFetcherPlugins(ListFetcherPluginsRequest request, StreamObserver<ListFetcherPluginsReply> responseObserver) { + for (Fetcher extension : pluginManager.getExtensions(Fetcher.class)) { + responseObserver.onNext(ListFetcherPluginsReply.newBuilder().setFetcherPluginId(extension.getName()).build()); + } + + } + private boolean deleteFetcher(String fetcherName) { return expiringFetcherStore.deleteFetcher(fetcherName); } diff --git a/tika-grpc/src/main/proto/tika.proto b/tika-grpc/src/main/proto/tika.proto index 572ded7ab..8019ca919 100644 --- a/tika-grpc/src/main/proto/tika.proto +++ b/tika-grpc/src/main/proto/tika.proto @@ -59,6 +59,11 @@ service Tika { Get the Fetcher Config schema for a given fetcher class. */ rpc GetFetcherConfigJsonSchema(GetFetcherConfigJsonSchemaRequest) returns (GetFetcherConfigJsonSchemaReply) {} + /* + List fetcher plugins + */ + rpc ListFetcherPlugins(ListFetcherPluginsRequest) returns (ListFetcherPluginsReply) {} + } message SaveFetcherRequest { @@ -143,3 +148,10 @@ message GetFetcherConfigJsonSchemaReply { // The json schema that describes the fetcher config in string format. 
string fetcher_config_json_schema = 1; } + +message ListFetcherPluginsRequest { +} + +message ListFetcherPluginsReply { + string fetcher_plugin_id = 1; +} diff --git a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java index 80f391e33..d5aebed67 100644 --- a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java +++ b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java @@ -52,6 +52,7 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; +import org.pf4j.DefaultPluginManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -93,7 +94,7 @@ public class TikaGrpcServerTest { Server server = InProcessServerBuilder .forName(serverName) .directExecutor() - .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath())) + .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath(), new DefaultPluginManager())) .build() .start(); resources.register(server, Duration.ofSeconds(10)); @@ -188,7 +189,7 @@ public class TikaGrpcServerTest { Server server = InProcessServerBuilder .forName(serverName) .directExecutor() - .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath())) + .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath(), new DefaultPluginManager())) .build() .start(); resources.register(server, Duration.ofSeconds(10)); diff --git a/tika-grpc/src/test/resources/tika-pipes-test-config.xml b/tika-grpc/src/test/resources/tika-pipes-test-config.xml index e4006edb3..e7f4240c3 100644 --- a/tika-grpc/src/test/resources/tika-pipes-test-config.xml +++ b/tika-grpc/src/test/resources/tika-pipes-test-config.xml @@ -13,8 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. ---> -<properties> +--><properties> <async> <staleFetcherTimeoutSeconds>600</staleFetcherTimeoutSeconds> <staleFetcherDelaySeconds>60</staleFetcherDelaySeconds> @@ -30,6 +29,5 @@ <maxForEmitBatchBytes>-1</maxForEmitBatchBytes> <!-- disable emit --> </params> </pipes> - <fetchers> - </fetchers> -</properties> + <fetchers/> +</properties> \ No newline at end of file diff --git a/tika-integration-tests/pom.xml b/tika-integration-tests/pom.xml index 5d641d852..620243c89 100644 --- a/tika-integration-tests/pom.xml +++ b/tika-integration-tests/pom.xml @@ -58,6 +58,12 @@ <artifactId>junit-vintage-engine</artifactId> <scope>test</scope> </dependency> + + <dependency> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + <scope>provided</scope> + </dependency> </dependencies> <scm> diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 3145cf7d5..d4175450c 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -445,6 +445,7 @@ <zstd.version>1.5.6-4</zstd.version> <nimbus-jose-jwt.version>9.40</nimbus-jose-jwt.version> <javacpp.version>1.5.10</javacpp.version> + <pf4j.version>3.12.0</pf4j.version> </properties> <dependencyManagement> @@ -1035,6 +1036,12 @@ <artifactId>jspecify</artifactId> <version>1.0.0</version> </dependency> + <dependency> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + <version>${pf4j.version}</version> + </dependency> + </dependencies> </dependencyManagement> diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml index 23b9d73f1..fa86d9db5 100644 --- a/tika-pipes/tika-fetchers/pom.xml +++ b/tika-pipes/tika-fetchers/pom.xml @@ -37,12 +37,76 @@ <module>tika-fetcher-gcs</module> <module>tika-fetcher-az-blob</module> <module>tika-fetcher-microsoft-graph</module> + <module>tika-fetcher-fs</module> + <module>tika-fetcher-url</module> </modules> <dependencies> - + <dependency> + 
<groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + <!-- !!! VERY IMPORTANT --> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j2-impl</artifactId> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>jcl-over-slf4j</artifactId> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j2-impl</artifactId> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>jcl-over-slf4j</artifactId> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-serialization</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-core</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-engine</artifactId> + <scope>test</scope> + </dependency> </dependencies> <scm> <tag>3.0.0-BETA2-rc1</tag> </scm> -</project> \ No newline at end of file +</project> diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/pom.xml index f0d7642e5..903007cf2 100644 --- 
a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/pom.xml @@ -29,29 +29,10 @@ <name>Apache Tika Azure Blob fetcher</name> <dependencies> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <scope>provided</scope> - </dependency> <dependency> <groupId>com.azure</groupId> <artifactId>azure-storage-blob</artifactId> </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <type>test-jar</type> - <scope>test</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-serialization</artifactId> - <version>${project.version}</version> - <scope>test</scope> - </dependency> </dependencies> <build> @@ -123,4 +104,4 @@ <scm> <tag>3.0.0-BETA2-rc1</tag> </scm> -</project> \ No newline at end of file +</project> diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcherPlugin.java similarity index 56% copy from tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java copy to tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcherPlugin.java index 8f7a186fd..5ae15613a 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcherPlugin.java @@ -14,25 +14,29 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.tika.pipes.fetcher; +package org.apache.tika.pipes.fetcher.azblob; -import java.io.IOException; -import java.io.InputStream; +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; +public class AZBlobFetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(AZBlobFetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } -/** - * Interface for an object that will fetch an InputStream given - * a fetch string. This will also update the metadata object - * based on the fetch. - * <p> - * Implementations of Fetcher must be thread safe. - */ -public interface Fetcher { - - String getName(); + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } - InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException; + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/resources/plugin.properties new file mode 100644 index 000000000..74dfeaadb --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +plugin.id=az-blob-fetcher +plugin.class=org.apache.tika.pipes.fetcher.azblob.AZBlobFetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=Azure Blob Fetcher +plugin.description=Capable of taking Blob IDs from AZ and using their bytes as tika parse bytes. diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-fs/pom.xml similarity index 89% copy from tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml copy to tika-pipes/tika-fetchers/tika-fetcher-fs/pom.xml index e3f5044d4..e4bb74b24 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-fs/pom.xml @@ -25,20 +25,11 @@ </parent> <modelVersion>4.0.0</modelVersion> - <artifactId>tika-fetcher-gcs</artifactId> - <name>Apache Tika Google Cloud Storage fetcher</name> + <artifactId>tika-fetcher-fs</artifactId> + <name>Apache Tika FS Fetcher</name> + <description>Apache Tika Pipes Fetcher for Local File System</description> <dependencies> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>com.google.cloud</groupId> - <artifactId>google-cloud-storage</artifactId> - </dependency> </dependencies> <build> @@ -49,7 +40,7 @@ <configuration> <archive> <manifestEntries> - <Automatic-Module-Name>org.apache.tika.pipes.fetcher.gcs</Automatic-Module-Name> + <Automatic-Module-Name>org.apache.tika.pipes.fetcher.s3</Automatic-Module-Name> </manifestEntries> </archive> 
</configuration> @@ -110,4 +101,4 @@ <scm> <tag>3.0.0-BETA2-rc1</tag> </scm> -</project> \ No newline at end of file +</project> diff --git a/tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java new file mode 100644 index 000000000..bc3c4cddd --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.fetcher.fs; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.attribute.BasicFileAttributes; +import java.nio.file.attribute.FileTime; +import java.util.Date; +import java.util.Map; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.FileSystem; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetcher.fs.config.FileSystemFetcherConfig; + +public class FileSystemFetcher extends AbstractFetcher implements Initializable { + public FileSystemFetcher() { + } + + public FileSystemFetcher(FileSystemFetcherConfig fileSystemFetcherConfig) { + setBasePath(fileSystemFetcherConfig.getBasePath()); + setExtractFileSystemMetadata(fileSystemFetcherConfig.isExtractFileSystemMetadata()); + } + + private static final Logger LOG = LoggerFactory.getLogger(FileSystemFetcher.class); + + //Warning! basePath can be null! 
+ private Path basePath = null; + + private boolean extractFileSystemMetadata = false; + + static boolean isDescendant(Path root, Path descendant) { + return descendant.toAbsolutePath().normalize() + .startsWith(root.toAbsolutePath().normalize()); + } + + @Override + public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws IOException, TikaException { + if (fetchKey.contains("\u0000")) { + throw new IllegalArgumentException("Path must not contain 'u0000'. " + + "Please review the life decisions that led you to requesting " + + "a file name with this character in it."); + } + Path p = null; + if (basePath != null) { + p = basePath.resolve(fetchKey); + if (!p.toRealPath().startsWith(basePath.toRealPath())) { + throw new IllegalArgumentException( + "fetchKey must resolve to be a descendant of the 'basePath'"); + } + } else { + p = Paths.get(fetchKey); + } + + metadata.set(TikaCoreProperties.SOURCE_PATH, fetchKey); + updateFileSystemMetadata(p, metadata); + + if (!Files.isRegularFile(p)) { + if (basePath != null && !Files.isDirectory(basePath)) { + throw new IOException("BasePath is not a directory: " + basePath); + } else { + throw new FileNotFoundException(p.toAbsolutePath().toString()); + } + } + + return TikaInputStream.get(p, metadata); + } + + private void updateFileSystemMetadata(Path p, Metadata metadata) throws IOException { + if (! extractFileSystemMetadata) { + return; + } + BasicFileAttributes attrs = Files.readAttributes(p, BasicFileAttributes.class); + updateFileTime(FileSystem.CREATED, attrs.creationTime(), metadata); + updateFileTime(FileSystem.MODIFIED, attrs.lastModifiedTime(), metadata); + updateFileTime(FileSystem.ACCESSED, attrs.lastAccessTime(), metadata); + //TODO extract owner or group? 
+ } + + private void updateFileTime(Property property, FileTime fileTime, Metadata metadata) { + if (fileTime == null) { + return; + } + metadata.set(property, new Date(fileTime.toMillis())); + } + + /** + * + * @return the basePath or <code>null</code> if no base path was set + */ + public Path getBasePath() { + return basePath; + } + + /** + * Default behavior is that clients will send in relative paths, this + * must be set to allow this fetcher to fetch the + * full path. + * + * @param basePath + */ + @Field + public void setBasePath(String basePath) { + this.basePath = Paths.get(basePath); + } + + /** + * Extract file system metadata (created, modified, accessed) when fetching file. + * The default is <code>false</code>. + * + * @param extractFileSystemMetadata + */ + @Field + public void setExtractFileSystemMetadata(boolean extractFileSystemMetadata) { + this.extractFileSystemMetadata = extractFileSystemMetadata; + } + + @Override + public void initialize(Map<String, Param> params) throws TikaConfigException { + //no-op + } + + @Override + public void checkInitialization(InitializableProblemHandler problemHandler) + throws TikaConfigException { + if (basePath == null || basePath.toString().trim().length() == 0) { + LOG.warn("'basePath' has not been set. " + + "This means that client code or clients can read from any file that this " + + "process has permissions to read. If you are running tika-server, make " + + "absolutely certain that you've locked down " + + "access to tika-server and file-permissions for the tika-server process."); + return; + } + if (basePath.toString().startsWith("http://")) { + throw new TikaConfigException("FileSystemFetcher only works with local file systems. " + + " Please use the tika-fetcher-http module for http calls"); + } else if (basePath.toString().startsWith("ftp://")) { + throw new TikaConfigException("FileSystemFetcher only works with local file systems. 
" + + " Please consider contributing an ftp fetcher module"); + } else if (basePath.toString().startsWith("s3://")) { + throw new TikaConfigException("FileSystemFetcher only works with local file systems. " + + " Please use the tika-fetcher-s3 module"); + } + + if (basePath.toAbsolutePath().toString().contains("\u0000")) { + throw new TikaConfigException( + "base path must not contain \u0000. " + "Seriously, what were you thinking?"); + } + } +} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherPlugin.java similarity index 56% copy from tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java copy to tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherPlugin.java index 8f7a186fd..931aa1089 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherPlugin.java @@ -14,25 +14,29 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes.fetcher; +package org.apache.tika.pipes.fetcher.fs; -import java.io.IOException; -import java.io.InputStream; +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; +public class FileSystemFetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(FileSystemFetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } -/** - * Interface for an object that will fetch an InputStream given - * a fetch string. This will also update the metadata object - * based on the fetch. 
- * <p> - * Implementations of Fetcher must be thread safe. - */ -public interface Fetcher { - - String getName(); + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } - InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException; + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java similarity index 52% copy from tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java copy to tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java index 8f7a186fd..b9f155fbd 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java @@ -14,25 +14,29 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes.fetcher; +package org.apache.tika.pipes.fetcher.fs.config; -import java.io.IOException; -import java.io.InputStream; +import org.apache.tika.pipes.fetcher.config.AbstractConfig; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; +public class FileSystemFetcherConfig extends AbstractConfig { + private String basePath; + private boolean extractFileSystemMetadata; -/** - * Interface for an object that will fetch an InputStream given - * a fetch string. This will also update the metadata object - * based on the fetch. - * <p> - * Implementations of Fetcher must be thread safe. 
- */ -public interface Fetcher { + public String getBasePath() { + return basePath; + } + + public FileSystemFetcherConfig setBasePath(String basePath) { + this.basePath = basePath; + return this; + } - String getName(); + public boolean isExtractFileSystemMetadata() { + return extractFileSystemMetadata; + } - InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException; + public FileSystemFetcherConfig setExtractFileSystemMetadata(boolean extractFileSystemMetadata) { + this.extractFileSystemMetadata = extractFileSystemMetadata; + return this; + } } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/resources/plugin.properties new file mode 100644 index 000000000..41b443a51 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-fs/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +plugin.id=fs-fetcher +plugin.class=org.apache.tika.pipes.fetcher.fs.FileSystemFetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=Local File System Fetcher +plugin.description=Capable of fetching the local file system diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml index e3f5044d4..f975d145e 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml @@ -29,12 +29,6 @@ <name>Apache Tika Google Cloud Storage fetcher</name> <dependencies> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <scope>provided</scope> - </dependency> <dependency> <groupId>com.google.cloud</groupId> <artifactId>google-cloud-storage</artifactId> @@ -110,4 +104,4 @@ <scm> <tag>3.0.0-BETA2-rc1</tag> </scm> -</project> \ No newline at end of file +</project> diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcherPlugin.java similarity index 56% copy from tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java copy to tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcherPlugin.java index 8f7a186fd..c90ebb140 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcherPlugin.java @@ -14,25 +14,29 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.tika.pipes.fetcher; +package org.apache.tika.pipes.fetcher.gcs; -import java.io.IOException; -import java.io.InputStream; +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; +public class GCSFetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(GCSFetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } -/** - * Interface for an object that will fetch an InputStream given - * a fetch string. This will also update the metadata object - * based on the fetch. - * <p> - * Implementations of Fetcher must be thread safe. - */ -public interface Fetcher { - - String getName(); + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } - InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException; + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/resources/plugin.properties new file mode 100644 index 000000000..79e5590e8 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +plugin.id=gcs-fetcher +plugin.class=org.apache.tika.pipes.fetcher.gcs.GCSFetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=GCS Fetcher +plugin.description=GCS Fetchedr diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml index 320569ed5..4d0761dd5 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml @@ -29,17 +29,6 @@ <name>Apache Tika http fetcher</name> <dependencies> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-slf4j2-impl</artifactId> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <scope>provided</scope> - </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-httpclient-commons</artifactId> @@ -61,18 +50,6 @@ <groupId>com.fasterxml.jackson.core</groupId> <artifactId>jackson-annotations</artifactId> </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <type>test-jar</type> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.mockito</groupId> - <artifactId>mockito-core</artifactId> - <scope>test</scope> - </dependency> </dependencies> <build> diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java 
b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcherPlugin.java similarity index 56% copy from tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java copy to tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcherPlugin.java index 8f7a186fd..bd77c1026 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcherPlugin.java @@ -14,25 +14,29 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes.fetcher; +package org.apache.tika.pipes.fetcher.http; -import java.io.IOException; -import java.io.InputStream; +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; +public class HttpFetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(HttpFetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } -/** - * Interface for an object that will fetch an InputStream given - * a fetch string. This will also update the metadata object - * based on the fetch. - * <p> - * Implementations of Fetcher must be thread safe. 
- */ -public interface Fetcher { - - String getName(); + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } - InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException; + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/resources/plugin.properties new file mode 100644 index 000000000..ecd3cb512 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +plugin.id=http-fetcher +plugin.class=org.apache.tika.pipes.fetcher.http.HttpFetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=HTTP Fetcher +plugin.description=HTTP web request fetcher diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml index ecee31f26..66951cf2c 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml @@ -56,11 +56,6 @@ </exclusion> </exclusions> </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - </dependency> <dependency> <groupId>com.microsoft.graph</groupId> <artifactId>microsoft-graph</artifactId> @@ -80,16 +75,6 @@ </exclusion> </exclusions> </dependency> - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-engine</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.mockito</groupId> - <artifactId>mockito-core</artifactId> - <scope>test</scope> - </dependency> <dependency> <groupId>org.mockito</groupId> <artifactId>mockito-junit-jupiter</artifactId> @@ -101,7 +86,6 @@ <artifactId>nimbus-jose-jwt</artifactId> </dependency> </dependencies> - <build> <plugins> <plugin> @@ -159,6 +143,10 @@ <resource>META-INF/DEPENDENCIES</resource> <file>target/classes/META-INF/DEPENDENCIES</file> </transformer> + <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer"> + <resource>META-INF/extensions.idx</resource> + <file>target/classes/META-INF/extensions.idx</file> + </transformer> </transformers> </configuration> </execution> diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java 
b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java index 3c27795d3..cb74b77f0 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java @@ -24,6 +24,7 @@ import java.util.Map; import com.azure.identity.ClientCertificateCredentialBuilder; import com.azure.identity.ClientSecretCredentialBuilder; import com.microsoft.graph.serviceclient.GraphServiceClient; +import org.pf4j.Extension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,6 +45,7 @@ import org.apache.tika.pipes.fetchers.microsoftgraph.config.MicrosoftGraphFetche * Fetches files from Microsoft Graph API. * Fetch keys are ${siteDriveId},${driveItemId} */ +@Extension public class MicrosoftGraphFetcher extends AbstractFetcher implements Initializable { private static final Logger LOGGER = LoggerFactory.getLogger(MicrosoftGraphFetcher.class); private GraphServiceClient graphClient; diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphPlugin.java similarity index 56% copy from tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java copy to tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphPlugin.java index 8f7a186fd..541ba0f93 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphPlugin.java @@ -14,25 +14,29 @@ * See the License for the specific language governing 
permissions and * limitations under the License. */ -package org.apache.tika.pipes.fetcher; +package org.apache.tika.pipes.fetchers.microsoftgraph; -import java.io.IOException; -import java.io.InputStream; +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; +public class MicrosoftGraphPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(MicrosoftGraphPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } -/** - * Interface for an object that will fetch an InputStream given - * a fetch string. This will also update the metadata object - * based on the fetch. - * <p> - * Implementations of Fetcher must be thread safe. - */ -public interface Fetcher { - - String getName(); + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } - InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException; + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/resources/plugin.properties new file mode 100644 index 000000000..6d7e508e1 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +plugin.id=microsoft-graph-fetcher +plugin.class=org.apache.tika.pipes.fetchers.microsoftgraph.MicrosoftGraphPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=Microsoft Graph Fetcher +plugin.description=Uses the Microsoft Graph API to fetch data diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml index 8c06d0099..2189451e1 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml @@ -34,15 +34,8 @@ <artifactId>aws-java-sdk-s3</artifactId> </dependency> <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-slf4j2-impl</artifactId> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <scope>provided</scope> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> </dependency> </dependencies> @@ -115,4 +108,4 @@ <scm> <tag>3.0.0-BETA2-rc1</tag> </scm> -</project> \ No newline at end of file +</project> diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3FetcherPlugin.java similarity index 56% copy from tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java copy to tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3FetcherPlugin.java index 8f7a186fd..97676ca7d 100644 --- 
a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3FetcherPlugin.java @@ -14,25 +14,29 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes.fetcher; +package org.apache.tika.pipes.fetcher.s3; -import java.io.IOException; -import java.io.InputStream; +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; +public class S3FetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(S3FetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } -/** - * Interface for an object that will fetch an InputStream given - * a fetch string. This will also update the metadata object - * based on the fetch. - * <p> - * Implementations of Fetcher must be thread safe. - */ -public interface Fetcher { - - String getName(); + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } - InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException; + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/resources/plugin.properties new file mode 100644 index 000000000..31bc1c52c --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +plugin.id=s3-fetcher +plugin.class=org.apache.tika.pipes.fetcher.s3.S3FetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=S3 Fetcher +plugin.description=Capable of using amazon s3 sdk and fetching content. diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-url/pom.xml similarity index 88% copy from tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml copy to tika-pipes/tika-fetchers/tika-fetcher-url/pom.xml index e3f5044d4..860f815fc 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-url/pom.xml @@ -25,21 +25,9 @@ </parent> <modelVersion>4.0.0</modelVersion> - <artifactId>tika-fetcher-gcs</artifactId> - <name>Apache Tika Google Cloud Storage fetcher</name> - - <dependencies> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>com.google.cloud</groupId> - <artifactId>google-cloud-storage</artifactId> - </dependency> - </dependencies> + <artifactId>tika-fetcher-url</artifactId> + <name>Apache Tika URL Fetcher</name> + <description>Apache Tika Pipes Fetcher for HTTP URLs</description> <build> <plugins> @@ -49,7 +37,7 @@ <configuration> <archive> <manifestEntries> - 
<Automatic-Module-Name>org.apache.tika.pipes.fetcher.gcs</Automatic-Module-Name> + <Automatic-Module-Name>org.apache.tika.pipes.fetcher.s3</Automatic-Module-Name> </manifestEntries> </archive> </configuration> @@ -110,4 +98,4 @@ <scm> <tag>3.0.0-BETA2-rc1</tag> </scm> -</project> \ No newline at end of file +</project> diff --git a/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java new file mode 100644 index 000000000..7692516cd --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.url; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Locale; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.fetcher.AbstractFetcher; + +/** + * Simple fetcher for URLs. 
This simply calls {@link TikaInputStream#get(URL)}. + * This intentionally does not support fetching for files. + * Please use the FileSystemFetcher for that. If you need more advanced control (passwords, + * timeouts, proxies, etc), please use the tika-fetcher-http module. + */ +public class UrlFetcher extends AbstractFetcher { + + @Override + public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws IOException, TikaException { + if (fetchKey.contains("\u0000")) { + throw new IllegalArgumentException("URL must not contain \u0000. " + + "Please review the life decisions that led you to requesting " + + "a URL with this character in it."); + } + if (fetchKey.toLowerCase(Locale.US).trim().startsWith("file:")) { + throw new IllegalArgumentException( + "The UrlFetcher does not fetch from file shares; " + + "please use the FileSystemFetcher"); + } + return TikaInputStream.get(new URL(fetchKey), metadata); + } + +} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/config/UrlFetcherConfig.java similarity index 56% copy from tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java copy to tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/config/UrlFetcherConfig.java index 8f7a186fd..0750b780b 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/config/UrlFetcherConfig.java @@ -14,25 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.tika.pipes.fetcher; +package org.apache.tika.pipes.fetcher.url.config; -import java.io.IOException; -import java.io.InputStream; +import org.apache.tika.pipes.fetcher.config.AbstractConfig; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; - -/** - * Interface for an object that will fetch an InputStream given - * a fetch string. This will also update the metadata object - * based on the fetch. - * <p> - * Implementations of Fetcher must be thread safe. - */ -public interface Fetcher { - - String getName(); - - InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException; +public class UrlFetcherConfig extends AbstractConfig { + // no fetcher config needed at this time. } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/resources/plugin.properties new file mode 100644 index 000000000..cc36bf1f5 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +plugin.id=url-fetcher +plugin.class=org.apache.tika.pipes.fetcher.url.UrlFetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=URL Fetcher +plugin.description=Capable of fetching URLs diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml index b7120731d..a0d821a84 100644 --- a/tika-server/tika-server-core/pom.xml +++ b/tika-server/tika-server-core/pom.xml @@ -130,6 +130,11 @@ <groupId>org.apache.logging.log4j</groupId> <artifactId>log4j-slf4j2-impl</artifactId> </dependency> + <dependency> + <groupId>org.pf4j</groupId> + <artifactId>pf4j</artifactId> + <scope>provided</scope> + </dependency> </dependencies> <build> @@ -335,4 +340,4 @@ <scm> <tag>3.0.0-BETA2-rc1</tag> </scm> -</project> \ No newline at end of file +</project>