This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new bc74d11b6 TIKA-4344 -- add a magika wrapper (#2036)
bc74d11b6 is described below
commit bc74d11b66d1d8fdb78867816962d232c6e1efcb
Author: Tim Allison <[email protected]>
AuthorDate: Tue Nov 5 12:45:40 2024 -0500
TIKA-4344 -- add a magika wrapper (#2036)
* TIKA-4344 -- add a magika wrapper
---
CHANGES.txt | 2 +
pom.xml | 2 +-
tika-detectors/pom.xml | 1 +
tika-detectors/tika-detector-magika/pom.xml | 112 +++++++
.../apache/tika/detect/magika/MagikaDetector.java | 354 +++++++++++++++++++++
.../services/org.apache.tika.detect.Detector | 16 +
.../tika/detect/magika/TestMagikaIntegration.java | 59 ++++
.../tika/detect/magika/TestMagikaJsonParsing.java | 111 +++++++
.../src/test/resources/configs/tika-config.xml | 28 ++
.../src/test/resources/json/test-basic-0.5.1.json | 21 ++
.../src/test/resources/json/test-basic.json | 33 ++
.../src/test/resources/test-documents/testPDF.pdf | Bin 0 -> 34824 bytes
12 files changed, 738 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 28b85fadf..395f41b07 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -3,6 +3,8 @@ Release 4.0.0-BETA1 - ???
Release 3.1.0 - ??
+ * Add a wrapper for Google's magika detector (TIKA-4344).
+
* Add support for MachO via Alexey Pelykh (TIKA-4309).
* Add logic to inject spaces in XPS files based on font widths via Ruairidh
Williamson (TIKA-4315).
diff --git a/pom.xml b/pom.xml
index aa7fe028f..035b98fd5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -39,6 +39,7 @@
<module>tika-bom</module>
<module>tika-core</module>
<module>tika-serialization</module>
+ <module>tika-detectors</module>
<module>tika-parsers</module>
<module>tika-bundles</module>
<module>tika-xmp</module>
@@ -54,7 +55,6 @@
<module>tika-translate</module>
<module>tika-example</module>
<module>tika-java7</module>
- <module>tika-detectors</module>
<module>tika-handlers</module>
</modules>
diff --git a/tika-detectors/pom.xml b/tika-detectors/pom.xml
index 145719d7a..1e0981b53 100644
--- a/tika-detectors/pom.xml
+++ b/tika-detectors/pom.xml
@@ -35,5 +35,6 @@
<modules>
<module>tika-detector-siegfried</module>
+ <module>tika-detector-magika</module>
</modules>
</project>
\ No newline at end of file
diff --git a/tika-detectors/tika-detector-magika/pom.xml
b/tika-detectors/tika-detector-magika/pom.xml
new file mode 100644
index 000000000..c4ba65f23
--- /dev/null
+++ b/tika-detectors/tika-detector-magika/pom.xml
@@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <artifactId>tika-detectors</artifactId>
+ <groupId>org.apache.tika</groupId>
+ <version>4.0.0-SNAPSHOT</version>
+ </parent>
+ <modelVersion>4.0.0</modelVersion>
+
+ <artifactId>tika-detector-magika</artifactId>
+ <name>Apache Tika magika wrapper</name>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-core</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-slf4j2-impl</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-shade-plugin</artifactId>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <createDependencyReducedPom>
+ false
+ </createDependencyReducedPom>
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+                  <!-- drop module descriptors, jar signature files and loose text files
+                       from every shaded artifact -->
+                  <excludes>
+                    <exclude>module-info.class</exclude>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                    <exclude>META-INF/*.txt</exclude>
+                  </excludes>
+ </filter>
+ </filters>
+ <transformers>
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+ <manifestEntries>
+ <Multi-Release>true</Multi-Release>
+ </manifestEntries>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+
<Automatic-Module-Name>org.apache.tika.detector.magika</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
diff --git
a/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java
b/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java
new file mode 100644
index 000000000..a96ad20a4
--- /dev/null
+++
b/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java
@@ -0,0 +1,354 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.magika;
+
+import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.ExternalProcess;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Simple wrapper around Google's magika: https://github.com/google/magika
+ * The tool must be installed on the host where Tika is running.
+ * The default behavior is to run detection, report the results in the
+ * metadata and then return application/octet-stream so that other detectors
+ * will be used.
+ */
+public class MagikaDetector implements Detector {
+
+ // Outcome codes written to MAGIKA_STATUS when a magika run fails:
+ // TIMEOUT, CRASH and JSON_PARSE_EXCEPTION are set in processResult.
+ // NOTE(review): SUCCESS is not referenced in this class; successful runs record
+ // the literal "ok" (or magika's own "status" value) instead -- confirm whether
+ // SUCCESS is still needed.
+ enum STATUS {
+ SUCCESS, TIMEOUT, CRASH, JSON_PARSE_EXCEPTION
+ }
+
+ public static final String MAGIKA_PREFIX = "magika:";
+
+    // Status of the magika run: a STATUS name on failure, otherwise "ok" or the
+    // "status" value reported by magika itself.
+    public static Property MAGIKA_STATUS =
+            Property.externalText(MAGIKA_PREFIX + "status");
+ public static Property MAGIKA_DESCRIPTION =
+ Property.externalText(MAGIKA_PREFIX + "description");
+ public static Property MAGIKA_SCORE =
+ Property.externalReal(MAGIKA_PREFIX + "score");
+ public static Property MAGIKA_GROUP =
+ Property.externalText(MAGIKA_PREFIX + "group");
+ public static Property MAGIKA_LABEL =
+ Property.externalText(MAGIKA_PREFIX + "label");
+ public static Property MAGIKA_MIME =
+ Property.externalText(MAGIKA_PREFIX + "mime_type");
+ public static Property MAGIKA_IS_TEXT =
+ Property.externalBoolean(MAGIKA_PREFIX + "is_text");
+
+ public static Property MAGIKA_ERRORS =
+ Property.externalTextBag(MAGIKA_PREFIX + "errors");
+
+ public static Property MAGIKA_VERSION =
Property.externalText(MAGIKA_PREFIX + "version");
+
+ //TODO -- grab errors and warnings
+
+ private static final Logger LOGGER =
LoggerFactory.getLogger(MagikaDetector.class);
+ private static final long DEFAULT_TIMEOUT_MS = 60000;
+ private static final String DEFAULT_MAGIKA_PATH = "magika";
+
+ //we set this during the initial check.
+ //we assume that a new version is not installed during the lifecycle of
the MagikaDetector
+ private static String MAGIKA_VERSION_STRING = "";
+
+ private static ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ private static boolean HAS_WARNED = false;
+ private Boolean hasMagika = null;
+ private String magikaPath = DEFAULT_MAGIKA_PATH;
+ private int maxBytes = 1_000_000;
+ private long timeoutMs = DEFAULT_TIMEOUT_MS;
+
+ private boolean useMime = false;
+
+    /**
+     * Runs <code>magikaCommandPath --version</code> to check that magika can be executed.
+     * As a side effect, the version parsed from stdout is stored in the static
+     * MAGIKA_VERSION_STRING, which is later written to the metadata as the magika version.
+     *
+     * @param magikaCommandPath path to (or name of) the magika executable
+     * @return <code>true</code> if the process ran and exited with 0; <code>false</code>
+     *         if it could not be started or exited with a non-zero value
+     */
+    public static boolean checkHasMagika(String magikaCommandPath) {
+        ProcessBuilder versionCommand = new ProcessBuilder(magikaCommandPath, "--version");
+        FileProcessResult result;
+        try {
+            //NOTE(review): the three 1000s look like timeout (ms) and stdout/stderr
+            //capture limits -- confirm against ProcessUtils.execute
+            result = ProcessUtils.execute(versionCommand, 1000, 1000, 1000);
+        } catch (IOException e) {
+            //keep the cause so that a missing or non-executable binary can be diagnosed
+            LOGGER.debug("problem with magika", e);
+            return false;
+        }
+
+        if (result.getExitValue() != 0) {
+            return false;
+        }
+        /* python
+            Matcher m = Pattern
+                .compile("Magika version:\\s+(.{4,50})").matcher("");
+
+         */
+        //rust: the version is the first whitespace-free token after "magika "
+        Matcher m = Pattern.compile("magika ([^\\s]{4,50})").matcher("");
+        for (String line : result.getStdout().split("[\r\n]+")) {
+            if (m.reset(line).find()) {
+                MAGIKA_VERSION_STRING = m.group(1);
+                break;
+            }
+        }
+        //a zero exit value is enough; an unparseable version line is not treated as failure
+        return true;
+    }
+
+    /**
+     * Runs magika on the stream and records its findings in the metadata.
+     * If the stream is a TikaInputStream, magika is run on its path (the full stream
+     * is spooled to disk if there is no underlying file); otherwise up to
+     * <code>maxBytes</code> are copied to a temporary file and the stream is reset.
+     *
+     * @param input    document input stream, or <code>null</code>
+     * @param metadata input metadata for the document; magika's results are added to it
+     * @return the mime reported by magika if <code>useMime</code> is <code>true</code> and
+     *         magika reported one; application/octet-stream otherwise, including when
+     *         the input is <code>null</code> or magika is not available
+     * @throws IOException if the stream cannot be spooled or the process cannot be run
+     */
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        //lazily check once per detector instance whether magika can be run
+        if (hasMagika == null) {
+            hasMagika = checkHasMagika(this.magikaPath);
+        }
+        if (!hasMagika) {
+            //warn only once per JVM
+            if (!HAS_WARNED) {
+                LOGGER.warn("'magika' command isn't working: '" + magikaPath + "'");
+                HAS_WARNED = true;
+            }
+            return MediaType.OCTET_STREAM;
+        }
+        if (input == null) {
+            //a null stream is allowed by the javadoc; there is nothing to run magika on
+            return MediaType.OCTET_STREAM;
+        }
+        TikaInputStream tis = TikaInputStream.cast(input);
+        if (tis != null) {
+            //spool the full file to disk, if called with a TikaInputStream
+            //and there is no underlying file
+            return detectOnPath(tis.getPath(), metadata);
+        }
+
+        //plain stream: copy at most maxBytes to a temp file, then rewind for later consumers
+        input.mark(maxBytes);
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            Path tmpFile = tmp.createTempFile();
+            Files.copy(new BoundedInputStream(maxBytes, input), tmpFile, REPLACE_EXISTING);
+            return detectOnPath(tmpFile, metadata);
+        } finally {
+            input.reset();
+        }
+    }
+
+ /**
+ * As default behavior, Tika runs magika to add its detection
+ * to the metadata, but NOT to use detection in determining parsers
+ * etc. If this is set to <code>true</code>, this detector
+ * will return the first mime detected by magika and that
+ * mime will be used by the AutoDetectParser to select the appropriate
+ * parser.
+ *
+ * @param useMime <code>true</code> to return magika's mime from detection;
+ * <code>false</code> (the default) to only record it in the metadata
+ */
+ @Field
+ public void setUseMime(boolean useMime) {
+ this.useMime = useMime;
+ }
+
+ /**
+ * @return whether the mime reported by magika is returned from detection
+ * (<code>true</code>) or only recorded in the metadata (<code>false</code>)
+ */
+ public boolean isUseMime() {
+ return useMime;
+ }
+
+    /**
+     * Runs <code>magika &lt;path&gt; --json</code> on the given file and passes the
+     * process result, together with the <code>useMime</code> setting, to processResult.
+     *
+     * @param path     file to run magika on
+     * @param metadata metadata that receives magika's findings
+     * @return whatever processResult returns for the process output
+     * @throws IOException if the process cannot be run
+     */
+    private MediaType detectOnPath(Path path, Metadata metadata) throws IOException {
+        String command = ProcessUtils.escapeCommandLine(magikaPath);
+        String target = ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString());
+        ProcessBuilder magikaProcess = new ProcessBuilder(command, target, "--json");
+        //NOTE(review): 10000000 and 1000 look like the stdout/stderr capture limits -- confirm
+        FileProcessResult outcome =
+                ProcessUtils.execute(magikaProcess, timeoutMs, 10000000, 1000);
+        return processResult(outcome, metadata, useMime);
+    }
+
+    /**
+     * Interprets the result of a magika run: records exit value and timeout flag,
+     * sets MAGIKA_STATUS on failure, and otherwise parses stdout as json and delegates
+     * to the handler for the newer ("result" wrapper) or older output layout.
+     * Only the first element of the json array is used.
+     *
+     * @param result     captured result of the magika process
+     * @param metadata   metadata that receives the findings
+     * @param returnMime whether the mime reported by magika should be returned
+     * @return the mime reported by magika if requested and available;
+     *         application/octet-stream otherwise
+     */
+    protected static MediaType processResult(FileProcessResult result, Metadata metadata,
+                                             boolean returnMime) {
+        metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue());
+        metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout());
+
+        if (result.isTimeout()) {
+            metadata.set(MAGIKA_STATUS, STATUS.TIMEOUT.name());
+            return MediaType.OCTET_STREAM;
+        }
+        if (result.getExitValue() != 0) {
+            metadata.set(MAGIKA_STATUS, STATUS.CRASH.name());
+            return MediaType.OCTET_STREAM;
+        }
+        JsonNode rootArray;
+        try {
+            rootArray = OBJECT_MAPPER.readTree(result.getStdout());
+        } catch (JsonProcessingException e) {
+            metadata.set(MAGIKA_STATUS, STATUS.JSON_PARSE_EXCEPTION.name());
+            return MediaType.OCTET_STREAM;
+        }
+        if (rootArray == null || !rootArray.isArray() || rootArray.isEmpty()) {
+            //stdout was not the expected non-empty json array; record that
+            //instead of leaving the status unset
+            metadata.set(MAGIKA_STATUS, STATUS.JSON_PARSE_EXCEPTION.name());
+            return MediaType.OCTET_STREAM;
+        }
+        //for now just take the first value
+        JsonNode root = rootArray.get(0);
+        //this is the more modern version
+        if (root.has("result")) {
+            return processNewer(root.get("result"), metadata, returnMime);
+        } else {
+            return processOlder(root, metadata, returnMime);
+        }
+    }
+
+    /**
+     * Handles the older json layout, in which the "output" object sits directly on the
+     * array element and the label key is "ct_label".
+     *
+     * @param root       first element of the json array written by magika
+     * @param metadata   metadata that receives the findings
+     * @param returnMime whether the mime reported by magika should be returned
+     * @return the parsed mime if requested and non-blank; application/octet-stream otherwise
+     */
+    private static MediaType processOlder(JsonNode root, Metadata metadata,
+                                          boolean returnMime) {
+        metadata.set(MAGIKA_STATUS, "ok");
+        //TODO -- should we get values in "dl" instead or in addition?
+        JsonNode output = root.get("output");
+        if (output == null) {
+            //do something else
+            return MediaType.OCTET_STREAM;
+        }
+        JsonNode score = output.get("score");
+        if (score != null) {
+            metadata.set(MAGIKA_SCORE, score.asDouble(-1.0));
+        }
+        addString(output, "description", MAGIKA_DESCRIPTION, metadata);
+        addString(output, "group", MAGIKA_GROUP, metadata);
+        addString(output, "ct_label", MAGIKA_LABEL, metadata);
+        addString(output, "mime_type", MAGIKA_MIME, metadata);
+        metadata.set(MAGIKA_VERSION, MAGIKA_VERSION_STRING);
+
+        String mime = metadata.get(MAGIKA_MIME);
+        if (!returnMime || StringUtils.isBlank(mime)) {
+            return MediaType.OCTET_STREAM;
+        }
+        return MediaType.parse(mime);
+    }
+
+    /**
+     * Handles the newer json layout, in which the findings are nested as
+     * "result" -&gt; "value" -&gt; "output" and the label key is "label".
+     *
+     * @param result     the "result" object of the first array element
+     * @param metadata   metadata that receives the findings
+     * @param returnMime whether the mime reported by magika should be returned
+     * @return the parsed mime if requested and non-blank; application/octet-stream otherwise
+     */
+    private static MediaType processNewer(JsonNode result, Metadata metadata,
+                                          boolean returnMime) {
+        //default to "ok"; replaced below if magika reported a non-blank "status"
+        metadata.set(MAGIKA_STATUS, "ok");
+        //TODO -- should we get values in "dl" instead or in addition?
+        addString(result, "status", MAGIKA_STATUS, metadata);
+
+        JsonNode value = result.get("value");
+        if (value == null) {
+            return MediaType.OCTET_STREAM;
+        }
+        JsonNode output = value.get("output");
+        if (output == null) {
+            //do something else
+            return MediaType.OCTET_STREAM;
+        }
+
+        //a score on "value" is recorded first; a score on "output", if present, replaces it
+        for (JsonNode scoreHolder : new JsonNode[]{value, output}) {
+            JsonNode score = scoreHolder.get("score");
+            if (score != null) {
+                metadata.set(MAGIKA_SCORE, score.asDouble(-1.0));
+            }
+        }
+        addString(output, "description", MAGIKA_DESCRIPTION, metadata);
+        addString(output, "group", MAGIKA_GROUP, metadata);
+        addString(output, "label", MAGIKA_LABEL, metadata);
+        addString(output, "mime_type", MAGIKA_MIME, metadata);
+        setBoolean(output, "is_text", MAGIKA_IS_TEXT, metadata);
+        metadata.set(MAGIKA_VERSION, MAGIKA_VERSION_STRING);
+
+        String mime = metadata.get(MAGIKA_MIME);
+        if (!returnMime || StringUtils.isBlank(mime)) {
+            return MediaType.OCTET_STREAM;
+        }
+        return MediaType.parse(mime);
+    }
+
+    /**
+     * Copies a json boolean into the metadata. Missing keys and values that are not
+     * json booleans are skipped without any report.
+     *
+     * @param node     json object to read from
+     * @param jsonKey  key to look up in <code>node</code>
+     * @param property metadata property to set
+     * @param metadata metadata to write to
+     */
+    private static void setBoolean(JsonNode node, String jsonKey, Property property,
+                                   Metadata metadata) {
+        JsonNode flag = node.get(jsonKey);
+        if (flag != null && flag.isBoolean()) {
+            metadata.set(property, flag.booleanValue());
+        }
+        //log? -- non-boolean values currently fall through silently
+    }
+
+    /**
+     * Copies a json value into the metadata as text. For a json array every non-blank
+     * element is added to the property; for any other value the property is set to
+     * the value's text unless it is blank. A missing key leaves the metadata untouched.
+     *
+     * @param node     json object to read from
+     * @param jsonKey  key to look up in <code>node</code>
+     * @param property metadata property to write
+     * @param metadata metadata to write to
+     */
+    private static void addString(JsonNode node, String jsonKey, Property property,
+                                  Metadata metadata) {
+        JsonNode value = node.get(jsonKey);
+        if (value == null) {
+            return;
+        }
+        if (value.isArray()) {
+            for (JsonNode element : value) {
+                String text = element.asText(StringUtils.EMPTY);
+                if (!StringUtils.isBlank(text)) {
+                    metadata.add(property, text);
+                }
+            }
+            return;
+        }
+        String text = value.asText(StringUtils.EMPTY);
+        if (!StringUtils.isBlank(text)) {
+            metadata.set(property, text);
+        }
+    }
+
+    /**
+     * Sets the path to (or name of) the magika executable and immediately re-checks
+     * whether it can be run, so that an availability result cached for a previous
+     * path is not reused by later calls to detect.
+     *
+     * @param fileCommandPath path to the magika executable
+     */
+    @Field
+    public void setMagikaPath(String fileCommandPath) {
+        //this opens up a potential command vulnerability.
+        //Don't ever let an untrusted user set this.
+        this.magikaPath = fileCommandPath;
+        //store the outcome: detect() only runs the check itself while hasMagika is null
+        this.hasMagika = checkHasMagika(this.magikaPath);
+    }
+
+ /**
+ * If this is not called on a TikaInputStream, this detector
+ * will spool up to this many bytes to a temporary file to be
+ * detected by magika.
+ *
+ * @param maxBytes maximum number of bytes to spool (default 1,000,000)
+ */
+ @Field
+ public void setMaxBytes(int maxBytes) {
+ this.maxBytes = maxBytes;
+ }
+
+ /**
+ * Sets the timeout that is passed to ProcessUtils.execute when magika is run
+ * on a file.
+ *
+ * @param timeoutMs timeout in milliseconds (default 60000)
+ */
+ @Field
+ public void setTimeoutMs(long timeoutMs) {
+ this.timeoutMs = timeoutMs;
+ }
+}
diff --git
a/tika-detectors/tika-detector-magika/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
b/tika-detectors/tika-detector-magika/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
new file mode 100644
index 000000000..380301a67
--- /dev/null
+++
b/tika-detectors/tika-detector-magika/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.detect.magika.MagikaDetector
\ No newline at end of file
diff --git
a/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java
b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java
new file mode 100644
index 000000000..9b190d77c
--- /dev/null
+++
b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.magika;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.net.URISyntaxException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+
+@Disabled("need to have magika on the path")
+public class TestMagikaIntegration extends TikaTest {
+
+    /**
+     * End-to-end check against a locally installed magika: parses testPDF.pdf with the
+     * detector configured in tika-config.xml and verifies the magika metadata as well
+     * as the resulting content type.
+     */
+    @Test
+    public void testIntegration() throws Exception {
+        TikaConfig tikaConfig = new TikaConfig(getConfig("tika-config.xml"));
+        Parser p = new AutoDetectParser(tikaConfig);
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p);
+        //dump what was already extracted instead of parsing the file a second time
+        debug(metadataList);
+        Metadata m = metadataList.get(0);
+        assertEquals("PDF document", m.get(MagikaDetector.MAGIKA_DESCRIPTION));
+        assertEquals(1.0, Double.parseDouble(m.get(MagikaDetector.MAGIKA_SCORE)), 0.1);
+        assertEquals("document", m.get(MagikaDetector.MAGIKA_GROUP));
+        assertEquals("0.1.0-rc.1", m.get(MagikaDetector.MAGIKA_VERSION));
+        assertEquals("application/pdf", m.get(MagikaDetector.MAGIKA_MIME));
+        assertEquals("application/pdf", m.get(Metadata.CONTENT_TYPE));
+        assertEquals("ok", m.get(MagikaDetector.MAGIKA_STATUS));
+    }
+
+    /**
+     * Resolves a config file under /configs/ on the test classpath to a file system path.
+     *
+     * @param configName file name inside /configs/
+     * @return path to the config file
+     * @throws URISyntaxException if the resource URL cannot be converted to a URI
+     */
+    private Path getConfig(String configName) throws URISyntaxException {
+        String resourceName = "/configs/" + configName;
+        return Paths.get(getClass().getResource(resourceName).toURI());
+    }
+
+}
diff --git
a/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java
b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java
new file mode 100644
index 000000000..26b290065
--- /dev/null
+++
b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.magika;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.FileProcessResult;
+
+public class TestMagikaJsonParsing extends TikaTest {
+
+ //TODO -- add testcontainers unit test with dockerized magika
+
+ @Test
+ public void testPython0_5_1() throws Exception {
+ //this is the older python package available at the time of
development from pypi
+ FileProcessResult fileProcessResult = load("test-basic-0.5.1.json");
+ Metadata metadata = new Metadata();
+ MagikaDetector.processResult(fileProcessResult, metadata, false);
+ assertEquals("ok", metadata.get(MagikaDetector.MAGIKA_STATUS));
+ assertEquals("Python source",
metadata.get(MagikaDetector.MAGIKA_DESCRIPTION));
+ assertEquals(0.999987125396,
Double.parseDouble(metadata.get(MagikaDetector.MAGIKA_SCORE)), 0.0000001);
+ assertEquals("code", metadata.get(MagikaDetector.MAGIKA_GROUP));
+ assertEquals("python", metadata.get(MagikaDetector.MAGIKA_LABEL));
+ assertEquals("text/x-python",
metadata.get(MagikaDetector.MAGIKA_MIME));
+ assertNull(metadata.get(MagikaDetector.MAGIKA_IS_TEXT));
+ }
+
+ @Test
+ public void testRust0_1_0_rc1() throws Exception {
+ //this is the way of the future -- rust-based
+ FileProcessResult fileProcessResult = load("test-basic.json");
+ Metadata metadata = new Metadata();
+ MagikaDetector.processResult(fileProcessResult, metadata, false);
+ assertEquals("ok", metadata.get(MagikaDetector.MAGIKA_STATUS));
+ assertEquals("Python source",
metadata.get(MagikaDetector.MAGIKA_DESCRIPTION));
+ assertEquals(0.753000020980835,
Double.parseDouble(metadata.get(MagikaDetector.MAGIKA_SCORE)), 0.0000001);
+ assertEquals("code", metadata.get(MagikaDetector.MAGIKA_GROUP));
+ assertEquals("python", metadata.get(MagikaDetector.MAGIKA_LABEL));
+ assertEquals("text/x-python",
metadata.get(MagikaDetector.MAGIKA_MIME));
+ assertEquals(true,
Boolean.parseBoolean(metadata.get(MagikaDetector.MAGIKA_IS_TEXT)));
+
+ }
+/*
+ @Test
+ public void testErrors() throws Exception {
+ FileProcessResult fileProcessResult = load("test-errors.json");
+ Metadata metadata = new Metadata();
+ SiegfriedDetector.processResult(fileProcessResult, metadata, false);
+ //debug(metadata);
+ assertEquals("1.9.5",
metadata.get(SiegfriedDetector.SIEGFRIED_VERSION));
+ assertEquals("default.sig",
metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE));
+ assertEquals("x-fmt/111", metadata.get("sf:pronom:id"));
+ assertEquals("extension match txt", metadata.get("sf:pronom:basis"));
+ assertEquals("Plain Text File", metadata.get("sf:pronom:format"));
+ assertEquals("text/plain", metadata.get("sf:pronom:mime"));
+ assertNull(metadata.get("sf:pronom:version"));
+ assertEquals("empty source",
metadata.get(SiegfriedDetector.SIEGFRIED_ERRORS));
+ }
+
+ @Test
+ public void testWarnings() throws Exception {
+ FileProcessResult fileProcessResult = load("test-warnings.json");
+ Metadata metadata = new Metadata();
+ SiegfriedDetector.processResult(fileProcessResult, metadata, false);
+ assertEquals("1.9.5",
metadata.get(SiegfriedDetector.SIEGFRIED_VERSION));
+ assertEquals("default.sig",
metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE));
+ assertEquals("UNKNOWN", metadata.get("sf:pronom:id"));
+ assertNull(metadata.get("sf:pronom:basis"));
+ assertNull(metadata.get("sf:pronom:format"));
+ assertNull(metadata.get("sf:pronom:mime"));
+ assertNull(metadata.get("sf:pronom:version"));
+ assertTrue(metadata.get("sf:pronom:warning")
+ .startsWith("no match; possibilities based on extension are
fmt/14, fmt/15, fmt/16, " +
+ "fmt/17, fmt/18, fmt/19"));
+ }
+
+
+*/
+
+    /**
+     * Builds a successful (exit value 0) process result whose stdout is the content
+     * of the given json fixture under /json/ on the test classpath.
+     *
+     * @param jsonFileName file name inside /json/
+     * @return process result carrying the fixture as stdout
+     * @throws IOException if the fixture cannot be read
+     */
+    private FileProcessResult load(String jsonFileName) throws IOException {
+        String jsonString;
+        //close the classpath stream once the fixture has been read
+        try (java.io.InputStream is = getClass().getResourceAsStream("/json/" + jsonFileName)) {
+            jsonString = IOUtils.toString(is, StandardCharsets.UTF_8);
+        }
+        FileProcessResult r = new FileProcessResult();
+        r.setStdout(jsonString);
+        r.setExitValue(0);
+        return r;
+    }
+}
diff --git
a/tika-detectors/tika-detector-magika/src/test/resources/configs/tika-config.xml
b/tika-detectors/tika-detector-magika/src/test/resources/configs/tika-config.xml
new file mode 100644
index 000000000..294f25290
--- /dev/null
+++
b/tika-detectors/tika-detector-magika/src/test/resources/configs/tika-config.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <detectors>
+ <detector class="org.apache.tika.detect.DefaultDetector"/>
+ <detector class="org.apache.tika.detect.magika.MagikaDetector">
+ <params>
+        <!-- magika is expected on the PATH; replace with the full path to the
+             commandline if it is installed elsewhere -->
+        <param name="magikaPath" type="string">magika</param>
+ <param name="useMime" type="bool">true</param>
+ </params>
+ </detector>
+ </detectors>
+</properties>
diff --git
a/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic-0.5.1.json
b/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic-0.5.1.json
new file mode 100644
index 000000000..e8aa5366e
--- /dev/null
+++
b/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic-0.5.1.json
@@ -0,0 +1,21 @@
+[
+ {
+ "path": "...im2txtapi.py",
+ "dl": {
+ "ct_label": "python",
+ "score": 0.9999871253967285,
+ "group": "code",
+ "mime_type": "text/x-python",
+ "magic": "Python script",
+ "description": "Python source"
+ },
+ "output": {
+ "ct_label": "python",
+ "score": 0.9999871253967285,
+ "group": "code",
+ "mime_type": "text/x-python",
+ "magic": "Python script",
+ "description": "Python source"
+ }
+ }
+]
diff --git
a/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic.json
b/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic.json
new file mode 100644
index 000000000..a96bbf8ae
--- /dev/null
+++
b/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic.json
@@ -0,0 +1,33 @@
+[
+ {
+ "path": "./tests_data/basic/python/code.py",
+ "result": {
+ "status": "ok",
+ "value": {
+ "dl": {
+ "description": "Python source",
+ "extensions": [
+ "py",
+ "pyi"
+ ],
+ "group": "code",
+ "is_text": true,
+ "label": "python",
+ "mime_type": "text/x-python"
+ },
+ "output": {
+ "description": "Python source",
+ "extensions": [
+ "py",
+ "pyi"
+ ],
+ "group": "code",
+ "is_text": true,
+ "label": "python",
+ "mime_type": "text/x-python"
+ },
+ "score": 0.753000020980835
+ }
+ }
+ }
+]
\ No newline at end of file
diff --git
a/tika-detectors/tika-detector-magika/src/test/resources/test-documents/testPDF.pdf
b/tika-detectors/tika-detector-magika/src/test/resources/test-documents/testPDF.pdf
new file mode 100644
index 000000000..1f1bcff6f
Binary files /dev/null and
b/tika-detectors/tika-detector-magika/src/test/resources/test-documents/testPDF.pdf
differ