NIFI-296: Use only tika-core to keep jar footprint down
Project: http://git-wip-us.apache.org/repos/asf/incubator-nifi/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-nifi/commit/089eec2e Tree: http://git-wip-us.apache.org/repos/asf/incubator-nifi/tree/089eec2e Diff: http://git-wip-us.apache.org/repos/asf/incubator-nifi/diff/089eec2e Branch: refs/heads/develop Commit: 089eec2e0c03d0d3546c36fba716c71f48f66368 Parents: b418b89 Author: Adam Lamar <adamond...@gmail.com> Authored: Tue Feb 24 20:26:13 2015 +0000 Committer: Adam Lamar <adamond...@gmail.com> Committed: Sun Mar 1 19:58:46 2015 +0000 ---------------------------------------------------------------------- .../nifi-standard-processors/pom.xml | 6 -- .../processors/standard/IdentifyMimeType.java | 86 ++++--------------- .../org/apache/tika/mime/custom-mimetypes.xml | 83 ++++++++++++++++++ .../standard/TestIdentifyMimeType.java | 24 ++++++ .../resources/TestIdentifyMimeType/1.tar.gz | Bin 0 -> 154 bytes 5 files changed, 123 insertions(+), 76 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-nifi/blob/089eec2e/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml ---------------------------------------------------------------------- diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml index b941d03..e31f0fa 100644 --- a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml +++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml @@ -159,12 +159,6 @@ <groupId>org.apache.tika</groupId> <artifactId>tika-core</artifactId> <version>1.7</version> - <type>pom</type> - </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parsers</artifactId> - <version>1.7</version> </dependency> </dependencies> </project> http://git-wip-us.apache.org/repos/asf/incubator-nifi/blob/089eec2e/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java ---------------------------------------------------------------------- diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java index fd3d4ae..68880e6 100644 --- a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java +++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java @@ -19,14 +19,9 @@ package org.apache.nifi.processors.standard; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.HashSet; -import java.util.List; import java.util.Set; -import org.apache.commons.compress.archivers.tar.TarArchiveEntry; -import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.flowfile.attributes.CoreAttributes; @@ -42,15 +37,9 @@ import org.apache.nifi.annotation.behavior.SideEffectFree; import org.apache.nifi.annotation.behavior.SupportsBatching; import org.apache.nifi.annotation.documentation.Tags; import org.apache.nifi.processor.io.InputStreamCallback; -import org.apache.nifi.stream.io.StreamUtils; -import org.apache.nifi.util.FlowFilePackagerV1; -import org.apache.nifi.util.FlowFilePackagerV3; import org.apache.nifi.util.ObjectHolder; import org.apache.tika.config.TikaConfig; -import org.apache.tika.detect.CompositeDetector; -import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; -import org.apache.tika.detect.MagicDetector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -97,16 +86,7 @@ public class IdentifyMimeType extends AbstractProcessor { public IdentifyMimeType() { // Setup Tika this.config = TikaConfig.getDefaultConfig(); - DefaultDetector ddetector = new DefaultDetector(); - - // Create list of detectors, preferring our custom detectors first - List<Detector> detectors = new ArrayList<>(); - detectors.add(getFlowFileV3Detector()); - detectors.add(getFlowFileV1Detector()); - detectors.addAll(ddetector.getDetectors()); - - CompositeDetector compositeDetector = new CompositeDetector(detectors); - this.detector = compositeDetector; + this.detector = config.getDetector(); } @Override @@ -129,9 +109,8 @@ public class IdentifyMimeType extends AbstractProcessor { } final ProcessorLog logger = getLogger(); - final ObjectHolder<String> mimeTypeRef = new ObjectHolder<>(null); - final ObjectHolder<String> extensionRef = new ObjectHolder<>(null); + session.read(flowFile, new InputStreamCallback() { @Override public void process(final InputStream stream) throws IOException { @@ -141,20 +120,25 @@ public class IdentifyMimeType extends AbstractProcessor { // Get mime type MediaType mediatype = detector.detect(tikaStream, metadata); mimeTypeRef.set(mediatype.toString()); - // Get common file extension - try { - MimeType mimetype; - mimetype = config.getMimeRepository().forName(mediatype.toString()); - extensionRef.set(mimetype.getExtension()); - } catch (MimeTypeException ex) { - logger.warn("MIME type detection failed: {}", new Object[]{ex}); - } } } }); String mimeType = mimeTypeRef.get(); - String extension = extensionRef.get(); + String extension = ""; + try { + MimeType mimetype; + mimetype = config.getMimeRepository().forName(mimeType); + extension = mimetype.getExtension(); + } catch (MimeTypeException ex) { + logger.warn("MIME type extension lookup failed: {}", new Object[]{ex}); + } + + // Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563 + if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) { + extension = ".gz"; + } + if (mimeType == null) { flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream"); flowFile = session.putAttribute(flowFile, "mime.extension", ""); @@ -168,42 +152,4 @@ public class IdentifyMimeType extends AbstractProcessor { session.getProvenanceReporter().modifyAttributes(flowFile); session.transfer(flowFile, REL_SUCCESS); } - - private Detector getFlowFileV3Detector() { - return new MagicDetector(FLOWFILE_V3, FlowFilePackagerV3.MAGIC_HEADER); - } - - private Detector getFlowFileV1Detector() { - return new FlowFileV1Detector(); - } - - private class FlowFileV1Detector implements Detector { - - @Override - public MediaType detect(InputStream in, Metadata mtdt) throws IOException { - // Sanity check the stream. This may not be a tarfile at all - in.mark(FlowFilePackagerV1.FILENAME_ATTRIBUTES.length()); - byte[] bytes = new byte[FlowFilePackagerV1.FILENAME_ATTRIBUTES.length()]; - StreamUtils.fillBuffer(in, bytes, false); - in.reset(); - - // Quick exit if the first filename is not correct - if (!Arrays.equals(bytes, FlowFilePackagerV1.FILENAME_ATTRIBUTES.getBytes())) { - return MediaType.OCTET_STREAM; - } - - // More in-depth detection - final TarArchiveInputStream tarIn = new TarArchiveInputStream(in); - final TarArchiveEntry firstEntry = tarIn.getNextTarEntry(); - if (firstEntry != null) { - if (firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) { - final TarArchiveEntry secondEntry = tarIn.getNextTarEntry(); - if (secondEntry != null && secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) { - return FLOWFILE_V1; - } - } - } - return MediaType.OCTET_STREAM; - } - } } http://git-wip-us.apache.org/repos/asf/incubator-nifi/blob/089eec2e/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml ---------------------------------------------------------------------- diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml new file mode 100644 index 0000000..657b4b5 --- /dev/null +++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml @@ -0,0 +1,83 @@ +<?xml version="1.0" encoding="UTF-8"?> +<mime-info> + + <mime-type type="application/flowfile-v1"> + <_comment>NiFi FlowFile V1</_comment> + <sub-class-of type="application/x-tar"/> + <magic> + <match value="flowfile.attributes" type="string" offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/flowfile-v3"> + <_comment>NiFi FlowFile V3</_comment> + <magic> + <match value="NiFiFF3" type="string" offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"> + <_comment>Office Open XML Workbook</_comment> + <glob pattern="*.xlsx"/> + <sub-class-of type="application/x-tika-ooxml"/> + <magic priority="60"> + <match value="PK\003\004" type="string" offset="0"> + <match value="[Content_Types].xml" type="string" offset="30"> + <match value="xl/_rels/workbook.xml.rels" type="string" offset="30:4096"/> + </match> + </match> + </magic> + </mime-type> + + <mime-type type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"> + <_comment>Office Open XML Document</_comment> + <glob pattern="*.docx"/> + <sub-class-of type="application/x-tika-ooxml"/> + <magic priority="60"> + <match value="PK\003\004" type="string" offset="0"> + <match value="[Content_Types].xml" type="string" offset="30"> + <match value="word/_rels/document.xml.rels" type="string" offset="30:4096"/> + </match> + </match> + </magic> + </mime-type> + + <mime-type type="application/vnd.openxmlformats-officedocument.presentationml.presentation"> + <_comment>Office Open XML Presentation</_comment> + <glob pattern="*.pptx"/> + <glob pattern="*.thmx"/> + <sub-class-of type="application/x-tika-ooxml"/> + <magic priority="60"> + <match value="PK\003\004" type="string" offset="0"> + <match value="[Content_Types].xml" type="string" offset="30"> + <match value="ppt/slides/_rels/slide" type="string" offset="30:4096"/> + </match> + </match> + </magic> + </mime-type> + + <mime-type type="application/java-archive"> + <_comment>Java Archive</_comment> + <tika:link>http://en.wikipedia.org/wiki/.jar</tika:link> + <tika:uti>com.sun.java-archive</tika:uti> + <sub-class-of type="application/zip"/> + <glob pattern="*.jar"/> + <magic priority="50"> + <match value="PK\003\004" type="string" offset="0"> + <match value="META-INF/MANIFEST.MF" type="string" offset="0:1024"/> + </match> + </magic> + </mime-type> + + <!-- Override tika's default behavior for GNU tar detection because nobody calls + a GNU tar a .gtar --> + <mime-type type="application/x-tar"> + <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment> + <magic priority="60"> + <!-- GNU tar archive --> + <match value="ustar \0" type="string" offset="257" /> + </magic> + <glob pattern="*.tar"/> + </mime-type> + +</mime-info> http://git-wip-us.apache.org/repos/asf/incubator-nifi/blob/089eec2e/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java ---------------------------------------------------------------------- diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java index 40b03b4..1bf4585 100644 --- a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java +++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java @@ -67,17 +67,41 @@ public class TestIdentifyMimeType { expectedMimeTypes.put("1.pdf", "application/pdf"); expectedMimeTypes.put("grid.gif", "image/gif"); expectedMimeTypes.put("1.tar", "application/x-tar"); + expectedMimeTypes.put("1.tar.gz", "application/gzip"); expectedMimeTypes.put("1.jar", "application/java-archive"); expectedMimeTypes.put("1.xml", "application/xml"); expectedMimeTypes.put("flowfilev3", "application/flowfile-v3"); expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1"); + final Map<String, String> expectedExtensions = new HashMap<>(); + expectedExtensions.put("1.7z", ".7z"); + expectedExtensions.put("1.mdb", ".mdb"); + expectedExtensions.put("1.txt", ".txt"); + expectedExtensions.put("1.txt.bz2", ".bz2"); + expectedExtensions.put("1.txt.gz", ".gz"); + expectedExtensions.put("1.zip", ".zip"); + expectedExtensions.put("bgBannerFoot.png", ".png"); + expectedExtensions.put("blueBtnBg.jpg", ".jpg"); + expectedExtensions.put("1.pdf", ".pdf"); + expectedExtensions.put("grid.gif", ".gif"); + expectedExtensions.put("1.tar", ".tar"); + expectedExtensions.put("1.tar.gz", ".gz"); + expectedExtensions.put("1.jar", ".jar"); + expectedExtensions.put("1.xml", ".xml"); + expectedExtensions.put("flowfilev3", ""); + expectedExtensions.put("flowfilev1.tar", ""); + final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS); for (final MockFlowFile file : filesOut) { final String filename = file.getAttribute(CoreAttributes.FILENAME.key()); final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key()); final String expected = expectedMimeTypes.get(filename); + + final String extension = file.getAttribute("mime.extension"); + final String expectedExtension = expectedExtensions.get(filename); + assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType); + assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension); } } } http://git-wip-us.apache.org/repos/asf/incubator-nifi/blob/089eec2e/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz ---------------------------------------------------------------------- diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz new file mode 100755 index 0000000..481ccc1 Binary files /dev/null and b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz differ