NIFI-296: Use only tika-core to keep jar footprint down

Project: http://git-wip-us.apache.org/repos/asf/incubator-nifi/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-nifi/commit/089eec2e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-nifi/tree/089eec2e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-nifi/diff/089eec2e

Branch: refs/heads/develop
Commit: 089eec2e0c03d0d3546c36fba716c71f48f66368
Parents: b418b89
Author: Adam Lamar <adamond...@gmail.com>
Authored: Tue Feb 24 20:26:13 2015 +0000
Committer: Adam Lamar <adamond...@gmail.com>
Committed: Sun Mar 1 19:58:46 2015 +0000

----------------------------------------------------------------------
 .../nifi-standard-processors/pom.xml            |   6 --
 .../processors/standard/IdentifyMimeType.java   |  86 ++++---------------
 .../org/apache/tika/mime/custom-mimetypes.xml   |  83 ++++++++++++++++++
 .../standard/TestIdentifyMimeType.java          |  24 ++++++
 .../resources/TestIdentifyMimeType/1.tar.gz     | Bin 0 -> 154 bytes
 5 files changed, 123 insertions(+), 76 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-nifi/blob/089eec2e/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml
----------------------------------------------------------------------
diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml
index b941d03..e31f0fa 100644
--- a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml
+++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml
@@ -159,12 +159,6 @@
             <groupId>org.apache.tika</groupId>
             <artifactId>tika-core</artifactId>
             <version>1.7</version>
-            <type>pom</type>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.tika</groupId>
-            <artifactId>tika-parsers</artifactId>
-            <version>1.7</version>
         </dependency>
     </dependencies>
 </project>

http://git-wip-us.apache.org/repos/asf/incubator-nifi/blob/089eec2e/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
----------------------------------------------------------------------
diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
index fd3d4ae..68880e6 100644
--- a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
+++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
@@ -19,14 +19,9 @@ package org.apache.nifi.processors.standard;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
-import java.util.List;
 import java.util.Set;
-import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
-import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 
 import org.apache.nifi.flowfile.FlowFile;
 import org.apache.nifi.flowfile.attributes.CoreAttributes;
@@ -42,15 +37,9 @@ import org.apache.nifi.annotation.behavior.SideEffectFree;
 import org.apache.nifi.annotation.behavior.SupportsBatching;
 import org.apache.nifi.annotation.documentation.Tags;
 import org.apache.nifi.processor.io.InputStreamCallback;
-import org.apache.nifi.stream.io.StreamUtils;
-import org.apache.nifi.util.FlowFilePackagerV1;
-import org.apache.nifi.util.FlowFilePackagerV3;
 import org.apache.nifi.util.ObjectHolder;
 import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.CompositeDetector;
-import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
-import org.apache.tika.detect.MagicDetector;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -97,16 +86,7 @@ public class IdentifyMimeType extends AbstractProcessor {
     public IdentifyMimeType() {
         // Setup Tika
         this.config = TikaConfig.getDefaultConfig();
-        DefaultDetector ddetector = new DefaultDetector();
-
-        // Create list of detectors, preferring our custom detectors first
-        List<Detector> detectors = new ArrayList<>();
-        detectors.add(getFlowFileV3Detector());
-        detectors.add(getFlowFileV1Detector());
-        detectors.addAll(ddetector.getDetectors());
-
-        CompositeDetector compositeDetector = new CompositeDetector(detectors);
-        this.detector = compositeDetector;
+        this.detector = config.getDetector();
     }
 
     @Override
@@ -129,9 +109,8 @@ public class IdentifyMimeType extends AbstractProcessor {
         }
 
         final ProcessorLog logger = getLogger();
-
         final ObjectHolder<String> mimeTypeRef = new ObjectHolder<>(null);
-        final ObjectHolder<String> extensionRef = new ObjectHolder<>(null);
+
         session.read(flowFile, new InputStreamCallback() {
             @Override
             public void process(final InputStream stream) throws IOException {
@@ -141,20 +120,25 @@ public class IdentifyMimeType extends AbstractProcessor {
                     // Get mime type
                     MediaType mediatype = detector.detect(tikaStream, metadata);
                     mimeTypeRef.set(mediatype.toString());
-                    // Get common file extension
-                    try {
-                        MimeType mimetype;
-                        mimetype = config.getMimeRepository().forName(mediatype.toString());
-                        extensionRef.set(mimetype.getExtension());
-                    } catch (MimeTypeException ex) {
-                        logger.warn("MIME type detection failed: {}", new Object[]{ex});
-                    }
                 }
             }
         });
 
         String mimeType = mimeTypeRef.get();
-        String extension = extensionRef.get();
+        String extension = "";
+        try {
+            MimeType mimetype;
+            mimetype = config.getMimeRepository().forName(mimeType);
+            extension = mimetype.getExtension();
+        } catch (MimeTypeException ex) {
+            logger.warn("MIME type extension lookup failed: {}", new Object[]{ex});
+        }
+
+        // Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563
+        if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) {
+            extension = ".gz";
+        }
+
         if (mimeType == null) {
             flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
             flowFile = session.putAttribute(flowFile, "mime.extension", "");
@@ -168,42 +152,4 @@ public class IdentifyMimeType extends AbstractProcessor {
         session.getProvenanceReporter().modifyAttributes(flowFile);
         session.transfer(flowFile, REL_SUCCESS);
     }
-
-    private Detector getFlowFileV3Detector() {
-        return new MagicDetector(FLOWFILE_V3, FlowFilePackagerV3.MAGIC_HEADER);
-    }
-
-    private Detector getFlowFileV1Detector() {
-        return new FlowFileV1Detector();
-    }
-
-    private class FlowFileV1Detector implements Detector {
-
-        @Override
-        public MediaType detect(InputStream in, Metadata mtdt) throws IOException {
-            // Sanity check the stream. This may not be a tarfile at all
-            in.mark(FlowFilePackagerV1.FILENAME_ATTRIBUTES.length());
-            byte[] bytes = new byte[FlowFilePackagerV1.FILENAME_ATTRIBUTES.length()];
-            StreamUtils.fillBuffer(in, bytes, false);
-            in.reset();
-
-            // Quick exit if the first filename is not correct
-            if (!Arrays.equals(bytes, FlowFilePackagerV1.FILENAME_ATTRIBUTES.getBytes())) {
-                return MediaType.OCTET_STREAM;
-            }
-
-            // More in-depth detection
-            final TarArchiveInputStream tarIn = new TarArchiveInputStream(in);
-            final TarArchiveEntry firstEntry = tarIn.getNextTarEntry();
-            if (firstEntry != null) {
-                if (firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) {
-                    final TarArchiveEntry secondEntry = tarIn.getNextTarEntry();
-                    if (secondEntry != null && secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) {
-                        return FLOWFILE_V1;
-                    }
-                }
-            }
-            return MediaType.OCTET_STREAM;
-        }
-    }
 }

http://git-wip-us.apache.org/repos/asf/incubator-nifi/blob/089eec2e/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml
----------------------------------------------------------------------
diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml
new file mode 100644
index 0000000..657b4b5
--- /dev/null
+++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml
@@ -0,0 +1,83 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<mime-info>
+
+  <mime-type type="application/flowfile-v1">
+    <_comment>NiFi FlowFile V1</_comment>
+    <sub-class-of type="application/x-tar"/>
+    <magic>
+      <match value="flowfile.attributes" type="string" offset="0" />
+    </magic>
+  </mime-type>
+
+  <mime-type type="application/flowfile-v3">
+    <_comment>NiFi FlowFile V3</_comment>
+    <magic>
+      <match value="NiFiFF3" type="string" offset="0" />
+    </magic>
+  </mime-type>
+
+  <mime-type type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet">
+    <_comment>Office Open XML Workbook</_comment>
+    <glob pattern="*.xlsx"/>
+    <sub-class-of type="application/x-tika-ooxml"/>
+    <magic priority="60">
+      <match value="PK\003\004" type="string" offset="0">
+        <match value="[Content_Types].xml" type="string" offset="30">
+          <match value="xl/_rels/workbook.xml.rels" type="string" offset="30:4096"/>
+        </match>
+      </match>
+    </magic>
+  </mime-type>
+
+  <mime-type type="application/vnd.openxmlformats-officedocument.wordprocessingml.document">
+    <_comment>Office Open XML Document</_comment>
+    <glob pattern="*.docx"/>
+    <sub-class-of type="application/x-tika-ooxml"/>
+    <magic priority="60">
+      <match value="PK\003\004" type="string" offset="0">
+        <match value="[Content_Types].xml" type="string" offset="30">
+          <match value="word/_rels/document.xml.rels" type="string" offset="30:4096"/>
+        </match>
+      </match>
+    </magic>
+  </mime-type>
+
+  <mime-type type="application/vnd.openxmlformats-officedocument.presentationml.presentation">
+    <_comment>Office Open XML Presentation</_comment>
+    <glob pattern="*.pptx"/>
+    <glob pattern="*.thmx"/>
+    <sub-class-of type="application/x-tika-ooxml"/>
+    <magic priority="60">
+      <match value="PK\003\004" type="string" offset="0">
+        <match value="[Content_Types].xml" type="string" offset="30">
+          <match value="ppt/slides/_rels/slide" type="string" offset="30:4096"/>
+        </match>
+      </match>
+    </magic>
+  </mime-type>
+
+  <mime-type type="application/java-archive">
+    <_comment>Java Archive</_comment>
+    <tika:link>http://en.wikipedia.org/wiki/.jar</tika:link>
+    <tika:uti>com.sun.java-archive</tika:uti>
+    <sub-class-of type="application/zip"/>
+    <glob pattern="*.jar"/>
+    <magic priority="50">
+      <match value="PK\003\004" type="string" offset="0">
+        <match value="META-INF/MANIFEST.MF" type="string" offset="0:1024"/>
+      </match>
+    </magic>
+  </mime-type>
+
+  <!-- Override tika's default behavior for GNU tar detection because nobody calls
+       a GNU tar a .gtar -->
+  <mime-type type="application/x-tar">
+    <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
+    <magic priority="60">
+      <!-- GNU tar archive -->
+      <match value="ustar  \0" type="string" offset="257" />
+    </magic>
+    <glob pattern="*.tar"/>
+  </mime-type>
+
+</mime-info>

http://git-wip-us.apache.org/repos/asf/incubator-nifi/blob/089eec2e/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java
----------------------------------------------------------------------
diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java
index 40b03b4..1bf4585 100644
--- a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java
+++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java
@@ -67,17 +67,41 @@ public class TestIdentifyMimeType {
         expectedMimeTypes.put("1.pdf", "application/pdf");
         expectedMimeTypes.put("grid.gif", "image/gif");
         expectedMimeTypes.put("1.tar", "application/x-tar");
+        expectedMimeTypes.put("1.tar.gz", "application/gzip");
         expectedMimeTypes.put("1.jar", "application/java-archive");
         expectedMimeTypes.put("1.xml", "application/xml");
         expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
         expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1");
 
+        final Map<String, String> expectedExtensions = new HashMap<>();
+        expectedExtensions.put("1.7z", ".7z");
+        expectedExtensions.put("1.mdb", ".mdb");
+        expectedExtensions.put("1.txt", ".txt");
+        expectedExtensions.put("1.txt.bz2", ".bz2");
+        expectedExtensions.put("1.txt.gz", ".gz");
+        expectedExtensions.put("1.zip", ".zip");
+        expectedExtensions.put("bgBannerFoot.png", ".png");
+        expectedExtensions.put("blueBtnBg.jpg", ".jpg");
+        expectedExtensions.put("1.pdf", ".pdf");
+        expectedExtensions.put("grid.gif", ".gif");
+        expectedExtensions.put("1.tar", ".tar");
+        expectedExtensions.put("1.tar.gz", ".gz");
+        expectedExtensions.put("1.jar", ".jar");
+        expectedExtensions.put("1.xml", ".xml");
+        expectedExtensions.put("flowfilev3", "");
+        expectedExtensions.put("flowfilev1.tar", "");
+
         final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
         for (final MockFlowFile file : filesOut) {
             final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
             final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
             final String expected = expectedMimeTypes.get(filename);
+
+            final String extension = file.getAttribute("mime.extension");
+            final String expectedExtension = expectedExtensions.get(filename);
+
             assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
+            assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
         }
     }
 }

http://git-wip-us.apache.org/repos/asf/incubator-nifi/blob/089eec2e/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz
----------------------------------------------------------------------
diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz
new file mode 100755
index 0000000..481ccc1
Binary files /dev/null and b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz differ

Reply via email to