Revision: 18660
          http://sourceforge.net/p/gate/code/18660
Author:   ian_roberts
Date:     2015-05-01 15:01:14 +0000 (Fri, 01 May 2015)
Log Message:
-----------
Use commons-compress to handle more compression formats in pure Java, and take 
advantage of its compression format auto-detection logic.

Modified Paths:
--------------
    gcp/trunk/.classpath
    gcp/trunk/.settings/org.eclipse.jdt.core.prefs
    gcp/trunk/build/ivy.xml
    gcp/trunk/src/gate/cloud/io/json/JSONStreamingInputHandler.java

Modified: gcp/trunk/.classpath
===================================================================
--- gcp/trunk/.classpath        2015-05-01 01:20:25 UTC (rev 18659)
+++ gcp/trunk/.classpath        2015-05-01 15:01:14 UTC (rev 18660)
@@ -5,6 +5,6 @@
        <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
        <classpathentry kind="lib" path="conf"/>
        <classpathentry kind="lib" path="lib/mimir-client-5.0.jar"/>
-       <classpathentry kind="con" path="org.apache.ivyde.eclipse.cpcontainer.IVYDE_CONTAINER/?project=gcp&amp;ivyXmlPath=build%2Fivy.xml&amp;confs=*&amp;ivySettingsPath=%24%7Bworkspace_loc%3Agcp%2Fbuild%2Fivysettings.xml%7D&amp;loadSettingsOnDemand=false&amp;propertyFiles="/>
+       <classpathentry exported="true" kind="con" path="org.apache.ivyde.eclipse.cpcontainer.IVYDE_CONTAINER/?project=gcp&amp;ivyXmlPath=build%2Fivy.xml&amp;confs=*&amp;ivySettingsPath=%24%7Bworkspace_loc%3Agcp%2Fbuild%2Fivysettings.xml%7D&amp;loadSettingsOnDemand=false&amp;propertyFiles="/>
        <classpathentry kind="output" path="classes"/>
 </classpath>

Modified: gcp/trunk/.settings/org.eclipse.jdt.core.prefs
===================================================================
--- gcp/trunk/.settings/org.eclipse.jdt.core.prefs      2015-05-01 01:20:25 UTC (rev 18659)
+++ gcp/trunk/.settings/org.eclipse.jdt.core.prefs      2015-05-01 15:01:14 UTC (rev 18660)
@@ -1,12 +1,11 @@
-#Fri Nov 05 15:21:46 GMT 2010
 eclipse.preferences.version=1
 org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
-org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
 org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
-org.eclipse.jdt.core.compiler.compliance=1.5
+org.eclipse.jdt.core.compiler.compliance=1.7
 org.eclipse.jdt.core.compiler.debug.lineNumber=generate
 org.eclipse.jdt.core.compiler.debug.localVariable=generate
 org.eclipse.jdt.core.compiler.debug.sourceFile=generate
 org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
 org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
-org.eclipse.jdt.core.compiler.source=1.5
+org.eclipse.jdt.core.compiler.source=1.7

Modified: gcp/trunk/build/ivy.xml
===================================================================
--- gcp/trunk/build/ivy.xml     2015-05-01 01:20:25 UTC (rev 18659)
+++ gcp/trunk/build/ivy.xml     2015-05-01 15:01:14 UTC (rev 18660)
@@ -27,5 +27,8 @@
     
     <!-- JNA for PID extraction -->
     <dependency org="net.java.dev.jna" name="jna" rev="4.0.0" />
+    
+    <!-- commons-compress for unzip/bz2/xz/etc -->
+    <dependency org="org.apache.commons" name="commons-compress" rev="1.9" />
   </dependencies>
 </ivy-module>

Modified: gcp/trunk/src/gate/cloud/io/json/JSONStreamingInputHandler.java
===================================================================
--- gcp/trunk/src/gate/cloud/io/json/JSONStreamingInputHandler.java     2015-05-01 01:20:25 UTC (rev 18659)
+++ gcp/trunk/src/gate/cloud/io/json/JSONStreamingInputHandler.java     2015-05-01 15:01:14 UTC (rev 18660)
@@ -29,6 +29,7 @@
 import gate.cloud.io.StreamingInputHandler;
 import gate.util.GateException;
 
+import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -41,6 +42,9 @@
 import java.util.Set;
 import java.util.zip.GZIPInputStream;
 
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.io.IOUtils;
 import org.apache.log4j.Logger;
 
 import com.fasterxml.jackson.core.JsonParser;
@@ -63,14 +67,18 @@
  * be found will be ignored.
  * </p>
  * <p>
- * The input file may be compressed. If the "compression" option is set
- * to "gzip" then the file will be unpacked using Java's native GZIP
- * support. Any other value of "compression" will be treated as a
- * command line to a program that expects the compressed data on its
- * standard input and will produce uncompressed output on its standard
- * out. The command will be split into words at whitespace, so embedded
- * whitespace within a single word is not permitted. For example, to
- * handle JSON files compressed in LZO format use
+ * The input file may be compressed. The following values of the
+ * "compression" option can be handled natively in Java by Apache
+ * commons-compress: "gz" (or "gzip"), "bzip2", "xz", "z" (the Unix
+ * <code>compress</code> format), "pack200", "lzma", "snappy-raw",
+ * "snappy-framed", "deflate". The value "any" will attempt to
+ * auto-detect the compression format, falling back on no compression if
+ * auto-detection fails. Any other value of "compression" will be
+ * treated as a command line to a program that expects the compressed
+ * data on its standard input and will produce uncompressed output on
+ * its standard out. The command will be split into words at whitespace,
+ * so embedded whitespace within a single word is not permitted. For
+ * example, to handle JSON files compressed in LZO format use
  * <code>compression="lzop -dc"</code>.
  * </p>
  * <p>
@@ -92,7 +100,7 @@
  * <pre>
  * &lt;input class="gate.cloud.io.json.JSONStreamingInputHandler"
  *        srcFile="interactions.gz"
- *        compression="gzip"
+ *        compression="gz"
  *        mimeType="text/x-json-datasift"
  *        idPointer="/interaction/id" />
  * </pre>
@@ -215,17 +223,54 @@
     InputStream inputStream = null;
     if(compression == null) {
       inputStream = new FileInputStream(srcFile);
-    } else if(VALUE_COMPRESSION_GZIP.equals(compression)) {
-      inputStream = new GZIPInputStream(new FileInputStream(srcFile));
+    } else if("any".equals(compression)) {
+      inputStream = new BufferedInputStream(new FileInputStream(srcFile));
+      try {
+        inputStream =
+                new CompressorStreamFactory()
+                        .createCompressorInputStream(inputStream);
+      } catch(CompressorException e) {
+        if(e.getCause() != null) {
+          if(e.getCause() instanceof IOException) {
+            throw (IOException)e.getCause();
+          } else {
+            throw new GateException(e.getCause());
+          }
+        } else {
+          // unrecognised signature, assume uncompressed
+          logger.info("Failed to detect compression format, assuming no compression");
+        }
+      }
     } else {
-      // treat compression value as a command line
-      ProcessBuilder pb = new ProcessBuilder(compression.trim().split("\\s+"));
-      pb.directory(batchDir);
-      pb.redirectError(Redirect.INHERIT);
-      pb.redirectOutput(Redirect.PIPE);
-      pb.redirectInput(srcFile);
-      decompressProcess = pb.start();
-      inputStream = decompressProcess.getInputStream();
+      if(compression == VALUE_COMPRESSION_GZIP) {
+        compression = CompressorStreamFactory.GZIP;
+      }
+      inputStream = new BufferedInputStream(new FileInputStream(srcFile));
+      try {
+        inputStream =
+                new CompressorStreamFactory()
+                        .createCompressorInputStream(compression, inputStream);
+      } catch(CompressorException e) {
+        if(e.getCause() != null) {
+          if(e.getCause() instanceof IOException) {
+            throw (IOException)e.getCause();
+          } else {
+            throw new GateException(e.getCause());
+          }
+        } else {
+          // unrecognised compressor name
+          logger.info("Unrecognised compression format, assuming external compressor");
+          IOUtils.closeQuietly(inputStream);
+          // treat compression value as a command line
+          ProcessBuilder pb = new ProcessBuilder(compression.trim().split("\\s+"));
+          pb.directory(batchDir);
+          pb.redirectError(Redirect.INHERIT);
+          pb.redirectOutput(Redirect.PIPE);
+          pb.redirectInput(srcFile);
+          decompressProcess = pb.start();
+          inputStream = decompressProcess.getInputStream();
+        }
+      }
     }
 
     objectMapper = new ObjectMapper();
@@ -281,11 +326,12 @@
           Document gateDoc =
                   (Document)Factory.createResource("gate.corpora.DocumentImpl",
                           docParams, Utils.featureMap(
-                                  GateConstants.THROWEX_FORMAT_PROPERTY_NAME, Boolean.TRUE),
-                          id);
+                                  GateConstants.THROWEX_FORMAT_PROPERTY_NAME,
+                                  Boolean.TRUE), id);
           return new DocumentData(gateDoc, docId);
         } catch(Exception e) {
-          logger.warn("Error encountered while parsing object with ID " + id + " - skipped", e);
+          logger.warn("Error encountered while parsing object with ID " + id
+                  + " - skipped", e);
         }
       }
     }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
One dashboard for servers and applications across Physical-Virtual-Cloud 
Widest out-of-the-box monitoring support with 50+ applications
Performance metrics, stats and reports that give you Actionable Insights
Deep dive visibility with transaction tracing using APM Insight.
http://ad.doubleclick.net/ddm/clk/290420510;117567292;y
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to