Revision: 18660
http://sourceforge.net/p/gate/code/18660
Author: ian_roberts
Date: 2015-05-01 15:01:14 +0000 (Fri, 01 May 2015)
Log Message:
-----------
Use commons-compress to handle more compression formats in pure Java, and take
advantage of its compression format auto-detection logic.
Modified Paths:
--------------
gcp/trunk/.classpath
gcp/trunk/.settings/org.eclipse.jdt.core.prefs
gcp/trunk/build/ivy.xml
gcp/trunk/src/gate/cloud/io/json/JSONStreamingInputHandler.java
Modified: gcp/trunk/.classpath
===================================================================
--- gcp/trunk/.classpath 2015-05-01 01:20:25 UTC (rev 18659)
+++ gcp/trunk/.classpath 2015-05-01 15:01:14 UTC (rev 18660)
@@ -5,6 +5,6 @@
<classpathentry kind="con"
path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
<classpathentry kind="lib" path="conf"/>
<classpathentry kind="lib" path="lib/mimir-client-5.0.jar"/>
- <classpathentry kind="con"
path="org.apache.ivyde.eclipse.cpcontainer.IVYDE_CONTAINER/?project=gcp&ivyXmlPath=build%2Fivy.xml&confs=*&ivySettingsPath=%24%7Bworkspace_loc%3Agcp%2Fbuild%2Fivysettings.xml%7D&loadSettingsOnDemand=false&propertyFiles="/>
+ <classpathentry exported="true" kind="con"
path="org.apache.ivyde.eclipse.cpcontainer.IVYDE_CONTAINER/?project=gcp&ivyXmlPath=build%2Fivy.xml&confs=*&ivySettingsPath=%24%7Bworkspace_loc%3Agcp%2Fbuild%2Fivysettings.xml%7D&loadSettingsOnDemand=false&propertyFiles="/>
<classpathentry kind="output" path="classes"/>
</classpath>
Modified: gcp/trunk/.settings/org.eclipse.jdt.core.prefs
===================================================================
--- gcp/trunk/.settings/org.eclipse.jdt.core.prefs 2015-05-01 01:20:25 UTC (rev 18659)
+++ gcp/trunk/.settings/org.eclipse.jdt.core.prefs 2015-05-01 15:01:14 UTC (rev 18660)
@@ -1,12 +1,11 @@
-#Fri Nov 05 15:21:46 GMT 2010
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
-org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
-org.eclipse.jdt.core.compiler.compliance=1.5
+org.eclipse.jdt.core.compiler.compliance=1.7
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
org.eclipse.jdt.core.compiler.debug.localVariable=generate
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
-org.eclipse.jdt.core.compiler.source=1.5
+org.eclipse.jdt.core.compiler.source=1.7
Modified: gcp/trunk/build/ivy.xml
===================================================================
--- gcp/trunk/build/ivy.xml 2015-05-01 01:20:25 UTC (rev 18659)
+++ gcp/trunk/build/ivy.xml 2015-05-01 15:01:14 UTC (rev 18660)
@@ -27,5 +27,8 @@
<!-- JNA for PID extraction -->
<dependency org="net.java.dev.jna" name="jna" rev="4.0.0" />
+
+ <!-- commons-compress for unzip/bz2/xz/etc -->
+ <dependency org="org.apache.commons" name="commons-compress" rev="1.9" />
</dependencies>
</ivy-module>
Modified: gcp/trunk/src/gate/cloud/io/json/JSONStreamingInputHandler.java
===================================================================
--- gcp/trunk/src/gate/cloud/io/json/JSONStreamingInputHandler.java 2015-05-01 01:20:25 UTC (rev 18659)
+++ gcp/trunk/src/gate/cloud/io/json/JSONStreamingInputHandler.java 2015-05-01 15:01:14 UTC (rev 18660)
@@ -29,6 +29,7 @@
import gate.cloud.io.StreamingInputHandler;
import gate.util.GateException;
+import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
@@ -41,6 +42,9 @@
import java.util.Set;
import java.util.zip.GZIPInputStream;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import com.fasterxml.jackson.core.JsonParser;
@@ -63,14 +67,18 @@
* be found will be ignored.
* </p>
* <p>
- * The input file may be compressed. If the "compression" option is set
- * to "gzip" then the file will be unpacked using Java's native GZIP
- * support. Any other value of "compression" will be treated as a
- * command line to a program that expects the compressed data on its
- * standard input and will produce uncompressed output on its standard
- * out. The command will be split into words at whitespace, so embedded
- * whitespace within a single word is not permitted. For example, to
- * handle JSON files compressed in LZO format use
+ * The input file may be compressed. The following values of the
+ * "compression" option can be handled natively in Java by Apache
+ * commons-compress: "gz" (or "gzip"), "bzip2", "xz", "z" (the Unix
+ * <code>compress</code> format), "pack200", "lzma", "snappy-raw",
+ * "snappy-framed", "deflate". The value "any" will attempt to
+ * auto-detect the compression format, falling back on no compression if
+ * auto-detection fails. Any other value of "compression" will be
+ * treated as a command line to a program that expects the compressed
+ * data on its standard input and will produce uncompressed output on
+ * its standard out. The command will be split into words at whitespace,
+ * so embedded whitespace within a single word is not permitted. For
+ * example, to handle JSON files compressed in LZO format use
* <code>compression="lzop -dc"</code>.
* </p>
* <p>
@@ -92,7 +100,7 @@
* <pre>
* <input class="gate.cloud.io.json.JSONStreamingInputHandler"
* srcFile="interactions.gz"
- * compression="gzip"
+ * compression="gz"
* mimeType="text/x-json-datasift"
* idPointer="/interaction/id" />
* </pre>
@@ -215,17 +223,54 @@
InputStream inputStream = null;
if(compression == null) {
inputStream = new FileInputStream(srcFile);
- } else if(VALUE_COMPRESSION_GZIP.equals(compression)) {
- inputStream = new GZIPInputStream(new FileInputStream(srcFile));
+ } else if("any".equals(compression)) {
+ inputStream = new BufferedInputStream(new FileInputStream(srcFile));
+ try {
+ inputStream =
+ new CompressorStreamFactory()
+ .createCompressorInputStream(inputStream);
+ } catch(CompressorException e) {
+ if(e.getCause() != null) {
+ if(e.getCause() instanceof IOException) {
+ throw (IOException)e.getCause();
+ } else {
+ throw new GateException(e.getCause());
+ }
+ } else {
+ // unrecognised signature, assume uncompressed
+ logger.info("Failed to detect compression format, assuming no compression");
+ }
+ }
} else {
- // treat compression value as a command line
- ProcessBuilder pb = new ProcessBuilder(compression.trim().split("\\s+"));
- pb.directory(batchDir);
- pb.redirectError(Redirect.INHERIT);
- pb.redirectOutput(Redirect.PIPE);
- pb.redirectInput(srcFile);
- decompressProcess = pb.start();
- inputStream = decompressProcess.getInputStream();
+ if(compression == VALUE_COMPRESSION_GZIP) {
+ compression = CompressorStreamFactory.GZIP;
+ }
+ inputStream = new BufferedInputStream(new FileInputStream(srcFile));
+ try {
+ inputStream =
+ new CompressorStreamFactory()
+ .createCompressorInputStream(compression, inputStream);
+ } catch(CompressorException e) {
+ if(e.getCause() != null) {
+ if(e.getCause() instanceof IOException) {
+ throw (IOException)e.getCause();
+ } else {
+ throw new GateException(e.getCause());
+ }
+ } else {
+ // unrecognised compressor name
+ logger.info("Unrecognised compression format, assuming external compressor");
+ IOUtils.closeQuietly(inputStream);
+ // treat compression value as a command line
+ ProcessBuilder pb = new ProcessBuilder(compression.trim().split("\\s+"));
+ pb.directory(batchDir);
+ pb.redirectError(Redirect.INHERIT);
+ pb.redirectOutput(Redirect.PIPE);
+ pb.redirectInput(srcFile);
+ decompressProcess = pb.start();
+ inputStream = decompressProcess.getInputStream();
+ }
+ }
}
objectMapper = new ObjectMapper();
@@ -281,11 +326,12 @@
Document gateDoc =
(Document)Factory.createResource("gate.corpora.DocumentImpl",
docParams, Utils.featureMap(
- GateConstants.THROWEX_FORMAT_PROPERTY_NAME,
Boolean.TRUE),
- id);
+ GateConstants.THROWEX_FORMAT_PROPERTY_NAME,
+ Boolean.TRUE), id);
return new DocumentData(gateDoc, docId);
} catch(Exception e) {
- logger.warn("Error encountered while parsing object with ID " + id + " - skipped", e);
+ logger.warn("Error encountered while parsing object with ID " + id
+ + " - skipped", e);
}
}
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
One dashboard for servers and applications across Physical-Virtual-Cloud
Widest out-of-the-box monitoring support with 50+ applications
Performance metrics, stats and reports that give you Actionable Insights
Deep dive visibility with transaction tracing using APM Insight.
http://ad.doubleclick.net/ddm/clk/290420510;117567292;y
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs