Author: bodewig Date: Wed Nov 9 15:55:10 2011 New Revision: 1199812 URL: http://svn.apache.org/viewvc?rev=1199812&view=rev Log: support for concatenated streams in Gzip input. COMPRESS-154. Submitted by Lasse Collin
Modified: commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/gzip/GzipCompressorInputStream.java Modified: commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/gzip/GzipCompressorInputStream.java URL: http://svn.apache.org/viewvc/commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/gzip/GzipCompressorInputStream.java?rev=1199812&r1=1199811&r2=1199812&view=diff ============================================================================== --- commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/gzip/GzipCompressorInputStream.java (original) +++ commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/gzip/GzipCompressorInputStream.java Wed Nov 9 15:55:10 2011 @@ -19,71 +19,282 @@ package org.apache.commons.compress.compressors.gzip; import java.io.IOException; +import java.io.EOFException; import java.io.InputStream; -import java.util.zip.GZIPInputStream; +import java.io.DataInputStream; +import java.io.BufferedInputStream; +import java.util.zip.DataFormatException; +import java.util.zip.Inflater; +import java.util.zip.CRC32; import org.apache.commons.compress.compressors.CompressorInputStream; /** - * Implements the "gz" compression format as an input stream. - * This classes wraps the standard java classes for working with gz. + * Input stream that decompresses .gz files. + * This supports decompressing concatenated .gz files which is important + * when decompressing standalone .gz files. + * <p> + * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz + * files: it stops after the first member and silently ignores the rest. + * It doesn't leave the read position to point to the beginning of the next + * member, which makes it difficult to work around the lack of concatenation + * support. 
+ * <p> + * Instead of using <code>GZIPInputStream</code>, this class has its own .gz + * container format decoder. The actual decompression is done with + * {@link java.util.zip.Inflater}. */ public class GzipCompressorInputStream extends CompressorInputStream { - /* reference to the compressed stream */ - private final GZIPInputStream in; + // Header flags + // private static final int FTEXT = 0x01; // Uninteresting for us + private static final int FHCRC = 0x02; + private static final int FEXTRA = 0x04; + private static final int FNAME = 0x08; + private static final int FCOMMENT = 0x10; + private static final int FRESERVED = 0xE0; + + // Compressed input stream, possibly wrapped in a BufferedInputStream + private final InputStream in; + + // True if decompressing multimember streams. + private final boolean decompressConcatenated; + + // Buffer to hold the input data + private final byte[] buf = new byte[8192]; + + // Amount of data in buf. + private int bufUsed = 0; + + // Decompressor + private Inflater inf = new Inflater(true); + + // CRC32 from uncompressed data + private CRC32 crc = new CRC32(); + + private int memberSize; + + // True once everything has been decompressed + private boolean endReached = false; /** - * Constructs a new GZip compressed input stream by the referenced - * InputStream. - * - * @param inputStream the InputStream from which this object should be created of + * Constructs a new input stream that decompresses gzip-compressed data + * from the specified input stream. + * <p> + * This is equivalent to + * <code>GzipCompressorInputStream(inputStream, false)</code> and thus + * will not decompress concatenated .gz files. 
+ * + * @param inputStream the InputStream from which this object should + * be created + * * @throws IOException if the stream could not be created */ - public GzipCompressorInputStream(InputStream inputStream) throws IOException { - in = new GZIPInputStream(inputStream); + public GzipCompressorInputStream(InputStream inputStream) + throws IOException { + this(inputStream, false); + } + + /** + * Constructs a new input stream that decompresses gzip-compressed data + * from the specified input stream. + * <p> + * If <code>decompressConcatenated</code> is <code>false</code>: + * This decompressor might read more input than it will actually use. + * If <code>inputStream</code> supports <code>mark</code> and + * <code>reset</code>, then the input position will be adjusted + * so that it is right after the last byte of the compressed stream. + * If <code>mark</code> isn't supported, the input position will be + * undefined. + * + * @param inputStream the InputStream from which this object should + * be created + * @param decompressConcatenated + * if true, decompress until the end of the input; + * if false, stop after the first .gz member + * + * @throws IOException if the stream could not be created + */ + public GzipCompressorInputStream(InputStream inputStream, + boolean decompressConcatenated) + throws IOException { + // Mark support is strictly needed for concatenated files only, + // but it's simpler if it is always available. + if (inputStream.markSupported()) + in = inputStream; + else + in = new BufferedInputStream(inputStream); + + this.decompressConcatenated = decompressConcatenated; + init(true); + } + + private boolean init(boolean isFirstMember) throws IOException { + assert isFirstMember || decompressConcatenated; + + // Check the magic bytes without a possibility of EOFException. 
+ int magic0 = in.read(); + int magic1 = in.read(); + + // If end of input was reached after decompressing at least + // one .gz member, we have reached the end of the file successfully. + if (magic0 == -1 && !isFirstMember) + return false; + + if (magic0 != 31 || magic1 != 139) + throw new IOException(isFirstMember + ? "Input is not in the .gz format" + : "Garbage after a valid .gz stream"); + + // Parsing the rest of the header may throw EOFException. + DataInputStream inData = new DataInputStream(in); + int method = inData.readUnsignedByte(); + if (method != 8) + throw new IOException("Unsupported compression method " + + method + " in the .gz header"); + + int flg = inData.readUnsignedByte(); + if ((flg & FRESERVED) != 0) + throw new IOException( + "Reserved flags are set in the .gz header"); + + inData.readInt(); // mtime, ignored + inData.readUnsignedByte(); // extra flags, ignored + inData.readUnsignedByte(); // operating system, ignored + + // Extra field, ignored + if ((flg & FEXTRA) != 0) { + int xlen = inData.readUnsignedByte(); + xlen |= inData.readUnsignedByte() << 8; + + // This isn't as efficient as calling in.skip would be, + // but it's lazier to handle unexpected end of input this way. + // Most files don't have an extra field anyway. + while (xlen-- > 0) + inData.readUnsignedByte(); + } + + // Original file name, ignored + if ((flg & FNAME) != 0) + while (inData.readUnsignedByte() != 0x00) ; + + // Comment, ignored + if ((flg & FCOMMENT) != 0) + while (inData.readUnsignedByte() != 0x00) ; + + // Header "CRC16" which is actually a truncated CRC32 (which isn't + // as good as real CRC16). I don't know if any encoder implementation + // sets this, so it's not worth trying to verify it. GNU gzip 1.4 + // doesn't support this field, but zlib seems to be able to at least + // skip over it. 
+ if ((flg & FHCRC) != 0) + inData.readShort(); + + // Reset + inf.reset(); + crc.reset(); + memberSize = 0; + + return true; } /** {@inheritDoc} */ - @Override public int read() throws IOException { - int read = in.read(); - this.count(read < 0 ? -1 : 1); - return read; + byte[] buf = new byte[1]; + return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF); } /** * {@inheritDoc} - * + * * @since Apache Commons Compress 1.1 */ - @Override - public int read(byte[] b) throws IOException { - int read = in.read(b); - this.count(read); - return read; - } + public int read(byte[] b, int off, int len) throws IOException { + if (endReached) + return -1; - /** - * {@inheritDoc} - * - * @since Apache Commons Compress 1.1 - */ - @Override - public int read(byte[] b, int from, int length) throws IOException { - int read = in.read(b, from, length); - this.count(read); - return read; + int size = 0; + + while (len > 0) { + if (inf.needsInput()) { + // Remember the current position because we may need to + // rewind after reading too much input. + in.mark(buf.length); + + bufUsed = in.read(buf); + if (bufUsed == -1) + throw new EOFException(); + + inf.setInput(buf, 0, bufUsed); + } + + int ret; + try { + ret = inf.inflate(b, off, len); + } catch (DataFormatException e) { + throw new IOException("Gzip-compressed data is corrupt"); + } + + crc.update(b, off, ret); + memberSize += ret; + off += ret; + len -= ret; + size += ret; + count(ret); + + if (inf.finished()) { + // We may have read too many bytes. Rewind the read + // position to match the actual amount used. + // + // NOTE: The "if" is there just in case. Since we used + // in.mark earlier, it should always skip enough. 
+ in.reset(); + + int skipAmount = bufUsed - inf.getRemaining(); + if (in.skip(skipAmount) != skipAmount) + throw new IOException(); + + bufUsed = 0; + + DataInputStream inData = new DataInputStream(in); + + // CRC32 + long crcStored = 0; + for (int i = 0; i < 4; ++i) + crcStored |= (long)inData.readUnsignedByte() << (i * 8); + + if (crcStored != crc.getValue()) + throw new IOException("Gzip-compressed data is corrupt " + + "(CRC32 error)"); + + // Uncompressed size modulo 2^32 (ISIZE in the spec) + int isize = 0; + for (int i = 0; i < 4; ++i) + isize |= inData.readUnsignedByte() << (i * 8); + + if (isize != memberSize) + throw new IOException("Gzip-compressed data is corrupt" + + "(uncompressed size mismatch)"); + + // See if this is the end of the file. + if (!decompressConcatenated || !init(false)) { + inf.end(); + inf = null; + endReached = true; + return size == 0 ? -1 : size; + } + } + } + + return size; } /** - * Checks if the signature matches what is expected for a gzip file. - * - * @param signature - * the bytes to check - * @param length - * the number of bytes to check - * @return true, if this stream is a gzipped compressed stream, false otherwise - * + * Checks if the signature matches what is expected for a .gz file. + * + * @param signature the bytes to check + * @param length the number of bytes to check + * @return true if this is a .gz stream, false otherwise + * * @since Apache Commons Compress 1.1 */ public static boolean matches(byte[] signature, int length) { @@ -91,7 +302,7 @@ public class GzipCompressorInputStream e if (length < 2) { return false; } - + if (signature[0] != 31) { return false; } @@ -99,17 +310,21 @@ public class GzipCompressorInputStream e if (signature[1] != -117) { return false; } - + return true; } - + /** * Closes the input stream (unless it is System.in). 
- * + * * @since 1.2 */ - @Override public void close() throws IOException { + if (inf != null) { + inf.end(); + inf = null; + } + if (this.in != System.in) { this.in.close(); }