COMPRESS-271 read-support for LZ4 block format
Project: http://git-wip-us.apache.org/repos/asf/commons-compress/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-compress/commit/56e82da9 Tree: http://git-wip-us.apache.org/repos/asf/commons-compress/tree/56e82da9 Diff: http://git-wip-us.apache.org/repos/asf/commons-compress/diff/56e82da9 Branch: refs/heads/master Commit: 56e82da90f1064c23dd630cf0066231567da3ed6 Parents: 6871295 Author: Stefan Bodewig <[email protected]> Authored: Mon Jan 16 19:56:02 2017 +0100 Committer: Stefan Bodewig <[email protected]> Committed: Mon Jan 16 19:56:02 2017 +0100 ---------------------------------------------------------------------- .../compressors/CompressorStreamFactory.java | 18 +- .../lz4/BlockLZ4CompressorInputStream.java | 295 +++++++++++++++++++ .../compress/compressors/lz4/package.html | 37 +++ src/site/xdoc/limitations.xml | 8 + .../lz4/BlockLZ4CompressorInputStreamTest.java | 53 ++++ 5 files changed, 409 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-compress/blob/56e82da9/src/main/java/org/apache/commons/compress/compressors/CompressorStreamFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/compress/compressors/CompressorStreamFactory.java b/src/main/java/org/apache/commons/compress/compressors/CompressorStreamFactory.java index d28c9b8..85709af 100644 --- a/src/main/java/org/apache/commons/compress/compressors/CompressorStreamFactory.java +++ b/src/main/java/org/apache/commons/compress/compressors/CompressorStreamFactory.java @@ -37,6 +37,7 @@ import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStr import org.apache.commons.compress.compressors.deflate.DeflateCompressorOutputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; +import 
org.apache.commons.compress.compressors.lz4.BlockLZ4CompressorInputStream; import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream; import org.apache.commons.compress.compressors.lzma.LZMACompressorOutputStream; import org.apache.commons.compress.compressors.lzma.LZMAUtils; @@ -159,6 +160,14 @@ public class CompressorStreamFactory implements CompressorStreamProvider { public static final String DEFLATE = "deflate"; /** + * Constant (value {@value}) used to identify the block LZ4 + * compression method. Not supported as an output stream type. + * + * @since 1.14 + */ + public static final String LZ4_BLOCK = "lz4-block"; + + /** * Constructs a new sorted map from input stream provider names to provider * objects. * @@ -420,8 +429,9 @@ public class CompressorStreamFactory implements CompressorStreamProvider { * @param name * of the compressor, i.e. {@value #GZIP}, {@value #BZIP2}, * {@value #XZ}, {@value #LZMA}, {@value #PACK200}, - * {@value #SNAPPY_RAW}, {@value #SNAPPY_FRAMED}, {@value #Z} or - * {@value #DEFLATE} + * {@value #SNAPPY_RAW}, {@value #SNAPPY_FRAMED}, {@value #Z}, + * {@value #LZ4_BLOCK} + * or {@value #DEFLATE} * @param in * the input stream * @return compressor input stream @@ -480,6 +490,10 @@ public class CompressorStreamFactory implements CompressorStreamProvider { return new DeflateCompressorInputStream(in); } + if (LZ4_BLOCK.equalsIgnoreCase(name)) { + return new BlockLZ4CompressorInputStream(in); + } + } catch (final IOException e) { throw new CompressorException("Could not create CompressorInputStream.", e); } http://git-wip-us.apache.org/repos/asf/commons-compress/blob/56e82da9/src/main/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorInputStream.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorInputStream.java 
b/src/main/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorInputStream.java new file mode 100644 index 0000000..635dec9 --- /dev/null +++ b/src/main/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorInputStream.java @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.commons.compress.compressors.lz4; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.utils.ByteUtils; +import org.apache.commons.compress.utils.IOUtils; + +/** + * CompressorInputStream for the LZ4 block format. 
+ * + * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a> + * @since 1.14 + */ +public class BlockLZ4CompressorInputStream extends CompressorInputStream { + + private static final int WINDOW_SIZE = 1 << 16; + private static final int SIZE_BITS = 4; + private static final int COPY_SIZE_MASK = (1 << SIZE_BITS) - 1; + private static final int LITERAL_SIZE_MASK = COPY_SIZE_MASK << SIZE_BITS; + + /** Buffer to write decompressed bytes to for back-references */ + private final byte[] buf = new byte[3 * WINDOW_SIZE]; + + /** One behind the index of the last byte in the buffer that was written */ + private int writeIndex; + + /** Index of the next byte to be read. */ + private int readIndex; + + /** The underlying stream to read compressed data from */ + private final InputStream in; + + /** Number of bytes still to be read from the current literal or copy. */ + private long bytesRemaining; + + /** Copy-size part of the block starting byte. */ + private int nextCopySize; + + /** Offset of the current copy. */ + private int copyOffset; + + /** Current state of the stream */ + private State state = State.NO_BLOCK; + + /** uncompressed size */ + private int size = 0; + + // used in no-arg read method + private final byte[] oneByte = new byte[1]; + + private final ByteUtils.ByteSupplier supplier = new ByteUtils.ByteSupplier() { + @Override + public int getAsByte() throws IOException { + return readOneByte(); + } + }; + + /** + * Creates a new LZ4 input stream. + * + * @param is + * An InputStream to read compressed data from + * + * @throws IOException if reading fails + */ + public BlockLZ4CompressorInputStream(final InputStream is) throws IOException { + this.in = is; + writeIndex = readIndex = 0; + bytesRemaining = 0; + } + + /** {@inheritDoc} */ + @Override + public int read() throws IOException { + return read(oneByte, 0, 1) == -1 ? 
-1 : oneByte[0] & 0xFF; + } + + /** {@inheritDoc} */ + @Override + public void close() throws IOException { + in.close(); + } + + /** {@inheritDoc} */ + @Override + public int available() { + return writeIndex - readIndex; + } + + /** + * {@inheritDoc} + */ + @Override + public int read(final byte[] b, final int off, final int len) throws IOException { + if (state == State.EOF) { + return -1; + } + switch (state) { + case NO_BLOCK: + readSizes(); + /*FALLTHROUGH*/ + case IN_LITERAL: + int litLen = readLiteral(b, off, len); + if (bytesRemaining == 0) { + state = State.LOOKING_FOR_COPY; + } + return litLen; + case LOOKING_FOR_COPY: + if (!initializeCopy()) { + state = State.EOF; + return -1; + } + /*FALLTHROUGH*/ + case IN_COPY: + int copyLen = readCopy(b, off, len); + if (bytesRemaining == 0) { + state = State.NO_BLOCK; + } + return copyLen; + default: + throw new IOException("Unknown stream state " + state); + } + } + + /** + * Get the uncompressed size of the stream + * + * @return the uncompressed size + */ + public int getSize() { + return size; + } + + private void readSizes() throws IOException { + int nextBlock = readOneByte(); + if (nextBlock == -1) { + throw new IOException("Premature end of stream while looking for next block"); + } + nextCopySize = nextBlock & COPY_SIZE_MASK; + long literalSizePart = (nextBlock & LITERAL_SIZE_MASK) >> SIZE_BITS; + if (literalSizePart == COPY_SIZE_MASK) { + literalSizePart += readSizeBytes(); + } + bytesRemaining = literalSizePart; + state = State.IN_LITERAL; + } + + private long readSizeBytes() throws IOException { + long accum = 0; + int nextByte; + do { + nextByte = readOneByte(); + if (nextByte == -1) { + throw new IOException("Premature end of stream while parsing length"); + } + accum += nextByte; + } while (nextByte == 255); + return accum; + } + + private int readLiteral(final byte[] b, final int off, final int len) throws IOException { + final int avail = available(); + if (len > avail) { + tryToReadLiteral(len - 
avail); + } + return readFromBuffer(b, off, len); + } + + private void tryToReadLiteral(int bytesToRead) throws IOException { + final int reallyTryToRead = (int) Math.min(Math.min(bytesToRead, bytesRemaining), + buf.length - writeIndex); + final int bytesRead = reallyTryToRead > 0 + ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) + : 0 /* happens for bytesRemaining == 0 */; + count(bytesRead); + if (reallyTryToRead != bytesRead) { + throw new IOException("Premature end of stream reading literal"); + } + writeIndex += reallyTryToRead; + bytesRemaining -= reallyTryToRead; + } + + private int readFromBuffer(final byte[] b, final int off, final int len) throws IOException { + final int readable = Math.min(len, available()); + if (readable > 0) { + System.arraycopy(buf, readIndex, b, off, readable); + readIndex += readable; + if (readIndex > 2 * WINDOW_SIZE) { + slideBuffer(); + } + } + size += readable; + return readable; + } + + private void slideBuffer() { + System.arraycopy(buf, WINDOW_SIZE, buf, 0, WINDOW_SIZE); + writeIndex -= WINDOW_SIZE; + readIndex -= WINDOW_SIZE; + } + + /** + * @return false if there is no more copy - this means this is the + * last block of the stream. 
+ */ + private boolean initializeCopy() throws IOException { + try { + copyOffset = (int) ByteUtils.fromLittleEndian(supplier, 2); + } catch (IOException ex) { + if (nextCopySize == 0) { // the last block has no copy + return false; + } + throw ex; + } + long copySize = nextCopySize; + if (nextCopySize == COPY_SIZE_MASK) { + copySize += readSizeBytes(); + } + bytesRemaining = copySize + 4; // minimal match length 4 is encoded as 0 + state = State.IN_COPY; + return true; + } + + private int readCopy(final byte[] b, final int off, final int len) throws IOException { + final int avail = available(); + if (len > avail) { + tryToCopy(len - avail); + } + return readFromBuffer(b, off, len); + } + + private void tryToCopy(int bytesToCopy) throws IOException { + // this will fit into the buffer without sliding and not + // require more than is available inside the copy + int copy = (int) Math.min(Math.min(bytesToCopy, bytesRemaining), + buf.length - writeIndex); + if (copy == 0) { + // NOP + } else if (copyOffset == 1) { // pretty common special case + final byte last = buf[writeIndex - 1]; + for (int i = 0; i < copy; i++) { + buf[writeIndex++] = last; + } + } else if (copy < copyOffset) { + System.arraycopy(buf, writeIndex - copyOffset, buf, writeIndex, copy); + writeIndex += copy; + } else { + final int fullRots = copy / copyOffset; + for (int i = 0; i < fullRots; i++) { + System.arraycopy(buf, writeIndex - copyOffset, buf, writeIndex, copyOffset); + writeIndex += copyOffset; + } + + final int pad = copy - (copyOffset * fullRots); + if (pad > 0) { + System.arraycopy(buf, writeIndex - copyOffset, buf, writeIndex, pad); + writeIndex += pad; + } + } + bytesRemaining -= copy; + } + + private int readOneByte() throws IOException { + final int b = in.read(); + if (b != -1) { + count(1); + return b & 0xFF; + } + return -1; + } + + private enum State { + NO_BLOCK, IN_LITERAL, LOOKING_FOR_COPY, IN_COPY, EOF + } +} 
http://git-wip-us.apache.org/repos/asf/commons-compress/blob/56e82da9/src/main/java/org/apache/commons/compress/compressors/lz4/package.html ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/compress/compressors/lz4/package.html b/src/main/java/org/apache/commons/compress/compressors/lz4/package.html new file mode 100644 index 0000000..54de62b --- /dev/null +++ b/src/main/java/org/apache/commons/compress/compressors/lz4/package.html @@ -0,0 +1,37 @@ +<html> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +--> + <body> + <p>Provides stream classes for the + <a href="http://lz4.github.io/lz4/">LZ4</a> + algorithm.</p> + + <p>The block LZ4 format which only contains the compressed data is + supported by the <code>BlockLZ4Compressor*putStream</code> + classes while the frame format is implemented + by <code>FramedLZ4Compressor*putStream</code>. 
The + implementation in Commons Compress is based on the + specifications "Last revised: 2015-03-26" for the block format + and version "1.5.1 (31/03/2015)" for the frame format.</p> + + <p>Only the frame format can be auto-detected; this means you have + to specify the format explicitly if you want to read a block LZ4 + stream via <code>CompressorStreamFactory</code>.</p> + </body> +</html> http://git-wip-us.apache.org/repos/asf/commons-compress/blob/56e82da9/src/site/xdoc/limitations.xml ---------------------------------------------------------------------- diff --git a/src/site/xdoc/limitations.xml b/src/site/xdoc/limitations.xml index 45c3a27..fd0c07e 100644 --- a/src/site/xdoc/limitations.xml +++ b/src/site/xdoc/limitations.xml @@ -106,6 +106,14 @@ MANIFEST</li> </ul> </section> + <section name="LZ4"> + <ul> + <li>In theory LZ4 compressed streams can contain literals and + copies of arbitrary length while Commons Compress only + supports sizes up to 2<sup>63</sup> - 1 (i.e. ≈ 9.2 + EB).</li> + </ul> + </section> <section name="LZMA"> <ul> <li>the format requires the otherwise optional <a http://git-wip-us.apache.org/repos/asf/commons-compress/blob/56e82da9/src/test/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorInputStreamTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorInputStreamTest.java b/src/test/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorInputStreamTest.java new file mode 100644 index 0000000..1c1155a --- /dev/null +++ b/src/test/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorInputStreamTest.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.commons.compress.compressors.lz4; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import org.apache.commons.compress.AbstractTestCase; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.commons.compress.utils.IOUtils; +import org.junit.Assert; +import org.junit.Test; + +public class BlockLZ4CompressorInputStreamTest extends AbstractTestCase { + + @Test + public void readBlaLz4() throws IOException { + try (InputStream a = new BlockLZ4CompressorInputStream(new FileInputStream(getFile("bla.tar.block_lz4"))); + FileInputStream e = new FileInputStream(getFile("bla.tar"))) { + byte[] expected = IOUtils.toByteArray(e); + byte[] actual = IOUtils.toByteArray(a); + Assert.assertArrayEquals(expected, actual); + } + } + + @Test + public void readBlaLz4ViaFactory() throws Exception { + try (InputStream a = new CompressorStreamFactory() + .createCompressorInputStream(CompressorStreamFactory.LZ4_BLOCK, + new FileInputStream(getFile("bla.tar.block_lz4"))); + FileInputStream e = new FileInputStream(getFile("bla.tar"))) { + byte[] expected = IOUtils.toByteArray(e); + byte[] actual = IOUtils.toByteArray(a); + Assert.assertArrayEquals(expected, actual); + } + } +}
