Author: lehmi Date: Sun Jun 2 07:11:03 2024 New Revision: 1918112 URL: http://svn.apache.org/viewvc?rev=1918112&view=rev Log: PDFBOX-5675: implement new RandomAccessRead class to combine limited random access with stream alike handling of an input stream to reduce memory footprint especially for huge streams
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.io;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * An implementation of the RandomAccessRead interface using an InputStream as source.
 *
 * It is optimized for a minimal memory footprint by using a small buffer to read from the input stream instead of
 * copying the whole input stream to memory. This reduces the random access read abilities, so that peek/rewind
 * operations are limited to the data within the buffer.
 *
 * Three buffers of {@value #BUFFER_SIZE} bytes are used: the current one, the one read before it (to rewind across a
 * buffer boundary) and the one following it (only filled after such a rewind). {@link #seek(long)} and
 * {@link #createView(long, long)} are not supported.
 *
 * This class is meant to be used by consumers which process the data more or less in a serial manner and therefore
 * don't need full random access.
 *
 */
public class NonSeekableRandomAccessReadInputStream implements RandomAccessRead
{
    private static final Log LOG = LogFactory.getLog(NonSeekableRandomAccessReadInputStream.class);

    // current position within the stream
    protected long position = 0;
    // current pointer for the current chunk
    protected int currentBufferPointer = 0;
    // number of bytes read from the source input stream so far
    protected long size = 0;

    // the source input stream
    private final InputStream is;

    // buffer size
    private static final int BUFFER_SIZE = 4096;
    // we are using 3 different buffers for navigation
    private static final int CURRENT = 0;
    private static final int LAST = 1;
    private static final int NEXT = 2;

    // array holding all buffers
    private final byte[][] buffers = new byte[][] { new byte[BUFFER_SIZE], new byte[BUFFER_SIZE],
            new byte[BUFFER_SIZE] };
    // array holding the number of valid bytes of all buffers, -1 marks a buffer without valid data
    private final int[] bufferBytes = new int[] { -1, -1, -1 };

    private boolean isClosed = false;
    private boolean isEOF = false;

    /**
     * Creates a new instance reading from the given input stream. Closing this instance closes the input stream as
     * well.
     *
     * @param inputStream the input stream to be used as source
     */
    public NonSeekableRandomAccessReadInputStream(InputStream inputStream)
    {
        is = inputStream;
    }

    /**
     * Closes the source input stream. Calling this method more than once has no further effect.
     *
     * @throws IOException if closing the source input stream fails; the instance is marked as closed nevertheless
     */
    @Override
    public void close() throws IOException
    {
        if (isClosed)
        {
            return;
        }
        try
        {
            is.close();
        }
        finally
        {
            // mark as closed even if the underlying stream fails to close
            isClosed = true;
        }
    }

    /**
     * Not supported by this implementation.
     *
     * @param position ignored
     * @throws IOException always
     */
    @Override
    public void seek(long position) throws IOException
    {
        throw new IOException(getClass().getName() + ".seek isn't supported.");
    }

    /**
     * Skips the given number of bytes by reading and discarding them. Skipping stops silently if the end of the
     * stream is reached.
     *
     * @param length the number of bytes to be skipped
     * @throws IOException if reading from the source fails or this instance is closed
     */
    @Override
    public void skip(int length) throws IOException
    {
        for (int i = 0; i < length; i++)
        {
            if (read() == -1)
            {
                // end of stream reached, there is nothing left to skip
                break;
            }
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public long getPosition() throws IOException
    {
        checkClosed();
        return position;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int read() throws IOException
    {
        checkClosed();
        if (isEOF())
        {
            return -1;
        }
        if (currentBufferPointer >= bufferBytes[CURRENT] && !fetch())
        {
            isEOF = true;
            return -1;
        }
        position++;
        return buffers[CURRENT][currentBufferPointer++] & 0xFF;
    }

    /**
     * Reads up to {@code length} bytes into the given array.
     *
     * @param b the array to copy the data to
     * @param offset the index within {@code b} to start writing at
     * @param length the maximum number of bytes to be read
     * @return the number of bytes read, or -1 if bytes were requested but the end of the stream was already reached
     * @throws IOException if reading from the source fails or this instance is closed
     */
    @Override
    public int read(byte[] b, int offset, int length) throws IOException
    {
        checkClosed();
        if (isEOF())
        {
            return -1;
        }
        int numberOfBytesRead = 0;
        while (numberOfBytesRead < length)
        {
            int available = bufferBytes[CURRENT] - currentBufferPointer;
            if (available > 0)
            {
                int bytes2Copy = Math.min(length - numberOfBytesRead, available);
                System.arraycopy(buffers[CURRENT], currentBufferPointer, b,
                        numberOfBytesRead + offset, bytes2Copy);
                currentBufferPointer += bytes2Copy;
                position += bytes2Copy;
                numberOfBytesRead += bytes2Copy;
            }
            else if (!fetch())
            {
                isEOF = true;
                break;
            }
        }
        if (numberOfBytesRead == 0 && length > 0)
        {
            // nothing could be read although bytes were requested -> signal the end of the stream
            return -1;
        }
        return numberOfBytesRead;
    }

    /**
     * Swaps two buffers including their byte counts.
     *
     * @param firstBuffer index of the first buffer
     * @param secondBuffer index of the second buffer
     */
    private void switchBuffers(int firstBuffer, int secondBuffer)
    {
        byte[] tmpBuffer = buffers[firstBuffer];
        buffers[firstBuffer] = buffers[secondBuffer];
        buffers[secondBuffer] = tmpBuffer;
        int tmpBufferBytes = bufferBytes[firstBuffer];
        bufferBytes[firstBuffer] = bufferBytes[secondBuffer];
        bufferBytes[secondBuffer] = tmpBufferBytes;
    }

    /**
     * Makes the next chunk of data the current buffer. The data is taken from the next buffer if a former rewind
     * operation left one, otherwise it is read from the source input stream.
     *
     * @return true if the current buffer holds new data, false if the end of the source input stream is reached
     * @throws IOException if reading from the source fails or this instance is closed
     */
    private boolean fetch() throws IOException
    {
        checkClosed();
        currentBufferPointer = 0;
        if (bufferBytes[NEXT] > -1)
        {
            // there is a next buffer from a former rewind operation
            // switch to the next buffer and don't read any new data
            switchBuffers(CURRENT, LAST);
            switchBuffers(CURRENT, NEXT);
            // reset next buffer
            bufferBytes[NEXT] = -1;
            return true;
        }
        // move the current data to last to support rewind operations
        // right after refilling the current buffer
        switchBuffers(CURRENT, LAST);
        // the recycled buffer doesn't hold valid data until the read operation succeeds
        bufferBytes[CURRENT] = -1;
        try
        {
            bufferBytes[CURRENT] = is.read(buffers[CURRENT]);
        }
        catch (IOException exception)
        {
            // no further data can be expected, the exception is passed on to the caller
            LOG.warn("premature end of stream due to an IOException when reading from the input stream");
            isEOF = true;
            throw exception;
        }
        if (bufferBytes[CURRENT] < 0)
        {
            return false;
        }
        size += bufferBytes[CURRENT];
        return true;
    }

    /**
     * Returns the number of bytes read from the source input stream so far. As the source isn't read in advance, the
     * value grows until the end of the stream is reached.
     *
     * @return the number of bytes read from the source so far
     * @throws IOException if this instance is closed
     */
    @Override
    public long length() throws IOException
    {
        checkClosed();
        return size;
    }

    /**
     * Moves the current position backwards. The operation is limited to the data of the current buffer and the buffer
     * read before it. After crossing a buffer boundary the former buffer is no longer available for another rewind
     * until new data is read.
     *
     * @param bytes the number of bytes to go back
     * @throws IOException if {@code bytes} is negative, if there aren't enough buffered bytes to go back or if this
     * instance is closed
     */
    @Override
    public void rewind(int bytes) throws IOException
    {
        checkClosed();
        if (bytes < 0)
        {
            throw new IOException("can't rewind a negative number of bytes: " + bytes);
        }
        if (bytes == 0)
        {
            return;
        }
        // check if the rewind operation is limited to the current buffer
        if (currentBufferPointer >= bytes)
        {
            currentBufferPointer -= bytes;
        }
        else
        {
            int remainingBytesToRewind = bytes - currentBufferPointer;
            if (bufferBytes[LAST] < remainingBytesToRewind)
            {
                // there aren't enough bytes left in the buffers to perform the rewind operation
                throw new IOException("not enough bytes available to perform the rewind operation");
            }
            // save the current as next buffer
            switchBuffers(CURRENT, NEXT);
            // make the former buffer the current one
            switchBuffers(CURRENT, LAST);
            // reset last buffer
            bufferBytes[LAST] = -1;
            currentBufferPointer = bufferBytes[CURRENT] - remainingBytesToRewind;
        }
        position -= bytes;
        // at least one byte is available again
        isEOF = false;
    }

    /**
     * Ensure that this instance is not closed
     *
     * @throws IOException If this instance is already closed
     */
    protected void checkClosed() throws IOException
    {
        if (isClosed)
        {
            throw new IOException(getClass().getSimpleName() + " already closed");
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean isClosed()
    {
        return isClosed;
    }

    /**
     * Indicates whether a former read operation hit the end of the source input stream. As the source isn't read in
     * advance, this is not detected before a read operation actually fails to deliver data.
     *
     * @return true if a read operation reached the end of the stream and no rewind took place since
     * @throws IOException if this instance is closed
     */
    @Override
    public boolean isEOF() throws IOException
    {
        checkClosed();
        return isEOF;
    }

    /**
     * Not supported by this implementation.
     *
     * @param startPosition ignored
     * @param streamLength ignored
     * @return nothing, an exception is always thrown
     * @throws IOException always
     */
    @Override
    public RandomAccessReadView createView(long startPosition, long streamLength) throws IOException
    {
        throw new IOException(getClass().getName() + ".createView isn't supported.");
    }

}
/*
 * Copyright 2020 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pdfbox.io;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.OutputStream;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Random;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
 * Unittest for {@link org.apache.pdfbox.io.NonSeekableRandomAccessReadInputStream}
 */
class NonSeekableRandomAccessReadInputStreamTest
{
    /**
     * The position has to reflect skipped and read bytes.
     */
    @Test
    void testPositionSkip() throws IOException
    {
        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);

        try (NonSeekableRandomAccessReadInputStream randomAccessSource = new NonSeekableRandomAccessReadInputStream(
                bais))
        {
            assertEquals(0, randomAccessSource.getPosition());
            randomAccessSource.skip(5);
            assertEquals(5, randomAccessSource.read());
            assertEquals(6, randomAccessSource.getPosition());
        }
    }

    /**
     * The position has to advance with every single byte read; closing has to be reflected by isClosed.
     */
    @Test
    void testPositionRead() throws IOException
    {
        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);

        NonSeekableRandomAccessReadInputStream randomAccessSource = new NonSeekableRandomAccessReadInputStream(
                bais);

        assertEquals(0, randomAccessSource.getPosition());
        assertEquals(0, randomAccessSource.read());
        assertEquals(1, randomAccessSource.read());
        assertEquals(2, randomAccessSource.read());
        assertEquals(3, randomAccessSource.getPosition());

        assertFalse(randomAccessSource.isClosed());
        randomAccessSource.close();
        assertTrue(randomAccessSource.isClosed());
    }

    /**
     * seek isn't supported and has to throw an IOException.
     */
    @Test
    void testSeekEOF() throws IOException
    {
        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);

        try (NonSeekableRandomAccessReadInputStream randomAccessSource = new NonSeekableRandomAccessReadInputStream(
                bais))
        {
            Assertions.assertThrows(IOException.class, () -> randomAccessSource.seek(3),
                    "seek should have thrown an IOException");
        }
    }

    /**
     * Reading into an array has to fill the requested range only and to advance the position accordingly.
     */
    @Test
    void testPositionReadBytes() throws IOException
    {
        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);

        try (NonSeekableRandomAccessReadInputStream randomAccessSource = new NonSeekableRandomAccessReadInputStream(
                bais))
        {
            assertEquals(0, randomAccessSource.getPosition());
            byte[] buffer = new byte[4];
            randomAccessSource.read(buffer);
            assertEquals(0, buffer[0]);
            assertEquals(3, buffer[3]);
            assertEquals(4, randomAccessSource.getPosition());

            randomAccessSource.read(buffer, 1, 2);
            assertEquals(0, buffer[0]);
            assertEquals(4, buffer[1]);
            assertEquals(5, buffer[2]);
            assertEquals(3, buffer[3]);
            assertEquals(6, randomAccessSource.getPosition());
        }
    }

    /**
     * Peeking has to return the next byte without changing the position.
     */
    @Test
    void testPositionPeek() throws IOException
    {
        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);

        try (NonSeekableRandomAccessReadInputStream randomAccessSource = new NonSeekableRandomAccessReadInputStream(
                bais))
        {
            assertEquals(0, randomAccessSource.getPosition());
            randomAccessSource.skip(6);
            assertEquals(6, randomAccessSource.getPosition());

            assertEquals(6, randomAccessSource.peek());
            assertEquals(6, randomAccessSource.getPosition());
        }
    }

    /**
     * Rewinding within the current buffer has to move the position back by the given number of bytes.
     */
    @Test
    void testPositionUnreadBytes() throws IOException
    {
        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);

        try (NonSeekableRandomAccessReadInputStream randomAccessSource = new NonSeekableRandomAccessReadInputStream(
                bais))
        {
            assertEquals(0, randomAccessSource.getPosition());
            randomAccessSource.read();
            randomAccessSource.read();
            byte[] readBytes = new byte[6];
            assertEquals(readBytes.length, randomAccessSource.read(readBytes));
            assertEquals(8, randomAccessSource.getPosition());
            randomAccessSource.rewind(readBytes.length);
            assertEquals(2, randomAccessSource.getPosition());
            assertEquals(2, randomAccessSource.read());
            assertEquals(3, randomAccessSource.getPosition());
            randomAccessSource.read(readBytes, 2, 4);
            assertEquals(7, randomAccessSource.getPosition());
            randomAccessSource.rewind(4);
            assertEquals(3, randomAccessSource.getPosition());
        }
    }

    /**
     * createView isn't supported and has to throw an IOException.
     */
    @Test
    void testView() throws IOException
    {
        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);

        try (NonSeekableRandomAccessReadInputStream randomAccessSource = new NonSeekableRandomAccessReadInputStream(
                bais))
        {
            Assertions.assertThrows(IOException.class, () -> randomAccessSource.createView(3, 5),
                    "createView should have thrown an IOException");
        }
    }

    /**
     * Rewinding across the boundary of the 4096 byte buffer has to deliver the data of the former buffer.
     */
    @Test
    void testBufferSwitch() throws IOException
    {
        byte[] original = createRandomData();

        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(original);
        try (RandomAccessRead rar = new NonSeekableRandomAccessReadInputStream(
                byteArrayInputStream))
        {
            rar.skip(4098);
            assertEquals(4098, rar.getPosition());
            rar.rewind(4);
            assertEquals(4094, rar.getPosition());
            assertEquals(original[4094] & 0xFF, rar.read());
        }
    }

    /**
     * A second rewind across a buffer boundary without reading new data in between has to fail.
     */
    @Test
    void testRewindException() throws IOException
    {
        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(createRandomData());
        try (RandomAccessRead rar = new NonSeekableRandomAccessReadInputStream(
                byteArrayInputStream))
        {
            rar.skip(10000);
            assertEquals(10000, rar.getPosition());
            rar.rewind(4096);
            assertEquals(5904, rar.getPosition());
            Assertions.assertThrows(IOException.class, () -> rar.rewind(4096),
                    "rewind should have thrown an IOException");
        }
    }

    /**
     * Creates between 10000 and 29999 bytes of test data consisting of alternating runs of pseudo-random and
     * constant values.
     *
     * @return the test data
     */
    private byte[] createRandomData()
    {
        final long seed = new Random().nextLong();
        final Random random = new Random(seed);
        final int numBytes = 10000 + random.nextInt(20000);
        byte[] original = new byte[numBytes];

        int upto = 0;
        while (upto < numBytes)
        {
            final int left = numBytes - upto;
            if (random.nextBoolean() || left < 2)
            {
                // Fill w/ pseudo-random bytes:
                final int end = upto + Math.min(left, 10 + random.nextInt(100));
                while (upto < end)
                {
                    original[upto++] = (byte) random.nextInt();
                }
            }
            else
            {
                // Fill w/ very predictable bytes:
                final int end = upto + Math.min(left, 2 + random.nextInt(10));
                final byte value = (byte) random.nextInt(4);
                while (upto < end)
                {
                    original[upto++] = value;
                }
            }
        }
        return original;
    }

    /**
     * PDFBOX-5158: endless loop reading a stream of a multiple of 4096 bytes from a FileInputStream. Test does not fail
     * with a ByteArrayInputStream, so we need to create a temp file.
     *
     * @throws IOException if the temp file can't be created, written, read or deleted
     */
    @Test
    void testPDFBOX5158() throws IOException
    {
        Path path = Files.createTempFile("len4096", ".pdf");
        try
        {
            try (OutputStream os = Files.newOutputStream(path))
            {
                os.write(new byte[4096]);
            }
            assertEquals(4096, path.toFile().length());
            try (RandomAccessRead rar = new NonSeekableRandomAccessReadInputStream(
                    Files.newInputStream(path)))
            {
                assertEquals(0, rar.read());
            }
        }
        finally
        {
            // remove the temp file even if one of the assertions fails
            Files.deleteIfExists(path);
        }
    }

    /**
     * PDFBOX-5161: failure to read bytes after reading a multiple of 4096. Construction source must be an InputStream.
     *
     * @throws IOException if reading fails
     */
    @Test
    void testPDFBOX5161() throws IOException
    {
        try (RandomAccessRead rar = new NonSeekableRandomAccessReadInputStream(
                new ByteArrayInputStream(new byte[4099])))
        {
            byte[] buf = new byte[4096];
            int bytesRead = rar.read(buf);
            assertEquals(4096, bytesRead);
            bytesRead = rar.read(buf, 0, 3);
            assertEquals(3, bytesRead);
        }
    }

}