Author: lehmi
Date: Sun Jun  2 07:11:03 2024
New Revision: 1918112

URL: http://svn.apache.org/viewvc?rev=1918112&view=rev
Log:
PDFBOX-5675: implement new RandomAccessRead class to combine limited random 
access with stream-like handling of an input stream to reduce memory footprint, 
especially for huge streams

Added:
    
pdfbox/branches/3.0/io/src/main/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStream.java
   (with props)
    
pdfbox/branches/3.0/io/src/test/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStreamTest.java
   (with props)

Added: 
pdfbox/branches/3.0/io/src/main/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStream.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/3.0/io/src/main/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStream.java?rev=1918112&view=auto
==============================================================================
--- 
pdfbox/branches/3.0/io/src/main/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStream.java
 (added)
+++ 
pdfbox/branches/3.0/io/src/main/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStream.java
 Sun Jun  2 07:11:03 2024
@@ -0,0 +1,291 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * An implementation of the RandomAccessRead interface using an InputStream as source.
+ * <p>
+ * It is optimized for a minimal memory footprint by using a small buffer to read from the
+ * input stream instead of copying the whole input stream to memory. This reduces the random
+ * access read abilities, so that peek/rewind operations are limited to the data within the
+ * buffers.
+ * <p>
+ * This class is meant to be used by consumers which process the data more or less in a serial
+ * manner and therefore don't need full random access.
+ */
+public class NonSeekableRandomAccessReadInputStream implements RandomAccessRead
+{
+    private static final Log LOG =
+            LogFactory.getLog(NonSeekableRandomAccessReadInputStream.class);
+
+    // current position within the stream
+    protected long position = 0;
+    // current pointer for the current chunk
+    protected int currentBufferPointer = 0;
+    // number of bytes read from the source so far
+    protected long size = 0;
+
+    // the source input stream
+    private final InputStream is;
+
+    // buffer size
+    private static final int BUFFER_SIZE = 4096;
+    // we are using 3 different buffers for navigation
+    private static final int CURRENT = 0;
+    private static final int LAST = 1;
+    private static final int NEXT = 2;
+
+    // array holding all buffers
+    private final byte[][] buffers = new byte[][] { new byte[BUFFER_SIZE],
+            new byte[BUFFER_SIZE], new byte[BUFFER_SIZE] };
+    // array holding the number of valid bytes of each buffer, -1 marks an unused buffer
+    private final int[] bufferBytes = new int[] { -1, -1, -1 };
+
+    private boolean isClosed = false;
+    private boolean isEOF = false;
+
+    /**
+     * Creates an instance reading from the given input stream.
+     *
+     * @param inputStream the source input stream; it is closed when this instance is closed
+     */
+    public NonSeekableRandomAccessReadInputStream(InputStream inputStream)
+    {
+        is = inputStream;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void close() throws IOException
+    {
+        is.close();
+        isClosed = true;
+    }
+
+    /**
+     * Seeking isn't supported by this implementation.
+     *
+     * @throws IOException always
+     */
+    @Override
+    public void seek(long position) throws IOException
+    {
+        throw new IOException(getClass().getName() + ".seek isn't supported.");
+    }
+
+    /**
+     * Skips the given number of bytes by reading them one by one. Skipping stops silently if
+     * the end of the stream is reached.
+     *
+     * @param length the number of bytes to be skipped
+     * @throws IOException if this instance is closed or reading from the source fails
+     */
+    @Override
+    public void skip(int length) throws IOException
+    {
+        for (int i = 0; i < length; i++)
+        {
+            if (read() < 0)
+            {
+                // end of stream reached, there is nothing left to skip
+                break;
+            }
+        }
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public long getPosition() throws IOException
+    {
+        checkClosed();
+        return position;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public int read() throws IOException
+    {
+        checkClosed();
+        if (isEOF())
+        {
+            return -1;
+        }
+        // refill only if the current buffer is exhausted
+        if (currentBufferPointer >= bufferBytes[CURRENT] && !fetch())
+        {
+            isEOF = true;
+            return -1;
+        }
+        position++;
+        return buffers[CURRENT][currentBufferPointer++] & 0xFF;
+    }
+
+    /**
+     * Reads up to {@code length} bytes into the given array.
+     *
+     * @param b the target array
+     * @param offset the index within the target array to start writing at
+     * @param length the maximum number of bytes to be read
+     * @return the number of bytes read, or -1 if no byte could be read because the end of the
+     * stream was reached
+     * @throws IOException if this instance is closed or reading from the source fails
+     */
+    @Override
+    public int read(byte[] b, int offset, int length) throws IOException
+    {
+        checkClosed();
+        if (isEOF())
+        {
+            return -1;
+        }
+        int numberOfBytesRead = 0;
+        while (numberOfBytesRead < length)
+        {
+            int available = bufferBytes[CURRENT] - currentBufferPointer;
+            if (available > 0)
+            {
+                int bytes2Copy = Math.min(length - numberOfBytesRead, available);
+                System.arraycopy(buffers[CURRENT], currentBufferPointer, b,
+                        numberOfBytesRead + offset, bytes2Copy);
+                currentBufferPointer += bytes2Copy;
+                position += bytes2Copy;
+                numberOfBytesRead += bytes2Copy;
+            }
+            else if (!fetch())
+            {
+                isEOF = true;
+                break;
+            }
+        }
+        if (numberOfBytesRead == 0 && length > 0)
+        {
+            // data was requested but the end of the stream was hit before any byte was copied
+            return -1;
+        }
+        return numberOfBytesRead;
+    }
+
+    /**
+     * Swaps the content and the byte count of the two given buffers.
+     */
+    private void switchBuffers(int firstBuffer, int secondBuffer)
+    {
+        byte[] tmpBuffer = buffers[firstBuffer];
+        buffers[firstBuffer] = buffers[secondBuffer];
+        buffers[secondBuffer] = tmpBuffer;
+        int tmpBufferBytes = bufferBytes[firstBuffer];
+        bufferBytes[firstBuffer] = bufferBytes[secondBuffer];
+        bufferBytes[secondBuffer] = tmpBufferBytes;
+    }
+
+    /**
+     * Makes the next chunk of data the current buffer. The data is taken from the buffer saved
+     * by a former rewind operation if there is one, otherwise it is read from the source.
+     * The former current buffer is kept as last buffer to support rewind operations.
+     *
+     * @return false if the end of the source was reached, true otherwise
+     * @throws IOException if this instance is closed or reading from the source fails
+     */
+    private boolean fetch() throws IOException
+    {
+        checkClosed();
+        currentBufferPointer = 0;
+        if (bufferBytes[NEXT] > -1)
+        {
+            // there is a next buffer from a former rewind operation
+            // switch to the next buffer and don't read any new data
+            switchBuffers(CURRENT, LAST);
+            switchBuffers(CURRENT, NEXT);
+            // reset next buffer
+            bufferBytes[NEXT] = -1;
+            return true;
+        }
+        // move the current data to last to support rewind operations
+        // right after refilling the current buffer
+        switchBuffers(CURRENT, LAST);
+        // invalidate the recycled buffer so that its stale content is never taken for valid
+        // data if reading from the source fails
+        bufferBytes[CURRENT] = -1;
+        try
+        {
+            bufferBytes[CURRENT] = is.read(buffers[CURRENT]);
+        }
+        catch (IOException exception)
+        {
+            // no more data can be read, mark the end of the stream and pass the exception on
+            LOG.warn("premature end of stream due to an IOException while reading the source");
+            isEOF = true;
+            throw exception;
+        }
+        if (bufferBytes[CURRENT] < 0)
+        {
+            return false;
+        }
+        size += bufferBytes[CURRENT];
+        return true;
+    }
+
+    /**
+     * Returns the number of bytes read from the source so far. The overall length of the
+     * source isn't known before the end of the stream is reached.
+     *
+     * @return the number of bytes read from the source so far
+     * @throws IOException if this instance is closed
+     */
+    @Override
+    public long length() throws IOException
+    {
+        checkClosed();
+        return size;
+    }
+
+    /**
+     * Moves the current position backwards. The operation is limited to the data held by the
+     * current and the former buffer.
+     *
+     * @param bytes the number of bytes to be rewound
+     * @throws IOException if the buffers don't hold enough bytes to perform the operation
+     */
+    @Override
+    public void rewind(int bytes) throws IOException
+    {
+        // check if the rewind operation is limited to the current buffer
+        if (currentBufferPointer >= bytes)
+        {
+            currentBufferPointer -= bytes;
+            position -= bytes;
+        }
+        else if (bufferBytes[LAST] >= bytes - currentBufferPointer)
+        {
+            // there is a former buffer holding enough bytes
+            int remainingBytesToRewind = bytes - currentBufferPointer;
+            // save the current as next buffer
+            switchBuffers(CURRENT, NEXT);
+            // make the former buffer the current one
+            switchBuffers(CURRENT, LAST);
+            // reset last buffer
+            bufferBytes[LAST] = -1;
+            currentBufferPointer = bufferBytes[CURRENT] - remainingBytesToRewind;
+            position -= bytes;
+        }
+        else
+        {
+            // there aren't enough bytes left in the buffers to perform the rewind operation
+            throw new IOException(
+                    "not enough bytes available to perform the rewind operation");
+        }
+        if (bytes > 0)
+        {
+            // there is data ahead of the current position again
+            isEOF = false;
+        }
+    }
+
+    /**
+     * Ensure that this instance is not closed.
+     * 
+     * @throws IOException if this instance is already closed
+     */
+    protected void checkClosed() throws IOException
+    {
+        if (isClosed)
+        {
+            throw new IOException(getClass().getSimpleName() + " already closed");
+        }
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public boolean isClosed()
+    {
+        return isClosed;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public boolean isEOF() throws IOException
+    {
+        checkClosed();
+        return isEOF;
+    }
+
+    /**
+     * Creating a view isn't supported by this implementation.
+     *
+     * @throws IOException always
+     */
+    @Override
+    public RandomAccessReadView createView(long startPosition, long streamLength)
+            throws IOException
+    {
+        throw new IOException(getClass().getName() + ".createView isn't supported.");
+    }
+
+}

Propchange: 
pdfbox/branches/3.0/io/src/main/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
pdfbox/branches/3.0/io/src/test/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStreamTest.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/3.0/io/src/test/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStreamTest.java?rev=1918112&view=auto
==============================================================================
--- 
pdfbox/branches/3.0/io/src/test/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStreamTest.java
 (added)
+++ 
pdfbox/branches/3.0/io/src/test/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStreamTest.java
 Sun Jun  2 07:11:03 2024
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2020 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pdfbox.io;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Random;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unittest for {@link org.apache.pdfbox.io.NonSeekableRandomAccessReadInputStream}
+ */
+class NonSeekableRandomAccessReadInputStreamTest
+{
+    /**
+     * The position has to reflect skipped and read bytes.
+     */
+    @Test
+    void testPositionSkip() throws IOException
+    {
+        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);
+
+        try (NonSeekableRandomAccessReadInputStream randomAccessSource =
+                new NonSeekableRandomAccessReadInputStream(bais))
+        {
+            assertEquals(0, randomAccessSource.getPosition());
+            randomAccessSource.skip(5);
+            assertEquals(5, randomAccessSource.read());
+            assertEquals(6, randomAccessSource.getPosition());
+        }
+    }
+
+    /**
+     * The position has to reflect single byte reads; closing has to be reported by isClosed.
+     */
+    @Test
+    void testPositionRead() throws IOException
+    {
+        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);
+
+        // try-with-resources ensures that the source is released even if an assertion fails
+        try (NonSeekableRandomAccessReadInputStream randomAccessSource =
+                new NonSeekableRandomAccessReadInputStream(bais))
+        {
+            assertEquals(0, randomAccessSource.getPosition());
+            assertEquals(0, randomAccessSource.read());
+            assertEquals(1, randomAccessSource.read());
+            assertEquals(2, randomAccessSource.read());
+            assertEquals(3, randomAccessSource.getPosition());
+
+            assertFalse(randomAccessSource.isClosed());
+            randomAccessSource.close();
+            assertTrue(randomAccessSource.isClosed());
+        }
+    }
+
+    /**
+     * Seeking isn't supported and has to throw an exception.
+     */
+    @Test
+    void testSeekEOF() throws IOException
+    {
+        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);
+
+        try (NonSeekableRandomAccessReadInputStream randomAccessSource =
+                new NonSeekableRandomAccessReadInputStream(bais))
+        {
+            Assertions.assertThrows(IOException.class, () -> randomAccessSource.seek(3),
+                    "seek should have thrown an IOException");
+        }
+    }
+
+    /**
+     * Bulk reads have to return the number of bytes read and to advance the position.
+     */
+    @Test
+    void testPositionReadBytes() throws IOException
+    {
+        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);
+
+        try (NonSeekableRandomAccessReadInputStream randomAccessSource =
+                new NonSeekableRandomAccessReadInputStream(bais))
+        {
+            assertEquals(0, randomAccessSource.getPosition());
+            byte[] buffer = new byte[4];
+            assertEquals(buffer.length, randomAccessSource.read(buffer));
+            assertEquals(0, buffer[0]);
+            assertEquals(3, buffer[3]);
+            assertEquals(4, randomAccessSource.getPosition());
+
+            assertEquals(2, randomAccessSource.read(buffer, 1, 2));
+            assertEquals(0, buffer[0]);
+            assertEquals(4, buffer[1]);
+            assertEquals(5, buffer[2]);
+            assertEquals(3, buffer[3]);
+            assertEquals(6, randomAccessSource.getPosition());
+        }
+    }
+
+    /**
+     * Peeking must not change the position.
+     */
+    @Test
+    void testPositionPeek() throws IOException
+    {
+        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);
+
+        try (NonSeekableRandomAccessReadInputStream randomAccessSource =
+                new NonSeekableRandomAccessReadInputStream(bais))
+        {
+            assertEquals(0, randomAccessSource.getPosition());
+            randomAccessSource.skip(6);
+            assertEquals(6, randomAccessSource.getPosition());
+
+            assertEquals(6, randomAccessSource.peek());
+            assertEquals(6, randomAccessSource.getPosition());
+        }
+    }
+
+    /**
+     * Rewinding within the current buffer has to move the position backwards.
+     */
+    @Test
+    void testPositionUnreadBytes() throws IOException
+    {
+        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);
+
+        try (NonSeekableRandomAccessReadInputStream randomAccessSource =
+                new NonSeekableRandomAccessReadInputStream(bais))
+        {
+            assertEquals(0, randomAccessSource.getPosition());
+            randomAccessSource.read();
+            randomAccessSource.read();
+            byte[] readBytes = new byte[6];
+            assertEquals(readBytes.length, randomAccessSource.read(readBytes));
+            assertEquals(8, randomAccessSource.getPosition());
+            randomAccessSource.rewind(readBytes.length);
+            assertEquals(2, randomAccessSource.getPosition());
+            assertEquals(2, randomAccessSource.read());
+            assertEquals(3, randomAccessSource.getPosition());
+            assertEquals(4, randomAccessSource.read(readBytes, 2, 4));
+            assertEquals(7, randomAccessSource.getPosition());
+            randomAccessSource.rewind(4);
+            assertEquals(3, randomAccessSource.getPosition());
+        }
+    }
+
+    /**
+     * Creating a view isn't supported and has to throw an exception.
+     */
+    @Test
+    void testView() throws IOException
+    {
+        byte[] inputValues = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+        ByteArrayInputStream bais = new ByteArrayInputStream(inputValues);
+
+        try (NonSeekableRandomAccessReadInputStream randomAccessSource =
+                new NonSeekableRandomAccessReadInputStream(bais))
+        {
+            Assertions.assertThrows(IOException.class,
+                    () -> randomAccessSource.createView(3, 5),
+                    "createView should have thrown an IOException");
+        }
+    }
+
+    /**
+     * Rewinding across the boundary of the current buffer has to switch to the former buffer.
+     */
+    @Test
+    void testBufferSwitch() throws IOException
+    {
+        byte[] original = createRandomData();
+
+        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(original);
+        try (RandomAccessRead rar =
+                new NonSeekableRandomAccessReadInputStream(byteArrayInputStream))
+        {
+            rar.skip(4098);
+            assertEquals(4098, rar.getPosition());
+            rar.rewind(4);
+            assertEquals(4094, rar.getPosition());
+            assertEquals(original[4094] & 0xFF, rar.read());
+        }
+    }
+
+    /**
+     * Rewinding has to throw an exception if the buffers don't hold enough bytes.
+     */
+    @Test
+    void testRewindException() throws IOException
+    {
+        ByteArrayInputStream byteArrayInputStream =
+                new ByteArrayInputStream(createRandomData());
+        try (RandomAccessRead rar =
+                new NonSeekableRandomAccessReadInputStream(byteArrayInputStream))
+        {
+            rar.skip(10000);
+            assertEquals(10000, rar.getPosition());
+            rar.rewind(4096);
+            assertEquals(5904, rar.getPosition());
+            Assertions.assertThrows(IOException.class, () -> rar.rewind(4096),
+                    "rewind should have thrown an IOException");
+        }
+    }
+
+    /**
+     * Creates between 10000 and 29999 bytes of test data consisting of alternating runs of
+     * pseudo-random and constant bytes.
+     */
+    private byte[] createRandomData()
+    {
+        final long seed = new Random().nextLong();
+        final Random random = new Random(seed);
+        final int numBytes = 10000 + random.nextInt(20000);
+        byte[] original = new byte[numBytes];
+
+        int upto = 0;
+        while (upto < numBytes)
+        {
+            final int left = numBytes - upto;
+            if (random.nextBoolean() || left < 2)
+            {
+                // Fill w/ pseudo-random bytes:
+                final int end = upto + Math.min(left, 10 + random.nextInt(100));
+                while (upto < end)
+                {
+                    original[upto++] = (byte) random.nextInt();
+                }
+            }
+            else
+            {
+                // Fill w/ very predictable bytes:
+                final int end = upto + Math.min(left, 2 + random.nextInt(10));
+                final byte value = (byte) random.nextInt(4);
+                while (upto < end)
+                {
+                    original[upto++] = value;
+                }
+            }
+        }
+        return original;
+    }
+
+    /**
+     * PDFBOX-5158: endless loop reading a stream of a multiple of 4096 bytes from a
+     * FileInputStream. Test does not fail with a ByteArrayInputStream, so we need to create a
+     * temp file.
+     *
+     * @throws IOException
+     */
+    @Test
+    void testPDFBOX5158() throws IOException
+    {
+        Path path = Files.createTempFile("len4096", ".pdf");
+        try
+        {
+            try (OutputStream os = Files.newOutputStream(path))
+            {
+                os.write(new byte[4096]);
+            }
+            assertEquals(4096, Files.size(path));
+            try (RandomAccessRead rar = new NonSeekableRandomAccessReadInputStream(
+                    Files.newInputStream(path)))
+            {
+                assertEquals(0, rar.read());
+            }
+        }
+        finally
+        {
+            // remove the temp file even if one of the assertions fails
+            Files.deleteIfExists(path);
+        }
+    }
+
+    /**
+     * PDFBOX-5161: failure to read bytes after reading a multiple of 4096. Construction source
+     * must be an InputStream.
+     *
+     * @throws IOException
+     */
+    @Test
+    void testPDFBOX5161() throws IOException
+    {
+        try (RandomAccessRead rar = new NonSeekableRandomAccessReadInputStream(
+                new ByteArrayInputStream(new byte[4099])))
+        {
+            byte[] buf = new byte[4096];
+            int bytesRead = rar.read(buf);
+            assertEquals(4096, bytesRead);
+            bytesRead = rar.read(buf, 0, 3);
+            assertEquals(3, bytesRead);
+        }
+    }
+
+}

Propchange: 
pdfbox/branches/3.0/io/src/test/java/org/apache/pdfbox/io/NonSeekableRandomAccessReadInputStreamTest.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to