Author: lehmi
Date: Sun May 26 13:25:32 2024
New Revision: 1917978

URL: http://svn.apache.org/viewvc?rev=1917978&view=rev
Log:
PDFBOX-5675: use NonSeekableRandomAccessReadInputStream for content streams to 
reduce the memory footprint.

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java?rev=1917978&r1=1917977&r2=1917978&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java Sun May 26 13:25:32 2024
@@ -48,6 +48,20 @@ public interface PDContentStream
     RandomAccessRead getContentsForRandomAccess() throws IOException;
 
     /**
+     * Returns this stream's content, if any.
+     * 
+     * The random access capabilities of the returned instance is supposed to be limited. Peek/rewind operations are
+     * limited to a small range of data and not the whole set of data. Seek operations aren't supported at all.
+     * 
+     * @return A RandomAccessRead or null.
+     * @throws IOException If the content could not be read
+     */
+    default RandomAccessRead getContentsForStreamParsing() throws IOException
+    {
+        return getContentsForRandomAccess();
+    }
+
+    /**
      * Returns this stream's resources, if any.
      * 
      * @return the resources of the content stream or null

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1917978&r1=1917977&r2=1917978&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Sun May 26 13:25:32 2024
@@ -56,7 +56,7 @@ public class PDFStreamParser extends Bas
      */
     public PDFStreamParser(PDContentStream pdContentstream) throws IOException
     {
-        super(pdContentstream.getContentsForRandomAccess());
+        super(pdContentstream.getContentsForStreamParsing());
     }
 
     /**

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java?rev=1917978&r1=1917977&r2=1917978&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java Sun May 26 13:25:32 2024
@@ -35,9 +35,15 @@ import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSNumber;
 import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.filter.DecodeOptions;
+import org.apache.pdfbox.filter.Filter;
+import org.apache.pdfbox.filter.FilterFactory;
+import org.apache.pdfbox.filter.FlateFilterDecoderStream;
 import org.apache.pdfbox.io.RandomAccessInputStream;
 import org.apache.pdfbox.io.RandomAccessRead;
 import org.apache.pdfbox.io.RandomAccessReadBuffer;
+import org.apache.pdfbox.io.NonSeekableRandomAccessReadInputStream;
+import org.apache.pdfbox.io.RandomAccessReadView;
 import org.apache.pdfbox.io.SequenceRandomAccessRead;
 import org.apache.pdfbox.pdmodel.common.COSArrayList;
 import org.apache.pdfbox.pdmodel.common.COSObjectable;
@@ -167,6 +173,33 @@ public class PDPage implements COSObject
     }
 
     @Override
+    public RandomAccessRead getContentsForStreamParsing() throws IOException
+    {
+        // return a stream based reader if there is just one stream
+        COSStream contentStream = page.getCOSStream(COSName.CONTENTS);
+        if (contentStream != null)
+        {
+            COSBase filter = contentStream.getFilters();
+            // for now only streams using a flate filter are supported
+            if (filter instanceof COSName && ((COSName) filter).equals(COSName.FLATE_DECODE))
+            {
+                try
+                {
+                    FlateFilterDecoderStream decoderStream = new FlateFilterDecoderStream(
+                            contentStream.createRawInputStream());
+                    return new NonSeekableRandomAccessReadInputStream(decoderStream);
+                }
+                catch (IOException exception)
+                {
+                    LOG.warn("skipped malformed content stream");
+                    return new RandomAccessReadBuffer(DELIMITER);
+                }
+            }
+        }
+        return getContentsForRandomAccess();
+    }
+
+    @Override
     public RandomAccessRead getContentsForRandomAccess() throws IOException
     {
         COSStream contentStream = page.getCOSStream(COSName.CONTENTS);


Reply via email to