Author: lehmi
Date: Sun Jun  2 07:24:04 2024
New Revision: 1918113

URL: http://svn.apache.org/viewvc?rev=1918113&view=rev
Log:
PDFBOX-5675: use NonSeekableRandomAccessReadInputStream for content streams to 
reduce the memory footprint

Modified:
    pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java
    pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
    pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java

Modified: 
pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java?rev=1918113&r1=1918112&r2=1918113&view=diff
==============================================================================
--- 
pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java
 (original)
+++ 
pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java
 Sun Jun  2 07:24:04 2024
@@ -48,6 +48,20 @@ public interface PDContentStream
     RandomAccessRead getContentsForRandomAccess() throws IOException;
 
     /**
+     * Returns this stream's content, if any.
+     * 
+     * The random access capabilities of the returned instance are supposed to be limited. Peek/rewind operations are
+     * limited to a small range of data and not the whole set of data. Seek operations aren't supported at all.
+     * 
+     * @return A RandomAccessRead or null.
+     * @throws IOException If the content could not be read
+     */
+    default RandomAccessRead getContentsForStreamParsing() throws IOException
+    {
+        return getContentsForRandomAccess();
+    }
+
+    /**
      * Returns this stream's resources, if any.
      * 
      * @return the resources of the content stream or null

Modified: 
pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1918113&r1=1918112&r2=1918113&view=diff
==============================================================================
--- 
pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
 (original)
+++ 
pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
 Sun Jun  2 07:24:04 2024
@@ -56,7 +56,7 @@ public class PDFStreamParser extends Bas
      */
     public PDFStreamParser(PDContentStream pdContentstream) throws IOException
     {
-        super(pdContentstream.getContentsForRandomAccess());
+        super(pdContentstream.getContentsForStreamParsing());
     }
 
     /**

Modified: 
pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java?rev=1918113&r1=1918112&r2=1918113&view=diff
==============================================================================
--- 
pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java 
(original)
+++ 
pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java 
Sun Jun  2 07:24:04 2024
@@ -35,6 +35,8 @@ import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSNumber;
 import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.filter.FlateFilterDecoderStream;
+import org.apache.pdfbox.io.NonSeekableRandomAccessReadInputStream;
 import org.apache.pdfbox.io.RandomAccessInputStream;
 import org.apache.pdfbox.io.RandomAccessRead;
 import org.apache.pdfbox.io.RandomAccessReadBuffer;
@@ -167,6 +169,33 @@ public class PDPage implements COSObject
     }
 
     @Override
+    public RandomAccessRead getContentsForStreamParsing() throws IOException
+    {
+        // return a stream based reader if there is just one stream
+        COSStream contentStream = page.getCOSStream(COSName.CONTENTS);
+        if (contentStream != null)
+        {
+            COSBase filter = contentStream.getFilters();
+            // for now only streams using a flate filter are supported
+            if (filter instanceof COSName && ((COSName) 
filter).equals(COSName.FLATE_DECODE))
+            {
+                try
+                {
+                    FlateFilterDecoderStream decoderStream = new 
FlateFilterDecoderStream(
+                            contentStream.createRawInputStream());
+                    return new 
NonSeekableRandomAccessReadInputStream(decoderStream);
+                }
+                catch (IOException exception)
+                {
+                    LOG.warn("skipped malformed content stream");
+                    return new RandomAccessReadBuffer(DELIMITER);
+                }
+            }
+        }
+        return getContentsForRandomAccess();
+    }
+
+    @Override
     public RandomAccessRead getContentsForRandomAccess() throws IOException
     {
         COSStream contentStream = page.getCOSStream(COSName.CONTENTS);


Reply via email to