Author: lehmi Date: Sun Jun 2 07:24:04 2024 New Revision: 1918113 URL: http://svn.apache.org/viewvc?rev=1918113&view=rev Log: PDFBOX-5675: use NonSeekableRandomAccessReadInputStream for content streams to reduce the memory footprint
Modified: pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java Modified: pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java URL: http://svn.apache.org/viewvc/pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java?rev=1918113&r1=1918112&r2=1918113&view=diff ============================================================================== --- pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java (original) +++ pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java Sun Jun 2 07:24:04 2024 @@ -48,6 +48,20 @@ public interface PDContentStream RandomAccessRead getContentsForRandomAccess() throws IOException; /** + * Returns this stream's content, if any. + * + * The random access capabilities of the returned instance are supposed to be limited. Peek/rewind operations are + * limited to a small range of data and not the whole set of data. Seek operations aren't supported at all. + * + * @return A RandomAccessRead or null. + * @throws IOException If the content could not be read + */ + default RandomAccessRead getContentsForStreamParsing() throws IOException + { + return getContentsForRandomAccess(); + } + + /** * Returns this stream's resources, if any. 
* * @return the resources of the content stream or null Modified: pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java URL: http://svn.apache.org/viewvc/pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1918113&r1=1918112&r2=1918113&view=diff ============================================================================== --- pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original) +++ pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Sun Jun 2 07:24:04 2024 @@ -56,7 +56,7 @@ public class PDFStreamParser extends Bas */ public PDFStreamParser(PDContentStream pdContentstream) throws IOException { - super(pdContentstream.getContentsForRandomAccess()); + super(pdContentstream.getContentsForStreamParsing()); } /** Modified: pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java URL: http://svn.apache.org/viewvc/pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java?rev=1918113&r1=1918112&r2=1918113&view=diff ============================================================================== --- pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java (original) +++ pdfbox/branches/3.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java Sun Jun 2 07:24:04 2024 @@ -35,6 +35,8 @@ import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSNumber; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.filter.FlateFilterDecoderStream; +import org.apache.pdfbox.io.NonSeekableRandomAccessReadInputStream; import org.apache.pdfbox.io.RandomAccessInputStream; import org.apache.pdfbox.io.RandomAccessRead; import org.apache.pdfbox.io.RandomAccessReadBuffer; @@ -167,6 +169,33 @@ public class PDPage implements COSObject } @Override + public RandomAccessRead getContentsForStreamParsing() throws 
IOException + { + // return a stream based reader if there is just one stream + COSStream contentStream = page.getCOSStream(COSName.CONTENTS); + if (contentStream != null) + { + COSBase filter = contentStream.getFilters(); + // for now only streams using a flate filter are supported + if (filter instanceof COSName && ((COSName) filter).equals(COSName.FLATE_DECODE)) + { + try + { + FlateFilterDecoderStream decoderStream = new FlateFilterDecoderStream( + contentStream.createRawInputStream()); + return new NonSeekableRandomAccessReadInputStream(decoderStream); + } + catch (IOException exception) + { + LOG.warn("skipped malformed content stream"); + return new RandomAccessReadBuffer(DELIMITER); + } + } + } + return getContentsForRandomAccess(); + } + + @Override public RandomAccessRead getContentsForRandomAccess() throws IOException { COSStream contentStream = page.getCOSStream(COSName.CONTENTS);