Author: lehmi Date: Sun May 26 13:25:32 2024 New Revision: 1917978 URL: http://svn.apache.org/viewvc?rev=1917978&view=rev Log: PDFBOX-5675: use NonSeekableRandomAccessReadInputStream for content streams to reduce the memory footprint.
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java?rev=1917978&r1=1917977&r2=1917978&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDContentStream.java Sun May 26 13:25:32 2024 @@ -48,6 +48,20 @@ public interface PDContentStream RandomAccessRead getContentsForRandomAccess() throws IOException; /** + * Returns this stream's content, if any. + * + * The random access capabilities of the returned instance are supposed to be limited. Peek/rewind operations are + * limited to a small range of data and not the whole set of data. Seek operations aren't supported at all. + * + * @return A RandomAccessRead or null. + * @throws IOException If the content could not be read + */ + default RandomAccessRead getContentsForStreamParsing() throws IOException + { + return getContentsForRandomAccess(); + } + + /** + * Returns this stream's resources, if any. 
* * @return the resources of the content stream or null Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1917978&r1=1917977&r2=1917978&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Sun May 26 13:25:32 2024 @@ -56,7 +56,7 @@ public class PDFStreamParser extends Bas */ public PDFStreamParser(PDContentStream pdContentstream) throws IOException { - super(pdContentstream.getContentsForRandomAccess()); + super(pdContentstream.getContentsForStreamParsing()); } /** Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java?rev=1917978&r1=1917977&r2=1917978&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDPage.java Sun May 26 13:25:32 2024 @@ -35,9 +35,15 @@ import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSNumber; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.filter.DecodeOptions; +import org.apache.pdfbox.filter.Filter; +import org.apache.pdfbox.filter.FilterFactory; +import org.apache.pdfbox.filter.FlateFilterDecoderStream; import org.apache.pdfbox.io.RandomAccessInputStream; import org.apache.pdfbox.io.RandomAccessRead; import org.apache.pdfbox.io.RandomAccessReadBuffer; +import org.apache.pdfbox.io.NonSeekableRandomAccessReadInputStream; +import org.apache.pdfbox.io.RandomAccessReadView; 
import org.apache.pdfbox.io.SequenceRandomAccessRead; import org.apache.pdfbox.pdmodel.common.COSArrayList; import org.apache.pdfbox.pdmodel.common.COSObjectable; @@ -167,6 +173,33 @@ public class PDPage implements COSObject } @Override + public RandomAccessRead getContentsForStreamParsing() throws IOException + { + // return a stream based reader if there is just one stream + COSStream contentStream = page.getCOSStream(COSName.CONTENTS); + if (contentStream != null) + { + COSBase filter = contentStream.getFilters(); + // for now only streams using a flate filter are supported + if (filter instanceof COSName && ((COSName) filter).equals(COSName.FLATE_DECODE)) + { + try + { + FlateFilterDecoderStream decoderStream = new FlateFilterDecoderStream( + contentStream.createRawInputStream()); + return new NonSeekableRandomAccessReadInputStream(decoderStream); + } + catch (IOException exception) + { + LOG.warn("skipped malformed content stream"); + return new RandomAccessReadBuffer(DELIMITER); + } + } + } + return getContentsForRandomAccess(); + } + + @Override public RandomAccessRead getContentsForRandomAccess() throws IOException { COSStream contentStream = page.getCOSStream(COSName.CONTENTS);