[ https://issues.apache.org/jira/browse/PDFBOX-2493?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
John Hewson updated PDFBOX-2493: -------------------------------- Priority: Major (was: Blocker) > OOM with corrupt PDF file > ------------------------- > > Key: PDFBOX-2493 > URL: https://issues.apache.org/jira/browse/PDFBOX-2493 > Project: PDFBox > Issue Type: Bug > Components: Text extraction > Affects Versions: 1.8.6 > Environment: Linux, JVM 1.8.0_25 (64-bit) > Reporter: Alan Burlison > > I have a large archive of PDF files, some of which are unfortunately corrupt. > I'm scanning them using a webapp and Tika, which in turn uses PDFBox. I have > one file which results in errors in Tika 1.4 & 1.5; with Tika 1.6 (which > uses PDFBox 1.8.6), as well as causing errors, it also causes PDFBox to consume > ~4GB of heap before descending into a GC death-spiral. Unfortunately I can't > provide the PDF file that causes this as the contents are confidential. As > Tika/PDFBox are being used from inside a webapp, I can cope with errors being > thrown, but the OOM caused by 1.8.6 is a blocker and I've had to revert to > Tika 1.5, which in turn uses PDFBox 1.8.4. 
> ERROR - FlateFilter: stop reading corrupt stream due to a DataFormatException > ERROR - FlateFilter: stop reading corrupt stream due to a DataFormatException > INFO - unsupported/disabled operation: > > WARN - java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > at java.util.ArrayList.rangeCheck(ArrayList.java:653) > at java.util.ArrayList.get(ArrayList.java:429) > at org.apache.pdfbox.util.operator.ShowText.process(ShowText.java:44) > at > org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:557) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:268) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235) > at > org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215) > at > org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460) > at > org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385) > at > org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:130) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:159) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:121) > at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:143) > at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:422) > at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:113) > INFO - unsupported/disabled operation: B110EBE04050412 > WARN - java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > at java.util.ArrayList.rangeCheck(ArrayList.java:653) > at java.util.ArrayList.get(ArrayList.java:429) > at 
org.apache.pdfbox.util.operator.ShowText.process(ShowText.java:44) > at > org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:557) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:268) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235) > at > org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215) > at > org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460) > at > org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385) > at > org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:130) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:159) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:121) > at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:143) > at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:422) > at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:113) > INFO - unsupported/disabled operation: > B0F0F07100D05050603140D10093E0903DB06050E3C0405D > INFO - unsupported/disabled operation: E > INFO - unsupported/disabled operation: C > INFO - unsupported/disabled operation: > B051A0E0C0E130B060B0C0D050640750D020E0D050DE506400C13010B050271 > INFO - unsupported/disabled operation: A > INFO - unsupported/disabled operation: D > INFO - unsupported/disabled operation: B100 > WARN - java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > at java.util.ArrayList.rangeCheck(ArrayList.java:653) > at java.util.ArrayList.get(ArrayList.java:429) > at org.apache.pdfbox.util.operator.ShowText.process(ShowText.java:44) > at > 
org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:557) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:268) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235) > at > org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215) > at > org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460) > at > org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385) > at > org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:130) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:159) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:121) > at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:143) > at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:422) > at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:113) > INFO - unsupported/disabled operation: B5 > WARN - java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > at java.util.ArrayList.rangeCheck(ArrayList.java:653) > at java.util.ArrayList.get(ArrayList.java:429) > at org.apache.pdfbox.util.operator.MoveText.process(MoveText.java:41) > at > org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:557) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:268) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235) > at > org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215) > at > org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460) > at > 
org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385) > at > org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:130) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:159) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:121) > at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:143) > at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:422) > at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:113) > INFO - unsupported/disabled operation: > B020903110B06050E0F051E0C67A31C05340D000E0D03070C05074 > INFO - unsupported/disabled operation: B11160B1005 > INFO - unsupported/disabled operation: FB > INFO - unsupported/disabled operation: B230C0B12 > WARN - java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > at java.util.ArrayList.rangeCheck(ArrayList.java:653) > at java.util.ArrayList.get(ArrayList.java:429) > at org.apache.pdfbox.util.operator.ShowText.process(ShowText.java:44) > at > org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:557) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:268) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235) > at > org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215) > at > org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460) > at > org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385) > at > org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:130) > at 
org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:159) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:121) > at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:143) > at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:422) > at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:113) > INFO - unsupported/disabled operation: DE > INFO - unsupported/disabled operation: B10050E1F0506AE080B230C0B1419C50E3C0 > INFO - unsupported/disabled operation: B05650A09010D0B3F1103B > WARN - java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > at java.util.ArrayList.rangeCheck(ArrayList.java:653) > at java.util.ArrayList.get(ArrayList.java:429) > at org.apache.pdfbox.util.operator.ShowText.process(ShowText.java:44) > at > org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:557) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:268) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235) > at > org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215) > at > org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460) > at > org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385) > at > org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:130) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:159) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:121) > at 
org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:143) > at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:422) > at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:113) > INFO - unsupported/disabled operation: B0C0D2419 > INFO - unsupported/disabled operation: B040503020B0 > WARN - java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > at java.util.ArrayList.rangeCheck(ArrayList.java:653) > at java.util.ArrayList.get(ArrayList.java:429) > at org.apache.pdfbox.util.operator.ShowText.process(ShowText.java:44) > at > org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:557) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:268) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235) > at > org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215) > at > org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460) > at > org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385) > at > org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:130) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:159) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:121) > at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:143) > at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:422) > at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:113) > ERROR - FlateFilter: stop reading corrupt stream due to a DataFormatException > ERROR - FlateFilter: stop reading corrupt stream due to a DataFormatException > ERROR - FlateFilter: stop reading corrupt stream due to a DataFormatException > ERROR - 
error: array index out of bounds > java.lang.ArrayIndexOutOfBoundsException: 3337 > at > org.apache.fontbox.ttf.GlyfSimpleDescript.readFlags(GlyfSimpleDescript.java:199) > at > org.apache.fontbox.ttf.GlyfSimpleDescript.<init>(GlyfSimpleDescript.java:78) > at org.apache.fontbox.ttf.GlyphData.initData(GlyphData.java:57) > at org.apache.fontbox.ttf.GlyphTable.initData(GlyphTable.java:69) > at > org.apache.fontbox.ttf.TrueTypeFont.initializeTable(TrueTypeFont.java:280) > at > org.apache.fontbox.ttf.AbstractTTFParser.parseTables(AbstractTTFParser.java:128) > at org.apache.fontbox.ttf.TTFParser.parseTables(TTFParser.java:80) > at > org.apache.fontbox.ttf.AbstractTTFParser.parseTTF(AbstractTTFParser.java:109) > at org.apache.fontbox.ttf.TTFParser.parseTTF(TTFParser.java:25) > at > org.apache.fontbox.ttf.AbstractTTFParser.parseTTF(AbstractTTFParser.java:84) > at org.apache.fontbox.ttf.TTFParser.parseTTF(TTFParser.java:25) > at > org.apache.pdfbox.pdmodel.font.PDTrueTypeFont.getTTFFont(PDTrueTypeFont.java:632) > at > org.apache.pdfbox.pdmodel.font.PDTrueTypeFont.getFontWidth(PDTrueTypeFont.java:673) > at > org.apache.pdfbox.pdmodel.font.PDSimpleFont.getFontWidth(PDSimpleFont.java:233) > at > org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEngine.java:411) > at org.apache.pdfbox.util.operator.ShowText.process(ShowText.java:45) > at > org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:557) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:268) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235) > at > org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215) > at > org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460) > at > org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385) > at > org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344) > at 
org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:130) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:159) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:121) > at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:143) > at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:422) > at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:113) > Exception in thread "main" java.lang.OutOfMemoryError: GC overhead limit > exceeded > at > org.apache.fontbox.ttf.GlyfCompositeDescript.<init>(GlyfCompositeDescript.java:58) > at org.apache.fontbox.ttf.GlyphData.initData(GlyphData.java:62) > at org.apache.fontbox.ttf.GlyphTable.initData(GlyphTable.java:69) > at > org.apache.fontbox.ttf.TrueTypeFont.initializeTable(TrueTypeFont.java:280) > at > org.apache.fontbox.ttf.AbstractTTFParser.parseTables(AbstractTTFParser.java:128) > at org.apache.fontbox.ttf.TTFParser.parseTables(TTFParser.java:80) > at > org.apache.fontbox.ttf.AbstractTTFParser.parseTTF(AbstractTTFParser.java:109) > at org.apache.fontbox.ttf.TTFParser.parseTTF(TTFParser.java:25) > at > org.apache.fontbox.ttf.AbstractTTFParser.parseTTF(AbstractTTFParser.java:84) > at org.apache.fontbox.ttf.TTFParser.parseTTF(TTFParser.java:25) > at > org.apache.pdfbox.pdmodel.font.PDTrueTypeFont.getTTFFont(PDTrueTypeFont.java:632) > at > org.apache.pdfbox.pdmodel.font.PDTrueTypeFont.getFontWidth(PDTrueTypeFont.java:673) > at > org.apache.pdfbox.pdmodel.font.PDSimpleFont.getFontWidth(PDSimpleFont.java:233) > at > org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEngine.java:411) > at org.apache.pdfbox.util.operator.ShowText.process(ShowText.java:45) > at > org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:557) > at > 
org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:268) > at > org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235) > at > org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215) > at > org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:460) > at > org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:385) > at > org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:344) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:130) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:159) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244) > at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:121) > at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:143) > at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:422) > at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:113) -- This message was sent by Atlassian JIRA (v6.3.4#6332)