[ https://issues.apache.org/jira/browse/PDFBOX-1202?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ilija Pavlic updated PDFBOX-1202: --------------------------------- Priority: Critical (was: Minor) Description: Error "org.apache.pdfbox.filter.FlateFilter decode SEVERE: Stop reading corrupt stream" thrown when extracting text. The error is thrown at: - page 397 if the page loop starts at zero -- for (int i = 0; i < allPages.size(); i++) - page 790 if the loop starts at 395 (that would make it approx. 397 pages from the beginning of the loop) - page 848 if the loop starts at 450 (that would make it approx. 397 pages from the beginning of the loop) The error is not thrown if: - the loop starts at page 452 or later - the loop starts at 0 and ends before 396 - the loop starts at 200 and ends before 595 Therefore I suspect that a loop spanning more than 396 pages will throw an error. Is that an indication of a memory leak of some sort? Here is the full code: package transhotel.pdf.iata; import java.awt.geom.Rectangle2D; import java.io.IOException; import java.util.List; import org.apache.pdfbox.exceptions.COSVisitorException; import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.exceptions.InvalidPasswordException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.util.PDFTextStripperByArea; public class Main { public static void main(String[] args) throws IOException, COSVisitorException, CryptographyException { PDDocument document = null; try { document = PDDocument.load("/Users/ilijapavlic/Desktop/IATAUnitedStates.pdf"); if (document.isEncrypted()) { try { document.decrypt(""); } catch (InvalidPasswordException e) { System.err.println("Error: Document is encrypted with a password."); System.exit(1); } } float x = 55f; float y = 40f; float width = 168.5f; float height = 689f; float evenOffset = -10f; List allPages = document.getDocumentCatalog().getAllPages(); for (int i = 0; i < allPages.size(); i++) { System.out.println("Page " + i); PDPage page = (PDPage) allPages.get(i); 
PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); for (int j = 0; j < 3; j++) { if (i % 2 == 0) { Rectangle2D.Float region = new Rectangle2D.Float(x, y, width*3, height); stripper.addRegion("region", region); } else { Rectangle2D.Float region = new Rectangle2D.Float(x + evenOffset, y, width*3, height); stripper.addRegion("region", region); } } stripper.extractRegions(page); for (String regionName : stripper.getRegions()) { stripper.getTextForRegion(regionName); } } } catch(Exception e) { e.printStackTrace(); } finally { if (document != null) { document.close(); } } } } was: Error "org.apache.pdfbox.filter.FlateFilter decode SEVERE: Stop reading corrupt stream" thrown when extracting text. The document was loaded with the following snippet: document = PDDocument.load("C:/Users/ilija.pavlic/Downloads/TestInput.pdf"); if (document.isEncrypted()) { try { document.decrypt(""); } catch (InvalidPasswordException e) { System.err.println("Error: Document is encrypted with a password."); System.exit(1); } } Environment: Mac OS X 10.7.2 Updated to include new obtained information. Full code included. > org.apache.pdfbox.filter.FlateFilter decode SEVERE: Stop reading corrupt > stream > ------------------------------------------------------------------------------- > > Key: PDFBOX-1202 > URL: https://issues.apache.org/jira/browse/PDFBOX-1202 > Project: PDFBox > Issue Type: Bug > Components: Text extraction > Affects Versions: 1.6.0 > Environment: Mac OS X 10.7.2 > Reporter: Ilija Pavlic > Priority: Critical > Attachments: IATAUnitedStates.pdf > > > Error "org.apache.pdfbox.filter.FlateFilter decode SEVERE: Stop reading > corrupt stream" thrown when extracting text. > The error is thrown at: > - page 397 if the page loop starts at zero -- for (int i = 0; i < > allPages.size(); i++) > - page 790 if the loop starts at 395 (that would make it approx. 
397 pages > from the beginning of the loop) > - page 848 if the loop starts at 450 (that would make it approx. 397 pages > from the beginning of the loop) > The error is not thrown if: > - the loop starts at page 452 or later > - the loop starts at 0 and ends before 396 > - the loop starts at 200 and ends before 595 > Therefore I suspect that a loop spanning more than 396 pages will throw an > error. Is that an indication of a memory leak of some sort? > Here is the full code: > package transhotel.pdf.iata; > import java.awt.geom.Rectangle2D; > import java.io.IOException; > import java.util.List; > import org.apache.pdfbox.exceptions.COSVisitorException; > import org.apache.pdfbox.exceptions.CryptographyException; > import org.apache.pdfbox.exceptions.InvalidPasswordException; > import org.apache.pdfbox.pdmodel.PDDocument; > import org.apache.pdfbox.pdmodel.PDPage; > import org.apache.pdfbox.util.PDFTextStripperByArea; > public class Main { > public static void main(String[] args) throws IOException, > COSVisitorException, CryptographyException { > > PDDocument document = null; > try { > document = > PDDocument.load("/Users/ilijapavlic/Desktop/IATAUnitedStates.pdf"); > if (document.isEncrypted()) { > try { > document.decrypt(""); > } catch (InvalidPasswordException e) { > System.err.println("Error: Document is > encrypted with a password."); > System.exit(1); > } > } > float x = 55f; > float y = 40f; > float width = 168.5f; > float height = 689f; > float evenOffset = -10f; > List allPages = > document.getDocumentCatalog().getAllPages(); > for (int i = 0; i < allPages.size(); i++) { > System.out.println("Page " + i); > PDPage page = (PDPage) allPages.get(i); > PDFTextStripperByArea stripper = new > PDFTextStripperByArea(); > stripper.setSortByPosition(true); > for (int j = 0; j < 3; j++) > { > if (i % 2 == 0) { > Rectangle2D.Float region = new > Rectangle2D.Float(x, y, width*3, height); > stripper.addRegion("region", > region); > } > else { > Rectangle2D.Float region 
= new > Rectangle2D.Float(x + evenOffset, y, width*3, height); > stripper.addRegion("region", > region); > } > } > stripper.extractRegions(page); > for (String regionName : stripper.getRegions()) > { > stripper.getTextForRegion(regionName); > } > } > } > > catch(Exception e) { > e.printStackTrace(); > } > finally { > if (document != null) { > document.close(); > } > } > } > } -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators: https://issues.apache.org/jira/secure/ContactAdministrators!default.jspa For more information on JIRA, see: http://www.atlassian.com/software/jira