Try the iterator of document.getPages().
--- Original-Nachricht --- Von: simon steiner (JIRA) Betreff: [jira] [Created] (PDFBOX-4319) Parsing 100000 page pdf is slow Datum: 20.09.2018, 11:18 Uhr An: dev@pdfbox.apache.org simon steiner created PDFBOX-4319: ------------------------------------- Summary: Parsing 100000 page pdf is slow Key: PDFBOX-4319 URL: https://issues.apache.org/jira/browse/PDFBOX-4319 <https://issues.apache.org/jira/browse/PDFBOX-4319> Project: PDFBox Issue Type: Bug Components: Parsing Reporter: simon steiner Parsing 100000 page pdf is slow, how do i speed it up? {code:java} import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdfparser.PDFStreamParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class CheckPDF { public static void main(String[] args) throws IOException { PDDocument doc = PDDocument.load(new File("out.pdf")); for (int i=0; i<doc.getNumberOfPages(); i++) { System.out.println(i); PDPage page = doc.getPage(i); PDFStreamParser parser = new PDFStreamParser(page.getContents()); parser.parse(); List<Object> it = parser.getTokens(); List<COSBase> arguments = new ArrayList<COSBase>(); for (Object o : it) { if (o instanceof Operator) { Operator op = (Operator)o; if (op.getName().equals("Do")) { COSName name = (COSName) arguments.get(0); if (page.getResources().getXObject(name
) == null) { throw new RuntimeException(name + " not found"); } } arguments.clear(); } else { arguments.add((COSBase)o); } } } doc.close(); } }{code} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@pdfbox.apache.org <mailto:dev-unsubscr...@pdfbox.apache.org> For additional commands, e-mail: dev-h...@pdfbox.apache.org <mailto:dev-h...@pdfbox.apache.org>