simon steiner created PDFBOX-4319: ------------------------------------- Summary: Parsing 100000 page pdf is slow Key: PDFBOX-4319 URL: https://issues.apache.org/jira/browse/PDFBOX-4319 Project: PDFBox Issue Type: Bug Components: Parsing Reporter: simon steiner
Parsing 100000 page pdf is slow, how do i speed it up? {code:java} import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdfparser.PDFStreamParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class CheckPDF { public static void main(String[] args) throws IOException { PDDocument doc = PDDocument.load(new File("out.pdf")); for (int i=0; i<doc.getNumberOfPages(); i++) { System.out.println(i); PDPage page = doc.getPage(i); PDFStreamParser parser = new PDFStreamParser(page.getContents()); parser.parse(); List<Object> it = parser.getTokens(); List<COSBase> arguments = new ArrayList<COSBase>(); for (Object o : it) { if (o instanceof Operator) { Operator op = (Operator)o; if (op.getName().equals("Do")) { COSName name = (COSName) arguments.get(0); if (page.getResources().getXObject(name) == null) { throw new RuntimeException(name + " not found"); } } arguments.clear(); } else { arguments.add((COSBase)o); } } } doc.close(); } }{code} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@pdfbox.apache.org For additional commands, e-mail: dev-h...@pdfbox.apache.org