[ https://issues.apache.org/jira/browse/PDFBOX-4319?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16621824#comment-16621824 ]
simon steiner commented on PDFBOX-4319: --------------------------------------- Should getPage be deprecated? > Parsing 100000 page pdf is slow > ------------------------------- > > Key: PDFBOX-4319 > URL: https://issues.apache.org/jira/browse/PDFBOX-4319 > Project: PDFBox > Issue Type: Bug > Components: Parsing > Affects Versions: 3.0.0 PDFBox > Reporter: simon steiner > Priority: Major > Attachments: out.pdf.zip > > > Parsing a 100000-page PDF is slow. How do I speed it up? > {code:java} > import org.apache.pdfbox.contentstream.operator.Operator; > import org.apache.pdfbox.cos.COSBase; > import org.apache.pdfbox.cos.COSName; > import org.apache.pdfbox.pdfparser.PDFStreamParser; > import org.apache.pdfbox.pdmodel.PDDocument; > import org.apache.pdfbox.pdmodel.PDPage; > import java.io.File; > import java.io.IOException; > import java.util.ArrayList; > import java.util.List; > public class CheckPDF { > public static void main(String[] args) throws IOException { > PDDocument doc = PDDocument.load(new File("out.pdf")); > for (int i=0; i<doc.getNumberOfPages(); i++) { > System.out.println(i); > PDPage page = doc.getPage(i); > PDFStreamParser parser = new PDFStreamParser(page.getContents()); > parser.parse(); > List<Object> it = parser.getTokens(); > List<COSBase> arguments = new ArrayList<COSBase>(); > for (Object o : it) { > if (o instanceof Operator) { > Operator op = (Operator)o; > if (op.getName().equals("Do")) { > COSName name = (COSName) arguments.get(0); > if (page.getResources().getXObject(name) == null) { > throw new RuntimeException(name + " not found"); > } > } > arguments.clear(); > } else { > arguments.add((COSBase)o); > } > } > } > doc.close(); > } > }{code} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@pdfbox.apache.org For additional commands, e-mail: dev-h...@pdfbox.apache.org