[ https://issues.apache.org/jira/browse/PDFBOX-4319?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
simon steiner updated PDFBOX-4319: ---------------------------------- Description: Parsing 100000 page pdf is slow, how do i speed it up? {code:java} import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdfparser.PDFStreamParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class CheckPDF { public static void main(String[] args) throws IOException { PDDocument doc = PDDocument.load(new File("out.pdf")); for (int i=0; i<doc.getNumberOfPages(); i++) { System.out.println(i); PDPage page = doc.getPage(i); PDFStreamParser parser = new PDFStreamParser(page.getContents()); parser.parse(); List<Object> it = parser.getTokens(); List<COSBase> arguments = new ArrayList<COSBase>(); for (Object o : it) { if (o instanceof Operator) { Operator op = (Operator)o; if (op.getName().equals("Do")) { COSName name = (COSName) arguments.get(0); if (page.getResources().getXObject(name) == null) { throw new RuntimeException(name + " not found"); } } arguments.clear(); } else { arguments.add((COSBase)o); } } } doc.close(); } }{code} was: Parsing 100000 page pdf is slow, how do i speed it up? {code:java} import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdfparser.PDFStreamParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class CheckPDF { public static void main(String[] args) throws IOException { PDDocument doc = PDDocument.load(new File("out.pdf")); for (int i=0; i<doc.getNumberOfPages(); i++) { System.out.println(i); PDPage page = doc.getPage(i); PDFStreamParser parser = new PDFStreamParser(page.getContents()); parser.parse(); List<Object> it = parser.getTokens(); List<COSBase> arguments = new ArrayList<COSBase>(); for (Object o : it) { if (o instanceof Operator) { Operator op = (Operator)o; if (op.getName().equals("Do")) { COSName name = (COSName) arguments.get(0); if (page.getResources().getXObject(name) == null) { throw new RuntimeException(name + " not found"); } } arguments.clear(); } else { arguments.add((COSBase)o); } } } doc.close(); } }{code} > Parsing 100000 page pdf is slow > ------------------------------- > > Key: PDFBOX-4319 > URL: https://issues.apache.org/jira/browse/PDFBOX-4319 > Project: PDFBox > Issue Type: Bug > Components: Parsing > Reporter: simon steiner > Priority: Major > > Parsing 100000 page pdf is slow, how do i speed it up? > {code:java} > import org.apache.pdfbox.contentstream.operator.Operator; > import org.apache.pdfbox.cos.COSBase; > import org.apache.pdfbox.cos.COSName; > import org.apache.pdfbox.pdfparser.PDFStreamParser; > import org.apache.pdfbox.pdmodel.PDDocument; > import org.apache.pdfbox.pdmodel.PDPage; > import java.io.File; > import java.io.IOException; > import java.util.ArrayList; > import java.util.List; > public class CheckPDF { > public static void main(String[] args) throws IOException { > PDDocument doc = PDDocument.load(new File("out.pdf")); > for (int i=0; i<doc.getNumberOfPages(); i++) { > System.out.println(i); > PDPage page = doc.getPage(i); > PDFStreamParser parser = new PDFStreamParser(page.getContents()); > parser.parse(); > List<Object> it = parser.getTokens(); > List<COSBase> arguments = new ArrayList<COSBase>(); > for (Object o : it) { > if (o instanceof Operator) { > Operator op = (Operator)o; > if (op.getName().equals("Do")) { > COSName name = (COSName) arguments.get(0); > if (page.getResources().getXObject(name) == null) { > throw new RuntimeException(name + " not found"); > } > } > arguments.clear(); > } else { > arguments.add((COSBase)o); > } > } > } > doc.close(); > } > }{code} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@pdfbox.apache.org For additional commands, e-mail: dev-h...@pdfbox.apache.org