[ https://issues.apache.org/jira/browse/TIKA-1742?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Nathan Dire updated TIKA-1742: ------------------------------ Description: Here's the file: http://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf Code to repro ({{ExtractInlineImages}} must be true): {noformat} Parser parser = new PDFParser(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); PDFParserConfig config = new PDFParserConfig(); ContentHandler handler = new DefaultHandler(); config.setExtractInlineImages(true); config.setExtractUniqueInlineImagesOnly(false); context.set(PDFParserConfig.class, config); context.set(Parser.class, parser); InputStream is = new BufferedInputStream(new FileInputStream(args[0])); try { parser.parse(is, handler, metadata, context); } finally { is.close(); } {noformat} Error (infinite recursion in {{extractImages}}): {noformat} Exception in thread "main" java.lang.StackOverflowError at java.util.LinkedHashMap$Entry.addBefore(LinkedHashMap.java:340) at java.util.LinkedHashMap$Entry.access$600(LinkedHashMap.java:320) at java.util.LinkedHashMap.createEntry(LinkedHashMap.java:444) at java.util.HashMap.addEntry(HashMap.java:888) at java.util.LinkedHashMap.addEntry(LinkedHashMap.java:427) at java.util.HashMap.put(HashMap.java:509) at org.apache.pdfbox.cos.COSDictionary.setItem(COSDictionary.java:246) at org.apache.pdfbox.pdmodel.common.COSDictionaryMap.convert(COSDictionaryMap.java:206) at org.apache.pdfbox.pdmodel.PDResources.setXObjects(PDResources.java:331) at org.apache.pdfbox.pdmodel.PDResources.getXObjects(PDResources.java:269) at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:310) at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) {noformat} was: Here's the file: http://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf Code to repro (ExtractInlineImages must be true): {noformat} Parser parser = new PDFParser(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); PDFParserConfig config = new PDFParserConfig(); ContentHandler handler = new DefaultHandler(); config.setExtractInlineImages(true); config.setExtractUniqueInlineImagesOnly(false); context.set(PDFParserConfig.class, config); context.set(Parser.class, parser); InputStream is = new BufferedInputStream(new FileInputStream(args[0])); try { parser.parse(is, handler, metadata, context); } finally { is.close(); } {noformat} Error: {noformat} Exception in thread "main" java.lang.StackOverflowError at java.util.LinkedHashMap$Entry.addBefore(LinkedHashMap.java:340) at java.util.LinkedHashMap$Entry.access$600(LinkedHashMap.java:320) at java.util.LinkedHashMap.createEntry(LinkedHashMap.java:444) at java.util.HashMap.addEntry(HashMap.java:888) at java.util.LinkedHashMap.addEntry(LinkedHashMap.java:427) at java.util.HashMap.put(HashMap.java:509) at org.apache.pdfbox.cos.COSDictionary.setItem(COSDictionary.java:246) at org.apache.pdfbox.pdmodel.common.COSDictionaryMap.convert(COSDictionaryMap.java:206) at org.apache.pdfbox.pdmodel.PDResources.setXObjects(PDResources.java:331) at org.apache.pdfbox.pdmodel.PDResources.getXObjects(PDResources.java:269) at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:310) at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) {noformat} > StackOverflowError parsing a PDF with ExtractInlineImages=true > -------------------------------------------------------------- > > Key: TIKA-1742 > URL: https://issues.apache.org/jira/browse/TIKA-1742 > Project: Tika > Issue Type: Bug > Components: parser > Affects Versions: 1.10 > Reporter: Nathan Dire > > Here's the file: > http://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf > Code to repro ({{ExtractInlineImages}} must be true): > {noformat} > Parser parser = new PDFParser(); > Metadata metadata = new Metadata(); > ParseContext context = new ParseContext(); > PDFParserConfig config = new PDFParserConfig(); > ContentHandler handler = new DefaultHandler(); > config.setExtractInlineImages(true); > config.setExtractUniqueInlineImagesOnly(false); > context.set(PDFParserConfig.class, config); > context.set(Parser.class, parser); > InputStream is = new BufferedInputStream(new FileInputStream(args[0])); > try { > parser.parse(is, handler, metadata, context); > } finally { > is.close(); > } > {noformat} > Error (infinite recursion in {{extractImages}}): > {noformat} > Exception in thread "main" java.lang.StackOverflowError > at java.util.LinkedHashMap$Entry.addBefore(LinkedHashMap.java:340) > at java.util.LinkedHashMap$Entry.access$600(LinkedHashMap.java:320) > at java.util.LinkedHashMap.createEntry(LinkedHashMap.java:444) > at java.util.HashMap.addEntry(HashMap.java:888) > at java.util.LinkedHashMap.addEntry(LinkedHashMap.java:427) > at java.util.HashMap.put(HashMap.java:509) > at org.apache.pdfbox.cos.COSDictionary.setItem(COSDictionary.java:246) > at > org.apache.pdfbox.pdmodel.common.COSDictionaryMap.convert(COSDictionaryMap.java:206) > at > org.apache.pdfbox.pdmodel.PDResources.setXObjects(PDResources.java:331) > at > org.apache.pdfbox.pdmodel.PDResources.getXObjects(PDResources.java:269) > at > org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:310) > at > org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) > at > org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) > at > org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) > at > org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:319) > {noformat} -- This message was sent by Atlassian JIRA (v6.3.4#6332)