[ https://issues.apache.org/jira/browse/TIKA-3720?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17524386#comment-17524386 ]
denisn commented on TIKA-3720: ------------------------------ I guess the problem is only in 1.x starting with 1.23. 1.23 and 1.28.1 are failing right after startup and not extracting anything from my PDF. My parser extends the ParserDecorator at some point: {code:java} private class TimeoutParser(parser: Parser) extends ParserDecorator(parser) private class CEParser extends AutoDetectParser { override protected def getParser(metadata: Metadata, context: ParseContext): Parser = { val parser = super.getParser(metadata, context) val p = if (parser.isInstanceOf[DefaultParser]) new DFP().getParserPublic(metadata, context) else parser new TimeoutParser(parser) } } private val parser = new CEParser() val contentParser = new AbstractParser { def parse = parser.parse } val currentContext = new ParseContext currentContext.set(classOf[TesseractOCRConfig], ocrConfig) currentContext.set(classOf[PDFParserConfig], pdfConfig) currentContext.set(classOf[Parser], contentParser){code} 2.x is not failing at startup (I just didn't wait long enough to get the results). > IllegalArgumentException in PDF parser > -------------------------------------- > > Key: TIKA-3720 > URL: https://issues.apache.org/jira/browse/TIKA-3720 > Project: Tika > Issue Type: Bug > Affects Versions: 1.23 > Environment: Fedora 36, Java 11, Scala 2.13.4, Tika 1.28.1 > Reporter: denisn > Priority: Major > Attachments: test.pdf > > > Tika packages: > {code:java} > "org.apache.tika" % "tika" % 1.28.1 > "org.apache.tika" % "tika-core" % 1.28.1 > "org.apache.tika" % "tika-parsers" % 1.28.1 > "org.apache.poi" % "poi" % "4.0.1" > "org.apache.poi" % "poi-ooxml" % "4.0.1"{code} > It seems to work fine in 1.22 but in 1.23 and all following versions there is > an error. I've attached the PDF file which I've tested. 
> Exception text: > {code:java} > java.lang.IllegalArgumentException > at org.apache.xerces.jaxp.DocumentBuilderFactoryImpl.setAttribute(Unknown > Source) > at > org.apache.tika.utils.XMLReaderUtils.trySetXercesSecurityManager(XMLReaderUtils.java:721) > at > org.apache.tika.utils.XMLReaderUtils.getDocumentBuilderFactory(XMLReaderUtils.java:289) > at > org.apache.tika.utils.XMLReaderUtils.getDocumentBuilder(XMLReaderUtils.java:305) > at > org.apache.tika.parser.external.ExternalParsersConfigReader.read(ExternalParsersConfigReader.java:58) > at > org.apache.tika.parser.external.ExternalParsersFactory.create(ExternalParsersFactory.java:67) > at > org.apache.tika.parser.external.ExternalParsersFactory.create(ExternalParsersFactory.java:59) > at > org.apache.tika.parser.external.ExternalParsersFactory.create(ExternalParsersFactory.java:49) > at > org.apache.tika.parser.external.ExternalParsersFactory.create(ExternalParsersFactory.java:44) > at > org.apache.tika.parser.external.CompositeExternalParser.<init>(CompositeExternalParser.java:44) > at > org.apache.tika.parser.external.CompositeExternalParser.<init>(CompositeExternalParser.java:37) > at > java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native > Method) > at > java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) > at > java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) > at > java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490) > at java.base/java.lang.Class.newInstance(Class.java:584) > at > org.apache.tika.config.ServiceLoader.loadStaticServiceProviders(ServiceLoader.java:358) > at > org.apache.tika.parser.DefaultParser.getDefaultParsers(DefaultParser.java:55) > at org.apache.tika.parser.DefaultParser.<init>(DefaultParser.java:85) > at org.apache.tika.parser.DefaultParser.<init>(DefaultParser.java:100) > at 
org.apache.tika.parser.DefaultParser.<init>(DefaultParser.java:112) > at org.apache.tika.parser.DefaultParser.<init>(DefaultParser.java:116) > at test.Main$DFP.<init>(Main.scala:55) > at test.Main$CEParser.getParser(Main.scala:75) > at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:269) > at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143) > at test.Main$.parseNode(Main.scala:194) > at test.Main$$anon$1.parse(Main.scala:151) > at org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:72) > at > org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:104) > at > org.apache.tika.parser.pdf.ImageGraphicsEngine.processImage(ImageGraphicsEngine.java:321) > at > org.apache.tika.parser.pdf.ImageGraphicsEngine.drawImage(ImageGraphicsEngine.java:182) > at > org.apache.pdfbox.contentstream.operator.graphics.DrawObject.process(DrawObject.java:67) > at > org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:939) > at > org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514) > at > org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492) > at > org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:155) > at > org.apache.tika.parser.pdf.ImageGraphicsEngine.run(ImageGraphicsEngine.java:128) > at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:159) > at org.apache.tika.parser.pdf.PDF2XHTML.endPage(PDF2XHTML.java:139) > at > org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:365) > at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:127) > at > org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:985) > at > org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:238) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:98) > at 
org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:177) > at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:281) > at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188) > at test.Main$TimeoutParser.super$parse(Main.scala:67) > at test.Main$TimeoutParser.$anonfun$parse$1(Main.scala:67) > at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) > at > cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:104) > at > cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:463) > at > cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:484) > at > cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:422) > at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36) > at > java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1426) > at > java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) > at > java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) > at > java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) > at > java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) > at > java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183)Error: > org.apache.tika.exception.TikaException: Unexpected RuntimeException from > org.apache.tika.parser.pdf.PDFParser@268b50a0 > org.apache.tika.exception.TikaException: Unexpected RuntimeException from > org.apache.tika.parser.pdf.PDFParser@268b50a0 > at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:297) > at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188) > at test.Main$TimeoutParser.super$parse(Main.scala:67) > at test.Main$TimeoutParser.$anonfun$parse$1(Main.scala:67) > at unsafeRunSync @ test.Main$TimeoutParser.parse(Main.scala:68) > Caused by: java.lang.NullPointerException > at > 
org.apache.tika.parser.pdf.AbstractPDF2XHTML.doOCROnCurrentPage(AbstractPDF2XHTML.java:450) > at > org.apache.tika.parser.pdf.AbstractPDF2XHTML.endPage(AbstractPDF2XHTML.java:557) > at org.apache.tika.parser.pdf.PDF2XHTML.endPage(PDF2XHTML.java:143) > at > org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:365) > at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:127) > at > org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:985) > at > org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:238) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:98) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:177) > at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:281) > at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188) > at test.Main$TimeoutParser.super$parse(Main.scala:67) > at test.Main$TimeoutParser.$anonfun$parse$1(Main.scala:67) > at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) > at > cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:104) > at > cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:463) > at > cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:484) > at > cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:422) > at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36) > at > java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1426) > at > java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) > at > java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) > at > java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) > at > java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) > at > 
java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183) > {code} > > In 2.3.0 it still gives me an error but the extraction seems to work after > all: > {code:java} > Error: org.apache.tika.exception.TikaException: Unable to extract PDF content > org.apache.tika.exception.TikaException: Unable to extract PDF content > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:119) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:174) > at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:289) > at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:152) > at test.Main$TimeoutParser.super$parse(Main.scala:67) > at test.Main$TimeoutParser.$anonfun$parse$1(Main.scala:67) > at unsafeRunSync @ test.Main$TimeoutParser.parse(Main.scala:68) > Caused by: java.io.IOException: Unable to end a page > at > org.apache.tika.parser.pdf.AbstractPDF2XHTML.endPage(AbstractPDF2XHTML.java:637) > at org.apache.tika.parser.pdf.PDF2XHTML.endPage(PDF2XHTML.java:142) > at > org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:365) > at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:126) > at > org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:1089) > at > org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:238) > at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:97) > at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:174) > at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:289) > at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:152) > at test.Main$TimeoutParser.super$parse(Main.scala:67) > at test.Main$TimeoutParser.$anonfun$parse$1(Main.scala:67) > at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18) > at > cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:104) > at > 
cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:463) > at > cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:484) > at > cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:422) > at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36) > at > java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1426) > at > java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) > at > java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) > at > java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) > at > java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) > at > java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183) > Caused by: org.apache.tika.exception.TikaException: I regret that I couldn't > find an OCR parser to handle image/ocr-png.Please set the OCR_STRATEGY to > NO_OCR or configure yourOCR parser correctly > at > org.apache.tika.parser.pdf.AbstractPDF2XHTML.doOCROnCurrentPage(AbstractPDF2XHTML.java:473) > at > org.apache.tika.parser.pdf.AbstractPDF2XHTML.endPage(AbstractPDF2XHTML.java:614) > ... 23 more > {code} > -- This message was sent by Atlassian Jira (v8.20.7#820007)