Hi, I ran into a problem with Tika-server where PDF parsing fails, seemingly because the xmp:Rating value in a PDF picture's metadata is a string rather than the expected integer [0-5]. The error occurs with the Tika-server 3.1.0.0-full Docker image, with Tesseract OCR configured with extractInlineImages=true.
The cause of the error appears to be a PDF containing a picture with the metadata xmp:Rating="2.0". Error: org.apache.tika.exception.TikaException: Unexpected RuntimeException from org.apache.tika.parser.pdf.PDFParser@1c82c055 at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:312) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:204) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.server.core.resource.TikaResource.parse(TikaResource.java:363) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.server.core.resource.TikaResource.parseToMetadata(TikaResource.java:594) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.server.core.resource.TikaResource.getJson(TikaResource.java:567) ~[tika-server-standard-3.1.0.jar:3.1.0] at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103) ~[?:?] at java.base/java.lang.reflect.Method.invoke(Method.java:580) ~[?:?] 
at org.apache.cxf.service.invoker.AbstractInvoker.performInvocation(AbstractInvoker.java:179) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.cxf.service.invoker.AbstractInvoker.invoke(AbstractInvoker.java:96) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:200) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:103) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.cxf.interceptor.ServiceInvokerInterceptor$1.run(ServiceInvokerInterceptor.java:59) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.cxf.interceptor.ServiceInvokerInterceptor.handleMessage(ServiceInvokerInterceptor.java:96) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.cxf.phase.PhaseInterceptorChain.doIntercept(PhaseInterceptorChain.java:307) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.cxf.transport.ChainInitiationObserver.onMessage(ChainInitiationObserver.java:121) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.cxf.transport.http.AbstractHTTPDestination.invoke(AbstractHTTPDestination.java:265) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.cxf.transport.http_jetty.JettyHTTPDestination.doService(JettyHTTPDestination.java:244) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.cxf.transport.http_jetty.JettyHTTPHandler.handle(JettyHTTPHandler.java:80) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:122) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:223) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1381) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:178) ~[tika-server-standard-3.1.0.jar:3.1.0] at 
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1303) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:129) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:149) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:122) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.Server.handle(Server.java:563) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.HttpChannel$RequestDispatchable.dispatch(HttpChannel.java:1598) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:753) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:501) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:287) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:314) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:100) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.io.SelectableChannelEndPoint$1.run(SelectableChannelEndPoint.java:53) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:969) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.doRunJob(QueuedThreadPool.java:1194) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1149) ~[tika-server-standard-3.1.0.jar:3.1.0] at java.base/java.lang.Thread.run(Thread.java:1583) [?:?] 
Caused by: java.lang.NumberFormatException: For input string: "2.0" at java.base/java.lang.NumberFormatException.forInputString(NumberFormatException.java:67) ~[?:?] at java.base/java.lang.Integer.parseInt(Integer.java:662) ~[?:?] at java.base/java.lang.Integer.<init>(Integer.java:1119) ~[?:?] at org.apache.jempbox.xmp.XMPSchema.getIntegerProperty(XMPSchema.java:311) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.jempbox.xmp.XMPSchemaBasic.getRating(XMPSchemaBasic.java:309) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.PDMetadataExtractor.extractBasic(PDMetadataExtractor.java:310) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.PDMetadataExtractor.extract(PDMetadataExtractor.java:79) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.PDMetadataExtractor.extract(PDMetadataExtractor.java:75) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.image.ImageGraphicsEngine.processImage(ImageGraphicsEngine.java:417) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.image.ImageGraphicsEngine.drawImage(ImageGraphicsEngine.java:290) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.pdfbox.contentstream.operator.graphics.DrawObject.process(DrawObject.java:78) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:919) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:552) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:510) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:157) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.image.ImageGraphicsEngine.run(ImageGraphicsEngine.java:235) ~[tika-server-standard-3.1.0.jar:3.1.0] at 
org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:203) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.PDF2XHTML.endPage(PDF2XHTML.java:148) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.PDF2XHTML$AngleDetectingPDF2XHTML.processPage(PDF2XHTML.java:296) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:1362) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:252) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:107) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:219) ~[tika-server-standard-3.1.0.jar:3.1.0] at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298) ~[tika-server-standard-3.1.0.jar:3.1.0] ... 38 more I had to disable the Tesseract extractInlineImages configuration to process these kinds of PDFs. Is this a bug in Tika's logic, or is there some setting or workaround that I'm missing? Maybe Tika could just ignore incorrect xmp:Rating values with a warning instead of failing the whole PDF processing? Many thanks, Siim
