Hi, we were doing some tests with 1.19 and found that some sites became unparsable using Tika. At the moment I know of at least two sites causing this: my own, https://www.openindex.io/ , and https://www.elzendaalcollege.nl/ . Parsing the latter fails with the following error:
2024-02-15 12:33:49,639 WARN o.a.n.p.ParseUtil [main] Error parsing https://www.elzendaalcollege.nl/ with org.apache.nutch.parse.tika.TikaParser java.util.concurrent.ExecutionException: java.lang.NoSuchFieldError: NUM_IMAGES at java.util.concurrent.FutureTask.report(FutureTask.java:122) ~[?:?] at java.util.concurrent.FutureTask.get(FutureTask.java:205) ~[?:?] at org.apache.nutch.parse.ParseUtil.runParser(ParseUtil.java:188) ~[apache-nutch-1.20-SNAPSHOT.jar:?] at org.apache.nutch.parse.ParseUtil.parse(ParseUtil.java:92) ~[apache-nutch-1.20-SNAPSHOT.jar:?] at org.apache.nutch.parse.ParserChecker.process(ParserChecker.java:266) ~[apache-nutch-1.20-SNAPSHOT.jar:?] at org.apache.nutch.util.AbstractChecker.processSingle(AbstractChecker.java:86) ~[apache-nutch-1.20-SNAPSHOT.jar:?] at org.apache.nutch.parse.ParserChecker.run(ParserChecker.java:150) ~[apache-nutch-1.20-SNAPSHOT.jar:?] at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:81) ~[hadoop-common-3.3.4.jar:?] at org.apache.nutch.parse.ParserChecker.main(ParserChecker.java:308) ~[apache-nutch-1.20-SNAPSHOT.jar:?] Caused by: java.lang.NoSuchFieldError: NUM_IMAGES at org.apache.tika.parser.image.ImageParser.extractMetadata(ImageParser.java:177) ~[?:?] at org.apache.tika.parser.image.AbstractImageParser.parse(AbstractImageParser.java:79) ~[?:?] 
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:289) ~[tika-core-2.3.0.jar:2.3.0] at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:289) ~[tika-core-2.3.0.jar:2.3.0] at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:289) ~[tika-core-2.3.0.jar:2.3.0] at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:185) ~[tika-core-2.3.0.jar:2.3.0] at org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:71) ~[tika-core-2.3.0.jar:2.3.0] at org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:106) ~[tika-core-2.3.0.jar:2.3.0] at org.apache.tika.parser.html.HtmlHandler.handleDataURIScheme(HtmlHandler.java:385) ~[?:?] at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:187) ~[?:?] at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:123) ~[tika-core-2.3.0.jar:2.3.0] at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:59) ~[?:?] at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794) ~[?:?] at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061) ~[?:?] at org.ccil.cowan.tagsoup.Parser.stage(Parser.java:1026) ~[?:?] at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:633) ~[?:?] at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449) ~[?:?] at org.apache.tika.parser.html.HtmlParser.parseImpl(HtmlParser.java:149) ~[?:?] at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:99) ~[?:?] at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:289) ~[tika-core-2.3.0.jar:2.3.0] at org.apache.nutch.parse.tika.TikaParser.getParse(TikaParser.java:151) ~[?:?] at org.apache.nutch.parse.tika.TikaParser.getParse(TikaParser.java:90) ~[?:?] at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:34) ~[apache-nutch-1.20-SNAPSHOT.jar:?] 
at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:23) ~[apache-nutch-1.20-SNAPSHOT.jar:?] at java.util.concurrent.FutureTask.run(FutureTask.java:264) ~[?:?] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) ~[?:?] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) ~[?:?] at java.lang.Thread.run(Thread.java:829) ~[?:?]