[ https://issues.apache.org/jira/browse/ANY23-504?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Lewis John McGibbney resolved ANY23-504. ---------------------------------------- Resolution: Fixed > XML-based parsers should not load external DTDs by default > ---------------------------------------------------------- > > Key: ANY23-504 > URL: https://issues.apache.org/jira/browse/ANY23-504 > Project: Apache Any23 > Issue Type: Improvement > Reporter: Sebastian Nagel > Assignee: Lewis John McGibbney > Priority: Major > Fix For: 2.6 > > > The Any23 parser should optionally avoid opening HTTP connections when > parsing XML. > While testing Nutch's Any23 plugin with 2.5 (NUTCH-2892) on the file > "BBC_News_Scotland.htm", the parser hung for about two minutes with an > open HTTP connection to "hans-moleman.w3.org" and the following stack trace: > {noformat} > "parse-0" #19 daemon prio=5 os_prio=0 cpu=1432.93ms elapsed=15.85s > tid=0x00007efc713bd800 nid=0x16ff4 runnable [0x00007efc29f2d000] > java.lang.Thread.State: RUNNABLE > at java.net.SocketInputStream.socketRead0(java.base@11.0.11/Native > Method) > at > java.net.SocketInputStream.socketRead(java.base@11.0.11/SocketInputStream.java:115) > at > java.net.SocketInputStream.read(java.base@11.0.11/SocketInputStream.java:168) > at > java.net.SocketInputStream.read(java.base@11.0.11/SocketInputStream.java:140) > at > java.io.BufferedInputStream.fill(java.base@11.0.11/BufferedInputStream.java:252) > at > java.io.BufferedInputStream.read1(java.base@11.0.11/BufferedInputStream.java:292) > at > java.io.BufferedInputStream.read(java.base@11.0.11/BufferedInputStream.java:351) > - locked <0x000000071be1bb68> (a java.io.BufferedInputStream) > at > sun.net.www.http.HttpClient.parseHTTPHeader(java.base@11.0.11/HttpClient.java:754) > at > sun.net.www.http.HttpClient.parseHTTP(java.base@11.0.11/HttpClient.java:689) > at > sun.net.www.protocol.http.HttpURLConnection.getInputStream0(java.base@11.0.11/HttpURLConnection.java:1615) > - locked <0x000000071be11040> (a > sun.net.www.protocol.http.HttpURLConnection) > at > 
sun.net.www.protocol.http.HttpURLConnection.getInputStream(java.base@11.0.11/HttpURLConnection.java:1520) > - locked <0x000000071be11040> (a > sun.net.www.protocol.http.HttpURLConnection) > at org.apache.xerces.impl.XMLEntityManager.setupCurrentEntity(Unknown > Source) > at org.apache.xerces.impl.XMLEntityManager.startEntity(Unknown Source) > at org.apache.xerces.impl.XMLEntityManager.startDTDEntity(Unknown > Source) > at org.apache.xerces.impl.XMLDTDScannerImpl.setInputSource(Unknown > Source) > at > org.apache.xerces.impl.XMLDocumentScannerImpl$DTDDispatcher.dispatch(Unknown > Source) > at > org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown > Source) > at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) > at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) > at org.apache.xerces.parsers.XMLParser.parse(Unknown Source) > at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source) > at org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser.parse(Unknown > Source) > at > org.eclipse.rdf4j.common.xml.SimpleSAXParser.parse(SimpleSAXParser.java:197) > - locked <0x000000071bfe6f28> (a > org.eclipse.rdf4j.common.xml.SimpleSAXParser) > at org.eclipse.rdf4j.rio.trix.TriXParser.parse(TriXParser.java:177) > at org.eclipse.rdf4j.rio.trix.TriXParser.parse(TriXParser.java:134) > at > org.apache.any23.extractor.rdf.BaseRDFExtractor.run(BaseRDFExtractor.java:86) > at > org.apache.any23.extractor.rdf.BaseRDFExtractor.run(BaseRDFExtractor.java:39) > at > org.apache.any23.extractor.SingleDocumentExtraction.runExtractor(SingleDocumentExtraction.java:523) > at > org.apache.any23.extractor.SingleDocumentExtraction.run(SingleDocumentExtraction.java:265) > at org.apache.any23.Any23.extract(Any23.java:315) > at org.apache.any23.Any23.extract(Any23.java:483) > at org.apache.any23.Any23.extract(Any23.java:345) > at > org.apache.nutch.any23.Any23ParseFilter$Any23Parser.parse(Any23ParseFilter.java:106) > at > 
org.apache.nutch.any23.Any23ParseFilter$Any23Parser.<init>(Any23ParseFilter.java:81) > at > org.apache.nutch.any23.Any23ParseFilter.filter(Any23ParseFilter.java:153) > at > org.apache.nutch.parse.HtmlParseFilters.filter(HtmlParseFilters.java:55) > at > org.apache.nutch.parse.html.HtmlParser.getParse(HtmlParser.java:257) > at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:34) > at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:23) > at > java.util.concurrent.FutureTask.run(java.base@11.0.11/FutureTask.java:264) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(java.base@11.0.11/ThreadPoolExecutor.java:1128) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(java.base@11.0.11/ThreadPoolExecutor.java:628) > at java.lang.Thread.run(java.base@11.0.11/Thread.java:829) > {noformat} -- This message was sent by Atlassian Jira (v8.3.4#803005)