[ 
https://issues.apache.org/jira/browse/ANY23-271?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16338218#comment-16338218
 ] 

Lewis John McGibbney edited comment on ANY23-271 at 1/24/18 9:05 PM:
---------------------------------------------------------------------

When I run the above extraction with the patch provided at 
https://github.com/apache/any23/pull/59, I get the following issues. Note that 
they are still related to the RDFa 1.1 extractor. Note also, however, that the 
entity "raquo" issue is now resolved, so this issue is fixed.

{code}
<?xml version="1.0" encoding="UTF-8" ?>
<response>
<extractors>
<extractor>html-head-meta</extractor>
<extractor>html-embedded-jsonld</extractor>
<extractor>html-head-title</extractor>
<extractor>html-rdfa11</extractor>
</extractors>
<report>
<message/>
<error/>
<issueReport>
<extractorIssues extractor="html-rdfa11">
<issue level="WARNING" row="-1" col="-1">Can't resolve term profile</issue>
<issue level="WARNING" row="-1" col="-1">Can't resolve term pingback</issue>
<issue level="WARNING" row="-1" col="-1">Can't resolve term dns-prefetch</issue>
<issue level="WARNING" row="-1" col="-1">Can't resolve term dns-prefetch</issue>
<issue level="WARNING" row="-1" col="-1">Can't resolve term dns-prefetch</issue>
<issue level="ERROR" row="-1" col="-1">Element type "i.length" must be followed by either attribute specifications, ">" or "/>".</issue>
</extractorIssues>
</issueReport>
<validationReport>
<errors>
</errors>
<ruleActivations>
</ruleActivations>
<issues>
</issues>
</validationReport>
</report>
<data>
<![CDATA[
# OUTPUT FORMAT: Turtle (mimeTypes=text/turtle, application/x-turtle; ext=ttl)
# BEGIN: 
ExtractionContext(urn:x-any23:html-embedded-jsonld:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
@prefix jsonld: <http://www.w3.org/ns/json-ld#> .
@prefix geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix v: <http://www.w3.org/2006/vcard/ns#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix sem: <http://semanticweb.cs.vu.nl/2009/11/sem/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix communication: <http://vocab.resc.info/communication#> .
@prefix locn: <http://www.w3.org/ns/locn#> .
@prefix incident: <http://vocab.resc.info/incident#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .

<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400#addr>
 a locn:Address , sem:Place , v:Address ;
        v:country-name "NEDERLAND" ;
        v:locality "Amsterdam" ;
        v:street-address "Wilhelmina Druckerstraat" ;
        locn:adminUnitL1 "NL" ;
        locn:fullAddress "Wilhelmina Druckerstraat , Amsterdam" ;
        locn:postName "Amsterdam" ;
        locn:thoroughfare "Wilhelmina Druckerstraat" .

<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400#location>
 a geo:Point , sem:Place ;
        geo:lat "52.3500002993288618" ;
        geo:long "4.82990412818292469" .

<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400#message>
 a sem:Event , communication:DispatchMessage ;
        sem:eventType <http://data.brandweeraa.nl/data/classification/PRIO_1> , 
<http://www.firebrary.com/data/terms/nl/lmc/4.0/200000012> ;
        sem:hasTimeStamp "2016-01-27T19:48:40.000Z" ;
        communication:dispatchedTo <http://data.brandweeraa.nl/data/stations/P> 
;
        communication:incidentAddress 
<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400#addr>
 ;
        communication:incidentLocation 
<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400#location>
 ;
        communication:pagerMessage "Pac Melding" ;
        communication:unit <http://data.brandweeraa.nl/data/units/ASP> ;
        incident:isDispatchMessageOf 
<http://data.brandweeraa.nl/data/incident/2016/32601> ;
        rdfs:label "Pac Melding" .
# BEGIN: 
ExtractionContext(urn:x-any23:html-head-meta:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
@prefix sindice: <http://vocab.sindice.net/> .

<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400>
 <http://vocab.sindice.net/any23#robots> "noindex,follow"@nl ;
        <http://vocab.sindice.net/any23#generator> "WordPress 4.7.9"@nl ;
        <http://vocab.sindice.net/any23#viewport> "width=device-width, initial-scale=1.0"@nl ;
        <http://vocab.sindice.net/any23#X-UA-Compatible> "IE=edge"@nl .
# BEGIN: 
ExtractionContext(urn:x-any23:html-head-title:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
@prefix dcterms: <http://purl.org/dc/terms/> .

<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400>
 dcterms:title "Data:  | Open Data Brandweer Amsterdam Amstelland" .
# BEGIN: 
ExtractionContext(urn:x-any23:html-rdfa11:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)

<http://data.brandweeraa.nl/xmlrpc.php> 
<http://www.w3.org/1999/xhtml/vocab#alternate> 
<http://data.brandweeraa.nl/feed> , <http://data.brandweeraa.nl/comments/feed> .
# END: 
ExtractionContext(urn:x-any23:html-rdfa11:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
# END: 
ExtractionContext(urn:x-any23:html-embedded-jsonld:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
# END: 
ExtractionContext(urn:x-any23:html-head-meta:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
# END: 
ExtractionContext(urn:x-any23:html-head-title:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
]]>
</data>
</response>

{code}


was (Author: lewismc):
When I run the above extraction with the patch provided at I get the following 
issues... note they are still related to the RDFa1.1 Extractor. Also note 
however that the entity "raquo" issue is now resolved so this issue is fixed.

{code}
<?xml version="1.0" encoding="UTF-8" ?>
<response>
<extractors>
<extractor>html-head-meta</extractor>
<extractor>html-embedded-jsonld</extractor>
<extractor>html-head-title</extractor>
<extractor>html-rdfa11</extractor>
</extractors>
<report>
<message/>
<error/>
<issueReport>
<extractorIssues extractor="html-rdfa11">
<issue level="WARNING" row="-1" col="-1">Can't resolve term profile</issue>
<issue level="WARNING" row="-1" col="-1">Can't resolve term pingback</issue>
<issue level="WARNING" row="-1" col="-1">Can't resolve term dns-prefetch</issue>
<issue level="WARNING" row="-1" col="-1">Can't resolve term dns-prefetch</issue>
<issue level="WARNING" row="-1" col="-1">Can't resolve term dns-prefetch</issue>
<issue level="ERROR" row="-1" col="-1">Element type "i.length" must be followed 
by either attribute specifications, ">" or "/>".</issue>
</extractorIssues>
</issueReport>
<validationReport>
<errors>
</errors>
<ruleActivations>
</ruleActivations>
<issues>
</issues>
</validationReport>
</report>
<data>
<![CDATA[
# OUTPUT FORMAT: Turtle (mimeTypes=text/turtle, application/x-turtle; ext=ttl)
# BEGIN: 
ExtractionContext(urn:x-any23:html-embedded-jsonld:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
@prefix jsonld: <http://www.w3.org/ns/json-ld#> .
@prefix geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix v: <http://www.w3.org/2006/vcard/ns#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix sem: <http://semanticweb.cs.vu.nl/2009/11/sem/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix communication: <http://vocab.resc.info/communication#> .
@prefix locn: <http://www.w3.org/ns/locn#> .
@prefix incident: <http://vocab.resc.info/incident#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .

<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400#addr>
 a locn:Address , sem:Place , v:Address ;
        v:country-name "NEDERLAND" ;
        v:locality "Amsterdam" ;
        v:street-address "Wilhelmina Druckerstraat" ;
        locn:adminUnitL1 "NL" ;
        locn:fullAddress "Wilhelmina Druckerstraat , Amsterdam" ;
        locn:postName "Amsterdam" ;
        locn:thoroughfare "Wilhelmina Druckerstraat" .

<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400#location>
 a geo:Point , sem:Place ;
        geo:lat "52.3500002993288618" ;
        geo:long "4.82990412818292469" .

<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400#message>
 a sem:Event , communication:DispatchMessage ;
        sem:eventType <http://data.brandweeraa.nl/data/classification/PRIO_1> , 
<http://www.firebrary.com/data/terms/nl/lmc/4.0/200000012> ;
        sem:hasTimeStamp "2016-01-27T19:48:40.000Z" ;
        communication:dispatchedTo <http://data.brandweeraa.nl/data/stations/P> 
;
        communication:incidentAddress 
<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400#addr>
 ;
        communication:incidentLocation 
<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400#location>
 ;
        communication:pagerMessage "Pac Melding" ;
        communication:unit <http://data.brandweeraa.nl/data/units/ASP> ;
        incident:isDispatchMessageOf 
<http://data.brandweeraa.nl/data/incident/2016/32601> ;
        rdfs:label "Pac Melding" .
# BEGIN: 
ExtractionContext(urn:x-any23:html-head-meta:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
@prefix sindice: <http://vocab.sindice.net/> .

<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400>
 <http://vocab.sindice.net/any23#robots> "noindex,follow"@nl ;
        <http://vocab.sindice.net/any23#generator> "WordPress 4.7.9"@nl ;
        <http://vocab.sindice.net/any23#viewport> "width=device-width, 
initial-scale=1.0"@nl ;
        <http://vocab.sindice.net/any23#X-UA-Compatible> "IE=edge"@nl .
# BEGIN: 
ExtractionContext(urn:x-any23:html-head-title:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
@prefix dcterms: <http://purl.org/dc/terms/> .

<http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400>
 dcterms:title "Data:  | Open Data Brandweer Amsterdam Amstelland" .
# BEGIN: 
ExtractionContext(urn:x-any23:html-rdfa11:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)

<http://data.brandweeraa.nl/xmlrpc.php> 
<http://www.w3.org/1999/xhtml/vocab#alternate> 
<http://data.brandweeraa.nl/feed> , <http://data.brandweeraa.nl/comments/feed> .
# END: 
ExtractionContext(urn:x-any23:html-rdfa11:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
# END: 
ExtractionContext(urn:x-any23:html-embedded-jsonld:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
# END: 
ExtractionContext(urn:x-any23:html-head-meta:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
# END: 
ExtractionContext(urn:x-any23:html-head-title:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
]]>
</data>
</response>

{code}

> Address "...The entity "raquo" was referenced, but not declared" 
> SAXParseException
> ----------------------------------------------------------------------------------
>
>                 Key: ANY23-271
>                 URL: https://issues.apache.org/jira/browse/ANY23-271
>             Project: Apache Any23
>          Issue Type: Bug
>          Components: extractors
>    Affects Versions: 1.1
>            Reporter: Lewis John McGibbney
>            Priority: Major
>             Fix For: 2.2
>
>
> When attempting extractions on the following URL
> http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400
> I get the following Exception with the Webservice at any23.org
> {code}
> <?xml version="1.0" encoding="UTF-8" ?>
> <report>
> <message>Could not parse input.</message>
> <error>
> <![CDATA[
> ------------ BEGIN Exception context ------------
> ExtractionContext(urn:x-any23:html-rdfa11:root-extraction-result-id:http://data.brandweeraa.nl/data/incident/2016/32601/deployment/201601272048400)
> Errors {
> }
> ------------ END   Exception context ------------
> org.apache.any23.extractor.ExtractionException: Error while parsing RDF 
> document.
>       at 
> org.apache.any23.extractor.rdf.BaseRDFExtractor.run(BaseRDFExtractor.java:109)
>       at 
> org.apache.any23.extractor.rdf.BaseRDFExtractor.run(BaseRDFExtractor.java:41)
>       at 
> org.apache.any23.extractor.SingleDocumentExtraction.runExtractor(SingleDocumentExtraction.java:463)
>       at 
> org.apache.any23.extractor.SingleDocumentExtraction.run(SingleDocumentExtraction.java:255)
>       at org.apache.any23.Any23.extract(Any23.java:298)
>       at org.apache.any23.Any23.extract(Any23.java:450)
>       at 
> org.apache.any23.servlet.WebResponder.runExtraction(WebResponder.java:114)
>       at org.apache.any23.servlet.Servlet.doGet(Servlet.java:79)
>       at javax.servlet.http.HttpServlet.service(HttpServlet.java:618)
>       at javax.servlet.http.HttpServlet.service(HttpServlet.java:725)
>       at 
> org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:301)
>       at 
> org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
>       at 
> org.apache.tomcat.websocket.server.WsFilter.doFilter(WsFilter.java:52)
>       at 
> org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:239)
>       at 
> org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
>       at 
> org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:219)
>       at 
> org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:106)
>       at 
> org.apache.catalina.authenticator.AuthenticatorBase.invoke(AuthenticatorBase.java:503)
>       at 
> org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:136)
>       at 
> org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:74)
>       at 
> org.apache.catalina.valves.AbstractAccessLogValve.invoke(AbstractAccessLogValve.java:610)
>       at 
> org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:88)
>       at 
> org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:526)
>       at 
> org.apache.coyote.ajp.AbstractAjpProcessor.process(AbstractAjpProcessor.java:794)
>       at 
> org.apache.coyote.AbstractProtocol$AbstractConnectionHandler.process(AbstractProtocol.java:652)
>       at 
> org.apache.tomcat.util.net.NioEndpoint$SocketProcessor.doRun(NioEndpoint.java:1575)
>       at 
> org.apache.tomcat.util.net.NioEndpoint$SocketProcessor.run(NioEndpoint.java:1533)
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>       at java.lang.Thread.run(Thread.java:745)
> Caused by: org.openrdf.rio.RDFParseException: org.xml.sax.SAXParseException; 
> lineNumber: 14; columnNumber: 105; The entity "raquo" was referenced, but not 
> declared.
>       at 
> org.semarglproject.sesame.rdf.rdfa.SesameRDFaParser.parse(SesameRDFaParser.java:111)
>       at 
> org.semarglproject.sesame.rdf.rdfa.SesameRDFaParser.parse(SesameRDFaParser.java:95)
>       at 
> org.apache.any23.extractor.rdf.BaseRDFExtractor.run(BaseRDFExtractor.java:105)
>       ... 29 more
> Caused by: org.semarglproject.rdf.ParseException: 
> org.xml.sax.SAXParseException; lineNumber: 14; columnNumber: 105; The entity 
> "raquo" was referenced, but not declared.
>       at 
> org.semarglproject.rdf.rdfa.RdfaParser.processException(RdfaParser.java:1130)
>       at org.semarglproject.source.XmlSource.process(XmlSource.java:50)
>       at 
> org.semarglproject.source.StreamProcessor.processInternal(StreamProcessor.java:87)
>       at 
> org.semarglproject.source.BaseStreamProcessor.process(BaseStreamProcessor.java:167)
>       at 
> org.semarglproject.source.BaseStreamProcessor.process(BaseStreamProcessor.java:154)
>       at 
> org.semarglproject.sesame.rdf.rdfa.SesameRDFaParser.parse(SesameRDFaParser.java:109)
>       ... 31 more
> Caused by: org.xml.sax.SAXParseException; lineNumber: 14; columnNumber: 105; 
> The entity "raquo" was referenced, but not declared.
>       at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source)
>       at org.semarglproject.source.XmlSource.process(XmlSource.java:48)
>       ... 35 more
> ]]>
> </error>
> <issueReport>
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to