Author: vsiveton
Date: Fri Jan 30 01:56:23 2009
New Revision: 739137
URL: http://svn.apache.org/viewvc?rev=739137&view=rev
Log:
DOXIA-250: Xml parser should handle entities defined in doctype
o better handling of entities
o updated test case
Modified:
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java
Modified:
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java
URL:
http://svn.apache.org/viewvc/maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java?rev=739137&r1=739136&r2=739137&view=diff
==============================================================================
---
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java
(original)
+++
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java
Fri Jan 30 01:56:23 2009
@@ -23,6 +23,7 @@
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlHeader2;
+import com.gargoylesoftware.htmlunit.html.HtmlHeader3;
import com.gargoylesoftware.htmlunit.html.HtmlHeader4;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlParagraph;
@@ -30,9 +31,8 @@
import java.util.Iterator;
-
/**
- *
+ * Verify the <code>site/xdoc/entityTest.xml</code>
*
* @author ltheussl
* @version $Id$
@@ -78,17 +78,41 @@
assertNotNull( h4 );
assertEquals( h4.asText().trim(), "Entities" );
+ div = (HtmlDivision) elementIterator.next();
+
+ HtmlHeader3 h3 = (HtmlHeader3) elementIterator.next();
+ assertNotNull( h3 );
+ assertEquals( h3.asText().trim(), "Generic Entities" );
+
+ a = (HtmlAnchor) elementIterator.next();
+
HtmlParagraph p = (HtmlParagraph) elementIterator.next();
assertNotNull( p );
- assertEquals( p.asText().trim(), "'&' '<' '>' '\"' ''' ' ' ' '" );
+ assertEquals( p.asText().trim(), "'&' '<' '>' '\"' '''" );
div = (HtmlDivision) elementIterator.next();
- assertNotNull( div );
- assertEquals( div.getAttributeValue( "class" ), "section" );
- h4 = (HtmlHeader4) elementIterator.next();
- assertNotNull( h4 );
- assertEquals( h4.asText().trim(), "Comment" );
+ h3 = (HtmlHeader3) elementIterator.next();
+ assertNotNull( h3 );
+ assertEquals( h3.asText().trim(), "Local Entities" );
+
+ a = (HtmlAnchor) elementIterator.next();
+
+ p = (HtmlParagraph) elementIterator.next();
+ assertNotNull( p );
+ assertEquals( p.asText().trim(), "'Α' 'Β' 'Γ'" );
+
+ div = (HtmlDivision) elementIterator.next();
+
+ h3 = (HtmlHeader3) elementIterator.next();
+ assertNotNull( h3 );
+ assertEquals( h3.asText().trim(), "DTD Entities" );
+
+ a = (HtmlAnchor) elementIterator.next();
+
+ p = (HtmlParagraph) elementIterator.next();
+ assertNotNull( p );
+ assertEquals( p.asText().trim(), "' ' '¡' '¢'" );
div = (HtmlDivision) elementIterator.next();
assertNotNull( div );
@@ -106,6 +130,17 @@
assertNotNull( pre );
assertEquals( pre.asText().trim(), "<project xmlns:ant=\"jelly:ant\">"
);
+ p = (HtmlParagraph) elementIterator.next();
+ assertNotNull( p );
+ assertEquals( p.asText().trim(), "'&nbsp;' '&iexcl;'" );
+
+ elementIterator.next(); // div
+ elementIterator.next(); // hr
+ elementIterator.next(); // div
+ elementIterator.next(); // div
+ elementIterator.next(); // hr
+ elementIterator.next(); // hr
+
assertFalse( elementIterator.hasNext() );
}
}
Modified:
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml
URL:
http://svn.apache.org/viewvc/maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml?rev=739137&r1=739136&r2=739137&view=diff
==============================================================================
---
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml
(original)
+++
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml
Fri Jan 30 01:56:23 2009
@@ -19,15 +19,14 @@
-->
<!DOCTYPE document [
- <!-- These are the entity sets for ISO Latin 1 characters for the XHTML -->
- <!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent">
- %HTMLlat1;
- <!-- These are the entity sets for special characters for the XHTML -->
- <!ENTITY % HTMLsymbol PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent">
- %HTMLsymbol;
- <!-- These are the entity sets for symbol characters for the XHTML -->
- <!ENTITY % HTMLspecial PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent">
- %HTMLspecial;
+<!-- These are the entity sets for ISO Latin 1 characters for the XHTML -->
+<!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent">
+%HTMLlat1;
+<!-- Some entities from http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent -->
+<!ENTITY Alpha "&#913;"> <!-- greek capital letter alpha, U+0391 -->
+<!ENTITY Beta "&#914;"> <!-- greek capital letter beta, U+0392 -->
+<!ENTITY Gamma "&#915;"> <!-- greek capital letter gamma,
+U+0393 ISOgrk3 -->
]>
<document xmlns="http://maven.apache.org/XDOC/2.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
@@ -41,13 +40,18 @@
<section name="section name">
<h4>Entities</h4>
- <p>'&' '<' '>' '"' ''' ' ' ' '</p>
+ <h3>Generic Entities</h3>
+ <p>'&amp;' '&lt;' '&gt;' '&quot;' '&apos;'</p>
- <h4>Comment</h4>
- <!-- a comment and nothing else! -->
+ <h3>Local Entities</h3>
+ <p>'&Alpha;' '&Beta;' '&Gamma;'</p>
+
+ <h3>DTD Entities</h3>
+ <p>'&nbsp;' '&iexcl;' '&cent;'</p>
<h4>CDATA</h4>
<source><![CDATA[<project xmlns:ant="jelly:ant">]]></source>
+ <p><![CDATA['&nbsp;' '&iexcl;']]></p>
</section>
Modified:
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java
URL:
http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java?rev=739137&r1=739136&r2=739137&view=diff
==============================================================================
---
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java
(original)
+++
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java
Fri Jan 30 01:56:23 2009
@@ -42,4 +42,10 @@
/** CDATA string: "CDATA" */
String CDATA = "CDATA";
+
+ /** DOCTYPE start string: "<!DOCTYPE" */
+ String DOCTYPE_START = "<!DOCTYPE";
+
+ /** ENTITY start string: "<!ENTITY" */
+ String ENTITY_START = "<!ENTITY";
}
Modified:
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java
URL:
http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java?rev=739137&r1=739136&r2=739137&view=diff
==============================================================================
---
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java
(original)
+++
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java
Fri Jan 30 01:56:23 2009
@@ -30,6 +30,7 @@
import java.io.StringReader;
import java.net.URL;
import java.util.Hashtable;
+import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
@@ -70,16 +71,21 @@
extends AbstractParser
implements XmlMarkup
{
- /** Entity pattern for HTML entity, i.e. &nbsp; see
http://www.w3.org/TR/REC-xml/#NT-EntityDecl */
+ /** Entity pattern for HTML entity, i.e. &nbsp;
"<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*>
+ * <br/>
+ * see <a
href="http://www.w3.org/TR/REC-xml/#NT-EntityDecl">http://www.w3.org/TR/REC-xml/#NT-EntityDecl</a>
*/
private static final Pattern PATTERN_ENTITY_1 =
- Pattern.compile(
"<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*>" );
+ Pattern.compile( ENTITY_START +
"(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*>" );
- /** Entity pattern for Unicode entity, i.e. &#38; see
http://www.w3.org/TR/REC-xml/#NT-EntityDecl */
+ /** Entity pattern for Unicode entity, i.e. &#38;
"<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&(#x?[0-9a-fA-F]{1,4};)*)(\\s)*\"(\\s)*>"
+ * <br/>
+ * see <a
href="http://www.w3.org/TR/REC-xml/#NT-EntityDecl">http://www.w3.org/TR/REC-xml/#NT-EntityDecl</a>
*/
private static final Pattern PATTERN_ENTITY_2 =
- Pattern.compile(
"<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&#x?[0-9a-fA-F]{1,4};)(\\s)*\"(\\s)*>"
);
+ Pattern.compile( ENTITY_START +
"(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&(#x?[0-9a-fA-F]{1,4};)*)(\\s)*\"(\\s)*>" );
- /** Doctype pattern as defined in
http://www.w3.org/TR/REC-xml/#NT-doctypedecl */
- private static final Pattern PATTERN_DOCTYPE = Pattern.compile(
".*<!DOCTYPE([^>]*)>.*" );
+ /** Doctype pattern i.e. ".*<!DOCTYPE([^>]*)>.*"
+ * see <a
href="http://www.w3.org/TR/REC-xml/#NT-doctypedecl">http://www.w3.org/TR/REC-xml/#NT-doctypedecl</a>
*/
+ private static final Pattern PATTERN_DOCTYPE = Pattern.compile( ".*" +
DOCTYPE_START + "([^>]*)>.*" );
/** Tag pattern as defined in http://www.w3.org/TR/REC-xml/#NT-Name */
private static final Pattern PATTERN_TAG = Pattern.compile(
".*<([A-Za-z][A-Za-z0-9:_.-]*)([^>]*)>.*" );
@@ -247,43 +253,13 @@
}
else if ( eventType == XmlPullParser.DOCDECL )
{
- String text = parser.getText();
- int entitiesCount = StringUtils.countMatches( text, "<!ENTITY"
);
- // entities defined in a local doctype
- if ( entitiesCount > 0 )
+ addLocalEntities( parser, parser.getText() );
+
+ for ( Iterator it =
CachedFileEntityResolver.ENTITY_CACHE.values().iterator(); it.hasNext(); )
{
- int start = text.indexOf( "<" );
- int end = text.lastIndexOf( ">" );
- if ( start != -1 && end != -1 )
- {
- text = text.substring( start, end + 1 );
- for ( int i = 0; i < entitiesCount; i++ )
- {
- String tmp = text.substring( text.indexOf( "<" ),
text.indexOf( ">" ) + 1 );
- Matcher matcher = PATTERN_ENTITY_1.matcher( tmp );
- if ( matcher.find() && matcher.groupCount() == 7 )
- {
- String entityName = matcher.group( 2 );
- String entityValue = matcher.group( 5 );
-
- parser.defineEntityReplacementText(
entityName, entityValue );
- getLocalEntities().put( entityName,
entityValue );
- }
- else
- {
- matcher = PATTERN_ENTITY_2.matcher( text );
- if ( matcher.find() && matcher.groupCount() ==
7 )
- {
- String entityName = matcher.group( 2 );
- String entityValue = matcher.group( 5 );
-
- parser.defineEntityReplacementText(
entityName, entityValue );
- getLocalEntities().put( entityName,
entityValue );
- }
- }
- text = StringUtils.replace( text, tmp, "" ).trim();
- }
- }
+ byte[] res = (byte[])it.next();
+
+ addDTDEntities( parser, new String( res ) );
}
}
@@ -589,6 +565,123 @@
}
/**
+ * Add an entity given by <code>entityName</code> and
<code>entityValue</code> to {@link #entities}.
+ * <br/>
+ * By default, we exclude the default XML entities: &amp;, &lt;,
&gt;, &quot; and &apos;.
+ *
+ * @param parser not null
+ * @param entityName not null
+ * @param entityValue not null
+ * @throws XmlPullParserException if any
+ * @see {@link XmlPullParser#defineEntityReplacementText(String,
String)}
+ */
+ private void addEntity( XmlPullParser parser, String entityName, String
entityValue )
+ throws XmlPullParserException
+ {
+ if ( entityName.endsWith( "amp" ) || entityName.endsWith( "lt" ) ||
entityName.endsWith( "gt" )
+ || entityName.endsWith( "quot" ) || entityName.endsWith( "apos" ) )
+ {
+ return;
+ }
+
+ parser.defineEntityReplacementText( entityName, entityValue );
+ getLocalEntities().put( entityName, entityValue );
+ }
+
+ /**
+ * Handle entities defined in a local doctype as the following:
+ * <pre>
+ * <!DOCTYPE foo [
+ * <!ENTITY bar "&#x160;">
+ * <!ENTITY bar1 "&#x161;">
+ * ]>
+ * </pre>
+ *
+ * @param parser not null
+ * @param text not null
+ * @throws XmlPullParserException if any
+ */
+ private void addLocalEntities( XmlPullParser parser, String text )
+ throws XmlPullParserException
+ {
+ int entitiesCount = StringUtils.countMatches( text, ENTITY_START );
+ if ( entitiesCount > 0 )
+ {
+ // text should be foo [...]
+ int start = text.indexOf( "[" );
+ int end = text.lastIndexOf( "]" );
+ if ( start != -1 && end != -1 )
+ {
+ text = text.substring( start + 1, end );
+ addDTDEntities( parser, text );
+ }
+ }
+ }
+
+ /**
+ * Handle entities defined in external doctypes as the following:
+ * <pre>
+ * <!DOCTYPE foo [
+ * <!-- These are the entity sets for ISO Latin 1 characters for the
XHTML -->
+ * <!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for
XHTML//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent">
+ * %HTMLlat1;
+ * ]>
+ * </pre>
+ *
+ * @param parser not null
+ * @param text not null
+ * @throws XmlPullParserException if any
+ */
+ private void addDTDEntities( XmlPullParser parser, String text )
+ throws XmlPullParserException
+ {
+ int entitiesCount = StringUtils.countMatches( text, ENTITY_START );
+ if ( entitiesCount > 0 )
+ {
+ BufferedReader reader = new BufferedReader( new StringReader( text
) );
+ String line;
+ String tmpLine = "";
+ try
+ {
+ Matcher matcher;
+ while ( ( line = reader.readLine() ) != null )
+ {
+ tmpLine += "\n" + line;
+ matcher = PATTERN_ENTITY_1.matcher( tmpLine );
+ if ( matcher.find() && matcher.groupCount() == 7 )
+ {
+ String entityName = matcher.group( 2 );
+ String entityValue = matcher.group( 5 );
+
+ addEntity( parser, entityName, entityValue );
+ tmpLine = "";
+ }
+ else
+ {
+ matcher = PATTERN_ENTITY_2.matcher( tmpLine );
+ if ( matcher.find() && matcher.groupCount() == 8 )
+ {
+ String entityName = matcher.group( 2 );
+ String entityValue = matcher.group( 5 );
+
+ addEntity( parser, entityName, entityValue );
+ tmpLine = "";
+ }
+ }
+ }
+ }
+ catch ( IOException e )
+ {
+ // nop
+ }
+ finally
+ {
+ IOUtil.close( reader );
+ }
+ }
+ }
+
+ /**
* Convenience class to beautify <code>SAXParseException</code> messages.
*/
static class MessagesErrorHandler
@@ -714,13 +807,14 @@
public static class CachedFileEntityResolver
implements EntityResolver
{
- private static final Map cache = new Hashtable();
+ /** Map with systemId as key and the content of systemId as byte[]. */
+ protected static final Map ENTITY_CACHE = new Hashtable();
/** {@inheritDoc} */
public InputSource resolveEntity( String publicId, String systemId )
throws SAXException, IOException
{
- byte[] res = (byte[]) cache.get( systemId );
+ byte[] res = (byte[]) ENTITY_CACHE.get( systemId );
// already cached?
if ( res == null )
{
@@ -758,7 +852,7 @@
res = toByteArray( temp.toURL() );
}
- cache.put( systemId, res );
+ ENTITY_CACHE.put( systemId, res );
}
InputSource is = new InputSource( new ByteArrayInputStream( res )
);