Author: vsiveton
Date: Tue May 19 12:18:28 2009
New Revision: 776288
URL: http://svn.apache.org/viewvc?rev=776288&view=rev
Log:
o improved escapeHTML and unescapeHTML for all entities
o added more test cases
o updated parser test
o import part of ASF Harmony project
Modified:
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java
Modified:
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java
URL:
http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java?rev=776288&r1=776287&r2=776288&view=diff
==============================================================================
---
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java
(original)
+++
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java
Tue May 19 12:18:28 2009
@@ -20,15 +20,15 @@
*/
import java.io.UnsupportedEncodingException;
-
+import java.util.ArrayList;
import java.util.Hashtable;
+import java.util.List;
import javax.swing.text.html.HTML.Tag;
import org.apache.commons.lang.StringEscapeUtils;
-
import org.apache.maven.doxia.markup.HtmlMarkup;
-import org.apache.xerces.util.XMLChar;
+import org.codehaus.plexus.util.StringUtils;
/**
* The <code>HtmlTools</code> class defines methods to HTML handling.
@@ -117,6 +117,12 @@
* If <code>xmlMode</code> is true, every other character than the above
remains unchanged,
* if <code>xmlMode</code> is false, non-ascii characters get replaced by
their hex code.
*
+ * <b>Note</b>: all characters are encoded, i.e.:
+ * <pre>
+ * \u0159 = &#x159;
+ * \uD835\uDFED = &#x1d7ed;
+ * </pre>
+ *
* @param text The String to escape, may be null.
* @param xmlMode set to <code>false</code> to replace non-ascii
characters.
* @return The escaped text or the empty string if text == null.
@@ -164,25 +170,9 @@
else
{
buffer.append( "&#x" );
- if ( XMLChar.isHighSurrogate( c ) )
+ if ( isHighSurrogate( c ) )
{
- int c2 = text.charAt( ++i );
- if ( XMLChar.isLowSurrogate( c2 ) )
- {
- int sup = XMLChar.supplemental( c, (char)
c2 );
- if ( !XMLChar.isValid( sup ) )
- {
- throw new IllegalArgumentException(
"Invalid XML character "
- + Integer.toString( sup, 16 ) + "
in " + text );
- }
-
- buffer.append( Integer.toHexString( sup )
);
- }
- else
- {
- throw new IllegalArgumentException(
"Invalid XML character "
- + Integer.toString( c2, 16 ) + " in "
+ text );
- }
+ buffer.append( Integer.toHexString(
toCodePoint( c, text.charAt( ++i ) ) ) );
}
else
{
@@ -207,15 +197,67 @@
* <p>For example, the string "&lt;Fran&ccedil;ais&gt;"
* will become "<Français>".</p>
*
- * @param text the <code>String</code> to unescape, may be null.
+ * <b>Note</b>: all unicode entities are decoded, i.e.:
+ * <pre>
+ * &#x159; = \u0159
+ * &#x1d7ed; = \uD835\uDFED
+ * </pre>
*
+ * @param text the <code>String</code> to unescape, may be null.
* @return a new unescaped <code>String</code>, <code>null</code> if null
string input.
- *
* @since 1.1.1.
*/
public static String unescapeHtml( String text )
{
- return StringEscapeUtils.unescapeHtml( text );
+ if ( text == null )
+ {
+ return null;
+ }
+
+ String unescaped = StringEscapeUtils.unescapeHtml( text );
+
+ if ( !text.equals( unescaped ))
+ {
+ return unescaped;
+ }
+
+ String tmp = text;
+ List entities = new ArrayList();
+ while ( true )
+ {
+ int i = tmp.indexOf( "&#x" );
+ if ( i == -1 )
+ {
+ break;
+ }
+
+ tmp = tmp.substring( i + 3 );
+ if ( tmp.indexOf( ';' ) == -1 )
+ {
+ throw new IllegalArgumentException( "Wrong HTML near '..." +
tmp + "'" );
+ }
+
+ String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
+ try
+ {
+ Integer.parseInt( entity, 16 );
+ }
+ catch ( Exception e )
+ {
+ throw new IllegalArgumentException( "Wrong HTML near '..." +
tmp + "'" );
+ }
+ entities.add( entity );
+ }
+
+ for ( int i = 0; i < entities.size(); i++ )
+ {
+ String entity = (String) entities.get( i );
+
+ int codePoint = Integer.parseInt( entity, 16 );
+ text = StringUtils.replace( text, "&#x" + entity + ";", new
String( toChars( codePoint ) ) );
+ }
+
+ return text;
}
/**
@@ -338,4 +380,57 @@
{
// utility class
}
+
+ //
+ // Imported code from ASF Harmony project
+ //
http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
+ //
+
+ private static int toCodePoint( char high, char low )
+ {
+ // See RFC 2781, Section 2.2
+ // http://www.faqs.org/rfcs/rfc2781.html
+ int h = ( high & 0x3FF ) << 10;
+ int l = low & 0x3FF;
+ return ( h | l ) + 0x10000;
+ }
+
+ private static final char MIN_HIGH_SURROGATE = '\uD800';
+ private static final char MAX_HIGH_SURROGATE = '\uDBFF';
+
+ private static boolean isHighSurrogate( char ch )
+ {
+ return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
+ }
+
+ private static final int MIN_CODE_POINT = 0x000000;
+ private static final int MAX_CODE_POINT = 0x10FFFF;
+ private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
+
+ private static boolean isValidCodePoint( int codePoint )
+ {
+ return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
+ }
+
+ private static boolean isSupplementaryCodePoint( int codePoint )
+ {
+ return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT
>= codePoint );
+ }
+
+ private static char[] toChars( int codePoint )
+ {
+ if ( !isValidCodePoint( codePoint ) )
+ {
+ throw new IllegalArgumentException();
+ }
+
+ if ( isSupplementaryCodePoint( codePoint ) )
+ {
+ int cpPrime = codePoint - 0x10000;
+ int high = 0xD800 | ( ( cpPrime >> 10 ) & 0x3FF );
+ int low = 0xDC00 | ( cpPrime & 0x3FF );
+ return new char[] { (char) high, (char) low };
+ }
+ return new char[] { (char) codePoint };
+ }
}
Modified:
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java
URL:
http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java?rev=776288&r1=776287&r2=776288&view=diff
==============================================================================
---
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java
(original)
+++
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java
Tue May 19 12:18:28 2009
@@ -348,8 +348,8 @@
assertEquals( "\u0161", (String) event.getArgs()[0] );
event = (SinkEventElement) it.next();
- assertEquals( "unknown", event.getName() );
- assertEquals( "&#x1d7ed;", (String) event.getArgs()[0] );
+ assertEquals( "text", event.getName() );
+ assertEquals( "\uD835\uDFED", (String) event.getArgs()[0] );
event = (SinkEventElement) it.next();
assertEquals( "bold_", event.getName() );
@@ -382,10 +382,9 @@
assertEquals( "text", textEvt.getName() );
assertEquals( "\u0159", textEvt.getArgs()[0] );
- // TODO this should be emitted as the same text event as well
textEvt = (SinkEventElement) it.next();
- assertEquals( "unknown", textEvt.getName() );
- assertEquals( "&#x1d7ed;", textEvt.getArgs()[0] );
+ assertEquals( "text", textEvt.getName() );
+ assertEquals( "\uD835\uDFED", (String) textEvt.getArgs()[0] );
textEvt = (SinkEventElement) it.next();
assertEquals( "text", textEvt.getName() );
@@ -406,10 +405,9 @@
assertEquals( "text", textEvt.getName() );
assertEquals( "\u0159", textEvt.getArgs()[0] );
- // TODO this should be emitted as the same text event as well
textEvt = (SinkEventElement) it.next();
- assertEquals( "unknown", textEvt.getName() );
- assertEquals( "&#x1d7ed;", textEvt.getArgs()[0] );
+ assertEquals( "text", textEvt.getName() );
+ assertEquals( "\uD835\uDFED", (String) textEvt.getArgs()[0] );
textEvt = (SinkEventElement) it.next();
assertEquals( "text", textEvt.getName() );
Modified:
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java
URL:
http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java?rev=776288&r1=776287&r2=776288&view=diff
==============================================================================
---
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java
(original)
+++
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java
Tue May 19 12:18:28 2009
@@ -19,6 +19,7 @@
* under the License.
*/
+import org.apache.commons.lang.StringEscapeUtils;
import org.codehaus.plexus.PlexusTestCase;
/**
@@ -46,6 +47,7 @@
// xml mode
assertEquals( HtmlTools.escapeHTML( "\u00e4", true ), "\u00e4" );
assertEquals( HtmlTools.escapeHTML( "\u00e4", false ), "&#xe4;" );
+ assertEquals( HtmlTools.escapeHTML( "\u0159", false ), "&#x159;" );
assertEquals( HtmlTools.escapeHTML( "\uD835\uDFED", false ),
"&#x1d7ed;" );
}
@@ -62,7 +64,20 @@
assertEquals( "\"", HtmlTools.unescapeHtml( "&quot;" ) );
assertEquals( "&", HtmlTools.unescapeHtml( "&amp;" ) );
assertEquals( "<Français>", HtmlTools.unescapeHtml(
"&lt;Fran&ccedil;ais&gt;" ) );
- assertEquals( "&#x12345;", HtmlTools.unescapeHtml( "&#x12345;" ) );
+ assertEquals( "\u0159", HtmlTools.unescapeHtml( "&#x159;" ) );
+ assertEquals( "\uD808\uDF45", HtmlTools.unescapeHtml( "&#x12345;" ) );
+ assertEquals( "\uD835\uDFED", HtmlTools.unescapeHtml( "&#x1d7ed;" ) );
+ assertEquals( "\uD808\uDF45\uD835\uDFED", HtmlTools.unescapeHtml(
"&#x12345;&#x1d7ed;" ) );
+
+ try
+ {
+ HtmlTools.unescapeHtml( "test &#x1d7ed test" );
+ assertTrue( false );
+ }
+ catch ( IllegalArgumentException e )
+ {
+ assertTrue( true );
+ }
}
/**