Author: stevencaswell Date: Sun Jun 26 09:56:36 2005 New Revision: 201875 URL: http://svn.apache.org/viewcvs?rev=201875&view=rev Log: (35366) Implementation of escape/unescapeHtml methods with Writer (http://issues.apache.org/bugzilla/show_bug.cgi?id=35366)
Modified: jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java jakarta/commons/proper/lang/trunk/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java Modified: jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java URL: http://svn.apache.org/viewcvs/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java?rev=201875&r1=201874&r2=201875&view=diff ============================================================================== --- jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java (original) +++ jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java Sun Jun 26 09:56:36 2005 @@ -15,6 +15,8 @@ */ package org.apache.commons.lang; +import java.io.IOException; +import java.io.Writer; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; @@ -219,7 +221,8 @@ {"real", "8476"}, //blackletter capital R = real part symbol,U+211C ISOamso --> {"trade", "8482"}, //trade mark sign, U+2122 ISOnum --> {"alefsym", "8501"}, //alef symbol = first transfinite cardinal,U+2135 NEW --> -// <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the same glyph could be used to depict both characters --> +// <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the +// same glyph could be used to depict both characters --> // <!-- Arrows --> {"larr", "8592"}, //leftwards arrow, U+2190 ISOnum --> {"uarr", "8593"}, //upwards arrow, U+2191 ISOnum--> @@ -228,10 +231,14 @@ {"harr", "8596"}, //left right arrow, U+2194 ISOamsa --> {"crarr", "8629"}, //downwards arrow with corner leftwards= carriage return, U+21B5 NEW --> {"lArr", "8656"}, //leftwards double arrow, U+21D0 ISOtech --> -// <!-- ISO 10646 does not say that lArr is the same as the 'is implied by' arrowbut also does not have any other character for that function. 
So ? lArr canbe used for 'is implied by' as ISOtech suggests --> +// <!-- ISO 10646 does not say that lArr is the same as the 'is implied by' +// arrow but also does not have any other character for that function. +// So ? lArr canbe used for 'is implied by' as ISOtech suggests --> {"uArr", "8657"}, //upwards double arrow, U+21D1 ISOamsa --> {"rArr", "8658"}, //rightwards double arrow,U+21D2 ISOtech --> -// <!-- ISO 10646 does not say this is the 'implies' character but does not have another character with this function so ?rArr can be used for 'implies' as ISOtech suggests --> +// <!-- ISO 10646 does not say this is the 'implies' character but does not +// have another character with this function so ?rArr can be used for +// 'implies' as ISOtech suggests --> {"dArr", "8659"}, //downwards double arrow, U+21D3 ISOamsa --> {"hArr", "8660"}, //left right double arrow,U+21D4 ISOamsa --> // <!-- Mathematical Operators --> @@ -245,9 +252,11 @@ {"ni", "8715"}, //contains as member, U+220B ISOtech --> // <!-- should there be a more memorable name than 'ni'? 
--> {"prod", "8719"}, //n-ary product = product sign,U+220F ISOamsb --> -// <!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both --> +// <!-- prod is NOT the same character as U+03A0 'greek capital letter pi' +// though the same glyph might be used for both --> {"sum", "8721"}, //n-ary summation, U+2211 ISOamsb --> -// <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both --> +// <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma' +// though the same glyph might be used for both --> {"minus", "8722"}, //minus sign, U+2212 ISOtech --> {"lowast", "8727"}, //asterisk operator, U+2217 ISOtech --> {"radic", "8730"}, //square root = radical sign,U+221A ISOtech --> @@ -261,7 +270,8 @@ {"int", "8747"}, //integral, U+222B ISOtech --> {"there4", "8756"}, //therefore, U+2234 ISOtech --> {"sim", "8764"}, //tilde operator = varies with = similar to,U+223C ISOtech --> -// <!-- tilde operator is NOT the same character as the tilde, U+007E,although the same glyph might be used to represent both --> +// <!-- tilde operator is NOT the same character as the tilde, U+007E,although +// the same glyph might be used to represent both --> {"cong", "8773"}, //approximately equal to, U+2245 ISOtech --> {"asymp", "8776"}, //almost equal to = asymptotic to,U+2248 ISOamsr --> {"ne", "8800"}, //not equal to, U+2260 ISOtech --> @@ -270,7 +280,10 @@ {"ge", "8805"}, //greater-than or equal to,U+2265 ISOtech --> {"sub", "8834"}, //subset of, U+2282 ISOtech --> {"sup", "8835"}, //superset of, U+2283 ISOtech --> -// <!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol font encoding and is not included. 
Should it be, for symmetry?It is in ISOamsn --> <!ENTITY nsub", "8836"}, //not a subset of, U+2284 ISOamsn --> +// <!-- note that nsup, 'not a superset of, U+2283' is not covered by the +// Symbol font encoding and is not included. Should it be, for symmetry? +// It is in ISOamsn --> <!ENTITY nsub", "8836"}, +// not a subset of, U+2284 ISOamsn --> {"sube", "8838"}, //subset of or equal to, U+2286 ISOtech --> {"supe", "8839"}, //superset of or equal to,U+2287 ISOtech --> {"oplus", "8853"}, //circled plus = direct sum,U+2295 ISOamsb --> @@ -286,7 +299,8 @@ {"lang", "9001"}, //left-pointing angle bracket = bra,U+2329 ISOtech --> // <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark' --> {"rang", "9002"}, //right-pointing angle bracket = ket,U+232A ISOtech --> -// <!-- rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark' --> +// <!-- rang is NOT the same character as U+003E 'greater than' or U+203A +// 'single right-pointing angle quotation mark' --> // <!-- Geometric Shapes --> {"loz", "9674"}, //lozenge, U+25CA ISOpub --> // <!-- Miscellaneous Symbols --> @@ -364,6 +378,11 @@ fillWithHtml40Entities(HTML40); } + /** + * <p>Fills the specified entities instance with HTML 40 entities.</p> + * + * @param entities the instance to be filled. 
+ */ static void fillWithHtml40Entities(Entities entities) { entities.addEntities(BASIC_ARRAY); entities.addEntities(ISO8859_1_ARRAY); @@ -371,10 +390,28 @@ } static interface EntityMap { + /** + * <p>Add an entry to this entity map.</p> + * + * @param name the entity name + * @param value the entity value + */ void add(String name, int value); + /** + * <p>Returns the name of the entity identified by the specified value.</p> + * + * @param value the value to locate + * @return entity name associated with the specified value + */ String name(int value); + /** + * <p>Returns the value of the entity identified by the specified name.</p> + * + * @param name the name to locate + * @return entity value associated with the specified name + */ int value(String name); } @@ -382,15 +419,24 @@ private Map mapNameToValue = new HashMap(); private IntHashMap mapValueToName = new IntHashMap(); + /** + * {@inheritDoc} + */ public void add(String name, int value) { mapNameToValue.put(name, new Integer(value)); mapValueToName.put(value, name); } + /** + * {@inheritDoc} + */ public String name(int value) { return (String) mapValueToName.get(value); } + /** + * {@inheritDoc} + */ public int value(String name) { Object value = mapNameToValue.get(name); if (value == null) { @@ -405,15 +451,24 @@ protected Map mapNameToValue; protected Map mapValueToName; + /** + * {@inheritDoc} + */ public void add(String name, int value) { mapNameToValue.put(name, new Integer(value)); mapValueToName.put(new Integer(value), name); } + /** + * {@inheritDoc} + */ public String name(int value) { return (String) mapValueToName.get(new Integer(value)); } + /** + * {@inheritDoc} + */ public int value(String name) { Object value = mapNameToValue.get(name); if (value == null) { @@ -424,6 +479,9 @@ } static class HashEntityMap extends MapIntMap { + /** + * Constructs a new instance of <code>HashEntityMap</code>. 
+ */ public HashEntityMap() { mapNameToValue = new HashMap(); mapValueToName = new HashMap(); @@ -431,6 +489,9 @@ } static class TreeEntityMap extends MapIntMap { + /** + * Constructs a new instance of <code>TreeEntityMap</code>. + */ public TreeEntityMap() { mapNameToValue = new TreeMap(); mapValueToName = new TreeMap(); @@ -441,6 +502,9 @@ private String[] lookupTable; private int LOOKUP_TABLE_SIZE = 256; + /** + * {@inheritDoc} + */ public String name(int value) { if (value < LOOKUP_TABLE_SIZE) { return lookupTable()[value]; @@ -448,6 +512,12 @@ return super.name(value); } + /** + * <p>Returns the lookup table for this entity map. The lookup table is created if it has not been + * previously.</p> + * + * @return the lookup table + */ private String[] lookupTable() { if (lookupTable == null) { createLookupTable(); @@ -455,6 +525,9 @@ return lookupTable; } + /** + * <p>Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names.</p> + */ private void createLookupTable() { lookupTable = new String[LOOKUP_TABLE_SIZE]; for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) { @@ -469,17 +542,29 @@ protected String[] names; protected int[] values; + /** + * Constructs a new instance of <code>ArrayEntityMap</code>. + */ public ArrayEntityMap() { names = new String[growBy]; values = new int[growBy]; } + /** + * Constructs a new instance of <code>ArrayEntityMap</code> + * specifying the size by which the array should grow. + * + * @param growBy array will be initialized to and will grow by this amount + */ public ArrayEntityMap(int growBy) { this.growBy = growBy; names = new String[growBy]; values = new int[growBy]; } + /** + * {@inheritDoc} + */ public void add(String name, int value) { ensureCapacity(size + 1); names[size] = name; @@ -487,6 +572,11 @@ size++; } + /** + * Verifies the capacity of the entity array, adjusting the size if necessary. 
+ * + * @param capacity size the array should be + */ protected void ensureCapacity(int capacity) { if (capacity > names.length) { int newSize = Math.max(capacity, size + growBy); @@ -499,6 +589,9 @@ } } + /** + * {@inheritDoc} + */ public String name(int value) { for (int i = 0; i < size; ++i) { if (values[i] == value) { @@ -508,6 +601,9 @@ return null; } + /** + * {@inheritDoc} + */ public int value(String name) { for (int i = 0; i < size; ++i) { if (names[i].equals(name)) { @@ -520,14 +616,30 @@ static class BinaryEntityMap extends ArrayEntityMap { + /** + * Constructs a new instance of <code>BinaryEntityMap</code>. + */ public BinaryEntityMap() { + ; // empty constructor } + /** + * Constructs a new instance of <code>ArrayEntityMap</code> + * specifying the size by which the underlying array should grow. + * + * @param growBy array will be initialized to and will grow by this amount + */ public BinaryEntityMap(int growBy) { super(growBy); } - // based on code in java.util.Arrays + /** + * Performs a binary search of the entity array for the specified key. + * This method is based on code in {@link java.util.Arrays}. + * + * @param key the key to be found + * @return the index of the entity array matching the specified key + */ private int binarySearch(int key) { int low = 0; int high = size - 1; @@ -547,6 +659,9 @@ return -(low + 1); // key not found. 
} + /** + * {@inheritDoc} + */ public void add(String name, int value) { ensureCapacity(size + 1); int insertAt = binarySearch(value); @@ -561,6 +676,9 @@ size++; } + /** + * {@inheritDoc} + */ public String name(int value) { int index = binarySearch(value); if (index < 0) { @@ -573,21 +691,43 @@ // package scoped for testing EntityMap map = new Entities.LookupEntityMap(); + /** + * <p>Adds entities to this entity.</p> + * + * @param entityArray array of entities to be added + */ public void addEntities(String[][] entityArray) { for (int i = 0; i < entityArray.length; ++i) { addEntity(entityArray[i][0], Integer.parseInt(entityArray[i][1])); } } + /** + * <p>Add an entity to this entity.</p> + * + * @param name name of the entity + * @param value vale of the entity + */ public void addEntity(String name, int value) { map.add(name, value); } + /** + * <p>Returns the name of the entity identified by the specified value.</p> + * + * @param value the value to locate + * @return entity name associated with the specified value + */ public String entityName(int value) { return map.name(value); } - + /** + * <p>Returns the value of the entity identified by the specified name.</p> + * + * @param name the name to locate + * @return entity value associated with the specified name + */ public int entityValue(String name) { return map.value(name); } @@ -627,6 +767,40 @@ } /** + * <p>Escapes the characters in the <code>String</code> passed and writes the result + * to the <code>Writer</code> passed. </p> + * + * @param writer The <code>Writer</code> to write the results of the escaping to. + * Assumed to be a non-null value. + * @param str The <code>String</code> to escape. Assumed to be a non-null value. + * @throws IOException when <code>Writer</code> passed throws the exception from + * calls to the {@link Writer#write(int)} methods. 
+ * + * @see #escape(String) + * @see Writer + */ + public void escape(Writer writer, String str) throws IOException { + int len = str.length(); + for (int i = 0; i < len; i++) { + char c = str.charAt(i); + String entityName = this.entityName(c); + if (entityName == null) { + if (c > 0x7F) { + writer.write("&#"); + writer.write(Integer.toString(c, 10)); + writer.write(';'); + } else { + writer.write(c); + } + } else { + writer.write('&'); + writer.write(entityName); + writer.write(';'); + } + } + } + + /** * <p>Unescapes the entities in a <code>String</code>.</p> * * <p>For example, if you have called addEntity("foo", 0xA1), @@ -683,4 +857,69 @@ return buf.toString(); } + /** + * <p>Unescapes the escaped entities in the <code>String</code> passed and + * writes the result to the <code>Writer</code> passed.</p> + * + * @param writer The <code>Writer</code> to write the results to; assumed to be non-null. + * @param string The <code>String</code> to write the results to; assumed to be non-null. + * @throws IOException when <code>Writer</code> passed throws the exception from + * calls to the {@link Writer#write(int)} methods. 
+ * + * @see #escape(String) + * @see Writer + */ + public void unescape(Writer writer, String string) throws IOException { + int len = string.length(); + if (len == 0) { + return; + } + for (int i = 0; i < len; i++) { + char c = string.charAt(i); + if (c == '&') { + int nextIdx = i+1; + int semiColonIdx = string.indexOf(';', nextIdx); + if (semiColonIdx == -1) { + writer.write(c); + continue; + } + String entityContent = string.substring(nextIdx, semiColonIdx); + int entityValue = -1; + int entityContentLen = entityContent.length(); + if (entityContentLen > 0) { + if (entityContent.charAt(0) == '#') { //escaped value content is an integer (decimal or hexidecimal) + if (entityContentLen > 1) { + char isHexChar = entityContent.charAt(1); + try { + switch (isHexChar) { + case 'X' : + case 'x' : { + entityValue = Integer.parseInt(entityContent.substring(2), 16); + } + default : { + entityValue = Integer.parseInt(entityContent.substring(1), 10); + } + } + } catch (NumberFormatException e) { + } + } + } else { //escaped value content is an entity name + entityValue = this.entityValue(entityContent); + } + } + + if (entityValue == -1) { + writer.write('&'); + writer.write(entityContent); + writer.write(';'); + } else { + writer.write(entityValue); + } + i = semiColonIdx; //move index up to the semi-colon + } else { + writer.write(c); + } + } + } + } Modified: jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java URL: http://svn.apache.org/viewcvs/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=201875&r1=201874&r2=201875&view=diff ============================================================================== --- jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java (original) +++ jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java Sun Jun 26 09:56:36 2005 @@ -414,19 +414,66 @@ * @return a new escaped 
<code>String</code>, <code>null</code> if null string input * * @see #unescapeHtml(String) - * @see </br><a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a> - * @see </br><a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a> - * @see </br><a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a> - * @see </br><a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a> - * @see </br><a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a> + * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a> + * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a> + * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a> + * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a> + * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a> **/ public static String escapeHtml(String str) { if (str == null) { return null; } - //todo: add a version that takes a Writer - //todo: rewrite underlying method to use a Writer instead of a StringBuffer - return Entities.HTML40.escape(str); + + try { + StringPrintWriter writer = new StringPrintWriter ((int)(str.length() * 1.5)); + escapeHtml(writer, str); + return writer.getString(); + } catch (IOException e) { + //assert false; + //should be impossible + e.printStackTrace(); + return null; + } + } + + /** + * <p>Escapes the characters in a <code>String</code> using HTML entities and writes + * them to a <code>Writer</code>.</p> + * + * <p> + * For example: + * </p> + * <code>"bread" & "butter"</code> + * <p>becomes:</p> + * <code>&quot;bread&quot; &amp; &quot;butter&quot;</code>. 
+ * + * <p>Supports all known HTML 4.0 entities, including funky accents.</p> + * + * @param writer The <code>Writer</code> to write the result to. This must not be <code>null</code>. + * @param string The <code>String</code> to escape. This may be <code>null</code>. + * + * @throws IOException when <code>Writer</code> passed throws the exception from + * calls to the {@link Writer#write(int)} methods. + * + * @see #escapeHtml(String) + * @see #unescapeHtml(String) + * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a> + * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a> + * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a> + * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a> + * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a> + */ + public static void escapeHtml(Writer writer, String string) throws IOException { + if (writer == null ) { + throw new IllegalArgumentException ("The Writer must not be null."); + } + + if (string == null) { + return; + } + + Entities.HTML40.escape(writer, string); } /** @@ -449,7 +496,29 @@ if (str == null) { return null; } - return Entities.HTML40.unescape(str); + + try { + StringPrintWriter writer = new StringPrintWriter ((int)(str.length() * 1.5)); + unescapeHtml(writer, str); + return writer.getString(); + } catch (IOException e) { + //assert false; + //should be impossible + e.printStackTrace(); + return null; + } + } + + public static void unescapeHtml(Writer writer, String string) throws IOException { + if (writer == null ) { + throw new IllegalArgumentException ("The Writer must not be null."); + } + + if (string == null) { + return; + } + + Entities.HTML40.unescape(writer, string); } /** Modified: 
jakarta/commons/proper/lang/trunk/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java URL: http://svn.apache.org/viewcvs/jakarta/commons/proper/lang/trunk/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java?rev=201875&r1=201874&r2=201875&view=diff ============================================================================== --- jakarta/commons/proper/lang/trunk/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java (original) +++ jakarta/commons/proper/lang/trunk/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java Sun Jun 26 09:56:36 2005 @@ -210,17 +210,30 @@ String expected = htmlEscapes[i][1]; String original = htmlEscapes[i][2]; assertEquals(message, expected, StringEscapeUtils.escapeHtml(original)); - // todo: add test for (and implement) Writer-based version, something like this: -// StringPrintWriter sw = new StringPrintWriter(); -// StringEscapeUtils.escapeHtml(sw, original); -// assertEquals(expected, sw.getString()); + StringPrintWriter sw = new StringPrintWriter(); + try { + StringEscapeUtils.escapeHtml(sw, original); + } catch (IOException e) { + } + String actual = original == null ? null : sw.getString(); + assertEquals(message, expected, actual); } } public void testUnescapeHtml() { for (int i = 0; i < htmlEscapes.length; ++i) { - assertEquals(htmlEscapes[i][0], htmlEscapes[i][2], StringEscapeUtils.unescapeHtml(htmlEscapes[i][1])); - // todo: add test for (and implement) Writer-based version + String message = htmlEscapes[i][0]; + String expected = htmlEscapes[i][2]; + String original = htmlEscapes[i][1]; + assertEquals(message, expected, StringEscapeUtils.unescapeHtml(original)); + + StringPrintWriter sw = new StringPrintWriter(); + try { + StringEscapeUtils.unescapeHtml(sw, original); + } catch (IOException e) { + } + String actual = original == null ? 
null : sw.getString(); + assertEquals(message, expected, actual); } // \u00E7 is a cedilla (c with wiggle under) // note that the test string must be 7-bit-clean (unicode escaped) or else it will compile incorrectly --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]