dlr 2005/05/16 14:23:21
Modified: src/java/org/apache/xmlrpc XmlRpc.java XmlWriter.java
src/test/org/apache/xmlrpc XmlWriterTest.java
Log:
Significant changes to handling of character encodings, both character
set and XML related, to support the widest range of interoperability
while still functioning correctly.
* src/java/org/apache/xmlrpc/XmlWriter.java
Updated header doc with a link to Tim Bray's annotated XML spec
(recommended by John Wilson).
(PROLOG_START, PROLOG_END): Removed encoding portion. XML parsers
should assume a Unicode encoding (which will work even if we wrote
ASCII, since it is a subset).
(ISO8859_1, UTF8): Reduced visibility from protected to
package-private. Corrected typo in JavaDoc.
(UTF16): New constant for the string "UTF-16".
(encodings): Added URL to JavaDoc.
(hasWrittenProlog): Added flag indicating whether or not the XML
prolog has been written by this Writer instance.
(XmlWriter): No longer writes XML prolog on instantiation; that is
now delayed until write() is called explicitly. As such, removed
throws decl for IOException (which will be backwards compatible,
since UnsupportedEncodingException sub-classes IOException).
Output encoding is now forced to UTF-8, if specified encoding is
non-Unicode. Since XML parsers are required to support UTF-8 and
UTF-16, this should be seamless from a caller's perspective
(especially since ASCII is a subset of UTF-8).
(write): New overload which writes the XML prolog lazily.
(writeCharacterReference): New helper method used to write XML
character references for single characters (e.g. '\r' as "&#13;").
(chardata): Removed unused local variables enc and isUnicode. Write
carriage returns as XML character references (as recommended by
John Wilson). Write characters not valid in XML using character
references.
(isValidXMLChar): Helper function capturing the set of characters
known to be valid XML.
* src/test/org/apache/xmlrpc/XmlWriterTest.java
(buffer, writer): Instance fields used by all tests.
(setUp): Initialize buffer to an empty ByteArrayOutputStream.
(testForceAlternateEncoding): New test which assures that
non-Unicode output encodings are forced to UTF-8 by XmlWriter.
(testBasicResults): Renamed from testWriter to provide a somewhat
less generic name. Added description messages for assertion
failures. Added tests for Boolean.
(testWriteCharacterReference): New test for writing characters as
XML character references.
* src/java/org/apache/xmlrpc/XmlRpc.java
(encoding): Changed default output encoding from ISO-8859-1 to
UTF-8.
Target release: 2.0
Revision Changes Path
1.42 +4 -3 ws-xmlrpc/src/java/org/apache/xmlrpc/XmlRpc.java
Index: XmlRpc.java
===================================================================
RCS file: /home/cvs/ws-xmlrpc/src/java/org/apache/xmlrpc/XmlRpc.java,v
retrieving revision 1.41
retrieving revision 1.42
diff -u -u -r1.41 -r1.42
--- XmlRpc.java 28 Apr 2005 21:26:38 -0000 1.41
+++ XmlRpc.java 16 May 2005 21:23:21 -0000 1.42
@@ -147,9 +147,10 @@
/**
* Java's name for the encoding we're using. Defaults to
- * <code>ISO8859_1</code>.
+ * <code>UTF8</code> (of which <code>ISO8859_1</code> is a
+ * subset).
*/
- static String encoding = XmlWriter.ISO8859_1;
+ static String encoding = XmlWriter.UTF8;
/**
* Java's name for the input encoding we're using. Defaults to
1.14 +117 -50 ws-xmlrpc/src/java/org/apache/xmlrpc/XmlWriter.java
Index: XmlWriter.java
===================================================================
RCS file: /home/cvs/ws-xmlrpc/src/java/org/apache/xmlrpc/XmlWriter.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -u -r1.13 -r1.14
--- XmlWriter.java 2 May 2005 04:22:21 -0000 1.13
+++ XmlWriter.java 16 May 2005 21:23:21 -0000 1.14
@@ -32,7 +32,7 @@
import org.apache.commons.codec.EncoderException;
/**
- * A quick and dirty XML writer. If you feed it a
+ * A XML writer intended for single-thread usage. If you feed it a
* <code>ByteArrayInputStream</code>, it may be necessary to call
* <code>writer.flush()</code> before calling
* <code>buffer.toByteArray()</code> to get the data written to
@@ -40,12 +40,13 @@
*
* @author <a href="mailto:[EMAIL PROTECTED]">Hannes Wallnoefer</a>
* @author Daniel L. Rall
+ * @see <a href="http://www.xml.com/axml/testaxml.htm">Tim Bray's
+ * Annotated XML Spec</a>
*/
class XmlWriter extends OutputStreamWriter
{
// Various XML pieces.
- protected static final String PROLOG_START =
- "<?xml version=\"1.0\" encoding=\"";
+ protected static final String PROLOG_START = "<?xml version=\"1.0";
protected static final String PROLOG_END = "\"?>";
protected static final String CLOSING_TAG_START = "</";
protected static final String SINGLE_TAG_END = "/>";
@@ -54,14 +55,19 @@
protected static final String AMPERSAND_ENTITY = "&amp;";
/**
- * Java's name for the the ISO8859_1 encoding.
+ * Java's name for the ISO-8859-1 encoding.
*/
- protected static final String ISO8859_1 = "ISO8859_1";
+ static final String ISO8859_1 = "ISO8859_1";
/**
- * Java's name for the the UTF8 encoding.
+ * Java's name for the UTF-8 encoding.
*/
- protected static final String UTF8 = "UTF8";
+ static final String UTF8 = "UTF8";
+
+ /**
+ * Java's name for the UTF-16 encoding.
+ */
+ static final String UTF16 = "UTF-16";
protected static final Base64 base64Codec = new Base64();
@@ -73,6 +79,8 @@
/**
* Mapping between Java encoding names and "real" names used in
* XML prolog.
+ *
+ * @see <a
href="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Java
character set names</a>
*/
private static Properties encodings = new Properties();
@@ -90,23 +98,40 @@
private static DateTool dateTool = new DateTool();
/**
+ * Whether the XML prolog has been written.
+ */
+ boolean hasWrittenProlog = false;
+
+ /**
* Creates a new instance.
*
* @param out The stream to write output to.
- * @param enc The encoding to using for outputing XML.
- * @throws UnsupportedEncodingException Encoding unrecognized.
- * @throws IOException Problem writing.
+ * @param enc The encoding to using for outputing XML. Only UTF-8
+ * and UTF-16 are supported. If another encoding is specified,
+ * UTF-8 will be used instead for widest XML parser
+ * interoperability.
+ * @exception UnsupportedEncodingException Since unsupported
+ * encodings are internally converted to UTF-8, this should only
+ * be seen as the result of an internal error.
*/
public XmlWriter(OutputStream out, String enc)
- throws UnsupportedEncodingException, IOException
+ throws UnsupportedEncodingException
{
// Super-class wants the Java form of the encoding.
- super(out, enc);
+ super(out, forceUnicode(enc));
+ }
- // Add the XML prolog (including the encoding in XML form).
- write(PROLOG_START);
- write(canonicalizeEncoding(enc));
- write(PROLOG_END);
+ /**
+ * @param encoding A caller-specified encoding.
+ * @return An Unicode encoding.
+ */
+ private static String forceUnicode(String encoding)
+ {
+ if (encoding == null || !encoding.toUpperCase().startsWith("UTF"))
+ {
+ encoding = UTF8;
+ }
+ return encoding;
}
/**
@@ -116,6 +141,8 @@
* @param javaEncoding The name of the encoding as known by Java.
* @return The XML encoding (if a mapping is available);
* otherwise, the encoding as provided.
+ *
+ * @deprecated This method will not be visible in 2.0.
*/
protected static String canonicalizeEncoding(String javaEncoding)
{
@@ -123,6 +150,25 @@
}
/**
+ * A mostly pass-through implementation wrapping
+ * <code>OutputStreamWriter.write()</code> which assures that the
+ * XML prolog is written before any other data.
+ *
+ * @see java.io.OutputStreamWriter.write(char[], int, int)
+ */
+ public void write(char[] cbuf, int off, int len)
+ throws IOException
+ {
+ if (!hasWrittenProlog)
+ {
+ super.write(PROLOG_START, 0, PROLOG_START.length());
+ super.write(PROLOG_END, 0, PROLOG_END.length());
+ hasWrittenProlog = true;
+ }
+ super.write(cbuf, off, len);
+ }
+
+ /**
* Writes the XML representation of a supported Java object type.
*
* @param obj The <code>Object</code> to write.
@@ -246,6 +292,17 @@
}
/**
+ * Writes characters like '\r' (0xd) as "&#13;".
+ */
+ private void writeCharacterReference(char c)
+ throws IOException
+ {
+ write("&#");
+ write(String.valueOf((int) c));
+ write(';');
+ }
+
+ /**
*
* @param elem
* @throws IOException
@@ -292,8 +349,6 @@
throws XmlRpcException, IOException
{
int l = text.length ();
- String enc = super.getEncoding();
- boolean isUnicode = UTF8.equals(enc) || "UTF-16".equals(enc);
// ### TODO: Use a buffer rather than going character by
// ### character to scale better for large text sizes.
//char[] buf = new char[32];
@@ -303,10 +358,13 @@
switch (c)
{
case '\t':
- case '\r':
case '\n':
write(c);
break;
+ case '\r':
+ // Avoid normalization of CR to LF.
+ writeCharacterReference(c);
+ break;
case '<':
write(LESS_THAN_ENTITY);
break;
@@ -317,38 +375,18 @@
write(AMPERSAND_ENTITY);
break;
default:
- if (c < 0x20 || c > 0x7f)
+ // Though the XML spec requires XML parsers to support
+ // Unicode, not all such code points are valid in XML
+ // documents. Additionally, previous to 2003-06-30
+ // the XML-RPC spec only allowed ASCII data (in
+ // <string> elements). For interoperability with
+ // clients rigidly conforming to the pre-2003 version
+ // of the XML-RPC spec, we entity encode characters
+ // outside of the valid range for ASCII, too.
+ if (c > 0x7f || !isValidXMLChar(c))
{
- // Though the XML-RPC spec allows any ASCII
- // characters except '<' and '&', the XML spec
- // does not allow this range of characters,
- // resulting in a parse error from most XML
- // parsers. However, the XML spec does require
- // XML parsers to support UTF-8 and UTF-16.
- if (isUnicode)
- {
- if (c < 0x20)
- {
- // Entity escape the character.
- write("&#");
- // ### Do we really need the String conversion?
- write(String.valueOf((int) c));
- write(';');
- }
- else // c > 0x7f
- {
- // Write the character in our encoding.
- write(new
String(String.valueOf(c).getBytes(enc)));
- }
- }
- else
- {
- throw new XmlRpcException(0, "Invalid character data
"
- + "corresponding to XML "
- + "entity &#"
- + String.valueOf((int) c)
- + ';');
- }
+ // Replace the code point with a character reference.
+ writeCharacterReference(c);
}
else
{
@@ -358,6 +396,35 @@
}
}
+ /**
+ * Section 2.2 of the XML spec describes which Unicode code points
+ * are valid in XML:
+ *
+ * <blockquote><code>#x9 | #xA | #xD | [#x20-#xD7FF] |
+ * [#xE000-#xFFFD] | [#x10000-#x10FFFF]</code></blockquote>
+ *
+ * Code points outside this set must be entity encoded to be
+ * represented in XML.
+ *
+ * @param c The character to inspect.
+ * @return Whether the specified character is valid in XML.
+ */
+ private static final boolean isValidXMLChar(char c)
+ {
+ switch (c)
+ {
+ case 0x9:
+ case 0xa: // line feed, '\n'
+ case 0xd: // carriage return, '\r'
+ return true;
+
+ default:
+ return ( (0x20 < c && c <= 0xd7ff) ||
+ (0xe000 < c && c <= 0xfffd) ||
+ (0x10000 < c && c <= 0x10ffff) );
+ }
+ }
+
protected static void setTypeDecoder(TypeDecoder newTypeDecoder)
{
typeDecoder = newTypeDecoder;
1.11 +64 -10 ws-xmlrpc/src/test/org/apache/xmlrpc/XmlWriterTest.java
Index: XmlWriterTest.java
===================================================================
RCS file: /home/cvs/ws-xmlrpc/src/test/org/apache/xmlrpc/XmlWriterTest.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -u -r1.10 -r1.11
--- XmlWriterTest.java 10 May 2005 18:58:45 -0000 1.10
+++ XmlWriterTest.java 16 May 2005 21:23:21 -0000 1.11
@@ -33,6 +33,9 @@
public class XmlWriterTest
extends TestCase
{
+ private ByteArrayOutputStream buffer;
+ private XmlWriter writer;
+
/**
* Constructor
*/
@@ -55,6 +58,7 @@
public void setUp()
{
XmlRpc.setDebug(true);
+ buffer = new ByteArrayOutputStream();
}
/**
@@ -65,27 +69,63 @@
XmlRpc.setDebug(false);
}
- public void testWriter()
+ public void testForceAlternateEncoding()
+ throws Exception
+ {
+ writer = new XmlWriter(buffer, null);
+ assertEquals("null should be forced to UTF-8",
+ XmlWriter.UTF8, writer.getEncoding());
+
+ writer = new XmlWriter(buffer, XmlWriter.ISO8859_1);
+ assertEquals(XmlWriter.ISO8859_1 + " should be forced to " +
+ XmlWriter.UTF8, XmlWriter.UTF8, writer.getEncoding());
+
+ writer = new XmlWriter(buffer, "ISO8859_15");
+ assertEquals("ISO8859_15 should be forced to " + XmlWriter.UTF8,
+ XmlWriter.UTF8, writer.getEncoding());
+
+ writer = new XmlWriter(buffer, "EUC_JP");
+ assertEquals("EUC_JP should be forced to " + XmlWriter.UTF8,
+ XmlWriter.UTF8, writer.getEncoding());
+
+ writer = new XmlWriter(buffer, XmlWriter.UTF16);
+ assertEquals(XmlWriter.UTF16 + " should remain " + XmlWriter.UTF16,
+ XmlWriter.UTF16, writer.getEncoding());
+ }
+
+ public void testBasicResults()
throws Exception
{
try
{
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- XmlWriter writer = new XmlWriter(buffer, XmlWriter.ISO8859_1);
- assertTrue(writer.getEncoding().equals(XmlRpc.encoding));
+ writer = new XmlWriter(buffer, XmlWriter.UTF8);
+
+ writer.write(new char[0], 0, 0);
+ writer.flush();
+ assertEquals("Unexpected or missing XML prolog",
+ XmlWriter.PROLOG_START + XmlWriter.PROLOG_END,
+ buffer.toString());
String foobar = "foobar";
writer.writeObject(foobar);
writer.flush();
- //System.err.println("buffer=" + new
String(buffer.toByteArray()));
String postProlog = "<value>" + foobar + "</value>";
- assertTrue(buffer.toString().endsWith(postProlog));
+ assertTrue("Unexpected results from writing of String",
+ buffer.toString().endsWith(postProlog));
Integer thirtySeven = new Integer(37);
writer.writeObject(thirtySeven);
writer.flush();
postProlog += "<value><int>" + thirtySeven + "</int></value>";
- assertTrue(buffer.toString().endsWith(postProlog));
+ assertTrue("Unexpected results from writing of Integer",
+ buffer.toString().endsWith(postProlog));
+
+ Boolean flag = Boolean.TRUE;
+ writer.writeObject(flag);
+ writer.flush();
+ postProlog += "<value><boolean>1</boolean></value>";
+ assertTrue("Unexpected results from writing of Boolean",
+ buffer.toString().endsWith(postProlog));
Object[] array = { foobar, thirtySeven };
writer.writeObject(array);
@@ -94,7 +134,8 @@
postProlog += "<value>" + foobar + "</value>";
postProlog += "<value><int>" + thirtySeven + "</int></value>";
postProlog += "</data></array></value>";
- assertTrue(buffer.toString().endsWith(postProlog));
+ assertTrue("Unexpected results from writing of Object[]",
+ buffer.toString().endsWith(postProlog));
Hashtable map = new Hashtable();
map.put(foobar, thirtySeven);
@@ -104,7 +145,8 @@
postProlog += "<name>" + foobar + "</name>";
postProlog += "<value><int>" + thirtySeven + "</int></value>";
postProlog += "</member></struct></value>";
- assertTrue(buffer.toString().endsWith(postProlog));
+ assertTrue("Unexpected results from writing of Hashtable",
+ buffer.toString().endsWith(postProlog));
}
catch (Exception e)
{
@@ -112,4 +154,16 @@
fail(e.getMessage());
}
}
+
+ public void testWriteCharacterReference()
+ throws Exception
+ {
+ writer = new XmlWriter(buffer, null);
+ writer.hasWrittenProlog = true;
+ writer.writeObject(String.valueOf((char) 0x80));
+ writer.flush();
+ String postProlog = "<value>&#128;</value>";
+ assertTrue("Character reference not created as expected",
+ buffer.toString().endsWith(postProlog));
+ }
}