On 05/27/2014 10:46 AM, huizhe wang wrote:
Hi,
Are you okay with the updated patch?
Thanks,
Joe
looks fine for me.
Btw, I took a quick look at the UTF8 reader, and my observation
suggests that reading byte by byte
from the underlying stream is probably the bottleneck of the overall
"parsing". Attached
is a buffered version; my simple test (just the parsing, using the
default handler, which does nothing)
indicates it might double the parsing speed. Sure, the overall
performance will depend on
the individual handler, but it might be worth considering, any second
counts :-) The code is
not fully tested though, just for your reference.
-Sherman
package jdk.internal.util.xml.impl;
import java.io.Reader;
import java.io.InputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
/**
* UTF-8 transformed UCS-2 character stream reader.
*
* This reader converts UTF-8 transformed UCS-2 characters to Java
* characters.
* The UCS-2 subset of UTF-8 transformation is described in RFC-2279 #2
* "UTF-8 definition":
* 0000 0000-0000 007F 0xxxxxxx
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
*
* This reader will return an incorrect last character on a broken UTF-8
* stream.
*/
public class ReaderUTF8 extends Reader {

    /** Size, in bytes, of the internal read-ahead buffer. */
    private static final int DEFAULT_BUFFER_SIZE = 8192;

    /** Underlying byte stream supplying the UTF-8 encoded data. */
    private final InputStream is;

    /** Read-ahead buffer; valid bytes occupy the range [pos, limit). */
    private final byte[] buf;

    /** Index in {@code buf} of the next byte to consume. */
    private int pos;

    /** Index in {@code buf} one past the last valid byte. */
    private int limit;

    /**
     * Constructor.
     *
     * @param is A byte input stream.
     */
    public ReaderUTF8(InputStream is) {
        this.is = is;
        this.buf = new byte[DEFAULT_BUFFER_SIZE];
    }

    /**
     * Refills the buffer from the underlying stream. Must only be called
     * when every buffered byte has been consumed (pos >= limit), because
     * the buffer is overwritten from index 0.
     *
     * @return The value returned by the underlying bulk read: the number
     *         of bytes now buffered, or a negative value at end of stream.
     * @exception IOException If any IO errors occur.
     */
    private int fill() throws IOException {
        pos = 0;
        limit = 0;
        int n = is.read(buf, 0, buf.length);
        if (n > 0) {
            limit = n;
        }
        return n;
    }

    /**
     * Returns the next raw byte of the stream, refilling the buffer when
     * it is exhausted.
     *
     * @return The next byte as an integer in the range 0 to 255, or -1 if
     *         the end of the stream has been reached.
     * @exception IOException If any IO errors occur.
     */
    private int nextByte() throws IOException {
        if (pos >= limit) {
            int n = fill();
            if (n < 0) {
                return -1;
            }
            if (n == 0) {
                // The bulk read delivered nothing without signalling end of
                // stream; fall back to the single-byte read, which yields
                // either one byte or -1.
                return is.read();
            }
        }
        return buf[pos++] & 0xff;
    }

    /**
     * Decodes one character whose first byte has already been consumed,
     * pulling any continuation bytes through {@link #nextByte()}.
     *
     * A missing continuation byte (end of stream in the middle of a
     * sequence) is not reported; its six payload bits are taken from the
     * -1 end-of-stream marker, so a truncated stream yields an incorrect
     * last character.
     *
     * @param lead The first byte of the sequence (0 to 255).
     * @return The decoded character as an integer in the range 0 to 65535.
     * @exception IOException If any IO errors occur.
     * @exception UnsupportedEncodingException If the lead byte starts a
     *            sequence of four or more bytes (UCS-4 character).
     */
    private int decode(int lead) throws IOException {
        switch (lead & 0xf0) {
            case 0xc0:
            case 0xd0:
                // 110xxxxx 10xxxxxx
                return ((lead & 0x1f) << 6) | (nextByte() & 0x3f);
            case 0xe0: {
                // 1110xxxx 10xxxxxx 10xxxxxx
                int val = (lead & 0x0f) << 12;
                val |= (nextByte() & 0x3f) << 6;
                val |= nextByte() & 0x3f;
                return val;
            }
            case 0xf0: // UCS-4 character
                throw new UnsupportedEncodingException(
                        "UTF-32 (or UCS-4) encoding not supported.");
            default:
                // 0xxxxxxx, or a stray continuation byte passed through as is.
                return lead;
        }
    }

    /**
     * Reads characters into a portion of an array. Keeps reading from the
     * underlying stream until {@code len} characters have been stored or
     * the end of the stream is reached.
     *
     * @param cbuf Destination buffer.
     * @param off Offset at which to start storing characters.
     * @param len Maximum number of characters to read.
     * @return The number of characters stored, or -1 if the end of the
     *         stream was reached before any character could be stored.
     * @exception IOException If any IO errors occur.
     * @exception UnsupportedEncodingException If UCS-4 character
     *            occur in the stream.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        final int off0 = off;
        final int end = off + len;
        while (off < end) {
            // Fast path: a run of ASCII bytes (non-negative as signed bytes)
            // is copied straight out of the buffer without decoding.
            while (off < end && pos < limit && buf[pos] >= 0) {
                cbuf[off++] = (char) buf[pos++];
            }
            if (off >= end) {
                break;
            }
            int lead = nextByte();
            if (lead < 0) {
                return (off != off0) ? off - off0 : -1;
            }
            cbuf[off++] = (char) decode(lead);
        }
        return off - off0;
    }

    /**
     * Reads a single character.
     *
     * @return The character read, as an integer in the range 0 to 65535
     *         (0x00-0xffff), or -1 if the end of the stream has been reached.
     * @exception IOException If any IO errors occur.
     * @exception UnsupportedEncodingException If UCS-4 character
     *            occur in the stream.
     */
    @Override
    public int read() throws IOException {
        int lead = nextByte();
        if (lead < 0) {
            // End of stream must be reported before decoding: -1 & 0xf0 is
            // 0xf0 and would otherwise be mistaken for a UCS-4 lead byte.
            return -1;
        }
        return decode(lead);
    }

    /**
     * Closes the stream.
     *
     * @exception IOException If any IO errors occur.
     */
    @Override
    public void close() throws IOException {
        is.close();
    }
}