DO NOT REPLY TO THIS EMAIL, BUT PLEASE POST YOUR BUG RELATED COMMENTS THROUGH THE WEB INTERFACE AVAILABLE AT <http://nagoya.apache.org/bugzilla/show_bug.cgi?id=16307>. ANY REPLY MADE TO THIS MESSAGE WILL NOT BE COLLECTED AND INSERTED IN THE BUG DATABASE.
http://nagoya.apache.org/bugzilla/show_bug.cgi?id=16307 Invalid byte 1 of 1-byte UTF-8 sequence - error for seemingly valid unicode characters (UTF-8) ... Summary: Invalid byte 1 of 1-byte UTF-8 sequence - error for seemingly valid unicode characters (UTF-8) ... Product: Xerces2-J Version: 2.1.0 Platform: All OS/Version: All Status: NEW Severity: Major Priority: Other Component: SAX AssignedTo: [EMAIL PROTECTED] ReportedBy: [EMAIL PROTECTED] Following exception is thrown if xml contains characters in range \u0153 to \u02DD and \u2013 to \u2123. Note : For a similar C++ based sample with xercesC 2.1 the same ranges are working fine. java.io.UTFDataFormatException: Invalid byte 1 of 1-byte UTF-8 sequence. at org.apache.xerces.impl.io.UTF8Reader.invalidByte(Unknown Source) at org.apache.xerces.impl.io.UTF8Reader.read(Unknown Source) at org.apache.xerces.impl.XMLEntityManager$EntityScanner.load(Unknown So urce) at org.apache.xerces.impl.XMLEntityManager$EntityScanner.scanContent(Unk nown Source) at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanContent(Unk nown Source) at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContent Dispatcher.dispatch(Unknown Source) at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Un known Source) at org.apache.xerces.parsers.DTDConfiguration.parse(Unknown Source) at org.apache.xerces.parsers.DTDConfiguration.parse(Unknown Source) at org.apache.xerces.parsers.XMLParser.parse(Unknown Source) at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source) at SAX2Writer.print(SAX2Writer.java:58) at SAX2Writer.main(SAX2Writer.java:378) I'm also including SAX2Writer.java file, modified to test the unicode data parsing. In this I'm marking the unicode ranges that work and the ones that don't. This can be tested by running the SAX2Writer class with arguments -u -V - S -F. Pl. let me know if you need any more information. Note: I'm using JDK 1.3.1 on Win2K Thanx, Sandeep Desale ------------------- SAX2Writer.java starts here -------------------------- // FrontEnd Plus GUI for JAD // DeCompiled : SAX2Writer.class // package sax; import java.io.*; import org.xml.sax.*; import org.xml.sax.helpers.DefaultHandler; import sax.helpers.AttributesImpl; import util.Arguments; public class SAX2Writer extends DefaultHandler { private static final String DEFAULT_PARSER_NAME = "org.apache.xerces.parsers.SAXParser"; private static boolean unicodeTest = false; private static boolean setValidation = false; private static boolean setNameSpaces = true; private static boolean setSchemaSupport = true; private static boolean setSchemaFullSupport = false; protected PrintWriter out; protected boolean canonical; public SAX2Writer(boolean flag) throws UnsupportedEncodingException { this(null, flag); } protected SAX2Writer(String s, boolean flag) throws UnsupportedEncodingException { if(s == null) s = "UTF8"; out = new PrintWriter(new OutputStreamWriter(System.out, s)); canonical = flag; } public static void print(String s, String s1, boolean flag) { try { SAX2Writer sax2writer = new SAX2Writer(flag); XMLReader xmlreader = (XMLReader)Class.forName(s).newInstance(); String s2 = "http://www.tibco.com/xmlns/ae2xsd/2002/05 s100.xsd http://www.tibco.com/xmlns/ae2xsd/2002/05/ae/test/cross/XPschema s1.xsd"; System.out.println("Validation is " + setValidation); xmlreader.setFeature("http://xml.org/sax/features/validation", setValidation); xmlreader.setFeature("http://xml.org/sax/features/namespaces", setNameSpaces); xmlreader.setFeature ("http://apache.org/xml/features/validation/schema", setSchemaSupport); xmlreader.setFeature ("http://apache.org/xml/features/validation/schema-full-checking", setSchemaFullSupport); xmlreader.setProperty ("http://apache.org/xml/properties/schema/external-schemaLocation", s2); xmlreader.setContentHandler(sax2writer); xmlreader.setErrorHandler(sax2writer); xmlreader.setEntityResolver(sax2writer); if(unicodeTest) { ByteArrayInputStream bytearrayinputstream = new ByteArrayInputStream(s1.getBytes()); xmlreader.parse(new InputSource(bytearrayinputstream)); } else { xmlreader.parse(s1); } } catch(Exception exception) { exception.printStackTrace(System.err); } } public void processingInstruction(String s, String s1) { out.print("<?"); out.print(s); if(s1 != null && s1.length() > 0) { out.print(' '); out.print(s1); } out.print("?>"); out.flush(); } public void startDocument() { if(!canonical) { out.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); out.flush(); } } public void startElement(String s, String s1, String s2, Attributes attributes) { System.out.println("Start Element name : " + s1); if(attributes != null) System.out.println("Attribute count : " + attributes.getLength()); } public void characters(char ac[], int i, int j) { out.print(normalize(new String(ac, i, j))); out.flush(); } public void ignorableWhitespace(char ac[], int i, int j) { System.out.println("Found ignorable white space*********<"); for(int k = i; k < j; k++) System.out.println(ac[k]); System.out.println(">***"); characters(ac, i, j); out.flush(); } public void endElement(String s, String s1, String s2) { System.out.println("End Element name : " + s1); } public void warning(SAXParseException saxparseexception) { System.err.println("[Warning] " + getLocationString(saxparseexception) + ": " + saxparseexception.getMessage()); } public void error(SAXParseException saxparseexception) throws SAXException { System.err.println("[Error] " + getLocationString(saxparseexception) + ": " + saxparseexception.getMessage()); throw saxparseexception; } public void fatalError(SAXParseException saxparseexception) throws SAXException { System.err.println("[Fatal Error] " + getLocationString (saxparseexception) + ": " + saxparseexception.getMessage()); throw saxparseexception; } public InputSource resolveEntity(String s, String s1) throws SAXException { System.out.println("resolveEntity called systemid " + s1); return new InputSource(s1); } private String getLocationString(SAXParseException saxparseexception) { StringBuffer stringbuffer = new StringBuffer(); String s = saxparseexception.getSystemId(); if(s != null) { int i = s.lastIndexOf('/'); if(i != -1) s = s.substring(i + 1); stringbuffer.append(s); } stringbuffer.append(':'); stringbuffer.append(saxparseexception.getLineNumber()); stringbuffer.append(':'); stringbuffer.append(saxparseexception.getColumnNumber()); return stringbuffer.toString(); } protected String normalize(String s) { StringBuffer stringbuffer = new StringBuffer(); int i = s == null ? 0 : s.length(); for(int j = 0; j < i; j++) { char c = s.charAt(j); switch(c) { case 60: // '<' stringbuffer.append("<"); break; case 62: // '>' stringbuffer.append(">"); break; case 38: // '&' stringbuffer.append("&"); break; case 34: // '"' stringbuffer.append("""); break; case 10: // '\n' case 13: // '\r' if(canonical) { stringbuffer.append("&#"); stringbuffer.append(Integer.toString(c)); stringbuffer.append(';'); break; } // fall through default: stringbuffer.append(c); break; } } return stringbuffer.toString(); } protected Attributes sortAttributes(Attributes attributes) { AttributesImpl attributesimpl = new AttributesImpl(); int i = attributes == null ? 0 : attributes.getLength(); for(int j = 0; j < i; j++) { String s = attributes.getQName(j); int k = attributesimpl.getLength(); int l; for(l = 0; l < k; l++) if(s.compareTo(attributesimpl.getQName(l)) < 0) break; attributesimpl.insertAttributeAt(l, s, attributes.getType(j), attributes.getValue(j)); } return attributesimpl; } public static StringBuffer appendRange(char c, char c1) { int i = c1 - c; StringBuffer stringbuffer = new StringBuffer(i + 1); for(int j = 0; j < i; j++) stringbuffer.append((char)(c + j)); return stringbuffer; } public static String constructXML() { StringBuffer stringbuffer = new StringBuffer("<Monster><item>"); // Fails for this range '\u0100' - '\u0800' StringBuffer stringbuffer1 = appendRange('\u0100', '\u0800'); // Fails for this range '\u2000' - '\u27BF' // StringBuffer stringbuffer1 = appendRange ( '\u2000', '\u27BF' ); // Misc symbols // Works '\u0900' - '\u1900' // StringBuffer stringbuffer1 = appendRange( '\u0900', '\u1900' ); // too many to list... // Works '\u1E00' - '\u1FFF' // StringBuffer stringbuffer1 = appendRange ( '\u1E00', '\u1FFF' ); // Extended Latin, Extended Greek // Works '\u2800' - '\u28FF' // StringBuffer stringbuffer1 = appendRange ( '\u2800', '\u28FF' ); // Braille // Works '\u3040' - '\u30FF' // StringBuffer stringbuffer1 = appendRange ( '\u3040', '\u30FF' ); // Hiragana, Katakana // Works '\u3200' - '\u51FF' // StringBuffer stringbuffer1 = appendRange ( '\u3200', '\u51FF' ); // CJK letters and months, CJK Compatibility, CJK Unified, etc. // Works '\u5200' - '\u71FF' // StringBuffer stringbuffer1 = appendRange ( '\u5200', '\u71FF' ); // CJK letters and months, CJK Compatibility, CJK Unified, etc. // Works '\u7200' - '\u9FA5' // StringBuffer stringbuffer1 = appendRange ( '\u7200', '\u9FA5' ); // CJK letters and months, CJK Compatibility, CJK Unified, etc. // Works '\uA000' - '\uA48C' // StringBuffer stringbuffer1 = appendRange ( '\uA000', '\uA48C' ); // Yi - omitted Yi Radicals... // Works '\uAC00' - '\uD7A3' // StringBuffer stringbuffer1 = appendRange ( '\uAC00', '\uD7A3' ); // Hangul // Works '\uF900' - '\uFA2D' // StringBuffer stringbuffer1 = appendRange ( '\uF900', '\uFA2D' ); // CJK Compatibility Ideographs try { byte abyte0[] = stringbuffer1.toString().getBytes("UTF-8"); stringbuffer.append(new String(abyte0, "UTF-8")); stringbuffer.append("</item></Monster>"); } catch(UnsupportedEncodingException unsupportedencodingexception) { unsupportedencodingexception.printStackTrace(); } return stringbuffer.toString(); } public static void main(String args[]) { Arguments arguments = new Arguments(); arguments.setUsage(new String[] { "usage: java sax.SAX2Writer (options) (uri) ...", "", "options:", " -u | -U Unicode test", " -n | -N Turn on/off namespace [default=on]", " -v | -V Turn on/off validation [default=on]", " - s | -S Turn on/off Schema support [default=on]", " -f | -F Turn on/off Schema full consraint checking [default=off]", " -c Canonical XML output.", " -h This help screen." }); if(args.length == 0) { arguments.printUsage(); System.exit(1); } boolean flag = false; String s = "org.apache.xerces.parsers.SAXParser"; arguments.parseArgumentTokens(args, new char[] { 'p' }); for(String s1 = null; (s1 = arguments.getlistFiles()) != null;) { int i; label0: while((i = arguments.getArguments()) != -1) switch(i) { case -1: break label0; case 99: // 'c' flag = true; break; case 67: // 'C' flag = false; break; case 118: // 'v' setValidation = true; break; case 86: // 'V' setValidation = false; break; case 78: // 'N' setNameSpaces = false; break; case 110: // 'n' setNameSpaces = true; break; case 112: // 'p' s = arguments.getStringParameter(); break; case 115: // 's' setSchemaSupport = true; break; case 83: // 'S' setSchemaSupport = false; break; case 102: // 'f' setSchemaFullSupport = true; break; case 70: // 'F' setSchemaFullSupport = false; break; case 85: // 'U' case 117: // 'u' String s2 = constructXML(); unicodeTest = true; s1 = s2; break; case 45: // '-' case 63: // '?' case 104: // 'h' arguments.printUsage(); System.exit(1); break; } System.err.println(s1 + ':'); print(s, s1, flag); } } } ------------------- SAX2Writer.java ends here -------------------------- --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
