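Dear list members, There is a small problem with Unicode escapes in Clojure strings. When StringReader encounters a Unicode escape, it checks the first character after \u with Character.isDigit, which is true only for decimal digits and not for the hexadecimal digits a-f/A-F, so a valid escape such as "\uFFFF" is rejected as invalid.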
The other problem is that CharacterReader accepts a character constant denoting a high or low surrogate as a valid character, which it is not (a lone surrogate is only a UTF-16 code unit, not a character). Cf. http://en.wikipedia.org/wiki/UTF-16 and http://java.sun.com/javase/6/docs/api/java/lang/Character.html. The attached patch (against r1092) corrects both issues. Thanks, Toralf --~--~---------~--~----~------------~-------~--~----~ You received this message because you are subscribed to the Google Groups "Clojure" group. To post to this group, send email to clojure@googlegroups.com To unsubscribe from this group, send email to [EMAIL PROTECTED] For more options, visit this group at http://groups.google.com/group/clojure?hl=en -~----------~----~----~----~------~----~------~--~---
diff --git a/trunk/src/jvm/clojure/lang/LispReader.java b/trunk/src/jvm/clojure/lang/LispReader.java index 4847692..ddf1925 100644 --- a/trunk/src/jvm/clojure/lang/LispReader.java +++ b/trunk/src/jvm/clojure/lang/LispReader.java @@ -422,10 +422,9 @@ static class StringReader extends AFn{ case 'u': { ch = r.read(); - if(Character.isDigit(ch)) - ch = readUnicodeChar((PushbackReader) r, ch, 16, 4, true); - else - throw new Exception("Invalid unicode escape: \\" + (char) ch); + if (Character.digit(ch, 16) == -1) + throw new Exception("Invalid unicode escape: \\u" + (char) ch); + ch = readUnicodeChar((PushbackReader) r, ch, 16, 4, true); break; } default: @@ -809,7 +808,12 @@ static class CharacterReader extends AFn{ else if(token.equals("return")) return '\r'; else if(token.startsWith("u")) - return (char) readUnicodeChar(token, 1, 4, 16); + { + char c = (char) readUnicodeChar(token, 1, 4, 16); + if(c >= '\uD800' && c <= '\uDFFF') // surrogate code unit? + throw new Exception("Invalid character constant: \\u" + Integer.toString(c, 16)); + return c; + } else if(token.startsWith("o")) { int len = token.length() - 1;