Am 22.03.2010 23:36, schrieb Martin Buchholz:
Masayoshi,
Ulf and I are working on a few changes to supplementary character handling.
Character.isSurrogate has already gone in.
The following are in the pipeline:
6934268: Better implementation of Character.isValidCodePoint
http://cr.openjdk.java.net/~martin/webrevs/openjdk7/isValidCodePoint
6934265: Add public method Character.isBMPCodePoint
http://cr.openjdk.java.net/~martin/webrevs/openjdk7/public-isBMPCodePoint
[mq]: isBMPCodePoint2
http://cr.openjdk.java.net/~martin/webrevs/openjdk7/isBMPCodePoint2
6937112: String.lastIndexOf confused by unpaired trailing surrogate
http://cr.openjdk.java.net/~martin/webrevs/openjdk7/lastIndexOf
In addition, Ulf and I would like to add
char Character.highSurrogate(int codePoint)
char Character.lowSurrogate(int codePoint)
Ulf,
please provide me with your latest patch for Character.highSurrogate
and I will add it to the pipeline.
Here it is.
I couldn't resist doing some beautifying, and purging parts of
sun.nio.cs.Surrogate.
Feel free to ignore those changes.
-Ulf
# HG changeset patch
# Parent 31dcf23042f9c22525bdcfd4c9926d12ff7f61ca
rev 2227 : 6666666: Add public methods Character.highSurrogate,
Character.lowSurrogate
Summary: Move high, low from sun.nio.cs.Surrogate to Character
Reviewed-by: sherman
Contributed-by: Ulf Zibis <ulf.zi...@cosoco.de>
diff --git a/src/share/classes/java/lang/Character.java
b/src/share/classes/java/lang/Character.java
--- a/src/share/classes/java/lang/Character.java
+++ b/src/share/classes/java/lang/Character.java
@@ -1,5 +1,5 @@
/*
- * Copyright 2002-2009 Sun Microsystems, Inc. All Rights Reserved.
+ * Copyright 2002-2010 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -24,6 +24,7 @@
*/
package java.lang;
+
import java.util.Map;
import java.util.HashMap;
import java.util.Locale;
@@ -111,13 +112,15 @@
* encoding. For more information on Unicode terminology, refer to the
* <a href="http://www.unicode.org/glossary/">Unicode Glossary</a>.
*
- * @author Lee Boynton
- * @author Guy Steele
- * @author Akira Tanaka
- * @since 1.0
+ * @author Lee Boynton
+ * @author Guy Steele
+ * @author Akira Tanaka
+ * @author Ulf Zibis, Cologne CoSoCo.de
+ * @since 1.0
*/
-public final
-class Character extends Object implements java.io.Serializable,
Comparable<Character> {
+public final class Character
+ implements java.io.Serializable, Comparable<Character> {
+
/**
* The minimum radix available for conversion to and from strings.
* The constant value of this field is the smallest value permitted
@@ -154,7 +157,7 @@
*
* @since 1.0.2
*/
- public static final char MIN_VALUE = '\u0000';
+ public static final char MIN_VALUE = '\u0000';
/**
* The constant value of this field is the largest value of type
@@ -162,7 +165,7 @@
*
* @since 1.0.2
*/
- public static final char MAX_VALUE = '\uFFFF';
+ public static final char MAX_VALUE = '\uFFFF';
/**
* The <code>Class</code> instance representing the primitive type
@@ -393,7 +396,7 @@
/**
* Error flag. Use int (code point) to avoid confusion with U+FFFF.
*/
- static final int ERROR = 0xFFFFFFFF;
+ static final int ERROR = -1;
/**
@@ -401,7 +404,7 @@
* values have undefined directionality in the Unicode specification.
* @since 1.4
*/
- public static final byte DIRECTIONALITY_UNDEFINED = -1;
+ public static final byte DIRECTIONALITY_UNDEFINED = -1;
/**
* Strong bidirectional character type "L" in the Unicode specification.
@@ -621,8 +624,8 @@
/**
* Constructs a new <code>Subset</code> instance.
*
+ * @param name The name of this subset
* @exception NullPointerException if name is <code>null</code>
- * @param name The name of this subset
*/
protected Subset(String name) {
if (name == null) {
@@ -2568,13 +2571,14 @@
}
private static class CharacterCache {
- private CharacterCache(){}
-
- static final Character cache[] = new Character[127 + 1];
+// private CharacterCache(){} // superfluous, as class is private
+
+ static final char SIZE = 0x80;
+ static final Character cache[] = new Character[SIZE];
static {
- for(int i = 0; i < cache.length; i++)
- cache[i] = new Character((char)i);
+ for(char c = SIZE; c > 0;) // backwards saves comparison against
non-zero limit
+ cache[--c] = new Character(c); // char saves i2c byte code
}
}
@@ -2596,9 +2600,8 @@
* @since 1.5
*/
public static Character valueOf(char c) {
- if(c <= 127) { // must cache
- return CharacterCache.cache[(int)c];
- }
+ if(c < CharacterCache.SIZE) // must cache
+ return CharacterCache.cache[c];
return new Character(c);
}
@@ -2718,7 +2721,7 @@
* @since 1.5
*/
public static boolean isHighSurrogate(char ch) {
- return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
+ return (ch -= MIN_HIGH_SURROGATE) >= 0 && ch < MAX_HIGH_SURROGATE + 1
- MIN_HIGH_SURROGATE;
}
/**
@@ -2741,7 +2744,7 @@
* @since 1.5
*/
public static boolean isLowSurrogate(char ch) {
- return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
+ return (ch -= MIN_LOW_SURROGATE) >= 0 && ch < MAX_LOW_SURROGATE + 1 -
MIN_LOW_SURROGATE;
}
/**
@@ -2765,7 +2768,7 @@
* @since 1.7
*/
public static boolean isSurrogate(char ch) {
- return ch >= MIN_SURROGATE && ch <= MAX_SURROGATE;
+ return (ch -= MIN_SURROGATE) >= 0 && ch < MAX_SURROGATE + 1 -
MIN_SURROGATE;
}
/**
@@ -2807,7 +2810,7 @@
* @since 1.5
*/
public static int charCount(int codePoint) {
- return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT? 2 : 1;
+ return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT ? 2 : 1;
}
/**
@@ -2924,6 +2927,7 @@
return codePointAtImpl(a, index, limit);
}
+ // throws ArrayIndexOutOfBoundsException
static int codePointAtImpl(char[] a, int index, int limit) {
char c1 = a[index++];
if (isHighSurrogate(c1)) {
@@ -3030,6 +3034,7 @@
return codePointBeforeImpl(a, index, start);
}
+ // throws ArrayIndexOutOfBoundsException
static int codePointBeforeImpl(char[] a, int index, int start) {
char c2 = a[--index];
if (isLowSurrogate(c2)) {
@@ -3114,11 +3119,69 @@
return result;
}
- static void toSurrogates(int codePoint, char[] dst, int index) {
+ static void toSurrogates(int codePoint, char[] dst, int dstIndex) {
// We write elements "backwards" to guarantee all-or-nothing
- dst[index+1] = (char)((codePoint & 0x3ff) + MIN_LOW_SURROGATE);
- dst[index] = (char)((codePoint >>> 10)
- + (MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
+ dst[dstIndex+1] = lowSurrogate(codePoint);
+ dst[dstIndex] = highSurrogate(codePoint);
+ }
+
+ /**
+ * Converts the specified character (Unicode code point) to the 1st
+ * {@code char} of its UTF-16 representation as a surrogate pair.
+ *
+ * <p><b>Note:</b> Does not check whether the code point is outside the valid
+ * UTF-16 surrogate representation range, i.e. < U+10000 or > U+10FFFF;
+ * in that case an invalid result is returned.
+ * To avoid this, check the code point with {@link #isSurrogate(char)} and
+ * {@link #isValidCodePoint(int)} beforehand.
+ *
+ * @param codePoint a Unicode code point
+ * @return the 1st {@code char} of {@code codePoint}'s UTF-16 representation.
+ * @since 1.7
+ */
+ public static char highSurrogate(int codePoint) {
+// highSurrogateInvocations++;
+ return (char)((codePoint >>> 10) +
+ MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10));
+ }
+
+ /**
+ * Converts the specified character (Unicode code point) to the 1st
+ * {@code char} of its UTF-16 representation as a surrogate pair.
+ * In case {@code highCPWord} is a {@code static final} constant,
+ * this method ensures optimized performance when the virtual machine's
+ * byte code compiler inlines it.
+ *
+ * <p><b>Note:</b> Does not check whether the code point is within an illegal
+ * range, i.e. U+D800 ... U+DFFF or > U+10FFFF; in that case an invalid
+ * result is returned.
+ * To avoid this, check the code point with {@link #isSurrogate(char)} and
+ * {@link #isValidCodePoint(int)} beforehand.
+ *
+ * @param highCPWord high 16 bits of a Unicode code point
+ * @param lowCPWord low 16 bits of a Unicode code point
+ * @return the 1st {@code char} of the code point's UTF-16 representation.
+ * @since 1.7
+ */
+ public static char highSurrogate(char highCPWord, char lowCPWord) {
+ return (char)((lowCPWord >>> 10) + (highCPWord << 6) +
+ MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10));
+ }
+
+ /**
+ * Converts the specified character (Unicode code point) to the 2nd
+ * {@code char} of its UTF-16 representation as a surrogate pair.
+ *
+ * <p><b>Note:</b> Does not check whether the code point is within an illegal
+ * range, i.e. U+D800 ... U+DFFF or > U+10FFFF; in that case an invalid
+ * result is returned.
+ * To avoid this, check the code point with {@link #isSurrogate(char)} and
+ * {@link #isValidCodePoint(int)} beforehand.
+ *
+ * @param codePoint a Unicode code point
+ * @return the 2nd {@code char} of {@code codePoint}'s UTF-16 representation.
+ * @since 1.7
+ */
+ public static char lowSurrogate(int codePoint) {
+ return (char)((codePoint & 0x03FF) | MIN_LOW_SURROGATE);
}
/**
@@ -3187,16 +3250,13 @@
return codePointCountImpl(a, offset, count);
}
+ // throws ArrayIndexOutOfBoundsException
static int codePointCountImpl(char[] a, int offset, int count) {
int endIndex = offset + count;
int n = 0;
- for (int i = offset; i < endIndex; ) {
- n++;
- if (isHighSurrogate(a[i++])) {
- if (i < endIndex && isLowSurrogate(a[i])) {
- i++;
- }
- }
+ for (int i = offset; i < endIndex; n++) {
+ if (isHighSurrogate(a[i++]) && i < endIndex &&
isLowSurrogate(a[i]))
+ i++;
}
return n;
}
@@ -3234,27 +3294,21 @@
if (codePointOffset >= 0) {
int i;
for (i = 0; x < length && i < codePointOffset; i++) {
- if (isHighSurrogate(seq.charAt(x++))) {
- if (x < length && isLowSurrogate(seq.charAt(x))) {
- x++;
- }
- }
+ if (isHighSurrogate(seq.charAt(x++)) &&
+ x < length && isLowSurrogate(seq.charAt(x)))
+ x++;
}
- if (i < codePointOffset) {
+ if (i < codePointOffset)
throw new IndexOutOfBoundsException();
- }
} else {
int i;
for (i = codePointOffset; x > 0 && i < 0; i++) {
- if (isLowSurrogate(seq.charAt(--x))) {
- if (x > 0 && isHighSurrogate(seq.charAt(x-1))) {
- x--;
- }
- }
+ if (isLowSurrogate(seq.charAt(--x)) &&
+ x > 0 && isHighSurrogate(seq.charAt(x-1)))
+ x--;
}
- if (i < 0) {
+ if (i < 0)
throw new IndexOutOfBoundsException();
- }
}
return x;
}
@@ -3295,12 +3349,12 @@
public static int offsetByCodePoints(char[] a, int start, int count,
int index, int codePointOffset) {
if (count > a.length-start || start < 0 || count < 0
- || index < start || index > start+count) {
+ || index < start || index > start+count)
throw new IndexOutOfBoundsException();
- }
return offsetByCodePointsImpl(a, start, count, index, codePointOffset);
}
+ // throws IndexOutOfBoundsException
static int offsetByCodePointsImpl(char[]a, int start, int count,
int index, int codePointOffset) {
int x = index;
@@ -3308,32 +3362,24 @@
int limit = start + count;
int i;
for (i = 0; x < limit && i < codePointOffset; i++) {
- if (isHighSurrogate(a[x++])) {
- if (x < limit && isLowSurrogate(a[x])) {
- x++;
- }
- }
+ if (isHighSurrogate(a[x++]) && x < limit &&
isLowSurrogate(a[x]))
+ x++;
}
- if (i < codePointOffset) {
+ if (i < codePointOffset)
throw new IndexOutOfBoundsException();
- }
} else {
int i;
for (i = codePointOffset; x > start && i < 0; i++) {
- if (isLowSurrogate(a[--x])) {
- if (x > start && isHighSurrogate(a[x-1])) {
- x--;
- }
- }
+ if (isLowSurrogate(a[--x]) && x > start &&
isHighSurrogate(a[x-1]))
+ x--;
}
- if (i < 0) {
+ if (i < 0)
throw new IndexOutOfBoundsException();
- }
}
return x;
}
- /**
+ /**
* Determines if the specified character is a lowercase character.
* <p>
* A character is lowercase if its general category type, provided
diff --git a/src/share/classes/sun/nio/cs/Surrogate.java
b/src/share/classes/sun/nio/cs/Surrogate.java
--- a/src/share/classes/sun/nio/cs/Surrogate.java
+++ b/src/share/classes/sun/nio/cs/Surrogate.java
@@ -1,5 +1,5 @@
/*
- * Copyright 2000-2001 Sun Microsystems, Inc. All Rights Reserved.
+ * Copyright 2000-2010 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -27,15 +27,13 @@
import java.nio.CharBuffer;
import java.nio.charset.CoderResult;
-import java.nio.charset.MalformedInputException;
-import java.nio.charset.UnmappableCharacterException;
/**
* Utility class for dealing with surrogates.
*
* @author Mark Reinhold
+ * @author Ulf Zibis, Cologne CoSoCo.de
*/
-
public class Surrogate {
private Surrogate() { }
@@ -51,26 +49,10 @@
public static final int UCS4_MAX = Character.MAX_CODE_POINT;
/**
- * Tells whether or not the given value is in the high surrogate range.
- * Use of {@link Character#isHighSurrogate} is generally preferred.
- */
- public static boolean isHigh(int c) {
- return (MIN_HIGH <= c) && (c <= MAX_HIGH);
- }
-
- /**
- * Tells whether or not the given value is in the low surrogate range.
- * Use of {@link Character#isLowSurrogate} is generally preferred.
- */
- public static boolean isLow(int c) {
- return (MIN_LOW <= c) && (c <= MAX_LOW);
- }
-
- /**
* Tells whether or not the given value is in the surrogate range.
* Use of {@link Character#isSurrogate} is generally preferred.
*/
- public static boolean is(int c) {
+ private static boolean is(int c) {
return (MIN <= c) && (c <= MAX);
}
@@ -78,16 +60,8 @@
* Tells whether or not the given UCS-4 character is in the Basic
* Multilingual Plane, and can be represented using a single char.
*/
- public static boolean isBMPCodePoint(int uc) {
- return uc >> 16 == 0;
- }
-
- /**
- * Tells whether or not the given UCS-4 character must be represented as a
- * surrogate pair in UTF-16.
- */
- public static boolean neededFor(int uc) {
- return Character.isSupplementaryCodePoint(uc);
+ private static boolean isBMPCodePoint(int uc) {
+ return uc >>> 16 == 0;
}
/**
@@ -95,9 +69,7 @@
*/
public static char high(int uc) {
assert Character.isSupplementaryCodePoint(uc);
- return (char)((uc >> 10)
- + (Character.MIN_HIGH_SURROGATE
- - (Character.MIN_SUPPLEMENTARY_CODE_POINT >> 10)));
+ return Character.highSurrogate(uc);
}
/**
@@ -105,7 +77,7 @@
*/
public static char low(int uc) {
assert Character.isSupplementaryCodePoint(uc);
- return (char)((uc & 0x3ff) + Character.MIN_LOW_SURROGATE);
+ return Character.lowSurrogate(uc);
}
/**
@@ -290,8 +262,9 @@
* error() will return a descriptive result object
*/
public int generate(int uc, int len, CharBuffer dst) {
- if (Surrogate.isBMPCodePoint(uc)) {
- if (Surrogate.is(uc)) {
+// if (Character.isBMPCodePoint(uc)) {
+ if (isBMPCodePoint(uc)) {
+ if (is(uc)) {
error = CoderResult.malformedForLength(len);
return -1;
}
@@ -307,8 +280,8 @@
error = CoderResult.OVERFLOW;
return -1;
}
- dst.put(Surrogate.high(uc));
- dst.put(Surrogate.low(uc));
+ dst.put(high(uc));
+ dst.put(low(uc));
error = null;
return 2;
} else {
@@ -334,8 +307,9 @@
* error() will return a descriptive result object
*/
public int generate(int uc, int len, char[] da, int dp, int dl) {
- if (Surrogate.isBMPCodePoint(uc)) {
- if (Surrogate.is(uc)) {
+// if (Character.isBMPCodePoint(uc)) {
+ if (isBMPCodePoint(uc)) {
+ if (is(uc)) {
error = CoderResult.malformedForLength(len);
return -1;
}
@@ -351,8 +325,8 @@
error = CoderResult.OVERFLOW;
return -1;
}
- da[dp] = Surrogate.high(uc);
- da[dp + 1] = Surrogate.low(uc);
+ da[dp] = high(uc);
+ da[dp + 1] = low(uc);
error = null;
return 2;
} else {
diff --git a/src/share/classes/sun/nio/cs/UTF_8.java
b/src/share/classes/sun/nio/cs/UTF_8.java
--- a/src/share/classes/sun/nio/cs/UTF_8.java
+++ b/src/share/classes/sun/nio/cs/UTF_8.java
@@ -102,7 +102,7 @@
// [F1..F3] [80..BF] [80..BF] [80..BF]
// [F4] [80..8F] [80..BF] [80..BF]
// only check 80-be range here, the [0xf0,0x80...] and [0xf4,0x90-...]
- // will be checked by Surrogate.neededFor(uc)
+ // will be checked by Character.isSupplementaryCodePoint(uc)
private static boolean isMalformed4(int b2, int b3, int b4) {
return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
(b4 & 0xc0) != 0x80;
@@ -248,7 +248,7 @@
((b3 & 0x3f) << 06) |
(b4 & 0x3f);
if (isMalformed4(b2, b3, b4) ||
- !Surrogate.neededFor(uc)) {
+ !Character.isSupplementaryCodePoint(uc)) {
return malformed(src, sp, dst, dp, 4);
}
da[dp++] = Surrogate.high(uc);
@@ -304,7 +304,7 @@
((b3 & 0x3f) << 06) |
(b4 & 0x3f);
if (isMalformed4(b2, b3, b4) ||
- !Surrogate.neededFor(uc)) { // shortest form check
+ !Character.isSupplementaryCodePoint(uc)) { // shortest
form check
return malformed(src, mark, 4);
}
dst.put(Surrogate.high(uc));