Re: String.lastIndexOf confused by unpaired trailing surrogate

Ulf Zibis Tue, 23 Mar 2010 09:12:28 -0700

Am 22.03.2010 23:36, schrieb Martin Buchholz:

Masayoshi,


Ulf and I are working on a few changes to supplementary character handling.
Character.isSurrogate has already gone in.

The following are in the pipeline:

6934268: Better implementation of Character.isValidCodePoint
http://cr.openjdk.java.net/~martin/webrevs/openjdk7/isValidCodePoint
6934265: Add public method Character.isBMPCodePoint
http://cr.openjdk.java.net/~martin/webrevs/openjdk7/public-isBMPCodePoint
[mq]: isBMPCodePoint2
http://cr.openjdk.java.net/~martin/webrevs/openjdk7/isBMPCodePoint2
6937112: String.lastIndexOf confused by unpaired trailing surrogate
http://cr.openjdk.java.net/~martin/webrevs/openjdk7/lastIndexOf

In addition, Ulf and I would like to add
char Character.highSurrogate(int codePoint)
char Character.lowSurrogate(int codePoint)

Ulf,
please provide me with your latest patch for Character.highSurrogate
and I will add it to the pipeline.


Here it is.

I couldn't resist doing some beautifying, and purging of sun.nio.cs.Surrogate.

Feel free to ignore it.

-Ulf

# HG changeset patch
# Parent 31dcf23042f9c22525bdcfd4c9926d12ff7f61ca
rev 2227 : 6666666: Add public methods Character.highSurrogate, 
Character.lowSurrogate
Summary: Move high, low from sun.nio.cs.Surrogate to Character
Reviewed-by: sherman
Contributed-by: Ulf Zibis <ulf.zi...@cosoco.de>

diff --git a/src/share/classes/java/lang/Character.java 
b/src/share/classes/java/lang/Character.java
--- a/src/share/classes/java/lang/Character.java
+++ b/src/share/classes/java/lang/Character.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2002-2009 Sun Microsystems, Inc.  All Rights Reserved.
+ * Copyright 2002-2010 Sun Microsystems, Inc.  All Rights Reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -24,6 +24,7 @@
  */
 
 package java.lang;
+
 import java.util.Map;
 import java.util.HashMap;
 import java.util.Locale;
@@ -111,13 +112,15 @@
  * encoding. For more information on Unicode terminology, refer to the
  * <a href="http://www.unicode.org/glossary/">Unicode Glossary</a>.
  *
- * @author  Lee Boynton
- * @author  Guy Steele
- * @author  Akira Tanaka
- * @since   1.0
+ * @author Lee Boynton
+ * @author Guy Steele
+ * @author Akira Tanaka
+ * @author Ulf Zibis, Cologne CoSoCo.de
+ * @since  1.0
  */
-public final
-class Character extends Object implements java.io.Serializable, 
Comparable<Character> {
+public final class Character
+        implements java.io.Serializable, Comparable<Character> {
+
     /**
      * The minimum radix available for conversion to and from strings.
      * The constant value of this field is the smallest value permitted
@@ -154,7 +157,7 @@
      *
      * @since   1.0.2
      */
-    public static final char   MIN_VALUE = '\u0000';
+    public static final char MIN_VALUE = '\u0000';
 
     /**
      * The constant value of this field is the largest value of type
@@ -162,7 +165,7 @@
      *
      * @since   1.0.2
      */
-    public static final char   MAX_VALUE = '\uFFFF';
+    public static final char MAX_VALUE = '\uFFFF';
 
     /**
      * The <code>Class</code> instance representing the primitive type
@@ -393,7 +396,7 @@
     /**
      * Error flag. Use int (code point) to avoid confusion with U+FFFF.
      */
-     static final int ERROR = 0xFFFFFFFF;
+    static final int ERROR = -1;
 
 
     /**
@@ -401,7 +404,7 @@
      * values have undefined directionality in the Unicode specification.
      * @since 1.4
      */
-     public static final byte DIRECTIONALITY_UNDEFINED = -1;
+    public static final byte DIRECTIONALITY_UNDEFINED = -1;
 
     /**
      * Strong bidirectional character type "L" in the Unicode specification.
@@ -621,8 +624,8 @@
         /**
          * Constructs a new <code>Subset</code> instance.
          *
+         * @param  name  The name of this subset
          * @exception NullPointerException if name is <code>null</code>
-         * @param  name  The name of this subset
          */
         protected Subset(String name) {
             if (name == null) {
@@ -2568,13 +2571,14 @@
     }
 
     private static class CharacterCache {
-        private CharacterCache(){}
-
-        static final Character cache[] = new Character[127 + 1];
+//        private CharacterCache(){} // superfluous, as class is private
+
+        static final char SIZE = 0x80;
+        static final Character cache[] = new Character[SIZE];
 
         static {
-            for(int i = 0; i < cache.length; i++)
-                cache[i] = new Character((char)i);
+            for(char c = SIZE; c > 0;) // backwards saves comparison against 
non-zero limit
+                cache[--c] = new Character(c); // char saves i2c byte code
         }
     }
 
@@ -2596,9 +2600,8 @@
      * @since  1.5
      */
     public static Character valueOf(char c) {
-        if(c <= 127) { // must cache
-            return CharacterCache.cache[(int)c];
-        }
+        if(c < CharacterCache.SIZE) // must cache
+            return CharacterCache.cache[c];
         return new Character(c);
     }
 
@@ -2718,7 +2721,7 @@
      * @since  1.5
      */
     public static boolean isHighSurrogate(char ch) {
-        return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
+        return (ch -= MIN_HIGH_SURROGATE) >= 0 && ch < MAX_HIGH_SURROGATE + 1 
- MIN_HIGH_SURROGATE;
     }
 
     /**
@@ -2741,7 +2744,7 @@
      * @since  1.5
      */
     public static boolean isLowSurrogate(char ch) {
-        return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
+        return (ch -= MIN_LOW_SURROGATE) >= 0 && ch < MAX_LOW_SURROGATE + 1 - 
MIN_LOW_SURROGATE;
     }
 
     /**
@@ -2765,7 +2768,7 @@
      * @since  1.7
      */
     public static boolean isSurrogate(char ch) {
-        return ch >= MIN_SURROGATE && ch <= MAX_SURROGATE;
+        return (ch -= MIN_SURROGATE) >= 0 && ch < MAX_SURROGATE + 1 - 
MIN_SURROGATE;
     }
 
     /**
@@ -2807,7 +2810,7 @@
      * @since   1.5
      */
     public static int charCount(int codePoint) {
-        return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT? 2 : 1;
+        return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT ? 2 : 1;
     }
 
     /**
@@ -2924,6 +2927,7 @@
         return codePointAtImpl(a, index, limit);
     }
 
+    // throws ArrayIndexOutOfBoundsException
     static int codePointAtImpl(char[] a, int index, int limit) {
         char c1 = a[index++];
         if (isHighSurrogate(c1)) {
@@ -3030,6 +3034,7 @@
         return codePointBeforeImpl(a, index, start);
     }
 
+    // throws ArrayIndexOutOfBoundsException
     static int codePointBeforeImpl(char[] a, int index, int start) {
         char c2 = a[--index];
         if (isLowSurrogate(c2)) {
@@ -3114,11 +3119,69 @@
         return result;
     }
 
-    static void toSurrogates(int codePoint, char[] dst, int index) {
+    static void toSurrogates(int codePoint, char[] dst, int dstIndex) {
         // We write elements "backwards" to guarantee all-or-nothing
-        dst[index+1] = (char)((codePoint & 0x3ff) + MIN_LOW_SURROGATE);
-        dst[index] = (char)((codePoint >>> 10)
-            + (MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
+        dst[dstIndex+1] = lowSurrogate(codePoint);
+        dst[dstIndex] = highSurrogate(codePoint);
+    }
+
+    /**
+     * Converts the specified character (Unicode code point) to the 1st
+     * {@code char} of its UTF-16 representation as a surrogate pair.
+     *
+     * <p><b>Note:</b> Does not check whether the code point is outside the valid
+     * UTF-16 surrogate representation range, i.e. < U+10000 or > U+10FFFF;
+     * in that case, it returns an invalid result.
+     * To avoid this, check the argument with {@link #isSurrogate(char)} and
+     * {@link #isValidCodePoint(int)} beforehand.
+     *
+     * @param  codePoint a Unicode code point
+     * @return the 1st {@code char} of {@code codePoint}'s UTF-16 representation.
+     * @since  1.7
+     */
+    public static char highSurrogate(int codePoint) {
+//        highSurrogateInvocations++;
+        return (char)((codePoint >>> 10) +
+                MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10));
+    }
+
+    /**
+     * Converts the specified character (Unicode code point) to the 1st
+     * {@code char} of its UTF-16 representation as a surrogate pair.
+     * If {@code highCPWord} is a {@code static final} constant,
+     * this method ensures optimized performance when inlined by the virtual
+     * machine's byte code compiler.
+     *
+     * <p><b>Note:</b> Does not check whether the code point lies in the illegal
+     * range, i.e. U+D800 ... U+DFFF or > U+10FFFF; in that case, it returns an
+     * invalid result.
+     * To avoid this, check the argument with {@link #isSurrogate(char)} and
+     * {@link #isValidCodePoint(int)} beforehand.
+     *
+     * @param  highCPWord high 16 bit of a Unicode code point
+     * @param  lowCPWord low 16 bit of a Unicode code point
+     * @return the 1st {@code char} of the code point's UTF-16 representation.
+     * @since  1.7
+     */
+    public static char highSurrogate(char highCPWord, char lowCPWord) {
+        return (char)((lowCPWord >>> 10) + (highCPWord << 6) +
+                MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10));
+    }
+
+    /**
+     * Converts the specified character (Unicode code point) to the 2nd
+     * {@code char} of its UTF-16 representation as a surrogate pair.
+     *
+     * <p><b>Note:</b> Does not check whether the code point lies in the illegal
+     * range, i.e. U+D800 ... U+DFFF or > U+10FFFF; in that case, it returns an
+     * invalid result.
+     * To avoid this, check the argument with {@link #isSurrogate(char)} and
+     * {@link #isValidCodePoint(int)} beforehand.
+     *
+     * @param  codePoint a Unicode code point
+     * @return the 2nd {@code char} of {@code codePoint}'s UTF-16 representation.
+     * @since  1.7
+     */
+    public static char lowSurrogate(int codePoint) {
+        return (char)((codePoint & 0x03FF) | MIN_LOW_SURROGATE);
     }
 
     /**
@@ -3187,16 +3250,13 @@
         return codePointCountImpl(a, offset, count);
     }
 
+    // throws ArrayIndexOutOfBoundsException
     static int codePointCountImpl(char[] a, int offset, int count) {
         int endIndex = offset + count;
         int n = 0;
-        for (int i = offset; i < endIndex; ) {
-            n++;
-            if (isHighSurrogate(a[i++])) {
-                if (i < endIndex && isLowSurrogate(a[i])) {
-                    i++;
-                }
-            }
+        for (int i = offset; i < endIndex; n++) {
+            if (isHighSurrogate(a[i++]) && i < endIndex && 
isLowSurrogate(a[i]))
+                i++;
         }
         return n;
     }
@@ -3234,27 +3294,21 @@
         if (codePointOffset >= 0) {
             int i;
             for (i = 0; x < length && i < codePointOffset; i++) {
-                if (isHighSurrogate(seq.charAt(x++))) {
-                    if (x < length && isLowSurrogate(seq.charAt(x))) {
-                        x++;
-                    }
-                }
+                if (isHighSurrogate(seq.charAt(x++)) &&
+                        x < length && isLowSurrogate(seq.charAt(x)))
+                    x++;
             }
-            if (i < codePointOffset) {
+            if (i < codePointOffset)
                 throw new IndexOutOfBoundsException();
-            }
         } else {
             int i;
             for (i = codePointOffset; x > 0 && i < 0; i++) {
-                if (isLowSurrogate(seq.charAt(--x))) {
-                    if (x > 0 && isHighSurrogate(seq.charAt(x-1))) {
-                        x--;
-                    }
-                }
+                if (isLowSurrogate(seq.charAt(--x)) &&
+                        x > 0 && isHighSurrogate(seq.charAt(x-1)))
+                    x--;
             }
-            if (i < 0) {
+            if (i < 0)
                 throw new IndexOutOfBoundsException();
-            }
         }
         return x;
     }
@@ -3295,12 +3349,12 @@
     public static int offsetByCodePoints(char[] a, int start, int count,
                                          int index, int codePointOffset) {
         if (count > a.length-start || start < 0 || count < 0
-            || index < start || index > start+count) {
+                || index < start || index > start+count)
             throw new IndexOutOfBoundsException();
-        }
         return offsetByCodePointsImpl(a, start, count, index, codePointOffset);
     }
 
+    // throws IndexOutOfBoundsException
     static int offsetByCodePointsImpl(char[]a, int start, int count,
                                       int index, int codePointOffset) {
         int x = index;
@@ -3308,32 +3362,24 @@
             int limit = start + count;
             int i;
             for (i = 0; x < limit && i < codePointOffset; i++) {
-                if (isHighSurrogate(a[x++])) {
-                    if (x < limit && isLowSurrogate(a[x])) {
-                        x++;
-                    }
-                }
+                if (isHighSurrogate(a[x++]) && x < limit && 
isLowSurrogate(a[x]))
+                    x++;
             }
-            if (i < codePointOffset) {
+            if (i < codePointOffset)
                 throw new IndexOutOfBoundsException();
-            }
         } else {
             int i;
             for (i = codePointOffset; x > start && i < 0; i++) {
-                if (isLowSurrogate(a[--x])) {
-                    if (x > start && isHighSurrogate(a[x-1])) {
-                        x--;
-                    }
-                }
+                if (isLowSurrogate(a[--x]) && x > start && 
isHighSurrogate(a[x-1]))
+                    x--;
             }
-            if (i < 0) {
+            if (i < 0)
                 throw new IndexOutOfBoundsException();
-            }
         }
         return x;
     }
 
-   /**
+    /**
      * Determines if the specified character is a lowercase character.
      * <p>
      * A character is lowercase if its general category type, provided
diff --git a/src/share/classes/sun/nio/cs/Surrogate.java 
b/src/share/classes/sun/nio/cs/Surrogate.java
--- a/src/share/classes/sun/nio/cs/Surrogate.java
+++ b/src/share/classes/sun/nio/cs/Surrogate.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2000-2001 Sun Microsystems, Inc.  All Rights Reserved.
+ * Copyright 2000-2010 Sun Microsystems, Inc.  All Rights Reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -27,15 +27,13 @@
 
 import java.nio.CharBuffer;
 import java.nio.charset.CoderResult;
-import java.nio.charset.MalformedInputException;
-import java.nio.charset.UnmappableCharacterException;
 
 /**
  * Utility class for dealing with surrogates.
  *
  * @author Mark Reinhold
+ * @author Ulf Zibis, Cologne CoSoCo.de
  */
-
 public class Surrogate {
 
     private Surrogate() { }
@@ -51,26 +49,10 @@
     public static final int UCS4_MAX  = Character.MAX_CODE_POINT;
 
     /**
-     * Tells whether or not the given value is in the high surrogate range.
-     * Use of {@link Character#isHighSurrogate} is generally preferred.
-     */
-    public static boolean isHigh(int c) {
-        return (MIN_HIGH <= c) && (c <= MAX_HIGH);
-    }
-
-    /**
-     * Tells whether or not the given value is in the low surrogate range.
-     * Use of {@link Character#isLowSurrogate} is generally preferred.
-     */
-    public static boolean isLow(int c) {
-        return (MIN_LOW <= c) && (c <= MAX_LOW);
-    }
-
-    /**
      * Tells whether or not the given value is in the surrogate range.
      * Use of {@link Character#isSurrogate} is generally preferred.
      */
-    public static boolean is(int c) {
+    private static boolean is(int c) {
         return (MIN <= c) && (c <= MAX);
     }
 
@@ -78,16 +60,8 @@
      * Tells whether or not the given UCS-4 character is in the Basic
      * Multilingual Plane, and can be represented using a single char.
      */
-    public static boolean isBMPCodePoint(int uc) {
-        return uc >> 16 == 0;
-    }
-
-    /**
-     * Tells whether or not the given UCS-4 character must be represented as a
-     * surrogate pair in UTF-16.
-     */
-    public static boolean neededFor(int uc) {
-        return Character.isSupplementaryCodePoint(uc);
+    private static boolean isBMPCodePoint(int uc) {
+        return uc >>> 16 == 0;
     }
 
     /**
@@ -95,9 +69,7 @@
      */
     public static char high(int uc) {
         assert Character.isSupplementaryCodePoint(uc);
-        return (char)((uc >> 10)
-                      + (Character.MIN_HIGH_SURROGATE
-                         - (Character.MIN_SUPPLEMENTARY_CODE_POINT >> 10)));
+        return Character.highSurrogate(uc);
     }
 
     /**
@@ -105,7 +77,7 @@
      */
     public static char low(int uc) {
         assert Character.isSupplementaryCodePoint(uc);
-        return (char)((uc & 0x3ff) + Character.MIN_LOW_SURROGATE);
+        return Character.lowSurrogate(uc);
     }
 
     /**
@@ -290,8 +262,9 @@
          *           error() will return a descriptive result object
          */
         public int generate(int uc, int len, CharBuffer dst) {
-            if (Surrogate.isBMPCodePoint(uc)) {
-                if (Surrogate.is(uc)) {
+//            if (Character.isBMPCodePoint(uc)) {
+            if (isBMPCodePoint(uc)) {
+                if (is(uc)) {
                     error = CoderResult.malformedForLength(len);
                     return -1;
                 }
@@ -307,8 +280,8 @@
                     error = CoderResult.OVERFLOW;
                     return -1;
                 }
-                dst.put(Surrogate.high(uc));
-                dst.put(Surrogate.low(uc));
+                dst.put(high(uc));
+                dst.put(low(uc));
                 error = null;
                 return 2;
             } else {
@@ -334,8 +307,9 @@
          *           error() will return a descriptive result object
          */
         public int generate(int uc, int len, char[] da, int dp, int dl) {
-            if (Surrogate.isBMPCodePoint(uc)) {
-                if (Surrogate.is(uc)) {
+//            if (Character.isBMPCodePoint(uc)) {
+            if (isBMPCodePoint(uc)) {
+                if (is(uc)) {
                     error = CoderResult.malformedForLength(len);
                     return -1;
                 }
@@ -351,8 +325,8 @@
                     error = CoderResult.OVERFLOW;
                     return -1;
                 }
-                da[dp] = Surrogate.high(uc);
-                da[dp + 1] = Surrogate.low(uc);
+                da[dp] = high(uc);
+                da[dp + 1] = low(uc);
                 error = null;
                 return 2;
             } else {
diff --git a/src/share/classes/sun/nio/cs/UTF_8.java 
b/src/share/classes/sun/nio/cs/UTF_8.java
--- a/src/share/classes/sun/nio/cs/UTF_8.java
+++ b/src/share/classes/sun/nio/cs/UTF_8.java
@@ -102,7 +102,7 @@
         //  [F1..F3] [80..BF] [80..BF] [80..BF]
         //  [F4]     [80..8F] [80..BF] [80..BF]
         //  only check 80-be range here, the [0xf0,0x80...] and [0xf4,0x90-...]
-        //  will be checked by Surrogate.neededFor(uc)
+        //  will be checked by Character.isSupplementaryCodePoint(uc)
         private static boolean isMalformed4(int b2, int b3, int b4) {
             return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
                    (b4 & 0xc0) != 0x80;
@@ -248,7 +248,7 @@
                              ((b3 & 0x3f) << 06) |
                              (b4 & 0x3f);
                     if (isMalformed4(b2, b3, b4) ||
-                        !Surrogate.neededFor(uc)) {
+                        !Character.isSupplementaryCodePoint(uc)) {
                         return malformed(src, sp, dst, dp, 4);
                     }
                     da[dp++] = Surrogate.high(uc);
@@ -304,7 +304,7 @@
                              ((b3 & 0x3f) << 06) |
                              (b4 & 0x3f);
                     if (isMalformed4(b2, b3, b4) ||
-                        !Surrogate.neededFor(uc)) { // shortest form check
+                        !Character.isSupplementaryCodePoint(uc)) { // shortest 
form check
                         return malformed(src, mark, 4);
                     }
                     dst.put(Surrogate.high(uc));

Re: String.lastIndexOf confused by unpaired trailing surrogate

Reply via email to