Author: elecharny
Date: Sun Apr  3 10:49:35 2005
New Revision: 159940

URL: http://svn.apache.org/viewcvs?view=rev&rev=159940
Log:
Added functions to deal with byte[] -> Unicode decoding. They are used by the 
MutableString and mainly in the decoder, to handle DN values.

Modified:
    
directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java

Modified: 
directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java
URL: 
http://svn.apache.org/viewcvs/directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java?view=diff&r1=159939&r2=159940
==============================================================================
--- 
directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java
 (original)
+++ 
directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java
 Sun Apr  3 10:49:35 2005
@@ -29,6 +29,24 @@
     /** Hex chars */
     private static final byte[] HEX =
         new byte[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 
'B', 'C', 'D', 'E', 'F' };
+    
+    private static int UTF8_MULTI_BYTES_MASK = 0x0080;
+    
+    private static int UTF8_TWO_BYTES_MASK = 0x00E0;
+    private static int UTF8_TWO_BYTES = 0x00C0;
+    
+    private static int UTF8_THREE_BYTES_MASK = 0x00F0;
+    private static int UTF8_THREE_BYTES = 0x00E0;
+
+    private static int UTF8_FOUR_BYTES_MASK = 0x00F8;
+    private static int UTF8_FOUR_BYTES = 0x00F0;
+    
+    private static int UTF8_FIVE_BYTES_MASK = 0x00FC;
+    private static int UTF8_FIVE_BYTES = 0x00F8;
+
+    private static int UTF8_SIX_BYTES_MASK = 0x00FE;
+    private static int UTF8_SIX_BYTES = 0x00FC;
+    
 
     //~ Methods 
------------------------------------------------------------------------------------
 
@@ -42,5 +60,195 @@
     {
         return new String(
                 new byte[] { '[', HEX[( octet & 0x00F0 ) >> 4], HEX[octet & 
0x000F], ']' } );
+    }
+    
+    /**
+     * Decode the first Unicode char of a byte array, i.e. the one whose
+     * encoding starts at index 0.
+     *
+     * @param bytes The byte[] representation of a Unicode string.
+     * @return The char decoded by {@link #bytesToChar(byte[], int)} when
+     * called with position 0.
+     */
+    public static char bytesToChar(byte[] bytes)
+    {
+        final int start = 0;
+
+        return bytesToChar(bytes, start);
+    }
+
+    /**
+     * Count the number of bytes used to encode the char which starts at
+     * the given position. Only the lead byte is examined : the following
+     * bytes are neither read nor validated.
+     *
+     * @param bytes The bytes to read
+     * @param pos Position of the lead byte of the encoded char. It must be
+     * a valid index in <code>bytes</code>.
+     * @return The length of the encoded sequence, from 1 to 6, or -1 if
+     * the byte at <code>pos</code> matches none of the lead byte patterns.
+     *
+     * TODO : Should stop after the third byte, as a char is only 2 bytes
+     * long.
+     */
+    public static int countBytesPerChar(byte[] bytes, int pos)
+    {
+        // Inspect the byte at pos, not the first byte of the array.
+        int lead = bytes[pos];
+
+        if ((lead & UTF8_MULTI_BYTES_MASK) == 0)
+        {
+            // 0xxx-xxxx
+            return 1;
+        }
+        else if ((lead & UTF8_TWO_BYTES_MASK) == UTF8_TWO_BYTES)
+        {
+            // 110x-xxxx
+            return 2;
+        }
+        else if ((lead & UTF8_THREE_BYTES_MASK) == UTF8_THREE_BYTES)
+        {
+            // 1110-xxxx
+            return 3;
+        }
+        else if ((lead & UTF8_FOUR_BYTES_MASK) == UTF8_FOUR_BYTES)
+        {
+            // 1111-0xxx
+            return 4;
+        }
+        else if ((lead & UTF8_FIVE_BYTES_MASK) == UTF8_FIVE_BYTES)
+        {
+            // 1111-10xx
+            return 5;
+        }
+        else if ((lead & UTF8_SIX_BYTES_MASK) == UTF8_SIX_BYTES)
+        {
+            // 1111-110x
+            return 6;
+        }
+        else
+        {
+            // 10xx-xxxx or 1111-111x : not a lead byte
+            return -1;
+        }
+    }
+    
+    /**
+     * Return the Unicode char which is coded in the bytes at the given
+     * position. The lead byte gives the length of the sequence (1 to 6
+     * bytes) and its low bits; each following byte contributes its 6 low
+     * bits. The following bytes are not checked to be of the form
+     * 10xx-xxxx.
+     *
+     * The decoded value is narrowed to a <code>char</code>, so only its 16
+     * low bits are returned : values above 0xFFFF are truncated.
+     *
+     * @param bytes The byte[] representation of a Unicode string.
+     * @param pos The current position to start decoding the char
+     * @return The decoded char, or <code>(char)-1</code> (0xFFFF) if the
+     * byte at <code>pos</code> is not a valid lead byte
+     * @throws ArrayIndexOutOfBoundsException if the sequence announced by
+     * the lead byte runs past the end of <code>bytes</code>
+     *
+     * TODO : Should stop after the third byte, as a char is only 2 bytes
+     * long.
+     */
+    public static char bytesToChar(byte[] bytes, int pos)
+    {
+        int lead = bytes[pos];
+        int nbBytes;
+        int value;
+
+        if ((lead & UTF8_MULTI_BYTES_MASK) == 0)
+        {
+            // 0xxx-xxxx : the byte is the char
+            return (char) lead;
+        }
+        else if ((lead & UTF8_TWO_BYTES_MASK) == UTF8_TWO_BYTES)
+        {
+            // 110x-xxxx 10zz-zzzz -> 0000-0xxx xxzz-zzzz (07FF)
+            nbBytes = 2;
+            value = lead & 0x1F;
+        }
+        else if ((lead & UTF8_THREE_BYTES_MASK) == UTF8_THREE_BYTES)
+        {
+            // 1110-tttt 10xx-xxxx 10zz-zzzz -> tttt-xxxx xxzz-zzzz (FFFF)
+            nbBytes = 3;
+            value = lead & 0x0F;
+        }
+        else if ((lead & UTF8_FOUR_BYTES_MASK) == UTF8_FOUR_BYTES)
+        {
+            // 1111-0ttt + 3 following bytes -> 21 bits (1FFFFF)
+            nbBytes = 4;
+            value = lead & 0x07;
+        }
+        else if ((lead & UTF8_FIVE_BYTES_MASK) == UTF8_FIVE_BYTES)
+        {
+            // 1111-10tt + 4 following bytes -> 26 bits (03FFFFFF)
+            nbBytes = 5;
+            value = lead & 0x03;
+        }
+        else if ((lead & UTF8_SIX_BYTES_MASK) == UTF8_SIX_BYTES)
+        {
+            // 1111-110s + 5 following bytes -> 31 bits (7FFFFFFF)
+            nbBytes = 6;
+            value = lead & 0x01;
+        }
+        else
+        {
+            // Not a lead byte : nothing can be decoded
+            return (char) -1;
+        }
+
+        // Append the 6 low bits of each following byte, most significant
+        // group first.
+        for (int i = 1; i < nbBytes; i++)
+        {
+            value = (value << 6) | (bytes[pos + i] & 0x3F);
+        }
+
+        // Narrowing keeps the 16 low bits only.
+        return (char) value;
+    }
+    
+    /**
+     * Count the number of chars included in the given byte[]. The array is
+     * walked from index 0, jumping over each encoded char using the length
+     * returned by {@link #countBytesPerChar(byte[], int)}.
+     *
+     * A last sequence which is cut by the end of the array is still
+     * counted as one char.
+     *
+     * @param bytes The byte array to decode
+     * @return The number of char in the byte array (0 for an empty array),
+     * or -1 if a byte which is not a valid lead byte is met
+     */
+    public static int countChars(byte[] bytes)
+    {
+        int nbChars = 0;
+        int currentPos = 0;
+
+        while (currentPos < bytes.length)
+        {
+            int nbBytes = countBytesPerChar(bytes, currentPos);
+
+            if (nbBytes < 1)
+            {
+                // Wrong encoding : adding -1 to the position would move
+                // backward, so stop here and report the error.
+                return -1;
+            }
+
+            currentPos += nbBytes;
+            nbChars++;
+        }
+
+        return nbChars;
     }
 }


Reply via email to