On 10/21/2015 06:21 PM, Charles Hixson via Digitalmars-d-learn wrote:
To me this looks like a library error, but I'm not sure.  Any suggestions?
import    std.uni;

/// Returns the first letter of the Unicode General Category of `ch`
/// ('L', 'M', 'C', 'N', 'P', 'S' or 'Z'), or '?' if no set matches.
/// NOTE: this version is the failing example -- the `unicode.C` lookup is
/// rejected at compile time ("No unicode set by name C was found").
char    gcCat1    (dchar ch)
{  if    (ch in unicode.L)        return    'L';        // Letter
    if    (ch in unicode.M)        return    'M';        // Mark
if (ch in unicode.C) return 'C'; // Other (incl. Control) <<== error here!
    if    (ch in unicode.N)        return    'N';        // Number
    if    (ch in unicode.P)        return    'P';        // Punctuation
    if    (ch in unicode.S)        return    'S';        // Symbol
    if    (ch in unicode.Z)        return    'Z';        // Separator

    // None of the major classes matched.
    return    '?';
}

$ rdmd --main -unittest test2.d
/usr/include/dmd/phobos/std/uni.d(6220): Error: slice [0..2] exceeds array bounds [0..1]
/usr/include/dmd/phobos/std/uni.d(6220): called from here: comparePropertyName(name[0..2], "In")
/usr/include/dmd/phobos/std/uni.d(6119): called from here: findAny("C")
/usr/include/dmd/phobos/std/uni.d(6122): Error: static assert "No unicode set by name C was found."
test2.d(7):        instantiated from here: opDispatch!"C"
Failed: ["dmd", "-unittest", "-v", "-o-", "test2.d", "-I."]


$ dmd
DMD64 D Compiler v2.068.2
Copyright (c) 1999-2015 by Digital Mars written by Walter Bright
Documentation: http://dlang.org/
Config file: /etc/dmd.conf
...

Here is a routine for general character category that works except for control characters, which appear to be mishandled by the library. (If there's another explanation, I'd like to hear it.)
Note:
The unittests are excessively verbose, but the way to silence them is pretty obvious, except in the cases that fail. This is NOT a comprehensive unittest, so there may be errors not currently detected.

Also, if there's a better way to do this, I'd like to hear it.

import   std.uni;

/**   Classify a character into a coarse, one-letter class loosely modelled
 *    on the first letter of the Unicode General Category.
 * Warning:  The classification is tailored to English and related
 * languages.  Class Q is intended for "embeddable" characters (those
 * often used inside SSI#s, phone #s, dates, and times), but the test that
 * produces it is currently commented out, so Q is never returned.
 * Returns: L (alphabetic or numeric), Z (white space), C (control),
 *    P (punctuation), or ? (anything else, including non-characters). */
char  charCat (dchar ch)
{  // Letters and numbers are deliberately folded into a single class.
   if (isAlpha (ch) || isNumber (ch) )    return   'L';
   if (isWhite (ch) )                     return   'Z';
   if (isControl (ch) )                   return   'C';
   // Disabled "embeddable" class (not a Unicode grouping):
   // if ("'-+.,@/’‘:/".indexOf (ch) >= 0)   return   'Q';
   return   isPunctuation (ch)   ?  'P'   :  '?';
}

/**   Get the one letter major class of the Unicode General Category.
 * Params:  ch = the code point to classify.
 * Returns: L (Letter), M (Mark), C (Other), N (Number), P (Punctuation),
 *    S (Symbol), Z (Separator), or ? if ch is in none of those sets. */
char  gcCat1   (dchar ch)
{  if (ch in unicode.L)    return   'L';     // Letter
   if (ch in unicode.M)    return   'M';     // Mark
   // C  Other.  `unicode.C` is rejected at compile time by the Phobos this
   // was written against, and isControl() only recognises Cc, which left
   // Cf/Cn/Co/Cs unclassified.  Test each subcategory explicitly instead.
   if ((ch in unicode.Cc) || (ch in unicode.Cf) || (ch in unicode.Cn)
         || (ch in unicode.Co) || (ch in unicode.Cs))
                           return   'C';     // Other
   if (ch in unicode.N)    return   'N';     // Number
   if (ch in unicode.P)    return   'P';     // Punctuation
   if (ch in unicode.S)    return   'S';     // Symbol
   if (ch in unicode.Z)    return   'Z';     // Separator

   // Not a member of any major class.
   return   '?';
}
/**   Get the two letter Unicode General Category of a character.
 * The major class is obtained from gcCat1, then the subcategory sets of
 * that class are tested in turn.
 * Params:  ch = the code point to classify.
 * Returns: the two letter category (e.g. "Lu", "Nd", "Zs").  If the major
 *    class is known but no subcategory matches, the class letter followed
 *    by '?' (e.g. "L?"); if gcCat1 reports no known class, "??". */
string   gcCat2 (dchar ch)
{  char  kind  =  gcCat1(ch);
   switch (kind)
   {  case  'C':     // C  Other
         //    Cc    Control
         if (ch in unicode.Cc)               return   "Cc";
         // Cf    Format
         if (ch in unicode.Cf)               return   "Cf";
         // Cn    Unassigned
         if (ch in unicode.Cn)               return   "Cn";
         // Co    Private_Use
         if (ch in unicode.Co)               return   "Co";
         // Cs    Surrogate
         if (ch in unicode.Cs)               return   "Cs";
         // Unexpected Other
         return   "C?";
      case  'L':     // L  Letter
         // Ll    Lowercase_Letter
         if (ch in unicode.Ll)               return   "Ll";
         // Lm    Modifier_Letter
         if (ch in unicode.Lm)               return   "Lm";
         // Lo    Other_Letter
         if (ch in unicode.Lo)               return   "Lo";
         // Lt    Titlecase_Letter
         if (ch in unicode.Lt)               return   "Lt";
         // Lu    Uppercase_Letter
         if (ch in unicode.Lu)               return   "Lu";
         // Unexpected Letter
         return   "L?";
      case  'M':     // M  Mark
         // Mc    Spacing_Mark
         if (ch in unicode.Mc)               return   "Mc";
         // Me    Enclosing_Mark
         if (ch in unicode.Me)               return   "Me";
         // Mn    Nonspacing_Mark
         if (ch in unicode.Mn)               return   "Mn";
         // Unexpected Mark
         return   "M?";
      case  'N':     // N  Number
         // Nd    Decimal_Number
         if (ch in unicode.Nd)               return   "Nd";
         // Nl    Letter_Number
         if (ch in unicode.Nl)               return   "Nl";
         // No    Other_Number
         if (ch in unicode.No)               return   "No";
         // Unexpected Number
         return   "N?";
      case  'P':     // P  Punctuation
         // Pc    Connector_Punctuation
         if (ch in unicode.Pc)               return   "Pc";
         // Pd    Dash_Punctuation
         if (ch in unicode.Pd)               return   "Pd";
         // Pe    Close_Punctuation
         if (ch in unicode.Pe)               return   "Pe";
         // Pf    Final_Punctuation
         if (ch in unicode.Pf)               return   "Pf";
         // Pi    Initial_Punctuation
         if (ch in unicode.Pi)               return   "Pi";
         // Po    Other_Punctuation
         if (ch in unicode.Po)               return   "Po";
         // Ps    Open_Punctuation
         if (ch in unicode.Ps)               return   "Ps";
         // Unexpected Punctuation
         return   "P?";
      case  'S':     // S  Symbol
         // Sc    Currency_Symbol
         if (ch in unicode.Sc)               return   "Sc";
         // Sk    Modifier_Symbol
         if (ch in unicode.Sk)               return   "Sk";
         // Sm    Math_Symbol
         if (ch in unicode.Sm)               return   "Sm";
         // So    Other_Symbol
         if (ch in unicode.So)               return   "So";
         // Unexpected Symbol
         return   "S?";
      case  'Z':     // Z  Separator
         // Zl    Line_Separator
         if (ch in unicode.Zl)               return   "Zl";
         // Zp    Paragraph_Separator
         if (ch in unicode.Zp)               return   "Zp";
         // Zs    Space_Separator
         if (ch in unicode.Zs)               return   "Zs";
         // Unexpected Separator (upper case, matching the other fallbacks)
         return   "Z?";
      default:
         // Unexpected Kind
         return   "??";
   }
}  // string   gcCat2 (dchar ch)
/* Prints the two letter category reported by gcCat2 for one sample of each
 * General Category subcategory exercised below.  A line whose result
 * differs from the expected category is flagged with "<<== FAIL".
 * This is NOT a comprehensive test. */
unittest
{
   // Scoped import: writeln is only needed by this test.
   import std.stdio : writeln;

   // Print "<label> == <category>", flagging any mismatch with `expected`.
   void  show (string label, dchar ch, string expected)
   {  string   actual   =  gcCat2 (ch);
      if (actual == expected)
         writeln (label, " == ", actual);
      else
         writeln (label, " == ", actual,
                  " <<== FAIL, should be \"", expected, "\"");
   }

   show ("\\a", '\a', "Cc");
   show ("\\n", '\n', "Cc");
   show ("\\r", '\r', "Cc");
   show ("\\t", '\t', "Cc");
   show ("\\b", '\b', "Cc");
   show ("u00AD (SHY)", '\u00AD', "Cf");
   show ("u0600", '\u0600', "Cf");
   show ("U000E007F", '\U000E007F', "Cf");
   // U+D800 and U+DB7F are surrogate code points (Cs), not private use (Co).
   // They cannot be written as character literals, hence the casts.
   show ("uD800", cast(dchar) 0xD800, "Cs");
   show ("uDB7F", cast(dchar) 0xDB7F, "Cs");
   show ("a", 'a', "Ll");
   show ("ʰ", 'ʰ', "Lm");
   show ("ª", 'ª', "Lo");
   show ("Dž", 'Dž', "Lt");
   show ("A", 'A', "Lu");
   show (" ः", 'ः', "Mc");
   show ("   ⃤", '⃤', "Me");
   show ("u065e", cast(dchar) 0x65e, "Mn");
   show ("۶", '۶', "Nd");
   show ("0", '0', "Nd");
   show ("ᛯ", 'ᛯ', "Nl");
   show ("¼", '¼', "No");
   show ("_", '_', "Pc");
   show ("-", '-', "Pd");
   show (")", ')', "Pe");
   show ("]", ']', "Pe");
   show ("}", '}', "Pe");
   show ("»", '»', "Pf");
   show ("«", '«', "Pi");
   show ("@", '@', "Po");
   show (".", '.', "Po");
   show ("\"", '"', "Po");
   show ("{", '{', "Ps");
   show ("[", '[', "Ps");
   show ("(", '(', "Ps");
   show ("$", '$', "Sc");
   show ("^", '^', "Sk");
   show ("+", '+', "Sm");
   show ("~", '~', "Sm");
   show ("=", '=', "Sm");
   show ("©", '©', "So");
   show ("u2028", cast(dchar) 0x2028, "Zl");
   show ("u2029", cast(dchar) 0x2029, "Zp");
   show (" ", ' ', "Zs");
}

Reply via email to