Re: error detected at """ ch in unicode.C """ Library error?

Charles Hixson via Digitalmars-d-learn Thu, 22 Oct 2015 18:46:26 -0700


On 10/21/2015 06:21 PM, Charles Hixson via Digitalmars-d-learn wrote:

To me this looks like a library error, but I'm not sure.  Any suggestions?
import    std.uni;

/** Minimal reproduction: map a code point to the first letter of its
 *  Unicode General Category by testing membership in the std.uni sets.
 *  Returns: 'L', 'M', 'C', 'N', 'P', 'S', 'Z', or '?' when nothing matches.
 *  Note: this version does not compile; the `unicode.C` lookup is rejected
 *  at compile time (see the compiler output that follows). */
char    gcCat1    (dchar ch)
{  if    (ch in unicode.L)        return    'L';        // Letter
    if    (ch in unicode.M)        return    'M';        // Mark
if (ch in unicode.C) return 'C'; // Control  <<== error here! (opDispatch!"C" fails its static assert)
    if    (ch in unicode.N)        return    'N';        // Numeric
    if    (ch in unicode.P)        return    'P';        // Punctuation
    if    (ch in unicode.S)        return    'S';        // Symbol
    if    (ch in unicode.Z)        return    'Z';        // Separator

    // None of the sets above contained ch
    return    '?';
}

$ rdmd --main -unittest test2.d
/usr/include/dmd/phobos/std/uni.d(6220): Error: slice [0..2] exceeds array bounds [0..1]
/usr/include/dmd/phobos/std/uni.d(6220):        called from here: comparePropertyName(name[0..2], "In")
/usr/include/dmd/phobos/std/uni.d(6119):        called from here: findAny("C")
/usr/include/dmd/phobos/std/uni.d(6122): Error: static assert  "No unicode set by name C was found."
test2.d(7):        instantiated from here: opDispatch!"C"
Failed: ["dmd", "-unittest", "-v", "-o-", "test2.d", "-I."]


$ dmd
DMD64 D Compiler v2.068.2
Copyright (c) 1999-2015 by Digital Mars written by Walter Bright
Documentation: http://dlang.org/
Config file: /etc/dmd.conf
...

Here is a routine for general character category that works except for control characters, which appear to be mishandled by the library. (If there's another explanation, I'd like to hear it.)

Note:

The unittests are excessively verbose, but the way to silence them is pretty obvious, except in the cases that fail. This is NOT a comprehensive unittest, so there may be errors not currently detected.


Also, if there's a better way to do this, I'd like to hear it.

import   std.uni;

/**   Classify a code point into a coarse, single-letter category loosely
 *    modelled on the first letter of the Unicode General Category.
 * Warning:  The classification is tuned for English and related languages.
 * Category Q ("embeddable" characters such as those used inside SSNs,
 * phone numbers, dates and times) is part of the design, but the test
 * for it is currently commented out, so 'Q' is never produced.
 * Params:
 *    ch = the code point to classify
 * Returns: 'L' for alphabetic and numeric characters (deliberately merged),
 *    'Z' for white space, 'C' for control characters, 'P' for punctuation,
 *    and '?' for anything else, including non-characters. */
char  charCat (dchar ch)
{
   // Letters and numbers are intentionally not distinguished
   if (isAlpha (ch) || isNumber (ch) )
      return 'L';

   // White space is tested before control characters
   if (isWhite (ch) )
      return 'Z';

   if (isControl (ch) )
      return 'C';

   // Disabled: 'Q' is not a Unicode grouping
   // if ("'-+.,@/’‘:/".indexOf (ch) >= 0) return 'Q';

   // Whatever is not punctuation is unclassifiable here
   return isPunctuation (ch) ? 'P' : '?';
}

/**   Get the one letter Unicode General Category of a code point.
 * Params:
 *    ch = the code point to classify
 * Returns: 'L' (Letter), 'M' (Mark), 'C' (Other), 'N' (Number),
 *    'P' (Punctuation), 'S' (Symbol), 'Z' (Separator), or '?' if the code
 *    point is in none of those sets. */
char  gcCat1   (dchar ch)
{  if (ch in unicode.L)    return   'L';     // Letter
   if (ch in unicode.M)    return   'M';     // Mark

   // `unicode.C` is rejected at compile time by the std.uni shipped with
   // DMD 2.068.2, and isControl() only recognises Cc.  Testing each "Other"
   // subcategory by its two letter name covers Cf, Cn, Co and Cs as well.
   if ((ch in unicode.Cc) || (ch in unicode.Cf) || (ch in unicode.Cn)
         || (ch in unicode.Co) || (ch in unicode.Cs))
                           return   'C';     // Other (control, format, ...)

   if (ch in unicode.N)    return   'N';     // Numeric
   if (ch in unicode.P)    return   'P';     // Punctuation
   if (ch in unicode.S)    return   'S';     // Symbol
   if (ch in unicode.Z)    return   'Z';     // Separator

   return   '?';
}
/**   Get the two letter general character category.
 * The major class is taken from gcCat1(); the code point is then looked up
 * in each subcategory set of that class.
 * Params:
 *    ch = the code point to classify
 * Returns: the two letter category name (e.g. "Lu", "Nd", "Zs").  If the
 *    major class is known but no subcategory matches, the class letter
 *    followed by '?' (e.g. "C?"); if the major class is unknown, "??". */
string   gcCat2 (dchar ch)
{  char  kind  =  gcCat1(ch);
   switch (kind)
   {  case  'C':     // C  Other
         if (ch in unicode.Cc)               return   "Cc";    // Control
         if (ch in unicode.Cf)               return   "Cf";    // Format
         if (ch in unicode.Cn)               return   "Cn";    // Unassigned
         if (ch in unicode.Co)               return   "Co";    // Private_Use
         if (ch in unicode.Cs)               return   "Cs";    // Surrogate
         return   "C?";                                        // Unexpected value
      case  'L':     // L  Letter
         if (ch in unicode.Ll)               return   "Ll";    // Lowercase_Letter
         if (ch in unicode.Lm)               return   "Lm";    // Modifier_Letter
         if (ch in unicode.Lo)               return   "Lo";    // Other_Letter
         if (ch in unicode.Lt)               return   "Lt";    // Titlecase_Letter
         if (ch in unicode.Lu)               return   "Lu";    // Uppercase_Letter
         return   "L?";                                        // Unexpected Letter
      case  'M':     // M  Mark
         if (ch in unicode.Mc)               return   "Mc";    // Spacing_Mark
         if (ch in unicode.Me)               return   "Me";    // Enclosing_Mark
         if (ch in unicode.Mn)               return   "Mn";    // Nonspacing_Mark
         return   "M?";                                        // Unexpected Mark
      case  'N':     // N  Number
         if (ch in unicode.Nd)               return   "Nd";    // Decimal_Number
         if (ch in unicode.Nl)               return   "Nl";    // Letter_Number
         if (ch in unicode.No)               return   "No";    // Other_Number
         return   "N?";                                        // Unexpected Number
      case  'P':     // P  Punctuation
         if (ch in unicode.Pc)               return   "Pc";    // Connector_Punctuation
         if (ch in unicode.Pd)               return   "Pd";    // Dash_Punctuation
         if (ch in unicode.Pe)               return   "Pe";    // Close_Punctuation
         if (ch in unicode.Pf)               return   "Pf";    // Final_Punctuation
         if (ch in unicode.Pi)               return   "Pi";    // Initial_Punctuation
         if (ch in unicode.Po)               return   "Po";    // Other_Punctuation
         if (ch in unicode.Ps)               return   "Ps";    // Open_Punctuation
         return   "P?";                                        // Unexpected Punctuation
      case  'S':     // S  Symbol
         if (ch in unicode.Sc)               return   "Sc";    // Currency_Symbol
         if (ch in unicode.Sk)               return   "Sk";    // Modifier_Symbol
         if (ch in unicode.Sm)               return   "Sm";    // Math_Symbol
         if (ch in unicode.So)               return   "So";    // Other_Symbol
         return   "S?";                                        // Unexpected Symbol
      case  'Z':     // Z  Separator
         if (ch in unicode.Zl)               return   "Zl";    // Line_Separator
         if (ch in unicode.Zp)               return   "Zp";    // Paragraph_Separator
         if (ch in unicode.Zs)               return   "Zs";    // Space_Separator
         // Upper-case 'Z' so this fallback matches the other "X?" results
         return   "Z?";                                        // Unexpected Separator
      default:
         // Unexpected Kind
         return   "??";
   }
}  // string   gcCat2 (dchar ch)
/* Smoke test for gcCat2(): prints the computed two letter category of a
 * sample code point from most General Category values.  The expected value
 * is given in the trailing comment of each line.  This is a verbose,
 * non-comprehensive test; only the "Other" (C*) samples are checked
 * programmatically, the rest are for visual inspection. */
unittest
{
   // Scoped import so the block does not depend on a file-level std.stdio import
   import std.stdio : writeln;

   // Print the category of ch and flag the line only when it differs from
   // the expected value, so the marker is accurate whatever gcCat1() does.
   void  check (string label, dchar ch, string expected)
   {  immutable   got   =  gcCat2 (ch);
      if (got == expected)
         writeln (label, " == ", got);
      else
         writeln (label, " == ", got, " <<== FAIL, should be \"", expected, "\"");
   }

   writeln ("\\a == ", gcCat2 ('\a') );      // Cc
   writeln ("\\n == ", gcCat2 ('\n') );      // Cc
   writeln ("\\r == ", gcCat2 ('\r') );      // Cc
   writeln ("\\t == ", gcCat2 ('\t') );      // Cc
   writeln ("\\b == ", gcCat2 ('\b') );      // Cc

   check ("u00AD (SHY)", '\u00AD', "Cf");       // Cf
   check ("u0600", '\u0600', "Cf");             // Cf
   check ("U000E007F", '\U000E007F', "Cf");     // Cf
   // U+D800 and U+DB7F are high surrogates: category Cs, not Co
   check ("uD800", cast(dchar) 0xD800, "Cs");   // Cs
   check ("uDB7F", cast(dchar) 0xDB7F, "Cs");   // Cs

   writeln ("a == ", gcCat2 ('a') );         // Ll
   writeln ("ʰ == ", gcCat2 ('ʰ') );         // Lm
   writeln ("ª == ", gcCat2 ('ª') );         // Lo
   writeln ("ǅ == ", gcCat2 ('ǅ') );         // Lt
   writeln ("A == ", gcCat2 ('A') );         // Lu
   writeln (" ः == ", gcCat2 ('ः') );        // Mc
   writeln ("   ⃤ == ", gcCat2 ('⃤') );         // Me
   writeln ("u065e == ", gcCat2 (0x65e));       // Mn
   writeln ("۶ == ", gcCat2 ('۶') );         // Nd
   writeln ("0 == ", gcCat2 ('0') );         // Nd
   writeln ("ᛯ == ", gcCat2 ('ᛯ') );         // Nl
   writeln ("¼ == ", gcCat2 ('¼') );         // No
   writeln ("_ == ", gcCat2 ('_') );         // Pc
   writeln ("- == ", gcCat2 ('-') );         // Pd
   writeln (") == ", gcCat2 (')') );         // Pe
   writeln ("] == ", gcCat2 (']') );         // Pe
   writeln ("} == ", gcCat2 ('}') );         // Pe
   writeln ("» == ", gcCat2 ('»') );         // Pf
   writeln ("« == ", gcCat2 ('«') );         // Pi
   writeln ("@ == ", gcCat2 ('@') );         // Po
   writeln (". == ", gcCat2 ('.') );         // Po
   writeln ("\" == ", gcCat2 ('"') );        // Po
   writeln ("{ == ", gcCat2 ('{') );         // Ps
   writeln ("[ == ", gcCat2 ('[') );         // Ps
   writeln ("( == ", gcCat2 ('(') );         // Ps
   writeln ("$ == ", gcCat2 ('$') );         // Sc
   writeln ("＾ == ", gcCat2 ('＾') );         // Sk
   writeln ("+ == ", gcCat2 ('+') );         // Sm
   writeln ("~ == ", gcCat2 ('~') );         // Sm
   writeln ("= == ", gcCat2 ('=') );         // Sm
   writeln ("© == ", gcCat2 ('©') );         // So
   writeln ("u2028 == ", gcCat2 (0x2028) );        // Zl
   writeln ("u2029 == ", gcCat2 (0x2029) );        // Zp
   writeln ("  == ", gcCat2 (' ') );         // Zs
}

Re: error detected at """ ch in unicode.C """ Library error?

Reply via email to