On 10/21/2015 06:21 PM, Charles Hixson via Digitalmars-d-learn wrote:
To me this looks like a library error, but I'm not sure. Any suggestions?
import std.uni;
/** Return the one-letter Unicode general category of ch, or '?' when
 * none of the tested categories contains it.
 * This is the version that fails to compile: the compiler output quoted
 * after this listing reports 'No unicode set by name C was found' for
 * opDispatch!"C". */
char gcCat1 (dchar ch)
{ if (ch in unicode.L) return 'L'; // Letter
if (ch in unicode.M) return 'M'; // Mark
if (ch in unicode.C) return 'C'; // Other (Control etc.) -- the lookup that is rejected
// The next line is the poster's marker for the failing statement above; it is not D code.
<<== error here!
if (ch in unicode.N) return 'N'; // Number
if (ch in unicode.P) return 'P'; // Punctuation
if (ch in unicode.S) return 'S'; // Symbol
if (ch in unicode.Z) return 'Z'; // Separator
return '?'; // not in any category tested above
}
$ rdmd --main -unittest test2.d
/usr/include/dmd/phobos/std/uni.d(6220): Error: slice [0..2] exceeds
array bounds [0..1]
/usr/include/dmd/phobos/std/uni.d(6220): called from here:
comparePropertyName(name[0..2], "In")
/usr/include/dmd/phobos/std/uni.d(6119): called from here:
findAny("C")
/usr/include/dmd/phobos/std/uni.d(6122): Error: static assert "No
unicode set by name C was found."
test2.d(7): instantiated from here: opDispatch!"C"
Failed: ["dmd", "-unittest", "-v", "-o-", "test2.d", "-I."]
$ dmd
DMD64 D Compiler v2.068.2
Copyright (c) 1999-2015 by Digital Mars written by Walter Bright
Documentation: http://dlang.org/
Config file: /etc/dmd.conf
...
Here is a routine for general character category that works except for
control characters, which appear to be mishandled by the library. (If
there's another explanation, I'd like to hear it.)
Note:
The unittests are excessively verbose, but the way to silence them is
pretty obvious, except in the cases that fail. This is NOT a
comprehensive unittest, so there may be errors not currently detected.
Also, if there's a better way to do this, I'd like to hear it.
import std.uni;
/** Return a character approximately equivalent to the first character
 * of the Unicode General Category classification.
 *
 * Warning: This routine has been specialized for English and related
 * languages. Category Q is meant for "embeddable" characters (those
 * often used in SSI#s, phone #s, dates, and times), but that test is
 * currently commented out, so 'Q' is never returned.
 *
 * The tests run in a fixed order and the first match wins, so a
 * character that is both white space and a control character (such as
 * '\n') is reported as 'Z'.
 *
 * Params:
 *     ch = the code point to classify
 * Returns: 'L' for alphabetic or numeric characters (numbers are
 *     deliberately combined with letters), 'Z' for white space, 'C' for
 *     control characters, 'P' for punctuation, and '?' for anything
 *     else, including values that are not characters. */
char charCat (dchar ch)
{
    if (isAlpha (ch) )          return 'L';
    // Letters and numbers are deliberately not distinguished.
    if (isNumber (ch) )         return 'L';
    if (isWhite (ch) )          return 'Z';
    if (isControl (ch) )        return 'C';
    // Disabled: 'Q' is not a Unicode grouping. Re-enabling this line
    // also requires indexOf to be in scope (presumably std.string).
    // if ("'-+.,@/’‘:/".indexOf (ch) >= 0) return 'Q';
    if (isPunctuation (ch) )    return 'P';
    return '?';                 // Includes not a character
}
/** Return the one-letter Unicode general category of a code point.
 *
 * Params:
 *     ch = the code point to classify
 * Returns: 'L' (Letter), 'M' (Mark), 'C' (Other: control, format,
 *     private use, surrogate, unassigned), 'N' (Number),
 *     'P' (Punctuation), 'S' (Symbol), 'Z' (Separator), or '?' when
 *     none of these sets contains ch. */
char gcCat1 (dchar ch)
{
    if (ch in unicode.L) return 'L'; // Letter
    if (ch in unicode.M) return 'M'; // Mark
    // `ch in unicode.C` is rejected by the compiler release this was
    // written for ("No unicode set by name C was found"), and isControl
    // is no substitute because it covers only Cc. Build category C from
    // its two-letter subcategories instead, so that format, private-use,
    // surrogate and unassigned code points are also reported as 'C'.
    if ((ch in unicode.Cc) || (ch in unicode.Cf) || (ch in unicode.Co)
            || (ch in unicode.Cs) || (ch in unicode.Cn))
        return 'C';                  // Other
    if (ch in unicode.N) return 'N'; // Number
    if (ch in unicode.P) return 'P'; // Punctuation
    if (ch in unicode.S) return 'S'; // Symbol
    if (ch in unicode.Z) return 'Z'; // Separator
    return '?';
}
/** Look up the two-letter subcategory of general category C (Other).
 * Returns null when ch is in none of the C subcategories. */
private string gcOtherSubcategory (dchar ch)
{
    if (ch in unicode.Cc) return "Cc"; // Control
    if (ch in unicode.Cf) return "Cf"; // Format
    if (ch in unicode.Cn) return "Cn"; // Unassigned
    if (ch in unicode.Co) return "Co"; // Private_Use
    if (ch in unicode.Cs) return "Cs"; // Surrogate
    return null;
}

/** Get the two letter general character category.
 *
 * Params:
 *     ch = the code point to classify
 * Returns: the two-letter Unicode general category (e.g. "Lu", "Nd",
 *     "Zs"). If the major category is known but no subcategory matches,
 *     the result is the major letter followed by '?' (e.g. "L?"). If
 *     nothing matches at all the result is "??". */
string gcCat2 (dchar ch)
{
    switch (gcCat1 (ch))
    {
        case 'C': // C Other
        {
            string sub = gcOtherSubcategory (ch);
            return sub.length ? sub : "C?";
        }
        case 'L': // L Letter
            if (ch in unicode.Ll) return "Ll"; // Lowercase_Letter
            if (ch in unicode.Lm) return "Lm"; // Modifier_Letter
            if (ch in unicode.Lo) return "Lo"; // Other_Letter
            if (ch in unicode.Lt) return "Lt"; // Titlecase_Letter
            if (ch in unicode.Lu) return "Lu"; // Uppercase_Letter
            return "L?";                       // Unexpected Letter
        case 'M': // M Mark
            if (ch in unicode.Mc) return "Mc"; // Spacing_Mark
            if (ch in unicode.Me) return "Me"; // Enclosing_Mark
            if (ch in unicode.Mn) return "Mn"; // Nonspacing_Mark
            return "M?";                       // Unexpected Mark
        case 'N': // N Number
            if (ch in unicode.Nd) return "Nd"; // Decimal_Number
            if (ch in unicode.Nl) return "Nl"; // Letter_Number
            if (ch in unicode.No) return "No"; // Other_Number
            return "N?";                       // Unexpected Number
        case 'P': // P Punctuation
            if (ch in unicode.Pc) return "Pc"; // Connector_Punctuation
            if (ch in unicode.Pd) return "Pd"; // Dash_Punctuation
            if (ch in unicode.Pe) return "Pe"; // Close_Punctuation
            if (ch in unicode.Pf) return "Pf"; // Final_Punctuation
            if (ch in unicode.Pi) return "Pi"; // Initial_Punctuation
            if (ch in unicode.Po) return "Po"; // Other_Punctuation
            if (ch in unicode.Ps) return "Ps"; // Open_Punctuation
            return "P?";                       // Unexpected Punctuation
        case 'S': // S Symbol
            if (ch in unicode.Sc) return "Sc"; // Currency_Symbol
            if (ch in unicode.Sk) return "Sk"; // Modifier_Symbol
            if (ch in unicode.Sm) return "Sm"; // Math_Symbol
            if (ch in unicode.So) return "So"; // Other_Symbol
            return "S?";                       // Unexpected Symbol
        case 'Z': // Z Separator
            if (ch in unicode.Zl) return "Zl"; // Line_Separator
            if (ch in unicode.Zp) return "Zp"; // Paragraph_Separator
            if (ch in unicode.Zs) return "Zs"; // Space_Separator
            return "Z?";                       // Unexpected Separator (was "z?")
        default:
        {
            // gcCat1 could not name a major category. Its control test
            // may cover only Cc, so still try the C subcategories
            // before giving up.
            string sub = gcOtherSubcategory (ch);
            return sub.length ? sub : "??";    // Unexpected Kind
        }
    }
} // string gcCat2 (dchar ch)
/* Verbose, non-exhaustive check of gcCat2: one line is printed per
 * sample, and a line is flagged only when the category found differs
 * from the expected one. Nothing is asserted, so a mismatch is
 * reported but does not abort the test run. */
unittest
{
    import std.stdio : writeln;

    // Print "<label> == <category>", appending a FAIL note on mismatch.
    void show (string label, dchar ch, string expected)
    {
        string found = gcCat2 (ch);
        if (found == expected)
            writeln (label, " == ", found);
        else
            writeln (label, " == ", found,
                     " <<== FAIL, should be \"", expected, "\"");
    }

    show ("\\a", '\a', "Cc");
    show ("\\n", '\n', "Cc");
    show ("\\r", '\r', "Cc");
    show ("\\t", '\t', "Cc");
    show ("\\b", '\b', "Cc");
    show ("u00AD (SHY)", '\u00AD', "Cf");
    show ("u0600", '\u0600', "Cf");
    show ("U000E007F", '\U000E007F', "Cf");
    // U+D800 and U+DB7F are surrogate code points, i.e. Cs rather than Co.
    show ("uD800", 0xD800, "Cs");
    show ("uDB7F", 0xDB7F, "Cs");
    show ("a", 'a', "Ll");
    show ("ʰ", 'ʰ', "Lm");
    show ("ª", 'ª', "Lo");
    show ("Dž", 'Dž', "Lt");
    show ("A", 'A', "Lu");
    show (" ः", 'ः', "Mc");
    show (" ⃤", '⃤', "Me");
    show ("u065e", 0x65e, "Mn");
    show ("۶", '۶', "Nd");
    show ("0", '0', "Nd");
    show ("ᛯ", 'ᛯ', "Nl");
    show ("¼", '¼', "No");
    show ("_", '_', "Pc");
    show ("-", '-', "Pd");
    show (")", ')', "Pe");
    show ("]", ']', "Pe");
    show ("}", '}', "Pe");
    show ("»", '»', "Pf");
    show ("«", '«', "Pi");
    show ("@", '@', "Po");
    show (".", '.', "Po");
    show ("\"", '"', "Po");
    show ("{", '{', "Ps");
    show ("[", '[', "Ps");
    show ("(", '(', "Ps");
    show ("$", '$', "Sc");
    show ("^", '^', "Sk");
    show ("+", '+', "Sm");
    show ("~", '~', "Sm");
    show ("=", '=', "Sm");
    show ("©", '©', "So");
    show ("u2028", 0x2028, "Zl");
    show ("u2029", 0x2029, "Zp");
    show ("", ' ', "Zs");
}