On Sunday, 16 October 2016 at 07:59:16 UTC, Patrick Schluter
wrote:
Here is my version. It's probably not the shortest (100 lines of
assembly with LDC) but it is correct and has the following
properties:
- Performance proportional to the encoding length
- Detects Invalid byte sequences
- Detects Overlong encodings
- Detects Invalid code points
I throw an exception so that it is comparable to the other routines, but
Unicode specifies that it is preferable not to abort on
encoding errors (to avoid denial-of-service attacks).
/**
 * Decodes the first UTF-8 encoded code point of `str`.
 *
 * The slice is only read; it is not advanced or otherwise modified.
 * The amount of work is proportional to the length of the encoded sequence.
 *
 * Rejected inputs (all reported by throwing):
 *  - an empty slice, or a sequence truncated by the end of the slice
 *  - a stray continuation byte (0x80..0xBF) in lead position
 *  - lead bytes 0xC0/0xC1 and any other overlong encoding
 *  - a missing or malformed continuation byte
 *  - surrogate code points U+D800..U+DFFF
 *  - code points above U+10FFFF (including lead bytes 0xF5..0xFF)
 *
 * Example: for `char[] s = [0xC2, 0xA2]`, `myFront2(s)` yields U+00A2.
 *
 * Params:
 *   str = UTF-8 encoded text whose first code point is decoded
 * Returns: the decoded code point
 * Throws: Exception if `str` does not start with a well-formed UTF-8 sequence
 */
dchar myFront2(ref char[] str)
{
    immutable size_t len = str.length;

    // Guard the unchecked str.ptr[...] reads below: never touch byte 0 of an empty slice.
    if (len > 0)
    {
        immutable uint c0 = str.ptr[0];

        if (c0 < 0x80)
        {
            // Single byte (ASCII).
            return cast(dchar) c0;
        }
        else if (c0 >= 0xC2 && c0 < 0xE0)
        {
            // Two-byte sequence. Lead bytes 0x80..0xBF are continuation bytes and
            // 0xC0/0xC1 could only produce overlong encodings, so starting at 0xC2
            // rejects both without a separate range check on the result.
            if (len > 1)
            {
                immutable uint c1 = str.ptr[1];
                if ((c1 & 0xC0) == 0x80)
                    return cast(dchar)(((c0 & 0x1F) << 6) | (c1 & 0x3F));
            }
        }
        else if (c0 >= 0xE0 && c0 < 0xF0)
        {
            // Three-byte sequence.
            if (len > 2)
            {
                immutable uint c1 = str.ptr[1];
                immutable uint c2 = str.ptr[2];
                if ((c1 & 0xC0) == 0x80 && (c2 & 0xC0) == 0x80)
                {
                    immutable uint cp = ((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F);
                    // Below 0x800 is overlong; 0xD800..0xDFFF are surrogates,
                    // which are not valid scalar values in UTF-8.
                    if (cp >= 0x800 && (cp < 0xD800 || cp > 0xDFFF))
                        return cast(dchar) cp;
                }
            }
        }
        else if (c0 >= 0xF0 && c0 < 0xF5)
        {
            // Four-byte sequence.
            if (len > 3)
            {
                immutable uint c1 = str.ptr[1];
                immutable uint c2 = str.ptr[2];
                immutable uint c3 = str.ptr[3];
                if ((c1 & 0xC0) == 0x80 && (c2 & 0xC0) == 0x80 && (c3 & 0xC0) == 0x80)
                {
                    // The lead byte contributes bits 18..20; each continuation
                    // byte contributes six bits below that (12, 6, 0).
                    immutable uint cp = ((c0 & 0x07) << 18) | ((c1 & 0x3F) << 12)
                                      | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
                    // Below 0x10000 is overlong; above 0x10FFFF is outside Unicode.
                    if (cp >= 0x10000 && cp <= 0x10FFFF)
                        return cast(dchar) cp;
                }
            }
        }
    }

    // Every path that did not return a code point is an encoding error.
    throw new Exception("yadayada");
}
This looks quite slow.
We already have a correct version in utf.decodeImpl.
The goal here was to find a small and fast alternative.