On Sunday, 11 July 2021 at 05:54:48 UTC, Ali Çehreli wrote:
Ali
Primarily to Ali & Steve, for their help. Be advised: this post
will be somewhat ... long.
A bit of background to begin with: a week or so ago I posted
asking for advice on code safety, and I still have not replied to
those who kindly answered. Seeing some replies, and encountering
a code issue regarding string manipulation, I pretty soon figured
out that I still did not have solid knowledge of many basic
things regarding D, so I put the brakes on, went back to square
one and started reading and researching some things a bit more
... slowly.
One of the things that struck me this week is that Unicode string
manipulation is in many cases more complex than I previously
thought, because there is no precise concept of what a
character is in Unicode, at least not the way we are used to with
plain-old-ASCII. After reading a lot about it (this was good:
https://manishearth.github.io/blog/2017/01/14/stop-ascribing-meaning-to-unicode-code-points/) I learned of code-units, code-points, abstract-graphemes, grapheme-clusters, and the like.
And I learned the inner details of the UTF encodings and that
UTF-32 is best (almost required) for string processing (easier,
faster, etc) and of course UTF-8 for definitive storage, and
UTF-16 to the trashcan unless you need to interface with Windows
(I was previously using UTF-8 within all my code for processing).
So, in order to manipulate a string, say, left(n), right(n),
substr(n,m), ie: the usual stuff for many languages/libraries, I
need to operate on grapheme-clusters and not on code-points and
never ever on code-units, at least for unexpected text, ie:
incoming text, user-input, etc, the things that we cannot
control beforehand.
Both primary D books, Andrei's and Ali's, as well as the D
documentation, have plenty of examples, but they are mainly
focused on simple things like strings having
nothing-out-of-the-ordinary. They perform string manipulation
mainly by slicing the source string (ie: the char array) with the
functions of std.range like take, takeOne, etc.
I needed to settle these things once-and-for-all for my code and thus
I decided to build a grapheme-aware UDT that, once instantiated
with any given string, will provide the usual string manipulation
functions so I can forget the minutiae about them. The unittest
at the bottom has many usage examples.
The whole UDT needed to be templated for the three string types
(string, dstring, wstring -and nothing else) and this was what
produced this post to begin with. This issue was solved; not the
way I would have liked, but solved. The code works, except for something
that smells like a phobos bug (# 20483) when using foreach with
grapheme arrays (foreach always missing the last one).
I ended up with the following (as usual, advice/suggestions
welcome):
```d
/// testing D on 2021-06~07
import std.algorithm : map, joiner;
import std.array : array;
import std.conv : to;
import std.range : walkLength, take, tail, drop, dropBack;
import std.stdio;
import std.uni : Grapheme, byGrapheme;
/// Base string types, one alias per encoding name used throughout this file.
/// NOTE(review): stringUTF16 is bound to dstring (dchar code units) and
/// stringUTF32 to wstring (wchar code units), which looks swapped relative to
/// the names. The unittest's literals and expected lengths rely on the current
/// binding, so confirm before renaming anything.
alias stringUTF08 = immutable(char)[];  /// i.e. string
alias stringUTF16 = immutable(dchar)[]; /// i.e. dstring
alias stringUTF32 = immutable(wchar)[]; /// i.e. wstring

/// A single grapheme cluster, as provided by std.uni.
alias stringUGC = Grapheme;

/// Grapheme-cluster-aware wrappers, one per base string type above.
alias stringUGC08 = gudtUGC!stringUTF08;
alias stringUGC16 = gudtUGC!stringUTF16;
alias stringUGC32 = gudtUGC!stringUTF32;
/// Program entry point; intentionally does nothing (everything of interest runs in the unittest).
void main()
{
}
//mixin templateUGC!(stringUTF08, r"gudtUGC08"w); /// if these were possible there would be no need for the stringUGC## aliases in main()
//mixin templateUGC!(stringUTF16, r"gudtUGC16"w);
//mixin templateUGC!(stringUTF32, r"gudtUGC32"w);
//template templateUGC (
// typeStringUTF,
// alias lstrStructureID
// ) {
/// Unicode grapheme cluster‐aware string manipulation.
///
/// Wraps a UTF‐encoded string of type typeStringUTF, decoded once into an array
/// of grapheme clusters, so that the take/chop operations below count grapheme
/// clusters instead of code units or code points. All positions are 1‐based.
///
/// The struct also exposes empty/front/popFront, so foreach over an instance
/// yields one grapheme cluster at a time, re‐encoded as typeStringUTF.
public struct gudtUGC(typeStringUTF) {

    /// Decoded grapheme clusters (Grapheme is the type the module-level stringUGC alias names).
    private Grapheme[] pugcSequence;
    /// Number of grapheme clusters held in pugcSequence.
    private size_t pintSequenceCount = 0;
    /// 1‐based iteration cursor used by the range primitives.
    private size_t pintSequenceCurrent = 1;

    /// Builds the object by decoding lstrSequence (see decode).
    this(scope const typeStringUTF lstrSequence) {
        decode(lstrSequence);
    }

    /// Range primitive: advances the cursor to the next grapheme cluster.
    void popFront() { ++pintSequenceCurrent; }

    /// Range primitive: true once the cursor has moved past the last cluster.
    /// The cursor is 1‐based, so the last valid position equals the count;
    /// comparing with == here would stop one cluster early (and would never
    /// terminate for an empty sequence, where the cursor starts above the count).
    bool empty() { return pintSequenceCurrent > pintSequenceCount; }

    /// Range primitive: the grapheme cluster under the cursor, UTF‐encoded.
    typeStringUTF front() { return toUTFtake(pintSequenceCurrent); }

    /// Number of grapheme clusters currently held.
    @property public size_t count() { return pintSequenceCount; }

    /// Replaces the held sequence with the grapheme clusters of lstrSequence
    /// and rewinds the iteration cursor.
    /// Params: lstrSequence = UTF‐encoded text; null yields an empty sequence.
    /// Returns: the number of grapheme clusters decoded.
    @safe public size_t decode(
        scope const typeStringUTF lstrSequence
    ) {
        if (lstrSequence is null) {
            pugcSequence = null;
        } else {
            pugcSequence = lstrSequence.byGrapheme.array;
        }
        pintSequenceCount = pugcSequence.length;
        pintSequenceCurrent = 1;
        return pintSequenceCount;
    }

    /// Re‐encodes a run of grapheme clusters as a typeStringUTF string.
    @safe private typeStringUTF pstrEncode(Grapheme[] lugcClusters) {
        return lugcClusters
            .map!((ref g) => g[])
            .joiner
            .to!(typeStringUTF);
    }

    /// Unicode grapheme clusters to Unicode UTF‐encoded string.
    /// Returns: the whole sequence re‐encoded, or null when it holds no cluster.
    @safe public typeStringUTF encode() {
        if (pintSequenceCount >= 1) {
            return pstrEncode(pugcSequence);
        }
        return null;
    }

    /// Extracts lintCount grapheme clusters starting at the 1‐based position lintStart.
    /// eg: toUTFtake(1,3)  → slice 0..3
    /// eg: toUTFtake(6,3)  → slice 5..8
    /// eg: toUTFtake(50,1) → slice 49..50
    /// Returns: the UTF‐encoded clusters, or null when the requested span does
    /// not lie completely inside the sequence (including lintStart == 0).
    @safe public typeStringUTF toUTFtake(
        scope const size_t lintStart,
        scope const size_t lintCount = cast(size_t) 1
    ) {
        /// positions are 1‐based: 0 is invalid and would underflow below
        if (lintStart < 1 || lintCount > pintSequenceCount) {
            return null;
        }
        immutable size_t lintRange1 = lintStart - 1;
        /// written as a subtraction so that start + count can not overflow
        if (lintRange1 > pintSequenceCount - lintCount) {
            return null;
        }
        return pstrEncode(pugcSequence[lintRange1 .. lintRange1 + lintCount]);
    }

    /// Leftmost lintCount grapheme clusters, UTF‐encoded.
    /// Returns: null when lintCount exceeds the number of clusters held.
    @safe public typeStringUTF toUTFtakeL(
        scope const size_t lintCount
    ) {
        if (lintCount > pintSequenceCount) {
            return null;
        }
        return pstrEncode(pugcSequence[0 .. lintCount]);
    }

    /// Rightmost lintCount grapheme clusters, UTF‐encoded.
    /// Returns: null when lintCount exceeds the number of clusters held.
    @safe public typeStringUTF toUTFtakeR(
        scope const size_t lintCount
    ) {
        if (lintCount > pintSequenceCount) {
            return null;
        }
        return pstrEncode(pugcSequence[pintSequenceCount - lintCount .. pintSequenceCount]);
    }

    /// Everything except the leftmost lintCount grapheme clusters, UTF‐encoded.
    /// Returns: null when lintCount exceeds the number of clusters held.
    @safe public typeStringUTF toUTFchopL(
        scope const size_t lintCount
    ) {
        if (lintCount > pintSequenceCount) {
            return null;
        }
        return pstrEncode(pugcSequence[lintCount .. pintSequenceCount]);
    }

    /// Everything except the rightmost lintCount grapheme clusters, UTF‐encoded.
    /// Returns: null when lintCount exceeds the number of clusters held.
    @safe public typeStringUTF toUTFchopR(
        scope const size_t lintCount
    ) {
        if (lintCount > pintSequenceCount) {
            return null;
        }
        return pstrEncode(pugcSequence[0 .. pintSequenceCount - lintCount]);
    }

    /// Left padding up to lintCount grapheme clusters.
    /// Pending: not implemented yet, null is returned for every input.
    @safe public typeStringUTF toUTFpadL(
        scope const size_t lintCount,
        scope const typeStringUTF lstrPadding = cast(typeStringUTF) r" "
    ) {
        return null; /// pending
    }

    /// Right padding up to lintCount grapheme clusters.
    /// Pending: not implemented yet, null is returned for every input.
    @safe public typeStringUTF toUTFpadR(
        scope const size_t lintCount,
        scope const typeStringUTF lstrPadding = cast(typeStringUTF) r" "
    ) {
        return null; /// pending
    }

    /// Same span selection as toUTFtake, but returns a new object instead of a
    /// string, which enables one‐liner constructions such as:
    ///   lugcSequence3.take(35, 3).take(1, 2).take(1, 1).encode()
    /// Inside the template the return type has to be spelled typeof(this)
    /// (or plain gudtUGC); `gudtUGC(typeStringUTF)` is parsed as a function
    /// declaration without a return type, hence the compiler errors.
    /// Returns: an object holding the requested clusters; it is empty when the
    /// span does not lie completely inside the sequence.
    @safe public typeof(this) take(
        scope const size_t lintStart,
        scope const size_t lintCount = cast(size_t) 1
    ) {
        return typeof(this)(toUTFtake(lintStart, lintCount));
    }

}
//}
/// Exercises gudtUGC with all three built-in string types in a single run,
/// so no -version=useUTF## switch is needed to compile or run the test.
unittest {
    import std.meta : AliasSeq;

    foreach (stringUTF; AliasSeq!(string, wstring, dstring)) {

        stringUTF lstrSequence1 = r"12345678901234567890123456789012345678901234567890";
        stringUTF lstrSequence2 = r"1234567890АВГДЕЗИЙКЛABCDEFGHIJabcdefghijQRSTUVWXYZ";
        stringUTF lstrSequence3 = "äëåčñœß … russian = русский 🇷🇺 ≠ 🇯🇵 日本語 = japanese 😎";

        /// sizes in code units (UTF), code points (UGA) and grapheme clusters (UGC)
        immutable size_t lintSequence1sizeUTF = lstrSequence1.length;
        immutable size_t lintSequence2sizeUTF = lstrSequence2.length;
        immutable size_t lintSequence3sizeUTF = lstrSequence3.length;
        immutable size_t lintSequence1sizeUGA = lstrSequence1.walkLength;
        immutable size_t lintSequence2sizeUGA = lstrSequence2.walkLength;
        immutable size_t lintSequence3sizeUGA = lstrSequence3.walkLength;
        immutable size_t lintSequence1sizeUGC = lstrSequence1.byGrapheme.walkLength;
        immutable size_t lintSequence2sizeUGC = lstrSequence2.byGrapheme.walkLength;
        immutable size_t lintSequence3sizeUGC = lstrSequence3.byGrapheme.walkLength;

        assert(lintSequence1sizeUGC == 50);
        assert(lintSequence2sizeUGC == 50);
        assert(lintSequence3sizeUGC == 50);

        assert(lintSequence1sizeUGA == 50);
        assert(lintSequence2sizeUGA == 50);
        assert(lintSequence3sizeUGA == 52);

        /// only the code‐unit counts depend on the encoding
        static if (is(stringUTF == string)) {
            assert(lintSequence1sizeUTF == 50);
            assert(lintSequence2sizeUTF == 60);
            assert(lintSequence3sizeUTF == 91);
        } else static if (is(stringUTF == wstring)) {
            assert(lintSequence1sizeUTF == 50);
            assert(lintSequence2sizeUTF == 50);
            assert(lintSequence3sizeUTF == 57);
        } else {
            assert(lintSequence1sizeUTF == 50);
            assert(lintSequence2sizeUTF == 50);
            assert(lintSequence3sizeUTF == 52);
        }

        /// the following should be the same regardless of the encoding being
        /// used and is the whole point of this UDT being made:

        auto lugcSequence3 = gudtUGC!stringUTF(lstrSequence3);

        assert(lugcSequence3.encode() == lstrSequence3);

        assert(lugcSequence3.toUTFtake(21) == cast(stringUTF) r"р");
        assert(lugcSequence3.toUTFtake(27) == cast(stringUTF) r"й");
        assert(lugcSequence3.toUTFtake(35) == cast(stringUTF) r"日");
        assert(lugcSequence3.toUTFtake(37) == cast(stringUTF) r"語");

        assert(lugcSequence3.toUTFtake(21, 7) == cast(stringUTF) r"русский");
        assert(lugcSequence3.toUTFtake(35, 3) == cast(stringUTF) r"日本語");

        assert(lugcSequence3.toUTFtakeL(1) == cast(stringUTF) r"ä");
        assert(lugcSequence3.toUTFtakeR(1) == cast(stringUTF) r"😎");
        assert(lugcSequence3.toUTFtakeL(7) == cast(stringUTF) r"äëåčñœß");
        assert(lugcSequence3.toUTFtakeR(16) == cast(stringUTF) r"日本語 = japanese 😎");

        assert(lugcSequence3.toUTFchopL(10) == cast(stringUTF) r"russian = русский 🇷🇺 ≠ 🇯🇵 日本語 = japanese 😎");
        assert(lugcSequence3.toUTFchopR(21) == cast(stringUTF) r"äëåčñœß … russian = русский 🇷🇺");

        /// rebuilding the text one cluster at a time by position ...
        stringUTF lstrSequence3reencoded;
        foreach (lintSequenceUGC; 1 .. lintSequence3sizeUGC + 1) {
            lstrSequence3reencoded ~= lugcSequence3.toUTFtake(lintSequenceUGC);
        }
        assert(lstrSequence3reencoded == lstrSequence3);

        /// ... and by iterating the object as a range must both give back the
        /// original text; this assert trips when iteration skips a cluster
        /// (the symptom originally seen was the last grapheme going missing)
        lstrSequence3reencoded = null;
        foreach (stringUTF lstrSequence3UGC; lugcSequence3) {
            lstrSequence3reencoded ~= lstrSequence3UGC;
        }
        assert(lstrSequence3reencoded == lstrSequence3);

    }
}
```