On Sunday, 11 July 2021 at 05:54:48 UTC, Ali Çehreli wrote:

Ali

Primarily to Ali & Steve for their help; be advised, this post will be somewhat ... long.

Some background to begin with: a week or so ago I posted asking for advice on code safety, and I still haven't replied to those who kindly answered. Seeing some replies, and encountering a code issue regarding string manipulation, I pretty soon figured out that I still did not have solid knowledge of many basic things regarding D, so I put the brakes on, went back to square one, and started reading and researching some things a bit more ... slowly.

One of the things that struck me this week is that UniCode string manipulation is in many cases more complex than I previously thought, because there is no precise concept of what a character is in UniCode, at least not the way we are used to with plain-old-ASCII. After reading a lot about it (this was good: https://manishearth.github.io/blog/2017/01/14/stop-ascribing-meaning-to-unicode-code-points/) I learned of code-units, code-points, abstract-graphemes, grapheme-clusters, and the like.

And I learned the inner details of the UTF encodings and that UTF-32 is best (almost required) for string processing (easier, faster, etc) and of course UTF-8 for definitive storage, and UTF-16 to the trashcan unless you need to interface with Windows (I was previously using UTF-8 within all my code for processing).

So, in order to manipulate a string, say, left(n), right(n), substr(n,m), ie: the usual stuff for many languages/libraries, I need to operate on grapheme-clusters and not on code-points and never ever on code-units, at least for unexpected text, ie: incoming text, user-input, etc, the things that we cannot control beforehand.

Both primary D books, Andrei's and Ali's, as well as the D documentation, have plenty of examples but they are mainly focused on simple things like strings having nothing-out-of-the-ordinary. They perform string manipulation mainly by slicing the source string (ie: the char array) with the functions of std.range like take, takeOne, etc.

I needed to settle these things once-and-for-all for my code and thus I decided to build a grapheme-aware UDT that, once instantiated with any given string, will provide the usual string manipulation functions so I can forget the minutiae about them. The unittest at the bottom has many usage examples.

The whole UDT needed to be templated for the three string types (string, dstring, wstring - and nothing else) and this was what produced this post to begin with. This issue was solved, not the way I would have liked, but solved. The code works, except for something that smells like a Phobos bug (#20483) when using foreach with grapheme arrays (foreach always misses the last one).

I ended up with the following (as usual advice/suggestions welcomed):

```d
/// testing D on 2021-06~07

import std.algorithm : map, joiner;
import std.array : array;
import std.conv : to;
import std.range : walkLength, take, tail, drop, dropBack;
import std.stdio;
import std.uni : Grapheme, byGrapheme;

// Type aliases shared by the UDT and its unittest.
// NOTE(review): the UTF16/UTF32 names look swapped with respect to the types they alias:
// dstring is immutable(dchar)[] (UTF-32 code units) while wstring is immutable(wchar)[]
// (UTF-16 code units). The unittest is written against the aliases exactly as they stand,
// so check its literals and expected code-unit counts before renaming anything here.
alias stringUGC = Grapheme; /// one grapheme cluster (std.uni.Grapheme)
alias stringUGC08 = gudtUGC!(stringUTF08); /// grapheme-aware UDT over stringUTF08
alias stringUGC16 = gudtUGC!(stringUTF16); /// grapheme-aware UDT over stringUTF16
alias stringUGC32 = gudtUGC!(stringUTF32); /// grapheme-aware UDT over stringUTF32
alias stringUTF08 = string;  /// same as immutable(char )[];
alias stringUTF16 = dstring; /// same as immutable(dchar)[];
alias stringUTF32 = wstring; /// same as immutable(wchar)[];

/// Empty entry point: nothing is executed here, the code in this file is exercised by its unittest block.
void main() {}

//mixin templateUGC!(stringUTF08, r"gudtUGC08"w); /// if these were possible there would be no need for the stringUGC## aliases in main()
//mixin templateUGC!(stringUTF16, r"gudtUGC16"w);
//mixin templateUGC!(stringUTF32, r"gudtUGC32"w);

//template templateUGC (
//   typeStringUTF,
//   alias lstrStructureID
//   ) {

/// Wraps a UTF-encoded string as an array of grapheme clusters so that every position and
/// count accepted by the members below is expressed in graphemes, never in code units or
/// code points. Positions are 1-based. typeStringUTF is intended to be string, wstring or
/// dstring. The struct is also an input range (empty/front/popFront) yielding one grapheme
/// at a time, re-encoded as typeStringUTF.
public struct gudtUGC(typeStringUTF) { /// UniCode grapheme cluster‐aware string manipulation

   /// decoded grapheme clusters (Grapheme is what the module-level alias stringUGC names)
   private Grapheme[] pugcSequence;
   /// number of grapheme clusters held in pugcSequence
   private size_t pintSequenceCount = 0;
   /// 1-based position of the range cursor; 0 means that nothing has been decoded
   private size_t pintSequenceCurrent = 0;

   /// Advances the range cursor to the next grapheme.
   void popFront() { ++pintSequenceCurrent; }

   /// True once the cursor has moved past the last grapheme (or when nothing was decoded).
   /// The cursor is 1-based, so the last valid position is pintSequenceCount itself: testing
   /// for equality here would end the iteration one grapheme too early and would never end
   /// it for an empty, non-null string (count 0, cursor 1).
   bool empty() {

      return pintSequenceCurrent == 0 || pintSequenceCurrent > pintSequenceCount;

   }

   /// The grapheme under the cursor, re-encoded as typeStringUTF.
   typeStringUTF front() { return toUTFtake(pintSequenceCurrent); }

   /// Number of grapheme clusters currently held.
   @property public size_t count() { return pintSequenceCount; }

   /// Builds the UDT from lstrSequence; see decode().
   this(scope const typeStringUTF lstrSequence) {

      decode(lstrSequence);

   }

   /// Replaces the held content with the grapheme clusters of lstrSequence and rewinds the
   /// range cursor. A null string clears the object.
   /// Returns: the number of grapheme clusters decoded (0 for null).
   @safe public size_t decode(
      scope const typeStringUTF lstrSequence
      ) {

      if (lstrSequence is null) {

         pugcSequence = null;
         pintSequenceCount = 0;
         pintSequenceCurrent = 0;

      } else {

         pugcSequence = lstrSequence.byGrapheme.array;
         pintSequenceCount = pugcSequence.length;
         pintSequenceCurrent = 1;

      }

      return pintSequenceCount;

   }

   /// Re-encodes a run of grapheme clusters as typeStringUTF; shared by all toUTF* members.
   /// The lambda takes each Grapheme by ref so that g[] slices the stored element, not a copy.
   @safe private typeStringUTF toUTFslice(Grapheme[] lugcSlice) {

      return lugcSlice
         .map!((ref g) => g[])
         .joiner
         .to!(typeStringUTF)
         ;

   }

   /// Returns: the whole content re-encoded as typeStringUTF, or null when there are no graphemes.
   @safe public typeStringUTF encode() { /// UniCode grapheme cluster to UniCode UTF‐encoded string

      if (pintSequenceCount < 1) { return null; }

      return toUTFslice(pugcSequence);

   }

   /// Extracts lintCount graphemes starting at the 1-based position lintStart.
   /// eg#1: toUTFtake(1,3) → 0..3 and toUTFtake(6,3) → 5..8
   /// eg#2: toUTFtake(01,1) → 00..01 and toUTFtake(50,1) → 49..50
   /// Returns: the extracted text, or null when lintStart is 0 or the requested run does not
   /// fit inside the held content.
   @safe public typeStringUTF toUTFtake( /// UniCode grapheme cluster to UniCode UTF‐encoded string
      scope const size_t lintStart,
      scope const size_t lintCount = cast(size_t) 1
      ) {

      /// position 0 does not exist; it has to be rejected before computing lintStart - 1
      /// because size_t is unsigned and the subtraction would wrap around
      if (lintStart < 1) { return null; }

      /// overflow-free form of: (lintStart - 1) + lintCount <= pintSequenceCount
      if (lintCount > pintSequenceCount) { return null; }
      if (lintStart - 1 > pintSequenceCount - lintCount) { return null; }

      const size_t lintRange1 = lintStart - 1;
      const size_t lintRange2 = lintRange1 + lintCount;

      return toUTFslice(pugcSequence[lintRange1 .. lintRange2]);

   }

   /// Returns: the first lintCount graphemes, or null when lintCount exceeds count().
   @safe public typeStringUTF toUTFtakeL( /// UniCode grapheme cluster to UniCode UTF‐encoded string
      scope const size_t lintCount
      ) {

      if (lintCount > pintSequenceCount) { return null; }

      return toUTFslice(pugcSequence[0 .. lintCount]);

   }

   /// Returns: the last lintCount graphemes, or null when lintCount exceeds count().
   @safe public typeStringUTF toUTFtakeR( /// UniCode grapheme cluster to UniCode UTF‐encoded string
      scope const size_t lintCount
      ) {

      if (lintCount > pintSequenceCount) { return null; }

      return toUTFslice(pugcSequence[$ - lintCount .. $]);

   }

   /// Returns: the content without its first lintCount graphemes, or null when lintCount exceeds count().
   @safe public typeStringUTF toUTFchopL( /// UniCode grapheme cluster to UniCode UTF‐encoded string
      scope const size_t lintCount
      ) {

      if (lintCount > pintSequenceCount) { return null; }

      return toUTFslice(pugcSequence[lintCount .. $]);

   }

   /// Returns: the content without its last lintCount graphemes, or null when lintCount exceeds count().
   @safe public typeStringUTF toUTFchopR( /// UniCode grapheme cluster to UniCode UTF‐encoded string
      scope const size_t lintCount
      ) {

      if (lintCount > pintSequenceCount) { return null; }

      return toUTFslice(pugcSequence[0 .. $ - lintCount]);

   }

   /// Left padding: not implemented yet, every call returns null.
   @safe public typeStringUTF toUTFpadL( /// UniCode grapheme cluster to UniCode UTF‐encoded string
      scope const size_t lintCount,
      scope const typeStringUTF lstrPadding = cast(typeStringUTF) r" "
      ) {

      typeStringUTF lstrSequence = null;

      if (lintCount > pintSequenceCount) {

         lstrSequence = null; /// pending

      }

      return lstrSequence;

   }

   /// Right padding: not implemented yet, every call returns null.
   @safe public typeStringUTF toUTFpadR( /// UniCode grapheme cluster to UniCode UTF‐encoded string
      scope const size_t lintCount,
      scope const typeStringUTF lstrPadding = cast(typeStringUTF) r" "
      ) {

      typeStringUTF lstrSequence = null;

      if (lintCount > pintSequenceCount) {

         lstrSequence = null; /// pending

      }

      return lstrSequence;

   }

   // Disabled sketch of a take() returning a new object instead of a string, meant to enable
   // one-liners such as:
   //
   //    assert(lugcSequence3.take(35, 3).take(1,2).take(1,1).encode() == cast(stringUTF) r"日");
   //
   // NOTE(review): the original attempt was rejected by the compiler ("function declaration
   // without return type") because `gudtUGC(typeStringUTF)` is not a type: an instantiation
   // is spelled `gudtUGC!(typeStringUTF)`, and inside the template body the bare name
   // `gudtUGC` (or `typeof(this)`) already refers to the current instantiation. Untested:
   //
   //@safe public gudtUGC take(
   //   scope const size_t lintStart,
   //   scope const size_t lintCount = cast(size_t) 1
   //   ) {
   //
   //   return gudtUGC(toUTFtake(lintStart, lintCount));
   //
   //}

}

//}

unittest {

   /// select the encoding under test: exactly one of the versions useUTF08, useUTF16 or
   /// useUTF32 has to be set when compiling; stringUTF names the string type and typeUGC
   /// the matching UDT instantiation

   version (useUTF08) { alias stringUTF = stringUTF08; alias typeUGC = stringUGC08; }
   version (useUTF16) { alias stringUTF = stringUTF16; alias typeUGC = stringUGC16; }
   version (useUTF32) { alias stringUTF = stringUTF32; alias typeUGC = stringUGC32; }

   /// three samples of 50 graphemes each; the un-suffixed literals are converted by the
   /// compiler to whichever string type stringUTF stands for

   stringUTF lstrSequence1 = r"12345678901234567890123456789012345678901234567890";
   stringUTF lstrSequence2 = r"1234567890АВГДЕЗИЙКЛABCDEFGHIJabcdefghijQRSTUVWXYZ";
   stringUTF lstrSequence3 = "äëåčñœß … russian = русский 🇷🇺 ≠ 🇯🇵 日本語 = japanese 😎";

   /// sizeUTF: code units (.length)
   /// sizeUGA: what walkLength reports on the string itself
   /// sizeUGC: grapheme clusters (byGrapheme)

   const size_t lintSequence1sizeUTF = lstrSequence1.length;
   const size_t lintSequence2sizeUTF = lstrSequence2.length;
   const size_t lintSequence3sizeUTF = lstrSequence3.length;

   const size_t lintSequence1sizeUGA = lstrSequence1.walkLength;
   const size_t lintSequence2sizeUGA = lstrSequence2.walkLength;
   const size_t lintSequence3sizeUGA = lstrSequence3.walkLength;

   const size_t lintSequence1sizeUGC = lstrSequence1.byGrapheme.walkLength;
   const size_t lintSequence2sizeUGC = lstrSequence2.byGrapheme.walkLength;
   const size_t lintSequence3sizeUGC = lstrSequence3.byGrapheme.walkLength;

   assert(lintSequence1sizeUGC == 50);
   assert(lintSequence2sizeUGC == 50);
   assert(lintSequence3sizeUGC == 50);

   assert(lintSequence1sizeUGA == 50);
   assert(lintSequence2sizeUGA == 50);
   assert(lintSequence3sizeUGA == 52);

   /// the code unit counts depend on the string type actually in use

   static if (is(stringUTF == string)) {

      assert(lintSequence1sizeUTF == 50);
      assert(lintSequence2sizeUTF == 60);
      assert(lintSequence3sizeUTF == 91);

   } else static if (is(stringUTF == wstring)) {

      assert(lintSequence1sizeUTF == 50);
      assert(lintSequence2sizeUTF == 50);
      assert(lintSequence3sizeUTF == 57);

   } else static if (is(stringUTF == dstring)) {

      assert(lintSequence1sizeUTF == 50);
      assert(lintSequence2sizeUTF == 50);
      assert(lintSequence3sizeUTF == 52);

   }

   /// everything from here on is expected to hold regardless of the encoding in use,
   /// which is the whole point of the UDT

   typeUGC lugcSequence3 = typeUGC(lstrSequence3);

   assert(lugcSequence3.encode() == lstrSequence3);

   /// single graphemes and runs addressed by 1-based grapheme position

   assert(lugcSequence3.toUTFtake(21) == cast(stringUTF) r"р");
   assert(lugcSequence3.toUTFtake(27) == cast(stringUTF) r"й");
   assert(lugcSequence3.toUTFtake(35) == cast(stringUTF) r"日");
   assert(lugcSequence3.toUTFtake(37) == cast(stringUTF) r"語");
   assert(lugcSequence3.toUTFtake(21, 7) == cast(stringUTF) r"русский");
   assert(lugcSequence3.toUTFtake(35, 3) == cast(stringUTF) r"日本語");

   /// leftmost / rightmost graphemes

   assert(lugcSequence3.toUTFtakeL(1) == cast(stringUTF) r"ä");
   assert(lugcSequence3.toUTFtakeR(1) == cast(stringUTF) r"😎");
   assert(lugcSequence3.toUTFtakeL(7) == cast(stringUTF) r"äëåčñœß");
   assert(lugcSequence3.toUTFtakeR(16) == cast(stringUTF) r"日本語 = japanese 😎");

   /// dropping graphemes from either end

   assert(lugcSequence3.toUTFchopL(10) == cast(stringUTF) r"russian = русский 🇷🇺 ≠ 🇯🇵 日本語 = japanese 😎");
   assert(lugcSequence3.toUTFchopR(21) == cast(stringUTF) r"äëåčñœß … russian = русский 🇷🇺");

   /// rebuilding the text one grapheme at a time by position must give back the source

   stringUTF lstrSequence3reencoded;

   foreach (lintSequenceUGC; 1 .. lintSequence3sizeUGC + 1) {

      lstrSequence3reencoded ~= lugcSequence3.toUTFtake(lintSequenceUGC);

   }

   assert(lstrSequence3reencoded == lstrSequence3);

   /// the same, this time through the range interface: every grapheme, the last one
   /// included, has to come out of the foreach

   lstrSequence3reencoded = null;

   foreach (stringUTF lstrSequence3UGC; lugcSequence3) {

      lstrSequence3reencoded ~= lstrSequence3UGC;

   }

   assert(lstrSequence3reencoded == lstrSequence3);

}
```

Reply via email to