Re: Accessing non-binary Unicode properties with std.uni

H. S. Teoh via Digitalmars-d-learn Tue, 29 Sep 2020 10:05:26 -0700

On Tue, Sep 29, 2020 at 04:22:18PM +0000, Dukc via Digitalmars-d-learn wrote:
> On Monday, 28 September 2020 at 18:23:43 UTC, Chloé Kekoa wrote:
> > The documentation of std.uni [1] says that the unicode struct
> > provides sets for several binary properties. I am looking for a way
> > to query non-binary properties of a character. Is that possible with
> > std.uni or do I need to use a third-party library?
> > 
> > I am specifically interested in the East_Asian_Width property [2]
> > (which has six allowed values). Trying to access
> > std.uni.unicode.East_Asian_Width results in the error message:
> > 
> > > No unicode set by name East_Asian_Width was found.
> > 
> > [1]: https://dlang.org/library/std/uni.html
> > [2]: https://www.unicode.org/reports/tr11/tr11-38.html
> 
> It seems the East Asian width is Unicode standard 13.0, while Phobos
> implements 6.2. So it seems like a case for a third-party library :(.
[...]


OTOH, the relevant Unicode data file that contains East_Asian_Width data
(EastAsianWidth.txt) is relatively straightforward to parse.  In one of
my projects, I wrote a little helper program to parse this file and
generate a function that tells me if a given dchar is wide or narrow.

Here's the generated function (just copy-n-paste this into your code, no
need for yet another external library dependency):

        /**
         * Tells whether a code point is classified as wide.
         *
         * Params:
         *     ch = the code point to classify
         *
         * Returns: true when ch lies inside one of the half-open
         * [first, pastLast) spans of the table below, false otherwise.
         */
        bool isWide(dchar ch) @safe pure nothrow @nogc
        {
            // Half-open [first, pastLast) spans, sorted in ascending order.
            static immutable uint[2][] wideSpans = [
                [  4352,   4448],
                [  9001,   9003],
                [ 11904,  12351],
                [ 12353,  12872],
                [ 12880,  19904],
                [ 19968,  42183],
                [ 43360,  43389],
                [ 44032,  55204],
                [ 63744,  64256],
                [ 65040,  65050],
                [ 65072,  65132],
                [ 65281,  65377],
                [ 65504,  65511],
                [110592, 110594],
                [127488, 127570],
                [131072, 262142],
            ];

            foreach (span; wideSpans)
            {
                // The table is sorted, so a value below the current lower
                // bound cannot fall into any later span either.
                if (ch < span[0])
                    return false;
                if (ch < span[1])
                    return true;
            }

            // Beyond the last span.
            return false;
        }

Here's the utility that generated this code:

        /**
         * Simple program to parse EastAsianWidth.txt to extract some useful info.
         */
        
        import std.algorithm;
        import std.conv;
        import std.range;
        import std.regex;
        import std.stdio;
        
        /**
         * A half-open interval [start, end) of code points: `start` is the
         * first member, `end` is one past the last member.
         */
        struct CodeRange
        {
            dchar start, end;

            /**
             * Tests whether this range shares a code point with cr or is
             * directly adjacent to it (so that merging the two would yield
             * one gap-free range).
             *
             * The test is symmetric and also detects the case where one range
             * completely contains the other.
             *
             * Params:
             *     cr = the range to compare against
             *
             * Returns: true if the two ranges overlap or touch.
             */
            bool overlaps(CodeRange cr) const
            {
                return start <= cr.end && cr.start <= end;
            }

            unittest
            {
                // Adjacent ranges count as overlapping, in either order.
                assert(CodeRange(1,11).overlaps(CodeRange(11,12)));
                assert(CodeRange(11,12).overlaps(CodeRange(1,11)));

                // A gap between the ranges means no overlap.
                assert(!CodeRange(1,10).overlaps(CodeRange(11,12)));
                assert(!CodeRange(11,12).overlaps(CodeRange(1,10)));

                // Containment is an overlap, in either order.
                assert(CodeRange(1,20).overlaps(CodeRange(5,10)));
                assert(CodeRange(5,10).overlaps(CodeRange(1,20)));
            }

            /**
             * Grows this range in place to the smallest range covering both
             * itself and cr. Anything lying between the two ranges is
             * absorbed as well.
             *
             * Params:
             *     cr = the range to absorb
             */
            void merge(CodeRange cr)
            {
                start = min(start, cr.start);
                end = max(end, cr.end);
            }

            unittest
            {
                auto cr = CodeRange(10,20);
                cr.merge(CodeRange(20,30));
                assert(cr == CodeRange(10,30));
            }

            /**
             * Writes the range as "XXXX" for a single code point, or as
             * "XXXX..YYYY" (both bounds inclusive, upper-case hex, at least
             * four digits) for a longer range.
             *
             * Params:
             *     sink = receives the formatted text
             */
            void toString(scope void delegate(const(char)[]) sink) const
            {
                import std.format : formattedWrite;
                sink.formattedWrite("%04X", start);
                if (end > start+1)
                    sink.formattedWrite("..%04X", end-1); // end is exclusive
            }
        }
        
        /**
         * One record of the data file: a range of code points together with
         * the width label assigned to it.
         */
        struct Entry
        {
            CodeRange range;
            string width;

            /// Writes the entry to sink as "<range>;<width>".
            void toString(scope void delegate(const(char)[]) sink)
            {
                import std.format : formattedWrite;
                formattedWrite(sink, "%s;%s", range, width);
            }
        }
        
        /**
         * Lazily parses the lines of an EastAsianWidth.txt-style data file.
         *
         * Recognized records are "XXXX;W" (single code point) and
         * "XXXX..YYYY;W" (inclusive range), where W is one of
         * N, A, H, W, F or Na. Optional whitespace around the ';' is
         * tolerated, so files that pad their fields with spaces also parse.
         * Lines starting with "#" and blank lines are skipped.
         *
         * The width of each record is reduced to "N" or "W" through an
         * equivalence table (Na, N, H, A -> N; W, F -> W).
         *
         * Params:
         *     input = an input range of text lines
         *
         * Returns: An input range of Entry objects.
         *
         * Throws: Exception when a line is neither a record, a comment nor
         * blank. Parsing is lazy, so this may happen either in the call
         * itself (while looking for the first record) or in a later popFront.
         */
        auto parse(R)(R input)
            if (isInputRange!R && is(ElementType!R : const(char)[]))
        {
            // For our purposes, we don't need to distinguish between
            // explicit/implicit narrowness, and ambiguous cases can just
            // default to narrow. So we map the original width to its
            // equivalent using the following equivalence table.
            string[string] equivs = [
                "Na" : "N",
                "N"  : "N",
                "H"  : "N",
                "A"  : "N",
                "W"  : "W",
                "F"  : "W"
            ];

            auto reEmpty = regex(`^\s*$`);
            auto reSingle = regex(`^([0-9A-F]+)\s*;\s*(N|A|H|W|F|Na)\b`);
            auto reRange =
                regex(`^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(N|A|H|W|F|Na)\b`);

            struct Result
            {
                R     range;
                Entry front;
                bool  empty;

                this(R _range)
                {
                    range = _range;
                    next(); // get things started
                }

                // Advances the underlying range to the next record line and
                // decodes it into `front`. The record line itself is left
                // unconsumed; popFront() discards it before searching again.
                void next()
                {
                    while (!range.empty)
                    {
                        auto line = range.front;

                        if (auto m = line.match(reSingle))
                        {
                            auto width = equivs[m.captures[2]];
                            dchar ch = cast(dchar) m.captures[1].to!int(16);
                            // CodeRange is half-open, hence ch+1.
                            front = Entry(CodeRange(ch, ch+1), width);
                            empty = false;
                            return;
                        }
                        else if (auto m = line.match(reRange))
                        {
                            auto width = equivs[m.captures[3]];
                            dchar start = cast(dchar) m.captures[1].to!int(16);
                            // The file gives an inclusive upper bound;
                            // CodeRange wants an exclusive one.
                            dchar end = cast(dchar) m.captures[2].to!int(16) + 1;
                            front = Entry(CodeRange(start, end), width);
                            empty = false;
                            return;
                        }
                        else if (!line.startsWith("#") && !line.match(reEmpty))
                        {
                            import std.string : format;
                            throw new Exception("Couldn't parse line:\n%s"
                                                .format(line));
                        }

                        // Comment or blank line: skip it.
                        range.popFront();
                    }
                    empty = true;
                }

                void popFront()
                {
                    range.popFront();
                    next();
                }
            }
            static assert(isInputRange!Result);

            return Result(input);
        }
        
        /**
         * Prints the parsed records grouped by width label.
         *
         * For every width label a "# <label>" heading is written to stdout,
         * followed by one "<range>;<label>" line per collected range and a
         * blank line. A record whose width equals that of the record
         * immediately before it is merged into the previously collected
         * range instead of starting a new one.
         *
         * Params:
         *     input = an input range of text lines, handed to parse()
         */
        void outputByWidthType(R)(R input)
            if (isInputRange!R && is(ElementType!R : const(char)[]))
        {
            CodeRange[][string] byWidth;
            string previous;

            foreach (item; parse(input))
            {
                auto bucket = item.width in byWidth;
                immutable continuesRun = bucket !is null
                                      && (*bucket).length > 0
                                      && item.width == previous;

                if (continuesRun)
                    (*bucket)[$-1].merge(item.range);
                else
                    byWidth[item.width] ~= item.range;

                previous = item.width;
            }

            foreach (label, spans; byWidth)
            {
                writeln("# ", label);
                foreach (span; spans)
                    writefln("%s;%s", span, label);
                writeln();
            }
        }
        
        /**
         * Lazily collapses runs of consecutive entries that carry the same
         * width into a single entry per run.
         *
         * Entries are merged purely on equal width; adjacency of their
         * ranges is not checked, so any gap between two merged ranges is
         * absorbed into the resulting range.
         *
         * Params:
         *     input = an input range of Entry objects
         *
         * Returns: An input range of Entry objects.
         */
        auto mergeConsecutive(R)(R input)
            if (isInputRange!R && is(ElementType!R : Entry))
        {
            struct Result
            {
                R     range;
                bool  empty;
                Entry front;
                Entry current;  // run being accumulated
                bool  pending;  // true while `current` holds an unyielded run

                this(R _range)
                {
                    range = _range;
                    next();
                }

                // Consumes entries until a run is complete and publishes it
                // as `front`; sets `empty` once nothing is left to publish.
                void next()
                {
                    while (!range.empty)
                    {
                        auto e = range.front;
                        range.popFront();

                        if (!pending)
                        {
                            // First entry of the very first run.
                            current = e;
                            pending = true;
                        }
                        else if (current.width == e.width)
                        {
                            //writefln("Merging: %s with %s", current, e);
                            current.range.merge(e.range);
                        }
                        else
                        {
                            // Width changed: publish the finished run and
                            // start accumulating the next one. `pending`
                            // stays true so the new run is not lost even if
                            // the source is exhausted right now.
                            front = current;
                            current = e;
                            empty = false;

                            //writefln("Yielding: %s", front);
                            return;
                        }
                    }

                    if (pending)
                    {
                        // Source exhausted: flush the final run.
                        front = current;
                        pending = false;
                        empty = false;
                    }
                    else
                        empty = true;
                }

                void popFront()
                {
                    next();
                }
            }
            static assert(isInputRange!Result);

            return Result(input);
        }
        
        /**
         * Prints the parsed records in input order, one per line, after
         * collapsing consecutive records of equal width.
         *
         * Params:
         *     input = an input range of text lines, handed to parse()
         */
        void outputByCodePoint(R)(R input)
            if (isInputRange!R && is(ElementType!R : const(char)[]))
        {
            auto runs = mergeConsecutive(parse(input));
            writefln("%(%s\n%)", runs);
        }
        
        /**
         * Counts how many code points fall into the "W" and "N" runs and
         * prints both totals to stdout.
         *
         * Any width other than "W" or "N" trips assert(0).
         *
         * Params:
         *     input = an input range of text lines, handed to parse()
         */
        void tally(R)(R input)
            if (isInputRange!R && is(ElementType!R : const(char)[]))
        {
            int wideCount, narrowCount;

            foreach (run; mergeConsecutive(parse(input)))
            {
                // Ranges are half-open, so end - start is the member count.
                immutable size = run.range.end - run.range.start;

                switch (run.width)
                {
                    case "W":
                        wideCount += size;
                        break;

                    case "N":
                        narrowCount += size;
                        break;

                    default:
                        assert(0);
                }
            }

            writefln("Tally: W=%d N=%d\n", wideCount, narrowCount);
        }
        
        /**
         * Collects every "W" run into a CodepointSet and prints the source
         * code that the set generates for a function named "isWide".
         *
         * Params:
         *     input = an input range of text lines, handed to parse()
         */
        void genRecogCode(R)(R input)
            if (isInputRange!R && is(ElementType!R : const(char)[]))
        {
            import std.uni;

            CodepointSet wide;

            foreach (run; mergeConsecutive(parse(input)))
            {
                if (run.width != "W")
                    continue; // only wide runs go into the set

                wide.add(run.range.start, run.range.end);
            }

            auto source = wide.toSourceCode("isWide");
            writeln(source);
        }
        
        /**
         * Command-line entry point.
         *
         * Usage: PROGRAM (bywidth|bypoint|tally|gencode) [datafile]
         *
         * The optional second argument names the data file to read; when it
         * is omitted, "ext/EastAsianWidth.txt" is used. The file is opened
         * only after the command has been recognized.
         *
         * Returns: 0 on success, 1 on a missing or unknown command.
         */
        int main(string[] args)
        {
            enum defaultPath = "ext/EastAsianWidth.txt";

            if (args.length < 2)
            {
                // args[0] is normally the program name; fall back to a fixed
                // label rather than indexing an empty array.
                auto prog = args.length > 0 ? args[0] : "eastasianwidth";
                stderr.writefln(
                    "Usage: %s (bywidth|bypoint|tally|gencode) [datafile]",
                    prog);
                return 1;
            }

            auto cmd = args[1];
            auto path = args.length > 2 ? args[2] : defaultPath;

            // Opened on demand so that an unknown command is reported even
            // when the data file is missing.
            auto lines() { return File(path, "r").byLine(); }

            switch (cmd)
            {
                case "bywidth":
                    outputByWidthType(lines());
                    break;

                case "bypoint":
                    outputByCodePoint(lines());
                    break;

                case "tally":
                    tally(lines());
                    break;

                case "gencode":
                    genRecogCode(lines());
                    break;

                default:
                    stderr.writefln("Unknown command: %s", cmd);
                    return 1;
            }
            return 0;
        }


T

-- 
People tell me that I'm skeptical, but I don't believe them.

Re: Accessing non-binary Unicode properties with std.uni

Reply via email to