On Tue, 22 Jun 2010 15:13:06 +0200, Ben Hanson <ben.han...@tfbplc.co.uk>
wrote:
> Here's the latest with naming convention (hopefully) followed. I've
> implemented my
> own squeeze() function and used sizeof in the memmove calls.
>
> How can I specify wide strings for the literals?
>
> Thanks,
>
> Ben
>
> module main;
>
> import std.algorithm;
> import std.array;
> import std.c.string;
> import std.string;
>
> import std.stdio;
>
> template regex(CharT)
> {
> struct BasicStringToken
> {
> bool negated = false;
> CharT[] charset;
> enum size_t MAX_CHARS = CharT.max + 1;
> enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
>
> this(const bool negated_, ref CharT[] charset_)
> {
> negated = negated_;
> charset = charset_;
> }
>
> void removeDuplicates()
> {
> charset.sort;
> squeeze(charset);
> }
>
> void normalise()
> {
> if (charset.length == MAX_CHARS)
> {
> negated = !negated;
> charset.clear();
> }
> else if (charset.length > MAX_CHARS / 2)
> {
> negate();
> }
> }
>
> void negate()
> {
> CharT curr_char = START_CHAR;
> CharT[] temp;
> CharT *ptr = cast(CharT *) 0;
> CharT *curr = charset.ptr;
> CharT *end = curr + charset.length;
> size_t i = 0;
>
> negated = !negated;
> temp.length = MAX_CHARS - charset.length;
> ptr = temp.ptr;
>
> while (curr < end)
> {
> while (*curr > curr_char)
> {
> *ptr = curr_char;
> ++ptr;
> ++curr_char;
> ++i;
> }
>
> ++curr_char;
> ++curr;
> ++i;
> }
>
> for (; i < MAX_CHARS; ++i)
> {
> *ptr = curr_char;
> ++ptr;
> ++curr_char;
> }
>
> charset = temp;
> }
>
> bool empty()
> {
> return charset.length == 0 && !negated;
> }
>
> bool any()
> {
> return charset.length == 0 && negated;
> }
>
> void clear()
> {
> negated = false;
> charset.length = 0;
> }
>
> void intersect(ref BasicStringToken rhs,
> ref BasicStringToken overlap)
> {
> if ((any() && rhs.any()) || (negated == rhs.negated &&
> !any() && !rhs.any()))
> {
> intersectSameTypes(rhs, overlap);
> }
> else
> {
> intersectDiffTypes(rhs, overlap);
> }
> }
>
> private:
> void intersectSameTypes(ref BasicStringToken rhs,
> ref BasicStringToken overlap)
> {
> if (any())
> {
> clear();
> overlap.negated = true;
> rhs.clear();
> }
> else
> {
> CharT *iter = charset.ptr;
> CharT *end = iter + charset.length;
> CharT *rhs_iter = rhs.charset.ptr;
> CharT *rhs_end = rhs_iter + rhs.charset.length;
>
> overlap.negated = negated;
>
> while (iter != end && rhs_iter != rhs_end)
> {
> if (*iter < *rhs_iter)
> {
> ++iter;
> }
> else if (*iter > *rhs_iter)
> {
> ++rhs_iter;
> }
> else
> {
> overlap.charset ~= *iter;
> memmove(iter, iter + 1, (charset.ptr +
> charset.length - iter) * CharT.sizeof);
> --end;
> charset.length -= 1;
> memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr +
> rhs.charset.length - rhs_iter) *
CharT.sizeof);
> --rhs_end;
> rhs.charset.length -= 1;
> }
> }
>
> if (negated)
> {
> // duplicates already merged
> // src, dest
> merge(charset, overlap.charset);
> // duplicates already merged
> // src, dest
> merge(rhs.charset, overlap.charset);
> negated = false;
> rhs.negated = false;
> swap(charset, rhs.charset);
> normalise();
> overlap.normalise();
> rhs.normalise();
> }
> else if (!overlap.charset.length == 0)
> {
> normalise();
> overlap.normalise();
> rhs.normalise();
> }
> }
> }
>
> void intersectDiffTypes(ref BasicStringToken rhs,
> ref BasicStringToken overlap)
> {
> if (any())
> {
> intersectAny(rhs, overlap);
> }
> else if (negated)
> {
> intersectNegated(rhs, overlap);
> }
> else // negated == false
> {
> intersectCharset(rhs, overlap);
> }
> }
>
> void intersectAny(ref BasicStringToken rhs, ref BasicStringToken
> overlap)
> {
> if (rhs.negated)
> {
> rhs.intersectNegated(this, overlap);
> }
> else // rhs.negated == false
> {
> rhs.intersectCharset(this, overlap);
> }
> }
>
> void intersectNegated(ref BasicStringToken rhs,
> ref BasicStringToken overlap)
> {
> if (rhs.any())
> {
> overlap.negated = true;
> overlap.charset = charset;
> rhs.negated = false;
> rhs.charset = charset;
> clear();
> }
> else // rhs.negated == false
> {
> rhs.intersectCharset(this, overlap);
> }
> }
>
> void intersectCharset(ref BasicStringToken rhs,
> ref BasicStringToken overlap)
> {
> if (rhs.any())
> {
> overlap.charset = charset;
> rhs.negated = true;
> rhs.charset = charset;
> clear();
> }
> else // rhs.negated == true
> {
> CharT *iter = charset.ptr;
> CharT *end = iter + charset.length;
> CharT *rhs_iter = rhs.charset.ptr;
> CharT *rhs_end = rhs_iter + rhs.charset.length;
>
> while (iter != end && rhs_iter != rhs_end)
> {
> if (*iter < *rhs_iter)
> {
> overlap.charset ~= *iter;
> rhs.charset.length += 1;
> rhs_iter = rhs.charset.ptr;
> rhs_end = rhs_iter + rhs.charset.length;
> memmove(rhs_iter + 1, rhs_iter,
(rhs.charset.length -
> (rhs_end - rhs_iter - 1)) * CharT.sizeof);
> ++rhs_iter;
> memmove(iter, iter + 1, (charset.ptr +
> charset.length - iter) * CharT.sizeof);
> charset.length -= 1;
> --end;
> }
> else if (*iter > *rhs_iter)
> {
> ++rhs_iter;
> }
> else
> {
> ++iter;
> ++rhs_iter;
> }
> }
>
> if (iter != end)
> {
> CharT[] temp;
>
> temp.length = end - iter;
> memmove(temp.ptr, iter, temp.length * CharT.sizeof);
>
> // nothing bigger in rhs than iter
> // src, dest
> merge(temp, overlap.charset);
> memmove(iter, iter + 1, (charset.ptr +
> charset.length - iter) * CharT.sizeof);
> charset.length -= 1;
> }
>
> if (!overlap.charset.empty())
> {
> merge(overlap.charset, rhs.charset);
> // possible duplicates, so check for any and erase.
> squeeze(rhs.charset);
> normalise();
> overlap.normalise();
> rhs.normalise();
> }
> }
> }
>
> void squeeze(ref CharT[] str)
> {
> if (str.length > 1)
> {
> CharT *write = str.ptr;
> CharT *end = write + str.length;
> CharT *read = write + 1;
>
> while (read != end)
> {
> while (read != end && *read == *write)
> {
> ++read;
> }
>
> if (read == end) break;
>
> ++write;
>
> if (read > write)
> {
> *write = *read;
> }
>
> ++read;
> }
>
> str.length = write + 1 - str.ptr;
> }
> }
>
> void merge(ref CharT[] src, ref CharT[] dest)
> {
> CharT[] temp;
> CharT *ptr;
> CharT *iter = src.ptr;
> CharT *end = iter + src.length;
> CharT *dest_iter = dest.ptr;
> CharT *dest_end = dest_iter + dest.length;
>
> temp.length = src.length + dest.length;
> ptr = temp.ptr;
>
> while (iter != end && dest_iter != dest_end)
> {
> if (*iter < *dest_iter)
> {
> *ptr++ = *iter++;
> }
> else
> {
> *ptr++ = *dest_iter++;
> }
> }
>
> while (iter != end)
> {
> *ptr++ = *iter++;
> }
>
> while (dest_iter != dest_end)
> {
> *ptr++ = *dest_iter++;
> }
>
> dest = temp;
> }
> };
> }
>
> int main(char[][]argv)
> {
> regex!(char).BasicStringToken lhs;
> regex!(char).BasicStringToken rhs;
> regex!(char).BasicStringToken intersect;
>
> lhs.charset = "aaabbc".dup;
> lhs.negated = true;
> lhs.removeDuplicates();
> rhs.charset = "bccddd".dup;
> rhs.negated = true;
> rhs.removeDuplicates();
> writeln(lhs.charset, '(', lhs.negated, ") intersect ",
> rhs.charset, '(', rhs.negated, ") =");
> lhs.intersect(rhs, intersect);
> writeln(lhs.charset, '(', lhs.negated, "), ",
> rhs.charset, '(', rhs.negated, "), ",
> intersect.charset, '(', intersect.negated, ')');
> return 0;
> }