== Quote from Rory McGuire (rmcgu...@neonova.co.za)'s article > On Tue, 22 Jun 2010 15:31:14 +0200, Ben Hanson <ben.han...@tfbplc.co.uk> > wrote: > > == Quote from Rory McGuire (rmcgu...@neonova.co.za)'s article > >> On Tue, 22 Jun 2010 15:13:06 +0200, Ben Hanson <ben.han...@tfbplc.co.uk> > >> wrote: > >> > Here's the latest with naming convention (hopefully) followed. I've > >> > implemented my > >> > own squeeze() function and used sizeof in the memmove calls. > >> > > >> > How can I specify wide strings for the literals? > >> > > >> > Thanks, > >> > > >> > Ben > >> > > >> > module main; > >> > > >> > import std.algorithm; > >> > import std.array; > >> > import std.c.string; > >> > import std.string; > >> > > >> > import std.stdio; > >> > > >> > template regex(CharT) > >> > { > >> > struct BasicStringToken > >> > { > >> > bool negated = false; > >> > CharT[] charset; > >> > enum size_t MAX_CHARS = CharT.max + 1; > >> > enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0; > >> > > >> > this(const bool negated_, ref CharT[] charset_) > >> > { > >> > negated = negated_; > >> > charset = charset_; > >> > } > >> > > >> > void removeDuplicates() > >> > { > >> > charset.sort; > >> > squeeze(charset); > >> > } > >> > > >> > void normalise() > >> > { > >> > if (charset.length == MAX_CHARS) > >> > { > >> > negated = !negated; > >> > charset.clear(); > >> > } > >> > else if (charset.length > MAX_CHARS / 2) > >> > { > >> > negate(); > >> > } > >> > } > >> > > >> > void negate() > >> > { > >> > CharT curr_char = START_CHAR; > >> > CharT[] temp; > >> > CharT *ptr = cast(CharT *) 0; > >> > CharT *curr = charset.ptr; > >> > CharT *end = curr + charset.length; > >> > size_t i = 0; > >> > > >> > negated = !negated; > >> > temp.length = MAX_CHARS - charset.length; > >> > ptr = temp.ptr; > >> > > >> > while (curr < end) > >> > { > >> > while (*curr > curr_char) > >> > { > >> > *ptr = curr_char; > >> > ++ptr; > >> > ++curr_char; > >> > ++i; > >> > } > >> > > >> > ++curr_char; > >> > ++curr; > >> > ++i; > >> > } > >> > > >> > for (; i < MAX_CHARS; ++i) > >> > { > >> > *ptr = curr_char; > >> > ++ptr; > >> > ++curr_char; > >> > } > >> > > >> > charset = temp; > >> > } > >> > > >> > bool empty() > >> > { > >> > return charset.length == 0 && !negated; > >> > } > >> > > >> > bool any() > >> > { > >> > return charset.length == 0 && negated; > >> > } > >> > > >> > void clear() > >> > { > >> > negated = false; > >> > charset.length = 0; > >> > } > >> > > >> > void intersect(ref BasicStringToken rhs, > >> > ref BasicStringToken overlap) > >> > { > >> > if ((any() && rhs.any()) || (negated == rhs.negated && > >> > !any() && !rhs.any())) > >> > { > >> > intersectSameTypes(rhs, overlap); > >> > } > >> > else > >> > { > >> > intersectDiffTypes(rhs, overlap); > >> > } > >> > } > >> > > >> > private: > >> > void intersectSameTypes(ref BasicStringToken rhs, > >> > ref BasicStringToken overlap) > >> > { > >> > if (any()) > >> > { > >> > clear(); > >> > overlap.negated = true; > >> > rhs.clear(); > >> > } > >> > else > >> > { > >> > CharT *iter = charset.ptr; > >> > CharT *end = iter + charset.length; > >> > CharT *rhs_iter = rhs.charset.ptr; > >> > CharT *rhs_end = rhs_iter + rhs.charset.length; > >> > > >> > overlap.negated = negated; > >> > > >> > while (iter != end && rhs_iter != rhs_end) > >> > { > >> > if (*iter < *rhs_iter) > >> > { > >> > ++iter; > >> > } > >> > else if (*iter > *rhs_iter) > >> > { > >> > ++rhs_iter; > >> > } > >> > else > >> > { > >> > overlap.charset ~= *iter; > >> > memmove(iter, iter + 1, (charset.ptr + > >> > charset.length - iter) * CharT.sizeof); > >> > --end; > >> > charset.length -= 1; > >> > memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr + > >> > rhs.charset.length - rhs_iter) * > >> CharT.sizeof); > >> > --rhs_end; > >> > rhs.charset.length -= 1; > >> > } > >> > } > >> > > >> > if (negated) > >> > { > >> > // duplicates already merged > >> > // src, dest > >> > merge(charset, overlap.charset); > >> > // duplicates already merged > >> > // src, dest > >> > merge(rhs.charset, overlap.charset); > >> > negated = false; > >> > rhs.negated = false; > >> > swap(charset, rhs.charset); > >> > normalise(); > >> > overlap.normalise(); > >> > rhs.normalise(); > >> > } > >> > else if (!overlap.charset.length == 0) > >> > { > >> > normalise(); > >> > overlap.normalise(); > >> > rhs.normalise(); > >> > } > >> > } > >> > } > >> > > >> > void intersectDiffTypes(ref BasicStringToken rhs, > >> > ref BasicStringToken overlap) > >> > { > >> > if (any()) > >> > { > >> > intersectAny(rhs, overlap); > >> > } > >> > else if (negated) > >> > { > >> > intersectNegated(rhs, overlap); > >> > } > >> > else // negated == false > >> > { > >> > intersectCharset(rhs, overlap); > >> > } > >> > } > >> > > >> > void intersectAny(ref BasicStringToken rhs, ref BasicStringToken > >> > overlap) > >> > { > >> > if (rhs.negated) > >> > { > >> > rhs.intersectNegated(this, overlap); > >> > } > >> > else // rhs.negated == false > >> > { > >> > rhs.intersectCharset(this, overlap); > >> > } > >> > } > >> > > >> > void intersectNegated(ref BasicStringToken rhs, > >> > ref BasicStringToken overlap) > >> > { > >> > if (rhs.any()) > >> > { > >> > overlap.negated = true; > >> > overlap.charset = charset; > >> > rhs.negated = false; > >> > rhs.charset = charset; > >> > clear(); > >> > } > >> > else // rhs.negated == false > >> > { > >> > rhs.intersectCharset(this, overlap); > >> > } > >> > } > >> > > >> > void intersectCharset(ref BasicStringToken rhs, > >> > ref BasicStringToken overlap) > >> > { > >> > if (rhs.any()) > >> > { > >> > overlap.charset = charset; > >> > rhs.negated = true; > >> > rhs.charset = charset; > >> > clear(); > >> > } > >> > else // rhs.negated == true > >> > { > >> > CharT *iter = charset.ptr; > >> > CharT *end = iter + charset.length; > >> > CharT *rhs_iter = rhs.charset.ptr; > >> > CharT *rhs_end = rhs_iter + rhs.charset.length; > >> > > >> > while (iter != end && rhs_iter != rhs_end) > >> > { > >> > if (*iter < *rhs_iter) > >> > { > >> > overlap.charset ~= *iter; > >> > rhs.charset.length += 1; > >> > rhs_iter = rhs.charset.ptr; > >> > rhs_end = rhs_iter + rhs.charset.length; > >> > memmove(rhs_iter + 1, rhs_iter, > >> (rhs.charset.length - > >> > (rhs_end - rhs_iter - 1)) * CharT.sizeof); > >> > ++rhs_iter; > >> > memmove(iter, iter + 1, (charset.ptr + > >> > charset.length - iter) * CharT.sizeof); > >> > charset.length -= 1; > >> > --end; > >> > } > >> > else if (*iter > *rhs_iter) > >> > { > >> > ++rhs_iter; > >> > } > >> > else > >> > { > >> > ++iter; > >> > ++rhs_iter; > >> > } > >> > } > >> > > >> > if (iter != end) > >> > { > >> > CharT[] temp; > >> > > >> > temp.length = end - iter; > >> > memmove(temp.ptr, iter, temp.length * CharT.sizeof); > >> > > >> > // nothing bigger in rhs than iter > >> > // src, dest > >> > merge(temp, overlap.charset); > >> > memmove(iter, iter + 1, (charset.ptr + > >> > charset.length - iter) * CharT.sizeof); > >> > charset.length -= 1; > >> > } > >> > > >> > if (!overlap.charset.empty()) > >> > { > >> > merge(overlap.charset, rhs.charset); > >> > // possible duplicates, so check for any and erase. > >> > squeeze(rhs.charset); > >> > normalise(); > >> > overlap.normalise(); > >> > rhs.normalise(); > >> > } > >> > } > >> > } > >> > > >> > void squeeze(ref CharT[] str) > >> > { > >> > if (str.length > 1) > >> > { > >> > CharT *write = str.ptr; > >> > CharT *end = write + str.length; > >> > CharT *read = write + 1; > >> > > >> > while (read != end) > >> > { > >> > while (read != end && *read == *write) > >> > { > >> > ++read; > >> > } > >> > > >> > if (read == end) break; > >> > > >> > ++write; > >> > > >> > if (read > write) > >> > { > >> > *write = *read; > >> > } > >> > > >> > ++read; > >> > } > >> > > >> > str.length = write + 1 - str.ptr; > >> > } > >> > } > >> > > >> > void merge(ref CharT[] src, ref CharT[] dest) > >> > { > >> > CharT[] temp; > >> > CharT *ptr; > >> > CharT *iter = src.ptr; > >> > CharT *end = iter + src.length; > >> > CharT *dest_iter = dest.ptr; > >> > CharT *dest_end = dest_iter + dest.length; > >> > > >> > temp.length = src.length + dest.length; > >> > ptr = temp.ptr; > >> > > >> > while (iter != end && dest_iter != dest_end) > >> > { > >> > if (*iter < *dest_iter) > >> > { > >> > *ptr++ = *iter++; > >> > } > >> > else > >> > { > >> > *ptr++ = *dest_iter++; > >> > } > >> > } > >> > > >> > while (iter != end) > >> > { > >> > *ptr++ = *iter++; > >> > } > >> > > >> > while (dest_iter != dest_end) > >> > { > >> > *ptr++ = *dest_iter++; > >> > } > >> > > >> > dest = temp; > >> > } > >> > }; > >> > } > >> > > >> > int main(char[][]argv) > >> > { > >> > regex!(char).BasicStringToken lhs; > >> > regex!(char).BasicStringToken rhs; > >> > regex!(char).BasicStringToken intersect; > >> > > >> > lhs.charset = "aaabbc".dup; > >> > lhs.negated = true; > >> > lhs.removeDuplicates(); > >> > rhs.charset = "bccddd".dup; > >> > rhs.negated = true; > >> > rhs.removeDuplicates(); > >> > writeln(lhs.charset, '(', lhs.negated, ") intersect ", > >> > rhs.charset, '(', rhs.negated, ") ="); > >> > lhs.intersect(rhs, intersect); > >> > writeln(lhs.charset, '(', lhs.negated, "), ", > >> > rhs.charset, '(', rhs.negated, "), ", > >> > intersect.charset, '(', intersect.negated, ')'); > >> > return 0; > >> > } > > > >> "the string"w > >> gives you 16bit I believe. postfix with a 'd' should give you 32bit. > > > > Thanks. The problem now is that sort() corrupts the strings. Does anyone > > know why? > > > > Regards, > > > > Ben > perhaps from mixing wide chars with CharT if CharT is 8bits? > Honestly I havn't read your code but that is just the likely scenario.
I don't think so: int main(char[][]argv) { regex!(wchar).BasicStringToken lhs; regex!(wchar).BasicStringToken rhs; regex!(wchar).BasicStringToken intersect; lhs.charset = "aaabbc"w.dup; lhs.negated = true; lhs.removeDuplicates(); rhs.charset = "bccddd"w.dup; rhs.negated = true; rhs.removeDuplicates(); writeln(lhs.charset, '(', lhs.negated, ") intersect ", rhs.charset, '(', rhs.negated, ") ="); lhs.intersect(rhs, intersect); writeln(lhs.charset, '(', lhs.negated, "), ", rhs.charset, '(', rhs.negated, "), ", intersect.charset, '(', intersect.negated, ')'); return 0; }