Hi there, I've basically got the string_token class working now. Thanks to everyone who helped. It still needs some work as memmove works with bytes so I need the equivalent of 'sizeof' in D for this.
'squeeze' doesn't work with wide chars, so I will write my own version. When I shrink or grow char arrays, I'd like to know if I should re-set my pointers into them accordingly. If anyone can point out any other obvious issues, bad style etc. that would be appreciated. Please bear in mind that I'd like the code to be as fast as possible. Here's the source: Regards, Ben module main; import std.algorithm; import std.array; import std.c.string; import std.string; import std.stdio; template regex(CharT) { struct basic_string_token { bool _negated = false; CharT[] _charset; enum size_t MAX_CHARS = CharT.max + 1; enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0; this(const bool negated_, ref CharT[] charset_) { _negated = negated_; _charset = charset_; } void remove_duplicates() { _charset.sort; _charset = squeeze(_charset.idup).dup; } void normalise() { if (_charset.length == MAX_CHARS) { _negated = !_negated; _charset.clear(); } else if (_charset.length > MAX_CHARS / 2) { negate(); } } void negate() { CharT curr_char_ = START_CHAR; CharT[] temp_; CharT *ptr_; CharT *curr_ = _charset.ptr; CharT *end_ = curr_ + _charset.length; size_t i_ = 0; _negated = !_negated; temp_.length = MAX_CHARS - _charset.length; ptr_ = temp_.ptr; while (curr_ < end_) { while (*curr_ > curr_char_) { *ptr_ = curr_char_; ++ptr_; ++curr_char_; ++i_; } ++curr_char_; ++curr_; ++i_; } for (; i_ < MAX_CHARS; ++i_) { *ptr_ = curr_char_; ++ptr_; ++curr_char_; } _charset = temp_; } bool empty() { return _charset.length == 0 && !_negated; } bool any() { return _charset.length == 0 && _negated; } void clear() { _negated = false; _charset.length = 0; } void intersect(ref basic_string_token rhs_, ref basic_string_token overlap_) { if ((any() && rhs_.any()) || (_negated == rhs_._negated && !any() && !rhs_.any())) { intersect_same_types(rhs_, overlap_); } else { intersect_diff_types(rhs_, overlap_); } } private: void intersect_same_types(ref basic_string_token rhs_, ref basic_string_token overlap_) { if (any()) { clear(); overlap_._negated = true; rhs_.clear(); } else { CharT *iter_ = _charset.ptr; CharT *end_ = iter_ + _charset.length; CharT *rhs_iter_ = rhs_._charset.ptr; CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length; overlap_._negated = _negated; while (iter_ != end_ && rhs_iter_ != rhs_end_) { if (*iter_ < *rhs_iter_) { ++iter_; } else if (*iter_ > *rhs_iter_) { ++rhs_iter_; } else { overlap_._charset ~= *iter_; memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_); --end_; _charset.length -= 1; memmove(rhs_iter_, rhs_iter_ + 1, rhs_._charset.ptr + rhs_._charset.length - rhs_iter_); --rhs_end_; rhs_._charset.length -= 1; } } if (_negated) { // duplicates already merged // src, dest merge(_charset, overlap_._charset); // duplicates already merged // src, dest merge(rhs_._charset, overlap_._charset); _negated = false; rhs_._negated = false; swap(_charset, rhs_._charset); normalise(); overlap_.normalise(); rhs_.normalise(); } else if (!overlap_._charset.length == 0) { normalise(); overlap_.normalise(); rhs_.normalise(); } } } void intersect_diff_types(ref basic_string_token rhs_, ref basic_string_token overlap_) { if (any()) { intersect_any(rhs_, overlap_); } else if (_negated) { intersect_negated(rhs_, overlap_); } else // _negated == false { intersect_charset(rhs_, overlap_); } } void intersect_any(ref basic_string_token rhs_, ref basic_string_token overlap_) { if (rhs_._negated) { rhs_.intersect_negated(this, overlap_); } else // rhs._negated == false { rhs_.intersect_charset(this, overlap_); } } void intersect_negated(ref basic_string_token rhs_, ref basic_string_token overlap_) { if (rhs_.any()) { overlap_._negated = true; overlap_._charset = _charset; rhs_._negated = false; rhs_._charset = _charset; clear(); } else // rhs._negated == false { rhs_.intersect_charset(this, overlap_); } } void intersect_charset(ref basic_string_token rhs_, ref basic_string_token overlap_) { if (rhs_.any()) { overlap_._charset = _charset; rhs_._negated = true; rhs_._charset = _charset; clear(); } else // rhs_._negated == true { CharT *iter_ = _charset.ptr; CharT *end_ = iter_ + _charset.length; CharT *rhs_iter_ = rhs_._charset.ptr; CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length; while (iter_ != end_ && rhs_iter_ != rhs_end_) { if (*iter_ < *rhs_iter_) { overlap_._charset ~= *iter_; rhs_._charset.length += 1; rhs_iter_ = rhs_._charset.ptr; rhs_end_ = rhs_iter_ + rhs_._charset.length; memmove(rhs_iter_ + 1, rhs_iter_, rhs_._charset.length - (rhs_end_ - rhs_iter_ - 1)); ++rhs_iter_; memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_); _charset.length -= 1; --end_; } else if (*iter_ > *rhs_iter_) { ++rhs_iter_; } else { ++iter_; ++rhs_iter_; } } if (iter_ != end_) { CharT[] temp_; temp_.length = end_ - iter_; memmove(temp_.ptr, iter_, temp_.length); // nothing bigger in rhs_ than iter_ // src, dest merge(temp_, overlap_._charset); memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_); _charset.length -= 1; } if (!overlap_._charset.empty()) { merge(overlap_._charset, rhs_._charset); // possible duplicates, so check for any and erase. rhs_._charset = squeeze(rhs_._charset.idup).dup; normalise(); overlap_.normalise(); rhs_.normalise(); } } } void merge(ref CharT[] src_, ref CharT[] dest_) { CharT[] temp_; CharT *ptr_; CharT *iter_ = src_.ptr; CharT *end_ = iter_ + src_.length; CharT *dest_iter_ = dest_.ptr; CharT *dest_end_ = dest_iter_ + dest_.length; temp_.length = src_.length + dest_.length; ptr_ = temp_.ptr; while (iter_ != end_ && dest_iter_ != dest_end_) { if (*iter_ < *dest_iter_) { *ptr_++ = *iter_++; } else { *ptr_++ = *dest_iter_++; } } while (iter_ != end_) { *ptr_++ = *iter_++; } while (dest_iter_ != dest_end_) { *ptr_++ = *dest_iter_++; } dest_ = temp_; } }; } int main(char[][]argv) { regex!(char).basic_string_token lhs_; regex!(char).basic_string_token rhs_; regex!(char).basic_string_token intersect_; lhs_._charset = "abc".dup; lhs_._negated = true; rhs_._charset = "bcd".dup; rhs_._negated = true; writeln(lhs_._charset, '(', lhs_._negated, ") intersect ", rhs_._charset, '(', rhs_._negated, ") ="); lhs_.intersect(rhs_, intersect_); writeln(lhs_._charset, '(', lhs_._negated, "), ", rhs_._charset, '(', rhs_._negated, "), ", intersect_._charset, '(', intersect_._negated, ')'); return 0; }