#include "xstring.h"

const xstring::size_type xstring::npos = ustring::npos;

/** @return the size of the underlying storage, in BYTES (not characters) */
xstring::size_type xstring::size() const
{
    return _string.size();
}

/**
 * @return the value of xstring_traits::length() for the null-terminated
 *         contents (presumably the number of UTF-8 characters -- confirm
 *         against xstring_traits)
 */
xstring::size_type xstring::length() const
{
    return xstring_traits::length((XMLChar*)_string.c_str());
}

/** @return the maximum size of the underlying storage */
xstring::size_type xstring::max_size() const
{
    return _string.max_size();
}

/** @return the capacity of the underlying storage */
xstring::size_type xstring::capacity() const
{
    return _string.capacity();
}

/** @return true if the string holds no data */
bool xstring::empty() const
{
    return _string.empty();
}

/**
 * Unchecked element access.
 * @param n index into the underlying storage (not a character index)
 */
xstring::reference xstring::operator[](xstring::size_type n)
{
    return reinterpret_cast<reference>(_string[n]);
}

/**
 * Unchecked element access (const version).
 * @param n index into the underlying storage (not a character index)
 */
xstring::const_reference xstring::operator[](xstring::size_type n) const
{
    return reinterpret_cast<const_reference>(_string[n]);
}

/** @return a pointer to the null-terminated contents */
const XMLChar* xstring::c_str() const
{
    return reinterpret_cast<const XMLChar*>(_string.c_str());
}

/** @return a pointer to the contents */
const XMLChar* xstring::data() const
{
    return reinterpret_cast<const XMLChar*>(_string.data());
}

/** UTF8 OFFSET FUNCTIONS
 * All utf8_*_offset() functions return npos if offset is out of range.
 * The caller should decide if npos is a valid argument and just marks
 * the whole string, or if it is not allowed (e.g. for start positions).
 * In the latter case std::out_of_range should be thrown, but usually
 * ustring will do that for us.
 */

/**
 * Calculate the byte offset of a character position in a null-terminated string.
 * @param str the source string, must not be NULL
 * @param offset the number of characters to advance
 * @return the corresponding byte offset, or npos if offset is npos or the
 *         terminator is reached before offset characters were consumed
 * @throws std::invalid_argument if str is NULL
 */
static xstring::size_type utf8_byte_offset(const unsigned char* str, xstring::size_type offset)
{
    if (str == NULL)
        throw std::invalid_argument("utf8_byte_offset: str");
    if (offset == xstring::npos)
        return xstring::npos;

    const unsigned char* p = str;
    for (; offset != 0; --offset) {
        if (*p == '\0')
            return xstring::npos;
        p += xstring_traits::utf8_size((XMLChar*)p);
    }
    return (p - str);
}

/**
 * Calculate the byte offset of a character position, stopping at maxlen bytes.
 * @param str the source string, must not be NULL
 * @param offset the number of characters to advance
 * @param maxlen the maximum length, in bytes, to traverse
 * @return the byte offset, or npos if offset is npos or maxlen bytes were
 *         traversed before offset characters were consumed
 * @throws std::invalid_argument if str is NULL
 */
static xstring::size_type utf8_byte_offset(const unsigned char* str, xstring::size_type offset, xstring::size_type maxlen)
{
    if (str == NULL)
        throw std::invalid_argument("utf8_byte_offset: str");
    if (offset == xstring::npos)
        return xstring::npos;

    const unsigned char* const pend = str + maxlen;
    const unsigned char* p = str;
    for (; offset != 0; --offset) {
        if (p >= pend)
            return xstring::npos;
        p += xstring_traits::utf8_size((XMLChar*)p);
    }
    return (p - str);
}

/**
 * Calculate the byte offset of a character position, stopping at str.size().
 * @param str the source string
 * @param offset the offset, in characters, to advance
 * @return the byte offset, or npos if offset is out of range
 */
inline xstring::size_type utf8_byte_offset(const ustring& str, xstring::size_type offset)
{
    return utf8_byte_offset(str.data(), offset, str.size());
}

/**
 * Calculate the number of characters the given byte offset represents.
 * @param str the source string
 * @param n the byte offset; npos is passed through unchanged
 * @return the number of characters that start before byte n, or npos
 */
static xstring::size_type utf8_char_offset(const unsigned char* str, xstring::size_type n)
{
    if (n == xstring::npos)
        return xstring::npos;

    const unsigned char* p = str;
    xstring::size_type ret = 0;
    /* compare in size_type so large offsets are not truncated */
    while (static_cast<xstring::size_type>(p - str) < n) {
        ret++;
        p += xstring_traits::utf8_size((const XMLChar*)p);
    }
    return ret;
}

/**
 * The bounds structure is used to easily determine the start and end byte indexes
 * for a given string given the start character and the number of characters to process.
 *
 * i is the byte offset of character ci (npos if ci is out of range);
 * n is the number of BYTES covered by cn characters starting at i
 * (npos if cn is npos or runs past the end of the string).
 */
struct utf8_bounds
{
    xstring::size_type i;
    xstring::size_type n;

    utf8_bounds(const ustring& str, xstring::size_type ci, xstring::size_type cn)
        : i (utf8_byte_offset(str, ci)),
          n (xstring::npos)
    {
        if (i != xstring::npos)
            n = utf8_byte_offset(str.data() + i, cn, str.size() - i);
    }
};

/** CONSTRUCTORS */

/** Construct an empty string */
xstring::xstring() : _string()
{
}

/** Generalization of the copy constructor
 * @param s the source string to copy
 * @param pos the starting position to copy, in characters
 * @param n the maximum number of characters to copy
 */
xstring::xstring(const xstring& s, size_type pos, size_type n) : _string()
{
    const utf8_bounds bounds(s._string, pos, n);
    _string.assign(s._string, bounds.i, bounds.n);
}

/**
 * Construct a string from a null terminated character array
 * @param s the null terminated character string (utf-8 encoded), must not be NULL
 */
xstring::xstring(const XMLChar* s) : _string(s)
{
}

/**
 * Construct a string from a character array and a length
 * @param s the character array, encoded in utf-8
 * @param n the length to copy (in characters)
 */
xstring::xstring(const XMLChar* s, size_type n) : _string()
{
    xstring::size_type x = utf8_byte_offset(s, n);
    _string.assign(s, x);
}

/**
 * Create a string with n copies of c
 * @param n the number of copies of the character
 * @param c a pointer to the (possibly multi-byte) character to copy
 */
xstring::xstring(size_type n, XMLChar* c) : _string()
{
    xstring::size_type x = xstring_traits::utf8_size(c);
    _string.reserve(x * n + 1);
    /* count down from n so that exactly n copies are appended and n == 0 appends nothing */
    for (; n != 0; --n)
        _string.append(c, x);
}

/** Create a string with n copies of the single-byte character c */
xstring::xstring(size_type n, XMLChar c) : _string(n, (char)c)
{
}

/** Construct from a null-terminated wide string; see assign(const wchar_t*) */
xstring::xstring(const wchar_t *s) : _string()
{
    assign(s);
}

/** Construct from a wide string; see assign(const wchar_t*) */
xstring::xstring(const std::wstring& s) : _string()
{
    assign(s.c_str());
}

/** OPERATORS */

/** the assignment operator */
xstring& xstring::operator=(const xstring& s)
{
    _string = s._string;
    return *this;
}

/** Assign a null-terminated wide string; see assign(const wchar_t*) */
xstring& xstring::operator=(const wchar_t *s)
{
    return assign(s);
}

/* note: this assumes a little endian structure */
/** Assign a wide string; see assign(const wchar_t*) */
xstring& xstring::operator=(const std::wstring& s)
{
    return assign(s.c_str());
}

/** Assign a null-terminated character array (encoded in utf-8) to a string */
xstring& xstring::operator=(const XMLChar* s)
{
    _string = s;
    return *this;
}

/** Assign a single character to the string */
xstring& xstring::operator=(XMLChar c)
{
    _string = c;
    return *this;
}

/** reserve the number of BYTES specified */
void xstring::reserve(size_t n)
{
    _string.reserve(n);
}

/** swap the contents of two strings */
void xstring::swap(xstring& s)
{
    _string.swap(s._string);
}

/** INSERT METHODS */

/** inserts x before pos
 * @return an iterator to the inserted element
 */
xstring::iterator xstring::insert(xstring::iterator pos, const value_type& x)
{
    /* remember the index: the insertion may invalidate pos */
    size_type s = pos.base() - _string.begin();
    _string.insert(s, 1, x);
    return iterator(_string.begin() + s);
}

/** inserts n copies of x before pos
 * @return an iterator to the first inserted element
 */
xstring::iterator xstring::insert(xstring::iterator pos, size_type n, const value_type& x)
{
    /* remember the index: the insertion may invalidate pos */
    size_type s = pos.base() - _string.begin();
    _string.insert(s, n, x);
    return iterator(_string.begin() + s);
}

/** Inserts s before character position pos */
xstring& xstring::insert(size_type pos, const xstring& s)
{
    _string.insert(utf8_byte_offset(_string, pos), s._string);
    return *this;
}

/** Inserts a substring of s before pos
 * @param pos the character position in *this to insert before
 * @param s the source string
 * @param pos1 the first character of s to insert
 * @param pos2 the number of characters of s to insert
 */
xstring& xstring::insert(size_type pos, const xstring& s, size_type pos1, size_type pos2)
{
    const utf8_bounds bounds(s._string, pos1, pos2);
    /* the source string must be passed explicitly, otherwise the
       (pos, count, char) overload is selected */
    _string.insert(utf8_byte_offset(_string, pos), s._string, bounds.i, bounds.n);
    return *this;
}

/** Insert s before pos */
xstring& xstring::insert(size_type pos, const XMLChar* s)
{
    _string.insert(utf8_byte_offset(_string, pos), s);
    return *this;
}

/** Inserts the first n characters of s before pos */
xstring& xstring::insert(size_type pos, const XMLChar* s, size_type n)
{
    /* the byte length is measured over n characters of s; pos only applies to *this */
    _string.insert(utf8_byte_offset(_string, pos), s, utf8_byte_offset(s, n));
    return *this;
}

/** Insert n copies of c before pos */
xstring& xstring::insert(size_type pos, size_type n, XMLChar c)
{
    _string.insert(utf8_byte_offset(_string, pos), n, (char)c);
    return *this;
}

/** Insert n copies of c before pos
 * @todo: need to check if c is a utf8 char and if not encode it to utf8 */
xstring& xstring::insert(size_type pos, size_type n, char c)
{
    _string.insert(utf8_byte_offset(_string, pos), n, (char)c);
    return *this;
}

/** Append s to *this */
xstring& xstring::append(const xstring& s)
{
    _string.append(s._string);
    return *this;
}

/** Append n characters of s, starting at character pos, to *this */
xstring& xstring::append(const xstring& s, size_type pos, size_type n)
{
    const utf8_bounds bounds(s._string, pos, n);
    _string.append(s._string, bounds.i, bounds.n);
    return *this;
}

/** append s to *this */
xstring& xstring::append(const XMLChar* s)
{
    _string.append(s);
    return *this;
}

/** append the first n characters of s to *this */
xstring& xstring::append(const XMLChar* s, size_type n)
{
    _string.append(s, utf8_byte_offset(s, n));
    return *this;
}

/** append n copies of the character c */
xstring& xstring::append(size_type n, XMLChar c)
{
    _string.append(n, (char)c);
    return *this;
}

/** append n copies of the character c */
xstring& xstring::append(size_type n, char c)
{
    _string.append(n, c);
    return *this;
}

/** append a single character to this */
void xstring::push_back(XMLChar c)
{
    _string.append(1, c);
}

/** Erases the element at p */
xstring::iterator xstring::erase(iterator p)
{
    return iterator(_string.erase(p.base()));
}

/** Erases the range [first, last) */
xstring::iterator xstring::erase(iterator first, iterator last)
{
    return iterator(_string.erase(first.base(), last.base()));
}

/** Erases a range
 * @param pos the first character to erase
 * @param n the number of characters to erase (npos: up to the end)
 */
xstring& xstring::erase(size_type pos, size_type n)
{
    /* the underlying erase takes a byte COUNT, not an end offset */
    const utf8_bounds bounds(_string, pos, n);
    _string.erase(bounds.i, bounds.n);
    return *this;
}

/** Resize the underlying storage to n elements, padding with c */
void xstring::resize(size_type n, value_type c)
{
    _string.resize(n, (char)c);
}

/** Erases the entire container */
void xstring::clear()
{
    _string.erase(0, ustring::npos);
}

/** synonym for operator= */
xstring& xstring::assign(const xstring& s)
{
    _string.assign(s._string);
    return *this;
}

/** assigns a substring of s to *this */
xstring& xstring::assign(const xstring& s, size_type pos, size_type n)
{
    const utf8_bounds bounds(s._string, pos, n);
    _string.assign(s._string, bounds.i, bounds.n);
    return *this;
}

/** assigns the first n characters of s to *this */
xstring& xstring::assign(const XMLChar* s, size_type n)
{
    _string.assign(s, utf8_byte_offset(s, n));
    return *this;
}

/** assigns a null terminated array of characters to *this */
xstring& xstring::assign(const XMLChar* s)
{
    _string.assign(s);
    return *this;
}

/** Erases the existing characters and replaces them by n copies of c */
xstring& xstring::assign(size_type n, XMLChar c)
{
    _string.assign(n, (char)c);
    return *this;
}

/**
 * Replace the contents with the UTF-8 encoding of a null-terminated wide string.
 * Surrogate pairs in the input are combined into a single code point.
 * @param s the wide string, must not be NULL
 * @throws std::invalid_argument if s is NULL
 * @throws encoding_error on an unpaired surrogate or a value above 0x10FFFF
 * @note *this is only modified if the whole input converts successfully
 */
xstring& xstring::assign(const wchar_t *s)
{
    if (s == NULL)
        throw std::invalid_argument("xstring::assign: s");

    ustring out;
    /* double the length should be enough to avoid many resizes */
    if (*s)
        out.reserve(wcslen(s) * 2);

    const wchar_t* i = s;
    while (*i) {
        unsigned int c = *i++;
        /* full-width masks: wchar_t may be wider than 16 bits */
        if ((c & 0xFFFFFC00u) == 0xD800u) {
            /* high surrogate: the next unit must be a low surrogate */
            const unsigned int d = *i;
            if ((d & 0xFFFFFC00u) != 0xDC00u)
                throw encoding_error("invalid surrogate");
            ++i;
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        } else if ((c & 0xFFFFFC00u) == 0xDC00u) {
            /* low surrogate without a preceding high surrogate */
            throw encoding_error("invalid surrogate");
        }

        /* assertion: c is a single UCS-4 value */
        int bits;
        if (c < 0x80) {
            out.append(1, static_cast<ustring::value_type>(c));
            bits = -6; /* no continuation bytes */
        } else if (c < 0x800) {
            out.append(1, static_cast<ustring::value_type>(((c >> 6) & 0x1F) | 0xC0));
            bits = 0;
        } else if (c < 0x10000) {
            out.append(1, static_cast<ustring::value_type>(((c >> 12) & 0x0F) | 0xE0));
            bits = 6;
        } else if (c < 0x110000) {
            out.append(1, static_cast<ustring::value_type>(((c >> 18) & 0x07) | 0xF0));
            bits = 12;
        } else {
            throw encoding_error("invalid code point");
        }
        /* emit the continuation bytes, most significant first */
        for (; bits >= 0; bits -= 6)
            out.append(1, static_cast<ustring::value_type>(((c >> bits) & 0x3F) | 0x80));
    }

    _string.swap(out);
    return *this;
}

/** replace a substring of *this with the string s */
xstring& xstring::replace(size_type pos, size_type n, const xstring& s)
{
    const utf8_bounds bounds(_string, pos, n);
    _string.replace(bounds.i, bounds.n, s._string);
    return *this;
}

/** replace a substring of *this with a substring of s */
xstring& xstring::replace(size_type pos, size_type n, const xstring& s, size_type pos1, size_type n1)
{
    const utf8_bounds b1(_string, pos, n);
    const utf8_bounds b2(s._string, pos1, n1);
    _string.replace(b1.i, b1.n, s._string, b2.i, b2.n);
    return *this;
}

/** replace a substring of *this with the first n1 characters of s */
xstring& xstring::replace(size_type pos, size_type n, const XMLChar* s, size_type n1)
{
    const utf8_bounds bounds(_string, pos, n);
    _string.replace(bounds.i, bounds.n, s, utf8_byte_offset(s, n1));
    return *this;
}

/** replace a substring of *this with a null-terminated character array (utf-8 encoded) */
xstring& xstring::replace(size_type pos, size_type n, const XMLChar* s)
{
    const utf8_bounds bounds(_string, pos, n);
    _string.replace(bounds.i, bounds.n, s);
    return *this;
}

/** replaces a substring of *this with n1 copies of c */
xstring& xstring::replace(size_type pos, size_type n, size_type n1, XMLChar c)
{
    const utf8_bounds bounds(_string, pos, n);
    _string.replace(bounds.i, bounds.n, n1, (char)c);
    return *this;
}

/** replaces the range [first, last) with the string s */
xstring& xstring::replace(iterator first, iterator last, const xstring& s)
{
    _string.replace(first.base(), last.base(), s._string);
    return *this;
}

/** replaces the range [first, last) with the first n characters of s */
xstring& xstring::replace(iterator first, iterator last, const pointer s, size_type n)
{
    /** calculate the offset of n characters of s */
    _string.replace(first.base(), last.base(), s, utf8_byte_offset(s, n));
    return *this;
}

/** replaces the range [first, last) with a null-terminated character array */
xstring& xstring::replace(iterator first, iterator last, const pointer s)
{
    _string.replace(first.base(), last.base(), s);
    return *this;
}

/** replaces the range [first, last) with n copies of c */
xstring& xstring::replace(iterator first, iterator last, size_type n, value_type c)
{
    _string.replace(first.base(), last.base(), n, c);
    return *this;
}

/** copies a substring of *this to a buffer
 * @param buf the destination buffer
 * @param n the number of characters to copy
 * @param pos the first character to copy
 * @return the value returned by the underlying copy (a count of BYTES)
 */
xstring::size_type xstring::copy(XMLChar* buf, size_type n, size_type pos ) const
{
    const utf8_bounds bounds(_string, pos, n);
    /* bounds.n is already a byte count relative to bounds.i */
    return _string.copy(buf, bounds.n, bounds.i);
}

/** searches for s as a substring of *this, beginning at character pos of *this
 * @return the character position of the match, or npos
 * NOTE(review): unlike the other find overloads this one is not const-qualified;
 * left as is so the definition keeps matching its declaration.
 */
xstring::size_type xstring::find(const xstring& s, size_type pos )
{
    size_type tmp = _string.find(s._string, utf8_byte_offset(_string, pos));
    /** figure out how many characters it takes to get */
    return utf8_char_offset(_string.data(), tmp);
}

/** searches for the first n characters of s as a substring of *this, beginning at character pos of *this
 * @return the character position of the match, or npos
 */
xstring::size_type xstring::find(const XMLChar* s, size_type pos, size_type n) const
{
    /* pos is a character position in *this, n a character count in s */
    size_type tmp = _string.find(s, utf8_byte_offset(_string, pos), utf8_byte_offset(s, n));
    return utf8_char_offset(_string.data(), tmp);
}

/** searches for a null-terminated character array as a substring of *this, beginning at character pos of *this
 * @return the character position of the match, or npos
 */
xstring::size_type xstring::find(const XMLChar* s, size_type pos ) const
{
    size_type tmp = _string.find(s, utf8_byte_offset(_string, pos));
    return utf8_char_offset(_string.data(), tmp);
}

/** searches for the character c, beginning at character position pos
 * @return the character position of the match, or npos
 */
xstring::size_type xstring::find(XMLChar c, size_type pos ) const
{
    size_type tmp = _string.find((char)c, utf8_byte_offset(_string, pos));
    return utf8_char_offset(_string.data(), tmp);
}

/** searches backward for s as a substring of *this beginning
 * at character position min(pos,size())
 * @return the character position of the match, or npos
 */
xstring::size_type xstring::rfind(const xstring& s, size_type pos) const
{
    size_type tmp = _string.rfind(s._string, utf8_byte_offset(_string, pos));
    return utf8_char_offset(_string.data(), tmp);
}

/** searches backward for the first n characters of s as a substring of *this, beginning at character position min(pos,size())
 * @return the character position of the match, or npos
 */
xstring::size_type xstring::rfind(const XMLChar* s, size_type pos, size_type n) const
{
    /* pos is a character position in *this, n a character count in s */
    size_type tmp = _string.rfind(s, utf8_byte_offset(_string, pos), utf8_byte_offset(s, n));
    return utf8_char_offset(_string.data(), tmp);
}

/** searches backward for a null-terminated character array as a substring of *this, beginning at character min(pos,size())
 * @return the character position of the match, or npos
 */
xstring::size_type xstring::rfind(const XMLChar* s, size_type pos ) const
{
    size_type tmp = _string.rfind(s, utf8_byte_offset(_string, pos));
    return utf8_char_offset(_string.data(), tmp);
}

/** searches backward for the character c, beginning at character min(pos,size())
 * @return the character position of the match, or npos
 */
xstring::size_type xstring::rfind(XMLChar c, size_type pos ) const
{
    size_type s = _string.rfind(c, utf8_byte_offset(_string, pos));
    return utf8_char_offset(_string.data(), s);
}

/** returns the substring of n characters starting at character pos */
xstring xstring::substr(size_type pos, size_type n ) const
{
    const utf8_bounds bounds(_string, pos, n);
    /* assign the byte range directly instead of round-tripping through
       c_str(), which would stop at an embedded NUL */
    xstring ret;
    ret._string.assign(_string, bounds.i, bounds.n);
    return ret;
}

/** three-way comparison of *this and s */
int xstring::compare(const xstring& s) const
{
    return _string.compare(s._string);
}

/** three-way comparison of a substring of *this and s */
int xstring::compare(size_type pos, size_type n, const xstring& s) const
{
    const utf8_bounds bounds(_string, pos, n);
    return _string.compare(bounds.i, bounds.n, s._string);
}

/** three-way comparison of a substring of *this and a substring of s */
int xstring::compare(size_type pos, size_type n, const xstring& s, size_type pos1, size_type n1) const
{
    const utf8_bounds bounds1(_string, pos, n);
    /* pos1/n1 address characters of s, so they are resolved against s */
    const utf8_bounds bounds2(s._string, pos1, n1);
    return _string.compare(bounds1.i, bounds1.n, s._string, bounds2.i, bounds2.n);
}

/** three-way comparison of *this and a null-terminated character array */
int xstring::compare(const pointer s) const
{
    return _string.compare(s);
}

/** three-way comparison of a substring of *this and the first len characters of s */
int xstring::compare(size_type pos, size_type n, const pointer s, size_type len) const
{
    const utf8_bounds bounds(_string, pos, n);
    size_type l = utf8_byte_offset(s, len);
    return _string.compare(bounds.i, bounds.n, s, l);
}

/** ITERATORS */

/** return an iterator pointing to the beginning of the string */
xstring::iterator xstring::begin()
{
    return iterator(_string.begin());
}

/** return an iterator pointing to the end of the string */
xstring::iterator xstring::end()
{
    return iterator(_string.end());
}

/** return a const iterator */
xstring::const_iterator xstring::begin() const
{
    return const_iterator(_string.begin());
}

/** return a const iterator pointing to the end of the string */
xstring::const_iterator xstring::end() const
{
    return const_iterator(_string.end());
}

/** return a UTF-16LE encoded version */
std::wstring xstring::w_str() const { unsigned int c,d; int trailing; std::wstring ret; ret.reserve(length()+1); ustring::const_iterator i = _string.begin(); while (i != _string.end()) { d = *i++; if (d < 0x80) { c = d; trailing = 0; } else if (d < 0xC0) { /* trailing byte in leading position */ throw encoding_error("trailing byte in leading position"); } else if (d < 0xE0) { c = d & 0x1f; trailing = 1; } else if (d < 0xF0) { c = d & 0x0F; trailing = 2; } else if (d < 0xF8) { c = d & 0x07; trailing = 3; } else throw encoding_error("invalid UTF16"); for (; trailing; trailing--) { if (i == _string.end() || (((d = *i++) & 0xC0) != 0x80)) break; c <<= 6; c |= d & 0x3F; } /** assertion: c is a single UTF-4 value */ if (c < 0x10000) { ret.append(1,c); } else if (c < 0x110000) { c -= 0x10000; ret.append(1,0xD800 | (c >> 10)); ret.append(1,0xDC00 | (c & 0x03FF)); } else break; } return ret; }