On Thu, Jul 17, 2025 at 7:02 PM Jonathan Wakely <jwak...@redhat.com> wrote:
> This implements the missing functions in _Utf_iterator to support > reverse iteration. All existing tests pass when the view is reversed, so > that the same code units are seen when iterating forwards or backwards. > > libstdc++-v3/ChangeLog: > > * include/bits/unicode.h (_Utf_iterator::operator--): Reorder > conditions and update position after reading a code unit. > (_Utf_iterator::_M_read_reverse): Define. > (_Utf_iterator::_M_read_utf8): Return extracted code point. > (_Utf_iterator::_M_read_reverse_utf8): Define. > (_Utf_iterator::_M_read_reverse_utf16): Define. > (_Utf_iterator::_M_read_reverse_utf32): Define. > * testsuite/ext/unicode/view.cc: Add checks for reversed views > and reverse iteration. > --- > > Tested x86_64-linux. > Please add a test where we have a string that contains only utf8 continuation code-points. Outside that LGTM. > > libstdc++-v3/include/bits/unicode.h | 136 ++++++++++++++++++++- > libstdc++-v3/testsuite/ext/unicode/view.cc | 68 +++++++---- > 2 files changed, 174 insertions(+), 30 deletions(-) > > diff --git a/libstdc++-v3/include/bits/unicode.h > b/libstdc++-v3/include/bits/unicode.h > index 9a38462e8102..9ff019594b7b 100644 > --- a/libstdc++-v3/include/bits/unicode.h > +++ b/libstdc++-v3/include/bits/unicode.h > @@ -209,10 +209,15 @@ namespace __unicode > constexpr _Utf_iterator& > operator--() requires bidirectional_iterator<_Iter> > { > - if (!_M_buf_index && _M_curr() != _M_first()) > - _M_read_reverse(); > - else if (_M_buf_index) > + if (_M_buf_index > 0) > --_M_buf_index; > + else if (_M_curr() != _M_first()) > + { > + _M_read_reverse(); > + _M_buf_index = _M_buf_last - 1; > + ranges::advance(_M_curr(), -_M_to_increment); > + } > + // else erroneous, but ignored for now. > return *this; > } > > @@ -269,7 +274,18 @@ namespace __unicode > } > > constexpr void > - _M_read_reverse(); // TODO > + _M_read_reverse() requires bidirectional_iterator<_Iter> > + { > + if constexpr (sizeof(_FromFmt) == sizeof(uint8_t)) > + _M_read_reverse_utf8(); > + else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t)) > + _M_read_reverse_utf16(); > + else > + { > + static_assert(sizeof(_FromFmt) == sizeof(uint32_t)); > + _M_read_reverse_utf32(); > + } > + } > > template<typename> > struct _Guard > @@ -285,7 +301,7 @@ namespace __unicode > _It _M_orig; > }; > > - constexpr void > + constexpr char32_t > _M_read_utf8() > { > _Guard<_Iter> __g{this, _M_curr()}; > @@ -383,6 +399,8 @@ namespace __unicode > __c = _S_error(); > > _M_update(__c, __to_incr); > + > + return __c; > } > > constexpr void > @@ -425,6 +443,114 @@ namespace __unicode > _M_update(__c, 1); > } > > + constexpr void > + _M_read_reverse_utf8() requires bidirectional_iterator<_Iter> > + { > + const auto __first = _M_first(); > + auto __curr = _M_curr(); > + // The code point we decode: > + char32_t __c{}; > + // The last code unit read: > + uint8_t __u = *--__curr; > + // Count of bytes read: > + uint8_t __to_incr = 1; > + > + if (__u <= 0x7F) [[likely]] > + { > + _M_update(__u, 1); > + return; > + } > + > + // Continuation bytes match 10xxxxxx > + auto __is_continuation = [](uint8_t __b) { > + return (__b & 0xC0) == 0x80; > + }; > + // 0xC0 and 0xC1 would produce overlong encodings of ASCII > characters. > + // 0xF5-0xFF would produce code points above U+10FFFF > + auto __is_invalid = [](uint8_t __b) { > + return (__b & 0xFE) == 0xC0 || __b >= 0xF5; > + }; > + > + // No valid or invalid multibyte sequence is longer than 4 bytes, > + // so skip back over at most four bytes. > + while (__is_continuation(__u) && __to_incr < 4 && __curr != > __first) > + { > + ++__to_incr; > + __u = *--__curr; > + } > + > + // If the last byte read was a continuation byte then either we > read > + // four continuation bytes, or stopped at the start of the > sequence. > + // Either way, the maximal subparts are the individual continuation > + // bytes so each one should be replaced with U+FFFD. > + if (__is_continuation(__u) || __is_invalid(__u)) [[unlikely]] > I was wondering why you are checking, as it looked like optimizing the error path, but then realized that this is required to make __seq_length computation below valid. > + { > + // Either found four continuation bytes (maximum allowed is > three) > + // or first non-continuation byte is an invalid UTF-8 code > unit. > + _M_update(_S_error(), 1); > + return; > + } > + int __seq_length = std::countl_one((unsigned char)__u); > + if (__seq_length < __to_incr) [[unlikely]] > + { > + // If the expected number of continuation bytes is less than > + // the number we found, then the last continuation byte is a > + // maximal subpart and the decremented iterator points to it. > + _M_update(_S_error(), 1); > + return; > + } > + > + auto __orig = std::__exchange(_M_curr(), std::move(__curr)); > + if (_M_read_utf8() == _S_error()) [[unlikely]] > This is a nice trick, to not repeat all checks, and compute the actual value of code point. > + { > + if (_M_to_increment < __to_incr) // Read truncated sequence, > set > + _M_to_increment = 1; // curr to last continuation > byte. > + } > + > + _M_curr() = std::move(__orig); > + // operator--() will move back by _M_to_increment > + } > + > + constexpr void > + _M_read_reverse_utf16() requires bidirectional_iterator<_Iter> > + { > + _Guard<_Iter> __g{this, _M_curr()}; > + char32_t __c{}; > + uint16_t __u = *--_M_curr(); > + uint8_t __to_incr = 1; > + > + if (__u < 0xD800 || __u > 0xDFFF) [[likely]] > + __c = __u; > + else if (__u >= 0xDC00 && _M_curr() != _M_first()) [[likely]] > + { > + // read a low surrogate, expect a high surrogate before it. > + uint16_t __u2 = *--_M_curr(); > + if (__u2 < 0xD800 || __u2 > 0xDC00) [[unlikely]] > + __c = _S_error(); // unpaired low surrogate > + else > + { > + __to_incr = 2; > + uint32_t __x = (__u2 & 0x3F) << 10 | (__u & 0x3FF); > + uint32_t __w = (__u2 >> 6) & 0x1F; > + __c = (__w + 1) << 16 | __x; > + } > + } > + else > + __c = _S_error(); // unpaired surrogate > + > + _M_update(__c, __to_incr); > + } > + > + constexpr void > + _M_read_reverse_utf32() requires bidirectional_iterator<_Iter> > + { > + _Guard<_Iter> __g{this, _M_curr()}; > + char32_t __c = *--_M_curr(); > + if (!__is_scalar_value(__c)) [[unlikely]] > + __c = _S_error(); > + _M_update(__c, 1); > + } > + > // Encode the code point __c as one or more code units in _M_buf. > constexpr void > _M_update(char32_t __c, uint8_t __to_incr) > diff --git a/libstdc++-v3/testsuite/ext/unicode/view.cc > b/libstdc++-v3/testsuite/ext/unicode/view.cc > index 6f3c099bd84a..b90fd6b4c57f 100644 > --- a/libstdc++-v3/testsuite/ext/unicode/view.cc > +++ b/libstdc++-v3/testsuite/ext/unicode/view.cc > @@ -7,13 +7,24 @@ > namespace uc = std::__unicode; > using namespace std::string_view_literals; > > +template<std::ranges::range View> > +constexpr void > +compare(View v, std::basic_string_view<std::ranges::range_value_t<View>> > s) > +{ > + long size = s.size(); > + VERIFY( std::ranges::distance(v) == size ); > + VERIFY( std::ranges::equal(v, s) ); > + auto rev = std::views::reverse(v); > + VERIFY( std::ranges::distance(rev) == size ); > + VERIFY( std::ranges::equal(rev, s | std::views::reverse) ); > +} > + > constexpr void > test_utf8_to_utf8() > { > const auto s8 = u8"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv; > uc::_Utf8_view v(s8); > - VERIFY( std::ranges::distance(v) == s8.size() ); > - VERIFY( std::ranges::equal(v, s8) ); > + compare(v, s8); > } > > constexpr void > @@ -22,8 +33,7 @@ test_utf8_to_utf16() > const auto s8 = u8"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv; > const std::u16string_view s16 = u"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"; > uc::_Utf16_view v(s8); > - VERIFY( std::ranges::distance(v) == s16.size() ); > - VERIFY( std::ranges::equal(v, s16) ); > + compare(v, s16); > } > > constexpr void > @@ -32,36 +42,34 @@ test_utf8_to_utf32() > const auto s8 = u8"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv; > const auto s32 = U"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv; > uc::_Utf32_view v(s8); > - VERIFY( std::ranges::distance(v) == s32.size() ); > - VERIFY( std::ranges::equal(v, s32) ); > + compare(v, s32); > } > > constexpr void > test_illformed_utf8() > { > uc::_Utf32_view v("\xa3 10.99 \xee \xdd"sv); > - VERIFY( std::ranges::equal(v, U"\uFFFD 10.99 \uFFFD \uFFFD"sv) ); > + compare(v, U"\uFFFD 10.99 \uFFFD \uFFFD"sv); > > uc::_Utf16_view v2(" \xf8\x80\x80\x80 "sv); > - VERIFY( std::ranges::distance(v2) == 6 ); > - VERIFY( std::ranges::equal(v2, U" \uFFFD\uFFFD\uFFFD\uFFFD "sv) ); > + compare(v2, u" \uFFFD\uFFFD\uFFFD\uFFFD "sv); > > // Examples of U+FFFD substitution from Unicode standard. > uc::_Utf8_view v3("\xc0\xaf\xe0\x80\xbf\xf0\x81\x82\x41"sv); // Table > 3-8 > - VERIFY( std::ranges::equal(v3, > u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) ); > + compare(v3, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv); > uc::_Utf8_view v4("\xed\xa0\x80\xed\xbf\xbf\xed\xaf\x41"sv); // Table > 3-9 > - VERIFY( std::ranges::equal(v4, > u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) ); > + compare(v4, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv); > uc::_Utf8_view v5("\xf4\x91\x92\x93\xff\x41\x80\xbf\x42"sv); // Table > 3-10 > - VERIFY( std::ranges::equal(v5, > u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41\uFFFD\uFFFD\x42"sv) ); > + compare(v5, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41\uFFFD\uFFFD\x42"sv); > uc::_Utf8_view v6("\xe1\x80\xe2\xf0\x91\x92\xf1\xbf\x41"sv); // Table > 3-11 > - VERIFY( std::ranges::equal(v6, u8"\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) ); > + compare(v6, u8"\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv); > > uc::_Utf32_view v7("\xe1\x80"sv); > - VERIFY( std::ranges::equal(v7, U"\uFFFD"sv) ); > + compare(v7, U"\uFFFD"sv); > uc::_Utf32_view v8("\xf1\x80"sv); > - VERIFY( std::ranges::equal(v8, U"\uFFFD"sv) ); > + compare(v8, U"\uFFFD"sv); > uc::_Utf32_view v9("\xf1\x80\x80"sv); > - VERIFY( std::ranges::equal(v9, U"\uFFFD"sv) ); > + compare(v9, U"\uFFFD"sv); > } > > constexpr void > @@ -69,27 +77,27 @@ test_illformed_utf16() > { > std::u16string_view s = u"\N{CLOWN FACE}"; > std::u16string_view r = u"\uFFFD"; > - VERIFY( std::ranges::equal(uc::_Utf16_view(s.substr(0, 1)), r) ); > - VERIFY( std::ranges::equal(uc::_Utf16_view(s.substr(1, 1)), r) ); > + compare(uc::_Utf16_view(s.substr(0, 1)), r); > + compare(uc::_Utf16_view(s.substr(1, 1)), r); > std::array s2{ s[0], s[0] }; > - VERIFY( std::ranges::equal(uc::_Utf16_view(s2), u"\uFFFD\uFFFD"sv) ); > + compare(uc::_Utf16_view(s2), u"\uFFFD\uFFFD"sv); > std::array s3{ s[0], s[0], s[1] }; > - VERIFY( std::ranges::equal(uc::_Utf16_view(s3), u"\uFFFD\N{CLOWN > FACE}"sv) ); > + compare(uc::_Utf16_view(s3), u"\uFFFD\N{CLOWN FACE}"sv); > std::array s4{ s[1], s[0] }; > - VERIFY( std::ranges::equal(uc::_Utf16_view(s4), u"\uFFFD\uFFFD"sv) ); > + compare(uc::_Utf16_view(s4), u"\uFFFD\uFFFD"sv); > std::array s5{ s[1], s[0], s[1] }; > - VERIFY( std::ranges::equal(uc::_Utf16_view(s5), u"\uFFFD\N{CLOWN > FACE}"sv) ); > + compare(uc::_Utf16_view(s5), u"\uFFFD\N{CLOWN FACE}"sv); > } > > constexpr void > test_illformed_utf32() > { > std::u32string_view s = U"\x110000"; > - VERIFY( std::ranges::equal(uc::_Utf32_view(s), U"\uFFFD"sv) ); > + compare(uc::_Utf32_view(s), U"\uFFFD"sv); > s = U"\xFFFFFF"; > - VERIFY( std::ranges::equal(uc::_Utf32_view(s), U"\uFFFD"sv) ); > + compare(uc::_Utf32_view(s), U"\uFFFD"sv); > s = U"\xFFFFFFF0"; > - VERIFY( std::ranges::equal(uc::_Utf32_view(s), U"\uFFFD"sv) ); > + compare(uc::_Utf32_view(s), U"\uFFFD"sv); > } > > constexpr void > @@ -110,6 +118,13 @@ test_past_the_end() > iter++; > VERIFY( iter == v.end() ); > VERIFY( *iter == U'4' ); > + std::ranges::advance(iter, -4); > + VERIFY( *iter == U'1' ); > + // Incrementing before begin has well-defined behaviour. > + iter--; > + VERIFY( *iter == U'1' ); > + iter--; > + VERIFY( *iter == U'1' ); > > std::string_view empty; > uc::_Utf32_view v2(empty); > @@ -119,6 +134,9 @@ test_past_the_end() > iter++; > VERIFY( iter2 == v2.end() ); > VERIFY( *iter2 == U'\0' ); > + iter--; > + VERIFY( iter2 == v2.end() ); > + VERIFY( *iter2 == U'\0' ); > } > > int main() > -- > 2.50.1 > >