On Thu, Jul 17, 2025 at 7:02 PM Jonathan Wakely <jwak...@redhat.com> wrote:

> This implements the missing functions in _Utf_iterator to support
> reverse iteration. All existing tests pass when the view is reversed, so
> that the same code units are seen when iterating forwards or backwards.
>
> libstdc++-v3/ChangeLog:
>
>         * include/bits/unicode.h (_Utf_iterator::operator--): Reorder
>         conditions and update position after reading a code unit.
>         (_Utf_iterator::_M_read_reverse): Define.
>         (_Utf_iterator::_M_read_utf8): Return extracted code point.
>         (_Utf_iterator::_M_read_reverse_utf8): Define.
>         (_Utf_iterator::_M_read_reverse_utf16): Define.
>         (_Utf_iterator::_M_read_reverse_utf32): Define.
>         * testsuite/ext/unicode/view.cc: Add checks for reversed views
>         and reverse iteration.
> ---
>
> Tested x86_64-linux.
>
Please add a test where we have a string that contains only utf8
continuation code-points.
Outside that LGTM.

>
>  libstdc++-v3/include/bits/unicode.h        | 136 ++++++++++++++++++++-
>  libstdc++-v3/testsuite/ext/unicode/view.cc |  68 +++++++----
>  2 files changed, 174 insertions(+), 30 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/unicode.h
> b/libstdc++-v3/include/bits/unicode.h
> index 9a38462e8102..9ff019594b7b 100644
> --- a/libstdc++-v3/include/bits/unicode.h
> +++ b/libstdc++-v3/include/bits/unicode.h
> @@ -209,10 +209,15 @@ namespace __unicode
>        constexpr _Utf_iterator&
>        operator--() requires bidirectional_iterator<_Iter>
>        {
> -       if (!_M_buf_index && _M_curr() != _M_first())
> -         _M_read_reverse();
> -       else if (_M_buf_index)
> +       if (_M_buf_index > 0)
>           --_M_buf_index;
> +       else if (_M_curr() != _M_first())
> +         {
> +           _M_read_reverse();
> +           _M_buf_index = _M_buf_last - 1;
> +           ranges::advance(_M_curr(), -_M_to_increment);
> +         }
> +       // else erroneous, but ignored for now.
>         return *this;
>        }
>
> @@ -269,7 +274,18 @@ namespace __unicode
>        }
>
>        constexpr void
> -      _M_read_reverse(); // TODO
> +      _M_read_reverse() requires bidirectional_iterator<_Iter>
> +      {
> +       if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
> +         _M_read_reverse_utf8();
> +       else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
> +         _M_read_reverse_utf16();
> +       else
> +         {
> +           static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
> +           _M_read_reverse_utf32();
> +         }
> +      }
>
>        template<typename>
>         struct _Guard
> @@ -285,7 +301,7 @@ namespace __unicode
>           _It _M_orig;
>         };
>
> -      constexpr void
> +      constexpr char32_t
>        _M_read_utf8()
>        {
>         _Guard<_Iter> __g{this, _M_curr()};
> @@ -383,6 +399,8 @@ namespace __unicode
>           __c = _S_error();
>
>         _M_update(__c, __to_incr);
> +
> +       return __c;
>        }
>
>        constexpr void
> @@ -425,6 +443,114 @@ namespace __unicode
>         _M_update(__c, 1);
>        }
>
> +      constexpr void
> +      _M_read_reverse_utf8() requires bidirectional_iterator<_Iter>
> +      {
> +       const auto __first = _M_first();
> +       auto __curr = _M_curr();
> +       // The code point we decode:
> +       char32_t __c{};
> +       // The last code unit read:
> +       uint8_t __u = *--__curr;
> +       // Count of bytes read:
> +       uint8_t __to_incr = 1;
> +
> +       if (__u <= 0x7F) [[likely]]
> +         {
> +           _M_update(__u, 1);
> +           return;
> +         }
> +
> +       // Continuation bytes match 10xxxxxx
> +       auto __is_continuation = [](uint8_t __b) {
> +         return (__b & 0xC0) == 0x80;
> +       };
> +       // 0xC0 and 0xC1 would produce overlong encodings of ASCII
> characters.
> +       // 0xF5-0xFF would produce code points above U+10FFFF
> +       auto __is_invalid = [](uint8_t __b) {
> +         return (__b & 0xFE) == 0xC0 || __b >= 0xF5;
> +       };
> +
> +       // No valid or invalid multibyte sequence is longer than 4 bytes,
> +       // so skip back over at most four bytes.
> +       while (__is_continuation(__u) && __to_incr < 4 && __curr !=
> __first)
> +         {
> +           ++__to_incr;
> +           __u = *--__curr;
> +         }
> +
> +       // If the last byte read was a continuation byte then either we
> read
> +       // four continuation bytes, or stopped at the start of the
> sequence.
> +       // Either way, the maximal subparts are the individual continuation
> +       // bytes so each one should be replaced with U+FFFD.
> +       if (__is_continuation(__u) || __is_invalid(__u)) [[unlikely]]
>
I was wondering why you are checking, as it looked like optimizing the
error path,
but then realized that this is required to make __seq_length computation
below
valid.

> +         {
> +           // Either found four continuation bytes (maximum allowed is
> three)
> +           // or first non-continuation byte is an invalid UTF-8 code
> unit.
> +           _M_update(_S_error(), 1);
> +           return;
> +         }
> +       int __seq_length = std::countl_one((unsigned char)__u);
> +       if (__seq_length < __to_incr) [[unlikely]]
> +         {
> +           // If the expected number of continuation bytes is less than
> +           // the number we found, then the last continuation byte is a
> +           // maximal subpart and the decremented iterator points to it.
> +           _M_update(_S_error(), 1);
> +           return;
> +         }
> +
> +       auto __orig = std::__exchange(_M_curr(), std::move(__curr));
> +       if (_M_read_utf8() == _S_error()) [[unlikely]]
>
This is a nice trick, to not repeat all checks, and compute the actual
value of
code point.

> +         {
> +           if (_M_to_increment < __to_incr) // Read truncated sequence,
> set
> +             _M_to_increment = 1;           // curr to last continuation
> byte.
> +         }
> +
> +       _M_curr() = std::move(__orig);
> +       // operator--() will move back by _M_to_increment
> +      }
> +
> +      constexpr void
> +      _M_read_reverse_utf16() requires bidirectional_iterator<_Iter>
> +      {
> +       _Guard<_Iter> __g{this, _M_curr()};
> +       char32_t __c{};
> +       uint16_t __u = *--_M_curr();
> +       uint8_t __to_incr = 1;
> +
> +       if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
> +         __c = __u;
> +       else if (__u >= 0xDC00 && _M_curr() != _M_first()) [[likely]]
> +         {
> +           // read a low surrogate, expect a high surrogate before it.
> +           uint16_t __u2 = *--_M_curr();
> +           if (__u2 < 0xD800 || __u2 > 0xDC00) [[unlikely]]
> +             __c = _S_error(); // unpaired low surrogate
> +           else
> +             {
> +               __to_incr = 2;
> +               uint32_t __x = (__u2 & 0x3F) << 10 | (__u & 0x3FF);
> +               uint32_t __w = (__u2 >> 6) & 0x1F;
> +               __c = (__w + 1) << 16 | __x;
> +             }
> +         }
> +       else
> +         __c = _S_error(); // unpaired surrogate
> +
> +       _M_update(__c, __to_incr);
> +      }
> +
> +      constexpr void
> +      _M_read_reverse_utf32() requires bidirectional_iterator<_Iter>
> +      {
> +       _Guard<_Iter> __g{this, _M_curr()};
> +       char32_t __c = *--_M_curr();
> +       if (!__is_scalar_value(__c)) [[unlikely]]
> +         __c = _S_error();
> +       _M_update(__c, 1);
> +      }
> +
>        // Encode the code point __c as one or more code units in _M_buf.
>        constexpr void
>        _M_update(char32_t __c, uint8_t __to_incr)
> diff --git a/libstdc++-v3/testsuite/ext/unicode/view.cc
> b/libstdc++-v3/testsuite/ext/unicode/view.cc
> index 6f3c099bd84a..b90fd6b4c57f 100644
> --- a/libstdc++-v3/testsuite/ext/unicode/view.cc
> +++ b/libstdc++-v3/testsuite/ext/unicode/view.cc
> @@ -7,13 +7,24 @@
>  namespace uc = std::__unicode;
>  using namespace std::string_view_literals;
>
> +template<std::ranges::range View>
> +constexpr void
> +compare(View v, std::basic_string_view<std::ranges::range_value_t<View>>
> s)
> +{
> +  long size = s.size();
> +  VERIFY( std::ranges::distance(v) == size );
> +  VERIFY( std::ranges::equal(v,  s) );
> +  auto rev = std::views::reverse(v);
> +  VERIFY( std::ranges::distance(rev) == size );
> +  VERIFY( std::ranges::equal(rev,  s | std::views::reverse) );
> +}
> +
>  constexpr void
>  test_utf8_to_utf8()
>  {
>    const auto s8 = u8"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv;
>    uc::_Utf8_view v(s8);
> -  VERIFY( std::ranges::distance(v) == s8.size() );
> -  VERIFY( std::ranges::equal(v,  s8) );
> +  compare(v, s8);
>  }
>
>  constexpr void
> @@ -22,8 +33,7 @@ test_utf8_to_utf16()
>    const auto s8  = u8"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv;
>    const std::u16string_view s16 = u"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡";
>    uc::_Utf16_view v(s8);
> -  VERIFY( std::ranges::distance(v) == s16.size() );
> -  VERIFY( std::ranges::equal(v,  s16) );
> +  compare(v, s16);
>  }
>
>  constexpr void
> @@ -32,36 +42,34 @@ test_utf8_to_utf32()
>    const auto s8 = u8"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv;
>    const auto s32 = U"£🇬🇧 €🇪🇺 æбçδé ♠♥♦♣ 🤡"sv;
>    uc::_Utf32_view v(s8);
> -  VERIFY( std::ranges::distance(v) == s32.size() );
> -  VERIFY( std::ranges::equal(v,  s32) );
> +  compare(v, s32);
>  }
>
>  constexpr void
>  test_illformed_utf8()
>  {
>    uc::_Utf32_view v("\xa3 10.99 \xee \xdd"sv);
> -  VERIFY( std::ranges::equal(v, U"\uFFFD 10.99 \uFFFD \uFFFD"sv) );
> +  compare(v, U"\uFFFD 10.99 \uFFFD \uFFFD"sv);
>
>    uc::_Utf16_view v2(" \xf8\x80\x80\x80 "sv);
> -  VERIFY( std::ranges::distance(v2) == 6 );
> -  VERIFY( std::ranges::equal(v2, U" \uFFFD\uFFFD\uFFFD\uFFFD "sv) );
> +  compare(v2, u" \uFFFD\uFFFD\uFFFD\uFFFD "sv);
>
>    // Examples of U+FFFD substitution from Unicode standard.
>    uc::_Utf8_view v3("\xc0\xaf\xe0\x80\xbf\xf0\x81\x82\x41"sv); // Table
> 3-8
> -  VERIFY( std::ranges::equal(v3,
> u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) );
> +  compare(v3, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv);
>    uc::_Utf8_view v4("\xed\xa0\x80\xed\xbf\xbf\xed\xaf\x41"sv); // Table
> 3-9
> -  VERIFY( std::ranges::equal(v4,
> u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) );
> +  compare(v4, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv);
>    uc::_Utf8_view v5("\xf4\x91\x92\x93\xff\x41\x80\xbf\x42"sv); // Table
> 3-10
> -  VERIFY( std::ranges::equal(v5,
> u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41\uFFFD\uFFFD\x42"sv) );
> +  compare(v5, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41\uFFFD\uFFFD\x42"sv);
>    uc::_Utf8_view v6("\xe1\x80\xe2\xf0\x91\x92\xf1\xbf\x41"sv); // Table
> 3-11
> -  VERIFY( std::ranges::equal(v6, u8"\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) );
> +  compare(v6, u8"\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv);
>
>    uc::_Utf32_view v7("\xe1\x80"sv);
> -  VERIFY( std::ranges::equal(v7, U"\uFFFD"sv) );
> +  compare(v7,  U"\uFFFD"sv);
>    uc::_Utf32_view v8("\xf1\x80"sv);
> -  VERIFY( std::ranges::equal(v8, U"\uFFFD"sv) );
> +  compare(v8,  U"\uFFFD"sv);
>    uc::_Utf32_view v9("\xf1\x80\x80"sv);
> -  VERIFY( std::ranges::equal(v9, U"\uFFFD"sv) );
> +  compare(v9,  U"\uFFFD"sv);
>  }
>
>  constexpr void
> @@ -69,27 +77,27 @@ test_illformed_utf16()
>  {
>    std::u16string_view s = u"\N{CLOWN FACE}";
>    std::u16string_view r = u"\uFFFD";
> -  VERIFY( std::ranges::equal(uc::_Utf16_view(s.substr(0, 1)), r) );
> -  VERIFY( std::ranges::equal(uc::_Utf16_view(s.substr(1, 1)), r) );
> +  compare(uc::_Utf16_view(s.substr(0, 1)), r);
> +  compare(uc::_Utf16_view(s.substr(1, 1)), r);
>    std::array s2{ s[0], s[0] };
> -  VERIFY( std::ranges::equal(uc::_Utf16_view(s2), u"\uFFFD\uFFFD"sv) );
> +  compare(uc::_Utf16_view(s2), u"\uFFFD\uFFFD"sv);
>    std::array s3{ s[0], s[0], s[1] };
> -  VERIFY( std::ranges::equal(uc::_Utf16_view(s3), u"\uFFFD\N{CLOWN
> FACE}"sv) );
> +  compare(uc::_Utf16_view(s3), u"\uFFFD\N{CLOWN FACE}"sv);
>    std::array s4{ s[1], s[0] };
> -  VERIFY( std::ranges::equal(uc::_Utf16_view(s4), u"\uFFFD\uFFFD"sv) );
> +  compare(uc::_Utf16_view(s4), u"\uFFFD\uFFFD"sv);
>    std::array s5{ s[1], s[0], s[1] };
> -  VERIFY( std::ranges::equal(uc::_Utf16_view(s5), u"\uFFFD\N{CLOWN
> FACE}"sv) );
> +  compare(uc::_Utf16_view(s5), u"\uFFFD\N{CLOWN FACE}"sv);
>  }
>
>  constexpr void
>  test_illformed_utf32()
>  {
>    std::u32string_view s = U"\x110000";
> -  VERIFY( std::ranges::equal(uc::_Utf32_view(s), U"\uFFFD"sv) );
> +  compare(uc::_Utf32_view(s), U"\uFFFD"sv);
>    s = U"\xFFFFFF";
> -  VERIFY( std::ranges::equal(uc::_Utf32_view(s), U"\uFFFD"sv) );
> +  compare(uc::_Utf32_view(s), U"\uFFFD"sv);
>    s = U"\xFFFFFFF0";
> -  VERIFY( std::ranges::equal(uc::_Utf32_view(s), U"\uFFFD"sv) );
> +  compare(uc::_Utf32_view(s), U"\uFFFD"sv);
>  }
>
>  constexpr void
> @@ -110,6 +118,13 @@ test_past_the_end()
>    iter++;
>    VERIFY( iter == v.end() );
>    VERIFY( *iter == U'4' );
> +  std::ranges::advance(iter, -4);
> +  VERIFY( *iter == U'1' );
> +  // Incrementing before begin has well-defined behaviour.
> +  iter--;
> +  VERIFY( *iter == U'1' );
> +  iter--;
> +  VERIFY( *iter == U'1' );
>
>    std::string_view empty;
>    uc::_Utf32_view v2(empty);
> @@ -119,6 +134,9 @@ test_past_the_end()
>    iter++;
>    VERIFY( iter2 == v2.end() );
>    VERIFY( *iter2 == U'\0' );
> +  iter--;
> +  VERIFY( iter2 == v2.end() );
> +  VERIFY( *iter2 == U'\0' );
>  }
>
>  int main()
> --
> 2.50.1
>
>

Reply via email to