On Wed, Jul 16, 2025 at 12:05 PM Jonathan Wakely <jwak...@redhat.com> wrote:

> Add comments documenting what it does and how it does it.
>
> Also reorder the if-else in operator++ so that we check whether to
> iterate over code units in the local buffer before checking whether to
> refill that buffer. That seems the more natural way to structure the
> function.
>
> libstdc++-v3/ChangeLog:
>
>         * include/bits/unicode.h (__unicode::_Utf_iterator): Add
>         comments.
>         (__unicode:_Utf_iterator::operator++()): Check whether to
>         iterate over the buffer first.
> ---
>
> Tested x86_64-linux. Are these comments clear? Helpful enough?
>
I have used the  iterator without the comments, so my option is biased.
They are clear for me, except one, as noted below.

>
>  libstdc++-v3/include/bits/unicode.h | 48 +++++++++++++++++++++++------
>  1 file changed, 38 insertions(+), 10 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/unicode.h
> b/libstdc++-v3/include/bits/unicode.h
> index f1b6bf49c54c..d4e379938c20 100644
> --- a/libstdc++-v3/include/bits/unicode.h
> +++ b/libstdc++-v3/include/bits/unicode.h
> @@ -86,6 +86,25 @@ namespace __unicode
>        { return *__it == iter_value_t<_It>{}; }
>    };
>
> +  // An iterator over an input range of FromFmt code units that yields
> either
> +  // UTF-8, UTF-16, or UTF-32, as a range of ToFmt code units.
> +  // The code units from the input range are interpreted as Unicode code
> points
> +  // and the iterator produces the individual code unit for each code
> point.
> +  // Invalid sequences in the input are replaced with U+FFDD so that the
> result
> +  // is always valid UTF-8, UTF-16, or UTF-32.
> +  //
> +  // The iterator knows the bounds of the underlying input range and will
> not
> +  // read outside those bounds (incrementing or decrementing at the
> boundary
> +  // is erroneously idempotent).
> +  //
> +  // On construction, the iterator attemps to decode a single code point
> from
> +  // the input range and then encode it into an internal buffer in the
> output
> +  // format, e.g. if the input is UTF-8 and the output is UTF-16, it
> might read
> +  // three char8_t code units from the input and store two char16_t code
> units
> +  // in its buffer. Incrementing the iterator will first iterate over
> buffer,
> +  // yielding each code unit in turn, and then extract another code point
> from
> +  // the input. Failure to extract a valid code point from the input will
> store
> +  // U+FFFD in the buffer, encoded as the appropriate code units of type
> ToFmt.
>    template<typename _FromFmt, typename _ToFmt,
>            input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
>            typename _ErrorHandler = _Repl>
> @@ -162,17 +181,20 @@ namespace __unicode
>        constexpr _Utf_iterator&
>        operator++()
>        {
> -       if (_M_buf_index + 1 == _M_buf_last && _M_curr() != _M_last)
> +       if (_M_buf_index + 1 < _M_buf_last)
> +         ++_M_buf_index; // Move to the next code unit in the buffer.
> +       else if (_M_curr() != _M_last)
>           {
> +           // Advance past the current code point (for non-forward
> iterators
> +           // we already moved there after decoding the last code point).
>             if constexpr (forward_iterator<_Iter>)
>               std::advance(_M_curr(), _M_to_increment);
>             if (_M_curr() == _M_last)
>               _M_buf_index = 0;
> -           else
> +           else // Decode next code point from the input and update
> buffer.
>               _M_read();
>           }
> -       else if (_M_buf_index + 1 < _M_buf_last)
> -         ++_M_buf_index;
> +       // else erroneous, but ignored for now.
>         return *this;
>        }
>
> @@ -487,8 +509,11 @@ namespace __unicode
>        constexpr _Iter
>        _M_curr() const { return _M_first_and_curr._M_curr; }
>
> +      // Buffer holding the individual code units of the current code
> point.
>        array<value_type, 4 / sizeof(_ToFmt)> _M_buf;
>
> +      // As an optimization we do not store _M_first for input ranges,
> +      // because there is no need for it in a single-pass range.
>        template<typename _It>
>         struct _First_and_curr
>         {
> @@ -502,7 +527,7 @@ namespace __unicode
>             _First_and_curr(const _First_and_curr<_It2>& __other)
>             : _M_curr(__other._M_curr) { }
>
> -         _It _M_curr;
> +         _It _M_curr;  // First code unit of the current code point.
>
This is the iterator after the last code unit of the current_code point
for input_iterators.

>         };
>
>        template<typename _It> requires bidirectional_iterator<_It>
> @@ -519,16 +544,19 @@ namespace __unicode
>             _First_and_curr(const _First_and_curr<_It2>& __other)
>             : _M_first(__other._M_first), _M_curr(__other._M_curr) { }
>
> -         _It _M_first;
> -         _It _M_curr;
> +         _It _M_first; // Start of the underlying range.
> +         _It _M_curr;  // First code unit of the current code point.
>         };
>
> +      // Iterators pointing to the start of the underlying range and to
> the
> +      // first code unit of the current code point.
>        _First_and_curr<_Iter> _M_first_and_curr;
>
> -      uint8_t _M_buf_index = 0;
> -      uint8_t _M_buf_last = 0;
> -      uint8_t _M_to_increment = 0;
> +      uint8_t _M_buf_index = 0;    // Index of current code unit in the
> buffer.
> +      uint8_t _M_buf_last = 0;     // Number of code units in the buffer.
> +      uint8_t _M_to_increment = 0; // How far to advance _M_curr on
> increment.
>
> +      // The end of the underlying input range.
>        [[no_unique_address]] _Sent _M_last;
>
>        template<typename _FromFmt2, typename _ToFmt2,
> --
> 2.50.1
>
>

Reply via email to