On Wed, Jul 16, 2025 at 12:05 PM Jonathan Wakely <jwak...@redhat.com> wrote:
> Add comments documenting what it does and how it does it. > > Also reorder the if-else in operator++ so that we check whether to > iterate over code units in the local buffer before checking whether to > refill that buffer. That seems the more natural way to structure the > function. > > libstdc++-v3/ChangeLog: > > * include/bits/unicode.h (__unicode::_Utf_iterator): Add > comments. > (__unicode:_Utf_iterator::operator++()): Check whether to > iterate over the buffer first. > --- > > Tested x86_64-linux. Are these comments clear? Helpful enough? > I have used the iterator without the comments, so my option is biased. They are clear for me, except one, as noted below. > > libstdc++-v3/include/bits/unicode.h | 48 +++++++++++++++++++++++------ > 1 file changed, 38 insertions(+), 10 deletions(-) > > diff --git a/libstdc++-v3/include/bits/unicode.h > b/libstdc++-v3/include/bits/unicode.h > index f1b6bf49c54c..d4e379938c20 100644 > --- a/libstdc++-v3/include/bits/unicode.h > +++ b/libstdc++-v3/include/bits/unicode.h > @@ -86,6 +86,25 @@ namespace __unicode > { return *__it == iter_value_t<_It>{}; } > }; > > + // An iterator over an input range of FromFmt code units that yields > either > + // UTF-8, UTF-16, or UTF-32, as a range of ToFmt code units. > + // The code units from the input range are interpreted as Unicode code > points > + // and the iterator produces the individual code unit for each code > point. > + // Invalid sequences in the input are replaced with U+FFDD so that the > result > + // is always valid UTF-8, UTF-16, or UTF-32. > + // > + // The iterator knows the bounds of the underlying input range and will > not > + // read outside those bounds (incrementing or decrementing at the > boundary > + // is erroneously idempotent). > + // > + // On construction, the iterator attemps to decode a single code point > from > + // the input range and then encode it into an internal buffer in the > output > + // format, e.g. if the input is UTF-8 and the output is UTF-16, it > might read > + // three char8_t code units from the input and store two char16_t code > units > + // in its buffer. Incrementing the iterator will first iterate over > buffer, > + // yielding each code unit in turn, and then extract another code point > from > + // the input. Failure to extract a valid code point from the input will > store > + // U+FFFD in the buffer, encoded as the appropriate code units of type > ToFmt. > template<typename _FromFmt, typename _ToFmt, > input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter, > typename _ErrorHandler = _Repl> > @@ -162,17 +181,20 @@ namespace __unicode > constexpr _Utf_iterator& > operator++() > { > - if (_M_buf_index + 1 == _M_buf_last && _M_curr() != _M_last) > + if (_M_buf_index + 1 < _M_buf_last) > + ++_M_buf_index; // Move to the next code unit in the buffer. > + else if (_M_curr() != _M_last) > { > + // Advance past the current code point (for non-forward > iterators > + // we already moved there after decoding the last code point). > if constexpr (forward_iterator<_Iter>) > std::advance(_M_curr(), _M_to_increment); > if (_M_curr() == _M_last) > _M_buf_index = 0; > - else > + else // Decode next code point from the input and update > buffer. > _M_read(); > } > - else if (_M_buf_index + 1 < _M_buf_last) > - ++_M_buf_index; > + // else erroneous, but ignored for now. > return *this; > } > > @@ -487,8 +509,11 @@ namespace __unicode > constexpr _Iter > _M_curr() const { return _M_first_and_curr._M_curr; } > > + // Buffer holding the individual code units of the current code > point. > array<value_type, 4 / sizeof(_ToFmt)> _M_buf; > > + // As an optimization we do not store _M_first for input ranges, > + // because there is no need for it in a single-pass range. > template<typename _It> > struct _First_and_curr > { > @@ -502,7 +527,7 @@ namespace __unicode > _First_and_curr(const _First_and_curr<_It2>& __other) > : _M_curr(__other._M_curr) { } > > - _It _M_curr; > + _It _M_curr; // First code unit of the current code point. > This is the iterator after the last code unit of the current_code point for input_iterators. > }; > > template<typename _It> requires bidirectional_iterator<_It> > @@ -519,16 +544,19 @@ namespace __unicode > _First_and_curr(const _First_and_curr<_It2>& __other) > : _M_first(__other._M_first), _M_curr(__other._M_curr) { } > > - _It _M_first; > - _It _M_curr; > + _It _M_first; // Start of the underlying range. > + _It _M_curr; // First code unit of the current code point. > }; > > + // Iterators pointing to the start of the underlying range and to > the > + // first code unit of the current code point. > _First_and_curr<_Iter> _M_first_and_curr; > > - uint8_t _M_buf_index = 0; > - uint8_t _M_buf_last = 0; > - uint8_t _M_to_increment = 0; > + uint8_t _M_buf_index = 0; // Index of current code unit in the > buffer. > + uint8_t _M_buf_last = 0; // Number of code units in the buffer. > + uint8_t _M_to_increment = 0; // How far to advance _M_curr on > increment. > > + // The end of the underlying input range. > [[no_unique_address]] _Sent _M_last; > > template<typename _FromFmt2, typename _ToFmt2, > -- > 2.50.1 > >