Add comments documenting what it does and how it does it. Also reorder the if-else in operator++ so that we check whether to iterate over code units in the local buffer before checking whether to refill that buffer. That seems the more natural way to structure the function.
libstdc++-v3/ChangeLog: * include/bits/unicode.h (__unicode::_Utf_iterator): Add comments. (__unicode:_Utf_iterator::operator++()): Check whether to iterate over the buffer first. --- Tested x86_64-linux. Are these comments clear? Helpful enough? libstdc++-v3/include/bits/unicode.h | 48 +++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/libstdc++-v3/include/bits/unicode.h b/libstdc++-v3/include/bits/unicode.h index f1b6bf49c54c..d4e379938c20 100644 --- a/libstdc++-v3/include/bits/unicode.h +++ b/libstdc++-v3/include/bits/unicode.h @@ -86,6 +86,25 @@ namespace __unicode { return *__it == iter_value_t<_It>{}; } }; + // An iterator over an input range of FromFmt code units that yields either + // UTF-8, UTF-16, or UTF-32, as a range of ToFmt code units. + // The code units from the input range are interpreted as Unicode code points + // and the iterator produces the individual code unit for each code point. + // Invalid sequences in the input are replaced with U+FFDD so that the result + // is always valid UTF-8, UTF-16, or UTF-32. + // + // The iterator knows the bounds of the underlying input range and will not + // read outside those bounds (incrementing or decrementing at the boundary + // is erroneously idempotent). + // + // On construction, the iterator attemps to decode a single code point from + // the input range and then encode it into an internal buffer in the output + // format, e.g. if the input is UTF-8 and the output is UTF-16, it might read + // three char8_t code units from the input and store two char16_t code units + // in its buffer. Incrementing the iterator will first iterate over buffer, + // yielding each code unit in turn, and then extract another code point from + // the input. Failure to extract a valid code point from the input will store + // U+FFFD in the buffer, encoded as the appropriate code units of type ToFmt. template<typename _FromFmt, typename _ToFmt, input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter, typename _ErrorHandler = _Repl> @@ -162,17 +181,20 @@ namespace __unicode constexpr _Utf_iterator& operator++() { - if (_M_buf_index + 1 == _M_buf_last && _M_curr() != _M_last) + if (_M_buf_index + 1 < _M_buf_last) + ++_M_buf_index; // Move to the next code unit in the buffer. + else if (_M_curr() != _M_last) { + // Advance past the current code point (for non-forward iterators + // we already moved there after decoding the last code point). if constexpr (forward_iterator<_Iter>) std::advance(_M_curr(), _M_to_increment); if (_M_curr() == _M_last) _M_buf_index = 0; - else + else // Decode next code point from the input and update buffer. _M_read(); } - else if (_M_buf_index + 1 < _M_buf_last) - ++_M_buf_index; + // else erroneous, but ignored for now. return *this; } @@ -487,8 +509,11 @@ namespace __unicode constexpr _Iter _M_curr() const { return _M_first_and_curr._M_curr; } + // Buffer holding the individual code units of the current code point. array<value_type, 4 / sizeof(_ToFmt)> _M_buf; + // As an optimization we do not store _M_first for input ranges, + // because there is no need for it in a single-pass range. template<typename _It> struct _First_and_curr { @@ -502,7 +527,7 @@ namespace __unicode _First_and_curr(const _First_and_curr<_It2>& __other) : _M_curr(__other._M_curr) { } - _It _M_curr; + _It _M_curr; // First code unit of the current code point. }; template<typename _It> requires bidirectional_iterator<_It> @@ -519,16 +544,19 @@ namespace __unicode _First_and_curr(const _First_and_curr<_It2>& __other) : _M_first(__other._M_first), _M_curr(__other._M_curr) { } - _It _M_first; - _It _M_curr; + _It _M_first; // Start of the underlying range. + _It _M_curr; // First code unit of the current code point. }; + // Iterators pointing to the start of the underlying range and to the + // first code unit of the current code point. _First_and_curr<_Iter> _M_first_and_curr; - uint8_t _M_buf_index = 0; - uint8_t _M_buf_last = 0; - uint8_t _M_to_increment = 0; + uint8_t _M_buf_index = 0; // Index of current code unit in the buffer. + uint8_t _M_buf_last = 0; // Number of code units in the buffer. + uint8_t _M_to_increment = 0; // How far to advance _M_curr on increment. + // The end of the underlying input range. [[no_unique_address]] _Sent _M_last; template<typename _FromFmt2, typename _ToFmt2, -- 2.50.1