Tested powerpc64le-linux, pushed to trunk.
std::regex currently allows invalid bracket ranges such as [\w-a] which are only allowed by ECMAScript when in web browser compatibility mode. It should be an error, because the start of the range is a character class, not a single character. The current implementation of _Compiler::_M_expression_term does not provide a way to reject this, because we only remember a previous character, not whether we just processed a character class (or collating symbol etc.) This patch replaces the pair<bool, CharT> used to emulate optional<CharT> with a custom class closer to pair<tribool,CharT>. That allows us to track three states, so that we can tell when we've just seen a character class. With this additional state the code in _M_expression_term for processing the _S_token_bracket_dash can be improved to correctly reject the [\w-a] case, without regressing for valid cases such as [\w-] and [----]. libstdc++-v3/ChangeLog: PR libstdc++/102447 * include/bits/regex_compiler.h (_Compiler::_BracketState): New class. (_Compiler::_BrackeyMatcher): New alias template. (_Compiler::_M_expression_term): Change pair<bool, CharT> parameter to _BracketState. Process first character for ECMAScript syntax as well as POSIX. * include/bits/regex_compiler.tcc (_Compiler::_M_insert_bracket_matcher): Pass _BracketState. (_Compiler::_M_expression_term): Use _BracketState to store state between calls. Improve handling of dashes in ranges. * testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc: Add more tests for ranges containing dashes. Check invalid ranges with character class at the beginning. --- libstdc++-v3/include/bits/regex_compiler.h | 40 +++++- libstdc++-v3/include/bits/regex_compiler.tcc | 118 ++++++++---------- .../regex_match/cstring_bracket_01.cc | 62 ++++++++- 3 files changed, 152 insertions(+), 68 deletions(-) diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 88c60c2bed7..06cb48f2b6d 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -121,13 +121,45 @@ namespace __detail void _M_insert_bracket_matcher(bool __neg); - // Returns true if successfully matched one term and should continue. + // Cache of the last atom seen in a bracketed range expression. + struct _BracketState + { + enum class _Type : char { _None, _Char, _Class } _M_type = _Type::_None; + _CharT _M_char; + + void + set(_CharT __c) noexcept { _M_type = _Type::_Char; _M_char = __c; } + + _GLIBCXX_NODISCARD _CharT + get() const noexcept { return _M_char; } + + void + reset(_Type __t = _Type::_None) noexcept { _M_type = __t; } + + explicit operator bool() const noexcept + { return _M_type != _Type::_None; } + + // Previous token was a single character. + _GLIBCXX_NODISCARD bool + _M_is_char() const noexcept { return _M_type == _Type::_Char; } + + // Previous token was a character class, equivalent class, + // collating symbol etc. + _GLIBCXX_NODISCARD bool + _M_is_class() const noexcept { return _M_type == _Type::_Class; } + }; + + template<bool __icase, bool __collate> + using _BracketMatcher + = std::__detail::_BracketMatcher<_TraitsT, __icase, __collate>; + + // Returns true if successfully parsed one term and should continue + // compiling a bracket expression. // Returns false if the compiler should move on. template<bool __icase, bool __collate> bool - _M_expression_term(pair<bool, _CharT>& __last_char, - _BracketMatcher<_TraitsT, __icase, __collate>& - __matcher); + _M_expression_term(_BracketState& __last_char, + _BracketMatcher<__icase, __collate>& __matcher); int _M_cur_int_value(int __radix); diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 0e2e1321376..9000aec8e25 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -403,7 +403,7 @@ namespace __detail _M_insert_character_class_matcher() { __glibcxx_assert(_M_value.size() == 1); - _BracketMatcher<_TraitsT, __icase, __collate> __matcher + _BracketMatcher<__icase, __collate> __matcher (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); __matcher._M_add_character_class(_M_value, false); __matcher._M_ready(); @@ -417,26 +417,17 @@ namespace __detail _Compiler<_TraitsT>:: _M_insert_bracket_matcher(bool __neg) { - _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); - pair<bool, _CharT> __last_char; // Optional<_CharT> - __last_char.first = false; - if (!(_M_flags & regex_constants::ECMAScript)) - { - if (_M_try_char()) - { - __last_char.first = true; - __last_char.second = _M_value[0]; - } - else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) - { - __last_char.first = true; - __last_char.second = '-'; - } - } + _BracketMatcher<__icase, __collate> __matcher(__neg, _M_traits); + _BracketState __last_char; + if (_M_try_char()) + __last_char.set(_M_value[0]); + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) + // Dash as first character is a normal character. + __last_char.set('-'); while (_M_expression_term(__last_char, __matcher)) ; - if (__last_char.first) - __matcher._M_add_char(__last_char.second); + if (__last_char._M_is_char()) + __matcher._M_add_char(__last_char.get()); __matcher._M_ready(); _M_stack.push(_StateSeqT( *_M_nfa, @@ -447,27 +438,27 @@ namespace __detail template<bool __icase, bool __collate> bool _Compiler<_TraitsT>:: - _M_expression_term(pair<bool, _CharT>& __last_char, - _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) + _M_expression_term(_BracketState& __last_char, + _BracketMatcher<__icase, __collate>& __matcher) { if (_M_match_token(_ScannerT::_S_token_bracket_end)) return false; + // Add any previously cached char into the matcher and update cache. const auto __push_char = [&](_CharT __ch) { - if (__last_char.first) - __matcher._M_add_char(__last_char.second); - else - __last_char.first = true; - __last_char.second = __ch; + if (__last_char._M_is_char()) + __matcher._M_add_char(__last_char.get()); + __last_char.set(__ch); }; - const auto __flush = [&] + // Add any previously cached char into the matcher and update cache. + const auto __push_class = [&] { - if (__last_char.first) - { - __matcher._M_add_char(__last_char.second); - __last_char.first = false; - } + if (__last_char._M_is_char()) + __matcher._M_add_char(__last_char.get()); + // We don't cache anything here, just record that the last thing + // processed was a character class (or similar). + __last_char.reset(_BracketState::_Type::_Class); }; if (_M_match_token(_ScannerT::_S_token_collsymbol)) @@ -476,16 +467,16 @@ namespace __detail if (__symbol.size() == 1) __push_char(__symbol[0]); else - __flush(); + __push_class(); } else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) { - __flush(); + __push_class(); __matcher._M_add_equivalence_class(_M_value); } else if (_M_match_token(_ScannerT::_S_token_char_class_name)) { - __flush(); + __push_class(); __matcher._M_add_character_class(_M_value, false); } else if (_M_try_char()) @@ -502,49 +493,50 @@ namespace __detail // It turns out that no one reads BNFs ;) else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) { - if (!__last_char.first) + if (_M_match_token(_ScannerT::_S_token_bracket_end)) { - if (!(_M_flags & regex_constants::ECMAScript)) - { - if (_M_match_token(_ScannerT::_S_token_bracket_end)) - { - __push_char('-'); - return false; - } - __throw_regex_error( - regex_constants::error_range, - "Unexpected dash in bracket expression. For POSIX syntax, " - "a dash is not treated literally only when it is at " - "beginning or end."); - } + // For "-]" the dash is a literal character. __push_char('-'); + return false; } - else + else if (__last_char._M_is_class()) + { + // "\\w-" is invalid, start of range must be a single char. + __throw_regex_error(regex_constants::error_range, + "Invalid start of range in bracket expression."); + } + else if (__last_char._M_is_char()) { if (_M_try_char()) { - __matcher._M_make_range(__last_char.second, _M_value[0]); - __last_char.first = false; + // "x-y" + __matcher._M_make_range(__last_char.get(), _M_value[0]); + __last_char.reset(); } else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) { - __matcher._M_make_range(__last_char.second, '-'); - __last_char.first = false; + // "x--" + __matcher._M_make_range(__last_char.get(), '-'); + __last_char.reset(); } else - { - if (_M_scanner._M_get_token() - != _ScannerT::_S_token_bracket_end) - __throw_regex_error( - regex_constants::error_range, - "Character is expected after a dash."); - __push_char('-'); - } + __throw_regex_error(regex_constants::error_range, + "Invalid end of range in bracket expression."); } + else if (_M_flags & regex_constants::ECMAScript) + { + // A dash that is not part of an existing range. Might be the + // start of a new range, or might just be a literal '-' char. + // Only ECMAScript allows that in the middle of a bracket expr. + __push_char('-'); + } + else + __throw_regex_error(regex_constants::error_range, + "Invalid dash in bracket expression."); } else if (_M_match_token(_ScannerT::_S_token_quoted_class)) { - __flush(); + __push_class(); __matcher._M_add_character_class(_M_value, _M_ctype.is(_CtypeT::upper, _M_value[0])); diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc index 7df70604ea6..0d76e63da7b 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc @@ -69,6 +69,16 @@ test01() void test02() { + VERIFY(regex_match("-", regex("[-]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[--]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[---]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[----]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[-----]", regex_constants::ECMAScript))); + + VERIFY(regex_match("-", regex("[-]", regex_constants::extended))); + VERIFY(regex_match("-", regex("[--]", regex_constants::extended))); + VERIFY(regex_match("-", regex("[---]", regex_constants::extended))); + VERIFY(regex_match("-", regex("[----]", regex_constants::extended))); try { std::regex re("[-----]", std::regex::extended); @@ -78,7 +88,6 @@ test02() { VERIFY(e.code() == std::regex_constants::error_range); } - std::regex re("[-----]", std::regex::ECMAScript); VERIFY(!regex_match("b", regex("[-ac]", regex_constants::extended))); VERIFY(!regex_match("b", regex("[ac-]", regex_constants::extended))); @@ -93,7 +102,27 @@ test02() } catch (const std::regex_error& e) { + VERIFY(e.code() == std::regex_constants::error_range); } + try + { + regex("[@--]", regex_constants::extended); + VERIFY(false); + } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } + try + { + regex("[--%]", regex_constants::extended); + VERIFY(false); + } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } + VERIFY(regex_match("].", regex("[][.hyphen.]-0]*", regex_constants::extended))); } @@ -158,6 +187,36 @@ test06() VERIFY(regex_match("a-", debian_cron_namespace_ok)); } +// libstdc++/102447 +void +test07() +{ + VERIFY(regex_match("-", std::regex("[\\w-]", std::regex::ECMAScript))); + VERIFY(regex_match("a", std::regex("[\\w-]", std::regex::ECMAScript))); + VERIFY(regex_match("-", std::regex("[a-]", std::regex::ECMAScript))); + VERIFY(regex_match("a", std::regex("[a-]", std::regex::ECMAScript))); + + try + { + std::regex re("[\\w-a]", std::regex::ECMAScript); + VERIFY(false); + } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } + + try + { + std::regex re("[\\w--]", std::regex::ECMAScript); + VERIFY(false); + } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } +} + int main() { @@ -167,6 +226,7 @@ main() test04(); test05(); test06(); + test07(); return 0; } -- 2.31.1