Hi, I've been looking through the regex code and have a few ideas for simplifications or optimisations that I'd like to share.
This first patch is for _BracketMatcher. We only use std::bitset when is_same<_CharT, char> so 8 * sizeof(_CharT) should be __CHAR_BIT__ instead. We also only user _UnsignedCharT when is_same<_CharT, char> so that can just be simplified to unsigned char. The contents of _BracketMatcher::_M_char_set are not sorted and can contain duplicates in the current code. Making that a sorted, unique list in _BracketMatcher::_M_ready() allows a binary search instead of linear search. This improves worst case performance for pathological regular expressions like std::wregex('['+std::wstring(1000, 'a')+"b]") but I'm not sure if it helps in the common case. Finally, in the non-char case the _CacheT member is an unused empty object, so having that as the first member requires 7 bytes of padding. Re-ordering the members reduces the size of a non-char _BracketMatcher by 8 bytes (but it's still a whopping 96 bytes). (For a char _BracketMatcher the bitset cache makes it 128 bytes, this patch doesn't change that). Thoughts?
diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index f5a198f..a9dd8d3 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -396,6 +396,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_ready() { + std::sort(_M_char_set.begin(), _M_char_set.end()); + auto __end = std::unique(_M_char_set.begin(), _M_char_set.end()); + _M_char_set.erase(__end, _M_char_set.end()); _M_make_cache(_IsChar()); #ifdef _GLIBCXX_DEBUG _M_is_ready = true; @@ -405,10 +408,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION private: typedef typename is_same<_CharT, char>::type _IsChar; struct _Dummy { }; + static constexpr size_t _S_cache_size() { return 1ul << __CHAR_BIT__; } typedef typename conditional<_IsChar::value, - std::bitset<1 << (8 * sizeof(_CharT))>, + std::bitset<_S_cache_size()>, _Dummy>::type _CacheT; - typedef typename make_unsigned<_CharT>::type _UnsignedCharT; private: bool @@ -416,14 +419,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_apply(_CharT __ch, true_type) const - { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; } + { return _M_cache[static_cast<unsigned char>(__ch)]; } void _M_make_cache(true_type) { - for (int __i = 0; __i < _M_cache.size(); __i++) - _M_cache[static_cast<_UnsignedCharT>(__i)] = - _M_apply(__i, false_type()); + for (unsigned __i = 0; __i < _S_cache_size(); __i++) + _M_cache[__i] = _M_apply(static_cast<char>(__i), false_type()); } void @@ -431,13 +433,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { } private: - _CacheT _M_cache; std::vector<_CharT> _M_char_set; std::vector<_StringT> _M_equiv_set; std::vector<pair<_StrTransT, _StrTransT>> _M_range_set; _CharClassT _M_class_set; _TransT _M_translator; const _TraitsT& _M_traits; + _CacheT _M_cache; bool _M_is_non_matching; #ifdef _GLIBCXX_DEBUG bool _M_is_ready; diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 128dac1..36edfba 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -507,12 +507,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _BracketMatcher<_TraitsT, __icase, __collate>:: _M_apply(_CharT __ch, false_type) const { - bool __ret = false; - if (std::find(_M_char_set.begin(), _M_char_set.end(), - _M_translator._M_translate(__ch)) - != _M_char_set.end()) - __ret = true; - else + bool __ret = std::binary_search(_M_char_set.begin(), _M_char_set.end(), + _M_translator._M_translate(__ch)); + if (!__ret) { auto __s = _M_translator._M_transform(__ch); for (auto& __it : _M_range_set)