Re: [Patch 1/3] Use manual regex algorithm switching
On Sat, Apr 26, 2014 at 10:50 AM, Jonathan Wakely jwak...@redhat.com wrote: This patch is OK, thanks very much. Committed. Thanks! -- Regards, Tim Shen
Re: [Patch 1/3] Use manual regex algorithm switching
On 25/04/14 20:50 -0400, Tim Shen wrote: * include/bits/regex.tcc (__regex_algo_impl): Remove _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use _GLIBCXX_REGEX_USE_THOMPSON_NFA instead. * include/bits/regex_automaton.h: Remove quantifier counting variable. * include/bits/regex_automaton.tcc (_State_base::_M_dot): Adjust debug NFA dump. This patch is OK, thanks very much.
[Patch 1/3] Use manual regex algorithm switching
We used _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT to control which algorithm we should use; so when compiling a regex, the number of quantifiers(*, +, ?) will be counted and will be used for the algorithm switching. However, I do think give user the manual switch may make things simpler: _GLIBCXX_REGEX_USE_THOMPSON_NFA. By default we'll use the DFS executor, which Boost and libc++ have (only, I believe). Users who know what they are doing and willing to pay for what they get may switch to the Thompson NFA (we call it BFS approach) by defining this macro. A regex that contains back references will never be executed by a Thompson NFA. The whole patch series are booted and tested, but this one is not seperatly tested. Thanks! -- Regards, Tim Shen commit 7375e9662232054465ea5c606c88130e38296ff1 Author: tim timshe...@gmail.com Date: Fri Apr 25 01:49:31 2014 -0400 2014-04-25 Tim Shen timshe...@gmail.com * include/bits/regex.tcc (__regex_algo_impl): Remove _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use _GLIBCXX_REGEX_USE_THOMPSON_NFA instead. * include/bits/regex_automaton.h: Remove quantifier counting variable. * include/bits/regex_automaton.tcc (_State_base::_M_dot): Adjust debug NFA dump. diff --git a/libstdc++-v3/include/bits/regex.tcc b/libstdc++-v3/include/bits/regex.tcc index 5fa1f01..d4d16a6 100644 --- a/libstdc++-v3/include/bits/regex.tcc +++ b/libstdc++-v3/include/bits/regex.tcc @@ -28,12 +28,12 @@ * Do not attempt to use it directly. @headername{regex} */ -// See below __regex_algo_impl to get what this is talking about. The default -// value 1 indicated a conservative optimization without giving up worst case -// performance. -#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT -#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1 -#endif +// A non-standard switch to let the user pick the matching algorithm. +// If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA +// algorithm will be used. This algorithm is not by default, and cannot +// be used if the regex contains back-references, but has better +// (polynomial instead of exponential) worst case performace. +// See __regex_algo_impl below. namespace std _GLIBCXX_VISIBILITY(default) { @@ -68,7 +68,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // This function decide which executor to use under given circumstances. // The _S_auto policy now is the following: if a NFA has no - // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT + // back-references and has more than _GLIBCXX_REGEX_USE_THOMPSON_NFA // quantifiers (*, +, ?), the BFS executor will be used, other wise // DFS executor. This is because DFS executor has a exponential upper // bound, but better best-case performace. Meanwhile, BFS executor can @@ -82,8 +82,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool __ret; if (!__re._M_automaton-_M_has_backref (__policy == _RegexExecutorPolicy::_S_alternate - || __re._M_automaton-_M_quant_count -_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT)) +#ifdef _GLIBCXX_REGEX_USE_THOMPSON_NFA + || true +#endif + )) { _Executor_BiIter, _Alloc, _TraitsT, false __executor(__s, __e, __m, __re, __flags); diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h index a442cfe..64ecd6d 100644 --- a/libstdc++-v3/include/bits/regex_automaton.h +++ b/libstdc++-v3/include/bits/regex_automaton.h @@ -74,8 +74,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION size_t _M_backref_index; // for _S_opcode_backref struct { - // for _S_opcode_alternative. - _StateIdT _M_quant_index; // for _S_opcode_alternative or _S_opcode_subexpr_lookahead _StateIdT _M_alt; // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or @@ -120,7 +118,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION explicit _NFA_base(_FlagT __f) : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0), -_M_quant_count(0), _M_has_backref(false) +_M_has_backref(false) { } _NFA_base(_NFA_base) = default; @@ -145,7 +143,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _FlagT_M_flags; _StateIdT _M_start_state; _SizeT_M_subexpr_count; -_SizeT_M_quant_count; bool _M_has_backref; }; @@ -175,7 +172,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _StateT __tmp(_S_opcode_alternative); // It labels every quantifier to make greedy comparison easier in BFS // approach. - __tmp._M_quant_index = this-_M_quant_count++; __tmp._M_next = __next; __tmp._M_alt = __alt; __tmp._M_neg = __neg; diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc index 1476ae2..38787fa 100644 ---
Re: [Patch 1/3] Use manual regex algorithm switching
--- a/libstdc++-v3/include/bits/regex.tcc +++ b/libstdc++-v3/include/bits/regex.tcc @@ -28,12 +28,12 @@ * Do not attempt to use it directly. @headername{regex} */ -// See below __regex_algo_impl to get what this is talking about. The default -// value 1 indicated a conservative optimization without giving up worst case -// performance. -#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT -#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1 -#endif +// A non-standard switch to let the user pick the matching algorithm. +// If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA +// algorithm will be used. This algorithm is not by default, and cannot This should say not enabled by default or not used by default. +// be used if the regex contains back-references, but has better +// (polynomial instead of exponential) worst case performace. +// See __regex_algo_impl below. namespace std _GLIBCXX_VISIBILITY(default) { @@ -68,7 +68,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // This function decide which executor to use under given circumstances. // The _S_auto policy now is the following: if a NFA has no - // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT + // back-references and has more than _GLIBCXX_REGEX_USE_THOMPSON_NFA This comment needs more than just a simple replacement, as the way the macro is used changed, not just its name. // quantifiers (*, +, ?), the BFS executor will be used, other wise // DFS executor. This is because DFS executor has a exponential upper // bound, but better best-case performace. Meanwhile, BFS executor can @@ -82,8 +82,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool __ret; if (!__re._M_automaton-_M_has_backref (__policy == _RegexExecutorPolicy::_S_alternate - || __re._M_automaton-_M_quant_count -_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT)) +#ifdef _GLIBCXX_REGEX_USE_THOMPSON_NFA + || true +#endif + )) I wonder if this would be simpler to read like this: if (!__re._M_automaton-_M_has_backref #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA __policy == _RegexExecutorPolicy::_S_alternate #endif ) What do you think?
Re: [Patch 1/3] Use manual regex algorithm switching
On Fri, Apr 25, 2014 at 5:14 PM, Jonathan Wakely jwak...@redhat.com wrote: I wonder if this would be simpler to read like this: if (!__re._M_automaton-_M_has_backref #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA __policy == _RegexExecutorPolicy::_S_alternate #endif ) I don't think this has equivalent logic. If _GLIBCXX_REGEX_USE_THOMPSON_NFA is not defined, the _S_alternate should also work, for testing purpose. Other parts is good. Sorry for those inconvenience :( -- Regards, Tim Shen commit 0ffd4c5e283dc4e8314051ca7bfe8b9043d36537 Author: tim timshe...@gmail.com Date: Fri Apr 25 01:49:31 2014 -0400 2014-04-25 Tim Shen timshe...@gmail.com * include/bits/regex.tcc (__regex_algo_impl): Remove _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use _GLIBCXX_REGEX_USE_THOMPSON_NFA instead. * include/bits/regex_automaton.h: Remove quantifier counting variable. * include/bits/regex_automaton.tcc (_State_base::_M_dot): Adjust debug NFA dump. diff --git a/libstdc++-v3/include/bits/regex.tcc b/libstdc++-v3/include/bits/regex.tcc index 5fa1f01..71078a1 100644 --- a/libstdc++-v3/include/bits/regex.tcc +++ b/libstdc++-v3/include/bits/regex.tcc @@ -28,12 +28,12 @@ * Do not attempt to use it directly. @headername{regex} */ -// See below __regex_algo_impl to get what this is talking about. The default -// value 1 indicated a conservative optimization without giving up worst case -// performance. -#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT -#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1 -#endif +// A non-standard switch to let the user pick the matching algorithm. +// If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA +// algorithm will be used. This algorithm is not enabled by default, +// and cannot be used if the regex contains back-references, but has better +// (polynomial instead of exponential) worst case performace. +// See __regex_algo_impl below. namespace std _GLIBCXX_VISIBILITY(default) { @@ -66,24 +66,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION for (auto __it : __res) __it.matched = false; - // This function decide which executor to use under given circumstances. - // The _S_auto policy now is the following: if a NFA has no - // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT - // quantifiers (*, +, ?), the BFS executor will be used, other wise - // DFS executor. This is because DFS executor has a exponential upper - // bound, but better best-case performace. Meanwhile, BFS executor can - // effectively prevent from exponential-long time matching (which must - // contains many quantifiers), but it's slower in average. - // - // For simple regex, BFS executor could be 2 or more times slower than - // DFS executor. - // - // Of course, BFS executor cannot handle back-references. + // __policy is used by testsuites so that they can use Thompson NFA + // without defining a macro. Users should define + // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach. bool __ret; if (!__re._M_automaton-_M_has_backref (__policy == _RegexExecutorPolicy::_S_alternate - || __re._M_automaton-_M_quant_count -_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT)) +#ifdef _GLIBCXX_REGEX_USE_THOMPSON_NFA + || true +#endif + )) { _Executor_BiIter, _Alloc, _TraitsT, false __executor(__s, __e, __m, __re, __flags); diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h index a442cfe..64ecd6d 100644 --- a/libstdc++-v3/include/bits/regex_automaton.h +++ b/libstdc++-v3/include/bits/regex_automaton.h @@ -74,8 +74,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION size_t _M_backref_index; // for _S_opcode_backref struct { - // for _S_opcode_alternative. - _StateIdT _M_quant_index; // for _S_opcode_alternative or _S_opcode_subexpr_lookahead _StateIdT _M_alt; // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or @@ -120,7 +118,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION explicit _NFA_base(_FlagT __f) : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0), -_M_quant_count(0), _M_has_backref(false) +_M_has_backref(false) { } _NFA_base(_NFA_base) = default; @@ -145,7 +143,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _FlagT_M_flags; _StateIdT _M_start_state; _SizeT_M_subexpr_count; -_SizeT_M_quant_count; bool _M_has_backref; }; @@ -175,7 +172,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _StateT __tmp(_S_opcode_alternative); // It labels every quantifier to make greedy comparison easier in BFS // approach. - __tmp._M_quant_index =
Re: [Patch 1/3] Use manual regex algorithm switching
On 25/04/14 18:14 -0400, Tim Shen wrote: On Fri, Apr 25, 2014 at 5:14 PM, Jonathan Wakely jwak...@redhat.com wrote: I wonder if this would be simpler to read like this: if (!__re._M_automaton-_M_has_backref #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA __policy == _RegexExecutorPolicy::_S_alternate #endif ) I don't think this has equivalent logic. If _GLIBCXX_REGEX_USE_THOMPSON_NFA is not defined, the _S_alternate should also work, for testing purpose. That's why I thought #ifndef would work, not #ifdef Other parts is good. Sorry for those inconvenience :( No problem, that's why we all review each other's code publicly :)
Re: [Patch 1/3] Use manual regex algorithm switching
On Fri, Apr 25, 2014 at 7:56 PM, Jonathan Wakely jwak...@redhat.com wrote: That's why I thought #ifndef would work, not #ifdef Oh I didn't notice that. Now they are equivalent. -- Regards, Tim Shen commit cb39603d6df4763e95a697792e7b45179994f096 Author: tim timshe...@gmail.com Date: Fri Apr 25 01:49:31 2014 -0400 2014-04-25 Tim Shen timshe...@gmail.com * include/bits/regex.tcc (__regex_algo_impl): Remove _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use _GLIBCXX_REGEX_USE_THOMPSON_NFA instead. * include/bits/regex_automaton.h: Remove quantifier counting variable. * include/bits/regex_automaton.tcc (_State_base::_M_dot): Adjust debug NFA dump. diff --git a/libstdc++-v3/include/bits/regex.tcc b/libstdc++-v3/include/bits/regex.tcc index 5fa1f01..0d737a0 100644 --- a/libstdc++-v3/include/bits/regex.tcc +++ b/libstdc++-v3/include/bits/regex.tcc @@ -28,12 +28,12 @@ * Do not attempt to use it directly. @headername{regex} */ -// See below __regex_algo_impl to get what this is talking about. The default -// value 1 indicated a conservative optimization without giving up worst case -// performance. -#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT -#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1 -#endif +// A non-standard switch to let the user pick the matching algorithm. +// If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA +// algorithm will be used. This algorithm is not enabled by default, +// and cannot be used if the regex contains back-references, but has better +// (polynomial instead of exponential) worst case performace. +// See __regex_algo_impl below. namespace std _GLIBCXX_VISIBILITY(default) { @@ -66,24 +66,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION for (auto __it : __res) __it.matched = false; - // This function decide which executor to use under given circumstances. - // The _S_auto policy now is the following: if a NFA has no - // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT - // quantifiers (*, +, ?), the BFS executor will be used, other wise - // DFS executor. This is because DFS executor has a exponential upper - // bound, but better best-case performace. Meanwhile, BFS executor can - // effectively prevent from exponential-long time matching (which must - // contains many quantifiers), but it's slower in average. - // - // For simple regex, BFS executor could be 2 or more times slower than - // DFS executor. - // - // Of course, BFS executor cannot handle back-references. + // __policy is used by testsuites so that they can use Thompson NFA + // without defining a macro. Users should define + // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach. bool __ret; if (!__re._M_automaton-_M_has_backref - (__policy == _RegexExecutorPolicy::_S_alternate - || __re._M_automaton-_M_quant_count -_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT)) +#ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA + __policy == _RegexExecutorPolicy::_S_alternate +#endif + ) { _Executor_BiIter, _Alloc, _TraitsT, false __executor(__s, __e, __m, __re, __flags); diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h index a442cfe..64ecd6d 100644 --- a/libstdc++-v3/include/bits/regex_automaton.h +++ b/libstdc++-v3/include/bits/regex_automaton.h @@ -74,8 +74,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION size_t _M_backref_index; // for _S_opcode_backref struct { - // for _S_opcode_alternative. - _StateIdT _M_quant_index; // for _S_opcode_alternative or _S_opcode_subexpr_lookahead _StateIdT _M_alt; // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or @@ -120,7 +118,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION explicit _NFA_base(_FlagT __f) : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0), -_M_quant_count(0), _M_has_backref(false) +_M_has_backref(false) { } _NFA_base(_NFA_base) = default; @@ -145,7 +143,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _FlagT_M_flags; _StateIdT _M_start_state; _SizeT_M_subexpr_count; -_SizeT_M_quant_count; bool _M_has_backref; }; @@ -175,7 +172,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _StateT __tmp(_S_opcode_alternative); // It labels every quantifier to make greedy comparison easier in BFS // approach. - __tmp._M_quant_index = this-_M_quant_count++; __tmp._M_next = __next; __tmp._M_alt = __alt; __tmp._M_neg = __neg; diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc index 1476ae2..38787fa 100644 --- a/libstdc++-v3/include/bits/regex_automaton.tcc