Re: [Patch 1/3] Use manual regex algorithm switching

2014-04-27 Thread Tim Shen
On Sat, Apr 26, 2014 at 10:50 AM, Jonathan Wakely jwak...@redhat.com wrote:
 This patch is OK, thanks very much.

Committed. Thanks!


-- 
Regards,
Tim Shen


Re: [Patch 1/3] Use manual regex algorithm switching

2014-04-26 Thread Jonathan Wakely

On 25/04/14 20:50 -0400, Tim Shen wrote:


* include/bits/regex.tcc (__regex_algo_impl): Remove
_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use
_GLIBCXX_REGEX_USE_THOMPSON_NFA instead.
* include/bits/regex_automaton.h: Remove quantifier counting variable.
* include/bits/regex_automaton.tcc (_State_base::_M_dot):
Adjust debug NFA dump.


This patch is OK, thanks very much.



[Patch 1/3] Use manual regex algorithm switching

2014-04-25 Thread Tim Shen
We used _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT to control which
algorithm we should use; so when compiling a regex, the number of
quantifiers(*, +, ?) will be counted and will be used for the
algorithm switching.

However, I do think give user the manual switch may make things
simpler: _GLIBCXX_REGEX_USE_THOMPSON_NFA. By default we'll use the DFS
executor, which Boost and libc++ have (only, I believe).

Users who know what they are doing and willing to pay for what they
get may switch to the Thompson NFA (we call it BFS approach) by
defining this macro.

A regex that contains back references will never be executed by a Thompson NFA.

The whole patch series are booted and tested, but this one is not
seperatly tested.

Thanks!


-- 
Regards,
Tim Shen
commit 7375e9662232054465ea5c606c88130e38296ff1
Author: tim timshe...@gmail.com
Date:   Fri Apr 25 01:49:31 2014 -0400

2014-04-25  Tim Shen  timshe...@gmail.com

* include/bits/regex.tcc (__regex_algo_impl): Remove
_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use
_GLIBCXX_REGEX_USE_THOMPSON_NFA instead.
* include/bits/regex_automaton.h: Remove quantifier counting variable.
* include/bits/regex_automaton.tcc (_State_base::_M_dot):
Adjust debug NFA dump.

diff --git a/libstdc++-v3/include/bits/regex.tcc 
b/libstdc++-v3/include/bits/regex.tcc
index 5fa1f01..d4d16a6 100644
--- a/libstdc++-v3/include/bits/regex.tcc
+++ b/libstdc++-v3/include/bits/regex.tcc
@@ -28,12 +28,12 @@
  *  Do not attempt to use it directly. @headername{regex}
  */
 
-// See below __regex_algo_impl to get what this is talking about. The default
-// value 1 indicated a conservative optimization without giving up worst case
-// performance.
-#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
-#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
-#endif
+// A non-standard switch to let the user pick the matching algorithm.
+// If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA
+// algorithm will be used. This algorithm is not by default, and cannot
+// be used if the regex contains back-references, but has better
+// (polynomial instead of exponential) worst case performace.
+// See __regex_algo_impl below.
 
 namespace std _GLIBCXX_VISIBILITY(default)
 {
@@ -68,7 +68,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
   // This function decide which executor to use under given circumstances.
   // The _S_auto policy now is the following: if a NFA has no
-  // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
+  // back-references and has more than _GLIBCXX_REGEX_USE_THOMPSON_NFA
   // quantifiers (*, +, ?), the BFS executor will be used, other wise
   // DFS executor. This is because DFS executor has a exponential upper
   // bound, but better best-case performace. Meanwhile, BFS executor can
@@ -82,8 +82,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   bool __ret;
   if (!__re._M_automaton-_M_has_backref
   (__policy == _RegexExecutorPolicy::_S_alternate
- || __re._M_automaton-_M_quant_count
-_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
+#ifdef _GLIBCXX_REGEX_USE_THOMPSON_NFA
+ || true
+#endif
+ ))
{
  _Executor_BiIter, _Alloc, _TraitsT, false
__executor(__s, __e, __m, __re, __flags);
diff --git a/libstdc++-v3/include/bits/regex_automaton.h 
b/libstdc++-v3/include/bits/regex_automaton.h
index a442cfe..64ecd6d 100644
--- a/libstdc++-v3/include/bits/regex_automaton.h
+++ b/libstdc++-v3/include/bits/regex_automaton.h
@@ -74,8 +74,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   size_t _M_backref_index;  // for _S_opcode_backref
   struct
   {
-   // for _S_opcode_alternative.
-   _StateIdT  _M_quant_index;
// for _S_opcode_alternative or _S_opcode_subexpr_lookahead
_StateIdT  _M_alt;
// for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
@@ -120,7 +118,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 explicit
 _NFA_base(_FlagT __f)
 : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
-_M_quant_count(0), _M_has_backref(false)
+_M_has_backref(false)
 { }
 
 _NFA_base(_NFA_base) = default;
@@ -145,7 +143,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 _FlagT_M_flags;
 _StateIdT _M_start_state;
 _SizeT_M_subexpr_count;
-_SizeT_M_quant_count;
 bool  _M_has_backref;
   };
 
@@ -175,7 +172,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_StateT __tmp(_S_opcode_alternative);
// It labels every quantifier to make greedy comparison easier in BFS
// approach.
-   __tmp._M_quant_index = this-_M_quant_count++;
__tmp._M_next = __next;
__tmp._M_alt = __alt;
__tmp._M_neg = __neg;
diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc 
b/libstdc++-v3/include/bits/regex_automaton.tcc
index 1476ae2..38787fa 100644
--- 

Re: [Patch 1/3] Use manual regex algorithm switching

2014-04-25 Thread Jonathan Wakely

--- a/libstdc++-v3/include/bits/regex.tcc
+++ b/libstdc++-v3/include/bits/regex.tcc
@@ -28,12 +28,12 @@
 *  Do not attempt to use it directly. @headername{regex}
 */

-// See below __regex_algo_impl to get what this is talking about. The default
-// value 1 indicated a conservative optimization without giving up worst case
-// performance.
-#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
-#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
-#endif
+// A non-standard switch to let the user pick the matching algorithm.
+// If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA
+// algorithm will be used. This algorithm is not by default, and cannot


This should say not enabled by default or not used by default.


+// be used if the regex contains back-references, but has better
+// (polynomial instead of exponential) worst case performace.
+// See __regex_algo_impl below.

namespace std _GLIBCXX_VISIBILITY(default)
{
@@ -68,7 +68,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION

  // This function decide which executor to use under given circumstances.
  // The _S_auto policy now is the following: if a NFA has no
-  // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
+  // back-references and has more than _GLIBCXX_REGEX_USE_THOMPSON_NFA


This comment needs more than just a simple replacement, as the way the
macro is used changed, not just its name.


  // quantifiers (*, +, ?), the BFS executor will be used, other wise
  // DFS executor. This is because DFS executor has a exponential upper
  // bound, but better best-case performace. Meanwhile, BFS executor can
@@ -82,8 +82,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  bool __ret;
  if (!__re._M_automaton-_M_has_backref
   (__policy == _RegexExecutorPolicy::_S_alternate
- || __re._M_automaton-_M_quant_count
-_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
+#ifdef _GLIBCXX_REGEX_USE_THOMPSON_NFA
+ || true
+#endif
+ ))


I wonder if this would be simpler to read like this:

   if (!__re._M_automaton-_M_has_backref
#ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
__policy == _RegexExecutorPolicy::_S_alternate
#endif
  )

What do you think?



Re: [Patch 1/3] Use manual regex algorithm switching

2014-04-25 Thread Tim Shen
On Fri, Apr 25, 2014 at 5:14 PM, Jonathan Wakely jwak...@redhat.com wrote:
 I wonder if this would be simpler to read like this:

if (!__re._M_automaton-_M_has_backref
 #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
 __policy == _RegexExecutorPolicy::_S_alternate
 #endif
   )

I don't think this has equivalent logic. If
_GLIBCXX_REGEX_USE_THOMPSON_NFA is not defined, the _S_alternate
should also work, for testing purpose.

Other parts is good. Sorry for those inconvenience :(


-- 
Regards,
Tim Shen
commit 0ffd4c5e283dc4e8314051ca7bfe8b9043d36537
Author: tim timshe...@gmail.com
Date:   Fri Apr 25 01:49:31 2014 -0400

2014-04-25  Tim Shen  timshe...@gmail.com

* include/bits/regex.tcc (__regex_algo_impl): Remove
_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use
_GLIBCXX_REGEX_USE_THOMPSON_NFA instead.
* include/bits/regex_automaton.h: Remove quantifier counting variable.
* include/bits/regex_automaton.tcc (_State_base::_M_dot):
Adjust debug NFA dump.

diff --git a/libstdc++-v3/include/bits/regex.tcc 
b/libstdc++-v3/include/bits/regex.tcc
index 5fa1f01..71078a1 100644
--- a/libstdc++-v3/include/bits/regex.tcc
+++ b/libstdc++-v3/include/bits/regex.tcc
@@ -28,12 +28,12 @@
  *  Do not attempt to use it directly. @headername{regex}
  */
 
-// See below __regex_algo_impl to get what this is talking about. The default
-// value 1 indicated a conservative optimization without giving up worst case
-// performance.
-#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
-#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
-#endif
+// A non-standard switch to let the user pick the matching algorithm.
+// If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA
+// algorithm will be used. This algorithm is not enabled by default,
+// and cannot be used if the regex contains back-references, but has better
+// (polynomial instead of exponential) worst case performace.
+// See __regex_algo_impl below.
 
 namespace std _GLIBCXX_VISIBILITY(default)
 {
@@ -66,24 +66,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   for (auto __it : __res)
__it.matched = false;
 
-  // This function decide which executor to use under given circumstances.
-  // The _S_auto policy now is the following: if a NFA has no
-  // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
-  // quantifiers (*, +, ?), the BFS executor will be used, other wise
-  // DFS executor. This is because DFS executor has a exponential upper
-  // bound, but better best-case performace. Meanwhile, BFS executor can
-  // effectively prevent from exponential-long time matching (which must
-  // contains many quantifiers), but it's slower in average.
-  //
-  // For simple regex, BFS executor could be 2 or more times slower than
-  // DFS executor.
-  //
-  // Of course, BFS executor cannot handle back-references.
+  // __policy is used by testsuites so that they can use Thompson NFA
+  // without defining a macro. Users should define
+  // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach.
   bool __ret;
   if (!__re._M_automaton-_M_has_backref
   (__policy == _RegexExecutorPolicy::_S_alternate
- || __re._M_automaton-_M_quant_count
-_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
+#ifdef _GLIBCXX_REGEX_USE_THOMPSON_NFA
+ || true
+#endif
+ ))
{
  _Executor_BiIter, _Alloc, _TraitsT, false
__executor(__s, __e, __m, __re, __flags);
diff --git a/libstdc++-v3/include/bits/regex_automaton.h 
b/libstdc++-v3/include/bits/regex_automaton.h
index a442cfe..64ecd6d 100644
--- a/libstdc++-v3/include/bits/regex_automaton.h
+++ b/libstdc++-v3/include/bits/regex_automaton.h
@@ -74,8 +74,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   size_t _M_backref_index;  // for _S_opcode_backref
   struct
   {
-   // for _S_opcode_alternative.
-   _StateIdT  _M_quant_index;
// for _S_opcode_alternative or _S_opcode_subexpr_lookahead
_StateIdT  _M_alt;
// for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
@@ -120,7 +118,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 explicit
 _NFA_base(_FlagT __f)
 : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
-_M_quant_count(0), _M_has_backref(false)
+_M_has_backref(false)
 { }
 
 _NFA_base(_NFA_base) = default;
@@ -145,7 +143,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 _FlagT_M_flags;
 _StateIdT _M_start_state;
 _SizeT_M_subexpr_count;
-_SizeT_M_quant_count;
 bool  _M_has_backref;
   };
 
@@ -175,7 +172,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_StateT __tmp(_S_opcode_alternative);
// It labels every quantifier to make greedy comparison easier in BFS
// approach.
-   __tmp._M_quant_index = 

Re: [Patch 1/3] Use manual regex algorithm switching

2014-04-25 Thread Jonathan Wakely

On 25/04/14 18:14 -0400, Tim Shen wrote:

On Fri, Apr 25, 2014 at 5:14 PM, Jonathan Wakely jwak...@redhat.com wrote:

I wonder if this would be simpler to read like this:

   if (!__re._M_automaton-_M_has_backref
#ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
__policy == _RegexExecutorPolicy::_S_alternate
#endif
  )


I don't think this has equivalent logic. If
_GLIBCXX_REGEX_USE_THOMPSON_NFA is not defined, the _S_alternate
should also work, for testing purpose.


That's why I thought #ifndef would work, not #ifdef


Other parts is good. Sorry for those inconvenience :(


No problem, that's why we all review each other's code publicly :)



Re: [Patch 1/3] Use manual regex algorithm switching

2014-04-25 Thread Tim Shen
On Fri, Apr 25, 2014 at 7:56 PM, Jonathan Wakely jwak...@redhat.com wrote:
 That's why I thought #ifndef would work, not #ifdef

Oh I didn't notice that. Now they are equivalent.


-- 
Regards,
Tim Shen
commit cb39603d6df4763e95a697792e7b45179994f096
Author: tim timshe...@gmail.com
Date:   Fri Apr 25 01:49:31 2014 -0400

2014-04-25  Tim Shen  timshe...@gmail.com

* include/bits/regex.tcc (__regex_algo_impl): Remove
_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use
_GLIBCXX_REGEX_USE_THOMPSON_NFA instead.
* include/bits/regex_automaton.h: Remove quantifier counting variable.
* include/bits/regex_automaton.tcc (_State_base::_M_dot):
Adjust debug NFA dump.

diff --git a/libstdc++-v3/include/bits/regex.tcc 
b/libstdc++-v3/include/bits/regex.tcc
index 5fa1f01..0d737a0 100644
--- a/libstdc++-v3/include/bits/regex.tcc
+++ b/libstdc++-v3/include/bits/regex.tcc
@@ -28,12 +28,12 @@
  *  Do not attempt to use it directly. @headername{regex}
  */
 
-// See below __regex_algo_impl to get what this is talking about. The default
-// value 1 indicated a conservative optimization without giving up worst case
-// performance.
-#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
-#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
-#endif
+// A non-standard switch to let the user pick the matching algorithm.
+// If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA
+// algorithm will be used. This algorithm is not enabled by default,
+// and cannot be used if the regex contains back-references, but has better
+// (polynomial instead of exponential) worst case performace.
+// See __regex_algo_impl below.
 
 namespace std _GLIBCXX_VISIBILITY(default)
 {
@@ -66,24 +66,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   for (auto __it : __res)
__it.matched = false;
 
-  // This function decide which executor to use under given circumstances.
-  // The _S_auto policy now is the following: if a NFA has no
-  // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
-  // quantifiers (*, +, ?), the BFS executor will be used, other wise
-  // DFS executor. This is because DFS executor has a exponential upper
-  // bound, but better best-case performace. Meanwhile, BFS executor can
-  // effectively prevent from exponential-long time matching (which must
-  // contains many quantifiers), but it's slower in average.
-  //
-  // For simple regex, BFS executor could be 2 or more times slower than
-  // DFS executor.
-  //
-  // Of course, BFS executor cannot handle back-references.
+  // __policy is used by testsuites so that they can use Thompson NFA
+  // without defining a macro. Users should define
+  // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach.
   bool __ret;
   if (!__re._M_automaton-_M_has_backref
-  (__policy == _RegexExecutorPolicy::_S_alternate
- || __re._M_automaton-_M_quant_count
-_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
+#ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
+  __policy == _RegexExecutorPolicy::_S_alternate
+#endif
+ )
{
  _Executor_BiIter, _Alloc, _TraitsT, false
__executor(__s, __e, __m, __re, __flags);
diff --git a/libstdc++-v3/include/bits/regex_automaton.h 
b/libstdc++-v3/include/bits/regex_automaton.h
index a442cfe..64ecd6d 100644
--- a/libstdc++-v3/include/bits/regex_automaton.h
+++ b/libstdc++-v3/include/bits/regex_automaton.h
@@ -74,8 +74,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   size_t _M_backref_index;  // for _S_opcode_backref
   struct
   {
-   // for _S_opcode_alternative.
-   _StateIdT  _M_quant_index;
// for _S_opcode_alternative or _S_opcode_subexpr_lookahead
_StateIdT  _M_alt;
// for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
@@ -120,7 +118,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 explicit
 _NFA_base(_FlagT __f)
 : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
-_M_quant_count(0), _M_has_backref(false)
+_M_has_backref(false)
 { }
 
 _NFA_base(_NFA_base) = default;
@@ -145,7 +143,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 _FlagT_M_flags;
 _StateIdT _M_start_state;
 _SizeT_M_subexpr_count;
-_SizeT_M_quant_count;
 bool  _M_has_backref;
   };
 
@@ -175,7 +172,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_StateT __tmp(_S_opcode_alternative);
// It labels every quantifier to make greedy comparison easier in BFS
// approach.
-   __tmp._M_quant_index = this-_M_quant_count++;
__tmp._M_next = __next;
__tmp._M_alt = __alt;
__tmp._M_neg = __neg;
diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc 
b/libstdc++-v3/include/bits/regex_automaton.tcc
index 1476ae2..38787fa 100644
--- a/libstdc++-v3/include/bits/regex_automaton.tcc