In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/e0814905161f8420d3be47dd3883b1f3c9590a82?hp=99956112fb8debe619f1aa0f9a8c2a2887b1bd7a>
- Log ----------------------------------------------------------------- commit e0814905161f8420d3be47dd3883b1f3c9590a82 Author: Karl Williamson <[email protected]> Date: Wed Mar 13 19:11:46 2019 -0600 PATCH: [perl #133871] heap-buffer-overflow in S_reginsert The regex compiler was written assuming it knew how many parentheses pairs there were at code generation time. When I converted to a single pass in 7c932d07cab18751bfc7515b4320436273a459e2, most things were straight forward to not have to know this number, but there were a few where it was non-trivial (for me anyway) to figure out how to handle. So I punted on these and do a second pass when these are encountered. There are few of them and are less commonly used, namely (?R), (?|...) and forward references to groups (which most commonly will end up being a syntax error anyway). The fix in this commit is to avoid doing some parentheses relocations when a regnode is inserted when it is known that the parentheses counts are unreliable (during initial parsing of one of these tricky constructs). The code in the ticket is using a branch reset '(?|...)'. A second pass will get done, and the insert properly handled then, after the counts are reliable. commit b76e1c044f939bf713dab1a40f484ed2b22561a7 Author: Karl Williamson <[email protected]> Date: Wed Mar 13 18:41:45 2019 -0600 regcomp.c: Create macro, add comments commit 1c1a700b2cac24972380a42f01321ad242e0c70c Author: Karl Williamson <[email protected]> Date: Wed Mar 13 18:45:34 2019 -0600 regcomp.c: Rmv unused variable ----------------------------------------------------------------------- Summary of changes: regcomp.c | 42 +++++++++++++++++++++++++++--------------- t/re/pat.t | 6 +++++- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/regcomp.c b/regcomp.c index d7c3d46fc8..51679e9063 100644 --- a/regcomp.c +++ b/regcomp.c @@ -353,7 +353,7 @@ struct RExC_state_t { if (DEPENDS_SEMANTICS) { \ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ RExC_uni_semantics = 1; \ - if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) { \ + if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \ /* No need to restart the parse if we haven't seen \ * anything that differs between /u and /d, and no need \ * to restart immediately if we're going to reparse \ @@ -368,7 +368,7 @@ struct RExC_state_t { #define REQUIRE_BRANCHJ(flagp, restart_retval) \ STMT_START { \ RExC_use_BRANCHJ = 1; \ - if (LIKELY(RExC_total_parens >= 0)) { \ + if (LIKELY(! IN_PARENS_PASS)) { \ /* No need to restart the parse immediately if we're \ * going to reparse anyway to count parens */ \ *flagp |= RESTART_PARSE; \ @@ -376,10 +376,19 @@ struct RExC_state_t { } \ } STMT_END +/* Until we have completed the parse, we leave RExC_total_parens at 0 or + * less. After that, it must always be positive, because the whole re is + * considered to be surrounded by virtual parens. Setting it to negative + * indicates there is some construct that needs to know the actual number of + * parens to be properly handled. And that means an extra pass will be + * required after we've counted them all */ +#define ALL_PARENS_COUNTED (RExC_total_parens > 0) #define REQUIRE_PARENS_PASS \ - STMT_START { \ - if (RExC_total_parens == 0) RExC_total_parens = -1; \ + STMT_START { /* No-op if have completed a pass */ \ + if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \ } STMT_END +#define IN_PARENS_PASS (RExC_total_parens < 0) + /* This is used to return failure (zero) early from the calling function if * various flags in 'flags' are set. Two flags always cause a return: @@ -7667,9 +7676,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Do the parse */ if (reg(pRExC_state, 0, &flags, 1)) { - /* Success!, But if RExC_total_parens < 0, we need to redo the parse - * knowing how many parens there actually are */ - if (RExC_total_parens < 0) { + /* Success!, But we may need to redo the parse knowing how many parens + * there actually are */ + if (IN_PARENS_PASS) { flags |= RESTART_PARSE; } @@ -7711,7 +7720,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n")); } - if (RExC_total_parens > 0) { + if (ALL_PARENS_COUNTED) { /* Make enough room for all the known parens, and zero it */ Renew(RExC_open_parens, RExC_total_parens, regnode_offset); Zero(RExC_open_parens, RExC_total_parens, regnode_offset); @@ -8809,7 +8818,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) /* It might be a forward reference; we can't fail until we * know, by completing the parse to get all the groups, and * then reparsing */ - if (RExC_total_parens > 0) { + if (ALL_PARENS_COUNTED) { vFAIL("Reference to nonexistent named group"); } else { @@ -11595,7 +11604,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* It might be a forward reference; we can't fail until * we know, by completing the parse to get all the * groups, and then reparsing */ - if (RExC_total_parens > 0) { + if (ALL_PARENS_COUNTED) { RExC_parse++; vFAIL("Reference to nonexistent group"); } @@ -11621,7 +11630,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* It might be a forward reference; we can't fail until we * know, by completing the parse to get all the groups, and * then reparsing */ - if (RExC_total_parens > 0) { + if (ALL_PARENS_COUNTED) { if (num >= RExC_total_parens) { RExC_parse++; vFAIL("Reference to nonexistent group"); @@ -11952,7 +11961,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) capturing_parens: parno = RExC_npar; RExC_npar++; - if (RExC_total_parens <= 0) { + if (! ALL_PARENS_COUNTED) { /* If we are in our first pass through (and maybe only pass), * we need to allocate memory for the capturing parentheses * data structures. Since we start at npar=1, when it reaches @@ -12666,7 +12675,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, SV * substitute_parse = NULL; char *orig_end; char *save_start; - bool save_strict; I32 flags; GET_RE_DEBUG_FLAGS_DECL; @@ -13732,7 +13740,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* It might be a forward reference; we can't fail until we * know, by completing the parse to get all the groups, and * then reparsing */ - if (RExC_total_parens > 0) { + if (ALL_PARENS_COUNTED) { if (num >= RExC_total_parens) { vFAIL("Reference to nonexistent group"); } @@ -19542,7 +19550,11 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, src = REGNODE_p(RExC_emit); RExC_emit += size; dst = REGNODE_p(RExC_emit); - if (RExC_open_parens) { + + /* If we are in a "count the parentheses" pass, the numbers are unreliable, + * and [perl #133871] shows this can lead to problems, so skip this + * realignment of parens until a later pass when they are reliable */ + if (! IN_PARENS_PASS && RExC_open_parens) { int paren; /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/ /* remember that RExC_npar is rex->nparens + 1, diff --git a/t/re/pat.t b/t/re/pat.t index e97031f2d9..f1be50ae1b 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -24,7 +24,7 @@ BEGIN { skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader; skip_all_without_unicode_tables(); -plan tests => 854; # Update this when adding/deleting tests. +plan tests => 855; # Update this when adding/deleting tests. run_tests() unless caller; @@ -2002,6 +2002,10 @@ while( "\N{U+100}bc" =~ /(..?)(?{$^N})/g ) { } CODE } + { # [perl #133871], ASAN/valgrind out-of-bounds access +; + fresh_perl_like('qr/(?|(())|())|//', qr/syntax error/, {}, "[perl #133871]"); + } } # End of sub run_tests -- Perl5 Master Repository
