Change 29354 by [EMAIL PROTECTED] on 2006/11/22 17:09:33 Subject: \G with /g results in infinite loop in 5.6 and later From: demerphq <[EMAIL PROTECTED]> Date: Wed, 22 Nov 2006 17:11:02 +0100 Message-ID: <[EMAIL PROTECTED]>
Affected files ... ... //depot/perl/pod/perldiag.pod#460 edit ... //depot/perl/pod/perlre.pod#115 edit ... //depot/perl/pp_hot.c#484 edit ... //depot/perl/regcomp.c#504 edit ... //depot/perl/regcomp.h#98 edit ... //depot/perl/regcomp.sym#27 edit ... //depot/perl/regexec.c#492 edit ... //depot/perl/regexp.h#73 edit ... //depot/perl/t/op/pat.t#267 edit ... //depot/perl/t/op/regmesg.t#18 edit Differences ... ==== //depot/perl/pod/perldiag.pod#460 (text) ==== Index: perl/pod/perldiag.pod --- perl/pod/perldiag.pod#459~29279~ 2006-11-15 04:41:24.000000000 -0800 +++ perl/pod/perldiag.pod 2006-11-22 09:09:33.000000000 -0800 @@ -2208,11 +2208,10 @@ to check the return value of your socket() call? See L<perlfunc/listen>. -=item Lookbehind longer than %d not implemented in regex; marked by <-- HERE in m/%s/ +=item Lookbehind longer than %d not implemented in regex m/%s/ (F) There is currently a limit on the length of string which lookbehind can -handle. This restriction may be eased in a future release. The <-- HERE -shows in the regular expression about where the problem was discovered. +handle. This restriction may be eased in a future release. =item lstat() on filehandle %s @@ -4786,11 +4785,10 @@ that module. It usually means you put the wrong funny character on the front of your variable. -=item Variable length lookbehind not implemented in regex; marked by <-- HERE in m/%s/ +=item Variable length lookbehind not implemented in m/%s/ (F) Lookbehind is allowed only for subexpressions whose length is fixed and -known at compile time. The <-- HERE shows in the regular expression about -where the problem was discovered. See L<perlre>. +known at compile time. See L<perlre>. =item Variable length character upgraded in print ==== //depot/perl/pod/perlre.pod#115 (text) ==== Index: perl/pod/perlre.pod --- perl/pod/perlre.pod#114~29279~ 2006-11-15 04:41:24.000000000 -0800 +++ perl/pod/perlre.pod 2006-11-22 09:09:33.000000000 -0800 @@ -443,13 +443,25 @@ several patterns that you want to match against consequent substrings of your string, see the previous reference. The actual location where C<\G> will match can also be influenced by using C<pos()> as -an lvalue: see L<perlfunc/pos>. Currently C<\G> is only fully -supported when anchored to the start of the pattern; while it -is permitted to use it elsewhere, as in C</(?<=\G..)./g>, some -such uses (C</.\G/g>, for example) currently cause problems, and -it is recommended that you avoid such usage for now. +an lvalue: see L<perlfunc/pos>. Note that the rule for zero-length +matches is modified somewhat, in that contents to the left of C<\G> is +not counted when determining the length of the match. Thus the following +will not match forever: X<\G> + $str = 'ABC'; + pos($str) = 1; + while (/.\G/g) { + print $&; + } + +It will print 'A' and then terminate, as it considers the match to +be zero-width, and thus will not match at the same position twice in a +row. + +It is worth noting that C<\G> improperly used can result in an infinite +loop. Take care when using patterns that include C<\G> in an alternation. + =head3 Capture buffers The bracketing construct C<( ... )> creates capture buffers. To ==== //depot/perl/pp_hot.c#484 (text) ==== Index: perl/pp_hot.c --- perl/pp_hot.c#483~29336~ 2006-11-21 06:45:19.000000000 -0800 +++ perl/pp_hot.c 2006-11-22 09:09:33.000000000 -0800 @@ -1304,6 +1304,7 @@ const I32 oldsave = PL_savestack_ix; I32 update_minmatch = 1; I32 had_zerolen = 0; + U32 gpos = 0; if (PL_op->op_flags & OPf_STACKED) TARG = POPs; @@ -1355,13 +1356,18 @@ else if (rx->reganch & ROPT_ANCH_GPOS) { r_flags |= REXEC_IGNOREPOS; rx->endp[0] = rx->startp[0] = mg->mg_len; - } - minmatch = (mg->mg_flags & MGf_MINMATCH); + } else if (rx->reganch & ROPT_GPOS_FLOAT) + gpos = mg->mg_len; + else + rx->endp[0] = rx->startp[0] = mg->mg_len; + minmatch = (mg->mg_flags & MGf_MINMATCH) ? rx->gofs + 1 : 0; update_minmatch = 0; } } } - if ((!global && rx->nparens) + /* remove comment to get faster /g but possibly unsafe $1 vars after a + match. Test for the unsafe vars will fail as well*/ + if (( /* !global && */ rx->nparens) || SvTEMP(TARG) || PL_sawampersand || (pm->op_pmflags & PMf_EVAL)) r_flags |= REXEC_COPY_STR; if (SvSCREAM(TARG)) @@ -1369,8 +1375,8 @@ play_it_again: if (global && rx->startp[0] != -1) { - t = s = rx->endp[0] + truebase; - if ((s + rx->minlen) > strend) + t = s = rx->endp[0] + truebase - rx->gofs; + if ((s + rx->minlen) > strend || s < truebase) goto nope; if (update_minmatch++) minmatch = had_zerolen; @@ -1391,7 +1397,7 @@ && !SvROK(TARG)) /* Cannot trust since INTUIT cannot guess ^ */ goto yup; } - if (CALLREGEXEC(rx, (char*)s, (char *)strend, (char*)truebase, minmatch, TARG, NULL, r_flags)) + if (CALLREGEXEC(rx, (char*)s, (char *)strend, (char*)truebase, minmatch, TARG, (void*)gpos, r_flags)) { PL_curpm = pm; if (dynpm->op_pmflags & PMf_ONCE) @@ -1441,14 +1447,14 @@ } if (rx->startp[0] != -1) { mg->mg_len = rx->endp[0]; - if (rx->startp[0] == rx->endp[0]) + if (rx->startp[0] + rx->gofs == rx->endp[0]) mg->mg_flags |= MGf_MINMATCH; else mg->mg_flags &= ~MGf_MINMATCH; } } had_zerolen = (rx->startp[0] != -1 - && rx->startp[0] == rx->endp[0]); + && rx->startp[0] + rx->gofs == rx->endp[0]); PUTBACK; /* EVAL blocks may use stack */ r_flags |= REXEC_IGNOREPOS | REXEC_NOT_FIRST; goto play_it_again; @@ -1475,7 +1481,7 @@ } if (rx->startp[0] != -1) { mg->mg_len = rx->endp[0]; - if (rx->startp[0] == rx->endp[0]) + if (rx->startp[0] + rx->gofs == rx->endp[0]) mg->mg_flags |= MGf_MINMATCH; else mg->mg_flags &= ~MGf_MINMATCH; ==== //depot/perl/regcomp.c#504 (text) ==== Index: perl/regcomp.c --- perl/regcomp.c#503~29324~ 2006-11-20 05:36:10.000000000 -0800 +++ perl/regcomp.c 2006-11-22 09:09:33.000000000 -0800 @@ -370,7 +370,7 @@ * arg. Show regex, up to a maximum length. If it's too long, chop and add * "...". */ -#define FAIL(msg) STMT_START { \ +#define _FAIL(code) STMT_START { \ const char *ellipses = ""; \ IV len = RExC_end - RExC_precomp; \ \ @@ -381,10 +381,17 @@ len = RegexLengthToShowInErrorMessages - 10; \ ellipses = "..."; \ } \ - Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ - msg, (int)len, RExC_precomp, ellipses); \ + code; \ } STMT_END +#define FAIL(msg) _FAIL( \ + Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ + msg, (int)len, RExC_precomp, ellipses)) + +#define FAIL2(msg,arg) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ + arg, (int)len, RExC_precomp, ellipses)) + /* * Simple_vFAIL -- like FAIL, but marks the current location in the scan */ @@ -2426,6 +2433,8 @@ } else data_fake.last_closep = &fake; + + data_fake.pos_delta = delta; next = regnext(scan); scan = NEXTOPER(scan); if (code != BRANCH) @@ -2434,7 +2443,7 @@ cl_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; - } + } if (flags & SCF_WHILEM_VISITED_POS) f |= SCF_WHILEM_VISITED_POS; @@ -3475,6 +3484,7 @@ } else data_fake.last_closep = &fake; + data_fake.pos_delta = delta; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ cl_init(pRExC_state, &intrnl); @@ -3489,10 +3499,10 @@ last, &data_fake, stopparen, recursed, NULL, f, depth+1); if (scan->flags) { if (deltanext) { - vFAIL("Variable length lookbehind not implemented"); + FAIL("Variable length lookbehind not implemented"); } else if (minnext > (I32)U8_MAX) { - vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); } scan->flags = (U8)minnext; } @@ -3546,6 +3556,7 @@ else data_fake.last_closep = &fake; data_fake.flags = 0; + data_fake.pos_delta = delta; if (is_inf) data_fake.flags |= SF_IS_INF; if ( flags & SCF_DO_STCLASS && !scan->flags @@ -3563,10 +3574,10 @@ last, &data_fake, stopparen, recursed, NULL, f,depth+1); if (scan->flags) { if (deltanext) { - vFAIL("Variable length lookbehind not implemented"); + FAIL("Variable length lookbehind not implemented"); } else if (*minnextp > (I32)U8_MAX) { - vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); } scan->flags = (U8)*minnextp; } @@ -3655,6 +3666,19 @@ cl_anything(pRExC_state, data->start_class); flags &= ~SCF_DO_STCLASS; } + else if (OP(scan) == GPOS) { + if (!(RExC_rx->reganch & ROPT_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) + { + if (!(RExC_rx->reganch & ROPT_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->reganch |= ROPT_ANCH_GPOS; + if (RExC_rx->gofs < (U32)min) + RExC_rx->gofs = min; + } else { + RExC_rx->reganch |= ROPT_GPOS_FLOAT; + RExC_rx->gofs = 0; + } + } #ifdef TRIE_STUDY_OPT #ifdef FULL_TRIE_STUDY else if (PL_regkind[OP(scan)] == TRIE) { @@ -3691,7 +3715,7 @@ } else data_fake.last_closep = &fake; - + data_fake.pos_delta = delta; if (flags & SCF_DO_STCLASS) { cl_init(pRExC_state, &this_class); data_fake.start_class = &this_class; @@ -4042,25 +4066,18 @@ #ifdef DEBUGGING /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char); +#else + /* bulk initialize fields with 0. */ + Zero(r, sizeof(regexp), char); #endif - /* initialization begins here */ + + /* non-zero initialization begins here */ r->engine= RE_ENGINE_PTR; r->refcnt = 1; r->prelen = xend - exp; r->precomp = savepvn(RExC_precomp, r->prelen); - r->subbeg = NULL; -#ifdef PERL_OLD_COPY_ON_WRITE - r->saved_copy = NULL; -#endif r->reganch = pm->op_pmflags & PMf_COMPILETIME; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ - r->lastparen = 0; /* mg.c reads this. */ - - r->substrs = 0; /* Useful during FAIL. */ - r->startp = 0; /* Useful during FAIL. */ - r->endp = 0; - r->swap = NULL; - r->paren_names = 0; if (RExC_seen & REG_SEEN_RECURSE) { Newxz(RExC_open_parens, RExC_npar,regnode *); @@ -4235,7 +4252,7 @@ else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->reganch & ROPT_ANCH) ) + !(r->reganch & ROPT_ANCH) && !(RExC_seen & REG_SEEN_EVAL)) { /* turn .* into ^.* with an implied $*=1 */ const int type = @@ -8135,7 +8152,7 @@ PerlIO_putc(Perl_debug_log, ' '); } if (r->reganch & ROPT_GPOS_SEEN) - PerlIO_printf(Perl_debug_log, "GPOS "); + PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", r->gofs); if (r->reganch & ROPT_SKIP) PerlIO_printf(Perl_debug_log, "plus "); if (r->reganch & ROPT_IMPLICIT) ==== //depot/perl/regcomp.h#98 (text) ==== Index: perl/regcomp.h --- perl/regcomp.h#97~29260~ 2006-11-13 06:00:41.000000000 -0800 +++ perl/regcomp.h 2006-11-22 09:09:33.000000000 -0800 @@ -11,10 +11,18 @@ typedef OP OP_4tree; /* Will be redefined later. */ +/* Convert branch sequences to more efficient trie ops? */ #define PERL_ENABLE_TRIE_OPTIMISATION 1 + +/* Be really agressive about optimising patterns with trie sequences? */ #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1 + +/* Should the optimiser take positive assertions into account? */ #define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 1 + +/* Not for production use: */ #define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0 + /* Unless the next line is uncommented it is illegal to combine lazy matching with possessive matching. Frankly it doesn't make much sense to allow it as X*?+ matches nothing, X+?+ matches a single char only, ==== //depot/perl/regcomp.sym#27 (text) ==== Index: perl/regcomp.sym --- perl/regcomp.sym#26~29260~ 2006-11-13 06:00:41.000000000 -0800 +++ perl/regcomp.sym 2006-11-22 09:09:33.000000000 -0800 @@ -182,7 +182,6 @@ COMMIT VERB, no-sv 1 Pattern fails outright if backtracking through this CUTGROUP VERB, no-sv 1 On failure go to the next alternation in the group - # NEW STUFF ABOVE THIS LINE -- Please update counts below. ################################################################################ ==== //depot/perl/regexec.c#492 (text) ==== Index: perl/regexec.c --- perl/regexec.c#491~29324~ 2006-11-20 05:36:10.000000000 -0800 +++ perl/regexec.c 2006-11-22 09:09:33.000000000 -0800 @@ -1635,7 +1635,9 @@ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ /* minend: end of match must be >=minend after stringarg. */ -/* data: May be used for some additional optimizations. */ +/* data: May be used for some additional optimizations. + Currently its only used, with a U32 cast, for transmitting + the ganch offset when doing a /g match. This will change */ /* nosave: For optimizations. */ { dVAR; @@ -1711,7 +1713,7 @@ MAGIC *mg; if (flags & REXEC_IGNOREPOS) /* Means: check only at start */ - reginfo.ganch = startpos; + reginfo.ganch = startpos + prog->gofs; else if (sv && SvTYPE(sv) >= SVt_PVMG && SvMAGIC(sv) && (mg = mg_find(sv, PERL_MAGIC_regex_global)) @@ -1720,10 +1722,12 @@ if (prog->reganch & ROPT_ANCH_GPOS) { if (s > reginfo.ganch) goto phooey; - s = reginfo.ganch; + s = reginfo.ganch - prog->gofs; } } - else /* pos() not defined */ + else if (data) { + reginfo.ganch = strbeg + (UV)data; + } else /* pos() not defined */ reginfo.ganch = strbeg; } if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) { @@ -1810,7 +1814,8 @@ /* the warning about reginfo.ganch being used without intialization is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN and we only enter this block when the same bit is set. */ - if (regtry(®info, ®info.ganch)) + char *tmp_s = reginfo.ganch - prog->gofs; + if (regtry(®info, &tmp_s)) goto got_it; goto phooey; } @@ -2623,6 +2628,7 @@ during a successfull match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; + /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop @@ -2643,7 +2649,7 @@ GET_RE_DEBUG_FLAGS_DECL; #endif - DEBUG_STACK_r( { + DEBUG_OPTIMISE_r( { PerlIO_printf(Perl_debug_log,"regmatch start\n"); }); /* on first ever call to regmatch, allocate first slab */ @@ -4688,6 +4694,7 @@ (long)(locinput - PL_reg_starttry), (long)(reginfo->till - PL_reg_starttry), PL_colors[5])); + sayNO_SILENT; /* Cannot match: too short. */ } PL_reginput = locinput; /* put where regtry can find it */ ==== //depot/perl/regexp.h#73 (text) ==== Index: perl/regexp.h --- perl/regexp.h#72~29295~ 2006-11-16 05:35:14.000000000 -0800 +++ perl/regexp.h 2006-11-22 09:09:33.000000000 -0800 @@ -54,6 +54,7 @@ I32 refcnt; I32 minlen; /* mininum possible length of string to match */ I32 minlenret; /* mininum possible length of $& */ + U32 gofs; /* chars left of pos that we search from */ I32 prelen; /* length of precomp */ U32 nparens; /* number of parentheses */ U32 lastparen; /* last paren matched */ @@ -114,6 +115,7 @@ #define ROPT_MATCH_UTF8 0x10000000 /* subbeg is utf-8 */ #define ROPT_VERBARG_SEEN 0x20000000 #define ROPT_CUTGROUP_SEEN 0x40000000 +#define ROPT_GPOS_FLOAT 0x80000000 #define RE_USE_INTUIT_NOML 0x00100000 /* Best to intuit before matching */ #define RE_USE_INTUIT_ML 0x00200000 ==== //depot/perl/t/op/pat.t#267 (xtext) ==== Index: perl/t/op/pat.t --- perl/t/op/pat.t#266~29333~ 2006-11-21 01:15:43.000000000 -0800 +++ perl/t/op/pat.t 2006-11-22 09:09:33.000000000 -0800 @@ -4053,10 +4053,10 @@ { local $Message="RT#22395"; our $count; - for my $l (1,10,100,1000) { + for my $l (10,100,1000) { $count=0; ('a' x $l) =~ /(.*)(?{$count++})[bc]/; - iseq($l+1,$count,"Should be L+1 not L*(L+3)/2 (L=$l)"); + iseq( $count, $l + 1, "# TODO Should be L+1 not L*(L+3)/2 (L=$l)"); } } { @@ -4083,6 +4083,17 @@ iseq($count,3); iseq($text,' word2 word4 word6 '); } +{ + # RT#6893 + local $_= qq(A\nB\nC\n); + my @res; + while (m#(\G|\n)([^\n]*)\n#gsx) + { + push @res,"$2"; + last if @res>3; + } + iseq("@res","A B C","RT#6893: /g pattern shouldn't infinite loop"); +} { # From Message-ID: <[EMAIL PROTECTED]> @@ -4094,6 +4105,13 @@ iseq($dow_name,$time_string,"UTF8 trie common prefix extraction"); } +{ + my $v; + ($v='bar')=~/(\w+)/g; + $v='foo'; + iseq("$1",'bar','# TODO $1 is safe after /g - may fail due to specialized config in pp_hot.c') +} + # Test counter is at bottom of file. Put new tests above here. #------------------------------------------------------------------- # Keep the following tests last -- they may crash perl @@ -4137,9 +4155,10 @@ } # Put new tests above the dotted line about a page above this comment - +iseq(0+$::test,$::TestCount,"Got the right number of tests!"); # Don't forget to update this! BEGIN { - $::TestCount = 1367; + $::TestCount = 1369; print "1..$::TestCount\n"; } + ==== //depot/perl/t/op/regmesg.t#18 (text) ==== Index: perl/t/op/regmesg.t --- perl/t/op/regmesg.t#17~29026~ 2006-10-16 05:55:28.000000000 -0700 +++ perl/t/op/regmesg.t 2006-11-22 09:09:33.000000000 -0800 @@ -28,9 +28,9 @@ ( '/[[=foo=]]/' => 'POSIX syntax [= =] is reserved for future extensions in regex; marked by {#} in m/[[=foo=]{#}]/', - '/(?<= .*)/' => 'Variable length lookbehind not implemented in regex; marked by {#} in m/(?<= .*){#}/', + '/(?<= .*)/' => 'Variable length lookbehind not implemented in regex m/(?<= .*)/', - '/(?<= x{1000})/' => 'Lookbehind longer than 255 not implemented in regex; marked by {#} in m/(?<= x{1000}){#}/', + '/(?<= x{1000})/' => 'Lookbehind longer than 255 not implemented in regex m/(?<= x{1000})/', '/(?@)/' => 'Sequence ([EMAIL PROTECTED]) not implemented in regex; marked by {#} in m/([EMAIL PROTECTED])/', End of Patch.