In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/44abdc3a8a76a7ca901b7f513b33f72cc9f21495?hp=3c1b037f652c65ec12fbb10e4d08e0b1599b6aa5>
- Log ----------------------------------------------------------------- commit 44abdc3a8a76a7ca901b7f513b33f72cc9f21495 Author: Karl Williamson <k...@cpan.org> Date: Sun Dec 16 12:38:28 2018 -0700 regcomp.c: Tighten embedded patterns in regex sets In the (?[ ... ]) regex sets feature, one can embed another compiled regex set pattern. Such compiled patterns always have a flag of '^', which we weren't looking for prior to this commit. That meant that uncompiled patterns would be mistaken for compiled ones. commit 1c484ed4bdbb071c7ad4fc9a259c1aa027d27719 Author: Karl Williamson <k...@cpan.org> Date: Sun Dec 16 12:36:02 2018 -0700 perlrecharclass: Clarify embedding in regex sets commit fdec910a2ad5554e242cfe0137c49704248664df Author: Karl Williamson <k...@cpan.org> Date: Sun Dec 16 12:33:15 2018 -0700 perlre: Italicize variable text ----------------------------------------------------------------------- Summary of changes: pod/perlre.pod | 2 +- pod/perlrecharclass.pod | 30 +++++++++++++++--------------- regcomp.c | 19 ++++++++----------- t/re/reg_mesg.t | 4 ++-- 4 files changed, 26 insertions(+), 29 deletions(-) diff --git a/pod/perlre.pod b/pod/perlre.pod index e97d6c87df..5329df2f86 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -2051,7 +2051,7 @@ Full syntax: C<< (?(<name>)then|else) >> Checks whether the pattern matches (or does not match, for the C<"!"> variants). -Full syntax: C<< (?(?=lookahead)then|else) >> +Full syntax: C<< (?(?=I<lookahead>)I<then>|I<else>) >> =item C<(?{ I<CODE> })> diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index 225a092c05..fb9dc432b0 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -1171,8 +1171,15 @@ closing C<])> characters. Just as in all regular expressions, the pattern can be built up by including variables that are interpolated at regex compilation time. -Care must be taken to ensure that you are getting what you expect. For -example: +But it's best to compile each sub-component. 
+ + my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; + my $lower = qr/(?[ \p{Lower} + \p{Digit} ])/; + +When these are embedded in another pattern, what they match does not +change, regardless of parenthesization or what modifiers are in effect +in that outer pattern. If you fail to compile the subcomponents, you +can get some nasty surprises. For example: my $thai_or_lao = '\p{Thai} + \p{Lao}'; ... @@ -1182,10 +1189,10 @@ compiles to qr/(?[ \p{Digit} & \p{Thai} + \p{Lao} ])/; -But this does not have the effect that someone reading the code would -likely expect, as the intersection applies just to C<\p{Thai}>, -excluding the Laotian. Pitfalls like this can be avoided by -parenthesizing the component pieces: +But this does not have the effect that someone reading the source code +would likely expect, as the intersection applies just to C<\p{Thai}>, +excluding the Laotian. It's best to compile the subcomponents, but you +could also parenthesize the component pieces: my $thai_or_lao = '( \p{Thai} + \p{Lao} )'; @@ -1194,15 +1201,8 @@ But any modifiers will still apply to all the components: my $lower = '\p{Lower} + \p{Digit}'; qr/(?[ \p{Greek} & $lower ])/i; -matches upper case things. You can avoid surprises by making the -components into instances of this construct by compiling them: - - my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; - my $lower = qr/(?[ \p{Lower} + \p{Digit} ])/; - -When these are embedded in another pattern, what they match does not -change, regardless of parenthesization or what modifiers are in effect -in that outer pattern. +matches upper case things. So just compile the subcomponents, as +illustrated above. Due to the way that Perl parses things, your parentheses and brackets may need to be balanced, even including comments. 
If you run into any diff --git a/regcomp.c b/regcomp.c index 0fc793626f..83e7029d50 100644 --- a/regcomp.c +++ b/regcomp.c @@ -15822,10 +15822,11 @@ redo_curchar: case '(': - if ( RExC_parse < RExC_end - 1 - && (UCHARAT(RExC_parse + 1) == '?')) + if ( RExC_parse < RExC_end - 2 + && UCHARAT(RExC_parse + 1) == '?' + && UCHARAT(RExC_parse + 2) == '^') { - /* If is a '(?', could be an embedded '(?flags:(?[...])'. + /* If is a '(?', could be an embedded '(?^flags:(?[...])'. * This happens when we have some thing like * * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; @@ -15843,14 +15844,11 @@ redo_curchar: RExC_parse += 2; /* Skip past the '(?' */ save_parse = RExC_parse; - /* Parse any flags for the '(?' */ + /* Parse the flags for the '(?'. We already know the first + * flag to parse is a '^' */ parse_lparen_question_flags(pRExC_state); - if (RExC_parse == save_parse /* Makes sure there was at - least one flag (or else - this embedding wasn't - compiled) */ - || RExC_parse >= RExC_end - 4 + if ( RExC_parse >= RExC_end - 4 || UCHARAT(RExC_parse) != ':' || UCHARAT(++RExC_parse) != '(' || UCHARAT(++RExC_parse) != '?' @@ -15859,8 +15857,7 @@ redo_curchar: /* In combination with the above, this moves the * pointer to the point just after the first erroneous - * character (or if there are no flags, to where they - * should have been) */ + * character. */ if (RExC_parse >= RExC_end - 4) { RExC_parse = RExC_end; } diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index 81f1441ae2..13a37b534b 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -261,8 +261,8 @@ my @death = 'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" {#} m/(?[[a-\pM{#}]])/', 'm/(?[[\pM-x]])/' => 'False [] range "\pM-" {#} m/(?[[\pM-{#}x]])/', 'm/(?[[^\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in inverted character class or as a range end-point is restricted to one character {#} m/(?[[^\N{U+100.300{#}}]])/', - 'm/(?[ \p{Digit} & (?(?[ \p{Thai} | \p{Lao} ]))])/' => 'Sequence (?(...) 
not recognized {#} m/(?[ \p{Digit} & (?({#}?[ \p{Thai} | \p{Lao} ]))])/', - 'm/(?[ \p{Digit} & (?:(?[ \p{Thai} | \p{Lao} ]))])/' => 'Expecting \'(?flags:(?[...\' {#} m/(?[ \p{Digit} & (?{#}:(?[ \p{Thai} | \p{Lao} ]))])/', + 'm/(?[ \p{Digit} & (?^(?[ \p{Thai} | \p{Lao} ]))])/' => 'Sequence (?^(...) not recognized {#} m/(?[ \p{Digit} & (?^({#}?[ \p{Thai} | \p{Lao} ]))])/', + 'm/(?[ \p{Digit} & (?(?[ \p{Thai} | \p{Lao} ]))])/' => 'Unexpected character {#} m/(?[ \p{Digit} & (?{#}(?[ \p{Thai} | \p{Lao} ]))])/', 'm/\o{/' => 'Missing right brace on \o{ {#} m/\o{{#}/', 'm/\o/' => 'Missing braces on \o{} {#} m/\o{#}/', 'm/\o{}/' => 'Number with no digits {#} m/\o{}{#}/', -- Perl5 Master Repository