In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/77c8f26370dcc0e16ca7c5f1b8f3bd1b99a57a28?hp=afa4768ac90fcd6a53a3661885a238d344a02f92>
- Log ----------------------------------------------------------------- commit 77c8f26370dcc0e16ca7c5f1b8f3bd1b99a57a28 Author: Karl Williamson <k...@cpan.org> Date: Thu Jan 12 11:07:47 2017 -0700 Add /xx regex pattern modifier This was first proposed in the thread starting at http://www.nntp.perl.org/group/perl.perl5.porters/2014/09/msg219394.html M MANIFEST M ext/re/re.pm M ext/re/t/reflags.t M pod/perlcheat.pod M pod/perldelta.pod M pod/perldiag.pod M pod/perlop.pod M pod/perlre.pod M pod/perlrecharclass.pod M pod/perlretut.pod M pod/perlstyle.pod M pod/perluniintro.pod M regcomp.c M regexp.h A t/re/keep_tabs.t M t/re/re_tests M t/re/reg_mesg.t M toke.c commit 2ab076704905c338cc874079818784698cd5bc85 Author: Karl Williamson <k...@cpan.org> Date: Fri Jan 13 11:17:25 2017 -0700 perlre: Clarifications, typos M pod/perlre.pod commit 563642b4907d9b1b6beaa96b472ae787ae81d56f Author: Karl Williamson <k...@cpan.org> Date: Tue Jan 10 19:10:58 2017 -0700 perlretut: Add some white space for legibility M pod/perlretut.pod commit fc54a9b2090b5f71905241c319706e3cca18acc9 Author: Karl Williamson <k...@cpan.org> Date: Wed Jan 11 22:18:53 2017 -0700 regcomp.c: Remove obsolete data structure element This was used for the removed feature of having the source in a different encoding. M regcomp.c commit af4226f2469589e84d382a08a6a5242303758f0b Author: Karl Williamson <k...@cpan.org> Date: Thu Jan 12 21:05:35 2017 -0700 pp_sys.c: White space only Vertically align some components of an 'if' for readability. M pp_sys.c ----------------------------------------------------------------------- Summary of changes: MANIFEST | 1 + ext/re/re.pm | 23 ++++++++++----- ext/re/t/reflags.t | 17 ++++++++--- pod/perlcheat.pod | 2 +- pod/perldelta.pod | 9 +++++- pod/perldiag.pod | 8 ----- pod/perlop.pod | 4 ++- pod/perlre.pod | 77 ++++++++++++++++++++++++++++++++++++++++++++----- pod/perlrecharclass.pod | 38 ++++++++++++++++-------- pod/perlretut.pod | 30 +++++++++++++++---- pod/perlstyle.pod | 5 ++-- pod/perluniintro.pod | 2 +- pp_sys.c | 10 +++---- regcomp.c | 33 ++++++++++++--------- regexp.h | 14 +++++++-- t/re/keep_tabs.t | 29 +++++++++++++++++++ t/re/re_tests | 12 ++++++++ t/re/reg_mesg.t | 8 ++--- toke.c | 8 ----- 19 files changed, 245 insertions(+), 85 deletions(-) create mode 100644 t/re/keep_tabs.t diff --git a/MANIFEST b/MANIFEST index d31ee1a8ed..4745b04ede 100644 --- a/MANIFEST +++ b/MANIFEST @@ -5726,6 +5726,7 @@ t/porting/utils.t Check that utility scripts still compile t/re/anyof.t See if bracketed char classes [...] compile properly t/re/charset.t See if regex modifiers like /d, /u work properly t/re/fold_grind.t See if case folding works properly +t/re/keep_tabs.t Tests where \t can't be expanded. t/re/no_utf8_pm.t Verify utf8.pm doesn't get loaded unless required t/re/overload.t Test against string corruption in pattern matches on overloaded objects t/re/pat.t See if esoteric patterns work diff --git a/ext/re/re.pm b/ext/re/re.pm index b924fd9fc7..123408c76f 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -4,7 +4,7 @@ package re; use strict; use warnings; -our $VERSION = "0.33"; +our $VERSION = "0.34"; our @ISA = qw(Exporter); our @EXPORT_OK = ('regmust', qw(is_regexp regexp_pattern @@ -23,6 +23,7 @@ my %reflags = ( s => 1 << ($PMMOD_SHIFT + 1), i => 1 << ($PMMOD_SHIFT + 2), x => 1 << ($PMMOD_SHIFT + 3), + xx => 1 << ($PMMOD_SHIFT + 4), n => 1 << ($PMMOD_SHIFT + 5), p => 1 << ($PMMOD_SHIFT + 6), strict => 1 << ($PMMOD_SHIFT + 10), @@ -112,7 +113,6 @@ sub bits { my $on = shift; my $bits = 0; my $turning_all_off = ! @_ && ! $on; - my %seen; # Has flag already been seen? if ($turning_all_off) { # Pretend were called with certain parameters, which are best dealt @@ -180,6 +180,7 @@ sub bits { } elsif ($s =~ s/^\///) { my $reflags = $^H{reflags} || 0; my $seen_charset; + my $x_count = 0; while ($s =~ m/( . )/gx) { local $_ = $1; if (/[adul]/) { @@ -225,7 +226,19 @@ sub bits { && $^H{reflags_charset} == $reflags{$_}; } } elsif (exists $reflags{$_}) { - $seen{$_}++; + if ($_ eq 'x') { + $x_count++; + if ($x_count > 2) { + require Carp; + Carp::carp( + qq 'The "x" flag may only appear a maximum of twice' + ); + } + elsif ($x_count == 2) { + $_ = 'xx'; # First time through got the /x + } + } + $on ? $reflags |= $reflags{$_} : ($reflags &= ~$reflags{$_}); @@ -247,10 +260,6 @@ sub bits { ")"); } } - if (exists $seen{'x'} && $seen{'x'} > 1) { - require Carp; - Carp::croak("Only one /x regex modifier is allowed"); - } if ($turning_all_off) { _load_unload(0); diff --git a/ext/re/t/reflags.t b/ext/re/t/reflags.t index a481c98799..595b4b28b4 100644 --- a/ext/re/t/reflags.t +++ b/ext/re/t/reflags.t @@ -11,7 +11,7 @@ BEGIN { use strict; -use Test::More tests => 67; +use Test::More tests => 74; my @flags = qw( a d l u ); @@ -24,10 +24,19 @@ ok "Foo" !~ /(??{'foo'})/, 'no re "/i" (??{})'; use re '/x'; ok "foo" =~ / foo /, 'use re "/x"'; ok "foo" =~ / (??{' foo '}) /, 'use re "/x" (??{})'; +like " ", qr/[a b]/, 'use re "/x" [a b]'; no re '/x'; ok "foo" !~ / foo /, 'no re "/x"'; ok "foo" !~ /(??{' foo '})/, 'no re "/x" (??{})'; ok "foo" !~ / (??{'foo'}) /, 'no re "/x" (??{})'; +use re '/xx'; +ok "foo" =~ / foo /, 'use re "/xx"'; +ok "foo" =~ / (??{' foo '}) /, 'use re "/xx" (??{})'; +unlike " ", qr/[a b]/, 'use re "/xx" [a b] # Space in [] gobbled up'; +no re '/xx'; +ok "foo" !~ / foo /, 'no re "/xx"'; +ok "foo" !~ /(??{' foo '})/, 'no re "/xx" (??{})'; +ok "foo" !~ / (??{'foo'}) /, 'no re "/xx" (??{})'; use re '/s'; ok "\n" =~ /./, 'use re "/s"'; ok "\n" =~ /(??{'.'})/, 'use re "/s" (??{})'; @@ -178,8 +187,8 @@ is qr//, '(?^:)', 'no re "/aai"'; "warning with eval \"use re \"/amaa\""; $w = ""; - eval "use re '/xamax'"; - like $@, qr/Only one \/x regex modifier is allowed/, - "error with eval \"use re \"/xamax\""; + eval "use re '/xamaxx'"; + like $w, qr/The "x" flag may only appear a maximum of twice/, + "warning with eval \"use re \"/xamaxx\""; } diff --git a/pod/perlcheat.pod b/pod/perlcheat.pod index 6e4e919ff5..99a8dfc547 100644 --- a/pod/perlcheat.pod +++ b/pod/perlcheat.pod @@ -41,7 +41,7 @@ already be overwhelming. && /i case insensitive ^ string begin || // /m line based ^$ $ str end (bfr \n) .. ... /s . includes \n + one or more - ?: /x ignore wh.space * zero or more + ?: /x /xx ign. wh.space * zero or more = += last goto /p preserve ? zero or one , => /a ASCII /aa safe {3,7} repeat in range list ops /l locale /d dual | alternation diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 460d118a1d..86b7e9a00f 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -25,7 +25,14 @@ XXX New core language features go here. Summarize user-visible core language enhancements. Particularly prominent performance optimisations could go here, but most should go in the L</Performance Enhancements> section. -[ List each enhancement as a =head2 entry ] +=head2 New regular expression modifier C</xx> + +Specifying two C<x> characters to modify a regular expression pattern +does everything that a single one does, but additionally TAB and SPACE +characters within a bracketed character class are generally ignored and +can be added to improve readability, like +S<C</[ ^ A-Z d-f p-x ]/xx>>. Details are at +L<perlre/E<sol>x and E<sol>xx>. =head1 Security diff --git a/pod/perldiag.pod b/pod/perldiag.pod index fe5ff9bcc4..7d6675c1dc 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -4230,14 +4230,6 @@ C<sysread()>ing a file, or when seeking past the end of a scalar opened for I/O (in anticipation of future reads and to imitate the behavior with real files). -=item Only one /x regex modifier is allowed - -=item Only one /x regex modifier is allowed in regex; marked by <-- HERE in m/%s/ - -(F) You used the C</x> regular expression pattern modifier at least twice in a -string of modifiers. This has been made illegal, in order to allow future -extensions to the Perl language. - =item %s() on unopened %s (W unopened) An I/O operation was attempted on a filehandle that was diff --git a/pod/perlop.pod b/pod/perlop.pod index 82dca55d52..3cf9db67e6 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -1743,7 +1743,9 @@ Options (specified by the following modifiers) are: m Treat string as multiple lines. s Treat string as single line. (Make . match a newline) i Do case-insensitive pattern matching. - x Use extended regular expressions. + x Use extended regular expressions; specifying two + x's means \t and the SPACE character are ignored within + square-bracketed character classes p When matching preserve a copy of the matched string so that ${^PREMATCH}, ${^MATCH}, ${^POSTMATCH} will be defined (ignored starting in v5.20) as these are always diff --git a/pod/perlre.pod b/pod/perlre.pod index 10783a30b8..e3fc62d305 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -86,11 +86,11 @@ inverted, which otherwise could be highly confusing. See L<perlrecharclass/Bracketed Character Classes>, and L<perlrecharclass/Negation>. -=item B<C<x>> +=item B<C<x>> and B<C<xx>> X</x> Extend your pattern's legibility by permitting whitespace and comments. -Details in L</"/x"> +Details in L</E<sol>x and E<sol>xx> =item B<C<p>> X</p> X<regex, preserve> X<regexp, preserve> @@ -144,7 +144,6 @@ L<perlretut/"Using regular expressions in Perl"> are: g - globally match the pattern repeatedly in the string Substitution-specific modifiers described in - L<perlop/"s/PATTERN/REPLACEMENT/msixpodualngcer"> are: e - evaluate the right-hand side as an expression @@ -165,12 +164,12 @@ the C<(?...)> construct, see L</Extended Patterns> below. Some of the modifiers require more explanation than given in the L</Overview> above. -=head4 /x +=head4 C</x> and C</xx> -C</x> tells +A single C</x> tells the regular expression parser to ignore most whitespace that is neither backslashed nor within a bracketed character class. You can use this to -break up your regular expression into (slightly) more readable parts. +break up your regular expression into more readable parts. Also, the C<"#"> character is treated as a metacharacter introducing a comment that runs up to the pattern's closing delimiter, or to the end of the current line if the pattern extends onto the next line. Hence, @@ -190,6 +189,24 @@ You can use L</(?#text)> to create a comment that ends earlier than the end of the current line, but C<text> also can't contain the closing delimiter unless escaped with a backslash. +A common pitfall is to forget that C<#> characters begin a comment under +C</x> and are not matched literally. Just keep that in mind when trying +to puzzle out why a particular C</x> pattern isn't working as expected. + +Starting in Perl v5.26, if the modifier has a second C<x> within it, +it does everything that a single C</x> does, but additionally +non-backslashed SPACE and TAB characters within bracketed character +classes are also generally ignored, and hence can be added to make the +classes more readable. + + / [d-e g-i 3-7]/xx + /[ ! @ " # $ % ^ & * () = ? <> ' ]/xx + +may be easier to grasp than the squashed equivalents + + /[d-eg-i3-7]/ + /[!@"#$%^&*()=?<>']/ + Taken together, these features go a long way towards making Perl's regular expressions more readable. Here's an example: @@ -554,7 +571,6 @@ meanings: X<metacharacter> X<\> X<^> X<.> X<$> X<|> X<(> X<()> X<[> X<[]> - \ Quote the next metacharacter ^ Match the beginning of the line . Match any character (except newline) @@ -1075,13 +1091,30 @@ a backslash if it appears in the comment. See L</E<sol>x> for another way to have comments in patterns. +Note that a comment can go just about anywhere, except in the middle of +an escape sequence. Examples: + + qr/foo(?#comment)bar/' # Matches 'foobar' + + # The pattern below matches 'abcd', 'abccd', or 'abcccd' + qr/abc(?#comment between literal and its quantifier){1,3}d/ + + # The pattern below generates a syntax error, because the '\p' must + # be followed immediately by a '{'. + qr/\p(?#comment between \p and its property name){Any}/ + + # The pattern below generates a syntax error, because the initial + # '\(' is a literal opening parenthesis, and so there is nothing + # for the closing ')' to match + qr/\(?#the backslash means this isn't a comment)p{Any}/ + =item C<(?adlupimnsx-imnsx)> =item C<(?^alupimnsx)> X<(?)> X<(?^)> One or more embedded pattern-match modifiers, to be turned on (or -turned off, if preceded by C<"-">) for the remainder of the pattern or +turned off if preceded by C<"-">) for the remainder of the pattern or the remainder of the enclosing pattern group (if any). This is particularly useful for dynamically-generated patterns, @@ -1111,6 +1144,29 @@ These modifiers do not carry over into named subpatterns called in the enclosing group. In other words, a pattern such as C<((?i)(?&NAME))> does not change the case-sensitivity of the C<"NAME"> pattern. +A modifier is overridden by later occurrences of this construct in the +same scope containing the same modifier, so that + + /((?im)foo(?-m)bar)/ + +matches all of C<foobar> case insensitively, but uses C</m> rules for +only the C<foo> portion. The C<a> flag overrides C<aa> as well; +likewise C<aa> overrides C<a>. The same goes for C<x> and C<xx>. +Hence, in + + /(?-x)foo/xx + +both C</x> and C</xx> are turned off during matching C<foo>. And in + + /(?x)foo/x + +C</x> but NOT C</xx> is turned on for matching C<foo>. (One might +mistakenly think that since the inner C<(?x)> is already in the scope of +C</x>, that the result would effectively be the sum of them, yielding +C</xx>. It doesn't work that way.) Similarly, doing something like +C<(?xx-x)foo> turns off all C<x> behavior for matching C<foo>, it is not +that you subtract 1 C<x> from 2 to get 1 C<x> remaining. + Any of these modifiers can be set to apply globally to all regular expressions compiled within the scope of a C<use re>. See L<re/"'/flags' mode">. @@ -1165,6 +1221,11 @@ is equivalent to the more verbose Note that any C<()> constructs enclosed within this one will still capture unless the C</n> modifier is in effect. +Like the L</(?adlupimnsx-imnsx)> construct, C<aa> and C<a> override each +other, as do C<xx> and C<x>. They are not additive. So, doing +something like C<(?xx-x:foo)> turns off all C<x> behavior for matching +C<foo>. + Starting in Perl 5.14, a C<"^"> (caret or circumflex accent) immediately after the C<"?"> is a shorthand equivalent to C<d-imnsx>. Any positive flags (except C<"d">) may follow the caret, so diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index 93bb2e5e63..1c07632dec 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -576,6 +576,29 @@ Examples: # containing just [, and the character class is # followed by a ]. +=head3 Bracketed Character Classes and the C</xx> pattern modifier + +Normally SPACE and TAB characters have no special meaning inside a +bracketed character class; they are just added to the list of characters +matched by the class. But if the L<C</xx>|perlre/E<sol>x and E<sol>xx> +pattern modifier is in effect, they are generally ignored and can be +added to improve readability. They can't be added in the middle of a +single construct: + + / [ \x{10 FFFF} ] /xx # WRONG! + +The SPACE in the middle of the hex constant is illegal. + +To specify a literal SPACE character, you can escape it with a +backslash, like: + + /[ a e i o u \ ]/xx + +This matches the English vowels plus the SPACE character. + +For clarity, you should already have been using C<\t> to specify a +literal tab, and C<\t> is unaffected by C</xx>. + =head3 Character Ranges It is not uncommon to want to match a range of characters. Luckily, instead @@ -1016,7 +1039,7 @@ We can extend the example above: This matches digits that are in either the Thai or Laotian scripts. Notice the white space in these examples. This construct always has -the C<E<sol>x> modifier turned on within it. +the C<E<sol>xx> modifier turned on within it. The available binary operators are: @@ -1061,18 +1084,9 @@ C<\N{...}>, etc.) This last example shows the use of this construct to specify an ordinary bracketed character class without additional set operations. Note the -white space within it; a limited version of C<E<sol>x> is turned on even -within bracketed character classes, with only the SPACE and TAB (C<\t>) -characters allowed, and no comments. Hence, - - (?[ [#] ]) +white space within it. This is allowed because C<E<sol>xx> is +automatically turned on within this construct. -matches the literal character "#". To specify a literal white space character, -you can escape it with a backslash, like: - - /(?[ [ a e i o u \ ] ])/ - -This matches the English vowels plus the SPACE character. All the other escapes accepted by normal bracketed character classes are accepted here as well; but unrecognized escapes that generate warnings in normal classes are fatal errors here. diff --git a/pod/perlretut.pod b/pod/perlretut.pod index d74276c91d..9c7ab56042 100644 --- a/pod/perlretut.pod +++ b/pod/perlretut.pod @@ -1479,7 +1479,7 @@ we can rewrite our 'extended' regexp in the more pleasing form |\.\d+ # mantissa of the form .b |\d+ # integer of the form a ) - ([eE][+-]?\d+)? # finally, optionally match an exponent + ( [eE] [+-]? \d+ )? # finally, optionally match an exponent $/x; If whitespace is mostly irrelevant, how does one include space @@ -1497,7 +1497,7 @@ this to our regexp as follows: |\.\d+ # mantissa of the form .b |\d+ # integer of the form a ) - ([eE][+-]?\d+)? # finally, optionally match an exponent + ( [eE] [+-]? \d+ )? # finally, optionally match an exponent $/x; In this form, it is easier to see a way to simplify the @@ -1513,10 +1513,28 @@ could be factored out: )? # ? takes care of integers of the form a |\.\d+ # mantissa of the form .b ) - ([eE][+-]?\d+)? # finally, optionally match an exponent + ( [eE] [+-]? \d+ )? # finally, optionally match an exponent $/x; -or written in the compact form, +Starting in Perl v5.26, specifying C</xx> changes the square-bracketed +portions of a pattern to ignore tabs and space characters unless they +are escaped by preceding them with a backslash. So, we could write + + /^ + [ + - ]?\ * # first, match an optional sign + ( # then match integers or f.p. mantissas: + \d+ # start out with a ... + ( + \.\d* # mantissa of the form a.b or a. + )? # ? takes care of integers of the form a + |\.\d+ # mantissa of the form .b + ) + ( [ e E ] [ + - ]? \d+ )? # finally, optionally match an exponent + $/xx; + +This doesn't really improve the legibility of this example, but it's +available in case you want it. Squashing the pattern down to the +compact form, we have /^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/; @@ -2379,7 +2397,7 @@ enclosed in parentheses up to two levels deep. Then the following regexp matches: $x = "abc(de(fg)h"; # unbalanced parentheses - $x =~ /\( ( [^()]+ | \([^()]*\) )+ \)/x; + $x =~ /\( ( [ ^ () ]+ | \( [ ^ () ]* \) )+ \)/xx; The regexp matches an open parenthesis, one or more copies of an alternation, and a close parenthesis. The alternation is two-way, with @@ -2393,7 +2411,7 @@ was no match possible. To prevent the exponential blowup, we need to prevent useless backtracking at some point. This can be done by enclosing the inner quantifier as an independent subexpression: - $x =~ /\( ( (?>[^()]+) | \([^()]*\) )+ \)/x; + $x =~ /\( ( (?> [ ^ () ]+ ) | \([ ^ () ]* \) )+ \)/xx; Here, C<< (?>[^()]+) >> breaks the degeneracy of string partitioning by gobbling up as much of the string as possible and keeping it. Then diff --git a/pod/perlstyle.pod b/pod/perlstyle.pod index 37dfaaf141..5c2534581e 100644 --- a/pod/perlstyle.pod +++ b/pod/perlstyle.pod @@ -210,8 +210,9 @@ function should not be used outside the package that defined it. =item * -If you have a really hairy regular expression, use the C</x> modifier and -put in some whitespace to make it look a little less like line noise. +If you have a really hairy regular expression, use the C</x> or C</xx> +modifiers and put in some whitespace to make it look a little less like +line noise. Don't use slash as a delimiter when your regexp has slashes or backslashes. =item * diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index 5b571fbbc1..ef4d07d1d6 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -645,7 +645,7 @@ Starting in v5.22, you can use Unicode code points as the end points of regular expression pattern character ranges, and the range will include all Unicode code points that lie between those end points, inclusive. - qr/ [\N{U+03]-\N{U+20}] /x + qr/ [ \N{U+03} - \N{U+20} ] /xx includes the code points C<\N{U+03}>, C<\N{U+04}>, ..., C<\N{U+20}>. diff --git a/pp_sys.c b/pp_sys.c index c0ef29812e..11193bcbc6 100644 --- a/pp_sys.c +++ b/pp_sys.c @@ -3598,14 +3598,14 @@ PP(pp_fttext) } else #endif - if (isPRINT_A(*s) - /* VT occurs so rarely in text, that we consider it odd */ - || (isSPACE_A(*s) && *s != VT_NATIVE) + if ( isPRINT_A(*s) + /* VT occurs so rarely in text, that we consider it odd */ + || (isSPACE_A(*s) && *s != VT_NATIVE) /* But there is a fair amount of backspaces and escapes in * some text */ - || *s == '\b' - || *s == ESC_NATIVE) + || *s == '\b' + || *s == ESC_NATIVE) { continue; } diff --git a/regcomp.c b/regcomp.c index 64d8907da0..2114773357 100644 --- a/regcomp.c +++ b/regcomp.c @@ -271,7 +271,6 @@ struct RExC_state_t { (pRExC_state->study_chunk_recursed_bytes) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define RExC_contains_locale (pRExC_state->contains_locale) -#define RExC_override_recoding (pRExC_state->override_recoding) #ifdef EBCDIC # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) #endif @@ -6517,8 +6516,12 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, *p++ = pat[s]; } *p++ = '\''; - if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) { *p++ = 'x'; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) { + *p++ = 'x'; + } + } *p++ = '\0'; DEBUG_COMPILE_r({ Perl_re_printf( aTHX_ @@ -7006,7 +7009,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; - RExC_override_recoding = 0; #ifdef EBCDIC RExC_recode_x_to_native = 0; #endif @@ -7164,7 +7166,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, == REG_RUN_ON_COMMENT_SEEN); U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); - const char *fptr = STD_PAT_MODS; /*"msixn"*/ + const char *fptr = STD_PAT_MODS; /*"msixxn"*/ char *p; /* We output all the necessary flags; we never output a minus, as all @@ -10428,18 +10430,23 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) } flagsp = &negflags; wastedflags = 0; /* reset so (?g-c) warns twice */ + x_mod_count = 0; break; case ':': case ')': + + if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) { + negflags |= RXf_PMf_EXTENDED_MORE; + } RExC_flags |= posflags; + + if (negflags & RXf_PMf_EXTENDED) { + negflags |= RXf_PMf_EXTENDED_MORE; + } RExC_flags &= ~negflags; set_regex_charset(&RExC_flags, cs); - if (UNLIKELY((x_mod_count) > 1)) { - vFAIL("Only one /x regex modifier is allowed"); - } return; - /*NOTREACHED*/ default: fail_modifiers: RExC_parse += SKIP_IF_CHAR(RExC_parse); @@ -12138,7 +12145,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, /* The values are Unicode, and therefore not subject to recoding, but * have to be converted to native on a non-Unicode (meaning non-ASCII) * platform. */ - RExC_override_recoding = 1; #ifdef EBCDIC RExC_recode_x_to_native = 1; #endif @@ -12159,7 +12165,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, RExC_start = RExC_adjusted_start = save_start; RExC_parse = endbrace; RExC_end = orig_end; - RExC_override_recoding = 0; #ifdef EBCDIC RExC_recode_x_to_native = 0; #endif @@ -15792,8 +15797,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, character; used under /i */ UV n; char * stop_ptr = RExC_end; /* where to stop parsing */ - const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white - space? */ + + /* ignore unescaped whitespace? */ + const bool skip_white = cBOOL( ret_invlist + || (RExC_flags & RXf_PMf_EXTENDED_MORE)); /* Unicode properties are stored in a swash; this holds the current one * being parsed. If this swash is the only above-latin1 component of the @@ -17008,7 +17015,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_adjusted_start = RExC_start + prefix_end; RExC_end = RExC_parse + len; RExC_in_multi_char_class = 1; - RExC_override_recoding = 1; RExC_emit = (regnode *)orig_emit; ret = reg(pRExC_state, 1, ®_flags, depth+1); @@ -17021,7 +17027,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_precomp_adj = 0; RExC_end = save_end; RExC_in_multi_char_class = 0; - RExC_override_recoding = 0; SvREFCNT_dec_NN(multi_char_matches); return ret; } diff --git a/regexp.h b/regexp.h index 7351afdc76..08b4fc32a8 100644 --- a/regexp.h +++ b/regexp.h @@ -278,18 +278,26 @@ and check for NULL. #include "op_reg_common.h" -#define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_NOCAPTURE) +#define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_NOCAPTURE) #define CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, x_count) \ case IGNORE_PAT_MOD: *(pmfl) |= RXf_PMf_FOLD; break; \ case MULTILINE_PAT_MOD: *(pmfl) |= RXf_PMf_MULTILINE; break; \ case SINGLE_PAT_MOD: *(pmfl) |= RXf_PMf_SINGLELINE; break; \ - case XTENDED_PAT_MOD: *(pmfl) |= RXf_PMf_EXTENDED; (x_count)++; break;\ + case XTENDED_PAT_MOD: if (x_count == 0) { \ + *(pmfl) |= RXf_PMf_EXTENDED; \ + *(pmfl) &= ~RXf_PMf_EXTENDED_MORE; \ + } \ + else { \ + *(pmfl) |= RXf_PMf_EXTENDED \ + |RXf_PMf_EXTENDED_MORE; \ + } \ + (x_count)++; break; \ case NOCAPTURE_PAT_MOD: *(pmfl) |= RXf_PMf_NOCAPTURE; break; /* Note, includes charset ones, assumes 0 is the default for them */ #define STD_PMMOD_FLAGS_CLEAR(pmfl) \ - *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET|RXf_PMf_NOCAPTURE) + *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_CHARSET|RXf_PMf_NOCAPTURE) /* chars and strings used as regex pattern modifiers * Singular is a 'c'har, plural is a "string" diff --git a/t/re/keep_tabs.t b/t/re/keep_tabs.t new file mode 100644 index 0000000000..ec986c483f --- /dev/null +++ b/t/re/keep_tabs.t @@ -0,0 +1,29 @@ +# This file contains tests where \t characters should not be expanded into +# spaces. + +BEGIN { + chdir 't' if -d 't'; + require './test.pl'; +} + +{ + like("\t", qr/[a b]/x, '\t not ignored under /x'); + unlike("\t", qr/[a b]/xx, '\t ignored under /xx'); + like("a", qr/[a b]/xx, '"a" matches qr/[a b]/xx'); + like("b", qr/[a b]/xx, '"b" matches qr/[a b]/xx'); + like("\t", qr/[a\ b]/xx, '"\t" matches qr/[a\ b]/xx'); + like("a", qr/[a\ b]/xx, '"a" matches qr/[a\ b]/xx'); + like("b", qr/[a\ b]/xx, '"b" matches qr/[a\ b]/xx'); + + like("\t", qr/(?x:[a b])/, '\t not ignored under /x'); + unlike("\t", qr/(?xx:[a b])/, '\t ignored under /xx'); + like("a", qr/(?xx:[a b])/, '"a" matches qr/(?xx:[a b])/'); + like("b", qr/(?xx:[a b])/, '"b" matches qr/(?xx:[a b])/'); + like("\t", qr/(?xx:[a\ b])/, '"\t" matches qr/(?xx:[a\ b])/'); + like("a", qr/(?xx:[a\ b])/, '"a" matches qr/(?xx:[a\ b])/'); + like("b", qr/(?xx:[a\ b])/, '"b" matches qr/(?xx:[a\ b])/'); +} + +done_testing; + +# ex softtabstop=0 noexpandtab diff --git a/t/re/re_tests b/t/re/re_tests index e8a7fa9f34..2653b94ac1 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1977,5 +1977,17 @@ AB\s+\x{100} AB \x{100}X y - - (^(?:(\d)x)?\d$) 1 y [$1-$2] [1-] # make sure that we reset capture buffers properly (from regtry) (X{2,}[-X]{1,4}){3,}X{2,} XXX-XXX-XXX-- n - - # [perl #130307] +/[a b]/x \N{SPACE} yS $& # Note a space char here +/[a b]/xx \N{SPACE} n - - +/[a\ b]/xx \N{SPACE} y $& # Note a space char here +/[ ^ a b ]/xx a n - - +/[ ^ a b ]/xx b n - - +/[ ^ a b ]/xx A y $& A +/(?x:[a b])/xx \N{SPACE} yS $& # Note a space char here +/(?xx:[a b])/x \N{SPACE} n - - +/(?x)[a b]/xx \N{SPACE} yS $& # Note a space char here +/(?xx)[a b]/x \N{SPACE} n - - +/(?-x:[a b])/xx \N{SPACE} yS $& # Note a space char here + # Keep these lines at the end of the file # vim: softtabstop=0 noexpandtab diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index 52bec7a473..7aa430ebd6 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -274,10 +274,10 @@ my @death = '/\A{/' => 'Unescaped left brace in regex is illegal here {#} m/\A{{#}/', '/:{4,a}/' => 'Unescaped left brace in regex is illegal here {#} m/:{{#}4,a}/', '/xa{3\,4}y/' => 'Unescaped left brace in regex is illegal here {#} m/xa{{#}3\,4}y/', - '/abc/xix' => 'Only one /x regex modifier is allowed', - '/(?xmsixp:abc)/' => 'Only one /x regex modifier is allowed {#} m/(?xmsixp{#}:abc)/', - '/(?xmsixp)abc/' => 'Only one /x regex modifier is allowed {#} m/(?xmsixp{#})abc/', - '/(?xxxx:abc)/' => 'Only one /x regex modifier is allowed {#} m/(?xxxx{#}:abc)/', + '/abc/xix' => "", + '/(?xmsixp:abc)/' => "", + '/(?xmsixp)abc/' => "", + '/(?xxxx:abc)/' => "", '/(?<=/' => 'Sequence (?... not terminated {#} m/(?<={#}/', # [perl #128170] ); diff --git a/toke.c b/toke.c index e6dad0a21e..3b36404392 100644 --- a/toke.c +++ b/toke.c @@ -9508,10 +9508,6 @@ S_scan_pat(pTHX_ char *start, I32 type) "Use of /c modifier is meaningless without /g" ); } - if (UNLIKELY((x_mod_count) > 1)) { - yyerror("Only one /x regex modifier is allowed"); - } - PL_lex_op = (OP*)pm; pl_yylval.ival = OP_MATCH; return s; @@ -9566,10 +9562,6 @@ S_scan_subst(pTHX_ char *start) } } - if (UNLIKELY((x_mod_count) > 1)) { - yyerror("Only one /x regex modifier is allowed"); - } - if ((pm->op_pmflags & PMf_CONTINUE)) { Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" ); } -- Perl5 Master Repository