[perl.git] branch blead, updated. v5.25.8-202-g77c8f26370

Karl Williamson Fri, 13 Jan 2017 10:45:06 -0800

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/77c8f26370dcc0e16ca7c5f1b8f3bd1b99a57a28?hp=afa4768ac90fcd6a53a3661885a238d344a02f92>


- Log -----------------------------------------------------------------
commit 77c8f26370dcc0e16ca7c5f1b8f3bd1b99a57a28
Author: Karl Williamson <k...@cpan.org>
Date:   Thu Jan 12 11:07:47 2017 -0700

    Add /xx regex pattern modifier
    
    This was first proposed in the thread starting at
    http://www.nntp.perl.org/group/perl.perl5.porters/2014/09/msg219394.html

M       MANIFEST
M       ext/re/re.pm
M       ext/re/t/reflags.t
M       pod/perlcheat.pod
M       pod/perldelta.pod
M       pod/perldiag.pod
M       pod/perlop.pod
M       pod/perlre.pod
M       pod/perlrecharclass.pod
M       pod/perlretut.pod
M       pod/perlstyle.pod
M       pod/perluniintro.pod
M       regcomp.c
M       regexp.h
A       t/re/keep_tabs.t
M       t/re/re_tests
M       t/re/reg_mesg.t
M       toke.c

commit 2ab076704905c338cc874079818784698cd5bc85
Author: Karl Williamson <k...@cpan.org>
Date:   Fri Jan 13 11:17:25 2017 -0700

    perlre: Clarifications, typos

M       pod/perlre.pod

commit 563642b4907d9b1b6beaa96b472ae787ae81d56f
Author: Karl Williamson <k...@cpan.org>
Date:   Tue Jan 10 19:10:58 2017 -0700

    perlretut: Add some white space for legibility

M       pod/perlretut.pod

commit fc54a9b2090b5f71905241c319706e3cca18acc9
Author: Karl Williamson <k...@cpan.org>
Date:   Wed Jan 11 22:18:53 2017 -0700

    regcomp.c: Remove obsolete data structure element
    
    This was used for the removed feature of having the source in a
    different encoding.

M       regcomp.c

commit af4226f2469589e84d382a08a6a5242303758f0b
Author: Karl Williamson <k...@cpan.org>
Date:   Thu Jan 12 21:05:35 2017 -0700

    pp_sys.c: White space only
    
    Vertically align some components of an 'if' for readability.

M       pp_sys.c
-----------------------------------------------------------------------

Summary of changes:
 MANIFEST                |  1 +
 ext/re/re.pm            | 23 ++++++++++-----
 ext/re/t/reflags.t      | 17 ++++++++---
 pod/perlcheat.pod       |  2 +-
 pod/perldelta.pod       |  9 +++++-
 pod/perldiag.pod        |  8 -----
 pod/perlop.pod          |  4 ++-
 pod/perlre.pod          | 77 ++++++++++++++++++++++++++++++++++++++++++++-----
 pod/perlrecharclass.pod | 38 ++++++++++++++++--------
 pod/perlretut.pod       | 30 +++++++++++++++----
 pod/perlstyle.pod       |  5 ++--
 pod/perluniintro.pod    |  2 +-
 pp_sys.c                | 10 +++----
 regcomp.c               | 33 ++++++++++++---------
 regexp.h                | 14 +++++++--
 t/re/keep_tabs.t        | 29 +++++++++++++++++++
 t/re/re_tests           | 12 ++++++++
 t/re/reg_mesg.t         |  8 ++---
 toke.c                  |  8 -----
 19 files changed, 245 insertions(+), 85 deletions(-)
 create mode 100644 t/re/keep_tabs.t

diff --git a/MANIFEST b/MANIFEST
index d31ee1a8ed..4745b04ede 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -5726,6 +5726,7 @@ t/porting/utils.t         Check that utility scripts 
still compile
 t/re/anyof.t                   See if bracketed char classes [...] compile 
properly
 t/re/charset.t                 See if regex modifiers like /d, /u work properly
 t/re/fold_grind.t              See if case folding works properly
+t/re/keep_tabs.t               Tests where \t can't be expanded.
 t/re/no_utf8_pm.t              Verify utf8.pm doesn't get loaded unless 
required
 t/re/overload.t                Test against string corruption in pattern 
matches on overloaded objects
 t/re/pat.t                     See if esoteric patterns work
diff --git a/ext/re/re.pm b/ext/re/re.pm
index b924fd9fc7..123408c76f 100644
--- a/ext/re/re.pm
+++ b/ext/re/re.pm
@@ -4,7 +4,7 @@ package re;
 use strict;
 use warnings;
 
-our $VERSION     = "0.33";
+our $VERSION     = "0.34";
 our @ISA         = qw(Exporter);
 our @EXPORT_OK   = ('regmust',
                     qw(is_regexp regexp_pattern
@@ -23,6 +23,7 @@ my %reflags = (
     s => 1 << ($PMMOD_SHIFT + 1),
     i => 1 << ($PMMOD_SHIFT + 2),
     x => 1 << ($PMMOD_SHIFT + 3),
+   xx => 1 << ($PMMOD_SHIFT + 4),
     n => 1 << ($PMMOD_SHIFT + 5),
     p => 1 << ($PMMOD_SHIFT + 6),
     strict => 1 << ($PMMOD_SHIFT + 10),
@@ -112,7 +113,6 @@ sub bits {
     my $on = shift;
     my $bits = 0;
     my $turning_all_off = ! @_ && ! $on;
-    my %seen;   # Has flag already been seen?
     if ($turning_all_off) {
 
         # Pretend were called with certain parameters, which are best dealt
@@ -180,6 +180,7 @@ sub bits {
        } elsif ($s =~ s/^\///) {
            my $reflags = $^H{reflags} || 0;
            my $seen_charset;
+            my $x_count = 0;
            while ($s =~ m/( . )/gx) {
                 local $_ = $1;
                if (/[adul]/) {
@@ -225,7 +226,19 @@ sub bits {
                                         && $^H{reflags_charset} == 
$reflags{$_};
                    }
                } elsif (exists $reflags{$_}) {
-                    $seen{$_}++;
+                    if ($_ eq 'x') {
+                        $x_count++;
+                        if ($x_count > 2) {
+                           require Carp;
+                            Carp::carp(
+                            qq 'The "x" flag may only appear a maximum of 
twice'
+                            );
+                        }
+                        elsif ($x_count == 2) {
+                            $_ = 'xx';  # First time through got the /x
+                        }
+                    }
+
                     $on
                      ? $reflags |= $reflags{$_}
                      : ($reflags &= ~$reflags{$_});
@@ -247,10 +260,6 @@ sub bits {
                        ")");
        }
     }
-    if (exists $seen{'x'} && $seen{'x'} > 1) {
-        require Carp;
-        Carp::croak("Only one /x regex modifier is allowed");
-    }
 
     if ($turning_all_off) {
         _load_unload(0);
diff --git a/ext/re/t/reflags.t b/ext/re/t/reflags.t
index a481c98799..595b4b28b4 100644
--- a/ext/re/t/reflags.t
+++ b/ext/re/t/reflags.t
@@ -11,7 +11,7 @@ BEGIN {
 
 use strict;
 
-use Test::More tests => 67;
+use Test::More tests => 74;
 
 my @flags = qw( a d l u );
 
@@ -24,10 +24,19 @@ ok "Foo" !~ /(??{'foo'})/, 'no re "/i" (??{})';
 use re '/x';
 ok "foo" =~ / foo /, 'use re "/x"';
 ok "foo" =~ / (??{' foo '}) /, 'use re "/x" (??{})';
+like " ", qr/[a b]/, 'use re "/x" [a b]';
 no re '/x';
 ok "foo" !~ / foo /, 'no re "/x"';
 ok "foo" !~ /(??{' foo '})/, 'no re "/x" (??{})';
 ok "foo" !~ / (??{'foo'}) /, 'no re "/x" (??{})';
+use re '/xx';
+ok "foo" =~ / foo /, 'use re "/xx"';
+ok "foo" =~ / (??{' foo '}) /, 'use re "/xx" (??{})';
+unlike " ", qr/[a b]/, 'use re "/xx" [a b] # Space in [] gobbled up';
+no re '/xx';
+ok "foo" !~ / foo /, 'no re "/xx"';
+ok "foo" !~ /(??{' foo '})/, 'no re "/xx" (??{})';
+ok "foo" !~ / (??{'foo'}) /, 'no re "/xx" (??{})';
 use re '/s';
 ok "\n" =~ /./, 'use re "/s"';
 ok "\n" =~ /(??{'.'})/, 'use re "/s" (??{})';
@@ -178,8 +187,8 @@ is qr//, '(?^:)', 'no re "/aai"';
     "warning with eval \"use re \"/amaa\"";
 
   $w = "";
-  eval "use re '/xamax'";
-  like $@, qr/Only one \/x regex modifier is allowed/,
-    "error with eval \"use re \"/xamax\"";
+  eval "use re '/xamaxx'";
+  like $w, qr/The "x" flag may only appear a maximum of twice/,
+    "warning with eval \"use re \"/xamaxx\"";
 
 }
diff --git a/pod/perlcheat.pod b/pod/perlcheat.pod
index 6e4e919ff5..99a8dfc547 100644
--- a/pod/perlcheat.pod
+++ b/pod/perlcheat.pod
@@ -41,7 +41,7 @@ already be overwhelming.
   &&              /i case insensitive   ^      string begin
   || //           /m line based ^$      $      str end (bfr \n)
   .. ...          /s . includes \n      +      one or more
-  ?:              /x ignore wh.space    *      zero or more
+  ?:              /x /xx ign. wh.space  *      zero or more
   = += last goto  /p preserve           ?      zero or one
   , =>            /a ASCII    /aa safe  {3,7}  repeat in range
   list ops        /l locale   /d  dual  |      alternation
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 460d118a1d..86b7e9a00f 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -25,7 +25,14 @@ XXX New core language features go here.  Summarize 
user-visible core language
 enhancements.  Particularly prominent performance optimisations could go
 here, but most should go in the L</Performance Enhancements> section.
 
-[ List each enhancement as a =head2 entry ]
+=head2 New regular expression modifier C</xx>
+
+Specifying two C<x> characters to modify a regular expression pattern
+does everything that a single one does, but additionally TAB and SPACE
+characters within a bracketed character class are generally ignored and
+can be added to improve readability, like
+S<C</[ ^ A-Z d-f p-x ]/xx>>.  Details are at
+L<perlre/E<sol>x and E<sol>xx>.
 
 =head1 Security
 
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index fe5ff9bcc4..7d6675c1dc 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -4230,14 +4230,6 @@ C<sysread()>ing a file, or when seeking past the end of 
a scalar opened
 for I/O (in anticipation of future reads and to imitate the behavior
 with real files).
 
-=item Only one /x regex modifier is allowed
-
-=item Only one /x regex modifier is allowed in regex; marked by <-- HERE in 
m/%s/
-
-(F) You used the C</x> regular expression pattern modifier at least twice in a
-string of modifiers.  This has been made illegal, in order to allow future
-extensions to the Perl language.
-
 =item %s() on unopened %s
 
 (W unopened) An I/O operation was attempted on a filehandle that was
diff --git a/pod/perlop.pod b/pod/perlop.pod
index 82dca55d52..3cf9db67e6 100644
--- a/pod/perlop.pod
+++ b/pod/perlop.pod
@@ -1743,7 +1743,9 @@ Options (specified by the following modifiers) are:
     m  Treat string as multiple lines.
     s  Treat string as single line. (Make . match a newline)
     i  Do case-insensitive pattern matching.
-    x  Use extended regular expressions.
+    x   Use extended regular expressions; specifying two
+        x's means \t and the SPACE character are ignored within
+        square-bracketed character classes
     p  When matching preserve a copy of the matched string so
         that ${^PREMATCH}, ${^MATCH}, ${^POSTMATCH} will be
         defined (ignored starting in v5.20) as these are always
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 10783a30b8..e3fc62d305 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -86,11 +86,11 @@ inverted, which otherwise could be highly confusing.  See
 L<perlrecharclass/Bracketed Character Classes>, and
 L<perlrecharclass/Negation>.
 
-=item B<C<x>>
+=item B<C<x>> and B<C<xx>>
 X</x>
 
 Extend your pattern's legibility by permitting whitespace and comments.
-Details in L</"/x">
+Details in L</E<sol>x and  E<sol>xx>
 
 =item B<C<p>>
 X</p> X<regex, preserve> X<regexp, preserve>
@@ -144,7 +144,6 @@ L<perlretut/"Using regular expressions in Perl"> are:
   g  - globally match the pattern repeatedly in the string
 
 Substitution-specific modifiers described in
-
 L<perlop/"s/PATTERN/REPLACEMENT/msixpodualngcer"> are:
 
   e  - evaluate the right-hand side as an expression
@@ -165,12 +164,12 @@ the C<(?...)> construct, see L</Extended Patterns> below.
 Some of the modifiers require more explanation than given in the
 L</Overview> above.
 
-=head4 /x
+=head4 C</x> and  C</xx>
 
-C</x> tells
+A single C</x> tells
 the regular expression parser to ignore most whitespace that is neither
 backslashed nor within a bracketed character class.  You can use this to
-break up your regular expression into (slightly) more readable parts.
+break up your regular expression into more readable parts.
 Also, the C<"#"> character is treated as a metacharacter introducing a
 comment that runs up to the pattern's closing delimiter, or to the end
 of the current line if the pattern extends onto the next line.  Hence,
@@ -190,6 +189,24 @@ You can use L</(?#text)> to create a comment that ends 
earlier than the
 end of the current line, but C<text> also can't contain the closing
 delimiter unless escaped with a backslash.
 
+A common pitfall is to forget that C<#> characters begin a comment under
+C</x> and are not matched literally.  Just keep that in mind when trying
+to puzzle out why a particular C</x> pattern isn't working as expected.
+
+Starting in Perl v5.26, if the modifier has a second C<x> within it,
+it does everything that a single C</x> does, but additionally
+non-backslashed SPACE and TAB characters within bracketed character
+classes are also generally ignored, and hence can be added to make the
+classes more readable.
+
+    / [d-e g-i 3-7]/xx
+    /[ ! @ " # $ % ^ & * () = ? <> ' ]/xx
+
+may be easier to grasp than the squashed equivalents
+
+    /[d-eg-i3-7]/
+    /[!@"#$%^&*()=?<>']/
+
 Taken together, these features go a long way towards
 making Perl's regular expressions more readable.  Here's an example:
 
@@ -554,7 +571,6 @@ meanings:
 X<metacharacter>
 X<\> X<^> X<.> X<$> X<|> X<(> X<()> X<[> X<[]>
 
-
     \        Quote the next metacharacter
     ^        Match the beginning of the line
     .        Match any character (except newline)
@@ -1075,13 +1091,30 @@ a backslash if it appears in the comment.
 
 See L</E<sol>x> for another way to have comments in patterns.
 
+Note that a comment can go just about anywhere, except in the middle of
+an escape sequence.   Examples:
+
+ qr/foo(?#comment)bar/'  # Matches 'foobar'
+
+ # The pattern below matches 'abcd', 'abccd', or 'abcccd'
+ qr/abc(?#comment between literal and its quantifier){1,3}d/
+
+ # The pattern below generates a syntax error, because the '\p' must
+ # be followed immediately by a '{'.
+ qr/\p(?#comment between \p and its property name){Any}/
+
+ # The pattern below generates a syntax error, because the initial
+ # '\(' is a literal opening parenthesis, and so there is nothing
+ # for the  closing ')' to match
+ qr/\(?#the backslash means this isn't a comment)p{Any}/
+
 =item C<(?adlupimnsx-imnsx)>
 
 =item C<(?^alupimnsx)>
 X<(?)> X<(?^)>
 
 One or more embedded pattern-match modifiers, to be turned on (or
-turned off, if preceded by C<"-">) for the remainder of the pattern or
+turned off if preceded by C<"-">) for the remainder of the pattern or
 the remainder of the enclosing pattern group (if any).
 
 This is particularly useful for dynamically-generated patterns,
@@ -1111,6 +1144,29 @@ These modifiers do not carry over into named subpatterns 
called in the
 enclosing group. In other words, a pattern such as C<((?i)(?&NAME))> does not
 change the case-sensitivity of the C<"NAME"> pattern.
 
+A modifier is overridden by later occurrences of this construct in the
+same scope containing the same modifier, so that
+
+    /((?im)foo(?-m)bar)/
+
+matches all of C<foobar> case insensitively, but uses C</m> rules for
+only the C<foo> portion.  The C<a> flag overrides C<aa> as well;
+likewise C<aa> overrides C<a>.  The same goes for C<x> and C<xx>.
+Hence, in
+
+    /(?-x)foo/xx
+
+both C</x> and C</xx> are turned off during matching C<foo>.  And in
+
+    /(?x)foo/x
+
+C</x> but NOT C</xx> is turned on for matching C<foo>.  (One might
+mistakenly think that since the inner C<(?x)> is already in the scope of
+C</x>, that the result would effectively be the sum of them, yielding
+C</xx>.  It doesn't work that way.)  Similarly, doing something like
+C<(?xx-x)foo> turns off all C<x> behavior for matching C<foo>, it is not
+that you subtract 1 C<x> from 2 to get 1 C<x> remaining.
+
 Any of these modifiers can be set to apply globally to all regular
 expressions compiled within the scope of a C<use re>.  See
 L<re/"'/flags' mode">.
@@ -1165,6 +1221,11 @@ is equivalent to the more verbose
 Note that any C<()> constructs enclosed within this one will still
 capture unless the C</n> modifier is in effect.
 
+Like the L</(?adlupimnsx-imnsx)> construct, C<aa> and C<a> override each
+other, as do C<xx> and C<x>.  They are not additive.  So, doing
+something like C<(?xx-x:foo)> turns off all C<x> behavior for matching
+C<foo>.
+
 Starting in Perl 5.14, a C<"^"> (caret or circumflex accent) immediately
 after the C<"?"> is a shorthand equivalent to C<d-imnsx>.  Any positive
 flags (except C<"d">) may follow the caret, so
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index 93bb2e5e63..1c07632dec 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -576,6 +576,29 @@ Examples:
                       #  containing just [, and the character class is
                       #  followed by a ].
 
+=head3 Bracketed Character Classes and the C</xx> pattern modifier
+
+Normally SPACE and TAB characters have no special meaning inside a
+bracketed character class; they are just added to the list of characters
+matched by the class.  But if the L<C</xx>|perlre/E<sol>x and E<sol>xx>
+pattern modifier is in effect, they are generally ignored and can be
+added to improve readability.  They can't be added in the middle of a
+single construct:
+
+ / [ \x{10 FFFF} ] /xx  # WRONG!
+
+The SPACE in the middle of the hex constant is illegal.
+
+To specify a literal SPACE character, you can escape it with a
+backslash, like:
+
+ /[ a e i o u \  ]/xx
+
+This matches the English vowels plus the SPACE character.
+
+For clarity, you should already have been using C<\t> to specify a
+literal tab, and C<\t> is unaffected by C</xx>.
+
 =head3 Character Ranges
 
 It is not uncommon to want to match a range of characters. Luckily, instead
@@ -1016,7 +1039,7 @@ We can extend the example above:
 This matches digits that are in either the Thai or Laotian scripts.
 
 Notice the white space in these examples.  This construct always has
-the C<E<sol>x> modifier turned on within it.
+the C<E<sol>xx> modifier turned on within it.
 
 The available binary operators are:
 
@@ -1061,18 +1084,9 @@ C<\N{...}>, etc.)
 
 This last example shows the use of this construct to specify an ordinary
 bracketed character class without additional set operations.  Note the
-white space within it; a limited version of C<E<sol>x> is turned on even
-within bracketed character classes, with only the SPACE and TAB (C<\t>)
-characters allowed, and no comments.  Hence,
-
- (?[ [#] ])
+white space within it.  This is allowed because C<E<sol>xx> is
+automatically turned on within this construct.
 
-matches the literal character "#".  To specify a literal white space character,
-you can escape it with a backslash, like:
-
- /(?[ [ a e i o u \  ] ])/
-
-This matches the English vowels plus the SPACE character.
 All the other escapes accepted by normal bracketed character classes are
 accepted here as well; but unrecognized escapes that generate warnings
 in normal classes are fatal errors here.
diff --git a/pod/perlretut.pod b/pod/perlretut.pod
index d74276c91d..9c7ab56042 100644
--- a/pod/perlretut.pod
+++ b/pod/perlretut.pod
@@ -1479,7 +1479,7 @@ we can rewrite our 'extended' regexp in the more pleasing 
form
          |\.\d+     # mantissa of the form .b
          |\d+       # integer of the form a
       )
-      ([eE][+-]?\d+)?  # finally, optionally match an exponent
+      ( [eE] [+-]? \d+ )?  # finally, optionally match an exponent
    $/x;
 
 If whitespace is mostly irrelevant, how does one include space
@@ -1497,7 +1497,7 @@ this to our regexp as follows:
          |\.\d+     # mantissa of the form .b
          |\d+       # integer of the form a
       )
-      ([eE][+-]?\d+)?  # finally, optionally match an exponent
+      ( [eE] [+-]? \d+ )?  # finally, optionally match an exponent
    $/x;
 
 In this form, it is easier to see a way to simplify the
@@ -1513,10 +1513,28 @@ could be factored out:
           )?        # ? takes care of integers of the form a
          |\.\d+     # mantissa of the form .b
       )
-      ([eE][+-]?\d+)?  # finally, optionally match an exponent
+      ( [eE] [+-]? \d+ )?  # finally, optionally match an exponent
    $/x;
 
-or written in the compact form,
+Starting in Perl v5.26, specifying C</xx> changes the square-bracketed
+portions of a pattern to ignore tabs and space characters unless they
+are escaped by preceding them with a backslash.  So, we could write
+
+   /^
+      [ + - ]?\ *   # first, match an optional sign
+      (             # then match integers or f.p. mantissas:
+          \d+       # start out with a ...
+          (
+              \.\d* # mantissa of the form a.b or a.
+          )?        # ? takes care of integers of the form a
+         |\.\d+     # mantissa of the form .b
+      )
+      ( [ e E ] [ + - ]? \d+ )?  # finally, optionally match an exponent
+   $/xx;
+
+This doesn't really improve the legibility of this example, but it's
+available in case you want it.  Squashing the pattern down to the
+compact form, we have
 
     /^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/;
 
@@ -2379,7 +2397,7 @@ enclosed in parentheses up to two levels deep.  Then the 
following
 regexp matches:
 
     $x = "abc(de(fg)h";  # unbalanced parentheses
-    $x =~ /\( ( [^()]+ | \([^()]*\) )+ \)/x;
+    $x =~ /\( ( [ ^ () ]+ | \( [ ^ () ]* \) )+ \)/xx;
 
 The regexp matches an open parenthesis, one or more copies of an
 alternation, and a close parenthesis.  The alternation is two-way, with
@@ -2393,7 +2411,7 @@ was no match possible.  To prevent the exponential 
blowup, we need to
 prevent useless backtracking at some point.  This can be done by
 enclosing the inner quantifier as an independent subexpression:
 
-    $x =~ /\( ( (?>[^()]+) | \([^()]*\) )+ \)/x;
+    $x =~ /\( ( (?> [ ^ () ]+ ) | \([ ^ () ]* \) )+ \)/xx;
 
 Here, C<< (?>[^()]+) >> breaks the degeneracy of string partitioning
 by gobbling up as much of the string as possible and keeping it.   Then
diff --git a/pod/perlstyle.pod b/pod/perlstyle.pod
index 37dfaaf141..5c2534581e 100644
--- a/pod/perlstyle.pod
+++ b/pod/perlstyle.pod
@@ -210,8 +210,9 @@ function should not be used outside the package that 
defined it.
 
 =item *
 
-If you have a really hairy regular expression, use the C</x> modifier and
-put in some whitespace to make it look a little less like line noise.
+If you have a really hairy regular expression, use the C</x>  or C</xx>
+modifiers and put in some whitespace to make it look a little less like
+line noise.
 Don't use slash as a delimiter when your regexp has slashes or backslashes.
 
 =item *
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
index 5b571fbbc1..ef4d07d1d6 100644
--- a/pod/perluniintro.pod
+++ b/pod/perluniintro.pod
@@ -645,7 +645,7 @@ Starting in v5.22, you can use Unicode code points as the 
end points of
 regular expression pattern character ranges, and the range will include
 all Unicode code points that lie between those end points, inclusive.
 
- qr/ [\N{U+03]-\N{U+20}] /x
+ qr/ [ \N{U+03} - \N{U+20} ] /xx
 
 includes the code points
 C<\N{U+03}>, C<\N{U+04}>, ..., C<\N{U+20}>.
diff --git a/pp_sys.c b/pp_sys.c
index c0ef29812e..11193bcbc6 100644
--- a/pp_sys.c
+++ b/pp_sys.c
@@ -3598,14 +3598,14 @@ PP(pp_fttext)
         }
         else
 #endif
-        if (isPRINT_A(*s)
-                   /* VT occurs so rarely in text, that we consider it odd */
-                || (isSPACE_A(*s) && *s != VT_NATIVE)
+             if (  isPRINT_A(*s)
+                    /* VT occurs so rarely in text, that we consider it odd */
+                 || (isSPACE_A(*s) && *s != VT_NATIVE)
 
                     /* But there is a fair amount of backspaces and escapes in
                      * some text */
-                || *s == '\b'
-                || *s == ESC_NATIVE)
+                 || *s == '\b'
+                 || *s == ESC_NATIVE)
         {
             continue;
         }
diff --git a/regcomp.c b/regcomp.c
index 64d8907da0..2114773357 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -271,7 +271,6 @@ struct RExC_state_t {
                                    (pRExC_state->study_chunk_recursed_bytes)
 #define RExC_in_lookbehind     (pRExC_state->in_lookbehind)
 #define RExC_contains_locale   (pRExC_state->contains_locale)
-#define RExC_override_recoding (pRExC_state->override_recoding)
 #ifdef EBCDIC
 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 #endif
@@ -6517,8 +6516,12 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const 
pRExC_state,
            *p++ = pat[s];
        }
        *p++ = '\'';
-       if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
+       if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
            *p++ = 'x';
+            if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
+                *p++ = 'x';
+            }
+        }
        *p++ = '\0';
        DEBUG_COMPILE_r({
             Perl_re_printf( aTHX_
@@ -7006,7 +7009,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int 
pat_count,
     RExC_in_lookbehind = 0;
     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
     RExC_extralen = 0;
-    RExC_override_recoding = 0;
 #ifdef EBCDIC
     RExC_recode_x_to_native = 0;
 #endif
@@ -7164,7 +7166,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int 
pat_count,
                                                    == REG_RUN_ON_COMMENT_SEEN);
        U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
                            >> RXf_PMf_STD_PMMOD_SHIFT);
-       const char *fptr = STD_PAT_MODS;        /*"msixn"*/
+       const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
        char *p;
 
         /* We output all the necessary flags; we never output a minus, as all
@@ -10428,18 +10430,23 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t 
*pRExC_state)
                 }
                 flagsp = &negflags;
                 wastedflags = 0;  /* reset so (?g-c) warns twice */
+                x_mod_count = 0;
                 break;
             case ':':
             case ')':
+
+                if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == 
RXf_PMf_EXTENDED) {
+                    negflags |= RXf_PMf_EXTENDED_MORE;
+                }
                 RExC_flags |= posflags;
+
+                if (negflags & RXf_PMf_EXTENDED) {
+                    negflags |= RXf_PMf_EXTENDED_MORE;
+                }
                 RExC_flags &= ~negflags;
                 set_regex_charset(&RExC_flags, cs);
 
-                if (UNLIKELY((x_mod_count) > 1)) {
-                    vFAIL("Only one /x regex modifier is allowed");
-                }
                 return;
-                /*NOTREACHED*/
             default:
               fail_modifiers:
                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
@@ -12138,7 +12145,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         /* The values are Unicode, and therefore not subject to recoding, but
          * have to be converted to native on a non-Unicode (meaning non-ASCII)
          * platform. */
-       RExC_override_recoding = 1;
 #ifdef EBCDIC
         RExC_recode_x_to_native = 1;
 #endif
@@ -12159,7 +12165,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
        RExC_start = RExC_adjusted_start = save_start;
        RExC_parse = endbrace;
        RExC_end = orig_end;
-       RExC_override_recoding = 0;
 #ifdef EBCDIC
         RExC_recode_x_to_native = 0;
 #endif
@@ -15792,8 +15797,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
                                        character; used under /i */
     UV n;
     char * stop_ptr = RExC_end;    /* where to stop parsing */
-    const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
-                                                   space? */
+
+    /* ignore unescaped whitespace? */
+    const bool skip_white = cBOOL(   ret_invlist
+                                  || (RExC_flags & RXf_PMf_EXTENDED_MORE));
 
     /* Unicode properties are stored in a swash; this holds the current one
      * being parsed.  If this swash is the only above-latin1 component of the
@@ -17008,7 +17015,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
         RExC_adjusted_start = RExC_start + prefix_end;
        RExC_end = RExC_parse + len;
         RExC_in_multi_char_class = 1;
-       RExC_override_recoding = 1;
         RExC_emit = (regnode *)orig_emit;
 
        ret = reg(pRExC_state, 1, &reg_flags, depth+1);
@@ -17021,7 +17027,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
         RExC_precomp_adj = 0;
        RExC_end = save_end;
        RExC_in_multi_char_class = 0;
-       RExC_override_recoding = 0;
         SvREFCNT_dec_NN(multi_char_matches);
         return ret;
     }
diff --git a/regexp.h b/regexp.h
index 7351afdc76..08b4fc32a8 100644
--- a/regexp.h
+++ b/regexp.h
@@ -278,18 +278,26 @@ and check for NULL.
 
 #include "op_reg_common.h"
 
-#define RXf_PMf_STD_PMMOD      
(RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_NOCAPTURE)
+#define RXf_PMf_STD_PMMOD      
(RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_NOCAPTURE)
 
 #define CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, x_count)                       \
     case IGNORE_PAT_MOD:    *(pmfl) |= RXf_PMf_FOLD;       break;           \
     case MULTILINE_PAT_MOD: *(pmfl) |= RXf_PMf_MULTILINE;  break;           \
     case SINGLE_PAT_MOD:    *(pmfl) |= RXf_PMf_SINGLELINE; break;           \
-    case XTENDED_PAT_MOD:   *(pmfl) |= RXf_PMf_EXTENDED; (x_count)++; break;\
+    case XTENDED_PAT_MOD:   if (x_count == 0) {                             \
+                                *(pmfl) |= RXf_PMf_EXTENDED;                \
+                                *(pmfl) &= ~RXf_PMf_EXTENDED_MORE;          \
+                            }                                               \
+                            else {                                          \
+                                *(pmfl) |= RXf_PMf_EXTENDED                 \
+                                          |RXf_PMf_EXTENDED_MORE;           \
+                            }                                               \
+                            (x_count)++; break;                             \
     case NOCAPTURE_PAT_MOD: *(pmfl) |= RXf_PMf_NOCAPTURE; break;
 
 /* Note, includes charset ones, assumes 0 is the default for them */
 #define STD_PMMOD_FLAGS_CLEAR(pmfl)                        \
-    *(pmfl) &= 
~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET|RXf_PMf_NOCAPTURE)
+    *(pmfl) &= 
~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_CHARSET|RXf_PMf_NOCAPTURE)
 
 /* chars and strings used as regex pattern modifiers
  * Singular is a 'c'har, plural is a "string"
diff --git a/t/re/keep_tabs.t b/t/re/keep_tabs.t
new file mode 100644
index 0000000000..ec986c483f
--- /dev/null
+++ b/t/re/keep_tabs.t
@@ -0,0 +1,29 @@
+# This file contains tests where \t characters should not be expanded into
+# spaces.
+
+BEGIN {
+    chdir 't' if -d 't';
+    require './test.pl';
+}
+
+{
+      like("\t", qr/[a b]/x, '\t not ignored under /x');
+    unlike("\t", qr/[a b]/xx, '\t ignored under /xx');
+    like("a", qr/[a    b]/xx, '"a" matches qr/[a       b]/xx');
+    like("b", qr/[a    b]/xx, '"b" matches qr/[a       b]/xx');
+    like("\t", qr/[a\  b]/xx, '"\t" matches qr/[a\     b]/xx');
+    like("a", qr/[a\   b]/xx, '"a" matches qr/[a\      b]/xx');
+    like("b", qr/[a\   b]/xx, '"b" matches qr/[a\      b]/xx');
+
+      like("\t", qr/(?x:[a     b])/, '\t not ignored under /x');
+    unlike("\t", qr/(?xx:[a    b])/, '\t ignored under /xx');
+    like("a", qr/(?xx:[a       b])/, '"a" matches qr/(?xx:[a   b])/');
+    like("b", qr/(?xx:[a       b])/, '"b" matches qr/(?xx:[a   b])/');
+    like("\t", qr/(?xx:[a\     b])/, '"\t" matches qr/(?xx:[a\ b])/');
+    like("a", qr/(?xx:[a\      b])/, '"a" matches qr/(?xx:[a\  b])/');
+    like("b", qr/(?xx:[a\      b])/, '"b" matches qr/(?xx:[a\  b])/');
+}
+
+done_testing;
+
+# ex softtabstop=0 noexpandtab
diff --git a/t/re/re_tests b/t/re/re_tests
index e8a7fa9f34..2653b94ac1 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1977,5 +1977,17 @@ AB\s+\x{100}     AB \x{100}X     y       -       -
 (^(?:(\d)x)?\d$)       1       y       [$1-$2] [1-]            #  make sure 
that we reset capture buffers properly (from regtry)
 (X{2,}[-X]{1,4}){3,}X{2,}      XXX-XXX-XXX--   n       -       -       # [perl 
#130307]
 
+/[a b]/x       \N{SPACE}       yS      $&                      # Note a space 
char here
+/[a b]/xx      \N{SPACE}       n       -       -
+/[a\ b]/xx     \N{SPACE}       y       $&                      # Note a space 
char here
+/[ ^ a b ]/xx  a       n       -       -
+/[ ^ a b ]/xx  b       n       -       -
+/[ ^ a b ]/xx  A       y       $&      A
+/(?x:[a b])/xx \N{SPACE}       yS      $&              # Note a space char here
+/(?xx:[a b])/x \N{SPACE}       n       -       -
+/(?x)[a b]/xx  \N{SPACE}       yS      $&              # Note a space char here
+/(?xx)[a b]/x  \N{SPACE}       n       -       -
+/(?-x:[a b])/xx        \N{SPACE}       yS      $&              # Note a space 
char here
+
 # Keep these lines at the end of the file
 # vim: softtabstop=0 noexpandtab
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index 52bec7a473..7aa430ebd6 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -274,10 +274,10 @@ my @death =
  '/\A{/' => 'Unescaped left brace in regex is illegal here {#} m/\A{{#}/',
  '/:{4,a}/' => 'Unescaped left brace in regex is illegal here {#} 
m/:{{#}4,a}/',
  '/xa{3\,4}y/' => 'Unescaped left brace in regex is illegal here {#} 
m/xa{{#}3\,4}y/',
- '/abc/xix' => 'Only one /x regex modifier is allowed',
- '/(?xmsixp:abc)/' => 'Only one /x regex modifier is allowed {#} 
m/(?xmsixp{#}:abc)/',
- '/(?xmsixp)abc/' => 'Only one /x regex modifier is allowed {#} 
m/(?xmsixp{#})abc/',
- '/(?xxxx:abc)/' => 'Only one /x regex modifier is allowed {#} 
m/(?xxxx{#}:abc)/',
+ '/abc/xix' => "",
+ '/(?xmsixp:abc)/' => "",
+ '/(?xmsixp)abc/' => "",
+ '/(?xxxx:abc)/' => "",
  '/(?<=/' => 'Sequence (?... not terminated {#} m/(?<={#}/',                   
     # [perl #128170]
 
 );
diff --git a/toke.c b/toke.c
index e6dad0a21e..3b36404392 100644
--- a/toke.c
+++ b/toke.c
@@ -9508,10 +9508,6 @@ S_scan_pat(pTHX_ char *start, I32 type)
                       "Use of /c modifier is meaningless without /g" );
     }
 
-    if (UNLIKELY((x_mod_count) > 1)) {
-        yyerror("Only one /x regex modifier is allowed");
-    }
-
     PL_lex_op = (OP*)pm;
     pl_yylval.ival = OP_MATCH;
     return s;
@@ -9566,10 +9562,6 @@ S_scan_subst(pTHX_ char *start)
        }
     }
 
-    if (UNLIKELY((x_mod_count) > 1)) {
-        yyerror("Only one /x regex modifier is allowed");
-    }
-
     if ((pm->op_pmflags & PMf_CONTINUE)) {
         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is 
meaningless in s///" );
     }

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.25.8-202-g77c8f26370

Reply via email to