In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/8f71649941d02d5bdfe4fed5054c505b12dd2c0e?hp=1ca728708a2c9223bb840be4f99c58df50b3c648>
- Log ----------------------------------------------------------------- commit 8f71649941d02d5bdfe4fed5054c505b12dd2c0e Author: Karl Williamson <[email protected]> Date: Sun Feb 19 22:14:53 2017 -0700 handy.h: Guard controversial macro name This is so their use cannot spread easily until we have sorted things out in 5.27 M handy.h commit 2d5e9bac83412e103d6b07a688dd7aea80148bd8 Author: Karl Williamson <[email protected]> Date: Fri Feb 17 11:56:38 2017 -0700 perlretut: Note when metacharacters become ordinary M pod/perlretut.pod commit 9891e9b7d5c8747ecfb7ce7038187b087bd8c655 Author: Karl Williamson <[email protected]> Date: Wed Feb 15 21:22:59 2017 -0700 Revise documentation of eval and evalbytes M lib/feature.pm M pod/perlfunc.pod M regen/feature.pl commit 1a8fcdefd72ff92df9d60b5bd85529d817260b51 Author: Karl Williamson <[email protected]> Date: Tue Feb 14 16:59:49 2017 -0700 Clarify "User-visible changes" The pumpking agreed with this wording M Porting/release_schedule.pod commit 90400c5ccdc60c7f577dad31d06aa388ee118a28 Author: Karl Williamson <[email protected]> Date: Sun Feb 19 22:03:27 2017 -0700 Balance uniprops tests Commit 5656b1f654bb034c561558968ed3cf87a737b3e1 split the tests generated by mktables so that 10 separate files each execute 10% of the tests. But it turns out that some tests are much more involved than others, so that some of those 10 files still took much longer than average. This commit changes the split so that the amount of time each file takes is more balanced. It uses a natural breaking spot for the tests for the \b{} flavors, except that GCB and SB are each short (so are combined into being tested from one file), and LB is very long, so is split into 4 test groups. 
M charclass_invlists.h M lib/unicore/mktables M regcharclass.h commit ec2c235b8da47c613eb6c9cdac160311692ea63a Author: Karl Williamson <[email protected]> Date: Sun Feb 19 21:48:40 2017 -0700 Inline foldEQ, foldEQ_latin1, foldEQ_locale These short functions are called in inner loops and regex backtracking. M embed.fnc M inline.h M proto.h M util.c commit 4d2c9c8c6c9a82ad785b57b9e346e202f74a0c66 Author: Karl Williamson <[email protected]> Date: Sun Feb 19 21:43:40 2017 -0700 op.c: Add comment M op.c commit 7d4c055d01b7da928b947b1c377211e88f59396d Author: Karl Williamson <[email protected]> Date: Sun Feb 19 21:39:32 2017 -0700 perlrecharclass: Simplify by referring to other pod The (?[...] has 're strict' rules. Slightly reword to more directly refer to the documentation on that. M pod/perlrecharclass.pod ----------------------------------------------------------------------- Summary of changes: Porting/release_schedule.pod | 3 +- charclass_invlists.h | 2 +- embed.fnc | 6 +- handy.h | 5 + inline.h | 86 ++++++++++++ lib/feature.pm | 62 +++------ lib/unicore/mktables | 45 +++++-- op.c | 2 +- pod/perlfunc.pod | 307 +++++++++++++++++++++++++++---------------- pod/perlrecharclass.pod | 13 +- pod/perlretut.pod | 8 ++ proto.h | 15 +-- regcharclass.h | 2 +- regen/feature.pl | 62 +++------ util.c | 83 ------------ 15 files changed, 384 insertions(+), 317 deletions(-) diff --git a/Porting/release_schedule.pod b/Porting/release_schedule.pod index fe16124e3b..3bb606d52a 100644 --- a/Porting/release_schedule.pod +++ b/Porting/release_schedule.pod @@ -16,7 +16,8 @@ deemed necessary by the Pumpking. Code freezes (which happen in the 5.25.X series) 2016-12-20 5.25.8 Contentious changes freeze - 2017-01-20 5.25.9 User-visible changes freeze + 2017-01-20 5.25.9 User-visible changes to correctly + functioning programs freeze 2017-02-20 5.25.10 Full code freeze 2017-04-20 5.26.0 Stable release! 
diff --git a/charclass_invlists.h b/charclass_invlists.h index 732b6d0a8a..7b5b7eae1c 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -95407,7 +95407,7 @@ static const U8 WB_table[24][24] = { * 37f6186253da9824bdb27f4ad867bfe8c25d4dc6bdb2f05585e40a034675a348 lib/unicore/extracted/DLineBreak.txt * ef24061b5a5dc93d7e90c2e34530ec757180ee75d872cba65ffc946e52624ae8 lib/unicore/extracted/DNumType.txt * a197371fec9a1b517058b440841f60f9378d81682084eef8db22a88cb2f96e90 lib/unicore/extracted/DNumValues.txt - * 4bcfb4545be21663ca38a2acbfcbf2b0f3252652a34b50f1a56ef76cb959861b lib/unicore/mktables + * 79a7216aceb1d291f2857085545fdda289518bc540a09bc0a15cde105d76028d lib/unicore/mktables * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl * 9534d0cc3914fa1f5d574332c3199605c3d14f8691a0729d68d8498ac2b36280 regen/mk_invlists.pl diff --git a/embed.fnc b/embed.fnc index 3b645143e6..89986b4acb 100644 --- a/embed.fnc +++ b/embed.fnc @@ -679,9 +679,9 @@ ApbmM |SV** |hv_store_flags |NULLOK HV *hv|NULLOK const char *key \ Amd |void |hv_undef |NULLOK HV *hv poX |void |hv_undef_flags |NULLOK HV *hv|U32 flags AmP |I32 |ibcmp |NN const char* a|NN const char* b|I32 len -AnpP |I32 |foldEQ |NN const char* a|NN const char* b|I32 len +Ainp |I32 |foldEQ |NN const char* a|NN const char* b|I32 len AmP |I32 |ibcmp_locale |NN const char* a|NN const char* b|I32 len -AnpP |I32 |foldEQ_locale |NN const char* a|NN const char* b|I32 len +Ainp |I32 |foldEQ_locale |NN const char* a|NN const char* b|I32 len Am |I32 |ibcmp_utf8 |NN const char *s1|NULLOK char **pe1|UV l1 \ |bool u1|NN const char *s2|NULLOK char **pe2 \ |UV l2|bool u2 @@ -691,7 +691,7 @@ Amd |I32 |foldEQ_utf8 |NN const char *s1|NULLOK char **pe1|UV l1 \ AMp |I32 |foldEQ_utf8_flags |NN const char *s1|NULLOK char **pe1|UV l1 \ |bool u1|NN const char *s2|NULLOK char **pe2 \ |UV l2|bool u2|U32 flags -AnpP 
|I32 |foldEQ_latin1 |NN const char* a|NN const char* b|I32 len +Ainp |I32 |foldEQ_latin1 |NN const char* a|NN const char* b|I32 len #if defined(PERL_IN_DOIO_C) sR |bool |ingroup |Gid_t testgid|bool effective #endif diff --git a/handy.h b/handy.h index 4d2f4bc933..80f9cf4b76 100644 --- a/handy.h +++ b/handy.h @@ -485,8 +485,13 @@ Returns zero if non-equal, or non-zero if equal. #define strnNE(s1,s2,l) (strncmp(s1,s2,l)) #define strnEQ(s1,s2,l) (!strncmp(s1,s2,l)) +/* These names are controversial, so guarding against their being used in more + * places than they already are. strBEGs and StrStartsWith are potential + * candidates */ +#if defined(PERL_IN_DOIO_C) || defined(PERL_IN_GV_C) || defined(PERL_IN_HV_C) || defined(PERL_IN_LOCALE_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_TOKE_C) || defined(PERL_EXT) #define strNEs(s1,s2) (strncmp(s1,"" s2 "", sizeof(s2)-1)) #define strEQs(s1,s2) (!strncmp(s1,"" s2 "", sizeof(s2)-1)) +#endif #ifdef HAS_MEMCMP # define memNE(s1,s2,l) (memcmp(s1,s2,l)) diff --git a/inline.h b/inline.h index acd19e5fb6..f7bd4a3076 100644 --- a/inline.h +++ b/inline.h @@ -1645,6 +1645,92 @@ S_cx_popgiven(pTHX_ PERL_CONTEXT *cx) SvREFCNT_dec(sv); } +/* ------------------ util.h ------------------------------------------- */ + +/* +=head1 Miscellaneous Functions + +=for apidoc foldEQ + +Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the +same +case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes +match themselves and their opposite case counterparts. Non-cased and non-ASCII +range bytes match only themselves. 
+ +=cut +*/ + +PERL_STATIC_INLINE I32 +Perl_foldEQ(const char *s1, const char *s2, I32 len) +{ + const U8 *a = (const U8 *)s1; + const U8 *b = (const U8 *)s2; + + PERL_ARGS_ASSERT_FOLDEQ; + + assert(len >= 0); + + while (len--) { + if (*a != *b && *a != PL_fold[*b]) + return 0; + a++,b++; + } + return 1; +} + +I32 +Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len) +{ + /* Compare non-utf8 using Unicode (Latin1) semantics. Does not work on + * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor + * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these. Nor + * does it check that the strings each have at least 'len' characters */ + + const U8 *a = (const U8 *)s1; + const U8 *b = (const U8 *)s2; + + PERL_ARGS_ASSERT_FOLDEQ_LATIN1; + + assert(len >= 0); + + while (len--) { + if (*a != *b && *a != PL_fold_latin1[*b]) { + return 0; + } + a++, b++; + } + return 1; +} + +/* +=for apidoc foldEQ_locale + +Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the +same case-insensitively in the current locale; false otherwise. + +=cut +*/ + +I32 +Perl_foldEQ_locale(const char *s1, const char *s2, I32 len) +{ + dVAR; + const U8 *a = (const U8 *)s1; + const U8 *b = (const U8 *)s2; + + PERL_ARGS_ASSERT_FOLDEQ_LOCALE; + + assert(len >= 0); + + while (len--) { + if (*a != *b && *a != PL_fold_locale[*b]) + return 0; + a++,b++; + } + return 1; +} + /* * ex: set ts=8 sts=4 sw=4 et: */ diff --git a/lib/feature.pm b/lib/feature.pm index fe5c513e59..ed13273f11 100644 --- a/lib/feature.pm +++ b/lib/feature.pm @@ -5,7 +5,7 @@ package feature; -our $VERSION = '1.46'; +our $VERSION = '1.47'; our %feature = ( fc => 'feature_fc', @@ -180,50 +180,22 @@ operator|perlop/Range Operators>. =head2 The 'unicode_eval' and 'evalbytes' features -Under the C<unicode_eval> feature, Perl's C<eval> function, when passed a -string, will evaluate it as a string of characters, ignoring any -C<use utf8> declarations. 
C<use utf8> exists to declare the encoding of -the script, which only makes sense for a stream of bytes, not a string of -characters. Source filters are forbidden, as they also really only make -sense on strings of bytes. Any attempt to activate a source filter will -result in an error. - -The C<evalbytes> feature enables the C<evalbytes> keyword, which evaluates -the argument passed to it as a string of bytes. It dies if the string -contains any characters outside the 8-bit range. Source filters work -within C<evalbytes>: they apply to the contents of the string being -evaluated. - -Together, these two features are intended to replace the historical C<eval> -function, which has (at least) two bugs in it, that cannot easily be fixed -without breaking existing programs: - -=over - -=item * - -C<eval> behaves differently depending on the internal encoding of the -string, sometimes treating its argument as a string of bytes, and sometimes -as a string of characters. - -=item * - -Source filters activated within C<eval> leak out into whichever I<file> -scope is currently being compiled. To give an example with the CPAN module -L<Semi::Semicolons>: - - BEGIN { eval "use Semi::Semicolons; # not filtered here " } - # filtered here! - -C<evalbytes> fixes that to work the way one would expect: - - use feature "evalbytes"; - BEGIN { evalbytes "use Semi::Semicolons; # filtered " } - # not filtered - -=back - -These two features are available starting with Perl 5.16. +Together, these two features are intended to replace the legacy string +C<eval> function, which behaves problematically in some instances. They are +available starting with Perl 5.16, and are enabled by default by a +S<C<use 5.16>> or higher declaration. + +C<unicode_eval> changes the behavior of plain string C<eval> to work more +consistently, especially in the Unicode world. 
Certain (mis)behaviors +couldn't be changed without breaking some things that had come to rely on +them, so the feature can be enabled and disabled. Details are at +L<perlfunc/Under the "unicode_eval" feature>. + +C<evalbytes> is like string C<eval>, but operating on a byte stream that is +not UTF-8 encoded. Details are at L<perlfunc/evalbytes EXPR>. Without a +S<C<use feature 'evalbytes'>> nor a S<C<use v5.16>> (or higher) declaration in +the current scope, you can still access it by instead writing +C<CORE::evalbytes>. =head2 The 'current_sub' feature diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 542461742d..e3336f50e0 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -18835,31 +18835,56 @@ EOF_CODE ? "\nsub TODO_FAILING_BREAKS { 1 }\n" : "\nsub TODO_FAILING_BREAKS { 0 }\n"; - - push @output, - (map {"Test_GCB('$_');\n"} @backslash_X_tests), - (map {"Test_LB('$_');\n"} @LB_tests), - (map {"Test_SB('$_');\n"} @SB_tests), - (map {"Test_WB('$_');\n"} @WB_tests); - @output= map { map s/^/ /mgr, map "$_;\n", split /;\n/, $_ } @output; + # Cause there to be 'if' statements to only execute a portion of this + # long-running test each time, so that we can have a bunch of .t's running + # in parallel + my $chunks = 10 # Number of test files + - 1 # For GCB & SB + - 1 # For WB + - 4; # LB split into this many files my @output_chunked; my $chunk_count=0; - my $chunk_size= int(@output/10)+1; + my $chunk_size= int(@output / $chunks) + 1; while (@output) { $chunk_count++; my @chunk= splice @output, 0, $chunk_size; push @output_chunked, - "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count){\n", - @chunk, + "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", + @chunk, + "}\n"; + } + + $chunk_count++; + push @output_chunked, + "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", + (map {" Test_GCB('$_');\n"} @backslash_X_tests), + (map {" Test_SB('$_');\n"} @SB_tests), + "}\n"; + + + $chunk_size= int(@LB_tests / 4) + 1; + @LB_tests 
= map {" Test_LB('$_');\n"} @LB_tests; + while (@LB_tests) { + $chunk_count++; + my @chunk= splice @LB_tests, 0, $chunk_size; + push @output_chunked, + "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", + @chunk, "}\n"; } + $chunk_count++; + push @output_chunked, + "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", + (map {" Test_WB('$_');\n"} @WB_tests), + "}\n"; + &write($t_path, 0, # Not utf8; [$HEADER, diff --git a/op.c b/op.c index abe32e7e1a..51ffac2ac5 100644 --- a/op.c +++ b/op.c @@ -4642,7 +4642,7 @@ S_gen_constant_list(pTHX_ OP *o) old_next = o->op_next; o->op_next = 0; op_was_null = o->op_type == OP_NULL; - if (op_was_null) + if (op_was_null) /* b3698342565fb462291fba4b432cfcd05b6eb4e1 */ o->op_type = OP_CUSTOM; CALL_PEEP(curop); if (op_was_null) diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index 6f62f3fb40..10651b4f4f 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -2035,86 +2035,187 @@ X<error, handling> X<exception, handling> =for Pod::Functions catch exceptions or compile and run code -In the first form, often referred to as a "string eval", the return -value of EXPR is parsed and executed as if it -were a little Perl program. The value of the expression (which is itself -determined within scalar context) is first parsed, and if there were no -errors, executed as a block within the lexical context of the current Perl -program. This means, that in particular, any outer lexical variables are -visible to it, and any package variable settings or subroutine and format -definitions remain afterwards. - -Note that the value is parsed every time the L<C<eval>|/eval EXPR> -executes. If EXPR is omitted, evaluates L<C<$_>|perlvar/$_>. This form -is typically used to delay parsing and subsequent execution of the text -of EXPR until run time. 
- -If the -L<C<"unicode_eval"> feature|feature/The 'unicode_eval' and 'evalbytes' features> -is enabled (which is the default under a -C<use 5.16> or higher declaration), EXPR or L<C<$_>|perlvar/$_> is -treated as a string of characters, so L<C<use utf8>|utf8> declarations -have no effect, and source filters are forbidden. In the absence of the -L<C<"unicode_eval"> feature|feature/The 'unicode_eval' and 'evalbytes' features>, -will sometimes be treated as characters and sometimes as bytes, -depending on the internal encoding, and source filters activated within -the L<C<eval>|/eval EXPR> exhibit the erratic, but historical, behaviour -of affecting some outer file scope that is still compiling. See also -the L<C<evalbytes>|/evalbytes EXPR> operator, which always treats its -input as a byte stream and works properly with source filters, and the -L<feature> pragma. - -Problems can arise if the string expands a scalar containing a floating -point number. That scalar can expand to letters, such as C<"NaN"> or -C<"Infinity">; or, within the scope of a L<C<use locale>|locale>, the -decimal point character may be something other than a dot (such as a -comma). None of these are likely to parse as you are likely expecting. - -In the second form, the code within the BLOCK is parsed only once--at the -same time the code surrounding the L<C<eval>|/eval EXPR> itself was -parsed--and executed +C<eval> in all its forms is used to execute a little Perl program, +trapping any errors encountered so they don't crash the calling program. + +Plain C<eval> with no argument is just C<eval EXPR>, where the +expression is understood to be contained in L<C<$_>|perlvar/$_>. Thus +there are only two real C<eval> forms; the one with an EXPR is often +called "string eval". In a string eval, the value of the expression +(which is itself determined within scalar context) is first parsed, and +if there were no errors, executed as a block within the lexical context +of the current Perl program. 
This form is typically used to delay +parsing and subsequent execution of the text of EXPR until run time. +Note that the value is parsed every time the C<eval> executes. + +The other form is called "block eval". It is less general than string +eval, but the code within the BLOCK is parsed only once (at the same +time the code surrounding the C<eval> itself was parsed) and executed within the context of the current Perl program. This form is typically -used to trap exceptions more efficiently than the first (see below), while -also providing the benefit of checking the code within BLOCK at compile -time. - -The final semicolon, if any, may be omitted from the value of EXPR or within -the BLOCK. +used to trap exceptions more efficiently than the first, while also +providing the benefit of checking the code within BLOCK at compile time. +BLOCK is parsed and compiled just once. Since errors are trapped, it +often is used to check if a given feature is available. In both forms, the value returned is the value of the last expression -evaluated inside the mini-program; a return statement may be also used, just +evaluated inside the mini-program; a return statement may also be used, just as with subroutines. The expression providing the return value is evaluated in void, scalar, or list context, depending on the context of the -L<C<eval>|/eval EXPR> itself. See L<C<wantarray>|/wantarray> for more +C<eval> itself. See L<C<wantarray>|/wantarray> for more on how the evaluation context can be determined. If there is a syntax error or runtime error, or a L<C<die>|/die LIST> -statement is executed, L<C<eval>|/eval EXPR> returns -L<C<undef>|/undef EXPR> in scalar context or an empty list in list +statement is executed, C<eval> returns +L<C<undef>|/undef EXPR> in scalar context, or an empty list in list context, and L<C<$@>|perlvar/$@> is set to the error message. 
(Prior to 5.16, a bug caused L<C<undef>|/undef EXPR> to be returned in list context for syntax errors, but not for runtime errors.) If there was no error, L<C<$@>|perlvar/$@> is set to the empty string. A control flow operator like L<C<last>|/last LABEL> or L<C<goto>|/goto LABEL> can bypass the setting of L<C<$@>|perlvar/$@>. Beware that using -L<C<eval>|/eval EXPR> neither silences Perl from printing warnings to +C<eval> neither silences Perl from printing warnings to STDERR, nor does it stuff the text of warning messages into L<C<$@>|perlvar/$@>. To do either of those, you have to use the L<C<$SIG{__WARN__}>|perlvar/%SIG> facility, or turn off warnings inside the BLOCK or EXPR using S<C<no warnings 'all'>>. See L<C<warn>|/warn LIST>, L<perlvar>, and L<warnings>. -Note that, because L<C<eval>|/eval EXPR> traps otherwise-fatal errors, +Note that, because C<eval> traps otherwise-fatal errors, it is useful for determining whether a particular feature (such as L<C<socket>|/socket SOCKET,DOMAIN,TYPE,PROTOCOL> or L<C<symlink>|/symlink OLDFILE,NEWFILE>) is implemented. It is also Perl's exception-trapping mechanism, where the L<C<die>|/die LIST> operator is used to raise exceptions. -If you want to trap errors when loading an XS module, some problems with -the binary interface (such as Perl version skew) may be fatal even with -L<C<eval>|/eval EXPR> unless C<$ENV{PERL_DL_NONLAZY}> is set. See -L<perlrun>. 
+Before Perl 5.14, the assignment to L<C<$@>|perlvar/$@> occurred before +restoration +of localized variables, which means that for your code to run on older +versions, a temporary is required if you want to mask some, but not all +errors: + + # alter $@ on nefarious repugnancy only + { + my $e; + { + local $@; # protect existing $@ + eval { test_repugnancy() }; + # $@ =~ /nefarious/ and die $@; # Perl 5.14 and higher only + $@ =~ /nefarious/ and $e = $@; + } + die $e if defined $e + } + +There are some different considerations for each form: + +=over 4 + +=item String eval + +Since the return value of EXPR is executed as a block within the lexical +context of the current Perl program, any outer lexical variables are +visible to it, and any package variable settings or subroutine and +format definitions remain afterwards. + +=over 4 + +=item Under the L<C<"unicode_eval"> feature|feature/The 'unicode_eval' and 'evalbytes' features> + +If this feature is enabled (which is the default under a C<use 5.16> or +higher declaration), EXPR is considered to be +in the same encoding as the surrounding program. Thus if +S<L<C<use utf8>|utf8>> is in effect, the string will be treated as being +UTF-8 encoded. Otherwise, the string is considered to be a sequence of +independent bytes. Bytes that correspond to ASCII-range code points +will have their normal meanings for operators in the string. The +treatment of the other bytes depends on if the +L<C<"unicode_strings"> feature|feature/The 'unicode_strings' feature> is +in effect. + +In a plain C<eval> without an EXPR argument, being in S<C<use utf8>> or +not is irrelevant; the UTF-8ness of C<$_> itself determines the +behavior. + +Any S<C<use utf8>> or S<C<no utf8>> declarations within the string have +no effect, and source filters are forbidden. (C<unicode_strings>, +however, can appear within the string.) See also the +L<C<evalbytes>|/evalbytes EXPR> operator, which works properly with +source filters. 
+ +Variables defined outside the C<eval> and used inside it retain their +original UTF-8ness. Everything inside the string follows the normal +rules for a Perl program with the given state of S<C<use utf8>>. + +=item Outside the C<"unicode_eval"> feature + +In this case, the behavior is problematic and is not so easily +described. Here are two bugs that cannot easily be fixed without +breaking existing programs: + +=over 4 + +=item * + +It can lose track of whether something should be encoded as UTF-8 or +not. + +=item * + +Source filters activated within C<eval> leak out into whichever file +scope is currently being compiled. To give an example with the CPAN module +L<Semi::Semicolons>: + + BEGIN { eval "use Semi::Semicolons; # not filtered" } + # filtered here! + +L<C<evalbytes>|/evalbytes EXPR> fixes that to work the way one would +expect: + + use feature "evalbytes"; + BEGIN { evalbytes "use Semi::Semicolons; # filtered" } + # not filtered + +=back + +=back + +Problems can arise if the string expands a scalar containing a floating +point number. That scalar can expand to letters, such as C<"NaN"> or +C<"Infinity">; or, within the scope of a L<C<use locale>|locale>, the +decimal point character may be something other than a dot (such as a +comma). None of these are likely to parse as you are likely expecting. + +You should be especially careful to remember what's being looked at +when: + + eval $x; # CASE 1 + eval "$x"; # CASE 2 + + eval '$x'; # CASE 3 + eval { $x }; # CASE 4 + + eval "\$$x++"; # CASE 5 + $$x++; # CASE 6 + +Cases 1 and 2 above behave identically: they run the code contained in +the variable $x. (Although case 2 has misleading double quotes making +the reader wonder what else might be happening (nothing is).) Cases 3 +and 4 likewise behave in the same way: they run the code C<'$x'>, which +does nothing but return the value of $x. 
(Case 4 is preferred for +purely visual reasons, but it also has the advantage of compiling at +compile-time instead of at run-time.) Case 5 is a place where +normally you I<would> like to use double quotes, except that in this +particular situation, you can just use symbolic references instead, as +in case 6. + +An C<eval ''> executed within a subroutine defined +in the C<DB> package doesn't see the usual +surrounding lexical scope, but rather the scope of the first non-DB piece +of code that called it. You don't normally need to worry about this unless +you are writing a Perl debugger. + +The final semicolon, if any, may be omitted from the value of EXPR. + +=item Block eval If the code to be executed doesn't vary, you may use the eval-BLOCK form to trap run-time errors without incurring the penalty of @@ -2134,6 +2235,11 @@ Examples: # a run-time error eval '$answer ='; # sets $@ +If you want to trap errors when loading an XS module, some problems with +the binary interface (such as Perl version skew) may be fatal even with +C<eval> unless C<$ENV{PERL_DL_NONLAZY}> is set. See +L<perlrun>. + Using the C<eval {}> form as an exception trap in libraries does have some issues. Due to the current arguably broken state of C<__DIE__> hooks, you may wish not to trigger any C<__DIE__> hooks that user code may have installed. @@ -2159,56 +2265,13 @@ messages: Because this promotes action at a distance, this counterintuitive behavior may be fixed in a future release. -With an L<C<eval>|/eval EXPR>, you should be especially careful to -remember what's being looked at when: - - eval $x; # CASE 1 - eval "$x"; # CASE 2 - - eval '$x'; # CASE 3 - eval { $x }; # CASE 4 - - eval "\$$x++"; # CASE 5 - $$x++; # CASE 6 - -Cases 1 and 2 above behave identically: they run the code contained in -the variable $x. (Although case 2 has misleading double quotes making -the reader wonder what else might be happening (nothing is).) 
Cases 3 -and 4 likewise behave in the same way: they run the code C<'$x'>, which -does nothing but return the value of $x. (Case 4 is preferred for -purely visual reasons, but it also has the advantage of compiling at -compile-time instead of at run-time.) Case 5 is a place where -normally you I<would> like to use double quotes, except that in this -particular situation, you can just use symbolic references instead, as -in case 6. - -Before Perl 5.14, the assignment to L<C<$@>|perlvar/$@> occurred before -restoration -of localized variables, which means that for your code to run on older -versions, a temporary is required if you want to mask some but not all -errors: - - # alter $@ on nefarious repugnancy only - { - my $e; - { - local $@; # protect existing $@ - eval { test_repugnancy() }; - # $@ =~ /nefarious/ and die $@; # Perl 5.14 and higher only - $@ =~ /nefarious/ and $e = $@; - } - die $e if defined $e - } - C<eval BLOCK> does I<not> count as a loop, so the loop control statements L<C<next>|/next LABEL>, L<C<last>|/last LABEL>, or L<C<redo>|/redo LABEL> cannot be used to leave or restart the block. -An C<eval ''> executed within a subroutine defined -in the C<DB> package doesn't see the usual -surrounding lexical scope, but rather the scope of the first non-DB piece -of code that called it. You don't normally need to worry about this unless -you are writing a Perl debugger. +The final semicolon, if any, may be omitted from within the BLOCK. + +=back =item evalbytes EXPR X<evalbytes> @@ -2217,18 +2280,42 @@ X<evalbytes> =for Pod::Functions +evalbytes similar to string eval, but intend to parse a bytestream -This function is like L<C<eval>|/eval EXPR> with a string argument, -except it always parses its argument, or L<C<$_>|perlvar/$_> if EXPR is -omitted, as a string of bytes. A string containing characters whose -ordinal value exceeds 255 results in an error. Source filters activated -within the evaluated code apply to the code itself. 
+This function is similar to a L<string eval|/eval EXPR>, except it +always parses its argument (or L<C<$_>|perlvar/$_> if EXPR is omitted) +as a string of independent bytes. -L<C<evalbytes>|/evalbytes EXPR> is available only if the -L<C<"evalbytes"> feature|feature/The 'unicode_eval' and 'evalbytes' features> -is enabled or if it is prefixed with C<CORE::>. The +If called when S<C<use utf8>> is in effect, the string will be assumed +to be encoded in UTF-8, and C<evalbytes> will make a temporary +downgraded to non-UTF-8 copy to work from. If this is not possible +(because one or more characters in it require UTF-8), the C<evalbytes> +will fail with the error stored in C<$@>. + +Bytes that correspond to ASCII-range code points will have their normal +meanings for operators in the string. The treatment of the other bytes +depends on if the L<C<"unicode_strings"> feature|feature/The +'unicode_strings' feature> is in effect. + +Of course, variables that are UTF-8 and are referred to in the string +retain that: + + my $a = "\x{100}"; + evalbytes 'print ord $a, "\n"'; + +prints + + 256 + +and C<$@> is empty. + +Source filters activated within the evaluated code apply to the code +itself. + +L<C<evalbytes>|/evalbytes EXPR> is available starting in Perl v5.16. To +access it, you must say C<CORE::evalbytes>, but you can omit the +C<CORE::> if the L<C<"evalbytes"> feature|feature/The 'unicode_eval' and 'evalbytes' features> -is enabled automatically with a C<use v5.16> (or higher) declaration in -the current scope. +is enabled. This is enabled automatically with a C<use v5.16> (or +higher) declaration in the current scope. =item exec LIST X<exec> X<execute> diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index 1c07632dec..22f71ab211 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -1088,11 +1088,14 @@ white space within it. This is allowed because C<E<sol>xx> is automatically turned on within this construct. 
All the other escapes accepted by normal bracketed character classes are -accepted here as well; but unrecognized escapes that generate warnings -in normal classes are fatal errors here. - -All warnings from these class elements are fatal, as well as some -practices that don't currently warn. For example you cannot say +accepted here as well. + +Because this construct compiles under +L<C<use re 'strict'>|re/'strict' mode>, unrecognized escapes that +generate warnings in normal classes are fatal errors here, as well as +all other warnings from these class elements, as well as some +practices that don't currently warn outside C<re 'strict'>. For example +you cannot say /(?[ [ \xF ] ])/ # Syntax error! diff --git a/pod/perlretut.pod b/pod/perlretut.pod index 9c7ab56042..87ef42b145 100644 --- a/pod/perlretut.pod +++ b/pod/perlretut.pod @@ -197,6 +197,14 @@ be backslashed: 'C:\WIN32' =~ /C:\\WIN/; # matches +In situations where it doesn't make sense for a particular metacharacter +to mean what it normally does, it automatically loses its +metacharacter-ness and becomes an ordinary character that is to be +matched literally. For example, the C<'}'> is a metacharacter only when +it is the mate of a C<'{'> metacharacter. Otherwise it is treated as a +literal RIGHT CURLY BRACKET. This may lead to unexpected results. +L<C<use re 'strict'>|re/'strict' mode> can catch some of these. + In addition to the metacharacters, there are some ASCII characters which don't have printable character equivalents and are instead represented by I<escape sequences>. 
Common examples are C<\t> for a diff --git a/proto.h b/proto.h index 59f9b4b370..fea633f86b 100644 --- a/proto.h +++ b/proto.h @@ -886,24 +886,15 @@ PERL_CALLCONV PADOFFSET Perl_find_rundefsvoffset(pTHX) PERL_CALLCONV char* Perl_find_script(pTHX_ const char *scriptname, bool dosearch, const char *const *const search_ext, I32 flags); #define PERL_ARGS_ASSERT_FIND_SCRIPT \ assert(scriptname) -PERL_CALLCONV I32 Perl_foldEQ(const char* a, const char* b, I32 len) - __attribute__warn_unused_result__ - __attribute__pure__; +PERL_STATIC_INLINE I32 Perl_foldEQ(const char* a, const char* b, I32 len); #define PERL_ARGS_ASSERT_FOLDEQ \ assert(a); assert(b) - -PERL_CALLCONV I32 Perl_foldEQ_latin1(const char* a, const char* b, I32 len) - __attribute__warn_unused_result__ - __attribute__pure__; +PERL_STATIC_INLINE I32 Perl_foldEQ_latin1(const char* a, const char* b, I32 len); #define PERL_ARGS_ASSERT_FOLDEQ_LATIN1 \ assert(a); assert(b) - -PERL_CALLCONV I32 Perl_foldEQ_locale(const char* a, const char* b, I32 len) - __attribute__warn_unused_result__ - __attribute__pure__; +PERL_STATIC_INLINE I32 Perl_foldEQ_locale(const char* a, const char* b, I32 len); #define PERL_ARGS_ASSERT_FOLDEQ_LOCALE \ assert(a); assert(b) - /* PERL_CALLCONV I32 foldEQ_utf8(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2); */ PERL_CALLCONV I32 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags); #define PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS \ diff --git a/regcharclass.h b/regcharclass.h index 4be75bcac7..f66d1479e9 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1897,7 +1897,7 @@ * 37f6186253da9824bdb27f4ad867bfe8c25d4dc6bdb2f05585e40a034675a348 lib/unicore/extracted/DLineBreak.txt * ef24061b5a5dc93d7e90c2e34530ec757180ee75d872cba65ffc946e52624ae8 lib/unicore/extracted/DNumType.txt * a197371fec9a1b517058b440841f60f9378d81682084eef8db22a88cb2f96e90 lib/unicore/extracted/DNumValues.txt 
- * 4bcfb4545be21663ca38a2acbfcbf2b0f3252652a34b50f1a56ef76cb959861b lib/unicore/mktables + * 79a7216aceb1d291f2857085545fdda289518bc540a09bc0a15cde105d76028d lib/unicore/mktables * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl * 1d27ae8b75d81a082b1fc594673e08540280f8169309a7b5047015c8091a2bfb regen/regcharclass.pl diff --git a/regen/feature.pl b/regen/feature.pl index 66fc017da6..579120e456 100755 --- a/regen/feature.pl +++ b/regen/feature.pl @@ -367,7 +367,7 @@ read_only_bottom_close_and_rename($h); __END__ package feature; -our $VERSION = '1.46'; +our $VERSION = '1.47'; FEATURES @@ -490,50 +490,22 @@ operator|perlop/Range Operators>. =head2 The 'unicode_eval' and 'evalbytes' features -Under the C<unicode_eval> feature, Perl's C<eval> function, when passed a -string, will evaluate it as a string of characters, ignoring any -C<use utf8> declarations. C<use utf8> exists to declare the encoding of -the script, which only makes sense for a stream of bytes, not a string of -characters. Source filters are forbidden, as they also really only make -sense on strings of bytes. Any attempt to activate a source filter will -result in an error. - -The C<evalbytes> feature enables the C<evalbytes> keyword, which evaluates -the argument passed to it as a string of bytes. It dies if the string -contains any characters outside the 8-bit range. Source filters work -within C<evalbytes>: they apply to the contents of the string being -evaluated. - -Together, these two features are intended to replace the historical C<eval> -function, which has (at least) two bugs in it, that cannot easily be fixed -without breaking existing programs: - -=over - -=item * - -C<eval> behaves differently depending on the internal encoding of the -string, sometimes treating its argument as a string of bytes, and sometimes -as a string of characters. 
- -=item * - -Source filters activated within C<eval> leak out into whichever I<file> -scope is currently being compiled. To give an example with the CPAN module -L<Semi::Semicolons>: - - BEGIN { eval "use Semi::Semicolons; # not filtered here " } - # filtered here! - -C<evalbytes> fixes that to work the way one would expect: - - use feature "evalbytes"; - BEGIN { evalbytes "use Semi::Semicolons; # filtered " } - # not filtered - -=back - -These two features are available starting with Perl 5.16. +Together, these two features are intended to replace the legacy string +C<eval> function, which behaves problematically in some instances. They are +available starting with Perl 5.16, and are enabled by default by a +S<C<use 5.16>> or higher declaration. + +C<unicode_eval> changes the behavior of plain string C<eval> to work more +consistently, especially in the Unicode world. Certain (mis)behaviors +couldn't be changed without breaking some things that had come to rely on +them, so the feature can be enabled and disabled. Details are at +L<perlfunc/Under the "unicode_eval" feature>. + +C<evalbytes> is like string C<eval>, but operating on a byte stream that is +not UTF-8 encoded. Details are at L<perlfunc/evalbytes EXPR>. Without a +S<C<use feature 'evalbytes'>> nor a S<C<use v5.16>> (or higher) declaration in +the current scope, you can still access it by instead writing +C<CORE::evalbytes>. =head2 The 'current_sub' feature diff --git a/util.c b/util.c index 406286c219..bd568bc22a 100644 --- a/util.c +++ b/util.c @@ -1022,89 +1022,6 @@ Perl_fbm_instr(pTHX_ unsigned char *big, unsigned char *bigend, SV *littlestr, U } } - -/* -=for apidoc foldEQ - -Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the -same -case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes -match themselves and their opposite case counterparts. Non-cased and non-ASCII -range bytes match only themselves. 
- -=cut -*/ - - -I32 -Perl_foldEQ(const char *s1, const char *s2, I32 len) -{ - const U8 *a = (const U8 *)s1; - const U8 *b = (const U8 *)s2; - - PERL_ARGS_ASSERT_FOLDEQ; - - assert(len >= 0); - - while (len--) { - if (*a != *b && *a != PL_fold[*b]) - return 0; - a++,b++; - } - return 1; -} -I32 -Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len) -{ - /* Compare non-utf8 using Unicode (Latin1) semantics. Does not work on - * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor - * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these. Nor - * does it check that the strings each have at least 'len' characters */ - - const U8 *a = (const U8 *)s1; - const U8 *b = (const U8 *)s2; - - PERL_ARGS_ASSERT_FOLDEQ_LATIN1; - - assert(len >= 0); - - while (len--) { - if (*a != *b && *a != PL_fold_latin1[*b]) { - return 0; - } - a++, b++; - } - return 1; -} - -/* -=for apidoc foldEQ_locale - -Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the -same case-insensitively in the current locale; false otherwise. - -=cut -*/ - -I32 -Perl_foldEQ_locale(const char *s1, const char *s2, I32 len) -{ - dVAR; - const U8 *a = (const U8 *)s1; - const U8 *b = (const U8 *)s2; - - PERL_ARGS_ASSERT_FOLDEQ_LOCALE; - - assert(len >= 0); - - while (len--) { - if (*a != *b && *a != PL_fold_locale[*b]) - return 0; - a++,b++; - } - return 1; -} - /* copy a string to a safe spot */ /* -- Perl5 Master Repository
