In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/69930b1860ecb5c4bddc59bb2ee41e5cea977545?hp=a9e4bc69d5e57e574e1d63e589e86d007e873f88>
- Log ----------------------------------------------------------------- commit 69930b1860ecb5c4bddc59bb2ee41e5cea977545 Author: Jesse Vincent <je...@bestpractical.com> Date: Tue May 3 17:20:16 2011 -0400 Bump Module::CoreList because the content changed since RC1 and we have nice, zealous porting tests M dist/Module-CoreList/Changes M dist/Module-CoreList/lib/Module/CoreList.pm commit 4e31220dc51f11a5699fc39e0e529f2a824f2fb7 Author: Jesse Vincent <je...@bestpractical.com> Date: Fri Apr 29 02:11:43 2011 +0800 RC1 -> RC2; push off the date of 5.14.0 until a week from tomorrow M dist/Module-CoreList/lib/Module/CoreList.pm M patchlevel.h M pod/perlhist.pod commit 3d21943ed8af6419b6284007a118fd1217b51971 Author: Jesse Vincent <je...@bestpractical.com> Date: Tue May 3 16:58:58 2011 -0400 Documentation for sprintf updates in Perl 5.14 M pod/perlfunc.pod commit 1f59b28370e2e2b18e56e01ba9cf10440343bcd1 Author: Karl Williamson <pub...@khwilliamson.com> Date: Tue May 3 14:08:43 2011 -0600 Doc changes for [perl #89750] M pod/perldelta.pod M pod/perlre.pod M pod/perlrecharclass.pod commit 7b4a7e586ed8557b4b47ff04c789aa6a65b1c944 Author: Karl Williamson <pub...@khwilliamson.com> Date: Tue May 3 11:47:50 2011 -0600 regcomp.c: White space only A previous commit added an 'if' around this code. This now indents the block properly. M regcomp.c commit 827f5bb80b513fa181ae206648e6d58d9d82eb29 Author: Karl Williamson <pub...@khwilliamson.com> Date: Tue May 3 11:44:28 2011 -0600 PATCH: [perl #89750]: Unicode regex negated case-insensitivity This patch causes inverted [bracketed] character classes to not handle multi-character folds. The reason is that these can lead to very counter-intuitive results (see bug discussion). In an inverted character class, only single-char folds are now generated. However the fold for \xDF=>ss is hard-coded in, and it was too much trouble sending flags to the sub-sub routine that does this, so another check is done at the point of storing the list of multi-char folds. Since \xDF doesn't have a single char fold, this works. M regcomp.c M t/re/fold_grind.t M t/re/re_tests commit 36bb2ab64fa2ef022d7870082c0dcc6db902c86e Author: Karl Williamson <pub...@khwilliamson.com> Date: Tue May 3 10:12:00 2011 -0600 utf8.c: Add _flags version of to_utf8_fold() And also to_uni_fold(). The flag allows retrieving either simple or full folds. The interface is subject to change, so these are marked experimental and their names begin with underscore. The old versions are turned into macros calling the new versions with the correct extra parameter. M embed.fnc M embed.h M global.sym M proto.h M utf8.c M utf8.h commit 7bee57f6d73fe07d73dd9b4a538e4ffd60c7eed8 Author: Karl Williamson <pub...@khwilliamson.com> Date: Tue May 3 09:52:49 2011 -0600 embed.fnc: Allow NULL arg to to_utf8_case() Code within the function doesn't assume that the parameter is non-null, and in fact the specials are retrieved by swash_init(). Having the parameter null just means that no specials will be retrieved in the current call. M embed.fnc M proto.h commit a5eda76fc2190125cec66d38de6fa46913045ca0 Author: Jesse Vincent <je...@bestpractical.com> Date: Tue May 3 12:07:41 2011 -0400 Small typo fixes in perldelta M pod/perldelta.pod commit 27747dc0b5987fcab125aea502e520925bbf667f Author: Father Chrysostomos <spr...@cpan.org> Date: Tue May 3 12:05:40 2011 -0400 Minor perldelta fixes â¢Â Remove C<...> around get-magic and set-magic. Those are prose descriptions of what is known internally as mg_get, SvGETMAGIC, SVs_GMG, etc. ⢠Re-instate the message that 804b5feed removed, but in the form in which it appears in perldiag. ⢠Remove the thing about version class methods. Itâs a bug fix, not a problem (whether known or unknown :-), and not a significant one. ⢠Spelling mistake M pod/perldelta.pod ----------------------------------------------------------------------- Summary of changes: dist/Module-CoreList/Changes | 2 +- dist/Module-CoreList/lib/Module/CoreList.pm | 6 ++-- embed.fnc | 8 +++-- embed.h | 4 +- global.sym | 4 +- patchlevel.h | 2 +- pod/perldelta.pod | 52 +++++++++++++++++++------- pod/perlfunc.pod | 29 +++++++++++---- pod/perlhist.pod | 3 +- pod/perlre.pod | 6 +++- pod/perlrecharclass.pod | 32 +++++++++++++++-- proto.h | 29 +++++++++------ regcomp.c | 30 +++++++++++++--- t/re/fold_grind.t | 2 + t/re/re_tests | 5 +++ utf8.c | 19 ++++++---- utf8.h | 3 ++ 17 files changed, 175 insertions(+), 61 deletions(-) diff --git a/dist/Module-CoreList/Changes b/dist/Module-CoreList/Changes index 03ab739..8ddb7e4 100644 --- a/dist/Module-CoreList/Changes +++ b/dist/Module-CoreList/Changes @@ -1,4 +1,4 @@ -2.47 +2.48 - Updated for v5.14.0 2.46 Sun Mar 20 2011 diff --git a/dist/Module-CoreList/lib/Module/CoreList.pm b/dist/Module-CoreList/lib/Module/CoreList.pm index b15ff1f..2983988 100644 --- a/dist/Module-CoreList/lib/Module/CoreList.pm +++ b/dist/Module-CoreList/lib/Module/CoreList.pm @@ -2,7 +2,7 @@ package Module::CoreList; use strict; use vars qw/$VERSION %released %version %families %upstream %bug_tracker %deprecated/; -$VERSION = '2.47'; +$VERSION = '2.48'; =head1 NAME @@ -347,7 +347,7 @@ sub removed_raw { 5.013009 => '2011-01-20', 5.013010 => '2011-02-20', 5.013011 => '2011-03-20', - 5.014000 => '2012-04-28', + 5.014000 => '2012-05-11', ); for my $version ( sort { $a <=> $b } keys %released ) { @@ -24090,7 +24090,7 @@ for my $version ( sort { $a <=> $b } keys %released ) { 'Module::Build::PodParser'=> '0.3800', 'Module::Build::Version'=> '0.87', 'Module::Build::YAML' => '1.41', - 'Module::CoreList' => '2.47', + 'Module::CoreList' => '2.48', 'Module::Load' => '0.18', 'Module::Load::Conditional'=> '0.44', 'Module::Loaded' => '0.06', diff --git a/embed.fnc b/embed.fnc index b891b43..65116ad 100644 --- a/embed.fnc +++ b/embed.fnc @@ -562,7 +562,8 @@ ApPR |bool |is_uni_xdigit |UV c Ap |UV |to_uni_upper |UV c|NN U8 *p|NN STRLEN *lenp Ap |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp Ap |UV |to_uni_lower |UV c|NN U8 *p|NN STRLEN *lenp -Ap |UV |to_uni_fold |UV c|NN U8 *p|NN STRLEN *lenp +Amp |UV |to_uni_fold |UV c|NN U8 *p|NN STRLEN *lenp +AMp |UV |_to_uni_fold_flags|UV c|NN U8 *p|NN STRLEN *lenp|U8 flags ApPR |bool |is_uni_alnum_lc|UV c ApPR |bool |is_uni_idfirst_lc|UV c ApPR |bool |is_uni_alpha_lc|UV c @@ -1318,11 +1319,12 @@ EsMR |HV* |invlist_union |NN HV* const a|NN HV* const b Ap |void |taint_env Ap |void |taint_proper |NULLOK const char* f|NN const char *const s Apd |UV |to_utf8_case |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp \ - |NN SV **swashp|NN const char *normal|NN const char *special + |NN SV **swashp|NN const char *normal|NULLOK const char *special Apd |UV |to_utf8_lower |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp Apd |UV |to_utf8_upper |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp Apd |UV |to_utf8_title |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp -Apd |UV |to_utf8_fold |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp +Ampd |UV |to_utf8_fold |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp +AMp |UV |_to_utf8_fold_flags|NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp|U8 flags #if defined(UNLINK_ALL_VERSIONS) Ap |I32 |unlnk |NN const char* f #endif diff --git a/embed.h b/embed.h index 89c4fa8..9ff6440 100644 --- a/embed.h +++ b/embed.h @@ -27,6 +27,8 @@ /* Hide global symbols */ #define Gv_AMupdate(a,b) Perl_Gv_AMupdate(aTHX_ a,b) +#define _to_uni_fold_flags(a,b,c,d) Perl__to_uni_fold_flags(aTHX_ a,b,c,d) +#define _to_utf8_fold_flags(a,b,c,d) Perl__to_utf8_fold_flags(aTHX_ a,b,c,d) #define amagic_call(a,b,c,d) Perl_amagic_call(aTHX_ a,b,c,d) #define amagic_deref_call(a,b) Perl_amagic_deref_call(aTHX_ a,b) #define apply_attrs_string(a,b,c,d) Perl_apply_attrs_string(aTHX_ a,b,c,d) @@ -623,7 +625,6 @@ #define taint_env() Perl_taint_env(aTHX) #define taint_proper(a,b) Perl_taint_proper(aTHX_ a,b) #define tmps_grow(a) Perl_tmps_grow(aTHX_ a) -#define to_uni_fold(a,b,c) Perl_to_uni_fold(aTHX_ a,b,c) #define to_uni_lower(a,b,c) Perl_to_uni_lower(aTHX_ a,b,c) #define to_uni_lower_lc(a) Perl_to_uni_lower_lc(aTHX_ a) #define to_uni_title(a,b,c) Perl_to_uni_title(aTHX_ a,b,c) @@ -631,7 +632,6 @@ #define to_uni_upper(a,b,c) Perl_to_uni_upper(aTHX_ a,b,c) #define to_uni_upper_lc(a) Perl_to_uni_upper_lc(aTHX_ a) #define to_utf8_case(a,b,c,d,e,f) Perl_to_utf8_case(aTHX_ a,b,c,d,e,f) -#define to_utf8_fold(a,b,c) Perl_to_utf8_fold(aTHX_ a,b,c) #define to_utf8_lower(a,b,c) Perl_to_utf8_lower(aTHX_ a,b,c) #define to_utf8_title(a,b,c) Perl_to_utf8_title(aTHX_ a,b,c) #define to_utf8_upper(a,b,c) Perl_to_utf8_upper(aTHX_ a,b,c) diff --git a/global.sym b/global.sym index dde11d4..89fb825 100644 --- a/global.sym +++ b/global.sym @@ -21,6 +21,8 @@ Perl__append_range_to_invlist Perl__new_invlist Perl__swash_inversion_hash Perl__swash_to_invlist +Perl__to_uni_fold_flags +Perl__to_utf8_fold_flags Perl_amagic_call Perl_amagic_deref_call Perl_apply_attrs_string @@ -732,7 +734,6 @@ Perl_sys_term Perl_taint_env Perl_taint_proper Perl_tmps_grow -Perl_to_uni_fold Perl_to_uni_lower Perl_to_uni_lower_lc Perl_to_uni_title @@ -740,7 +741,6 @@ Perl_to_uni_title_lc Perl_to_uni_upper Perl_to_uni_upper_lc Perl_to_utf8_case -Perl_to_utf8_fold Perl_to_utf8_lower Perl_to_utf8_title Perl_to_utf8_upper diff --git a/patchlevel.h b/patchlevel.h index ffdef78..72a4e12 100644 --- a/patchlevel.h +++ b/patchlevel.h @@ -133,7 +133,7 @@ hunk. # endif static const char * const local_patches[] = { NULL - ,"RC1" + ,"RC2" #ifdef PERL_GIT_UNCOMMITTED_CHANGES ,"uncommitted-changes" #endif diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 817e84f..4319436 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -56,6 +56,8 @@ This release provides full functionality for C<use feature 'unicode_strings'>. Under its scope, all string operations executed and regular expressions compiled (even if executed outside its scope) have Unicode semantics. See L<feature/"the 'unicode_strings' feature">. +However, see L</Inverted bracketed character classes and multi-character folds>, +below. This feature avoids most forms of the "Unicode Bug" (see L<perlunicode/The "Unicode Bug"> for details). If there is any @@ -529,6 +531,29 @@ In addition to the sections that follow, see L</C API Changes>. =head2 Regular Expressions and String Escapes +=head3 Inverted bracketed character classes and multi-character folds + +Some characters match a sequence of two or three characters in C</i> +regular expression matching under Unicode rules. One example is +C<LATIN SMALL LETTER SHARP S> which matches the sequence C<ss>. + + 'ss' =~ /\A[\N{LATIN SMALL LETTER SHARP S}]\z/i # Matches + +This, however, can lead to very counter-intuitive results, especially +when inverted. Because of this, Perl 5.14 does not use multi-character C</i> +matching in inverted character classes. + + 'ss' =~ /\A[^\N{LATIN SMALL LETTER SHARP S}]+\z/i # ??? + +This should match any sequences of characters that aren't the C<SHARP S> +nor what C<SHARP S> matches under C</i>. C<"s"> isn't C<SHARP S>, but +Unicode says that C<"ss"> is what C<SHARP S> matches under C</i>. So +which one "wins"? Do you fail the match because the string has C<ss> or +accept it because it has an C<s> followed by another C<s>? + +Earlier releases of Perl did allow this multi-character matching, +but due to bugs, it mostly did not work. + =head3 \400-\777 In certain circumstances, C<\400>-C<\777> in regexes have behaved @@ -1839,7 +1864,6 @@ L<Module::Load::Conditional> has been upgraded from version 0.34 to 0.44. The L<mro> pragma has been upgraded from version 1.02 to 1.07. - =item * L<NDBM_File> has been upgraded from version 1.08 to 1.12. @@ -2320,6 +2344,13 @@ L<Ambiguous use of %c resolved as operator %c|perldiag/"Ambiguous use of %c reso L<Ambiguous use of %c{%s} resolved to %c%s|perldiag/"Ambiguous use of %c{%s} resolved to %c%s"> +=item * + +L<Ambiguous use of %c{%s[...]} resolved to %c%s[...]|perldiag/"Ambiguous use of %c{%s[...]} resolved to %c%s[...]"> + +=item * + +L<Ambiguous use of %c{%s{...}} resolved to %c%s{...}|perldiag/"Ambiguous use of %c{%s{...}} resolved to %c%s{...}"> =item * @@ -3073,8 +3104,8 @@ side-chains of the optree. The following functions/macros have been added to the API. The C<*_nomg> macros are equivalent to their non-C<_nomg> variants, except that they ignore -C<get-magic>. Those ending in C<_flags> allow one to specify whether -C<get-magic> is processed. +get-magic. Those ending in C<_flags> allow one to specify whether +get-magic is processed. sv_2bool_flags SvTRUE_nomg @@ -3920,7 +3951,7 @@ reference already (such as from a previous FETCH) [perl #72144]. =item * -C<splice> now calls C<set-magic> (so changes made +C<splice> now calls set-magic (so changes made by C<splice @ISA> are respected by method calls) [perl #78400]. =item * @@ -3930,7 +3961,7 @@ FETCH/STORE at all [perl #43789] (5.12.2). =item * -utf8::is_utf8() now respects C<get-magic> (like C<$1>) (5.12.1). +utf8::is_utf8() now respects get-magic (like C<$1>) (5.12.1). =back @@ -4419,11 +4450,6 @@ interrupted by a signal. =item * -L<version> now prevents object methods from being called as class methods -(d808b68) - -=item * - The changes in prototype handling break L<Switch>. A patch has been sent upstream and will hopefully appear on CPAN soon. @@ -4431,7 +4457,7 @@ upstream and will hopefully appear on CPAN soon. The upgrade to F<ExtUtils-MakeMaker-6.57_05> has caused some tests in the F<Module-Install> distribution on CPAN to -fail. (Specifically, F<02_mymeta.t> tests 5 and 21l; F<18_all_from.t> +fail. (Specifically, F<02_mymeta.t> tests 5 and 21; F<18_all_from.t> tests 6 and 15; F<19_authors.t> tests 5, 13, 21, and 29; and F<20_authors_with_special_characters.t> tests 6, 15, and 23 in version 1.00 of that distribution now fail.) @@ -4464,7 +4490,7 @@ that release's L<perl5120delta>. split() no longer modifies C<@_> when called in scalar or void context. In void context it now produces a "Useless use of split" warning. -This was also a perl 5.12.0 changed that missed the perldelta. +This was also a perl 5.12.0 change that missed the perldelta. =head1 Obituary @@ -4475,7 +4501,6 @@ was richer for his involvement. He will be missed. =head1 Acknowledgements - Perl 5.14.0 represents one year of development since Perl 5.12.0 and contains nearly 550,000 lines of changes across nearly 3,000 files from 150 authors and committers. @@ -4528,7 +4553,6 @@ Many of the changes included in this version originated in the CPAN modules included in Perl's core. We're grateful to the entire CPAN community for helping Perl to flourish. - =head1 Reporting Bugs If you find what you think is a bug, you might check the articles diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index 26b8949..3e49e2a 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -6161,15 +6161,30 @@ whatever the default integer size is on your platform (usually 32 or 64 bits), but you can override this to use instead one of the standard C types, as supported by the compiler used to build Perl: - l interpret integer as C type "long" or "unsigned long" + hh interpret integer as C type "char" or "unsigned char" + on Perl 5.14 or later h interpret integer as C type "short" or "unsigned short" - q, L or ll interpret integer as C type "long long", "unsigned long long". - or "quads" (typically 64-bit integers) + j intepret integer as C type "intmax_t" on Perl 5.14 + or later, and only with a C99 compiler (unportable) + l interpret integer as C type "long" or "unsigned long" + q, L, or ll interpret integer as C type "long long", "unsigned long long", + or "quad" (typically 64-bit integers) + t intepret integer as C type "ptrdiff_t" on Perl 5.14 or later + z intepret integer as C type "size_t" on Perl 5.14 or later + +As of 5.14, none of these raises an exception if they are not supported on +your platform. However, if warnings are enabled, a warning of the +C<printf> warning class is issued on an unsupported conversion flag. +Should you instead prefer an exception, do this: + + use warnings FATAL => "printf"; + +If you would like to know about a version dependency before you +start running the program, put something like this at its top: + + use 5.014; # for hh/j/t/z/ printf modifiers -The last will raise an exception if Perl does not understand "quads" in your -installation. (This requires either that the platform natively support quads, -or that Perl were specifically compiled to support quads.) You can find out -whether your Perl supports quads via L<Config>: +You can find out whether your Perl supports quads via L<Config>: use Config; if ($Config{use64bitint} eq "define" || $Config{longsize} >= 8) { diff --git a/pod/perlhist.pod b/pod/perlhist.pod index 28f6631..e782738 100644 --- a/pod/perlhist.pod +++ b/pod/perlhist.pod @@ -452,8 +452,9 @@ the strings?). Ãvar 5.13.10 2011-Feb-20 Florian 5.13.11 2011-Mar-20 Jesse 5.14.0RC1 2011-Apr-20 + Jesse 5.14.0RC2 2011-May-04 - Jesse 5.14.0 2011-Apr-28 + Jesse 5.14.0 2011-May-11 =head2 SELECTED RELEASE SIZES diff --git a/pod/perlre.pod b/pod/perlre.pod index 12617e2..c4ec417 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -72,7 +72,11 @@ are split between groupings, or when one or more are quantified. Thus # be even if it did!! "\N{LATIN SMALL LIGATURE FI}" =~ /(f)(i)/i; # Doesn't match! -Also, this matching doesn't fully conform to the current Unicode +Perl doesn't match multiple characters in an inverted bracketed +character class, which otherwise could be highly confusing. See +L<perlrecharclass/Negation>. + +Also, Perl matching doesn't fully conform to the current Unicode C</i> recommendations, which ask that the matching be made upon the NFD (Normalization Form Decomposed) of the text. However, Unicode is in the process of reconsidering and revising their recommendations. diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index 4c91931..2b76dfb 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -401,7 +401,7 @@ The third form of character class you can use in Perl regular expressions is the bracketed character class. In its simplest form, it lists the characters that may be matched, surrounded by square brackets, like this: C<[aeiou]>. This matches one of C<a>, C<e>, C<i>, C<o> or C<u>. Like the other -character classes, exactly one character is matched. To match +character classes, exactly one character is matched.* To match a longer string consisting of characters mentioned in the character class, follow the character class with a L<quantifier|perlre/Quantifiers>. For instance, C<[aeiou]+> matches one or more lowercase English vowels. @@ -417,6 +417,19 @@ Examples: # a single character. "ae" =~ /^[aeiou]+$/ # Match, due to the quantifier. + ------- + +* There is an exception to a bracketed character class matching a only a +single character. When the class is to match caselessely under C</i> +matching rules, and a character inside the class matches a +multiple-character sequence caselessly under Unicode rules, the class +(when not L<inverted|/Negation>) will also match that sequence. For +example, Unicode says that the letter C<LATIN SMALL LETTER SHARP S> +should match the sequence C<ss> under C</i> rules. Thus, + + 'ss' =~ /\A\N{LATIN SMALL LETTER SHARP S}\z/i # Matches + 'ss' =~ /\A[aeioust\N{LATIN SMALL LETTER SHARP S}]\z/i # Matches + =head3 Special Characters Inside a Bracketed Character Class Most characters that are meta characters in regular expressions (that @@ -525,13 +538,26 @@ It is also possible to instead list the characters you do not want to match. You can do so by using a caret (C<^>) as the first character in the character class. For instance, C<[^a-z]> matches any character that is not a lowercase ASCII letter, which therefore includes almost a hundred thousand -Unicode letters. +Unicode letters. The class is said to be "negated" or "inverted". This syntax make the caret a special character inside a bracketed character class, but only if it is the first character of the class. So if you want the caret as one of the characters to match, either escape the caret or else not list it first. +In inverted bracketed character classes, Perl ignores the Unicode rules +that normally say that a given character matches a sequence of multiple +characters under caseless C</i> matching, which otherwise could be +highly confusing: + + "ss" =~ /^[^\xDF]+$/ui; + +This should match any sequences of characters that aren't C<\xDF> nor +what C<\xDF> matches under C</i>. C<"s"> isn't C<\xDF>, but Unicode +says that C<"ss"> is what C<\xDF> matches under C</i>. So which one +"wins"? Do you fail the match because the string has C<ss> or accept it +because it has an C<s> followed by another C<s>? + Examples: "e" =~ /[^aeiou]/ # No match, the 'e' is listed. @@ -765,7 +791,7 @@ C<\p{HorizSpace}> and \C<\p{XPosixBlank}>. For example, C<\p{PosixAlpha}> can be written as C<\p{Alpha}>. All are listed in L<perluniprops/Properties accessible through \p{} and \P{}>. -=head4 Negation +=head4 Negation of POSIX character classes X<character class, negation> A Perl extension to the POSIX character class is the ability to diff --git a/proto.h b/proto.h index a8c066a..c83fd12 100644 --- a/proto.h +++ b/proto.h @@ -43,6 +43,18 @@ PERL_CALLCONV HV* Perl__swash_to_invlist(pTHX_ SV* const swash) #define PERL_ARGS_ASSERT__SWASH_TO_INVLIST \ assert(swash) +PERL_CALLCONV UV Perl__to_uni_fold_flags(pTHX_ UV c, U8 *p, STRLEN *lenp, U8 flags) + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS \ + assert(p); assert(lenp) + +PERL_CALLCONV UV Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2); +#define PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS \ + assert(p); assert(ustrp) + PERL_CALLCONV PADOFFSET Perl_allocmy(pTHX_ const char *const name, const STRLEN len, const U32 flags) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_ALLOCMY \ @@ -4213,11 +4225,9 @@ PERL_CALLCONV OP * Perl_tied_method(pTHX_ const char *const methname, SV **sp, S assert(methname); assert(sp); assert(sv); assert(mg) PERL_CALLCONV void Perl_tmps_grow(pTHX_ I32 n); -PERL_CALLCONV UV Perl_to_uni_fold(pTHX_ UV c, U8 *p, STRLEN *lenp) +/* PERL_CALLCONV UV Perl_to_uni_fold(pTHX_ UV c, U8 *p, STRLEN *lenp) __attribute__nonnull__(pTHX_2) - __attribute__nonnull__(pTHX_3); -#define PERL_ARGS_ASSERT_TO_UNI_FOLD \ - assert(p); assert(lenp) + __attribute__nonnull__(pTHX_3); */ PERL_CALLCONV UV Perl_to_uni_lower(pTHX_ UV c, U8 *p, STRLEN *lenp) __attribute__nonnull__(pTHX_2) @@ -4253,16 +4263,13 @@ PERL_CALLCONV UV Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, S __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_4) - __attribute__nonnull__(pTHX_5) - __attribute__nonnull__(pTHX_6); + __attribute__nonnull__(pTHX_5); #define PERL_ARGS_ASSERT_TO_UTF8_CASE \ - assert(p); assert(ustrp); assert(swashp); assert(normal); assert(special) + assert(p); assert(ustrp); assert(swashp); assert(normal) -PERL_CALLCONV UV Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) +/* PERL_CALLCONV UV Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) __attribute__nonnull__(pTHX_1) - __attribute__nonnull__(pTHX_2); -#define PERL_ARGS_ASSERT_TO_UTF8_FOLD \ - assert(p); assert(ustrp) + __attribute__nonnull__(pTHX_2); */ PERL_CALLCONV UV Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) __attribute__nonnull__(pTHX_1) diff --git a/regcomp.c b/regcomp.c index 0858841..1094789 100644 --- a/regcomp.c +++ b/regcomp.c @@ -9552,6 +9552,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) IV namedclass; char *rangebegin = NULL; bool need_class = 0; + bool allow_full_fold = TRUE; /* Assume wants multi-char folding */ SV *listsv = NULL; STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ @@ -9608,6 +9609,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) RExC_parse++; if (!SIZE_ONLY) ANYOF_FLAGS(ret) |= ANYOF_INVERT; + + /* We have decided to not allow multi-char folds in inverted character + * classes, due to the confusion that can happen, even with classes + * that are designed for a non-Unicode world: You have the peculiar + * case that: + "s s" =~ /^[^\xDF]+$/i => Y + "ss" =~ /^[^\xDF]+$/i => N + * + * See [perl #89750] */ + allow_full_fold = FALSE; } if (SIZE_ONLY) { @@ -10136,7 +10147,8 @@ parseit: /* Get its fold */ U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; - const UV f = to_uni_fold(j, foldbuf, &foldlen); + const UV f = + _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold); if (foldlen > (STRLEN)UNISKIP(f)) { @@ -10437,10 +10449,18 @@ parseit: * used later (regexec.c:S_reginclass()). */ av_store(av, 0, listsv); av_store(av, 1, NULL); - av_store(av, 2, MUTABLE_SV(unicode_alternate)); - if (unicode_alternate) { /* This node is variable length */ - OP(ret) = ANYOFV; - } + + /* Store any computed multi-char folds only if we are allowing + * them */ + if (allow_full_fold) { + av_store(av, 2, MUTABLE_SV(unicode_alternate)); + if (unicode_alternate) { /* This node is variable length */ + OP(ret) = ANYOFV; + } + } + else { + av_store(av, 2, NULL); + } rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, 1, "s"); RExC_rxi->data->data[n] = (void*)rv; diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t index 82ca6ad..460d296 100644 --- a/t/re/fold_grind.t +++ b/t/re/fold_grind.t @@ -452,6 +452,8 @@ foreach my $test (sort { numerically } keys %tests) { foreach my $bracketed (0, 1) { # Put rhs in [...], or not foreach my $inverted (0,1) { next if $inverted && ! $bracketed; # inversion only valid in [^...] + next if $inverted && @target != 1; # [perl #89750] multi-char + # not valid in [^...] # In some cases, add an extra character that doesn't fold, and # looks ok in the output. diff --git a/t/re/re_tests b/t/re/re_tests index 9d5341b..35a7220 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1517,4 +1517,9 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer /s/aia S y $& S /(?aia:s)/ \x{17F} n - - /(?aia:s)/ S y $& S + +# Normally 1E9E generates a multi-char fold, but not in inverted class; +# See [perl #89750]. This makes sure that the simple fold gets generated +# in that case, to DF. +/[^\x{1E9E}]/i \x{DF} n - - # vim: softtabstop=0 noexpandtab diff --git a/utf8.c b/utf8.c index 9c2061d..11c2fa4 100644 --- a/utf8.c +++ b/utf8.c @@ -1341,12 +1341,12 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) } UV -Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp) +Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) { - PERL_ARGS_ASSERT_TO_UNI_FOLD; + PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; uvchr_to_utf8(p, c); - return to_utf8_fold(p, p, lenp); + return _to_utf8_fold_flags(p, p, lenp, flags); } /* for now these all assume no locale info available for Unicode > 255 */ @@ -1799,7 +1799,7 @@ of the result. The "swashp" is a pointer to the swash to use. -Both the special and normal mappings are stored lib/unicore/To/Foo.pl, +Both the special and normal mappings are stored in lib/unicore/To/Foo.pl, and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually, but not always, a multicharacter mapping), is tried first. @@ -2026,15 +2026,20 @@ The first character of the foldcased version is returned =cut */ +/* Not currently externally documented is 'flags', which currently is non-zero + * if full case folds are to be used; otherwise simple folds */ + UV -Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) +Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) { + const char *specials = (flags) ? "utf8::ToSpecFold" : NULL; + dVAR; - PERL_ARGS_ASSERT_TO_UTF8_FOLD; + PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS; return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, - &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold"); + &PL_utf8_tofold, "ToFold", specials); } /* Note: diff --git a/utf8.h b/utf8.h index a08ba04..c40fb58 100644 --- a/utf8.h +++ b/utf8.h @@ -16,6 +16,9 @@ # define USE_UTF8_IN_NAMES (PL_hints & HINT_UTF8) #endif +#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, 1) +#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, 1) + /* Source backward compatibility. */ #define uvuni_to_utf8(d, uv) uvuni_to_utf8_flags(d, uv, 0) #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0) -- Perl5 Master Repository