In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/6cdc5cd8f36f88172b0fcefdcadec75f5b6600b2?hp=1ae6ead94905dfee43773cf3b18949c91b33f9d1>
- Log ----------------------------------------------------------------- commit 6cdc5cd8f36f88172b0fcefdcadec75f5b6600b2 Author: Karl Williamson <[email protected]> Date: Fri Dec 2 09:35:53 2016 -0700 PATCH: [perl #126310] single quote UTF-8 malformation detection This adds UTF-8 wellformedness checking in Perl_lex_next_chunk, which should get called for all program text, so this makes sure the entire program is well-formed, not just single- or double-quoted strings. M pod/perldelta.pod M t/lib/warnings/utf8 M toke.c commit 86ae6e94ac83e2ce56c4363d483fee82c38cb7f9 Author: Karl Williamson <[email protected]> Date: Wed Dec 7 21:08:38 2016 -0700 Die on malformed isFOO_utf8() input At the p5p core hackathon in November 2016, it was decided to make the previous deprecation message fatal for malformed input passed to the isFOO_utf8() macros and friends. M pod/perldelta.pod M pod/perldiag.pod M utf8.c commit 75219bacf5aacd315b96083de24e82cd8238e99a Author: Karl Williamson <[email protected]> Date: Fri Dec 9 08:45:18 2016 -0700 Use fnc to force out malformed warnings The previous commit added a function to do this task. This current commit changes the several places in the core that have here-to-fore done this in an ad-hoc (and not as reliable) manner to use the new function. A couple of messages in toke.c are left in so as to avoid changing diagnostics unnecessarily. If those messages had been created in the project after the enhanced malformation warnings were created, they would have been phrased differently. The reason some of the methods weren't so reliable, is they relied on fatalizing the warnng message. However if warnings are turned off, it never gets to the point of outputting, hence doesn't necessarily die. M regexec.c M t/op/lex.t M t/uni/parser.t M toke.c commit 9cbfb8abb5bb7ce49134acc57b93eb9ae475e339 Author: Karl Williamson <[email protected]> Date: Wed Dec 7 20:48:40 2016 -0700 Add fnc to force out UTF-8 malform warnings at death The bottom level UTF-8 decode routine now generates detailed messages when it encounters malformations. In some instances these should be treated as croak reasons and output even if warnings are off, just before dying. This commit adds a function to do this. M embed.fnc M embed.h M proto.h M utf8.c ----------------------------------------------------------------------- Summary of changes: embed.fnc | 3 +++ embed.h | 1 + pod/perldelta.pod | 6 +++-- pod/perldiag.pod | 11 -------- proto.h | 3 +++ regexec.c | 12 ++++++--- t/lib/warnings/utf8 | 22 +++++++++++----- t/op/lex.t | 7 ++--- t/uni/parser.t | 2 ++ toke.c | 73 +++++++++++++++++++++++++++-------------------------- utf8.c | 62 ++++++++++++++++++++++++++++++++++++++------- 11 files changed, 131 insertions(+), 71 deletions(-) diff --git a/embed.fnc b/embed.fnc index 49cf3f4941..4743524f17 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1754,6 +1754,9 @@ Ap |void |unsharepvn |NULLOK const char* sv|I32 len|U32 hash p |void |unshare_hek |NULLOK HEK* hek : Used in perly.y p |void |utilize |int aver|I32 floor|NULLOK OP* version|NN OP* idop|NULLOK OP* arg +ApM |void |_force_out_malformed_utf8_message \ + |NN const U8 *const p|NN const U8 * const e|const U32 flags \ + |const bool die_here Ap |U8* |utf16_to_utf8 |NN U8* p|NN U8 *d|I32 bytelen|NN I32 *newlen Ap |U8* |utf16_to_utf8_reversed|NN U8* p|NN U8 *d|I32 bytelen|NN I32 *newlen AdpPR |STRLEN |utf8_length |NN const U8* s|NN const U8 *e diff --git a/embed.h b/embed.h index 2ea48e3ff8..66fe0ccfc9 100644 --- a/embed.h +++ b/embed.h @@ -27,6 +27,7 @@ /* Hide global symbols */ #define Gv_AMupdate(a,b) Perl_Gv_AMupdate(aTHX_ a,b) +#define _force_out_malformed_utf8_message(a,b,c,d) Perl__force_out_malformed_utf8_message(aTHX_ a,b,c,d) #define _is_in_locale_category(a,b) Perl__is_in_locale_category(aTHX_ a,b) #define _is_uni_FOO(a,b) Perl__is_uni_FOO(aTHX_ a,b) #define _is_uni_perl_idcont(a) Perl__is_uni_perl_idcont(aTHX_ a) diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 99fb9c77c2..b6feb46b3e 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -327,7 +327,8 @@ well. =item * -XXX +Calling macros like C<isALPHA_utf8> on malformed UTF-8 have issued a +deprecation warning since Perl v5.18. They now die. =back @@ -342,7 +343,8 @@ files in F<ext/> and F<lib/> are best summarized in L</Modules and Pragmata>. =item * -XXX +Under C<use utf8>, the entire Perl program is now checked that the UTF-8 +is wellformed. This resolves [perl #126310]. =back diff --git a/pod/perldiag.pod b/pod/perldiag.pod index c0a717ccc1..13cf13d620 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -4671,17 +4671,6 @@ Remember that "my", "our", "local" and "state" bind tighter than comma. (F) Parsing code supplied by an extension violated the parser's API in a detectable way. -=item Passing malformed UTF-8 to "%s" is deprecated - -(D deprecated, utf8) This message indicates a bug either in the Perl -core or in XS code. Such code was trying to find out if a character, -allegedly stored internally encoded as UTF-8, was of a given type, such -as being punctuation or a digit. But the character was not encoded in -legal UTF-8. The C<%s> is replaced by a string that can be used by -knowledgeable people to determine what the type being checked against -was. If C<utf8> warnings are enabled, a further message is raised, -giving details of the malformation. - =item Pattern subroutine nesting without pos change exceeded limit in regex (F) You used a pattern that uses too many nested subpattern calls without diff --git a/proto.h b/proto.h index ecf6f71c11..c7065cd680 100644 --- a/proto.h +++ b/proto.h @@ -41,6 +41,9 @@ PERL_CALLCONV void* Perl_Slab_Alloc(pTHX_ size_t sz) PERL_CALLCONV void Perl_Slab_Free(pTHX_ void *op); #define PERL_ARGS_ASSERT_SLAB_FREE \ assert(op) +PERL_CALLCONV void Perl__force_out_malformed_utf8_message(pTHX_ const U8 *const p, const U8 * const e, const U32 flags, const bool die_here); +#define PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE \ + assert(p); assert(e) PERL_CALLCONV bool Perl__is_in_locale_category(pTHX_ const bool compiling, const int category); PERL_CALLCONV bool Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) __attribute__warn_unused_result__; diff --git a/regexec.c b/regexec.c index f6f293d56e..e9c74e6ea2 100644 --- a/regexec.c +++ b/regexec.c @@ -9225,10 +9225,14 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_IS_INVARIANT() works even if not in UTF-8 */ if (! UTF8_IS_INVARIANT(c) && utf8_target) { STRLEN c_len = 0; - c = utf8n_to_uvchr(p, p_end - p, &c_len, ( UTF8_ALLOW_DEFAULT - | UTF8_CHECK_ONLY)); - if (c_len == (STRLEN)-1) - Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + const U32 utf8n_flags = UTF8_ALLOW_DEFAULT; + c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY); + if (c_len == (STRLEN)-1) { + _force_out_malformed_utf8_message(p, p_end, + utf8n_flags, + 1 /* 1 means die */ ); + NOT_REACHED; /* NOTREACHED */ + } if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) { _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); } diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index dded118d32..3431b86b32 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -15,6 +15,7 @@ __END__ # utf8.c [utf8_to_uvchr_buf] -W +# NAME Malformed under 'use utf8' in double-quoted string BEGIN { if (ord('A') == 193) { print "SKIPPED\n# ebcdic platforms generates different Malformed UTF-8 warnings."; @@ -22,16 +23,25 @@ BEGIN { } } use utf8 ; +no warnings; # Malformed is a fatal error, so gets output anyway. my $a = "snøstorm" ; -{ - no warnings 'utf8' ; - my $a = "snøstorm"; - use warnings 'utf8' ; - my $a = "snøstorm"; +EXPECT +Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 10. +Malformed UTF-8 character (fatal) at - line 10. +######## +# NAME Malformed under 'use utf8' in single-quoted string +BEGIN { + if (ord('A') == 193) { + print "SKIPPED\n# ebcdic platforms generates different Malformed UTF-8 warnings."; + exit 0; + } } +use utf8 ; +no warnings; # Malformed is a fatal error, so gets output anyway. +my $a = 'snøstorm' ; EXPECT Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 9. -Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 14. +Malformed UTF-8 character (fatal) at - line 9. ######## use warnings 'utf8'; my $d7ff = uc(chr(0xD7FF)); diff --git a/t/op/lex.t b/t/op/lex.t index bd6bb0f06b..e50f0ebb65 100644 --- a/t/op/lex.t +++ b/t/op/lex.t @@ -254,8 +254,9 @@ SKIP: or skip "These tests won't work on EBCIDIC", 3; fresh_perl_is( "BEGIN{\$^H=hex ~0}\xF3", - "Integer overflow in hexadecimal number at - line 1.\n" . - "Malformed UTF-8 character: \\xf3 (too short; 1 byte available, need 4) at - line 1.", + "Integer overflow in hexadecimal number at - line 1.\n" + . "Malformed UTF-8 character: \\xf3 (too short; 1 byte available, need 4) at - line 1.\n" + . "Malformed UTF-8 character (fatal) at - line 1.", {}, '[perl #128996] - use of PL_op after op is freed' ); @@ -267,7 +268,7 @@ SKIP: ); fresh_perl_like( qq(BEGIN{\$^H=0x800000}\n 0m 0\xB5\xB500\xB5\0), - qr/Unrecognized character \\x\{0\}; marked by <-- HERE after 0m.*<-- HERE near column 12 at - line 2./, + qr/Malformed UTF-8 character: \\xb5 \(unexpected continuation byte 0xb5, with no preceding start byte\)/, {}, '[perl #129000] read before buffer' ); diff --git a/t/uni/parser.t b/t/uni/parser.t index 6c524b2e2f..624fdd04df 100644 --- a/t/uni/parser.t +++ b/t/uni/parser.t @@ -191,6 +191,8 @@ like( $@, qr/Bad name after Fï½ï½'/, 'Bad name after Fï½ï½\'' ); { no warnings 'utf8'; + local $SIG{__WARN__} = sub { }; # The eval will also output a warning, + # which we ignore my $malformed_to_be = ($::IS_EBCDIC) # Overlong sequence ? "\x{74}\x{41}" : "\x{c0}\x{a0}"; diff --git a/toke.c b/toke.c index f0a7dbc3a0..9273425de6 100644 --- a/toke.c +++ b/toke.c @@ -1032,13 +1032,11 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) { p++; highhalf++; - } else if (! UTF8_IS_INVARIANT(c)) { - /* malformed UTF-8 */ - ENTER; - SAVESPTR(PL_warnhook); - PL_warnhook = PERL_WARNHOOK_FATAL; - utf8n_to_uvchr((U8*)p, e-p, NULL, 0); - LEAVE; + } else if (! UTF8_IS_INVARIANT(c)) { + _force_out_malformed_utf8_message((U8 *) p, (U8 *) e, + 0, + 1 /* 1 means die */ ); + NOT_REACHED; /* NOTREACHED */ } } if (!highhalf) @@ -1288,6 +1286,8 @@ Perl_lex_next_chunk(pTHX_ U32 flags) STRLEN linestart_pos, last_uni_pos, last_lop_pos; bool got_some_for_debugger = 0; bool got_some; + const U8* first_bad_char_loc; + if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM)) Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk"); if (!(flags & LEX_NO_TERM) && PL_lex_inwhat) @@ -1352,6 +1352,19 @@ Perl_lex_next_chunk(pTHX_ U32 flags) new_bufend_pos = SvCUR(linestr); PL_parser->bufend = buf + new_bufend_pos; PL_parser->bufptr = buf + bufptr_pos; + + if (UTF && ! is_utf8_string_loc((U8 *) PL_parser->bufptr, + PL_parser->bufend - PL_parser->bufptr, + &first_bad_char_loc)) + { + + _force_out_malformed_utf8_message(first_bad_char_loc, + (U8 *) PL_parser->bufend, + 0, + 1 /* 1 means die */ ); + NOT_REACHED; /* NOTREACHED */ + } + PL_parser->oldbufptr = buf + oldbufptr_pos; PL_parser->oldoldbufptr = buf + oldoldbufptr_pos; PL_parser->linestart = buf + linestart_pos; @@ -1428,12 +1441,11 @@ Perl_lex_peek_unichar(pTHX_ U32 flags) } unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY); if (retlen == (STRLEN)-1) { - /* malformed UTF-8 */ - ENTER; - SAVESPTR(PL_warnhook); - PL_warnhook = PERL_WARNHOOK_FATAL; - utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0); - LEAVE; + _force_out_malformed_utf8_message((U8 *) s, + (U8 *) bufend, + 0, + 1 /* 1 means die */ ); + NOT_REACHED; /* NOTREACHED */ } return unichar; } else { @@ -2554,15 +2566,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) e - backslash_ptr, &first_bad_char_loc)) { - /* If warnings are on, this will print a more detailed analysis of what - * is wrong than the error message below */ - utf8n_to_uvchr(first_bad_char_loc, - e - ((char *) first_bad_char_loc), - NULL, 0); - - /* We deliberately don't try to print the malformed character, which - * might not print very well; it also may be just the first of many - * malformations, so don't print what comes after it */ + _force_out_malformed_utf8_message(first_bad_char_loc, + (U8 *) PL_parser->bufend, + 0, + 0 /* 0 means don't die */ ); yyerror_pv(Perl_form(aTHX_ "Malformed UTF-8 character immediately after '%.*s'", (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr), @@ -2695,15 +2702,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) STRLEN len; const char* const str = SvPV_const(res, len); if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) { - /* If warnings are on, this will print a more detailed analysis of - * what is wrong than the error message below */ - utf8n_to_uvchr(first_bad_char_loc, - (char *) first_bad_char_loc - str, - NULL, 0); - - /* We deliberately don't try to print the malformed character, - * which might not print very well; it also may be just the first - * of many malformations, so don't print what comes after it */ + _force_out_malformed_utf8_message(first_bad_char_loc, + (U8 *) PL_parser->bufend, + 0, + 0 /* 0 means don't die */ ); yyerror_pv( Perl_form(aTHX_ "Malformed UTF-8 returned by %.*s immediately after '%.*s'", @@ -4902,11 +4904,10 @@ Perl_yylex(pTHX) default: if (UTF) { if (! isUTF8_CHAR((U8 *) s, (U8 *) PL_bufend)) { - ENTER; - SAVESPTR(PL_warnhook); - PL_warnhook = PERL_WARNHOOK_FATAL; - utf8n_to_uvchr((U8*)s, PL_bufend-s, NULL, 0); - LEAVE; + _force_out_malformed_utf8_message((U8 *) s, (U8 *) PL_bufend, + 0, + 1 /* 1 means die */ ); + NOT_REACHED; /* NOTREACHED */ } if (isIDFIRST_utf8((U8*)s)) { goto keylookup; diff --git a/utf8.c b/utf8.c index 9fe9b03ed8..6b2c12856d 100644 --- a/utf8.c +++ b/utf8.c @@ -52,6 +52,54 @@ within non-zero characters. =cut */ +void +Perl__force_out_malformed_utf8_message(pTHX_ + const U8 *const p, /* First byte in UTF-8 sequence */ + const U8 * const e, /* Final byte in sequence (may include + multiple chars */ + const U32 flags, /* Flags to pass to utf8n_to_uvchr(), + usually 0, or some DISALLOW flags */ + const bool die_here) /* If TRUE, this function does not return */ +{ + /* This core-only function is to be called when a malformed UTF-8 character + * is found, in order to output the detailed information about the + * malformation before dieing. The reason it exists is for the occasions + * when such a malformation is fatal, but warnings might be turned off, so + * that normally they would not be actually output. This ensures that they + * do get output. Because a sequence may be malformed in more than one + * way, multiple messages may be generated, so we can't make them fatal, as + * that would cause the first one to die. + * + * Instead we pretend -W was passed to perl, then die afterwards. The + * flexibility is here to return to the caller so they can finish up and + * die themselves */ + U32 errors; + + PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE; + + ENTER; + SAVESPTR(PL_dowarn); + SAVESPTR(PL_curcop); + + PL_dowarn = G_WARN_ALL_ON|G_WARN_ON; + if (PL_curcop) { + PL_curcop->cop_warnings = pWARN_ALL; + } + + (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors); + + LEAVE; + + if (! errors) { + Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should" + " be called only when there are errors found"); + } + + if (die_here) { + Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + } +} + /* =for apidoc uvoffuni_to_utf8_flags @@ -2377,16 +2425,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, * character without reading beyond the end, and pass that number on to the * validating routine */ if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) { - if (ckWARN_d(WARN_UTF8)) { - Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8), - "Passing malformed UTF-8 to \"%s\" is deprecated", swashname); - if (ckWARN(WARN_UTF8)) { /* This will output details as to the - what the malformation is */ - utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL); - } - } - return FALSE; + _force_out_malformed_utf8_message(p, p + UTF8SKIP(p), + 0, + 1 /* Die */ ); + NOT_REACHED; /* NOTREACHED */ } + if (!*swash) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; *swash = _core_swash_init("utf8", -- Perl5 Master Repository
