[perl.git] branch blead, updated. v5.25.8-64-g607313a197

Karl Williamson Fri, 23 Dec 2016 21:37:19 -0800

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/607313a19740cb756ab98d5e58e6040ea8c125d4?hp=71a0317b30064fe6979f8616f6d11553b35de971>


- Log -----------------------------------------------------------------
commit 607313a19740cb756ab98d5e58e6040ea8c125d4
Author: Karl Williamson <[email protected]>
Date:   Mon Dec 19 11:23:22 2016 -0700

    Deprecate toFOO_utf8()
    
    Now that there are _safe versions, deprecate the unsafe ones.

M       embed.fnc
M       embed.h
M       ext/XS-APItest/APItest.xs
M       ext/XS-APItest/t/handy.t
M       handy.h
M       pod/perldelta.pod
M       proto.h
M       utf8.c
M       utf8.h

commit a1a5ec35e6a3df0994b103aadb28a8c1a3a278da
Author: Karl Williamson <[email protected]>
Date:   Mon Dec 19 11:12:48 2016 -0700

    Convert core to use toFOO_utf8_safe()

M       handy.h
M       pp.c
M       regcomp.c
M       regexec.c
M       utf8.c

commit a239b1e291a3367448da17ea47c2f2aca5a07b69
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 18 18:05:46 2016 -0700

    Add toFOO_utf8_safe() macros

M       embed.fnc
M       embed.h
M       ext/XS-APItest/APItest.xs
M       ext/XS-APItest/t/handy.t
M       handy.h
M       pod/perldelta.pod
M       proto.h
M       utf8.c
M       utf8.h

commit 042d9e5039128be63013ec1b4e120e4b3cebc48c
Author: Karl Williamson <[email protected]>
Date:   Thu Dec 15 20:11:00 2016 -0700

    Convert some calls to test for malformations
    
    Code review showed several places in core where a UTF-8 sequence that
    was for a code point below 256 could be malformed, and be blindly
    accepted.  Convert these to use the similar macro that does the check.
    
    One place in regexec.c was not converted because it is working on the
    pattern, which perl should have generated itself, so very unlikely to be
    bemalformed.
    
    I didn't add tests for these, as it would be a pain to figure out
    somehow to trigger them, and this is precautionary, based on code
    reading rather than any known field experience.

M       locale.c
M       pp.c
M       regexec.c
M       utf8.c

commit fa8ab37438503dde8c6e781d0d56daf54a633f6b
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 18 16:33:08 2016 -0700

    Don't assume input to case change macros is valid
    
    Experience has shown that they can be invalid, and this commit now checks
    for that.  Further checking will be done in the next commit

M       pod/perldelta.pod
M       t/lib/warnings/utf8
M       utf8.c

commit 567b353c280f568f67de0e8d8b78d7abc7c931f7
Author: Karl Williamson <[email protected]>
Date:   Wed Dec 14 13:02:06 2016 -0700

    For character case changing, create macros and use
    
    This creates several macros that future commits will use to provide a
    layer between the caller and the function.

M       handy.h
M       pp.c
M       regcomp.c
M       regexec.c
M       utf8.c

commit d4f48b064914f271411fb517d698b2d59bbbb6e3
Author: Karl Williamson <[email protected]>
Date:   Wed Dec 14 13:00:45 2016 -0700

    regcomp.c, mathoms.c: Convert to use preferred macro
    
    Better to use the macro than to directly call the function it wraps

M       mathoms.c
M       regcomp.c
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc                 |  35 ++++++----
 embed.h                   |   9 +--
 ext/XS-APItest/APItest.xs |  72 +++++++++++++++++---
 ext/XS-APItest/t/handy.t  |  55 +++++++++++++++-
 handy.h                   | 108 ++++++++++++++++++++++++++----
 locale.c                  |   3 +-
 mathoms.c                 |   8 +--
 pod/perldelta.pod         |   6 +-
 pp.c                      |  20 +++---
 proto.h                   |  35 ++++++----
 regcomp.c                 |   8 ++-
 regexec.c                 |  22 ++++---
 t/lib/warnings/utf8       |   1 +
 utf8.c                    | 163 ++++++++++++++++++++++++++++++++++++++++------
 utf8.h                    |  13 ++--
 15 files changed, 455 insertions(+), 103 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 561ad9f564..7e1c3f26b0 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1729,6 +1729,13 @@ s        |void   |warn_on_first_deprecated_use           
                    \
                                |const bool use_locale                      \
                                |NN const char * const file                 \
                                |const unsigned line
+s      |U32    |check_and_deprecate                                        \
+                               |NN const U8 * p                            \
+                               |NN const U8 ** e                           \
+                               |const unsigned type                        \
+                               |const bool use_locale                      \
+                               |NN const char * const file                 \
+                               |const unsigned line
 s      |UV     |_to_utf8_case  |const UV uv1                                   
\
                                |NN const U8 *p                                 
\
                                |NN U8* ustrp                                   
\
@@ -1737,18 +1744,22 @@ s       |UV     |_to_utf8_case  |const UV uv1           
                        \
                                |NN const char *normal                          
\
                                |NULLOK const char *special
 #endif
-Apbmd  |UV     |to_utf8_lower  |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
-AMp    |UV     |_to_utf8_lower_flags   |NN const U8 *p|NN U8* ustrp  \
-                               |NULLOK STRLEN *lenp|bool flags
-Apbmd  |UV     |to_utf8_upper  |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
-AMp    |UV     |_to_utf8_upper_flags   |NN const U8 *p|NN U8* ustrp   \
-                               |NULLOK STRLEN *lenp|bool flags
-Apbmd  |UV     |to_utf8_title  |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
-AMp    |UV     |_to_utf8_title_flags   |NN const U8 *p|NN U8* ustrp   \
-                               |NULLOK STRLEN *lenp|bool flags
-Apbmd  |UV     |to_utf8_fold   |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
-AMp    |UV     |_to_utf8_fold_flags|NN const U8 *p|NN U8* ustrp       \
-                               |NULLOK STRLEN *lenp|U8 flags
+ApbmdD |UV     |to_utf8_lower  |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
+AMp    |UV     |_to_utf8_lower_flags|NN const U8 *p|NULLOK const U8* e         
\
+                               |NN U8* ustrp|NULLOK STRLEN *lenp|bool flags    
\
+                               |NN const char * const file|const int line
+ApbmdD |UV     |to_utf8_upper  |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
+AMp    |UV     |_to_utf8_upper_flags   |NN const U8 *p|NULLOK const U8 *e      
\
+                               |NN U8* ustrp|NULLOK STRLEN *lenp|bool flags    
\
+                               |NN const char * const file|const int line
+ApbmdD |UV     |to_utf8_title  |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
+AMp    |UV     |_to_utf8_title_flags   |NN const U8 *p|NULLOK const U8* e      
\
+                               |NN U8* ustrp|NULLOK STRLEN *lenp|bool flags    
\
+                               |NN const char * const file|const int line
+ApbmdD |UV     |to_utf8_fold   |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
+AMp    |UV     |_to_utf8_fold_flags|NN const U8 *p|NULLOK const U8 *e          
\
+                               |NN U8* ustrp|NULLOK STRLEN *lenp|U8 flags  \
+                               |NN const char * const file|const int line
 #if defined(PERL_IN_MG_C) || defined(PERL_IN_PP_C)
 pn     |bool   |translate_substr_offsets|STRLEN curlen|IV pos1_iv \
                                         |bool pos1_is_uv|IV len_iv \
diff --git a/embed.h b/embed.h
index 4687806c08..3b3eb8695d 100644
--- a/embed.h
+++ b/embed.h
@@ -42,10 +42,10 @@
 #define _is_utf8_xidcont(a)    Perl__is_utf8_xidcont(aTHX_ a)
 #define _is_utf8_xidstart(a)   Perl__is_utf8_xidstart(aTHX_ a)
 #define _to_uni_fold_flags(a,b,c,d)    Perl__to_uni_fold_flags(aTHX_ a,b,c,d)
-#define _to_utf8_fold_flags(a,b,c,d)   Perl__to_utf8_fold_flags(aTHX_ a,b,c,d)
-#define _to_utf8_lower_flags(a,b,c,d)  Perl__to_utf8_lower_flags(aTHX_ a,b,c,d)
-#define _to_utf8_title_flags(a,b,c,d)  Perl__to_utf8_title_flags(aTHX_ a,b,c,d)
-#define _to_utf8_upper_flags(a,b,c,d)  Perl__to_utf8_upper_flags(aTHX_ a,b,c,d)
+#define _to_utf8_fold_flags(a,b,c,d,e,f,g)     Perl__to_utf8_fold_flags(aTHX_ 
a,b,c,d,e,f,g)
+#define _to_utf8_lower_flags(a,b,c,d,e,f,g)    Perl__to_utf8_lower_flags(aTHX_ 
a,b,c,d,e,f,g)
+#define _to_utf8_title_flags(a,b,c,d,e,f,g)    Perl__to_utf8_title_flags(aTHX_ 
a,b,c,d,e,f,g)
+#define _to_utf8_upper_flags(a,b,c,d,e,f,g)    Perl__to_utf8_upper_flags(aTHX_ 
a,b,c,d,e,f,g)
 #define amagic_call(a,b,c,d)   Perl_amagic_call(aTHX_ a,b,c,d)
 #define amagic_deref_call(a,b) Perl_amagic_deref_call(aTHX_ a,b)
 #define apply_attrs_string(a,b,c,d)    Perl_apply_attrs_string(aTHX_ a,b,c,d)
@@ -1834,6 +1834,7 @@
 #  if defined(PERL_IN_UTF8_C)
 #define _byte_dump_string(a,b) S__byte_dump_string(aTHX_ a,b)
 #define _to_utf8_case(a,b,c,d,e,f,g)   S__to_utf8_case(aTHX_ a,b,c,d,e,f,g)
+#define check_and_deprecate(a,b,c,d,e,f)       S_check_and_deprecate(aTHX_ 
a,b,c,d,e,f)
 #define check_locale_boundary_crossing(a,b,c,d)        
S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
 #define does_utf8_overflow     S_does_utf8_overflow
 #define isFF_OVERLONG          S_isFF_OVERLONG
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index e9d28c8d49..39af336fb2 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -6197,17 +6197,31 @@ test_toLOWER_uvchr(UV ord)
         RETVAL
 
 AV *
-test_toLOWER_utf8(SV * p)
+test_toLOWER_utf8(SV * p, int type)
     PREINIT:
         U8 *input;
         U8 s[UTF8_MAXBYTES_CASE + 1];
         STRLEN len;
         AV *av;
         SV *utf8;
+       const unsigned char * e;
+        UV resultant_cp;
     CODE:
         input = (U8 *) SvPV(p, len);
         av = newAV();
-        av_push(av, newSVuv(toLOWER_utf8(input, s, &len)));
+        if (type >= 0) {
+            e = input + UTF8SKIP(input) - type;
+            resultant_cp = toLOWER_utf8_safe(input, e, s, &len);
+        }
+        else if (type == -1) {
+            resultant_cp = toLOWER_utf8(input, s, &len);
+        }
+#ifndef NO_MATHOMS
+        else {
+            resultant_cp = Perl_to_utf8_lower(aTHX_ input, s, &len);
+        }
+#endif
+        av_push(av, newSVuv(resultant_cp));
 
         utf8 = newSVpvn((char *) s, len);
         SvUTF8_on(utf8);
@@ -6273,17 +6287,31 @@ test_toFOLD_uvchr(UV ord)
         RETVAL
 
 AV *
-test_toFOLD_utf8(SV * p)
+test_toFOLD_utf8(SV * p, int type)
     PREINIT:
         U8 *input;
         U8 s[UTF8_MAXBYTES_CASE + 1];
         STRLEN len;
         AV *av;
         SV *utf8;
+       const unsigned char * e;
+        UV resultant_cp;
     CODE:
         input = (U8 *) SvPV(p, len);
         av = newAV();
-        av_push(av, newSVuv(toFOLD_utf8(input, s, &len)));
+        if (type >= 0) {
+            e = input + UTF8SKIP(input) - type;
+            resultant_cp = toFOLD_utf8_safe(input, e, s, &len);
+        }
+        else if (type == -1) {
+            resultant_cp = toFOLD_utf8(input, s, &len);
+        }
+#ifndef NO_MATHOMS
+        else {
+            resultant_cp = Perl_to_utf8_fold(aTHX_ input, s, &len);
+        }
+#endif
+        av_push(av, newSVuv(resultant_cp));
 
         utf8 = newSVpvn((char *) s, len);
         SvUTF8_on(utf8);
@@ -6349,17 +6377,31 @@ test_toUPPER_uvchr(UV ord)
         RETVAL
 
 AV *
-test_toUPPER_utf8(SV * p)
+test_toUPPER_utf8(SV * p, int type)
     PREINIT:
         U8 *input;
         U8 s[UTF8_MAXBYTES_CASE + 1];
         STRLEN len;
         AV *av;
         SV *utf8;
+       const unsigned char * e;
+        UV resultant_cp;
     CODE:
         input = (U8 *) SvPV(p, len);
         av = newAV();
-        av_push(av, newSVuv(toUPPER_utf8(input, s, &len)));
+        if (type >= 0) {
+            e = input + UTF8SKIP(input) - type;
+            resultant_cp = toUPPER_utf8_safe(input, e, s, &len);
+        }
+        else if (type == -1) {
+            resultant_cp = toUPPER_utf8(input, s, &len);
+        }
+#ifndef NO_MATHOMS
+        else {
+            resultant_cp = Perl_to_utf8_upper(aTHX_ input, s, &len);
+        }
+#endif
+        av_push(av, newSVuv(resultant_cp));
 
         utf8 = newSVpvn((char *) s, len);
         SvUTF8_on(utf8);
@@ -6418,17 +6460,31 @@ test_toTITLE_uvchr(UV ord)
         RETVAL
 
 AV *
-test_toTITLE_utf8(SV * p)
+test_toTITLE_utf8(SV * p, int type)
     PREINIT:
         U8 *input;
         U8 s[UTF8_MAXBYTES_CASE + 1];
         STRLEN len;
         AV *av;
         SV *utf8;
+       const unsigned char * e;
+        UV resultant_cp;
     CODE:
         input = (U8 *) SvPV(p, len);
         av = newAV();
-        av_push(av, newSVuv(toTITLE_utf8(input, s, &len)));
+        if (type >= 0) {
+            e = input + UTF8SKIP(input) - type;
+            resultant_cp = toTITLE_utf8_safe(input, e, s, &len);
+        }
+        else if (type == -1) {
+            resultant_cp = toTITLE_utf8(input, s, &len);
+        }
+#ifndef NO_MATHOMS
+        else {
+            resultant_cp = Perl_to_utf8_title(aTHX_ input, s, &len);
+        }
+#endif
+        av_push(av, newSVuv(resultant_cp));
 
         utf8 = newSVpvn((char *) s, len);
         SvUTF8_on(utf8);
diff --git a/ext/XS-APItest/t/handy.t b/ext/XS-APItest/t/handy.t
index 81e4c7c75b..8712524770 100644
--- a/ext/XS-APItest/t/handy.t
+++ b/ext/XS-APItest/t/handy.t
@@ -161,6 +161,7 @@ my %utf8_param_code = (
                         "_safe"                 =>  0,
                         "_safe, malformed"      =>  1,
                         "deprecated unsafe"     => -1,
+                        "deprecated mathoms"    => -2,
                       );
 
 foreach my $name (sort keys %properties, 'octal') {
@@ -534,10 +535,31 @@ foreach my $name (sort keys %to_properties) {
         my $char = chr($j);
         utf8::upgrade($char);
         $char = quotemeta $char if $char eq '\\' || $char eq "'";
+        foreach my $utf8_param("_safe",
+                                "_safe, malformed",
+                                "deprecated unsafe",
+                                "deprecated mathoms",
+                                )
         {
-            my $display_call = "to${function}_utf8($display_name )";
-            $ret = eval   "test_to${function}_utf8('$char')";
-            if (is ($@, "", "$display_call didn't give error")) {
+            use Config;
+            next if    $utf8_param eq 'deprecated mathoms'
+                    && $Config{'ccflags'} =~ /-DNO_MATHOMS/;
+
+            my $utf8_param_code = $utf8_param_code{$utf8_param};
+            my $expect_error = $utf8_param_code > 0;
+
+            # Skip if can't malform (because is a UTF-8 invariant)
+            next if $expect_error && $i < ((ord "A" == 65) ? 128 : 160);
+
+            my $display_call = "to${function}_utf8($display_name, $utf8_param 
)";
+            $ret = eval   "test_to${function}_utf8('$char', $utf8_param_code)";
+            if ($expect_error) {
+                isnt ($@, "", "expected and got error in $display_call");
+                like($@, qr/Malformed UTF-8 character/,
+                     "${tab}And got expected message");
+                undef @warnings;
+            }
+            elsif (is ($@, "", "$display_call didn't give error")) {
                 is ($ret->[0], $first_ord_should_be,
                     sprintf("${tab}And correctly returned 0x%02X",
                                                     $first_ord_should_be));
@@ -545,6 +567,33 @@ foreach my $name (sort keys %to_properties) {
                 use bytes;
                 is ($ret->[2], length $utf8_should_be,
                     "${tab}Got correct number of bytes for utf8 length");
+                if ($utf8_param_code < 0) {
+                    my $warnings_ok;
+                    if (! $seen{"${function}_utf8$utf8_param"}++) {
+                        $warnings_ok = is(@warnings, 1,
+                                                   "${tab}Got a single 
warning");
+                        if ($warnings_ok) {
+                            my $expected;
+                            if ($utf8_param_code == -2) {
+                                my $lc_func = lc $function;
+                                $expected
+                = qr/starting in Perl .* to_utf8_$lc_func\(\) will be removed/;
+                            }
+                            else {
+                                $expected
+                = qr/starting in Perl .* will require an additional parameter/;
+                            }
+                            $warnings_ok = like($warnings[0], $expected,
+                                      "${tab}Got expected deprecation 
warning");
+                        }
+                    }
+                    else {
+                        $warnings_ok = is(@warnings, 0,
+                                  "${tab}Deprecation warned only the one 
time");
+                    }
+                    $warnings_ok or diag("@warnings");
+                    undef @warnings;
+                }
             }
         }
     }
diff --git a/handy.h b/handy.h
index 0d94ff178f..330f812ae0 100644
--- a/handy.h
+++ b/handy.h
@@ -849,8 +849,9 @@ The first code point of the uppercased version is returned
 (but note, as explained at L<the top of this section|/Character case
 changing>, that there may be more.)
 
-=for apidoc Am|UV|toUPPER_utf8|U8* p|U8* s|STRLEN* lenp
-Converts the UTF-8 encoded character at C<p> to its uppercase version, and
+=for apidoc Am|UV|toUPPER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
+Converts the first UTF-8 encoded character in the sequence starting at C<p> and
+extending no further than S<C<e - 1>> to its uppercase version, and
 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>.  Note
 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
 bytes since the uppercase version may be longer than the original character.
@@ -859,7 +860,24 @@ The first code point of the uppercased version is returned
 (but note, as explained at L<the top of this section|/Character case
 changing>, that there may be more).
 
-The input character at C<p> is assumed to be well-formed.
+The suffix C<_safe> in the function's name indicates that it will not attempt
+to read beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is
+true (this is asserted for in C<-DDEBUGGING> builds).  If the UTF-8 for the
+input character is malformed in some way, the program may croak, or the
+function may return the REPLACEMENT CHARACTER, at the discretion of the
+implementation, and subject to change in future releases.
+
+=for apidoc Am|UV|toUPPER_utf8|U8* p|U8* s|STRLEN* lenp
+This is like C<L</toUPPER_utf8_safe>>, but doesn't have the C<e>
+parameter  The function therefore can't check if it is reading
+beyond the end of the string.  Starting in Perl v5.30, it will take the C<e>
+parameter, becoming a synonym for C<toUPPER_utf8_safe>.  At that time every
+program that uses it will have to be changed to successfully compile.  In the
+meantime, the first runtime call to C<toUPPER_utf8> from each call point in the
+program will raise a deprecation warning, enabled by default.  You can convert
+your program now to use C<toUPPER_utf8_safe>, and avoid the warnings, and get 
an
+extra measure of protection, or you can wait until v5.30, when you'll be forced
+to add the C<e> parameter.
 
 =for apidoc Am|U8|toFOLD|U8 ch
 Converts the specified character to foldcase.  If the input is anything but an
@@ -878,8 +896,9 @@ The first code point of the foldcased version is returned
 (but note, as explained at L<the top of this section|/Character case
 changing>, that there may be more).
 
-=for apidoc Am|UV|toFOLD_utf8|U8* p|U8* s|STRLEN* lenp
-Converts the UTF-8 encoded character at C<p> to its foldcase version, and
+=for apidoc Am|UV|toFOLD_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
+Converts the first UTF-8 encoded character in the sequence starting at C<p> and
+extending no further than S<C<e - 1>> to its foldcase version, and
 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>.  Note
 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
 bytes since the foldcase version may be longer than the original character.
@@ -888,7 +907,24 @@ The first code point of the foldcased version is returned
 (but note, as explained at L<the top of this section|/Character case
 changing>, that there may be more).
 
-The input character at C<p> is assumed to be well-formed.
+The suffix C<_safe> in the function's name indicates that it will not attempt
+to read beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is
+true (this is asserted for in C<-DDEBUGGING> builds).  If the UTF-8 for the
+input character is malformed in some way, the program may croak, or the
+function may return the REPLACEMENT CHARACTER, at the discretion of the
+implementation, and subject to change in future releases.
+
+=for apidoc Am|UV|toFOLD_utf8|U8* p|U8* s|STRLEN* lenp
+This is like C<L</toFOLD_utf8_safe>>, but doesn't have the C<e>
+parameter  The function therefore can't check if it is reading
+beyond the end of the string.  Starting in Perl v5.30, it will take the C<e>
+parameter, becoming a synonym for C<toFOLD_utf8_safe>.  At that time every
+program that uses it will have to be changed to successfully compile.  In the
+meantime, the first runtime call to C<toFOLD_utf8> from each call point in the
+program will raise a deprecation warning, enabled by default.  You can convert
+your program now to use C<toFOLD_utf8_safe>, and avoid the warnings, and get an
+extra measure of protection, or you can wait until v5.30, when you'll be forced
+to add the C<e> parameter.
 
 =for apidoc Am|U8|toLOWER|U8 ch
 Converts the specified character to lowercase.  If the input is anything but an
@@ -914,8 +950,10 @@ The first code point of the lowercased version is returned
 (but note, as explained at L<the top of this section|/Character case
 changing>, that there may be more).
 
-=for apidoc Am|UV|toLOWER_utf8|U8* p|U8* s|STRLEN* lenp
-Converts the UTF-8 encoded character at C<p> to its lowercase version, and
+
+=for apidoc Am|UV|toLOWER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
+Converts the first UTF-8 encoded character in the sequence starting at C<p> and
+extending no further than S<C<e - 1>> to its lowercase version, and
 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>.  Note
 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
 bytes since the lowercase version may be longer than the original character.
@@ -924,7 +962,24 @@ The first code point of the lowercased version is returned
 (but note, as explained at L<the top of this section|/Character case
 changing>, that there may be more).
 
-The input character at C<p> is assumed to be well-formed.
+The suffix C<_safe> in the function's name indicates that it will not attempt
+to read beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is
+true (this is asserted for in C<-DDEBUGGING> builds).  If the UTF-8 for the
+input character is malformed in some way, the program may croak, or the
+function may return the REPLACEMENT CHARACTER, at the discretion of the
+implementation, and subject to change in future releases.
+
+=for apidoc Am|UV|toLOWER_utf8|U8* p|U8* s|STRLEN* lenp
+This is like C<L</toLOWER_utf8_safe>>, but doesn't have the C<e>
+parameter  The function therefore can't check if it is reading
+beyond the end of the string.  Starting in Perl v5.30, it will take the C<e>
+parameter, becoming a synonym for C<toLOWER_utf8_safe>.  At that time every
+program that uses it will have to be changed to successfully compile.  In the
+meantime, the first runtime call to C<toLOWER_utf8> from each call point in the
+program will raise a deprecation warning, enabled by default.  You can convert
+your program now to use C<toLOWER_utf8_safe>, and avoid the warnings, and get 
an
+extra measure of protection, or you can wait until v5.30, when you'll be forced
+to add the C<e> parameter.
 
 =for apidoc Am|U8|toTITLE|U8 ch
 Converts the specified character to titlecase.  If the input is anything but an
@@ -944,8 +999,9 @@ The first code point of the titlecased version is returned
 (but note, as explained at L<the top of this section|/Character case
 changing>, that there may be more).
 
-=for apidoc Am|UV|toTITLE_utf8|U8* p|U8* s|STRLEN* lenp
-Converts the UTF-8 encoded character at C<p> to its titlecase version, and
+=for apidoc Am|UV|toTITLE_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp
+Converts the first UTF-8 encoded character in the sequence starting at C<p> and
+extending no further than S<C<e - 1>> to its titlecase version, and
 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>.  Note
 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1>
 bytes since the titlecase version may be longer than the original character.
@@ -954,7 +1010,24 @@ The first code point of the titlecased version is returned
 (but note, as explained at L<the top of this section|/Character case
 changing>, that there may be more).
 
-The input character at C<p> is assumed to be well-formed.
+The suffix C<_safe> in the function's name indicates that it will not attempt
+to read beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is
+true (this is asserted for in C<-DDEBUGGING> builds).  If the UTF-8 for the
+input character is malformed in some way, the program may croak, or the
+function may return the REPLACEMENT CHARACTER, at the discretion of the
+implementation, and subject to change in future releases.
+
+=for apidoc Am|UV|toTITLE_utf8|U8* p|U8* s|STRLEN* lenp
+This is like C<L</toLOWER_utf8_safe>>, but doesn't have the C<e>
+parameter  The function therefore can't check if it is reading
+beyond the end of the string.  Starting in Perl v5.30, it will take the C<e>
+parameter, becoming a synonym for C<toTITLE_utf8_safe>.  At that time every
+program that uses it will have to be changed to successfully compile.  In the
+meantime, the first runtime call to C<toTITLE_utf8> from each call point in the
+program will raise a deprecation warning, enabled by default.  You can convert
+your program now to use C<toTITLE_utf8_safe>, and avoid the warnings, and get 
an
+extra measure of protection, or you can wait until v5.30, when you'll be forced
+to add the C<e> parameter.
 
 =cut
 
@@ -1880,6 +1953,17 @@ _generic_utf8_safe(classnum, p, e, 
_is_utf8_FOO_with_len(classnum, p, e))
 #define toTITLE_utf8(p,s,l)    to_utf8_title(p,s,l)
 #define toUPPER_utf8(p,s,l)    to_utf8_upper(p,s,l)
 
+/* For internal core use only, subject to change */
+#define _toFOLD_utf8_flags(p,e,s,l,f)  _to_utf8_fold_flags (p,e,s,l,f, "", 0)
+#define _toLOWER_utf8_flags(p,e,s,l,f) _to_utf8_lower_flags(p,e,s,l,f, "", 0)
+#define _toTITLE_utf8_flags(p,e,s,l,f) _to_utf8_title_flags(p,e,s,l,f, "", 0)
+#define _toUPPER_utf8_flags(p,e,s,l,f) _to_utf8_upper_flags(p,e,s,l,f, "", 0)
+
+#define toFOLD_utf8_safe(p,e,s,l)   _toFOLD_utf8_flags(p,e,s,l, 
FOLD_FLAGS_FULL)
+#define toLOWER_utf8_safe(p,e,s,l)  _toLOWER_utf8_flags(p,e,s,l, 0)
+#define toTITLE_utf8_safe(p,e,s,l)  _toTITLE_utf8_flags(p,e,s,l, 0)
+#define toUPPER_utf8_safe(p,e,s,l)  _toUPPER_utf8_flags(p,e,s,l, 0)
+
 /* For internal core Perl use only: the base macros for defining macros like
  * isALPHA_LC_utf8.  These are like _generic_utf8, but if the first code point
  * in 'p' is within the 0-255 range, it uses locale rules from the passed-in
diff --git a/locale.c b/locale.c
index 07f599c032..b86077ffa4 100644
--- a/locale.c
+++ b/locale.c
@@ -1723,13 +1723,14 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                 {
                     STRLEN i;
                     STRLEN d= 0;
+                    char * e = (char *) t + len;
 
                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
                         U8 cur_char = t[i];
                         if (UTF8_IS_INVARIANT(cur_char)) {
                             s[d++] = cur_char;
                         }
-                        else if (UTF8_IS_DOWNGRADEABLE_START(cur_char)) {
+                        else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, 
t[i+1]);
                         }
                         else {  /* Replace illegal cp with highest collating
diff --git a/mathoms.c b/mathoms.c
index c74a38625a..92cd77a3c7 100644
--- a/mathoms.c
+++ b/mathoms.c
@@ -1145,7 +1145,7 @@ Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp)
 {
     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
 
-    return _to_utf8_fold_flags(p, ustrp, lenp, FOLD_FLAGS_FULL);
+    return toFOLD_utf8(p, ustrp, lenp);
 }
 
 UV
@@ -1153,7 +1153,7 @@ Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp)
 {
     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
 
-    return _to_utf8_lower_flags(p, ustrp, lenp, FALSE);
+    return toLOWER_utf8(p, ustrp, lenp);
 }
 
 UV
@@ -1161,7 +1161,7 @@ Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp)
 {
     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
 
-    return _to_utf8_title_flags(p, ustrp, lenp, FALSE);
+    return toTITLE_utf8(p, ustrp, lenp);
 }
 
 UV
@@ -1169,7 +1169,7 @@ Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp)
 {
     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
 
-    return _to_utf8_upper_flags(p, ustrp, lenp, FALSE);
+    return toUPPER_utf8(p, ustrp, lenp);
 }
 
 SV *
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 472d45bbda..f34e3e00c0 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -327,11 +327,12 @@ well.
 
 =item *
 
-New versions of macros like C<isALPHA_utf8> have been added, each with the
+New versions of macros like C<isALPHA_utf8> and C<toLOWER_utf8>  have
+been added, each with the
 suffix C<_safe>, like C<isSPACE_utf8_safe>.  These take an extra
 parameter, giving an upper limit of how far into the string it is safe
 to read.  Using the old versions could cause attempts to read beyond the
-end of the input buffer if the UTF-8 is not well-formed, and their use
+end of the input buffer if the UTF-8 is not well-formed, and ther use
 now raises a deprecation warning.  Details are at
 L<perlapi/Character classification>.
 
@@ -339,6 +340,7 @@ L<perlapi/Character classification>.
 
 Calling macros like C<isALPHA_utf8> on malformed UTF-8 have issued a
 deprecation warning since Perl v5.18.  They now die.
+Similarly, macros like C<toLOWER_utf8> on malformed UTF-8 now die.
 
 =item *
 
diff --git a/pp.c b/pp.c
index 6fb20f684e..300d786421 100644
--- a/pp.c
+++ b/pp.c
@@ -3790,16 +3790,16 @@ PP(pp_ucfirst)
         ulen = UTF8SKIP(s);
         if (op_type == OP_UCFIRST) {
 #ifdef USE_LOCALE_CTYPE
-           _to_utf8_title_flags(s, tmpbuf, &tculen, IN_LC_RUNTIME(LC_CTYPE));
+           _toTITLE_utf8_flags(s, s +slen, tmpbuf, &tculen, 
IN_LC_RUNTIME(LC_CTYPE));
 #else
-           _to_utf8_title_flags(s, tmpbuf, &tculen, 0);
+           _toTITLE_utf8_flags(s, s +slen, tmpbuf, &tculen, 0);
 #endif
        }
         else {
 #ifdef USE_LOCALE_CTYPE
-           _to_utf8_lower_flags(s, tmpbuf, &tculen, IN_LC_RUNTIME(LC_CTYPE));
+           _toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, 
IN_LC_RUNTIME(LC_CTYPE));
 #else
-           _to_utf8_lower_flags(s, tmpbuf, &tculen, 0);
+           _toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, 0);
 #endif
        }
 
@@ -4090,9 +4090,9 @@ PP(pp_uc)
 
             u = UTF8SKIP(s);
 #ifdef USE_LOCALE_CTYPE
-            uv = _to_utf8_upper_flags(s, tmpbuf, &ulen, 
IN_LC_RUNTIME(LC_CTYPE));
+            uv = _toUPPER_utf8_flags(s, send, tmpbuf, &ulen, 
IN_LC_RUNTIME(LC_CTYPE));
 #else
-            uv = _to_utf8_upper_flags(s, tmpbuf, &ulen, 0);
+            uv = _toUPPER_utf8_flags(s, send, tmpbuf, &ulen, 0);
 #endif
 #define GREEK_CAPITAL_LETTER_IOTA 0x0399
 #define COMBINING_GREEK_YPOGEGRAMMENI 0x0345
@@ -4306,9 +4306,9 @@ PP(pp_lc)
            STRLEN ulen;
 
 #ifdef USE_LOCALE_CTYPE
-           _to_utf8_lower_flags(s, tmpbuf, &ulen, IN_LC_RUNTIME(LC_CTYPE));
+           _toLOWER_utf8_flags(s, send, tmpbuf, &ulen, 
IN_LC_RUNTIME(LC_CTYPE));
 #else
-           _to_utf8_lower_flags(s, tmpbuf, &ulen, 0);
+           _toLOWER_utf8_flags(s, send, tmpbuf, &ulen, 0);
 #endif
 
            /* Here is where we would do context-sensitive actions.  See the
@@ -4404,7 +4404,7 @@ PP(pp_quotemeta)
                        to_quote = TRUE;
                    }
                }
-               else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+               else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, s + len)) {
                    if (
 #ifdef USE_LOCALE_CTYPE
                    /* In locale, we quote all non-ASCII Latin1 chars.
@@ -4516,7 +4516,7 @@ PP(pp_fc)
             const STRLEN u = UTF8SKIP(s);
             STRLEN ulen;
 
-            _to_utf8_fold_flags(s, tmpbuf, &ulen, flags);
+            _toFOLD_utf8_flags(s, send, tmpbuf, &ulen, flags);
 
             if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
                 const UV o = d - (U8*)SvPVX_const(dest);
diff --git a/proto.h b/proto.h
index 939e821b95..3c3a6ce189 100644
--- a/proto.h
+++ b/proto.h
@@ -102,18 +102,18 @@ PERL_CALLCONV bool        Perl__is_utf8_xidstart(pTHX_ 
const U8 *p)
 PERL_CALLCONV UV       Perl__to_uni_fold_flags(pTHX_ UV c, U8 *p, STRLEN 
*lenp, U8 flags);
 #define PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS    \
        assert(p); assert(lenp)
-PERL_CALLCONV UV       Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, 
STRLEN *lenp, U8 flags);
+PERL_CALLCONV UV       Perl__to_utf8_fold_flags(pTHX_ const U8 *p, const U8 
*e, U8* ustrp, STRLEN *lenp, U8 flags, const char * const file, const int line);
 #define PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS   \
-       assert(p); assert(ustrp)
-PERL_CALLCONV UV       Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, 
STRLEN *lenp, bool flags);
+       assert(p); assert(ustrp); assert(file)
+PERL_CALLCONV UV       Perl__to_utf8_lower_flags(pTHX_ const U8 *p, const U8* 
e, U8* ustrp, STRLEN *lenp, bool flags, const char * const file, const int 
line);
 #define PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS  \
-       assert(p); assert(ustrp)
-PERL_CALLCONV UV       Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, 
STRLEN *lenp, bool flags);
+       assert(p); assert(ustrp); assert(file)
+PERL_CALLCONV UV       Perl__to_utf8_title_flags(pTHX_ const U8 *p, const U8* 
e, U8* ustrp, STRLEN *lenp, bool flags, const char * const file, const int 
line);
 #define PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS  \
-       assert(p); assert(ustrp)
-PERL_CALLCONV UV       Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, 
STRLEN *lenp, bool flags);
+       assert(p); assert(ustrp); assert(file)
+PERL_CALLCONV UV       Perl__to_utf8_upper_flags(pTHX_ const U8 *p, const U8 
*e, U8* ustrp, STRLEN *lenp, bool flags, const char * const file, const int 
line);
 #define PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS  \
-       assert(p); assert(ustrp)
+       assert(p); assert(ustrp); assert(file)
 PERL_CALLCONV void     Perl__warn_problematic_locale(void);
 PERL_CALLCONV LOGOP*   Perl_alloc_LOGOP(pTHX_ I32 type, OP *first, OP *other);
 PERL_CALLCONV PADOFFSET        Perl_allocmy(pTHX_ const char *const name, 
const STRLEN len, const U32 flags);
@@ -3464,25 +3464,33 @@ PERL_CALLCONV UV        Perl_to_utf8_case(pTHX_ const 
U8 *p, U8* ustrp, STRLEN *lenp, S
        assert(p); assert(ustrp); assert(swashp); assert(normal)
 
 #ifndef NO_MATHOMS
-PERL_CALLCONV UV       Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp);
+PERL_CALLCONV UV       Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp)
+                       __attribute__deprecated__;
 #define PERL_ARGS_ASSERT_TO_UTF8_FOLD  \
        assert(p); assert(ustrp)
 #endif
+
 #ifndef NO_MATHOMS
-PERL_CALLCONV UV       Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp);
+PERL_CALLCONV UV       Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp)
+                       __attribute__deprecated__;
 #define PERL_ARGS_ASSERT_TO_UTF8_LOWER \
        assert(p); assert(ustrp)
 #endif
+
 #ifndef NO_MATHOMS
-PERL_CALLCONV UV       Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp);
+PERL_CALLCONV UV       Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp)
+                       __attribute__deprecated__;
 #define PERL_ARGS_ASSERT_TO_UTF8_TITLE \
        assert(p); assert(ustrp)
 #endif
+
 #ifndef NO_MATHOMS
-PERL_CALLCONV UV       Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp);
+PERL_CALLCONV UV       Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN 
*lenp)
+                       __attribute__deprecated__;
 #define PERL_ARGS_ASSERT_TO_UTF8_UPPER \
        assert(p); assert(ustrp)
 #endif
+
 PERL_CALLCONV bool     Perl_try_amagic_bin(pTHX_ int method, int flags);
 PERL_CALLCONV bool     Perl_try_amagic_un(pTHX_ int method, int flags);
 PERL_CALLCONV I32      Perl_unpack_str(pTHX_ const char *pat, const char 
*patend, const char *s, const char *strbeg, const char *strend, char **new_s, 
I32 ocnt, U32 flags);
@@ -5611,6 +5619,9 @@ STATIC char *     S__byte_dump_string(pTHX_ const U8 * s, 
const STRLEN len);
 STATIC UV      S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, 
STRLEN *lenp, SV **swashp, const char *normal, const char *special);
 #define PERL_ARGS_ASSERT__TO_UTF8_CASE \
        assert(p); assert(ustrp); assert(swashp); assert(normal)
+STATIC U32     S_check_and_deprecate(pTHX_ const U8 * p, const U8 ** e, const 
unsigned type, const bool use_locale, const char * const file, const unsigned 
line);
+#define PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE   \
+       assert(p); assert(e); assert(file)
 STATIC UV      S_check_locale_boundary_crossing(pTHX_ const U8* const p, const 
UV result, U8* const ustrp, STRLEN *lenp)
                        __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING        \
diff --git a/regcomp.c b/regcomp.c
index 7578a25dd0..953a94d45f 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3911,7 +3911,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode 
*scan,
                     }
                     else {
                         STRLEN len;
-                        _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
+                        _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
                         d += len;
                     }
                     s += s_len;
@@ -10047,7 +10047,9 @@ Perl__load_PL_utf8_foldclosures (pTHX)
         U8 dummy[UTF8_MAXBYTES_CASE+1];
 
         /* This string is just a short named one above \xff */
-        to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
+        toFOLD_utf8_safe((U8*) HYPHEN_UTF8,
+                         (U8 *) HYPHEN_UTF8 + sizeof(HYPHEN_UTF8),
+                         dummy, NULL);
         assert(PL_utf8_tofold); /* Verify that worked */
     }
     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
@@ -10198,7 +10200,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, 
regnode *node)
                 }
                 else {
                     STRLEN len;
-                    to_utf8_fold(s, d, &len);
+                    toFOLD_utf8_safe(s, e, d, &len);
                     d += len;
                     s += UTF8SKIP(s);
                 }
diff --git a/regexec.c b/regexec.c
index 340a49eb89..5c5241c57a 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1500,8 +1500,9 @@ STMT_START {
             uscan += len;                                                      
     \
             len=0;                                                             
     \
         } else {                                                               
     \
-            uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, 
flags);   \
             len = UTF8SKIP(uc);                                                
     \
+            uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, 
&foldlen,  \
+                                                                            
flags); \
             skiplen = UVCHR_SKIP( uvc );                                       
     \
             foldlen -= skiplen;                                                
     \
             uscan = foldbuf + skiplen;                                         
     \
@@ -2429,7 +2430,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
                     if ((UTF8_IS_INVARIANT(*s)
                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
                                                                 classnum)))
-                        || (UTF8_IS_DOWNGRADEABLE_START(*s)
+                        || (   UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
                             && to_complement ^ cBOOL(
                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
                                                                       *(s + 
1)),
@@ -4133,10 +4134,11 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const 
text_node, int *c1p,
                     }
                     else {
                         STRLEN len;
-                        _to_utf8_fold_flags(s,
-                                            d,
-                                            &len,
-                                            FOLD_FLAGS_FULL | 
FOLD_FLAGS_LOCALE);
+                        _toFOLD_utf8_flags(s,
+                                           pat_end,
+                                           d,
+                                           &len,
+                                           FOLD_FLAGS_FULL | 
FOLD_FLAGS_LOCALE);
                         d += len;
                         s += UTF8SKIP(s);
                     }
@@ -6373,8 +6375,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
                 break;
             }
 
-            if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 
code point */
-                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, 
reginfo->strend);
+            if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
+                /* An above Latin-1 code point, or malformed */
+                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
+                                                       reginfo->strend);
                 goto utf8_posix_above_latin1;
             }
 
@@ -6458,7 +6462,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
                 }
                 locinput++;
             }
-            else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
+            else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, 
reginfo->strend)) {
                 if (! (to_complement
                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
                                                                *(locinput + 
1)),
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8
index 3431b86b32..3f8265b9ea 100644
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -763,6 +763,7 @@ Use of code point 0x80+ is deprecated; the permissible max 
is 0x7F+ in regexp co
 Use of code point 0x80+ is deprecated; the permissible max is 0x7F+ at - line 
\d+.
 Use of code point 0x80+ is deprecated; the permissible max is 0x7F+ in regexp 
compilation at - line \d+.
 Operation "uc" returns its argument for non-Unicode code point 0x7F+ at - line 
\d+.
+Use of code point 0x80+ is deprecated; the permissible max is 0x7F+ in uc at - 
line \d+.
 Use of code point 0x80+ is deprecated; the permissible max is 0x7F+ at - line 
\d+.
 Operation "uc" returns its argument for non-Unicode code point 0x80+ at - line 
\d+.
 Code point 0x7F+ is not Unicode, may not be portable in print at - line \d+.
diff --git a/utf8.c b/utf8.c
index 5fca6f7248..dc4c5b856e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2428,13 +2428,13 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN 
*lenp, U8 flags)
        uvchr_to_utf8(p, c);
        return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
     }
-    else {  /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
+    else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
               the special flags. */
        U8 utf8_c[UTF8_MAXBYTES + 1];
 
       needs_full_generality:
        uvchr_to_utf8(utf8_c, c);
-       return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
+       return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c), p, lenp, 
flags);
     }
 }
 
@@ -2541,11 +2541,20 @@ S_warn_on_first_deprecated_use(pTHX_ const char * const 
name,
                Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
             }
 
+            if (strEQ(file, "mathoms.c")) {
+                Perl_warner(aTHX_ WARN_DEPRECATED,
+                            "In %s, line %d, starting in Perl v5.30, %s()"
+                            " will be removed.  Avoid this message by"
+                            " converting to use %s().\n",
+                            file, line, name, alternative);
+            }
+            else {
                 Perl_warner(aTHX_ WARN_DEPRECATED,
                             "In %s, line %d, starting in Perl v5.30, %s() will"
                             " require an additional parameter.  Avoid this"
                             " message by converting to use %s().\n",
                             file, line, name, alternative);
+            }
         }
     }
 }
@@ -2718,10 +2727,10 @@ Perl__is_utf8_mark(pTHX_ const U8 *p)
 /*
 =for apidoc to_utf8_case
 
-Instead use the appropriate one of L</toUPPER_utf8>,
-L</toTITLE_utf8>,
-L</toLOWER_utf8>,
-or L</toFOLD_utf8>.
+Instead use the appropriate one of L</toUPPER_utf8_safe>,
+L</toTITLE_utf8_safe>,
+L</toLOWER_utf8_safe>,
+or L</toFOLD_utf8_safe>.
 
 C<p> contains the pointer to the UTF-8 string encoding
 the character that is being converted.  This routine assumes that the character
@@ -2989,6 +2998,84 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const 
p, const UV result, U8* c
     return original;
 }
 
+STATIC U32
+S_check_and_deprecate(pTHX_ const U8 *p,
+                            const U8 **e,
+                            const unsigned int type,    /* See below */
+                            const bool use_locale,      /* Is this a 'LC_'
+                                                           macro call? */
+                            const char * const file,
+                            const unsigned line)
+{
+    /* This is a temporary function to deprecate the unsafe calls to the case
+     * changing macros and functions.  It keeps all the special stuff in just
+     * one place.
+     *
+     * It updates *e with the pointer to the end of the input string.  If using
+     * the old-style macros, *e is NULL on input, and so this function assumes
+     * the input string is long enough to hold the entire UTF-8 sequence, and
+     * sets *e accordingly, but it then returns a flag to pass the
+     * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
+     * using the full length if possible.
+     *
+     * It also does the assert that *e > p when *e is not NULL.  This should be
+     * migrated to the callers when this function gets deleted.
+     *
+     * The 'type' parameter is used for the caller to specify which case
+     * changing function this is called from: */
+
+#       define DEPRECATE_TO_UPPER 0
+#       define DEPRECATE_TO_TITLE 1
+#       define DEPRECATE_TO_LOWER 2
+#       define DEPRECATE_TO_FOLD  3
+
+    U32 utf8n_flags = 0;
+    const char * name;
+    const char * alternative;
+
+    PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
+
+    if (*e == NULL) {
+        utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
+        *e = p + UTF8SKIP(p);
+
+        /* For mathoms.c calls, we use the function name we know is stored
+         * there */
+        if (type == DEPRECATE_TO_UPPER) {
+            name = strEQ(file, "mathoms.c")
+                   ? "to_utf8_upper"
+                   : "toUPPER_utf8";
+            alternative = "toUPPER_utf8_safe";
+        }
+        else if (type == DEPRECATE_TO_TITLE) {
+            name = strEQ(file, "mathoms.c")
+                   ? "to_utf8_title"
+                   : "toTITLE_utf8";
+            alternative = "toTITLE_utf8_safe";
+        }
+        else if (type == DEPRECATE_TO_LOWER) {
+            name = strEQ(file, "mathoms.c")
+                   ? "to_utf8_lower"
+                   : "toLOWER_utf8";
+            alternative = "toLOWER_utf8_safe";
+        }
+        else if (type == DEPRECATE_TO_FOLD) {
+            name = strEQ(file, "mathoms.c")
+                   ? "to_utf8_fold"
+                   : "toFOLD_utf8";
+            alternative = "toFOLD_utf8_safe";
+        }
+        else Perl_croak(aTHX_ "panic: Unexpected case change type");
+
+        warn_on_first_deprecated_use(name, alternative, use_locale, file, 
line);
+    }
+    else {
+        assert (p < *e);
+    }
+
+    return utf8n_flags;
+}
+
 /* The process for changing the case is essentially the same for the four case
  * change types, except there are complications for folding.  Otherwise the
  * difference is only which case to change to.  To make sure that they all do
@@ -3019,6 +3106,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, 
const UV result, U8* c
  * going on. */
 #define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func,    \
                                L1_func_extra_param)                          \
+                                                                             \
     if (flags & (locale_flags)) {                                            \
         /* Treat a UTF-8 locale as not being in locale at all */             \
         if (IN_UTF8_CTYPE_LOCALE) {                                          \
@@ -3037,7 +3125,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, 
const UV result, U8* c
             return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
         }                                                                    \
     }                                                                        \
-    else if UTF8_IS_DOWNGRADEABLE_START(*p) {                                \
+    else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) {                          \
         if (flags & (locale_flags)) {                                        \
             result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p,         \
                                                                  *(p+1)));   \
@@ -3047,8 +3135,13 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const 
p, const UV result, U8* c
                            ustrp, lenp,  L1_func_extra_param);               \
         }                                                                    \
     }                                                                        \
-    else {  /* malformed UTF-8 */                                            \
-        result = valid_utf8_to_uvchr(p, NULL);                               \
+    else {  /* malformed UTF-8 or ord above 255 */                           \
+        STRLEN len_result;                                                   \
+        result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);     \
+        if (len_result == (STRLEN) -1) {                                     \
+            _force_out_malformed_utf8_message(p, e, utf8n_flags,             \
+                                                            1 /* Die */ );   \
+        }
 
 #define CASE_CHANGE_BODY_END(locale_flags, change_macro)                     \
         result = change_macro(result, p, ustrp, lenp);                       \
@@ -3075,7 +3168,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, 
const UV result, U8* c
 /*
 =for apidoc to_utf8_upper
 
-Instead use L</toUPPER_utf8>.
+Instead use L</toUPPER_utf8_safe>.
 
 =cut */
 
@@ -3084,9 +3177,17 @@ Instead use L</toUPPER_utf8>.
  *         be used. */
 
 UV
-Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool 
flags)
+Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
+                                const U8 *e,
+                                U8* ustrp,
+                                STRLEN *lenp,
+                                bool flags,
+                                const char * const file,
+                                const int line)
 {
     UV result;
+    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
+                                                cBOOL(flags), file, line);
 
     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
 
@@ -3099,7 +3200,7 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, 
STRLEN *lenp, bool flags
 /*
 =for apidoc to_utf8_title
 
-Instead use L</toTITLE_utf8>.
+Instead use L</toTITLE_utf8_safe>.
 
 =cut */
 
@@ -3110,9 +3211,17 @@ Instead use L</toTITLE_utf8>.
  */
 
 UV
-Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool 
flags)
+Perl__to_utf8_title_flags(pTHX_ const U8 *p,
+                                const U8 *e,
+                                U8* ustrp,
+                                STRLEN *lenp,
+                                bool flags,
+                                const char * const file,
+                                const int line)
 {
     UV result;
+    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
+                                                cBOOL(flags), file, line);
 
     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
 
@@ -3124,7 +3233,7 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, 
STRLEN *lenp, bool flags
 /*
 =for apidoc to_utf8_lower
 
-Instead use L</toLOWER_utf8>.
+Instead use L</toLOWER_utf8_safe>.
 
 =cut */
 
@@ -3134,9 +3243,17 @@ Instead use L</toLOWER_utf8>.
  */
 
 UV
-Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool 
flags)
+Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
+                                const U8 *e,
+                                U8* ustrp,
+                                STRLEN *lenp,
+                                bool flags,
+                                const char * const file,
+                                const int line)
 {
     UV result;
+    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
+                                                cBOOL(flags), file, line);
 
     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
 
@@ -3147,7 +3264,7 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, 
STRLEN *lenp, bool flags
 /*
 =for apidoc to_utf8_fold
 
-Instead use L</toFOLD_utf8>.
+Instead use L</toFOLD_utf8_safe>.
 
 =cut */
 
@@ -3162,9 +3279,17 @@ Instead use L</toFOLD_utf8>.
  */
 
 UV
-Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
+Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
+                               const U8 *e,
+                               U8* ustrp,
+                               STRLEN *lenp,
+                               U8 flags,
+                               const char * const file,
+                               const int line)
 {
     UV result;
+    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
+                                                cBOOL(flags), file, line);
 
     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
 
@@ -5151,7 +5276,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, 
UV l1, bool u1, const c
                     *foldbuf1 = toFOLD(*p1);
                 }
                 else if (u1) {
-                    _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder);
+                    _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, 
flags_for_folder);
                 }
                 else {  /* Not UTF-8, get UTF-8 fold */
                     _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
@@ -5175,7 +5300,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, 
UV l1, bool u1, const c
                     *foldbuf2 = toFOLD(*p2);
                 }
                 else if (u2) {
-                    _to_utf8_fold_flags(p2, foldbuf2, &n2, flags_for_folder);
+                    _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, 
flags_for_folder);
                 }
                 else {
                     _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
diff --git a/utf8.h b/utf8.h
index 571f5754a5..0fbe4b79d0 100644
--- a/utf8.h
+++ b/utf8.h
@@ -76,10 +76,15 @@ the string is invariant.
                                 utf8n_to_uvchr_error(s, len, lenp, flags, 0)
 
 #define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
-#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, 
FOLD_FLAGS_FULL)
-#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0)
-#define to_utf8_upper(a,b,c) _to_utf8_upper_flags(a,b,c,0)
-#define to_utf8_title(a,b,c) _to_utf8_title_flags(a,b,c,0)
+
+#define to_utf8_fold(s, r, lenr)                                               
 \
+    _to_utf8_fold_flags (s, NULL, r, lenr, FOLD_FLAGS_FULL, __FILE__, __LINE__)
+#define to_utf8_lower(s, r, lenr)                                              
 \
+                  _to_utf8_lower_flags(s, NULL, r ,lenr, 0, __FILE__, __LINE__)
+#define to_utf8_upper(s, r, lenr)                                              
 \
+                  _to_utf8_upper_flags(s, NULL, r, lenr, 0, __FILE__, __LINE__)
+#define to_utf8_title(s, r, lenr)                                              
 \
+                  _to_utf8_title_flags(s, NULL, r, lenr ,0, __FILE__, __LINE__)
 
 #define foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
                    foldEQ_utf8_flags(s1, pe1, l1, u1, s2, pe2, l2, u2, 0)

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.25.8-64-g607313a197

Reply via email to