In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/b0b342d4b3fa61880a96b645a3dab648f7c4c682?hp=c8d6633c09cc20070ed5f0a632bf0717748dd6a2>

- Log -----------------------------------------------------------------
commit b0b342d4b3fa61880a96b645a3dab648f7c4c682
Author: Karl Williamson <[email protected]>
Date:   Tue Nov 1 22:12:51 2016 -0600

    Fix wrong UTF-8 overflow error on 32-bit platforms
    
    Commit 2b5e7bc2e60b4c4b5d87aa66e066363d9dce7930 changed the algorithm
    for detecting overflow during decoding UTF-8 into code points.  However,
    on 32-bit platforms, this change caused it to claim some things overflow
    that really don't.  ALl such are overlong malformations, which are
    normally forbidden, but not necessarily.  This commit fixes that.

M       embed.fnc
M       embed.h
M       ext/XS-APItest/t/utf8.t
M       proto.h
M       utf8.c

commit 22123136705b458f89846c8e559433731724adf7
Author: Karl Williamson <[email protected]>
Date:   Tue Nov 1 22:13:21 2016 -0600

    APItest/t/utf8.t: Correct to uppercase in print
    
    This worked so long as we didn't have hex digits A-F.

M       ext/XS-APItest/t/utf8.t
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc               |  1 +
 embed.h                 |  1 +
 ext/XS-APItest/t/utf8.t | 26 ++++++++++++++++++++++++--
 proto.h                 |  6 ++++++
 utf8.c                  | 44 ++++++++++++++++++++++++++------------------
 5 files changed, 58 insertions(+), 20 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 5cc73b7..d6312dc 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1683,6 +1683,7 @@ ApdD      |UV     |to_utf8_case   |NN const U8 *p         
                        \
 #if defined(PERL_IN_UTF8_C)
 inRP   |bool   |does_utf8_overflow|NN const U8 * const s|NN const U8 * e
 inRP   |bool   |is_utf8_overlong_given_start_byte_ok|NN const U8 * const 
s|const STRLEN len
+inRP   |bool   |isFF_OVERLONG  |NN const U8 * const s|const STRLEN len
 sMR    |char * |unexpected_non_continuation_text                       \
                |NN const U8 * const s                                  \
                |STRLEN print_len                                       \
diff --git a/embed.h b/embed.h
index 1af2917..b8ee773 100644
--- a/embed.h
+++ b/embed.h
@@ -1832,6 +1832,7 @@
 #define _to_utf8_case(a,b,c,d,e,f,g)   S__to_utf8_case(aTHX_ a,b,c,d,e,f,g)
 #define check_locale_boundary_crossing(a,b,c,d)        
S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
 #define does_utf8_overflow     S_does_utf8_overflow
+#define isFF_OVERLONG          S_isFF_OVERLONG
 #define is_utf8_common(a,b,c,d)        S_is_utf8_common(aTHX_ a,b,c,d)
 #define is_utf8_cp_above_31_bits       S_is_utf8_cp_above_31_bits
 #define is_utf8_overlong_given_start_byte_ok   
S_is_utf8_overlong_given_start_byte_ok
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index fc04dfc..121c6ef 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -1420,7 +1420,7 @@ sub nonportable_regex ($) {
 
     my $code_point = shift;
 
-    my $string = sprintf '(Code point 0x%x is not Unicode, and'
+    my $string = sprintf '(Code point 0x%X is not Unicode, and'
                        . '|Any UTF-8 sequence that starts with'
                        . ' "(\\\x[[:xdigit:]]{2})+" is for a'
                        . ' non-Unicode code point, and is) not portable',
@@ -1731,6 +1731,15 @@ my @tests = (
         'utf8', 0x80000000, (isASCII) ? 7 : $max_bytes,
         nonportable_regex(0x80000000)
     ],
+    [ "highest 32 bit code point",
+        (isASCII)
+         ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
+         : 
I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
+        $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
+        $UTF8_GOT_ABOVE_31_BIT,
+        'utf8', 0xFFFFFFFF, (isASCII) ? 7 : $max_bytes,
+        nonportable_regex(0xffffffff)
+    ],
     [ "requires at least 32 bits, and use SUPER-type flags, instead of 
ABOVE_31_BIT",
         (isASCII)
          ? "\xfe\x82\x80\x80\x80\x80\x80"
@@ -1764,7 +1773,20 @@ my @tests = (
     ],
 );
 
-if ($is64bit) {
+if (! $is64bit) {
+    if (isASCII) {
+        no warnings qw{portable overflow};
+        push @tests,
+            [ "Lowest 33 bit code point: overflow",
+                "\xFE\x84\x80\x80\x80\x80\x80",
+                $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
+                'utf8', 0x100000000, 7,
+                qr/and( is)? not portable/
+            ];
+    }
+}
+else {
     no warnings qw{portable overflow};
     push @tests,
         [ "More than 32 bits",
diff --git a/proto.h b/proto.h
index 1d79c46..6708016 100644
--- a/proto.h
+++ b/proto.h
@@ -5605,6 +5605,12 @@ PERL_STATIC_INLINE bool  S_does_utf8_overflow(const U8 * 
const s, const U8 * e)
 #define PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW    \
        assert(s); assert(e)
 
+PERL_STATIC_INLINE bool        S_isFF_OVERLONG(const U8 * const s, const 
STRLEN len)
+                       __attribute__warn_unused_result__
+                       __attribute__pure__;
+#define PERL_ARGS_ASSERT_ISFF_OVERLONG \
+       assert(s)
+
 PERL_STATIC_INLINE bool        S_is_utf8_common(pTHX_ const U8 *const p, SV 
**swash, const char * const swashname, SV* const invlist)
                        __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_IS_UTF8_COMMON        \
diff --git a/utf8.c b/utf8.c
index 80bafad..8b301b2 100644
--- a/utf8.c
+++ b/utf8.c
@@ -442,7 +442,20 @@ S_does_utf8_overflow(const U8 * const s, const U8 * e)
      * that could result in a non-overflowing code point */
 
     PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
-    assert(s + UTF8SKIP(s) >= e);
+    assert(s <= e && s + UTF8SKIP(s) >= e);
+
+#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
+
+    /* On 32 bit ASCII machines, many overlongs that start with FF don't
+     * overflow */
+
+    if (isFF_OVERLONG(s, e - s)) {
+        const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
+        return memGE(s, max_32_bit_overlong,
+                                    MIN(e - s, sizeof(max_32_bit_overlong)));
+    }
+
+#endif
 
     for (x = s; x < e; x++, y++) {
 
@@ -521,27 +534,22 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const 
s, const STRLEN len)
         return TRUE;
     }
 
-#   if defined(UV_IS_QUAD) || defined(EBCDIC)
+    /* Check for the FF overlong */
+    return isFF_OVERLONG(s, len);
+}
+
+PERL_STATIC_INLINE bool
+S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
+{
+    PERL_ARGS_ASSERT_ISFF_OVERLONG;
 
     /* Check for the FF overlong.  This happens only if all these bytes match;
      * what comes after them doesn't matter.  See tables in utf8.h,
-     * utfebcdic.h.  (Can't happen on ASCII 32-bit platforms, as overflows
-     * instead.) */
-
-    if (   len >= sizeof(FF_OVERLONG_PREFIX) - 1
-        && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
-                                            sizeof(FF_OVERLONG_PREFIX) - 1)))
-    {
-        return TRUE;
-    }
-
-#else
-
-    PERL_UNUSED_ARG(len);
-
-#endif
+     * utfebcdic.h. */
 
-    return FALSE;
+    return    len >= sizeof(FF_OVERLONG_PREFIX) - 1
+           && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
+                                            sizeof(FF_OVERLONG_PREFIX) - 1));
 }
 
 #undef F0_ABOVE_OVERLONG

--
Perl5 Master Repository

Reply via email to