In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/867a901b7ec194e3895eb595338d0d0ea4fc783f?hp=5654fe3cb7747930ab73ec2386527c2d0007459f>
- Log ----------------------------------------------------------------- commit 867a901b7ec194e3895eb595338d0d0ea4fc783f Author: Karl Williamson <[email protected]> Date: Thu Sep 29 21:07:19 2016 -0600 APItest/t/utf8.t: Add some constants Though unused now, they were convenient during some debugging, and may be again. M ext/XS-APItest/t/utf8.t commit 57b7eb534d50ce7d2c2ce15e7d1783578c55be79 Author: Karl Williamson <[email protected]> Date: Thu Sep 29 21:05:49 2016 -0600 APItest/t/utf8.t: Fix 3-byte overlong test Unlike ASCII platforms, one can tell on EBCDIC ones if an otherwise valid 3-byte sequence is overlong on the first byte. M ext/XS-APItest/t/utf8.t commit 418080dc73a4b9e525a76d6d3b5034ff616716b4 Author: Karl Williamson <[email protected]> Date: Thu Sep 29 21:03:30 2016 -0600 APItest/t/utf8.t: Fix EBCDIC test Unlike on ASCII platforms, it may take more than one byte of a partial character to determine if it represents a code point that needs 32 or more bits to represent. This fixes the test to account for that. M ext/XS-APItest/t/utf8.t commit f880f78a9370d267502630934d0d3033b34389a1 Author: Karl Williamson <[email protected]> Date: Thu Sep 29 21:01:36 2016 -0600 utf8.c: Add missing type specifier to declaration This code was missing a STRLEN specifier; only compiled on EBCDIC. M utf8.c commit c0d8738ea8562747c49e0e96160c92ba55a7873e Author: Karl Williamson <[email protected]> Date: Thu Sep 29 11:51:41 2016 -0600 APItest/t/utf8.t: Skip some tests if major one fails If the patched test fails, the subsequent ones in the loop are meaningless, so don't execute them. M ext/XS-APItest/t/utf8.t commit a9e5eeaac5003e6ffbb826b05a079a9b3c1cd563 Author: Karl Williamson <[email protected]> Date: Thu Sep 29 11:50:51 2016 -0600 APItest/t/utf8.t: Fix typo This was a typo in the UTF-EBCDIC for a code point, so affected only tests on that platform M ext/XS-APItest/t/utf8.t commit 7ae54b83ee5a2592c85f68253d73dcfc43161de2 Author: Karl Williamson <[email protected]> Date: Wed Sep 28 20:42:30 2016 -0600 utf8n_to_uvchr() Fix EBCDIC bug with overlongs The comment removed in this commit was wrong, and so was the code it described. On EBCDIC platforms, there are malformations that need to be converted from Unicode to native. When I wrote that I wasn't thinking about overlongs, which can evaluate to any code point. The new tests in d566bd20c27a46aecd668d2f739b9515f46ac74f caught this. M utf8.c ----------------------------------------------------------------------- Summary of changes: ext/XS-APItest/t/utf8.t | 34 +++++++++++++++++++++++++++++++--- utf8.c | 6 ++---- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index fd3c903..2061232 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -46,10 +46,19 @@ my @i8_to_native = ( # Only code page 1047 so far. 0xDC,0xDD,0xDE,0xDF,0xE1,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xFA,0xFB,0xFC,0xFD,0xFE, ); +my @native_to_i8; +for (my $i = 0; $i < 256; $i++) { + $native_to_i8[$i8_to_native[$i]] = $i; +} + *I8_to_native = (isASCII) ? sub { return shift } : sub { return join "", map { chr $i8_to_native[ord $_] } split "", shift }; +*native_to_I8 = (isASCII) + ? sub { return shift } + : sub { return join "", map { chr $native_to_i8[ord $_] } + split "", shift }; my $is64bit = length sprintf("%x", ~0) > 8; @@ -74,6 +83,10 @@ my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE; my $UTF8_DISALLOW_ILLEGAL_INTERCHANGE = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR; +my $UTF8_WARN_ILLEGAL_C9_INTERCHANGE + = $UTF8_WARN_SUPER|$UTF8_WARN_SURROGATE; +my $UTF8_WARN_ILLEGAL_INTERCHANGE + = $UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$UTF8_WARN_NONCHAR; # Test uvchr_to_utf8(). my $UNICODE_WARN_SURROGATE = 0x0001; @@ -189,7 +202,7 @@ my %code_points = ( 0xFDEB => (isASCII) ? "\xef\xb7\xab" : I8_to_native("\xf1\xbf\xaf\xab"), 0xFDEC => (isASCII) ? "\xef\xb7\xac" : I8_to_native("\xf1\xbf\xaf\xac"), 0xFDED => (isASCII) ? "\xef\xb7\xad" : I8_to_native("\xf1\xbf\xaf\xad"), - 0xFDEE => (isASCII) ? "\xef\xb7\xae" : I8_to_native("\xf1\xbf\xae\xae"), + 0xFDEE => (isASCII) ? "\xef\xb7\xae" : I8_to_native("\xf1\xbf\xaf\xae"), 0xFDEF => (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), 0xFDF0 => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"), @@ -523,8 +536,12 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } my $display_flags = sprintf "0x%x", $this_utf8_flags; my $display_bytes = display_bytes($bytes); my $ret_ref = test_utf8n_to_uvchr($bytes, $len, $this_utf8_flags); - is($ret_ref->[0], $n, "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n"); - is($ret_ref->[1], $len, "Verify utf8n_to_uvchr() for $hex_n returns expected length: $len"); + + # Rest of tests likely meaningless if it gets the wrong code point. + next unless is($ret_ref->[0], $n, + "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n"); + is($ret_ref->[1], $len, + "Verify utf8n_to_uvchr() for $hex_n returns expected length: $len"); unless (is(scalar @warnings, 0, "Verify utf8n_to_uvchr() for $hex_n generated no warnings")) @@ -1221,6 +1238,9 @@ foreach my $test (@malformations) { $ret_should_be = 1; $comment = ", but need 2 bytes to discern:"; } + elsif ($testname =~ /overlong/ && ! isASCII && $length == 3) { + # 3-byte overlongs on EBCDIC are determinable on the first byte + } elsif ($testname =~ /overlong/ && $length > 2) { if ($length <= 7 && $j < 2) { $ret_should_be = 1; @@ -1802,6 +1822,14 @@ foreach my $test (@tests) { $ret_should_be = 1; $comment .= ", but need 2 bytes to discern"; } + elsif ( ! isASCII + && $testname =~ /requires at least 32 bits/) + { + # On EBCDIC, the boundary between 31 and 32 bits is + # more complicated. + $ret_should_be = 1 if native_to_I8($partial) le + "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF"; + } } undef @warnings; diff --git a/utf8.c b/utf8.c index 7f8df9d..ec550c3 100644 --- a/utf8.c +++ b/utf8.c @@ -386,7 +386,7 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) const U8 * const prefix = "\x41\x41\x41\x41\x41\x41\x42"; const STRLEN prefix_len = sizeof(prefix) - 1; const STRLEN len = e - s; - const cmp_len = MIN(prefix_len, len - 1); + const STRLEN cmp_len = MIN(prefix_len, len - 1); #else @@ -1046,9 +1046,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) } if (sv) { - outlier_ret = uv; /* Note we don't bother to convert to native, - as all the outlier code points are the same - in both ASCII and EBCDIC */ + outlier_ret = UNI_TO_NATIVE(uv); goto do_warn; } -- Perl5 Master Repository
