On Tue, 20 Sep 2005 15:51:34 +0530, Sastry <[EMAIL PROTECTED]> wrote
> Hi Sadahiro > > All the existing test suite passes. But there are couple of new tests > failing probably due to multibyte representation \x{1000} which is > represented in three byte sequence in EBCDIC . These two tests are > > $c = ($a = "\x89\x8a\x8b\x8c\x8d\x8f\x90\x91") =~ tr/\x{1000}\x89-\x91/X/; > is($c, 8); > is($a, "XXXXXXXX"); > > $c = ($a = "\xc9\xca\xcb\xcc\xcd\xcf\xd0\xd1") =~ tr/\x{1000}\xc9-\xd1/X/; > is($c, 8); > is($a, "XXXXXXXX"); > > The output is: > > not ok 1 > # Failed at t/op/tr_new.t line 32 > # got '6' > # expected '8' > not ok 2 > # Failed at t/op/tr_new.t line 33 > # got 'XXXðýXXX' > # expected 'XXXXXXXX' > not ok 3 > # Failed at t/op/tr_new.t line 36 > # got '4' > # expected '8' > not ok 4 > # Failed at t/op/tr_new.t line 37 > # got 'XXôöòõXX' > # expected 'XXXXXXXX' > > One observation is that since this unicode appears first in the tr// > as there seemed a problem in \x{100} case, Seems like it doesn't > handle the multibyte (>2) > > regards > Sastry This newer patch uses NATIVE_TO_ASCII(i) instead of NATIVE_TO_UTF(i). This is only thing which I found being wrong about the prev patch; but your result seems different from my expectation about how the output will be with NATIVE_TO_UTF(i) in the prev patch... If newer patch is still wrong, would you set DEBUG in lib/utf8_heavy.pl to be true (that is to replace the line 5 sub DEBUG () { 0 } to sub DEBUG () { 1 } and run it again? Then many verbose info will be out. Regards, SADAHIRO Tomoyuki diff -ur [EMAIL PROTECTED]/t/op/tr.t perl/t/op/tr.t --- [EMAIL PROTECTED]/t/op/tr.t Thu Aug 18 18:27:25 2005 +++ perl/t/op/tr.t Sun Sep 18 19:59:13 2005 @@ -6,7 +6,7 @@ require './test.pl'; } -plan tests => 100; +plan tests => 120; my $Is_EBCDIC = (ord('i') == 0x89 & ord('J') == 0xd1); @@ -259,7 +259,6 @@ # UTF8 range tests from Inaba Hiroto -# Not working in EBCDIC as of 12674. ($a = v300.196.172.302.197.172) =~ tr/\x{12c}-\x{130}/\xc0-\xc4/; is($a, v192.196.172.194.197.172, 'UTF range'); @@ -272,6 +271,15 @@ ($a = "\x{0100}") =~ tr/\x00-\x{100}/X/; is($a, "X"); +($a = "\x{0100}") =~ tr/\x00-\x{101}/X/; +is($a, "X"); + +($a = "\x{0100}\x{0101}") =~ tr/\x00-\x{102}/X/; +is($a, "XX"); + +($a = "\x{0101}\x{0102}") =~ tr/\x00-\x{103}/X/; +is($a, "XX"); + ($a = "\x{0100}") =~ tr/\x{0000}-\x{00ff}/X/c; is($a, "X"); @@ -303,8 +311,16 @@ is($c, 8); is($a, "XXXXXXXX"); +$c = ($a = "\x89\x8a\x8b\x8c\x8d\x8f\x90\x91") =~ tr/\x{1000}\x89-\x91/X/; +is($c, 8); +is($a, "XXXXXXXX"); + +$c = ($a = "\xc9\xca\xcb\xcc\xcd\xcf\xd0\xd1") =~ tr/\x{1000}\xc9-\xd1/X/; +is($c, 8); +is($a, "XXXXXXXX"); + SKIP: { - skip "not EBCDIC", 4 unless $Is_EBCDIC; + skip "not EBCDIC", 12 unless $Is_EBCDIC; $c = ($a = "\x89\x8a\x8b\x8c\x8d\x8f\x90\x91") =~ tr/i-j/X/; is($c, 2); @@ -313,7 +329,38 @@ $c = ($a = "\xc9\xca\xcb\xcc\xcd\xcf\xd0\xd1") =~ tr/I-J/X/; is($c, 2); is($a, "X\xca\xcb\xcc\xcd\xcf\xd0X"); + + $c = ($a = "\x89\x8a\x8b\x8c\x8d\x8f\x90\x91") =~ tr/\x{1000}i-j/X/; + is($c, 2); + is($a, "X\x8a\x8b\x8c\x8d\x8f\x90X"); + + $c = ($a = "\xc9\xca\xcb\xcc\xcd\xcf\xd0\xd1") =~ tr/\x{1000}I-J/X/; + is($c, 2); + is($a, "X\xca\xcb\xcc\xcd\xcf\xd0X"); + + $c = ($a = "\x89\x8a\x8b\x8c\x8d\x8f\x90\x91") =~ tr/i-j\x{1000}/X/; + is($c, 2); + is($a, "X\x8a\x8b\x8c\x8d\x8f\x90X"); + + $c = ($a = "\xc9\xca\xcb\xcc\xcd\xcf\xd0\xd1") =~ tr/I-J\x{1000}/X/; + is($c, 2); + is($a, "X\xca\xcb\xcc\xcd\xcf\xd0X"); } + +($a = "\xfc\xfd\xfe\xff") =~ tr/\x00-\xff/X/; +is($a, "XXXX"); + +($a = "\xfc\xfd\xfe\xff") =~ tr/\x{1000}\x00-\xff/X/; +is($a, "XXXX"); + +($a = "\xfc\xfd\xfe\xff\x{100}") =~ tr/\x{1000}\x00-\x{100}/X/; +is($a, "XXXXX"); + +($a = "\xfc\xfd\xfe\xff\x{100}") =~ tr/\x00-\x{200}/X/; +is($a, "XXXXX"); + +($a = "\xfc\xfd\xfe\xff\x{100}") =~ tr/\x{1000}\x00-\xff/X/c; +is($a, "\xfc\xfd\xfe\xffX"); ($a = "\x{100}") =~ tr/\x00-\xff/X/c; is(ord($a), ord("X")); diff -ur [EMAIL PROTECTED]/toke.c perl/toke.c --- [EMAIL PROTECTED]/toke.c Wed Sep 14 17:40:19 2005 +++ perl/toke.c Tue Sep 20 23:09:13 2005 @@ -1407,6 +1407,7 @@ UV uv; #ifdef EBCDIC UV literal_endpoint = 0; + bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode */ #endif const char *leaveit = /* set of acceptably-backslashed characters */ @@ -1429,8 +1430,14 @@ I32 i; /* current expanded character */ I32 min; /* first character in range */ I32 max; /* last character in range */ - - if (has_utf8) { +#ifdef EBCDIC + UV uvmax = 0; /* last character above byte */ +#endif + if (has_utf8 +#ifdef EBCDIC + && !native_range +#endif + ) { char * const c = (char*)utf8_hop((U8*)d, -1); char *e = d++; while (e-- > c) @@ -1443,12 +1450,45 @@ } i = d - SvPVX_const(sv); /* remember current offset */ +#ifdef EBCDIC + if (has_utf8) { + SvGROW(sv, SvLEN(sv) + 512 - UTF_CONTINUATION_MARK + + UNISKIP(0x100)); + /* how many two-byte within 0..255: 128 in UTF-8, 96 in UTF-8-mod */ + } + else { +#endif SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */ +#ifdef EBCDIC + } +#endif d = SvPVX(sv) + i; /* refresh d after realloc */ - d -= 2; /* eat the first char and the - */ +#ifdef EBCDIC + if (has_utf8) { + int j; + for (j = 0; j <= 1; j++) { + char * const c = (char*)utf8_hop((U8*)d, -1); + const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0); + if (j) + min = (U8)uv; + else if (uv < 256) + max = (U8)uv; + else { + max = (U8)0xff; /* only to \xff */ + uvmax = uv; /* \x{100} to uvmax */ + } + d = c; /* eat endpoint chars */ + } + } + else { +#endif + d -= 2; /* eat the first char and the last char */ min = (U8)*d; /* first char in range */ max = (U8)d[1]; /* last char in range */ +#ifdef EBCDIC + } +#endif if (min > max) { Perl_croak(aTHX_ @@ -1473,14 +1513,32 @@ else #endif for (i = min; i <= max; i++) +#ifdef EBCDIC + if (has_utf8) { + const U8 ch = (U8)NATIVE_TO_ASCII(i); + if (UNI_IS_INVARIANT(ch)) + *d++ = (U8)i; + else { + *d++ = (U8)UTF8_EIGHT_BIT_HI(ch); + *d++ = (U8)UTF8_EIGHT_BIT_LO(ch); + } + } + else +#endif *d++ = (char)i; +#ifdef EBCDIC + if (uvmax) { + d = (char*)uvchr_to_utf8((U8*)d, 0x100); + if (uvmax > 0x101) + *d++ = (char)UTF_TO_NATIVE(0xff); + if (uvmax > 0x100) + d = (char*)uvchr_to_utf8((U8*)d, uvmax); + } +#endif /* mark the range as done, and continue */ dorange = FALSE; didrange = TRUE; -#ifdef EBCDIC - literal_endpoint = 0; -#endif continue; } @@ -1489,7 +1547,11 @@ if (didrange) { Perl_croak(aTHX_ "Ambiguous range in transliteration operator"); } - if (has_utf8) { + if (has_utf8 +#ifdef EBCDIC + && !native_range +#endif + ) { *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */ s++; continue; @@ -1501,6 +1563,7 @@ didrange = FALSE; #ifdef EBCDIC literal_endpoint = 0; + native_range = TRUE; #endif } } @@ -1705,6 +1768,10 @@ (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); } +#ifdef EBCDIC + if (uv > 255 && !dorange) + native_range = FALSE; +#endif } else { *d++ = (char)uv; @@ -1782,6 +1849,10 @@ SvGROW(sv, (SvLEN(sv) + len - (e - s + 4))); d = SvPVX(sv) + (d - odest); } +#ifdef EBCDIC + if (!dorange) + native_range = FALSE; /* \N{} is guessed to be Unicode */ +#endif Copy(str, d, len, char); d += len; SvREFCNT_dec(res); @@ -1855,6 +1926,10 @@ } d = (char*)uvchr_to_utf8((U8*)d, uv); has_utf8 = TRUE; +#ifdef EBCDIC + if (uv > 255 && !dorange) + native_range = FALSE; +#endif } else { *d++ = NATIVE_TO_NEED(has_utf8,*s++); ###END OF PATCH