)on EBCDIC platform

SADAHIRO Tomoyuki Tue, 20 Sep 2005 08:13:01 -0700

On Tue, 20 Sep 2005 15:51:34 +0530, Sastry <[EMAIL PROTECTED]> wrote


> Hi Sadahiro
> 
> All the existing test suite passes. But there are couple of new tests
> failing probably due to multibyte representation \x{1000} which is
> represented in three byte sequence in EBCDIC . These two tests are
> 
> $c = ($a = "\x89\x8a\x8b\x8c\x8d\x8f\x90\x91") =~ tr/\x{1000}\x89-\x91/X/;
> is($c, 8);
> is($a, "XXXXXXXX");
> 
> $c = ($a = "\xc9\xca\xcb\xcc\xcd\xcf\xd0\xd1") =~ tr/\x{1000}\xc9-\xd1/X/;
> is($c, 8);
> is($a, "XXXXXXXX");
> 
> The output is:
> 
> not ok 1
> # Failed at t/op/tr_new.t line 32
> #      got '6'
> # expected '8'
> not ok 2
> # Failed at t/op/tr_new.t line 33
> #      got 'XXXðýXXX'
> # expected 'XXXXXXXX'
> not ok 3
> # Failed at t/op/tr_new.t line 36
> #      got '4'
> # expected '8'
> not ok 4
> # Failed at t/op/tr_new.t line 37
> #      got 'XXôöòõXX'
> # expected 'XXXXXXXX'
> 
> One observation is that since this unicode appears first in the tr//
> as there seemed a problem in \x{100} case, Seems like it doesn't
> handle the  multibyte (>2)
> 
> regards
> Sastry

This newer patch uses NATIVE_TO_ASCII(i) instead of
NATIVE_TO_UTF(i). This is only thing which I found being wrong
about the prev patch; but your result seems different from my
expectation about how the output will be with NATIVE_TO_UTF(i)
in the prev patch...

If newer patch is still wrong, would you set DEBUG
in lib/utf8_heavy.pl to be true (that is to replace the line 5

sub DEBUG () { 0 }

to

sub DEBUG () { 1 }

and run it again? Then many verbose info will be out.

Regards,
SADAHIRO Tomoyuki

diff -ur [EMAIL PROTECTED]/t/op/tr.t perl/t/op/tr.t
--- [EMAIL PROTECTED]/t/op/tr.t Thu Aug 18 18:27:25 2005
+++ perl/t/op/tr.t      Sun Sep 18 19:59:13 2005
@@ -6,7 +6,7 @@
     require './test.pl';
 }
 
-plan tests => 100;
+plan tests => 120;
 
 my $Is_EBCDIC = (ord('i') == 0x89 & ord('J') == 0xd1);
 
@@ -259,7 +259,6 @@
 
 # UTF8 range tests from Inaba Hiroto
 
-# Not working in EBCDIC as of 12674.
 ($a = v300.196.172.302.197.172) =~ tr/\x{12c}-\x{130}/\xc0-\xc4/;
 is($a, v192.196.172.194.197.172,    'UTF range');
 
@@ -272,6 +271,15 @@
 ($a = "\x{0100}") =~ tr/\x00-\x{100}/X/;
 is($a, "X");
 
+($a = "\x{0100}") =~ tr/\x00-\x{101}/X/;
+is($a, "X");
+
+($a = "\x{0100}\x{0101}") =~ tr/\x00-\x{102}/X/;
+is($a, "XX");
+
+($a = "\x{0101}\x{0102}") =~ tr/\x00-\x{103}/X/;
+is($a, "XX");
+
 ($a = "\x{0100}") =~ tr/\x{0000}-\x{00ff}/X/c;
 is($a, "X");
 
@@ -303,8 +311,16 @@
 is($c, 8);
 is($a, "XXXXXXXX");
 
+$c = ($a = "\x89\x8a\x8b\x8c\x8d\x8f\x90\x91") =~ tr/\x{1000}\x89-\x91/X/;
+is($c, 8);
+is($a, "XXXXXXXX");
+
+$c = ($a = "\xc9\xca\xcb\xcc\xcd\xcf\xd0\xd1") =~ tr/\x{1000}\xc9-\xd1/X/;
+is($c, 8);
+is($a, "XXXXXXXX");
+
 SKIP: {
-    skip "not EBCDIC", 4 unless $Is_EBCDIC;
+    skip "not EBCDIC", 12 unless $Is_EBCDIC;
 
     $c = ($a = "\x89\x8a\x8b\x8c\x8d\x8f\x90\x91") =~ tr/i-j/X/;
     is($c, 2);
@@ -313,7 +329,38 @@
     $c = ($a = "\xc9\xca\xcb\xcc\xcd\xcf\xd0\xd1") =~ tr/I-J/X/;
     is($c, 2);
     is($a, "X\xca\xcb\xcc\xcd\xcf\xd0X");
+
+    $c = ($a = "\x89\x8a\x8b\x8c\x8d\x8f\x90\x91") =~ tr/\x{1000}i-j/X/;
+    is($c, 2);
+    is($a, "X\x8a\x8b\x8c\x8d\x8f\x90X");
+
+    $c = ($a = "\xc9\xca\xcb\xcc\xcd\xcf\xd0\xd1") =~ tr/\x{1000}I-J/X/;
+    is($c, 2);
+    is($a, "X\xca\xcb\xcc\xcd\xcf\xd0X");
+
+    $c = ($a = "\x89\x8a\x8b\x8c\x8d\x8f\x90\x91") =~ tr/i-j\x{1000}/X/;
+    is($c, 2);
+    is($a, "X\x8a\x8b\x8c\x8d\x8f\x90X");
+
+    $c = ($a = "\xc9\xca\xcb\xcc\xcd\xcf\xd0\xd1") =~ tr/I-J\x{1000}/X/;
+    is($c, 2);
+    is($a, "X\xca\xcb\xcc\xcd\xcf\xd0X");
 }
+
+($a = "\xfc\xfd\xfe\xff") =~ tr/\x00-\xff/X/;
+is($a, "XXXX");
+
+($a = "\xfc\xfd\xfe\xff") =~ tr/\x{1000}\x00-\xff/X/;
+is($a, "XXXX");
+
+($a = "\xfc\xfd\xfe\xff\x{100}") =~ tr/\x{1000}\x00-\x{100}/X/;
+is($a, "XXXXX");
+
+($a = "\xfc\xfd\xfe\xff\x{100}") =~ tr/\x00-\x{200}/X/;
+is($a, "XXXXX");
+
+($a = "\xfc\xfd\xfe\xff\x{100}") =~ tr/\x{1000}\x00-\xff/X/c;
+is($a, "\xfc\xfd\xfe\xffX");
 
 ($a = "\x{100}") =~ tr/\x00-\xff/X/c;
 is(ord($a), ord("X"));
diff -ur [EMAIL PROTECTED]/toke.c perl/toke.c
--- [EMAIL PROTECTED]/toke.c    Wed Sep 14 17:40:19 2005
+++ perl/toke.c Tue Sep 20 23:09:13 2005
@@ -1407,6 +1407,7 @@
     UV uv;
 #ifdef EBCDIC
     UV literal_endpoint = 0;
+    bool native_range = TRUE; /* turned to FALSE if the first endpoint is 
Unicode */
 #endif
 
     const char *leaveit =      /* set of acceptably-backslashed characters */
@@ -1429,8 +1430,14 @@
                I32 i;                          /* current expanded character */
                I32 min;                        /* first character in range */
                I32 max;                        /* last character in range */
-
-               if (has_utf8) {
+#ifdef EBCDIC
+               UV  uvmax = 0;                  /* last character above byte */
+#endif
+               if (has_utf8
+#ifdef EBCDIC
+                   && !native_range
+#endif
+               ) {
                    char * const c = (char*)utf8_hop((U8*)d, -1);
                    char *e = d++;
                    while (e-- > c)
@@ -1443,12 +1450,45 @@
                }
 
                i = d - SvPVX_const(sv);                /* remember current 
offset */
+#ifdef EBCDIC
+               if (has_utf8) {
+                   SvGROW(sv, SvLEN(sv) + 512 - UTF_CONTINUATION_MARK
+                               + UNISKIP(0x100));
+               /* how many two-byte within 0..255: 128 in UTF-8, 96 in 
UTF-8-mod */
+               }
+               else {
+#endif
                SvGROW(sv, SvLEN(sv) + 256);    /* never more than 256 chars in 
a range */
+#ifdef EBCDIC
+               }
+#endif
                d = SvPVX(sv) + i;              /* refresh d after realloc */
-               d -= 2;                         /* eat the first char and the - 
*/
 
+#ifdef EBCDIC
+               if (has_utf8) {
+                   int j;
+                   for (j = 0; j <= 1; j++) {
+                       char * const c = (char*)utf8_hop((U8*)d, -1);
+                       const UV uv    = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
+                       if (j)
+                           min = (U8)uv;
+                       else if (uv < 256)
+                           max = (U8)uv;
+                       else {
+                           max = (U8)0xff; /* only to \xff */
+                           uvmax = uv; /* \x{100} to uvmax */
+                       }
+                       d = c; /* eat endpoint chars */
+                    }
+               }
+               else {
+#endif
+               d -= 2;         /* eat the first char and the last char */
                min = (U8)*d;                   /* first char in range */
                max = (U8)d[1];                 /* last char in range  */
+#ifdef EBCDIC
+               }
+#endif
 
                 if (min > max) {
                    Perl_croak(aTHX_
@@ -1473,14 +1513,32 @@
                else
 #endif
                    for (i = min; i <= max; i++)
+#ifdef EBCDIC
+                       if (has_utf8) {
+                           const U8 ch = (U8)NATIVE_TO_ASCII(i);
+                           if (UNI_IS_INVARIANT(ch))
+                               *d++ = (U8)i;
+                           else {
+                               *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
+                               *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
+                           }
+                       }
+                       else
+#endif
                        *d++ = (char)i;
 
+#ifdef EBCDIC
+               if (uvmax) {
+                   d = (char*)uvchr_to_utf8((U8*)d, 0x100);
+                   if (uvmax > 0x101)
+                       *d++ = (char)UTF_TO_NATIVE(0xff);
+                   if (uvmax > 0x100)
+                       d = (char*)uvchr_to_utf8((U8*)d, uvmax);
+               }
+#endif
                /* mark the range as done, and continue */
                dorange = FALSE;
                didrange = TRUE;
-#ifdef EBCDIC
-               literal_endpoint = 0;
-#endif
                continue;
            }
 
@@ -1489,7 +1547,11 @@
                if (didrange) {
                    Perl_croak(aTHX_ "Ambiguous range in transliteration 
operator");
                }
-               if (has_utf8) {
+               if (has_utf8
+#ifdef EBCDIC
+                       && !native_range
+#endif
+               ) {
                    *d++ = (char)UTF_TO_NATIVE(0xff);   /* use illegal utf8 
byte--see pmtrans */
                    s++;
                    continue;
@@ -1501,6 +1563,7 @@
                didrange = FALSE;
 #ifdef EBCDIC
                literal_endpoint = 0;
+               native_range = TRUE;
 #endif
            }
        }
@@ -1705,6 +1768,10 @@
                                (PL_lex_repl ? OPpTRANS_FROM_UTF
                                             : OPpTRANS_TO_UTF);
                        }
+#ifdef EBCDIC
+                       if (uv > 255 && !dorange)
+                           native_range = FALSE;
+#endif
                     }
                    else {
                        *d++ = (char)uv;
@@ -1782,6 +1849,10 @@
                        SvGROW(sv, (SvLEN(sv) + len - (e - s + 4)));
                        d = SvPVX(sv) + (d - odest);
                    }
+#ifdef EBCDIC
+                   if (!dorange)
+                       native_range = FALSE; /* \N{} is guessed to be Unicode 
*/
+#endif
                    Copy(str, d, len, char);
                    d += len;
                    SvREFCNT_dec(res);
@@ -1855,6 +1926,10 @@
            }
            d = (char*)uvchr_to_utf8((U8*)d, uv);
            has_utf8 = TRUE;
+#ifdef EBCDIC
+           if (uv > 255 && !dorange)
+               native_range = FALSE;
+#endif
        }
        else {
            *d++ = NATIVE_TO_NEED(has_utf8,*s++);
###END OF PATCH

Re: Transliteration operator(tr//)on EBCDIC platform

Reply via email to