# New Ticket Created by  Leopold Toetsch 
# Please include the string:  [perl #28494]
# in the subject line of all future correspondence about this issue. 
# <URL: http://rt.perl.org:80/rt3/Ticket/Display.html?id=28494 >


Attached patch:
* adds a new test file for Unicode-related string tests
* reimplements string_unescape_cstring, which now uses ICU to do the work
* fixes a bug in string_compare with strings of equal length

It's also far more efficient than the old code.

TODO: move it out of string.c; write the docs.

Jeff, please have a look at it.

leo
--- parrot/MANIFEST     Mon Apr 12 15:43:05 2004
+++ parrot-leo/MANIFEST Mon Apr 12 18:41:07 2004
@@ -2596,6 +2596,7 @@
 t/op/rx.t                                         []
 t/op/stacks.t                                     []
 t/op/string.t                                     []
+t/op/stringu.t                                    []
 t/op/time.t                                       []
 t/op/trans.t                                      []
 t/op/types.t                                      []
--- /dev/null   Fri Feb 28 14:27:28 2003
+++ parrot-leo/t/op/stringu.t   Mon Apr 12 18:40:40 2004
@@ -0,0 +1,57 @@
+#! perl -w
+# Copyright: 2001-2004 The Perl Foundation.  All Rights Reserved.
+# $Id$
+
+=head1 NAME
+
+t/op/stringu.t - Unicode String Test
+
+=head1 SYNOPSIS
+
+       % perl -Ilib t/op/stringu.t
+
+=head1 DESCRIPTION
+
+Tests Parrot's Unicode string system.
+
+=cut
+#'
+
+use Parrot::Test tests => 4;
+use Test::More;
+
+output_is( <<'CODE', <<OUTPUT, "angstrom" );
+    chr S0, 0x212B
+    print S0
+    print "\n"
+    end
+CODE
+\xe2\x84\xab
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, " escaped angstrom" );
+    set S0, "\x{212b}"
+    print S0
+    print "\n"
+    end
+CODE
+\xe2\x84\xab
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, " escaped angstrom 2" );
+    set S0, "aaaaaa\x{212b}"
+    print S0
+    print "\n"
+    end
+CODE
+aaaaaa\xe2\x84\xab
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, " escaped angstrom 3" );
+    set S0, "aaaaaa\x{212b}-aaaaaa"
+    print S0
+    print "\n"
+    end
+CODE
+aaaaaa\xe2\x84\xab-aaaaaa
+OUTPUT
--- parrot/src/string.c Sun Apr 11 15:16:48 2004
+++ parrot-leo/src/string.c     Mon Apr 12 18:40:29 2004
@@ -1626,24 +1626,28 @@
     type1 *curr1 = (type1 *)s1->strstart; \
     type2 *curr2 = (type2 *)s2->strstart; \
      \
-    while( (_index++ < minlen) && (*curr1 == *curr2) ) \
+    while( (_index < minlen) && (*curr1 == *curr2) ) \
     { \
         ++curr1; \
         ++curr2; \
+        ++_index; \
     } \
+    if (_index == minlen && s1->strlen == s2->strlen) { \
+        result = 0; \
+        break; \
+    } \
+    result = *curr1 - *curr2; \
      \
-    *result = *curr1 - *curr2; \
-     \
-    if( !*result ) \
+    if( !result ) \
     { \
         if( s1->strlen != s2->strlen ) \
         { \
-            *result = s1->strlen > s2->strlen ? 1 : -1; \
+            result = s1->strlen > s2->strlen ? 1 : -1; \
         } \
     } \
     else \
     { \
-        *result = *result > 0 ? 1 : -1; \
+        result = result > 0 ? 1 : -1; \
     } \
 } while(0)
 
@@ -1691,13 +1695,13 @@
         {
             case enum_stringrep_one:
                 /* could use memcmp in this one case; faster?? */
-                COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, &cmp);
+                COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, cmp);
                 break;
             case enum_stringrep_two:
-                COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, &cmp);
+                COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, cmp);
                 break;
             case enum_stringrep_four:
-                COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, &cmp);
+                COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, cmp);
                 break;
             default:
                 /* trouble! */
@@ -1731,18 +1735,18 @@
             if( smaller->representation == enum_stringrep_two )
             {
                 COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt2, 
-                    larger, smaller, &cmp);
+                    larger, smaller, cmp);
             }
             else /* smaller->representation == enum_stringrep_one */
             {
                 COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt1, 
-                    larger, smaller, &cmp);
+                    larger, smaller, cmp);
             }
         }
         else /* larger->representation == enum_stringrep_two, 
             smaller->representation == enum_stringrep_one */
         {
-            COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, &cmp);
+            COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, cmp);
         }
 
         return cmp * multiplier;
@@ -3052,7 +3056,69 @@
 =cut
 
 */
+#if 1
+/* TODO move this out of string.c */
+#include <unicode/ustring.h>
+static UChar    /* character-fetch callback handed to u_unescapeAt() */
+char_at(Parrot_Int4 offs, void* context)   /* context: the C string being unescaped */
+{
+    return *((unsigned char*)context + offs);  /* unsigned: bytes >= 0x80 must not sign-extend */

+}
+/*
+ * Build a STRING from cstring, expanding backslash escapes with ICU's
+ * u_unescapeAt().  Scanning stops at the NUL terminator or at the first
+ * unescaped `delimiter`.  The result starts out with one byte per
+ * character (created as iso-8859-1) and is upscaled to 2-byte units by
+ * the first escape that yields a value above 0xff.
+ * TODO: values above 0xffff and the ICU error value are not handled.
+ */
+STRING *
+string_unescape_cstring(struct Parrot_Interp * interpreter,
+    char *cstring, char delimiter)
+{
+    size_t clength = strlen(cstring);
+    STRING *result;
+    int offs, d;
+    char *p;
+    Parrot_UInt4 r;
+    int had_int16;
+
+    result = string_make(interpreter, cstring, clength, "iso-8859-1",
+            PObj_constant_FLAG);
+    for (offs = d = had_int16 = 0; ; ) {
+        p = cstring + offs;
+        if (!*p || *p == delimiter)
+            break;
+        if (*p == '\\' && p[1]) {
+            ++offs;     /* u_unescapeAt wants the offset after the backslash */
+            r = u_unescapeAt(char_at, &offs, clength, cstring);
+        }
+        else {
+            /* via unsigned char: a negative char must not become 0xffXX */
+            r = (unsigned char)*p;
+            ++offs;
+        }
+        if (!had_int16 && r > 0xff) {
+            /* TODO r = 0xffffffff for error */
+            assert(r <= 0xffff);    /* TODO */
+            /* current result is this */
+            result->strlen = result->bufused = d;
+            _string_upscale(interpreter, result,
+                    enum_stringrep_two, clength);
+            had_int16 = 1;
+        }
+        if (had_int16)
+            ((Parrot_UInt2*)result->strstart)[d++] = r;
+        else
+            ((char*)result->strstart)[d++] = r;
+    }
+    result->strlen = d;
+    result->bufused = d * (had_int16 ? 2 : 1);
+    return result;
+}
+
+#else
 STRING *
 string_unescape_cstring(struct Parrot_Interp * interpreter, 
     char *cstring, char delimiter)
@@ -3169,6 +3235,7 @@
 
     return string_constant_copy(interpreter, result);
 }
+#endif
 
 /*
 
--- parrot/t/op/string.t        Sun Apr 11 11:58:46 2004
+++ parrot-leo/t/op/string.t    Mon Apr 12 18:13:41 2004
@@ -16,7 +16,7 @@
 
 =cut
 
-use Parrot::Test tests => 132;
+use Parrot::Test tests => 131;
 use Test::More;
 
 output_is( <<'CODE', <<OUTPUT, "set_s_s|sc" );
@@ -2427,15 +2427,6 @@
 CODE
 ok 1
 ok 2
-OUTPUT
-
-output_is( <<'CODE', <<OUTPUT, "angstrom" );
- chr S0, 0x212B
- print S0
- print "\n"
- end
-CODE
-\xe2\x84\xab
 OUTPUT
 
 1;

Reply via email to