That's really funny--I wrote almost exactly the same code w.r.t. string_unescape_cstring last night, and I also always use U+212b for testing any time I need to come up with a readable character outside of the Latin range. Strange coincidences.

I'll take a look and see if there is anything significantly different in our implementations, and get back to you. (It's definitely convenient, especially for testing, to have a way to represent arbitrary characters in string literals.)

Jeff

On Apr 12, 2004, at 9:54 AM, Leopold Toetsch (via RT) wrote:

# New Ticket Created by  Leopold Toetsch
# Please include the string:  [perl #28494]
# in the subject line of all future correspondence about this issue.
# <URL: http://rt.perl.org:80/rt3/Ticket/Display.html?id=28494 >


Attached patch:
* adds a new test file for Unicode-related string tests
* reimplements string_unescape_cstring, which now uses ICU for the work
* fixes a bug in string_compare with equal-length strings

It's also far more efficient than the old code.

TODO: move it out of string.c; write docs.

Jeff, please have a look at it.

leo
--- parrot/MANIFEST     Mon Apr 12 15:43:05 2004
+++ parrot-leo/MANIFEST Mon Apr 12 18:41:07 2004
@@ -2596,6 +2596,7 @@
 t/op/rx.t                                         []
 t/op/stacks.t                                     []
 t/op/string.t                                     []
+t/op/stringu.t                                    []
 t/op/time.t                                       []
 t/op/trans.t                                      []
 t/op/types.t                                      []
--- /dev/null   Fri Feb 28 14:27:28 2003
+++ parrot-leo/t/op/stringu.t   Mon Apr 12 18:40:40 2004
@@ -0,0 +1,57 @@
+#! perl -w
+# Copyright: 2001-2004 The Perl Foundation.  All Rights Reserved.
+# $Id$
+
+=head1 NAME
+
+t/op/stringu.t - Unicode String Test
+
+=head1 SYNOPSIS
+
+       % perl -Ilib t/op/stringu.t
+
+=head1 DESCRIPTION
+
+Tests Parrot's unicode string system.
+
+=cut
+#'
+
+use Parrot::Test tests => 4;
+use Test::More;
+
+output_is( <<'CODE', <<OUTPUT, "angstrom" );
+    chr S0, 0x212B
+    print S0
+    print "\n"
+    end
+CODE
+\xe2\x84\xab
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, " escaped angstrom" );
+    set S0, "\x{212b}"
+    print S0
+    print "\n"
+    end
+CODE
+\xe2\x84\xab
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, " escaped angstrom 2" );
+    set S0, "aaaaaa\x{212b}"
+    print S0
+    print "\n"
+    end
+CODE
+aaaaaa\xe2\x84\xab
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, " escaped angstrom 3" );
+    set S0, "aaaaaa\x{212b}-aaaaaa"
+    print S0
+    print "\n"
+    end
+CODE
+aaaaaa\xe2\x84\xab-aaaaaa
+OUTPUT
--- parrot/src/string.c Sun Apr 11 15:16:48 2004
+++ parrot-leo/src/string.c     Mon Apr 12 18:40:29 2004
@@ -1626,24 +1626,28 @@
     type1 *curr1 = (type1 *)s1->strstart; \
     type2 *curr2 = (type2 *)s2->strstart; \
      \
-    while( (_index++ < minlen) && (*curr1 == *curr2) ) \
+    while( (_index < minlen) && (*curr1 == *curr2) ) \
     { \
         ++curr1; \
         ++curr2; \
+        ++_index; \
     } \
+    if (_index == minlen && s1->strlen == s2->strlen) { \
+        result = 0; \
+        break; \
+    } \
+    result = *curr1 - *curr2; \
      \
-    *result = *curr1 - *curr2; \
-     \
-    if( !*result ) \
+    if( !result ) \
     { \
         if( s1->strlen != s2->strlen ) \
         { \
-            *result = s1->strlen > s2->strlen ? 1 : -1; \
+            result = s1->strlen > s2->strlen ? 1 : -1; \
         } \
     } \
     else \
     { \
-        *result = *result > 0 ? 1 : -1; \
+        result = result > 0 ? 1 : -1; \
     } \
 } while(0)

@@ -1691,13 +1695,13 @@
{
case enum_stringrep_one:
/* could use memcmp in this one case; faster?? */
- COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, &cmp);
+ COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, cmp);
break;
case enum_stringrep_two:
- COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, &cmp);
+ COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, cmp);
break;
case enum_stringrep_four:
- COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, &cmp);
+ COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, cmp);
break;
default:
/* trouble! */
@@ -1731,18 +1735,18 @@
if( smaller->representation == enum_stringrep_two )
{
COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt2,
- larger, smaller, &cmp);
+ larger, smaller, cmp);
}
else /* smaller->representation == enum_stringrep_one */
{
COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt1,
- larger, smaller, &cmp);
+ larger, smaller, cmp);
}
}
else /* larger->representation == enum_stringrep_two,
smaller->representation == enum_stringrep_one */
{
- COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, &cmp);
+ COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, cmp);
}


         return cmp * multiplier;
@@ -3052,7 +3056,69 @@
 =cut

 */
+#if 1
+/* TODO move this out of string.c */
+#include <unicode/ustring.h>
+static UChar
+char_at(Parrot_Int4 offs, void* context)
+{
+    return *((char*)context + offs);

+}
+STRING *
+string_unescape_cstring(struct Parrot_Interp * interpreter,
+    char *cstring, char delimiter)
+{
+    size_t clength = strlen(cstring);
+    STRING *result;
+    int offs, d;
+    char *p;
+    Parrot_UInt4 r;
+    int had_int16;
+
+    result = string_make(interpreter, cstring, clength, "iso-8859-1",
+            PObj_constant_FLAG);
+    for (offs = d = had_int16 = 0; ; ) {
+        p = cstring + offs;
+        if (!*p || *p == delimiter)
+            break;
+        if (*p == '\\' && p[1]) {
+            ++offs;
+            r = u_unescapeAt(char_at, &offs, clength, cstring);
+            if (had_int16) {
+set_16:
+                ((Parrot_UInt2*)result->strstart)[d++] = r;
+            }
+            else {
+                /* TODO r = 0xffffffff for error */
+                if (r <= 0xff) {
+                    ((char*)result->strstart)[d++] = r;
+                }
+                else {
+                    assert(r <= 0xffff);    /* TODO */
+                    /* current result is this */
+                    result->strlen = result->bufused = d;
+                    _string_upscale(interpreter, result,
+                            enum_stringrep_two, clength);
+                    had_int16 = 1;
+                    goto set_16;
+                }
+            }
+        }
+        else {
+            if (had_int16)
+                ((Parrot_UInt2*)result->strstart)[d++] = *p;
+            else
+                ((char*)result->strstart)[d++] = *p;
+            ++offs;
+        }
+    }
+    result->strlen = d;
+    result->bufused = d * (had_int16 ? 2 : 1);
+    return result;
+}
+
+#else
 STRING *
 string_unescape_cstring(struct Parrot_Interp * interpreter,
     char *cstring, char delimiter)
@@ -3169,6 +3235,7 @@

     return string_constant_copy(interpreter, result);
 }
+#endif

/*

--- parrot/t/op/string.t        Sun Apr 11 11:58:46 2004
+++ parrot-leo/t/op/string.t    Mon Apr 12 18:13:41 2004
@@ -16,7 +16,7 @@

=cut

-use Parrot::Test tests => 132;
+use Parrot::Test tests => 131;
 use Test::More;

 output_is( <<'CODE', <<OUTPUT, "set_s_s|sc" );
@@ -2427,15 +2427,6 @@
 CODE
 ok 1
 ok 2
-OUTPUT
-
-output_is( <<'CODE', <<OUTPUT, "angstrom" );
- chr S0, 0x212B
- print S0
- print "\n"
- end
-CODE
-\xe2\x84\xab
 OUTPUT

1;



Reply via email to