I'll take a look and see if there is anything significantly different in our implementations, and get back to you. (It's definitely convenient, especially for testing, to have a way to represent arbitrary characters in string literals.)
Jeff
On Apr 12, 2004, at 9:54 AM, Leopold Toetsch (via RT) wrote:
# New Ticket Created by Leopold Toetsch # Please include the string: [perl #28494] # in the subject line of all future correspondence about this issue. # <URL: http://rt.perl.org:80/rt3/Ticket/Display.html?id=28494 >
Attached patch: * adds a new test file for Unicode-related string tests * reimplements string_unescape_cstring, which now uses ICU for the work * fixes a bug in string_compare with equal-length strings
It's also far more efficient than the old code.
TODO: move it out of string.c, docs.
Jeff, please have a look at it.
leo --- parrot/MANIFEST Mon Apr 12 15:43:05 2004 +++ parrot-leo/MANIFEST Mon Apr 12 18:41:07 2004 @@ -2596,6 +2596,7 @@ t/op/rx.t [] t/op/stacks.t [] t/op/string.t [] +t/op/stringu.t [] t/op/time.t [] t/op/trans.t [] t/op/types.t [] --- /dev/null Fri Feb 28 14:27:28 2003 +++ parrot-leo/t/op/stringu.t Mon Apr 12 18:40:40 2004 @@ -0,0 +1,57 @@ +#! perl -w +# Copyright: 2001-2004 The Perl Foundation. All Rights Reserved. +# $Id$ + +=head1 NAME + +t/op/stringu.t - Unicode String Test + +=head1 SYNOPSIS + + % perl -Ilib t/op/stringu.t + +=head1 DESCRIPTION + +Tests Parrot's unicode string system. + +=cut +#' + +use Parrot::Test tests => 4; +use Test::More; + +output_is( <<'CODE', <<OUTPUT, "angstrom" ); + chr S0, 0x212B + print S0 + print "\n" + end +CODE +\xe2\x84\xab +OUTPUT + +output_is( <<'CODE', <<OUTPUT, " escaped angstrom" ); + set S0, "\x{212b}" + print S0 + print "\n" + end +CODE +\xe2\x84\xab +OUTPUT + +output_is( <<'CODE', <<OUTPUT, " escaped angstrom 2" ); + set S0, "aaaaaa\x{212b}" + print S0 + print "\n" + end +CODE +aaaaaa\xe2\x84\xab +OUTPUT + +output_is( <<'CODE', <<OUTPUT, " escaped angstrom 3" ); + set S0, "aaaaaa\x{212b}-aaaaaa" + print S0 + print "\n" + end +CODE +aaaaaa\xe2\x84\xab-aaaaaa +OUTPUT --- parrot/src/string.c Sun Apr 11 15:16:48 2004 +++ parrot-leo/src/string.c Mon Apr 12 18:40:29 2004 @@ -1626,24 +1626,28 @@ type1 *curr1 = (type1 *)s1->strstart; \ type2 *curr2 = (type2 *)s2->strstart; \ \ - while( (_index++ < minlen) && (*curr1 == *curr2) ) \ + while( (_index < minlen) && (*curr1 == *curr2) ) \ { \ ++curr1; \ ++curr2; \ + ++_index; \ } \ + if (_index == minlen && s1->strlen == s2->strlen) { \ + result = 0; \ + break; \ + } \ + result = *curr1 - *curr2; \ \ - *result = *curr1 - *curr2; \ - \ - if( !*result ) \ + if( !result ) \ { \ if( s1->strlen != s2->strlen ) \ { \ - *result = s1->strlen > s2->strlen ? 1 : -1; \ + result = s1->strlen > s2->strlen ? 1 : -1; \ } \ } \ else \ { \ - *result = *result > 0 ? 
1 : -1; \ + result = result > 0 ? 1 : -1; \ } \ } while(0)
@@ -1691,13 +1695,13 @@
{
case enum_stringrep_one:
/* could use memcmp in this one case; faster?? */
- COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, &cmp);
+ COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, cmp);
break;
case enum_stringrep_two:
- COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, &cmp);
+ COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, cmp);
break;
case enum_stringrep_four:
- COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, &cmp);
+ COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, cmp);
break;
default:
/* trouble! */
@@ -1731,18 +1735,18 @@
if( smaller->representation == enum_stringrep_two )
{
COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt2,
- larger, smaller, &cmp);
+ larger, smaller, cmp);
}
else /* smaller->representation == enum_stringrep_one */
{
COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt1,
- larger, smaller, &cmp);
+ larger, smaller, cmp);
}
}
else /* larger->representation == enum_stringrep_two,
smaller->representation == enum_stringrep_one */
{
- COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, &cmp);
+ COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, cmp);
}
return cmp * multiplier; @@ -3052,7 +3056,69 @@ =cut
*/ +#if 1 +/* TODO move this out of string.c */ +#include <unicode/ustring.h> +static UChar +char_at(Parrot_Int4 offs, void* context) +{ + return *((char*)context + offs);
+} +STRING * +string_unescape_cstring(struct Parrot_Interp * interpreter, + char *cstring, char delimiter) +{ + size_t clength = strlen(cstring); + STRING *result; + int offs, d; + char *p; + Parrot_UInt4 r; + int had_int16; + + result = string_make(interpreter, cstring, clength, "iso-8859-1", + PObj_constant_FLAG); + for (offs = d = had_int16 = 0; ; ) { + p = cstring + offs; + if (!*p || *p == delimiter) + break; + if (*p == '\\' && p[1]) { + ++offs; + r = u_unescapeAt(char_at, &offs, clength, cstring); + if (had_int16) { +set_16: + ((Parrot_UInt2*)result->strstart)[d++] = r; + } + else { + /* TODO r = 0xffffffff for error */ + if (r <= 0xff) { + ((char*)result->strstart)[d++] = r; + } + else { + assert(r <= 0xffff); /* TODO */ + /* current result is this */ + result->strlen = result->bufused = d; + _string_upscale(interpreter, result, + enum_stringrep_two, clength); + had_int16 = 1; + goto set_16; + } + } + } + else { + if (had_int16) + ((Parrot_UInt2*)result->strstart)[d++] = *p; + else + ((char*)result->strstart)[d++] = *p; + ++offs; + } + } + result->strlen = d; + result->bufused = d * (had_int16 ? 2 : 1); + return result; +} + +#else STRING * string_unescape_cstring(struct Parrot_Interp * interpreter, char *cstring, char delimiter) @@ -3169,6 +3235,7 @@
return string_constant_copy(interpreter, result); } +#endif
/*
--- parrot/t/op/string.t Sun Apr 11 11:58:46 2004 +++ parrot-leo/t/op/string.t Mon Apr 12 18:13:41 2004 @@ -16,7 +16,7 @@
=cut
-use Parrot::Test tests => 132; +use Parrot::Test tests => 131; use Test::More;
output_is( <<'CODE', <<OUTPUT, "set_s_s|sc" ); @@ -2427,15 +2427,6 @@ CODE ok 1 ok 2 -OUTPUT - -output_is( <<'CODE', <<OUTPUT, "angstrom" ); - chr S0, 0x212B - print S0 - print "\n" - end -CODE -\xe2\x84\xab OUTPUT
1;