op string.t

Leopold Toetsch Tue, 13 Apr 2004 07:18:39 -0700

cvsuser     04/04/13 07:18:33


  Modified:    .        Configure.pl
               config/gen/config_h config_h.in
               config/gen icu.pl
               include/parrot string_primitives.h
               src      string.c string_primitives.c
               t/op     string.t
  Log:
  [perl #28473] [PATCH] ICU data directory configuration
  
  Here's a patch to make the location of ICU's data files configurable,
  and also to cause parrot to throw an exception at string_init time if
  the data files are not found.
  
  Courtesy of Jeff Clites <[EMAIL PROTECTED]>
  
  [perl #28494]
  
  Some bits of it WRT unescaping.
  
  Changed unescape to use new code. Currently 16-bit codepoints only to
  keep diff smaller (leo)
  
  Revision  Changes    Path
  1.140     +9 -3      parrot/Configure.pl
  
  Index: Configure.pl
  ===================================================================
  RCS file: /cvs/public/parrot/Configure.pl,v
  retrieving revision 1.139
  retrieving revision 1.140
  diff -u -w -r1.139 -r1.140
  --- Configure.pl      9 Apr 2004 20:31:51 -0000       1.139
  +++ Configure.pl      13 Apr 2004 14:18:16 -0000      1.140
  @@ -1,6 +1,6 @@
   #! perl -w
   # Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
  -# $Id: Configure.pl,v 1.139 2004/04/09 20:31:51 dan Exp $
  +# $Id: Configure.pl,v 1.140 2004/04/13 14:18:16 leo Exp $
   
   =head1 NAME
   
  @@ -54,7 +54,7 @@
   You can add and remove option values with C<< :rem{<opt>} >> and
   C<< :add{<opt>} >>. For example:
   
  -    perl Configure.pl --ccflags="rem{-g} :add{-O2}"
  +    perl Configure.pl --ccflags=":rem{-g} :add{-O2}"
   
   =over
   
  @@ -167,6 +167,10 @@
   be one of: C<gc>, C<libc>, C<malloc> or C<malloc-trace>. The default is
   C<gc>.
   
  +=item C<--icudatadir=(path)>
  +
  +Use the given directory to locate ICU's data file(s)
  +
   =back
   
   Other Options (may not be implemented)
  @@ -216,7 +220,7 @@
   
     for($key) {
       /version/ && do {
  -      my $cvsid='$Id: Configure.pl,v 1.139 2004/04/09 20:31:51 dan Exp $';
  +      my $cvsid='$Id: Configure.pl,v 1.140 2004/04/13 14:18:16 leo Exp $';
         print <<"END";
   Parrot Version $parrot_version Configure 2.0
   $cvsid
  @@ -274,6 +278,8 @@
      --execcapable        Use JIT to emit a native executable
      --gc=(type)          Determine the type of garbage collection
                           type=(gc|libc|malloc|malloc-trace) default is gc
  +
  +   --icudatadir=(path)  Use the given directory to locate ICU's data file(s)
   
   Other Options (may not be implemented):
   
  
  
  
  1.22      +2 -0      parrot/config/gen/config_h/config_h.in
  
  Index: config_h.in
  ===================================================================
  RCS file: /cvs/public/parrot/config/gen/config_h/config_h.in,v
  retrieving revision 1.21
  retrieving revision 1.22
  diff -u -w -r1.21 -r1.22
  --- config_h.in       10 Apr 2004 09:49:15 -0000      1.21
  +++ config_h.in       13 Apr 2004 14:18:20 -0000      1.22
  @@ -134,6 +134,8 @@
   #define PARROT_CORE_CG_OPLIB_INIT Parrot_DynOp_core_cg_${MAJOR}_${MINOR}_${PATCH}
   #define PARROT_CORE_CGP_OPLIB_INIT Parrot_DynOp_core_cgp_${MAJOR}_${MINOR}_${PATCH}
   
  +#define DEFAULT_ICU_DATA_DIR "${icudatadir}"
  +
   #define INTVAL_FMT "${intvalfmt}"
   #define FLOATVAL_FMT "${floatvalfmt}"
   
  
  
  
  1.8       +10 -3     parrot/config/gen/icu.pl
  
  Index: icu.pl
  ===================================================================
  RCS file: /cvs/public/parrot/config/gen/icu.pl,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -w -r1.7 -r1.8
  --- icu.pl    12 Apr 2004 20:36:36 -0000      1.7
  +++ icu.pl    13 Apr 2004 14:18:23 -0000      1.8
  @@ -1,6 +1,6 @@
   #! perl -w
   # Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
  -# $Id: icu.pl,v 1.7 2004/04/12 20:36:36 dan Exp $
  +# $Id: icu.pl,v 1.8 2004/04/13 14:18:23 leo Exp $
   
   =head1 NAME
   
  @@ -20,10 +20,17 @@
   
   $description="Configuring ICU if requested...";
   
  -@args=qw(buildicu verbose);
  +@args=qw(buildicu verbose icudatadir);
   
   sub runstep {
  -  my ($buildicu, $verbose) = @_;
  +  my ($buildicu, $verbose, $icudatadir) = @_;
  +
  +  if( !defined $icudatadir )
  +  {
  +       $icudatadir = 'blib/lib/icu/2.6.1';
  +  }
  +
  +  Configure::Data->set( icudatadir => $icudatadir );
   
   #  unless ($buildicu) {
   #    print " [Skipped] " if $verbose;
  
  
  
  1.2       +8 -2      parrot/include/parrot/string_primitives.h
  
  Index: string_primitives.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/string_primitives.h,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -w -r1.1 -r1.2
  --- string_primitives.h       9 Apr 2004 20:32:24 -0000       1.1
  +++ string_primitives.h       13 Apr 2004 14:18:26 -0000      1.2
  @@ -1,7 +1,7 @@
   /* string_funcs.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: string_primitives.h,v 1.1 2004/04/09 20:32:24 dan Exp $
  + *     $Id: string_primitives.h,v 1.2 2004/04/13 14:18:26 leo Exp $
    *  Overview:
    *     This is the api header for the string subsystem
    *  Data Structure and Algorithms:
  @@ -22,6 +22,12 @@
   /* Convert from any supported encoding, into our internal format */
   void string_fill_from_buffer(struct Parrot_Interp *interpreter,
        const void *buffer, UINTVAL len, const char *encoding_name, STRING *s);
  +
  +/* Utility method which knows how to unwind a single escape sequence */
  +typedef Parrot_UInt2 (*Parrot_unescape_cb)(Parrot_Int4 offset, void *context);
  +Parrot_UInt4
  +string_unescape_one(Parrot_unescape_cb cb,
  +    Parrot_UInt4 *offset, Parrot_UInt4 input_length, void *string);
   
   UINTVAL
   Parrot_char_digit_value(struct Parrot_Interp *interpreter, UINTVAL character);
  
  
  
  1.186     +142 -175  parrot/src/string.c
  
  Index: string.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/string.c,v
  retrieving revision 1.185
  retrieving revision 1.186
  diff -u -w -r1.185 -r1.186
  --- string.c  11 Apr 2004 13:14:07 -0000      1.185
  +++ string.c  13 Apr 2004 14:18:29 -0000      1.186
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
  -$Id: string.c,v 1.185 2004/04/11 13:14:07 mikescott Exp $
  +$Id: string.c,v 1.186 2004/04/13 14:18:29 leo Exp $
   
   =head1 NAME
   
  @@ -243,8 +243,9 @@
   void
   string_init(void)
   {
  -/* XXXX: pull out into a config */
  -    string_set_data_directory("blib/lib/icu/2.6.1");
  +/* DEFAULT_ICU_DATA_DIR is configured at build time. Need a way to
  +    specify this at runtime as well. */
  +    string_set_data_directory(DEFAULT_ICU_DATA_DIR);
   /*
       encoding_init();
       chartype_init();
  @@ -1626,24 +1627,28 @@
       type1 *curr1 = (type1 *)s1->strstart; \
       type2 *curr2 = (type2 *)s2->strstart; \
        \
  -    while( (_index++ < minlen) && (*curr1 == *curr2) ) \
  +    while( (_index < minlen) && (*curr1 == *curr2) ) \
       { \
           ++curr1; \
           ++curr2; \
  +        ++_index; \
       } \
  +    if (_index == minlen && s1->strlen == s2->strlen) { \
  +        result = 0; \
  +        break; \
  +    } \
  +    result = *curr1 - *curr2; \
        \
  -    *result = *curr1 - *curr2; \
  -     \
  -    if( !*result ) \
  +    if( !result ) \
       { \
           if( s1->strlen != s2->strlen ) \
           { \
  -            *result = s1->strlen > s2->strlen ? 1 : -1; \
  +            result = s1->strlen > s2->strlen ? 1 : -1; \
           } \
       } \
       else \
       { \
  -        *result = *result > 0 ? 1 : -1; \
  +        result = result > 0 ? 1 : -1; \
       } \
   } while(0)
   
  @@ -1691,13 +1696,13 @@
           {
               case enum_stringrep_one:
                   /* could use memcmp in this one case; faster?? */
  -                COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, &cmp);
  +                COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, cmp);
                   break;
               case enum_stringrep_two:
  -                COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, &cmp);
  +                COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, cmp);
                   break;
               case enum_stringrep_four:
  -                COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, &cmp);
  +                COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, cmp);
                   break;
               default:
                   /* trouble! */
  @@ -1731,18 +1736,18 @@
               if( smaller->representation == enum_stringrep_two )
               {
                   COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt2, 
  -                    larger, smaller, &cmp);
  +                    larger, smaller, cmp);
               }
               else /* smaller->representation == enum_stringrep_one */
               {
                   COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt1, 
  -                    larger, smaller, &cmp);
  +                    larger, smaller, cmp);
               }
           }
           else /* larger->representation == enum_stringrep_two, 
               smaller->representation == enum_stringrep_one */
           {
  -            COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, &cmp);
  +            COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, cmp);
           }
   
           return cmp * multiplier;
  @@ -3047,128 +3052,89 @@
   string_unescape_cstring(struct Parrot_Interp * interpreter, 
       char *cstring, char delimiter)>
   
  -Unescapes the specified C string.
  +Unescapes the specified C string. These sequences are covered:
  +
  +  \xhh        1..2 hex digits
  +  \ooo        1..3 oct digits
  +  \cX         control char X
  +  \x{h..h}    1..8 hex digits
  +  \uhhhh      4 hex digits
  +  \Uhhhhhhhh  8 hex digits
  +  \a, \b, \t, \n, \v, \f, \r, \e, \?
   
   =cut
   
   */
   
  -STRING *
  -string_unescape_cstring(struct Parrot_Interp * interpreter, 
  -    char *cstring, char delimiter)
  -{
  -    char *p, *string;
  -    char hexdigits[] = "0123456789abcdef";
  -    STRING *result;
  -    size_t clength = strlen(cstring);
   
  -    if( !cstring || !clength ) return NULL;
  -
  -    result = string_make(interpreter, cstring, clength, "iso-8859-1", 0);
  -
  -    for (p = (char *)result->strstart, string = cstring ; *string; ++string)
  +static Parrot_UInt2
  +char8_at(Parrot_Int4 offs, void* context)
       {
  -        if (*string == '\\' && string[1]) {
  -            switch (*++string) {
  -                case 'n':
  -                    *p++ = '\n';
  -                    break;
  -                case 'r':
  -                    *p++ = '\r';
  -                    break;
  -                case 't':
  -                    *p++ = '\t';
  -                    break;
  -                case 'a':
  -                    *p++ = '\a';
  -                    break;
  -                case 'f':
  -                    *p++ = '\f';
  -                    break;
  -                case 'e':
  -                    *p++ = '\033';
  -                    break;
  -                case '\\':
  -                    *p++ = '\\';
  -                    break;
  -                case 'x':       /* XXX encoding??? */
  -                    {
  -                        int c1 = tolower(*++string);
  -                        char *p1 = strchr(hexdigits, c1);
  -                        char *p2;
  -                        if (p1) {
  -                            int c2 = tolower(*++string);
  -                            p2 = strchr(hexdigits, c2);
  -                            if (p2)
  -                                *p++ = ((p1-hexdigits) << 4) | (p2-hexdigits);
  -                            else {
  -                                --string;
  -                                *p++ = (p1-hexdigits);
  -                            }
  -                        }
  -                        else {
  -                            /* XXX warning? */
  -                            *p++ = *--string;
  -                        }
  +    return ((char*)((STRING *)context)->strstart)[offs];
  +
                       }
  -                    break;
  -                case 'u':       /* XXX encoding??? */
  -                    {
  -                        UINTVAL cval = 0;
  -                        int count = 4;
   
  -                        while (count-- && string[1])
  +static Parrot_UInt2
  +char16_at(Parrot_Int4 offs, void* context)
                           {
  -                            int c1 = tolower(*++string);
  -                            char *p1 = strchr(hexdigits, c1);
  -
  -                            if (p1) {
  -                                cval = (cval << 4) | (p1-hexdigits);
  -                            }
  -                            else {
  -                                /* XXX warning? */
  -                                --string;
  -                                break;
  -                            }
  +    return ((Parrot_UInt2*)((STRING *)context)->strstart)[offs];
   
                           }
   
  -                        if( cval <= 0xFF )
  +STRING *
  +string_unescape_cstring(struct Parrot_Interp * interpreter,
  +    char *cstring, char delimiter)
                           {
  -                            *p++ = cval;
  -                        }
  +    size_t clength = strlen(cstring);
  +    STRING *result;
  +    int offs, d;
  +    Parrot_UInt4 r;
  +    int had_int16 = 0;
  +    Parrot_unescape_cb char_at = char8_at;
  +
  +    if (delimiter && clength)
  +        --clength;
  +    result = string_make(interpreter, cstring, clength, "iso-8859-1",
  +            PObj_constant_FLAG);
  +    if (result->representation == enum_stringrep_two) {
  +        had_int16 = 1;
  +        char_at = char16_at;
  +    }
  +
  +    for (offs = d =  0; ; ++offs) {
  +        r = (char_at)(offs, result);
  +        if (!r || r == (Parrot_UInt4)delimiter)
  +            break;
  +        if (r == '\\') {
  +            ++offs;
  +            r = string_unescape_one(char_at, &offs, result->strlen, result);
  +            --offs;
  +            /* TODO r = 0xffffffff for error */
  +            if (r >= 0x100 && !had_int16) {
  +                assert(r <= 0xffff);    /* TODO */
  +                /* current result is this */
  +                result->strlen = result->bufused = clength;
  +                _string_upscale(interpreter, result,
  +                        enum_stringrep_two, clength);
  +                had_int16 = 1;
  +                char_at = char16_at;
  +            }
  +        }
  +        if (d == offs) {
  +            ++d;
  +            continue;
  +        }
  +        /* TODO create set functions too */
  +        if (had_int16)
  +            ((Parrot_UInt2*)result->strstart)[d++] = r;
                           else
  -                        {
  -                            /* fall back to a method which handles
  -                            non-rep-1 strings */
  -                            ++string;
  -
  -                            /* finish up the string so far */
  -                            result->bufused = p - (char *)result->strstart;
  -                            string_compute_strlen(interpreter, result);
  -
  -                            _string_unescape_cstring_large(interpreter, string,
  -                                                        result, cval, delimiter);
  -                            return string_constant_copy(interpreter, result);
  -                        }
  -                    }
  -                    break;
  -                default:
  -                    *p++ = *string;
  -                    break;
  -            }
  +            ((char*)result->strstart)[d++] = r;
           }
  -        else if (*string == delimiter)
  -            break;
  -        else
  -            *p++ = *string;
  +    result->strlen = d;
  +    result->bufused = string_max_bytes(interpreter, result, d);
  +    return result;
       }
   
  -    result->bufused = p - (char *)result->strstart;
  -    string_compute_strlen(interpreter, result);
  -
  -    return string_constant_copy(interpreter, result);
  -}
   
   /*
   
  @@ -3183,6 +3149,7 @@
   =cut
   
   */
  +
   
   STRING *
   string_upcase(struct Parrot_Interp *interpreter, const STRING *s)
  
  
  
  1.5       +21 -76    parrot/src/string_primitives.c
  
  Index: string_primitives.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/string_primitives.c,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -w -r1.4 -r1.5
  --- string_primitives.c       11 Apr 2004 13:14:07 -0000      1.4
  +++ string_primitives.c       13 Apr 2004 14:18:29 -0000      1.5
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: string_primitives.c,v 1.4 2004/04/11 13:14:07 mikescott Exp $
  +$Id: string_primitives.c,v 1.5 2004/04/13 14:18:29 leo Exp $
   
   =head1 NAME
   
  @@ -23,6 +23,7 @@
   #include <unicode/ucnv.h>
   #include <unicode/utypes.h>
   #include <unicode/uchar.h>
  +#include <unicode/ustring.h>
   #include <assert.h>
   
   /*
  @@ -41,6 +42,19 @@
   string_set_data_directory(const char *dir)
   {
       u_setDataDirectory(dir);
  +
  +    /* Since u_setDataDirectory doesn't have a result code, we'll spot
  +       check that everything is okay by making sure that '9' had decimal
  +       value 9. Using 57 rather than '9' so that the encoding of this
  +       source code file isn't an issue.... (Don't want to get bitten by
  +       EBCDIC.) */
  +
  +    if( !u_isdigit(57) || (u_charDigitValue(57) != 9) )
  +    {
  +            internal_exception(ICU_ERROR,
  +                "string_set_data_directory: ICU data files not found"
  +                "(apparently) for directory [%s]", dir);
  +    }
   }
   
   /*
  @@ -125,82 +139,13 @@
        string_compute_strlen(interpreter, s);
   }
   
  -/*
  -{
  -
  -    UErrorCode myError = U_ZERO_ERROR;
  -    UConverter *conv = ucnv_open("ISO-8859-1", &myError);
  -
  -    NSData *data = [NSData dataWithContentsOfFile:@"/var/tmp/ja.txt"];
  -
  -
  -    UConverter *conv2 = ucnv_open("UTF-8", &myError);
  -    UChar *outputBuffer = malloc(50*sizeof(UChar));
  -    UChar *start = outputBuffer;
  -    char *source = [data bytes];
  -    UErrorCode convError = 0;
  -
  -    ucnv_toUnicode(conv2, &outputBuffer, &(outputBuffer[50]), &source, source + [data length], NULL, TRUE, &convError);
  -
  -    NSLog(@"conv error = %i", convError);
  -    NSLog(@"conv length = %i", outputBuffer - start);
  -
  -
  -    UConverter *conv3 = ucnv_open("UCS-2", &myError); //ISO-8859-1"
  -    char *outbuffer = malloc(50);
  -    char *outstart = outbuffer;
  -    UErrorCode redoErr = 0;
   
  -    NSLog(@"conv3 = %p", conv3);
  -    NSLog(@"%s", ucnv_getName(conv3, &redoErr));
  -
  -        UErrorCode callbackStatus = 0;
  -
  -          ucnv_setFromUCallBack(conv3,
  -                       UCNV_FROM_U_CALLBACK_STOP, //UCNV_FROM_U_CALLBACK_ESCAPE
  -                       NULL,
  -                       NULL,
  -                       NULL,
  -                       &callbackStatus);
  -
  -    ucnv_fromUnicode(conv3, &outbuffer, outbuffer + 50, &start, outputBuffer, NULL, TRUE, &redoErr);
  -//    ucnv_fromUnicode(conv3, NULL, NULL, &start, outputBuffer, NULL, TRUE, &redoErr);
  -
  -    NSLog(@"redo error = %i", redoErr);
  -    NSLog(@"%s", u_errorName(redoErr));
  -    NSLog(@"redo length = %i", outbuffer - outstart);
  -
  -NSLog(@"%x", *outstart);
  -NSLog(@"%d", *outstart);
  -NSLog(@"%x", outstart[1]);
  -    NSLog(@"%@", [[NSString alloc] initWithData:[NSData dataWithBytes:outstart length:(outbuffer - outstart)] encoding:NSISOLatin1StringEncoding]);
  -
  -{
  -    UConverter *jisConv = ucnv_open("shift_jis", &myError);
  -    USet *set =  uset_open(0, 0);
  -    UErrorCode setError = U_ZERO_ERROR;
  -
  -    ucnv_getUnicodeSet( jisConv, set, UCNV_ROUNDTRIP_SET, &setError);
  -
  -    NSLog(@"set size = %d", uset_size(set));
  -    NSLog(@"contains range = %d", uset_containsRange(set, 0x10000, 0x10ffff));
  -    NSLog(@"uset_containsNoneOfRange = %d", uset_containsNoneOfRange(set, 0x10000, 0x10ffff));
  -}
  +Parrot_UInt4
  +string_unescape_one(Parrot_unescape_cb cb, Parrot_UInt4 *offset,
  +        Parrot_UInt4 input_length, void *string)
   {
  -    UConverter *utf8Conv = ucnv_open("UTF-8", &myError);
  -    USet *set =  uset_open(0, 0);
  -    UErrorCode setError = U_ZERO_ERROR;
  -
  -    ucnv_getUnicodeSet( utf8Conv, set, UCNV_ROUNDTRIP_SET, &setError);
  -
  -    NSLog(@"set size = %d", uset_size(set));
  -    NSLog(@"contains range = %d", uset_containsRange(set, 0x10000, 0x10ffff));
  -    NSLog(@"uset_containsNoneOfRange = %d", uset_containsNoneOfRange(set, 0x10000, 0x10ffff));
  -}
  -    [pool release];
  -    return 0;
  +    return u_unescapeAt(cb, offset, input_length, string);
   }
  -*/
   
   /*
   
  
  
  
  1.73      +2 -11     parrot/t/op/string.t
  
  Index: string.t
  ===================================================================
  RCS file: /cvs/public/parrot/t/op/string.t,v
  retrieving revision 1.72
  retrieving revision 1.73
  diff -u -w -r1.72 -r1.73
  --- string.t  11 Apr 2004 09:56:32 -0000      1.72
  +++ string.t  13 Apr 2004 14:18:33 -0000      1.73
  @@ -1,6 +1,6 @@
   #! perl -w
   # Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
  -# $Id: string.t,v 1.72 2004/04/11 09:56:32 leo Exp $
  +# $Id: string.t,v 1.73 2004/04/13 14:18:33 leo Exp $
   
   =head1 NAME
   
  @@ -16,7 +16,7 @@
   
   =cut
   
  -use Parrot::Test tests => 132;
  +use Parrot::Test tests => 131;
   use Test::More;
   
   output_is( <<'CODE', <<OUTPUT, "set_s_s|sc" );
  @@ -2427,15 +2427,6 @@
   CODE
   ok 1
   ok 2
  -OUTPUT
  -
  -output_is( <<'CODE', <<OUTPUT, "angstrom" );
  - chr S0, 0x212B
  - print S0
  - print "\n"
  - end
  -CODE
  -\xe2\x84\xab
   OUTPUT
   
   1;

cvs commit: parrot/t/op string.t

Reply via email to