cvsuser 04/04/13 07:18:33
Modified: . Configure.pl
config/gen/config_h config_h.in
config/gen icu.pl
include/parrot string_primitives.h
src string.c string_primitives.c
t/op string.t
Log:
[perl #28473] [PATCH] ICU data directory configuration
Here's a patch to make the location of ICU's data files configurable,
and also to cause parrot to throw an exception at string_init time if
the data files are not found.
Courtesy of Jeff Clites <[EMAIL PROTECTED]>
[perl #28494]
Some bits of it WRT unescaping.
Changed unescape to use new code. Currently 16-bit codepoints only to
keep diff smaller (leo)
Revision Changes Path
1.140 +9 -3 parrot/Configure.pl
Index: Configure.pl
===================================================================
RCS file: /cvs/public/parrot/Configure.pl,v
retrieving revision 1.139
retrieving revision 1.140
diff -u -w -r1.139 -r1.140
--- Configure.pl 9 Apr 2004 20:31:51 -0000 1.139
+++ Configure.pl 13 Apr 2004 14:18:16 -0000 1.140
@@ -1,6 +1,6 @@
#! perl -w
# Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-# $Id: Configure.pl,v 1.139 2004/04/09 20:31:51 dan Exp $
+# $Id: Configure.pl,v 1.140 2004/04/13 14:18:16 leo Exp $
=head1 NAME
@@ -54,7 +54,7 @@
You can add and remove option values with C<< :rem{<opt>} >> and
C<< :add{<opt>} >>. For example:
- perl Configure.pl --ccflags="rem{-g} :add{-O2}"
+ perl Configure.pl --ccflags=":rem{-g} :add{-O2}"
=over
@@ -167,6 +167,10 @@
be one of: C<gc>, C<libc>, C<malloc> or C<malloc-trace>. The default is
C<gc>.
+=item C<--icudatadir=(path)>
+
+Use the given directory to locate ICU's data file(s)
+
=back
Other Options (may not be implemented)
@@ -216,7 +220,7 @@
for($key) {
/version/ && do {
- my $cvsid='$Id: Configure.pl,v 1.139 2004/04/09 20:31:51 dan Exp $';
+ my $cvsid='$Id: Configure.pl,v 1.140 2004/04/13 14:18:16 leo Exp $';
print <<"END";
Parrot Version $parrot_version Configure 2.0
$cvsid
@@ -274,6 +278,8 @@
--execcapable Use JIT to emit a native executable
--gc=(type) Determine the type of garbage collection
type=(gc|libc|malloc|malloc-trace) default is gc
+
+ --icudatadir=(path) Use the given directory to locate ICU's data file(s)
Other Options (may not be implemented):
1.22 +2 -0 parrot/config/gen/config_h/config_h.in
Index: config_h.in
===================================================================
RCS file: /cvs/public/parrot/config/gen/config_h/config_h.in,v
retrieving revision 1.21
retrieving revision 1.22
diff -u -w -r1.21 -r1.22
--- config_h.in 10 Apr 2004 09:49:15 -0000 1.21
+++ config_h.in 13 Apr 2004 14:18:20 -0000 1.22
@@ -134,6 +134,8 @@
#define PARROT_CORE_CG_OPLIB_INIT Parrot_DynOp_core_cg_${MAJOR}_${MINOR}_${PATCH}
#define PARROT_CORE_CGP_OPLIB_INIT Parrot_DynOp_core_cgp_${MAJOR}_${MINOR}_${PATCH}
+#define DEFAULT_ICU_DATA_DIR "${icudatadir}"
+
#define INTVAL_FMT "${intvalfmt}"
#define FLOATVAL_FMT "${floatvalfmt}"
1.8 +10 -3 parrot/config/gen/icu.pl
Index: icu.pl
===================================================================
RCS file: /cvs/public/parrot/config/gen/icu.pl,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -w -r1.7 -r1.8
--- icu.pl 12 Apr 2004 20:36:36 -0000 1.7
+++ icu.pl 13 Apr 2004 14:18:23 -0000 1.8
@@ -1,6 +1,6 @@
#! perl -w
# Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-# $Id: icu.pl,v 1.7 2004/04/12 20:36:36 dan Exp $
+# $Id: icu.pl,v 1.8 2004/04/13 14:18:23 leo Exp $
=head1 NAME
@@ -20,10 +20,17 @@
$description="Configuring ICU if requested...";
-@args=qw(buildicu verbose);
+@args=qw(buildicu verbose icudatadir);
sub runstep {
- my ($buildicu, $verbose) = @_;
+ my ($buildicu, $verbose, $icudatadir) = @_;
+
+ if( !defined $icudatadir )
+ {
+ $icudatadir = 'blib/lib/icu/2.6.1';
+ }
+
+ Configure::Data->set( icudatadir => $icudatadir );
# unless ($buildicu) {
# print " [Skipped] " if $verbose;
1.2 +8 -2 parrot/include/parrot/string_primitives.h
Index: string_primitives.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/string_primitives.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -w -r1.1 -r1.2
--- string_primitives.h 9 Apr 2004 20:32:24 -0000 1.1
+++ string_primitives.h 13 Apr 2004 14:18:26 -0000 1.2
@@ -1,7 +1,7 @@
/* string_funcs.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: string_primitives.h,v 1.1 2004/04/09 20:32:24 dan Exp $
+ * $Id: string_primitives.h,v 1.2 2004/04/13 14:18:26 leo Exp $
* Overview:
* This is the api header for the string subsystem
* Data Structure and Algorithms:
@@ -22,6 +22,12 @@
/* Convert from any supported encoding, into our internal format */
void string_fill_from_buffer(struct Parrot_Interp *interpreter,
const void *buffer, UINTVAL len, const char *encoding_name, STRING *s);
+
+/* Utility method which knows how to unwind a single escape sequence */
+typedef Parrot_UInt2 (*Parrot_unescape_cb)(Parrot_Int4 offset, void *context);
+Parrot_UInt4
+string_unescape_one(Parrot_unescape_cb cb,
+ Parrot_UInt4 *offset, Parrot_UInt4 input_length, void *string);
UINTVAL
Parrot_char_digit_value(struct Parrot_Interp *interpreter, UINTVAL character);
1.186 +142 -175 parrot/src/string.c
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.185
retrieving revision 1.186
diff -u -w -r1.185 -r1.186
--- string.c 11 Apr 2004 13:14:07 -0000 1.185
+++ string.c 13 Apr 2004 14:18:29 -0000 1.186
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: string.c,v 1.185 2004/04/11 13:14:07 mikescott Exp $
+$Id: string.c,v 1.186 2004/04/13 14:18:29 leo Exp $
=head1 NAME
@@ -243,8 +243,9 @@
void
string_init(void)
{
-/* XXXX: pull out into a config */
- string_set_data_directory("blib/lib/icu/2.6.1");
+/* DEFAULT_ICU_DATA_DIR is configured at build time. Need a way to
+ specify this at runtime as well. */
+ string_set_data_directory(DEFAULT_ICU_DATA_DIR);
/*
encoding_init();
chartype_init();
@@ -1626,24 +1627,28 @@
type1 *curr1 = (type1 *)s1->strstart; \
type2 *curr2 = (type2 *)s2->strstart; \
\
- while( (_index++ < minlen) && (*curr1 == *curr2) ) \
+ while( (_index < minlen) && (*curr1 == *curr2) ) \
{ \
++curr1; \
++curr2; \
+ ++_index; \
} \
+ if (_index == minlen && s1->strlen == s2->strlen) { \
+ result = 0; \
+ break; \
+ } \
+ result = *curr1 - *curr2; \
\
- *result = *curr1 - *curr2; \
- \
- if( !*result ) \
+ if( !result ) \
{ \
if( s1->strlen != s2->strlen ) \
{ \
- *result = s1->strlen > s2->strlen ? 1 : -1; \
+ result = s1->strlen > s2->strlen ? 1 : -1; \
} \
} \
else \
{ \
- *result = *result > 0 ? 1 : -1; \
+ result = result > 0 ? 1 : -1; \
} \
} while(0)
@@ -1691,13 +1696,13 @@
{
case enum_stringrep_one:
/* could use memcmp in this one case; faster?? */
- COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, &cmp);
+ COMPARE_STRINGS(Parrot_UInt1, Parrot_UInt1, s1, s2, cmp);
break;
case enum_stringrep_two:
- COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, &cmp);
+ COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt2, s1, s2, cmp);
break;
case enum_stringrep_four:
- COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, &cmp);
+ COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt4, s1, s2, cmp);
break;
default:
/* trouble! */
@@ -1731,18 +1736,18 @@
if( smaller->representation == enum_stringrep_two )
{
COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt2,
- larger, smaller, &cmp);
+ larger, smaller, cmp);
}
else /* smaller->representation == enum_stringrep_one */
{
COMPARE_STRINGS(Parrot_UInt4, Parrot_UInt1,
- larger, smaller, &cmp);
+ larger, smaller, cmp);
}
}
else /* larger->representation == enum_stringrep_two,
smaller->representation == enum_stringrep_one */
{
- COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, &cmp);
+ COMPARE_STRINGS(Parrot_UInt2, Parrot_UInt1, larger, smaller, cmp);
}
return cmp * multiplier;
@@ -3047,128 +3052,89 @@
string_unescape_cstring(struct Parrot_Interp * interpreter,
char *cstring, char delimiter)>
-Unescapes the specified C string.
+Unescapes the specified C string. These sequences are covered:
+
+ \xhh 1..2 hex digits
+ \ooo 1..3 oct digits
+ \cX control char X
+ \x{h..h} 1..8 hex digits
+ \uhhhh 4 hex digits
+ \Uhhhhhhhh 8 hex digits
+ \a, \b, \t, \n, \v, \f, \r, \e, \?
=cut
*/
-STRING *
-string_unescape_cstring(struct Parrot_Interp * interpreter,
- char *cstring, char delimiter)
-{
- char *p, *string;
- char hexdigits[] = "0123456789abcdef";
- STRING *result;
- size_t clength = strlen(cstring);
- if( !cstring || !clength ) return NULL;
-
- result = string_make(interpreter, cstring, clength, "iso-8859-1", 0);
-
- for (p = (char *)result->strstart, string = cstring ; *string; ++string)
+static Parrot_UInt2
+char8_at(Parrot_Int4 offs, void* context)
{
- if (*string == '\\' && string[1]) {
- switch (*++string) {
- case 'n':
- *p++ = '\n';
- break;
- case 'r':
- *p++ = '\r';
- break;
- case 't':
- *p++ = '\t';
- break;
- case 'a':
- *p++ = '\a';
- break;
- case 'f':
- *p++ = '\f';
- break;
- case 'e':
- *p++ = '\033';
- break;
- case '\\':
- *p++ = '\\';
- break;
- case 'x': /* XXX encoding??? */
- {
- int c1 = tolower(*++string);
- char *p1 = strchr(hexdigits, c1);
- char *p2;
- if (p1) {
- int c2 = tolower(*++string);
- p2 = strchr(hexdigits, c2);
- if (p2)
- *p++ = ((p1-hexdigits) << 4) | (p2-hexdigits);
- else {
- --string;
- *p++ = (p1-hexdigits);
- }
- }
- else {
- /* XXX warning? */
- *p++ = *--string;
- }
+ return ((char*)((STRING *)context)->strstart)[offs];
+
}
- break;
- case 'u': /* XXX encoding??? */
- {
- UINTVAL cval = 0;
- int count = 4;
- while (count-- && string[1])
+static Parrot_UInt2
+char16_at(Parrot_Int4 offs, void* context)
{
- int c1 = tolower(*++string);
- char *p1 = strchr(hexdigits, c1);
-
- if (p1) {
- cval = (cval << 4) | (p1-hexdigits);
- }
- else {
- /* XXX warning? */
- --string;
- break;
- }
+ return ((Parrot_UInt2*)((STRING *)context)->strstart)[offs];
}
- if( cval <= 0xFF )
+STRING *
+string_unescape_cstring(struct Parrot_Interp * interpreter,
+ char *cstring, char delimiter)
{
- *p++ = cval;
- }
+ size_t clength = strlen(cstring);
+ STRING *result;
+ int offs, d;
+ Parrot_UInt4 r;
+ int had_int16 = 0;
+ Parrot_unescape_cb char_at = char8_at;
+
+ if (delimiter && clength)
+ --clength;
+ result = string_make(interpreter, cstring, clength, "iso-8859-1",
+ PObj_constant_FLAG);
+ if (result->representation == enum_stringrep_two) {
+ had_int16 = 1;
+ char_at = char16_at;
+ }
+
+ for (offs = d = 0; ; ++offs) {
+ r = (char_at)(offs, result);
+ if (!r || r == (Parrot_UInt4)delimiter)
+ break;
+ if (r == '\\') {
+ ++offs;
+ r = string_unescape_one(char_at, &offs, result->strlen, result);
+ --offs;
+ /* TODO r = 0xffffffff for error */
+ if (r >= 0x100 && !had_int16) {
+ assert(r <= 0xffff); /* TODO */
+ /* current result is this */
+ result->strlen = result->bufused = clength;
+ _string_upscale(interpreter, result,
+ enum_stringrep_two, clength);
+ had_int16 = 1;
+ char_at = char16_at;
+ }
+ }
+ if (d == offs) {
+ ++d;
+ continue;
+ }
+ /* TODO create set functions too */
+ if (had_int16)
+ ((Parrot_UInt2*)result->strstart)[d++] = r;
else
- {
- /* fall back to a method which handles
- non-rep-1 strings */
- ++string;
-
- /* finish up the string so far */
- result->bufused = p - (char *)result->strstart;
- string_compute_strlen(interpreter, result);
-
- _string_unescape_cstring_large(interpreter, string,
- result, cval, delimiter);
- return string_constant_copy(interpreter, result);
- }
- }
- break;
- default:
- *p++ = *string;
- break;
- }
+ ((char*)result->strstart)[d++] = r;
}
- else if (*string == delimiter)
- break;
- else
- *p++ = *string;
+ result->strlen = d;
+ result->bufused = string_max_bytes(interpreter, result, d);
+ return result;
}
- result->bufused = p - (char *)result->strstart;
- string_compute_strlen(interpreter, result);
-
- return string_constant_copy(interpreter, result);
-}
/*
@@ -3183,6 +3149,7 @@
=cut
*/
+
STRING *
string_upcase(struct Parrot_Interp *interpreter, const STRING *s)
1.5 +21 -76 parrot/src/string_primitives.c
Index: string_primitives.c
===================================================================
RCS file: /cvs/public/parrot/src/string_primitives.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -w -r1.4 -r1.5
--- string_primitives.c 11 Apr 2004 13:14:07 -0000 1.4
+++ string_primitives.c 13 Apr 2004 14:18:29 -0000 1.5
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: string_primitives.c,v 1.4 2004/04/11 13:14:07 mikescott Exp $
+$Id: string_primitives.c,v 1.5 2004/04/13 14:18:29 leo Exp $
=head1 NAME
@@ -23,6 +23,7 @@
#include <unicode/ucnv.h>
#include <unicode/utypes.h>
#include <unicode/uchar.h>
+#include <unicode/ustring.h>
#include <assert.h>
/*
@@ -41,6 +42,19 @@
string_set_data_directory(const char *dir)
{
u_setDataDirectory(dir);
+
+ /* Since u_setDataDirectory doesn't have a result code, we'll spot
+ check that everything is okay by making sure that '9' had decimal
+ value 9. Using 57 rather than '9' so that the encoding of this
+ source code file isn't an issue.... (Don't want to get bitten by
+ EBCDIC.) */
+
+ if( !u_isdigit(57) || (u_charDigitValue(57) != 9) )
+ {
+ internal_exception(ICU_ERROR,
+ "string_set_data_directory: ICU data files not found"
+ "(apparently) for directory [%s]", dir);
+ }
}
/*
@@ -125,82 +139,13 @@
string_compute_strlen(interpreter, s);
}
-/*
-{
-
- UErrorCode myError = U_ZERO_ERROR;
- UConverter *conv = ucnv_open("ISO-8859-1", &myError);
-
- NSData *data = [NSData dataWithContentsOfFile:@"/var/tmp/ja.txt"];
-
-
- UConverter *conv2 = ucnv_open("UTF-8", &myError);
- UChar *outputBuffer = malloc(50*sizeof(UChar));
- UChar *start = outputBuffer;
- char *source = [data bytes];
- UErrorCode convError = 0;
-
- ucnv_toUnicode(conv2, &outputBuffer, &(outputBuffer[50]), &source, source + [data length], NULL, TRUE, &convError);
-
- NSLog(@"conv error = %i", convError);
- NSLog(@"conv length = %i", outputBuffer - start);
-
-
- UConverter *conv3 = ucnv_open("UCS-2", &myError); //ISO-8859-1"
- char *outbuffer = malloc(50);
- char *outstart = outbuffer;
- UErrorCode redoErr = 0;
- NSLog(@"conv3 = %p", conv3);
- NSLog(@"%s", ucnv_getName(conv3, &redoErr));
-
- UErrorCode callbackStatus = 0;
-
- ucnv_setFromUCallBack(conv3,
- UCNV_FROM_U_CALLBACK_STOP, //UCNV_FROM_U_CALLBACK_ESCAPE
- NULL,
- NULL,
- NULL,
- &callbackStatus);
-
- ucnv_fromUnicode(conv3, &outbuffer, outbuffer + 50, &start, outputBuffer, NULL, TRUE, &redoErr);
-// ucnv_fromUnicode(conv3, NULL, NULL, &start, outputBuffer, NULL, TRUE, &redoErr);
-
- NSLog(@"redo error = %i", redoErr);
- NSLog(@"%s", u_errorName(redoErr));
- NSLog(@"redo length = %i", outbuffer - outstart);
-
-NSLog(@"%x", *outstart);
-NSLog(@"%d", *outstart);
-NSLog(@"%x", outstart[1]);
- NSLog(@"%@", [[NSString alloc] initWithData:[NSData dataWithBytes:outstart length:(outbuffer - outstart)] encoding:NSISOLatin1StringEncoding]);
-
-{
- UConverter *jisConv = ucnv_open("shift_jis", &myError);
- USet *set = uset_open(0, 0);
- UErrorCode setError = U_ZERO_ERROR;
-
- ucnv_getUnicodeSet( jisConv, set, UCNV_ROUNDTRIP_SET, &setError);
-
- NSLog(@"set size = %d", uset_size(set));
- NSLog(@"contains range = %d", uset_containsRange(set, 0x10000, 0x10ffff));
- NSLog(@"uset_containsNoneOfRange = %d", uset_containsNoneOfRange(set, 0x10000, 0x10ffff));
-}
+Parrot_UInt4
+string_unescape_one(Parrot_unescape_cb cb, Parrot_UInt4 *offset,
+ Parrot_UInt4 input_length, void *string)
{
- UConverter *utf8Conv = ucnv_open("UTF-8", &myError);
- USet *set = uset_open(0, 0);
- UErrorCode setError = U_ZERO_ERROR;
-
- ucnv_getUnicodeSet( utf8Conv, set, UCNV_ROUNDTRIP_SET, &setError);
-
- NSLog(@"set size = %d", uset_size(set));
- NSLog(@"contains range = %d", uset_containsRange(set, 0x10000, 0x10ffff));
- NSLog(@"uset_containsNoneOfRange = %d", uset_containsNoneOfRange(set, 0x10000, 0x10ffff));
-}
- [pool release];
- return 0;
+ return u_unescapeAt(cb, offset, input_length, string);
}
-*/
/*
1.73 +2 -11 parrot/t/op/string.t
Index: string.t
===================================================================
RCS file: /cvs/public/parrot/t/op/string.t,v
retrieving revision 1.72
retrieving revision 1.73
diff -u -w -r1.72 -r1.73
--- string.t 11 Apr 2004 09:56:32 -0000 1.72
+++ string.t 13 Apr 2004 14:18:33 -0000 1.73
@@ -1,6 +1,6 @@
#! perl -w
# Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-# $Id: string.t,v 1.72 2004/04/11 09:56:32 leo Exp $
+# $Id: string.t,v 1.73 2004/04/13 14:18:33 leo Exp $
=head1 NAME
@@ -16,7 +16,7 @@
=cut
-use Parrot::Test tests => 132;
+use Parrot::Test tests => 131;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "set_s_s|sc" );
@@ -2427,15 +2427,6 @@
CODE
ok 1
ok 2
-OUTPUT
-
-output_is( <<'CODE', <<OUTPUT, "angstrom" );
- chr S0, 0x212B
- print S0
- print "\n"
- end
-CODE
-\xe2\x84\xab
OUTPUT
1;