Here is the UTF8 patch. Is this something we want to apply now?
---------------------------------------------------------------------------
Dominic Mitchell wrote:
> Before Christmas, I started a thread in dbi-users about the support for
> setting the utf8 flag on returned values[1]. I've got another patch
> now, which is less intrusive. This adds $dbh->{pg_do_utf8}, which will
> turn on marking returned data as UTF-8 if necessary.
>
> I would like this to be considered for inclusion with DBD::Pg, as I feel
> it's necessary to correct broken behaviour that I am seeing.
>
> I'm aware that Tim Bunce thinks that a better interface should be found
> for this sort of thing, and I agree. But unfortunately, I need to get
> this problem solved, and the attached patch would be an extremely useful
> stop gap measure.
>
> Thanks,
> -Dom
>
> [1] http:[EMAIL PROTECTED]/msg15428.html
>
> --
> | Semantico: creators of major online resources |
> | URL: http://www.semantico.com/ |
> | Tel: +44 (1273) 722222 |
> | Address: 33 Bond St., Brighton, Sussex, BN1 1RD, UK. |
> ? TESTLOG
> ? TESTLOG-commented-out-utf8-bits
> ? t/.nfs00a6dec800000014
> Index: Pg.pm
> ===================================================================
> RCS file: /usr/local/cvsroot/dbdpg/dbdpg/Pg.pm,v
> retrieving revision 1.17
> diff -u -r1.17 Pg.pm
> --- Pg.pm 30 Dec 2002 04:59:05 -0000 1.17
> +++ Pg.pm 10 Jan 2003 11:59:59 -0000
> @@ -1288,6 +1288,15 @@
> escaped by a backslash. Any other ASCII character can be used directly in a
> string constant.
>
> +=item B<pg_do_utf8> (boolean)
> +
> +PostgreSQL specific attribute. If true, then the utf8 flag will be
> +turned on for returned character data (if the data is valid utf8). For
> +details about the utf8 flag, see L<Encode>. This is only relevant under
> +perl 5.8 and higher.
> +
> +B<NB>: This attribute is experimental and may be subject to change.
> +
> =item B<pg_INV_READ> (integer, read-only)
>
> Constant to be used for the mode in lo_creat and lo_open.
> Index: dbdimp.c
> ===================================================================
> RCS file: /usr/local/cvsroot/dbdpg/dbdpg/dbdimp.c,v
> retrieving revision 1.10
> diff -u -r1.10 dbdimp.c
> --- dbdimp.c 8 Jan 2003 22:08:17 -0000 1.10
> +++ dbdimp.c 10 Jan 2003 12:00:01 -0000
> @@ -470,6 +470,8 @@
> imp_dbh->pg_auto_escape = newval;
> } else if (kl==10 && strEQ(key, "pg_bool_tf")) {
> imp_dbh->pg_bool_tf = newval;
> + } else if (kl==10 && strEQ(key, "pg_do_utf8")) {
> + imp_dbh->pg_do_utf8 = newval;
> } else {
> return 0;
> }
> @@ -494,6 +496,8 @@
> retsv = newSViv((IV)imp_dbh->pg_auto_escape);
> } else if (kl==10 && strEQ(key, "pg_bool_tf")) {
> retsv = newSViv((IV)imp_dbh->pg_bool_tf);
> + } else if (kl==10 && strEQ(key, "pg_do_utf8")) {
> + retsv = newSViv((IV)imp_dbh->pg_do_utf8);
> } else if (kl==11 && strEQ(key, "pg_INV_READ")) {
> retsv = newSViv((IV)INV_READ);
> } else if (kl==12 && strEQ(key, "pg_INV_WRITE")) {
> @@ -1332,6 +1336,15 @@
> }
>
>
> +int /* 1 if any byte of the NUL-terminated string val has its high bit (0x80) set, else 0 */
> +is_high_bit_set(val)
> + char *val;
> +{
> + for (; *val; val++) /* test each byte before advancing, so the first byte is not skipped */
> + if (*val & 0x80) return 1;
> + return 0;
> +}
> +
> AV *
> dbd_st_fetch (sth, imp_sth)
> SV *sth;
> @@ -1403,6 +1416,14 @@
> val[val_len] = '\0';
> }
> sv_setpvn(sv, val, val_len);
> + if (imp_dbh->pg_do_utf8) {
> + SvUTF8_off(sv);
> + /* XXX Is this all the character data types? */
> + if (18 == type || 25 == type || 1042 ==type || 1043 == type) {
> + if (is_high_bit_set(val) && is_utf8_string(val, val_len))
> + SvUTF8_on(sv);
> + }
> + }
> }
> }
>
> Index: dbdimp.h
> ===================================================================
> RCS file: /usr/local/cvsroot/dbdpg/dbdpg/dbdimp.h,v
> retrieving revision 1.4
> diff -u -r1.4 dbdimp.h
> --- dbdimp.h 8 Jan 2003 22:08:17 -0000 1.4
> +++ dbdimp.h 10 Jan 2003 12:00:01 -0000
> @@ -23,6 +23,7 @@
> int init_commit; /* initialize AutoCommit */
> int pg_auto_escape; /* initialize AutoEscape */
> int pg_bool_tf; /* do bools return 't'/'f' */
> + int pg_do_utf8; /* should we attempt to make utf8 strings? */
> };
>
> /* Define sth implementor data structure */
> Index: t/05fetch.t
> ===================================================================
> RCS file: /usr/local/cvsroot/dbdpg/dbdpg/t/05fetch.t,v
> retrieving revision 1.3
> diff -u -r1.3 05fetch.t
> --- t/05fetch.t 27 Nov 2002 09:24:36 -0000 1.3
> +++ t/05fetch.t 10 Jan 2003 12:00:01 -0000
> @@ -3,7 +3,7 @@
> use Test::More;
>
> if (defined $ENV{DBI_DSN}) {
> - plan tests => 7;
> + plan tests => 10;
> } else {
> plan skip_all => 'cannot test without DB info';
> }
> @@ -80,6 +80,30 @@
> ok($rows == 1,
> 'fetch one row on id'
> );
> +
> +# Attempt to test whether or not we can get unicode out of the database
> +# correctly. Reuse the previous sth.
> +SKIP: {
> + eval "use Encode";
> + skip "need Encode module for unicode tests", 3 if $@;
> + local $dbh->{pg_do_utf8} = 1;
> + $dbh->do("INSERT INTO test (id, name, val) VALUES (4, '\x{0100}dam', 'cow')");
> + $sth->execute(4);
> + my ($id, $name) = $sth->fetchrow_array();
> + ok(Encode::is_utf8($name),
> + 'returned data has utf8 bit set'
> + );
> + is(length($name), 4,
> + 'returned utf8 data is not corrupted'
> + );
> + $sth->finish();
> + $sth->execute(1);
> + my ($id2, $name2) = $sth->fetchrow_array();
> + ok(! Encode::is_utf8($name2),
> + 'returned ASCII data has not got utf8 bit set'
> + );
> + $sth->finish();
> +}
>
> $sql = <<SQL;
> SELECT id
--
Bruce Momjian | http://candle.pha.pa.us
[EMAIL PROTECTED] | (610) 359-1001
+ If your life is a hard drive, | 13 Roberts Road
+ Christ can be your backup. | Newtown Square, Pennsylvania 19073