On Thu, Feb 9, 2012 at 03:21, Christoph Berg <c...@df7cb.de> wrote:
> Hi,
>
> we have a database that is storing strings in various encodings (and
> non-encodings, namely the arbitrary byte soup [ ... ]
> For this reason, the database uses
> sql_ascii encoding

> ...snip...

> In sql_ascii databases, utf_e2u does not do any recoding, but then
> SvUTF8_on still marks the string as utf-8, while it isn't.
>
> (Returned values might also need fixing.)
>
> In my view, this is clearly a bug in pl/perl on sql_ascii databases.

Yeah, there was some musing about this over in:
http://archives.postgresql.org/pgsql-hackers/2011-02/msg01142.php

Seems like we missed the fact that we still did SvUTF8_on() in cstr2sv
(when turning a cstring into a perl string) and SvPVutf8() in sv2cstr
(when turning a perl string into a cstring).

With the attached I get:
=> create or replace function perl_white(a text) returns text as $$
return shift; $$ language plperlu;
=> select perl_white(E'\200'), perl_white(E'\200')::bytea,
coalesce(perl_white(E'\200'), 'null');
 perl_white | perl_white | coalesce
------------+------------+----------
            | \x80       |

=> select perl_white(E'\401');
 perl_white
------------
 \x01
(1 row)

Does the attached fix the issue for you?

I'll note that all the PLs seem to behave a bit differently:

=> create or replace function py_white(a text) returns text as $$
return a; $$ language plpython3u;
=> select py_white(E'\200'), py_white(E'\200')::bytea,
coalesce(py_white(E'\200'), 'null');
py_white | py_white | coalesce
----------+----------+----------
          |          | null
(1 row)

=> select py_white(E'\401');
 py_white
----------
 \x01
(1 row)

=> create or replace function tcl_white(text) returns text as $$
return $1; $$ language pltcl;
=> select tcl_white(E'\200'), tcl_white(E'\200')::bytea,
coalesce(tcl_white(E'\200'), 'null');
 tcl_white | tcl_white | coalesce
-----------+-----------+----------
           | \x80      |

 => select tcl_white(E'\402');
 tcl_white
-----------
 \x02
(1 row)
*** a/src/pl/plperl/plperl_helpers.h
--- b/src/pl/plperl/plperl_helpers.h
***************
*** 5,23 ****
   * convert from utf8 to database encoding
   */
  static inline char *
! utf_u2e(const char *utf8_str, size_t len)
  {
! 	int 	    enc = GetDatabaseEncoding();
! 
! 	char	   *ret = (char *) pg_do_encoding_conversion((unsigned char *) utf8_str, len, PG_UTF8, enc);
  
  	/*
! 	* when we are a PG_UTF8 or SQL_ASCII database
! 	* pg_do_encoding_conversion() will not do any conversion or
! 	* verification. we need to do it manually instead.
  	*/
  	if (enc == PG_UTF8 || enc == PG_SQL_ASCII)
! 		pg_verify_mbstr_len(PG_UTF8, utf8_str, len, false);
  
  	if (ret == utf8_str)
  		ret = pstrdup(ret);
--- 5,24 ----
   * convert from utf8 to database encoding
   */
  static inline char *
! utf_u2e(char *utf8_str, size_t len)
  {
! 	int		   enc = GetDatabaseEncoding();
! 	char	   *ret = utf8_str;
  
  	/*
! 	* when we are a PG_UTF8 or SQL_ASCII database pg_do_encoding_conversion()
! 	* will not do any conversion or verification. we need to do it manually
! 	* instead.
  	*/
  	if (enc == PG_UTF8 || enc == PG_SQL_ASCII)
! 		pg_verify_mbstr_len(enc, utf8_str, len, false);
! 	else
! 		ret = (char *) pg_do_encoding_conversion((unsigned char *) utf8_str, len, PG_UTF8, enc);
  
  	if (ret == utf8_str)
  		ret = pstrdup(ret);
***************
*** 66,72 **** sv2cstr(SV *sv)
  		 * we are done */
  		SvREFCNT_inc(sv);
  
! 	val = SvPVutf8(sv, len);
  
  	/*
  	 * we use perl's length in the event we had an embedded null byte to ensure
--- 67,80 ----
  		 * we are done */
  		SvREFCNT_inc(sv);
  
! 	/*
! 	 * when SQL_ASCII just treat it as byte soup, that is fetch the string out
! 	 * however it is currently stored by perl
! 	 */
! 	if (GetDatabaseEncoding() == PG_SQL_ASCII)
! 		val = SvPV(sv, len);
! 	else
! 		val = SvPVutf8(sv, len);
  
  	/*
  	 * we use perl's length in the event we had an embedded null byte to ensure
***************
*** 89,99 **** static inline SV *
  cstr2sv(const char *str)
  {
  	SV		   *sv;
! 	char	   *utf8_str = utf_e2u(str);
  
  	sv = newSVpv(utf8_str, 0);
  	SvUTF8_on(sv);
- 
  	pfree(utf8_str);
  
  	return sv;
--- 97,112 ----
  cstr2sv(const char *str)
  {
  	SV		   *sv;
! 	char	   *utf8_str;
! 
! 	/* no conversion when SQL_ASCII */
! 	if (GetDatabaseEncoding() == PG_SQL_ASCII)
! 		return newSVpv(str, 0);
! 
! 	utf8_str = utf_e2u(str);
  
  	sv = newSVpv(utf8_str, 0);
  	SvUTF8_on(sv);
  	pfree(utf8_str);
  
  	return sv;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to