Re: [HACKERS] pl/perl and utf-8 in sql_ascii databases

Kyotaro HORIGUCHI Thu, 21 Jun 2012 04:23:45 -0700

Hello,

> > Seems like we missed the fact that we still did SvUTF8_on() in sv2cstr
> > and SvPVUTF8() when turning a perl string into a cstring.
> 
> Right.


I spent a bit longer time catching on pl/perl and now understand
what is the problem...

> So I played a bit with this patch, and touched it a bit mainly just to
> add some more comments; and while at it I noticed that some of the
> functions in Util.xs might leak some memory, so I made an attempt to
> plug them, as in the attached patch (which supersedes yours).

Ok, Is it ok to look into the newer patch including fix of leaks
at first?

-- Coding and styles.

This also seems to have polished the previous one on some codes,
styles and comments which generally look reasonable. And patch
style was corrected into unified.

-- Functions
I seems to work properly on the database the encodings of which
are SQL_ASCII and UTF8 (and EUC-JP) as below,

=================
=> create or replace function foo(text) returns text language plperlu as $$ $a 
= shift; return "BOO!" if ($a != "a\x80cあ"); return $a; $$;
SQL_ASCII=> select foo(E'a\200cあ') = E'a\200cあ';
 ?column? 
----------
 t
UTF8=> select foo(E'a\200cあ');
ERROR:  invalid byte sequence for encoding "UTF8": 0x80
UTF8=> select foo(E'a\302\200cあ') = E'a\u0080cあ';
 ?column? 
----------
 t
=================

This looks quite valid according to the definition of the
encodings and perl's nature as far as I see.


-- The others

Variable naming in util_quote_*() seems a bit confusing,

>      text *arg = sv2text(sv);
>      text *ret = DatumGetTextP(..., PointerGetDatum(arg)));
>      char *str;
>      pfree(arg);
>      str = text_to_cstring(ret);
>      RETVAL = cstr2sv(str);
>      pfree(str);

Renaming ret to quoted and str to ret as the patch attached might
make it easily readable.


> Now, with my version of the patch applied and using a SQL_ASCII database
> to test the problem in the original report, I notice that we now have a
> regression failure:
snip.
> I'm not really sure what to do here -- maybe have a second expected file
> for that test is a good enough answer?  Or should I just take the test
> out?  Opinions please.


The attached ugly patch does it. We seem should put NO_LOCALE=1
on the 'make check' command line for the encodings not compatible
with the environmental locale, although it looks work.

# UtfToLocal() seems to have a bug that always report illegal
# encoding was "UTF8" regardless of the real encoding. But
# plper_lc_*.(sql|out) increases if the bug is fixed.

regards,

-- 
Kyotaro Horiguchi
NTT Open Source Software Center

== My e-mail address has been changed since Apr. 1, 2012.

diff --git a/src/pl/plperl/Util.xs b/src/pl/plperl/Util.xs
index 7d0102b..4b4b680 100644
--- a/src/pl/plperl/Util.xs
+++ b/src/pl/plperl/Util.xs
@@ -67,8 +67,11 @@ static text *
 sv2text(SV *sv)
 {
 	char	   *str = sv2cstr(sv);
+	text	   *text;
 
-	return cstring_to_text(str);
+	text = cstring_to_text(str);
+	pfree(str);
+	return text;
 }
 
 MODULE = PostgreSQL::InServer::Util PREFIX = util_
@@ -113,10 +116,12 @@ util_quote_literal(sv)
     }
     else {
         text *arg = sv2text(sv);
-        text *ret = DatumGetTextP(DirectFunctionCall1(quote_literal, PointerGetDatum(arg)));
-		char *str = text_to_cstring(ret);
-		RETVAL = cstr2sv(str);
-		pfree(str);
+        text *quoted = DatumGetTextP(DirectFunctionCall1(quote_literal, PointerGetDatum(arg)));
+        char *ret;
+        pfree(arg);
+        ret = text_to_cstring(quoted);
+        RETVAL = cstr2sv(ret);
+        pfree(ret);
     }
     OUTPUT:
     RETVAL
@@ -132,10 +137,12 @@ util_quote_nullable(sv)
     else
 	{
         text *arg = sv2text(sv);
-        text *ret = DatumGetTextP(DirectFunctionCall1(quote_nullable, PointerGetDatum(arg)));
-		char *str = text_to_cstring(ret);
-		RETVAL = cstr2sv(str);
-		pfree(str);
+        text *quoted = DatumGetTextP(DirectFunctionCall1(quote_nullable, PointerGetDatum(arg)));
+        char *ret;
+        pfree(arg);
+        ret = text_to_cstring(quoted);
+        RETVAL = cstr2sv(ret);
+        pfree(ret);
     }
     OUTPUT:
     RETVAL
@@ -145,14 +152,15 @@ util_quote_ident(sv)
     SV *sv
     PREINIT:
         text *arg;
-        text *ret;
-		char *str;
+        text *quoted;
+        char *ret;
     CODE:
         arg = sv2text(sv);
-        ret = DatumGetTextP(DirectFunctionCall1(quote_ident, PointerGetDatum(arg)));
-		str = text_to_cstring(ret);
-		RETVAL = cstr2sv(str);
-		pfree(str);
+        quoted = DatumGetTextP(DirectFunctionCall1(quote_ident, PointerGetDatum(arg)));
+        pfree(arg);
+		ret = text_to_cstring(quoted);
+        RETVAL = cstr2sv(ret);
+        pfree(ret);
     OUTPUT:
     RETVAL
 
diff --git a/src/pl/plperl/plperl_helpers.h b/src/pl/plperl/plperl_helpers.h
index 1b6648b..ed99194 100644
--- a/src/pl/plperl/plperl_helpers.h
+++ b/src/pl/plperl/plperl_helpers.h
@@ -3,21 +3,29 @@
 
 /*
  * convert from utf8 to database encoding
+ *
+ * Returns a palloc'ed copy of the original string
  */
 static inline char *
-utf_u2e(const char *utf8_str, size_t len)
+utf_u2e(char *utf8_str, size_t len)
 {
 	int			enc = GetDatabaseEncoding();
-
-	char	   *ret = (char *) pg_do_encoding_conversion((unsigned char *) utf8_str, len, PG_UTF8, enc);
+	char	   *ret;
 
 	/*
-	 * when we are a PG_UTF8 or SQL_ASCII database pg_do_encoding_conversion()
-	 * will not do any conversion or verification. we need to do it manually
-	 * instead.
+	 * When we are in a PG_UTF8 or SQL_ASCII database
+	 * pg_do_encoding_conversion() will not do any conversion (which is good)
+	 * or verification (not so much), so we need to run the verification step
+	 * separately.
 	 */
 	if (enc == PG_UTF8 || enc == PG_SQL_ASCII)
-		pg_verify_mbstr_len(PG_UTF8, utf8_str, len, false);
+	{
+		pg_verify_mbstr_len(enc, utf8_str, len, false);
+		ret = utf8_str;
+	}
+	else
+		ret = (char *) pg_do_encoding_conversion((unsigned char *) utf8_str,
+												 len, PG_UTF8, enc);
 
 	if (ret == utf8_str)
 		ret = pstrdup(ret);
@@ -27,11 +35,15 @@ utf_u2e(const char *utf8_str, size_t len)
 
 /*
  * convert from database encoding to utf8
+ *
+ * Returns a palloc'ed copy of the original string
  */
 static inline char *
 utf_e2u(const char *str)
 {
-	char	   *ret = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), GetDatabaseEncoding(), PG_UTF8);
+	char	   *ret =
+		(char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
+										   GetDatabaseEncoding(), PG_UTF8);
 
 	if (ret == str)
 		ret = pstrdup(ret);
@@ -41,6 +53,8 @@ utf_e2u(const char *str)
 
 /*
  * Convert an SV to a char * in the current database encoding
+ *
+ * Returns a palloc'ed copy of the original string
  */
 static inline char *
 sv2cstr(SV *sv)
@@ -51,7 +65,9 @@ sv2cstr(SV *sv)
 
 	/*
 	 * get a utf8 encoded char * out of perl. *note* it may not be valid utf8!
-	 *
+	 */
+
+	/*
 	 * SvPVutf8() croaks nastily on certain things, like typeglobs and
 	 * readonly objects such as $^V. That's a perl bug - it's not supposed to
 	 * happen. To avoid crashing the backend, we make a copy of the sv before
@@ -63,18 +79,27 @@ sv2cstr(SV *sv)
 		(SvTYPE(sv) > SVt_PVLV && SvTYPE(sv) != SVt_PVFM))
 		sv = newSVsv(sv);
 	else
-
+	{
 		/*
 		 * increase the reference count so we can just SvREFCNT_dec() it when
 		 * we are done
 		 */
 		SvREFCNT_inc_simple_void(sv);
+	}
 
-	val = SvPVutf8(sv, len);
+	/*
+	 * Request the string from Perl, in UTF-8 encoding; but if we're in a
+	 * SQL_ASCII database, just request the byte soup without trying to make it
+	 * UTF8, because that might fail.
+	 */
+	if (GetDatabaseEncoding() == PG_SQL_ASCII)
+		val = SvPV(sv, len);
+	else
+		val = SvPVutf8(sv, len);
 
 	/*
-	 * we use perl's length in the event we had an embedded null byte to
-	 * ensure we error out properly
+	 * Now convert to database encoding.  We use perl's length in the event we
+	 * had an embedded null byte to ensure we error out properly.
 	 */
 	res = utf_u2e(val, len);
 
@@ -88,16 +113,20 @@ sv2cstr(SV *sv)
  * Create a new SV from a string assumed to be in the current database's
  * encoding.
  */
-
 static inline SV *
 cstr2sv(const char *str)
 {
 	SV		   *sv;
-	char	   *utf8_str = utf_e2u(str);
+	char	   *utf8_str;
+
+	/* no conversion when SQL_ASCII */
+	if (GetDatabaseEncoding() == PG_SQL_ASCII)
+		return newSVpv(str, 0);
+
+	utf8_str = utf_e2u(str);
 
 	sv = newSVpv(utf8_str, 0);
 	SvUTF8_on(sv);
-
 	pfree(utf8_str);
 
 	return sv;

diff --git a/src/pl/plperl/GNUmakefile b/src/pl/plperl/GNUmakefile
index 188d7d2..8ab90a6 100644
--- a/src/pl/plperl/GNUmakefile
+++ b/src/pl/plperl/GNUmakefile
@@ -44,7 +44,9 @@ PERLCHUNKS = plc_perlboot.pl plc_trusted.pl
 SHLIB_LINK = $(perl_embed_ldflags)
 
 REGRESS_OPTS = --dbname=$(PL_TESTDB) --load-extension=plperl  --load-extension=plperlu
-REGRESS = plperl plperl_trigger plperl_shared plperl_elog plperl_util plperl_init plperlu plperl_array
+REGRESS_LC0 = $(subst .sql,,$(shell cd sql; ls plperl_lc_$(shell echo $(ENCODING) | tr "A-Z-" "a-z_").sql 2>/dev/null))
+REGRESS_LC = $(if $(REGRESS_LC0),$(REGRESS_LC0),plperl_lc)
+REGRESS = plperl $(REGRESS_LC) plperl_trigger plperl_shared plperl_elog plperl_util plperl_init plperlu plperl_array
 # if Perl can support two interpreters in one backend,
 # test plperl-and-plperlu cases
 ifneq ($(PERL),)
diff --git a/src/pl/plperl/expected/plperl.out b/src/pl/plperl/expected/plperl.out
index df54937..906dc15 100644
--- a/src/pl/plperl/expected/plperl.out
+++ b/src/pl/plperl/expected/plperl.out
@@ -650,16 +650,6 @@ CONTEXT:  PL/Perl anonymous code block
 DO $do$ use warnings FATAL => qw(void) ; my @y; my $x = sort @y; 1; $do$ LANGUAGE plperl;
 ERROR:  Useless use of sort in scalar context at line 1.
 CONTEXT:  PL/Perl anonymous code block
---
--- Make sure strings are validated
--- Should fail for all encodings, as nul bytes are never permitted.
---
-CREATE OR REPLACE FUNCTION perl_zerob() RETURNS TEXT AS $$
-  return "abcd\0efg";
-$$ LANGUAGE plperl;
-SELECT perl_zerob();
-ERROR:  invalid byte sequence for encoding "UTF8": 0x00
-CONTEXT:  PL/Perl function "perl_zerob"
 -- make sure functions marked as VOID without an explicit return work
 CREATE OR REPLACE FUNCTION myfuncs() RETURNS void AS $$
    $_SHARED{myquote} = sub {
diff --git a/src/pl/plperl/expected/plperl_lc.out b/src/pl/plperl/expected/plperl_lc.out
new file mode 100644
index 0000000..4f8c08f
--- /dev/null
+++ b/src/pl/plperl/expected/plperl_lc.out
@@ -0,0 +1,10 @@
+--
+-- Make sure strings are validated
+-- Should fail for all encodings, as nul bytes are never permitted.
+--
+CREATE OR REPLACE FUNCTION perl_zerob() RETURNS TEXT AS $$
+  return "abcd\0efg";
+$$ LANGUAGE plperl;
+SELECT perl_zerob();
+ERROR:  invalid byte sequence for encoding "UTF8": 0x00
+CONTEXT:  PL/Perl function "perl_zerob"
diff --git a/src/pl/plperl/expected/plperl_lc_sql_ascii.out b/src/pl/plperl/expected/plperl_lc_sql_ascii.out
new file mode 100644
index 0000000..022c3e2
--- /dev/null
+++ b/src/pl/plperl/expected/plperl_lc_sql_ascii.out
@@ -0,0 +1,10 @@
+--
+-- Make sure strings are validated
+-- Should fail for all encodings, as nul bytes are never permitted.
+--
+CREATE OR REPLACE FUNCTION perl_zerob() RETURNS TEXT AS $$
+  return "abcd\0efg";
+$$ LANGUAGE plperl;
+SELECT perl_zerob();
+ERROR:  invalid byte sequence for encoding "SQL_ASCII": 0x00
+CONTEXT:  PL/Perl function "perl_zerob"
diff --git a/src/pl/plperl/sql/plperl.sql b/src/pl/plperl/sql/plperl.sql
index 84af1fd..a5e3840 100644
--- a/src/pl/plperl/sql/plperl.sql
+++ b/src/pl/plperl/sql/plperl.sql
@@ -423,15 +423,6 @@ DO $do$ use strict; my $name = "foo"; my $ref = $$name; $do$ LANGUAGE plperl;
 -- yields "ERROR:  Useless use of sort in scalar context."
 DO $do$ use warnings FATAL => qw(void) ; my @y; my $x = sort @y; 1; $do$ LANGUAGE plperl;
 
---
--- Make sure strings are validated
--- Should fail for all encodings, as nul bytes are never permitted.
---
-CREATE OR REPLACE FUNCTION perl_zerob() RETURNS TEXT AS $$
-  return "abcd\0efg";
-$$ LANGUAGE plperl;
-SELECT perl_zerob();
-
 -- make sure functions marked as VOID without an explicit return work
 CREATE OR REPLACE FUNCTION myfuncs() RETURNS void AS $$
    $_SHARED{myquote} = sub {
diff --git a/src/pl/plperl/sql/plperl_lc.sql b/src/pl/plperl/sql/plperl_lc.sql
new file mode 100644
index 0000000..a4a06e7
--- /dev/null
+++ b/src/pl/plperl/sql/plperl_lc.sql
@@ -0,0 +1,8 @@
+--
+-- Make sure strings are validated
+-- Should fail for all encodings, as nul bytes are never permitted.
+--
+CREATE OR REPLACE FUNCTION perl_zerob() RETURNS TEXT AS $$
+  return "abcd\0efg";
+$$ LANGUAGE plperl;
+SELECT perl_zerob();
diff --git a/src/pl/plperl/sql/plperl_lc_sql_ascii.sql b/src/pl/plperl/sql/plperl_lc_sql_ascii.sql
new file mode 120000
index 0000000..9da97db
--- /dev/null
+++ b/src/pl/plperl/sql/plperl_lc_sql_ascii.sql
@@ -0,0 +1 @@
+plperl_lc.sql
\ No newline at end of file

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] pl/perl and utf-8 in sql_ascii databases

Reply via email to