Hi

2017-08-19 22:53 GMT+02:00 Pavel Stehule <pavel.steh...@gmail.com>:

> Hi
>
> I am sending a POC - it adds support for XPATH and XMLTABLE with a
> non-UTF8 server encoding.
>
> In this case, all strings should be converted to UTF8 before calling
> libXML2 functions, and the results should be converted back from UTF8.
>
> I found some previous experiments:
> https://marc.info/?l=pgsql-bugs&m=123407176408688
>
> Note: I have learned that the xmlNodeDump function we use is deprecated,
> so we should replace it as well at some point.
>
> Regards
>
>
I forgot to remove a debug elog from the previous patch.




> Pavel
>
>
>
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index c47624eff6..a43cf13d16 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -147,6 +147,7 @@ static int xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
 					   ArrayBuildState *astate,
 					   PgXmlErrorContext *xmlerrcxt);
 static xmlChar *pg_xmlCharStrndup(char *str, size_t len);
+static xmlChar *pg_xmlCharUtf8(char *str, size_t len);
 #endif							/* USE_LIBXML */
 
 static void xmldata_root_element_start(StringInfo result, const char *eltname,
@@ -459,8 +460,28 @@ cstring_to_xmltype(const char *string)
 static xmltype *
 xmlBuffer_to_xmltype(xmlBufferPtr buf)
 {
-	return (xmltype *) cstring_to_text_with_len((const char *) xmlBufferContent(buf),
+	if (GetDatabaseEncoding() != PG_UTF8)
+	{
+		char *utf8str = (char *) xmlBufferContent(buf);
+		char *str;
+		xmltype *result;
+
+		str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str,
+											   xmlBufferLength(buf),
+											   PG_UTF8,
+											   GetDatabaseEncoding());
+		result = (xmltype *) cstring_to_text(str);
+
+		/* Unconverted input (empty buffer, SQL_ASCII) is libxml2's memory. */
+		if (str != utf8str)
+			pfree(str);
+		return result;
+	}
+	else
+	{
+		return (xmltype *) cstring_to_text_with_len((const char *) xmlBufferContent(buf),
 												xmlBufferLength(buf));
+	}
 }
 #endif
 
@@ -1176,6 +1197,28 @@ pg_xmlCharStrndup(char *str, size_t len)
 }
 
 /*
+ * libxml2 works in UTF8 internally; some entry points convert input by
+ * themselves, others expect UTF8 strings from the caller.  Convert str
+ * (len bytes) from the database encoding to UTF8.  When the conversion
+ * hands back str itself, fall back to pg_xmlCharStrndup(str, len).
+ */
+static xmlChar *
+pg_xmlCharUtf8(char *str, size_t len)
+{
+	char *result;
+
+	result = (char *) pg_do_encoding_conversion((unsigned char *) str,
+										   len,
+										   GetDatabaseEncoding(),
+										   PG_UTF8);
+
+	if (result != str)
+		return BAD_CAST result;
+
+	return pg_xmlCharStrndup(str, len);
+}
+
+/*
  * str is the null-terminated input string.  Remaining arguments are
  * output arguments; each can be NULL if value is not wanted.
  * version and encoding are returned as locally-palloc'd strings.
@@ -3714,9 +3757,16 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt)
 	}
 	else
 	{
-		xmlChar    *str;
+		xmlChar    *utf8str;
+		char	   *str = NULL;
+
+		utf8str = xmlXPathCastNodeToString(cur);
+
+		str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str,
+										   strlen((char *) utf8str),
+										   PG_UTF8,
+										   GetDatabaseEncoding());
 
-		str = xmlXPathCastNodeToString(cur);
 		PG_TRY();
 		{
 			/* Here we rely on XML having the same representation as TEXT */
@@ -3727,11 +3777,18 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt)
 		}
 		PG_CATCH();
 		{
-			xmlFree(str);
+			if (str != (char *) utf8str)
+				pfree(str);
+
+			xmlFree(utf8str);
 			PG_RE_THROW();
 		}
 		PG_END_TRY();
-		xmlFree(str);
+
+		if (str != (char *) utf8str)
+			pfree(str);
+
+		xmlFree(utf8str);
 	}
 
 	return result;
@@ -3758,6 +3815,7 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
 	Datum		datum;
 	Oid			datumtype;
 	char	   *result_str;
+	char	   *str = NULL;
 
 	switch (xpathobj->type)
 	{
@@ -3797,7 +3855,18 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
 		case XPATH_STRING:
 			if (astate == NULL)
 				return 1;
-			datum = CStringGetDatum((char *) xpathobj->stringval);
+
+			/*
+			 * returned string is in UTF8 encoding - should be encoded
+			 * to database encoding first.
+			 */
+			str = (char *) pg_do_encoding_conversion((unsigned char *) xpathobj->stringval,
+										   strlen((char *) xpathobj->stringval),
+										   PG_UTF8,
+										   GetDatabaseEncoding());
+
+			datum = CStringGetDatum(str);
+
 			datumtype = CSTRINGOID;
 			break;
 
@@ -3812,6 +3881,7 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
 	datum = PointerGetDatum(cstring_to_xmltype(result_str));
 	(void) accumArrayResult(astate, datum, false,
 							XMLOID, CurrentMemoryContext);
+
 	return 1;
 }
 
@@ -3895,7 +3965,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
 				 errmsg("empty XPath expression")));
 
 	string = pg_xmlCharStrndup(datastr, len);
-	xpath_expr = pg_xmlCharStrndup(VARDATA_ANY(xpath_expr_text), xpath_len);
+	xpath_expr = pg_xmlCharUtf8(VARDATA_ANY(xpath_expr_text), xpath_len);
 
 	xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
 
@@ -3911,7 +3981,9 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
 		if (ctxt == NULL || xmlerrcxt->err_occurred)
 			xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
 						"could not allocate parser context");
-		doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0);
+		doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL,
+								pg_encoding_to_char(GetDatabaseEncoding()), 0);
+
 		if (doc == NULL || xmlerrcxt->err_occurred)
 			xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
 						"could not parse XML document");
@@ -3929,22 +4001,25 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
 		{
 			for (i = 0; i < ns_count; i++)
 			{
-				char	   *ns_name;
-				char	   *ns_uri;
+				text	   *ns_name;
+				text	   *ns_uri;
 
 				if (ns_names_uris_nulls[i * 2] ||
 					ns_names_uris_nulls[i * 2 + 1])
 					ereport(ERROR,
 							(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
 							 errmsg("neither namespace name nor URI may be null")));
-				ns_name = TextDatumGetCString(ns_names_uris[i * 2]);
-				ns_uri = TextDatumGetCString(ns_names_uris[i * 2 + 1]);
+				ns_name = DatumGetTextP(ns_names_uris[i * 2]);
+				ns_uri = DatumGetTextP(ns_names_uris[i * 2 + 1]);
 				if (xmlXPathRegisterNs(xpathctx,
-									   (xmlChar *) ns_name,
-									   (xmlChar *) ns_uri) != 0)
+									   pg_xmlCharUtf8(VARDATA_ANY(ns_name),
+									  				  VARSIZE(ns_name) - VARHDRSZ),
+									   pg_xmlCharUtf8(VARDATA_ANY(ns_uri),
+									  				  VARSIZE(ns_uri) - VARHDRSZ)) != 0)
 					ereport(ERROR,	/* is this an internal error??? */
 							(errmsg("could not register XML namespace with name \"%s\" and URI \"%s\"",
-									ns_name, ns_uri)));
+									TextDatumGetCString(ns_name),
+									TextDatumGetCString(ns_uri))));
 			}
 		}
 
@@ -4242,18 +4317,14 @@ XmlTableSetDocument(TableFuncScanState *state, Datum value)
 
 	xtCxt = GetXmlTableBuilderPrivateData(state, "XmlTableSetDocument");
 
-	/*
-	 * Use out function for casting to string (remove encoding property). See
-	 * comment in xml_out.
-	 */
-	str = xml_out_internal(xmlval, 0);
-
-	length = strlen(str);
+	str = VARDATA(xmlval);
+	length = VARSIZE(xmlval) - VARHDRSZ;
 	xstr = pg_xmlCharStrndup(str, length);
 
 	PG_TRY();
 	{
-		doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, 0);
+		doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL,
+								pg_encoding_to_char(GetDatabaseEncoding()), 0);
 		if (doc == NULL || xtCxt->xmlerrcxt->err_occurred)
 			xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
 						"could not parse XML document");
@@ -4301,8 +4372,8 @@ XmlTableSetNamespace(TableFuncScanState *state, char *name, char *uri)
 	xtCxt = GetXmlTableBuilderPrivateData(state, "XmlTableSetNamespace");
 
 	if (xmlXPathRegisterNs(xtCxt->xpathcxt,
-						   pg_xmlCharStrndup(name, strlen(name)),
-						   pg_xmlCharStrndup(uri, strlen(uri))))
+						   pg_xmlCharUtf8(name, strlen(name)),
+						   pg_xmlCharUtf8(uri, strlen(uri))))
 		xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_DATA_EXCEPTION,
 					"could not set XML namespace");
 #else
@@ -4328,7 +4399,7 @@ XmlTableSetRowFilter(TableFuncScanState *state, char *path)
 				(errcode(ERRCODE_DATA_EXCEPTION),
 				 errmsg("row path filter must not be empty string")));
 
-	xstr = pg_xmlCharStrndup(path, strlen(path));
+	xstr = pg_xmlCharUtf8(path, strlen(path));
 
 	xtCxt->xpathcomp = xmlXPathCompile(xstr);
 	if (xtCxt->xpathcomp == NULL || xtCxt->xmlerrcxt->err_occurred)
@@ -4359,7 +4430,7 @@ XmlTableSetColumnFilter(TableFuncScanState *state, char *path, int colnum)
 				(errcode(ERRCODE_DATA_EXCEPTION),
 				 errmsg("column path filter must not be empty string")));
 
-	xstr = pg_xmlCharStrndup(path, strlen(path));
+	xstr = pg_xmlCharUtf8(path, strlen(path));
 
 	xtCxt->xpathscomp[colnum] = xmlXPathCompile(xstr);
 	if (xtCxt->xpathscomp[colnum] == NULL || xtCxt->xmlerrcxt->err_occurred)
@@ -4502,7 +4573,15 @@ XmlTableGetValue(TableFuncScanState *state, int colnum,
 				{
 					PG_TRY();
 					{
-						cstr = pstrdup((char *) str);
+						if (GetDatabaseEncoding() != PG_UTF8)
+						{
+							cstr = (char *) pg_do_encoding_conversion((unsigned char *) str,
+															 strlen((char *) str),
+															 PG_UTF8,
+															 GetDatabaseEncoding());
+						}
+						else
+							cstr = pstrdup((char *) str);
 					}
 					PG_CATCH();
 					{
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index bcc585d427..6a43896d40 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -1452,3 +1452,24 @@ SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c
  14
 (4 rows)
 
+-- XML is saved in database encoding with original encoding declaration.
+-- There can be incosistency based on wrong user input, different server/client
+-- encoding or reading XML with recv function. All XML functions should to
+-- work with this partially broken XML.
+DO $$
+DECLARE str text;
+BEGIN
+  -- leave early without error, when we are not sure about result of conversion
+  IF current_setting('server_encoding') NOT IN ('UTF8', 'LATIN2') THEN return; END IF;
+
+  -- build valid UTF8 XML with broken encoding declaration
+  str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>'
+          || convert_from('\xf2', 'windows-1250')
+          || '</remark></vino></enprimeur>';
+
+  -- should to work
+  RAISE NOTICE '%', xpath('/enprimeur/vino/id', str::xml);
+  RAISE NOTICE '%', (SELECT id FROM xmltable('/enprimeur/vino' PASSING (str::xml) COLUMNS id int));
+END; $$;
+NOTICE:  {<id>909</id>}
+NOTICE:  909
diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql
index eb4687fb09..97a3aa9de2 100644
--- a/src/test/regress/sql/xml.sql
+++ b/src/test/regress/sql/xml.sql
@@ -558,3 +558,23 @@ INSERT INTO xmltest2 VALUES('<d><r><dc>2</dc></r></d>', 'D');
 SELECT xmltable.* FROM xmltest2, LATERAL xmltable('/d/r' PASSING x COLUMNS a int PATH '' || lower(_path) || 'c');
 SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c') PASSING x COLUMNS a int PATH '.');
 SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c') PASSING x COLUMNS a int PATH 'x' DEFAULT ascii(_path) - 54);
+
+-- XML is saved in database encoding with original encoding declaration.
+-- There can be incosistency based on wrong user input, different server/client
+-- encoding or reading XML with recv function. All XML functions should to
+-- work with this partially broken XML.
+DO $$
+DECLARE str text;
+BEGIN
+  -- leave early without error, when we are not sure about result of conversion
+  IF current_setting('server_encoding') NOT IN ('UTF8', 'LATIN2') THEN return; END IF;
+
+  -- build valid UTF8 XML with broken encoding declaration
+  str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>'
+          || convert_from('\xf2', 'windows-1250')
+          || '</remark></vino></enprimeur>';
+
+  -- should to work
+  RAISE NOTICE '%', xpath('/enprimeur/vino/id', str::xml);
+  RAISE NOTICE '%', (SELECT id FROM xmltable('/enprimeur/vino' PASSING (str::xml) COLUMNS id int));
+END; $$;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to