Hi

I am sending a proof of concept (POC) - it supports XPATH and XMLTABLE for
non-UTF8 server encodings.

In this case, all strings should be converted to UTF8 before calling libXML2
functions, and the results should be converted back from UTF8.

I found some previous experiments:
https://marc.info/?l=pgsql-bugs&m=123407176408688

Note: I learned that the xmlNodeDump function we use is deprecated -
so we should replace it too at some point.

Regards

Pavel
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index c47624eff6..d02cec88ab 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -147,6 +147,7 @@ static int xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
 					   ArrayBuildState *astate,
 					   PgXmlErrorContext *xmlerrcxt);
 static xmlChar *pg_xmlCharStrndup(char *str, size_t len);
+static xmlChar *pg_xmlCharUtf8(char *str, size_t len);
 #endif							/* USE_LIBXML */
 
 static void xmldata_root_element_start(StringInfo result, const char *eltname,
@@ -459,8 +460,28 @@ cstring_to_xmltype(const char *string)
 static xmltype *
 xmlBuffer_to_xmltype(xmlBufferPtr buf)
 {
-	return (xmltype *) cstring_to_text_with_len((const char *) xmlBufferContent(buf),
+	if (GetDatabaseEncoding() != PG_UTF8)
+	{
+		char *utf8str = (char *) xmlBufferContent(buf);
+		char *str;
+		xmltype *result;
+
+		str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str,
+											   xmlBufferLength(buf),
+											   PG_UTF8,
+											   GetDatabaseEncoding());
+
+		Assert(str != utf8str);
+		result = (xmltype *) cstring_to_text(str);
+		pfree(str);
+
+		return result;
+	}
+	else
+	{
+		return (xmltype *) cstring_to_text_with_len((const char *) xmlBufferContent(buf),
 												xmlBufferLength(buf));
+	}
 }
 #endif
 
@@ -1176,6 +1197,28 @@ pg_xmlCharStrndup(char *str, size_t len)
 }
 
 /*
+ * LibXML2's internal encoding is UTF8. Sometimes LibXML2 converts
+ * its input to UTF8 by itself, sometimes it expects UTF8 strings.
+ * This function converts a string from the database encoding to
+ * UTF8.
+ */
+static xmlChar *
+pg_xmlCharUtf8(char *str, size_t len)
+{
+	char *result;
+
+	result = (char *) pg_do_encoding_conversion((unsigned char *) str,
+										   len,
+										   GetDatabaseEncoding(),
+										   PG_UTF8);
+
+	if (result != str)
+		return BAD_CAST result;
+
+	return pg_xmlCharStrndup(str, len);
+}
+
+/*
  * str is the null-terminated input string.  Remaining arguments are
  * output arguments; each can be NULL if value is not wanted.
  * version and encoding are returned as locally-palloc'd strings.
@@ -3714,9 +3757,16 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt)
 	}
 	else
 	{
-		xmlChar    *str;
+		xmlChar    *utf8str;
+		char	   *str = NULL;
+
+		utf8str = xmlXPathCastNodeToString(cur);
+
+		str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str,
+										   strlen((char *) utf8str),
+										   PG_UTF8,
+										   GetDatabaseEncoding());
 
-		str = xmlXPathCastNodeToString(cur);
 		PG_TRY();
 		{
 			/* Here we rely on XML having the same representation as TEXT */
@@ -3727,11 +3777,18 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt)
 		}
 		PG_CATCH();
 		{
-			xmlFree(str);
+			if (str != (char *) utf8str)
+				pfree(str);
+
+			xmlFree(utf8str);
 			PG_RE_THROW();
 		}
 		PG_END_TRY();
-		xmlFree(str);
+
+		if (str != (char *) utf8str)
+			pfree(str);
+
+		xmlFree(utf8str);
 	}
 
 	return result;
@@ -3758,6 +3815,7 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
 	Datum		datum;
 	Oid			datumtype;
 	char	   *result_str;
+	char	   *str = NULL;
 
 	switch (xpathobj->type)
 	{
@@ -3797,7 +3855,20 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
 		case XPATH_STRING:
 			if (astate == NULL)
 				return 1;
-			datum = CStringGetDatum((char *) xpathobj->stringval);
+
+			/*
+			 * The returned string is in UTF8 encoding - it must be converted
+			 * to the database encoding first.
+			 */
+			str = (char *) pg_do_encoding_conversion((unsigned char *) xpathobj->stringval,
+										   strlen((char *) xpathobj->stringval),
+										   PG_UTF8,
+										   GetDatabaseEncoding());
+
+elog(NOTICE, ">>>%s<<<", str);
+
+			datum = CStringGetDatum(str);
+
 			datumtype = CSTRINGOID;
 			break;
 
@@ -3812,6 +3883,7 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj,
 	datum = PointerGetDatum(cstring_to_xmltype(result_str));
 	(void) accumArrayResult(astate, datum, false,
 							XMLOID, CurrentMemoryContext);
+
 	return 1;
 }
 
@@ -3895,7 +3967,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
 				 errmsg("empty XPath expression")));
 
 	string = pg_xmlCharStrndup(datastr, len);
-	xpath_expr = pg_xmlCharStrndup(VARDATA_ANY(xpath_expr_text), xpath_len);
+	xpath_expr = pg_xmlCharUtf8(VARDATA_ANY(xpath_expr_text), xpath_len);
 
 	xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
 
@@ -3911,7 +3983,9 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
 		if (ctxt == NULL || xmlerrcxt->err_occurred)
 			xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
 						"could not allocate parser context");
-		doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0);
+		doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL,
+								pg_encoding_to_char(GetDatabaseEncoding()), 0);
+
 		if (doc == NULL || xmlerrcxt->err_occurred)
 			xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
 						"could not parse XML document");
@@ -3929,22 +4003,25 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
 		{
 			for (i = 0; i < ns_count; i++)
 			{
-				char	   *ns_name;
-				char	   *ns_uri;
+				text	   *ns_name;
+				text	   *ns_uri;
 
 				if (ns_names_uris_nulls[i * 2] ||
 					ns_names_uris_nulls[i * 2 + 1])
 					ereport(ERROR,
 							(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
 							 errmsg("neither namespace name nor URI may be null")));
-				ns_name = TextDatumGetCString(ns_names_uris[i * 2]);
-				ns_uri = TextDatumGetCString(ns_names_uris[i * 2 + 1]);
+				ns_name = DatumGetTextP(ns_names_uris[i * 2]);
+				ns_uri = DatumGetTextP(ns_names_uris[i * 2 + 1]);
 				if (xmlXPathRegisterNs(xpathctx,
-									   (xmlChar *) ns_name,
-									   (xmlChar *) ns_uri) != 0)
+									   pg_xmlCharUtf8(VARDATA_ANY(ns_name),
+									  				  VARSIZE(ns_name) - VARHDRSZ),
+									   pg_xmlCharUtf8(VARDATA_ANY(ns_uri),
+									  				  VARSIZE(ns_uri) - VARHDRSZ)) != 0)
 					ereport(ERROR,	/* is this an internal error??? */
 							(errmsg("could not register XML namespace with name \"%s\" and URI \"%s\"",
-									ns_name, ns_uri)));
+									TextDatumGetCString(ns_name),
+									TextDatumGetCString(ns_uri))));
 			}
 		}
 
@@ -4242,18 +4319,14 @@ XmlTableSetDocument(TableFuncScanState *state, Datum value)
 
 	xtCxt = GetXmlTableBuilderPrivateData(state, "XmlTableSetDocument");
 
-	/*
-	 * Use out function for casting to string (remove encoding property). See
-	 * comment in xml_out.
-	 */
-	str = xml_out_internal(xmlval, 0);
-
-	length = strlen(str);
+	str = VARDATA(xmlval);
+	length = VARSIZE(xmlval) - VARHDRSZ;
 	xstr = pg_xmlCharStrndup(str, length);
 
 	PG_TRY();
 	{
-		doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, 0);
+		doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL,
+								pg_encoding_to_char(GetDatabaseEncoding()), 0);
 		if (doc == NULL || xtCxt->xmlerrcxt->err_occurred)
 			xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
 						"could not parse XML document");
@@ -4301,8 +4374,8 @@ XmlTableSetNamespace(TableFuncScanState *state, char *name, char *uri)
 	xtCxt = GetXmlTableBuilderPrivateData(state, "XmlTableSetNamespace");
 
 	if (xmlXPathRegisterNs(xtCxt->xpathcxt,
-						   pg_xmlCharStrndup(name, strlen(name)),
-						   pg_xmlCharStrndup(uri, strlen(uri))))
+						   pg_xmlCharUtf8(name, strlen(name)),
+						   pg_xmlCharUtf8(uri, strlen(uri))))
 		xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_DATA_EXCEPTION,
 					"could not set XML namespace");
 #else
@@ -4328,7 +4401,7 @@ XmlTableSetRowFilter(TableFuncScanState *state, char *path)
 				(errcode(ERRCODE_DATA_EXCEPTION),
 				 errmsg("row path filter must not be empty string")));
 
-	xstr = pg_xmlCharStrndup(path, strlen(path));
+	xstr = pg_xmlCharUtf8(path, strlen(path));
 
 	xtCxt->xpathcomp = xmlXPathCompile(xstr);
 	if (xtCxt->xpathcomp == NULL || xtCxt->xmlerrcxt->err_occurred)
@@ -4359,7 +4432,7 @@ XmlTableSetColumnFilter(TableFuncScanState *state, char *path, int colnum)
 				(errcode(ERRCODE_DATA_EXCEPTION),
 				 errmsg("column path filter must not be empty string")));
 
-	xstr = pg_xmlCharStrndup(path, strlen(path));
+	xstr = pg_xmlCharUtf8(path, strlen(path));
 
 	xtCxt->xpathscomp[colnum] = xmlXPathCompile(xstr);
 	if (xtCxt->xpathscomp[colnum] == NULL || xtCxt->xmlerrcxt->err_occurred)
@@ -4502,7 +4575,15 @@ XmlTableGetValue(TableFuncScanState *state, int colnum,
 				{
 					PG_TRY();
 					{
-						cstr = pstrdup((char *) str);
+						if (GetDatabaseEncoding() != PG_UTF8)
+						{
+							cstr = (char *) pg_do_encoding_conversion((unsigned char *) str,
+															 strlen((char *) str),
+															 PG_UTF8,
+															 GetDatabaseEncoding());
+						}
+						else
+							cstr = pstrdup((char *) str);
 					}
 					PG_CATCH();
 					{
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index bcc585d427..6a43896d40 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -1452,3 +1452,24 @@ SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c
  14
 (4 rows)
 
+-- XML is saved in database encoding with original encoding declaration.
+-- There can be incosistency based on wrong user input, different server/client
+-- encoding or reading XML with recv function. All XML functions should to
+-- work with this partially broken XML.
+DO $$
+DECLARE str text;
+BEGIN
+  -- leave early without error, when we are not sure about result of conversion
+  IF current_setting('server_encoding') NOT IN ('UTF8', 'LATIN2') THEN return; END IF;
+
+  -- build valid UTF8 XML with broken encoding declaration
+  str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>'
+          || convert_from('\xf2', 'windows-1250')
+          || '</remark></vino></enprimeur>';
+
+  -- should to work
+  RAISE NOTICE '%', xpath('/enprimeur/vino/id', str::xml);
+  RAISE NOTICE '%', (SELECT id FROM xmltable('/enprimeur/vino' PASSING (str::xml) COLUMNS id int));
+END; $$;
+NOTICE:  {<id>909</id>}
+NOTICE:  909
diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql
index eb4687fb09..97a3aa9de2 100644
--- a/src/test/regress/sql/xml.sql
+++ b/src/test/regress/sql/xml.sql
@@ -558,3 +558,23 @@ INSERT INTO xmltest2 VALUES('<d><r><dc>2</dc></r></d>', 'D');
 SELECT xmltable.* FROM xmltest2, LATERAL xmltable('/d/r' PASSING x COLUMNS a int PATH '' || lower(_path) || 'c');
 SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c') PASSING x COLUMNS a int PATH '.');
 SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c') PASSING x COLUMNS a int PATH 'x' DEFAULT ascii(_path) - 54);
+
+-- XML is saved in database encoding with original encoding declaration.
+-- There can be incosistency based on wrong user input, different server/client
+-- encoding or reading XML with recv function. All XML functions should to
+-- work with this partially broken XML.
+DO $$
+DECLARE str text;
+BEGIN
+  -- leave early without error, when we are not sure about result of conversion
+  IF current_setting('server_encoding') NOT IN ('UTF8', 'LATIN2') THEN return; END IF;
+
+  -- build valid UTF8 XML with broken encoding declaration
+  str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>'
+          || convert_from('\xf2', 'windows-1250')
+          || '</remark></vino></enprimeur>';
+
+  -- should to work
+  RAISE NOTICE '%', xpath('/enprimeur/vino/id', str::xml);
+  RAISE NOTICE '%', (SELECT id FROM xmltable('/enprimeur/vino' PASSING (str::xml) COLUMNS id int));
+END; $$;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to