Hi 2017-08-19 22:53 GMT+02:00 Pavel Stehule <pavel.steh...@gmail.com>:
> Hi > > I am sending a POC - it adds support for XPATH and XMLTABLE with non-UTF8 > server encodings. > > In this case, all strings should be converted to UTF8 before calling libXML2 > functions, and the result should be converted back from UTF8. > > I found some previous experiments: https://marc.info/?l=pgsql-bugs&m=123407176408688 > > Note: I was informed that the xmlNodeDump function we use is deprecated - > so we should replace it too at some point. > > Regards > > I forgot a debug elog in the previous patch > Pavel > > >
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index c47624eff6..a43cf13d16 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -147,6 +147,7 @@ static int xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj, ArrayBuildState *astate, PgXmlErrorContext *xmlerrcxt); static xmlChar *pg_xmlCharStrndup(char *str, size_t len); +static xmlChar *pg_xmlCharUtf8(char *str, size_t len); #endif /* USE_LIBXML */ static void xmldata_root_element_start(StringInfo result, const char *eltname, @@ -459,8 +460,28 @@ cstring_to_xmltype(const char *string) static xmltype * xmlBuffer_to_xmltype(xmlBufferPtr buf) { - return (xmltype *) cstring_to_text_with_len((const char *) xmlBufferContent(buf), + if (GetDatabaseEncoding() != PG_UTF8) + { + char *utf8str = (char *) xmlBufferContent(buf); + char *str; + xmltype *result; + + str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str, + xmlBufferLength(buf), + PG_UTF8, + GetDatabaseEncoding()); + + Assert(str != utf8str); + result = (xmltype *) cstring_to_text(str); + pfree(str); + + return result; + } + else + { + return (xmltype *) cstring_to_text_with_len((const char *) xmlBufferContent(buf), xmlBufferLength(buf)); + } } #endif @@ -1176,6 +1197,28 @@ pg_xmlCharStrndup(char *str, size_t len) } /* + * LibXML2 internal encoding is UTF8. Sometimes LibXML2 enforce + * encoding to UTF8 by self, sometimes it expects UTF8 strings. + * This function is used for encoding from database encoding to + * UTF8. + */ +static xmlChar * +pg_xmlCharUtf8(char *str, size_t len) +{ + char *result; + + result = (char *) pg_do_encoding_conversion((unsigned char *) str, + len, + GetDatabaseEncoding(), + PG_UTF8); + + if (result != str) + return BAD_CAST result; + + return pg_xmlCharStrndup(str, len); +} + +/* * str is the null-terminated input string. Remaining arguments are * output arguments; each can be NULL if value is not wanted. * version and encoding are returned as locally-palloc'd strings. 
@@ -3714,9 +3757,16 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt) } else { - xmlChar *str; + xmlChar *utf8str; + char *str = NULL; + + utf8str = xmlXPathCastNodeToString(cur); + + str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str, + strlen((char *) utf8str), + PG_UTF8, + GetDatabaseEncoding()); - str = xmlXPathCastNodeToString(cur); PG_TRY(); { /* Here we rely on XML having the same representation as TEXT */ @@ -3727,11 +3777,18 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt) } PG_CATCH(); { - xmlFree(str); + if (str != (char *) utf8str) + pfree(str); + + xmlFree(utf8str); PG_RE_THROW(); } PG_END_TRY(); - xmlFree(str); + + if (str != (char *) utf8str) + pfree(str); + + xmlFree(utf8str); } return result; @@ -3758,6 +3815,7 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj, Datum datum; Oid datumtype; char *result_str; + char *str = NULL; switch (xpathobj->type) { @@ -3797,7 +3855,18 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj, case XPATH_STRING: if (astate == NULL) return 1; - datum = CStringGetDatum((char *) xpathobj->stringval); + + /* + * returned string is in UTF8 encoding - should be encoded + * to database encoding first. 
+ */ + str = (char *) pg_do_encoding_conversion((unsigned char *) xpathobj->stringval, + strlen((char *) xpathobj->stringval), + PG_UTF8, + GetDatabaseEncoding()); + + datum = CStringGetDatum(str); + datumtype = CSTRINGOID; break; @@ -3812,6 +3881,7 @@ xml_xpathobjtoxmlarray(xmlXPathObjectPtr xpathobj, datum = PointerGetDatum(cstring_to_xmltype(result_str)); (void) accumArrayResult(astate, datum, false, XMLOID, CurrentMemoryContext); + return 1; } @@ -3895,7 +3965,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, errmsg("empty XPath expression"))); string = pg_xmlCharStrndup(datastr, len); - xpath_expr = pg_xmlCharStrndup(VARDATA_ANY(xpath_expr_text), xpath_len); + xpath_expr = pg_xmlCharUtf8(VARDATA_ANY(xpath_expr_text), xpath_len); xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); @@ -3911,7 +3981,9 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, if (ctxt == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); - doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0); + doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, + pg_encoding_to_char(GetDatabaseEncoding()), 0); + if (doc == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "could not parse XML document"); @@ -3929,22 +4001,25 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, { for (i = 0; i < ns_count; i++) { - char *ns_name; - char *ns_uri; + text *ns_name; + text *ns_uri; if (ns_names_uris_nulls[i * 2] || ns_names_uris_nulls[i * 2 + 1]) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("neither namespace name nor URI may be null"))); - ns_name = TextDatumGetCString(ns_names_uris[i * 2]); - ns_uri = TextDatumGetCString(ns_names_uris[i * 2 + 1]); + ns_name = DatumGetTextP(ns_names_uris[i * 2]); + ns_uri = DatumGetTextP(ns_names_uris[i * 2 + 1]); if 
(xmlXPathRegisterNs(xpathctx, - (xmlChar *) ns_name, - (xmlChar *) ns_uri) != 0) + pg_xmlCharUtf8(VARDATA_ANY(ns_name), + VARSIZE(ns_name) - VARHDRSZ), + pg_xmlCharUtf8(VARDATA_ANY(ns_uri), + VARSIZE(ns_uri) - VARHDRSZ)) != 0) ereport(ERROR, /* is this an internal error??? */ (errmsg("could not register XML namespace with name \"%s\" and URI \"%s\"", - ns_name, ns_uri))); + TextDatumGetCString(ns_name), + TextDatumGetCString(ns_uri)))); } } @@ -4242,18 +4317,14 @@ XmlTableSetDocument(TableFuncScanState *state, Datum value) xtCxt = GetXmlTableBuilderPrivateData(state, "XmlTableSetDocument"); - /* - * Use out function for casting to string (remove encoding property). See - * comment in xml_out. - */ - str = xml_out_internal(xmlval, 0); - - length = strlen(str); + str = VARDATA(xmlval); + length = VARSIZE(xmlval) - VARHDRSZ; xstr = pg_xmlCharStrndup(str, length); PG_TRY(); { - doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, 0); + doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, + pg_encoding_to_char(GetDatabaseEncoding()), 0); if (doc == NULL || xtCxt->xmlerrcxt->err_occurred) xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "could not parse XML document"); @@ -4301,8 +4372,8 @@ XmlTableSetNamespace(TableFuncScanState *state, char *name, char *uri) xtCxt = GetXmlTableBuilderPrivateData(state, "XmlTableSetNamespace"); if (xmlXPathRegisterNs(xtCxt->xpathcxt, - pg_xmlCharStrndup(name, strlen(name)), - pg_xmlCharStrndup(uri, strlen(uri)))) + pg_xmlCharUtf8(name, strlen(name)), + pg_xmlCharUtf8(uri, strlen(uri)))) xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_DATA_EXCEPTION, "could not set XML namespace"); #else @@ -4328,7 +4399,7 @@ XmlTableSetRowFilter(TableFuncScanState *state, char *path) (errcode(ERRCODE_DATA_EXCEPTION), errmsg("row path filter must not be empty string"))); - xstr = pg_xmlCharStrndup(path, strlen(path)); + xstr = pg_xmlCharUtf8(path, strlen(path)); xtCxt->xpathcomp = xmlXPathCompile(xstr); if 
(xtCxt->xpathcomp == NULL || xtCxt->xmlerrcxt->err_occurred) @@ -4359,7 +4430,7 @@ XmlTableSetColumnFilter(TableFuncScanState *state, char *path, int colnum) (errcode(ERRCODE_DATA_EXCEPTION), errmsg("column path filter must not be empty string"))); - xstr = pg_xmlCharStrndup(path, strlen(path)); + xstr = pg_xmlCharUtf8(path, strlen(path)); xtCxt->xpathscomp[colnum] = xmlXPathCompile(xstr); if (xtCxt->xpathscomp[colnum] == NULL || xtCxt->xmlerrcxt->err_occurred) @@ -4502,7 +4573,15 @@ XmlTableGetValue(TableFuncScanState *state, int colnum, { PG_TRY(); { - cstr = pstrdup((char *) str); + if (GetDatabaseEncoding() != PG_UTF8) + { + cstr = (char *) pg_do_encoding_conversion((unsigned char *) str, + strlen((char *) str), + PG_UTF8, + GetDatabaseEncoding()); + } + else + cstr = pstrdup((char *) str); } PG_CATCH(); { diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out index bcc585d427..6a43896d40 100644 --- a/src/test/regress/expected/xml.out +++ b/src/test/regress/expected/xml.out @@ -1452,3 +1452,24 @@ SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c 14 (4 rows) +-- XML is saved in database encoding with original encoding declaration. +-- There can be incosistency based on wrong user input, different server/client +-- encoding or reading XML with recv function. All XML functions should to +-- work with this partially broken XML. 
+DO $$ +DECLARE str text; +BEGIN + -- leave early without error, when we are not sure about result of conversion + IF current_setting('server_encoding') NOT IN ('UTF8', 'LATIN2') THEN return; END IF; + + -- build valid UTF8 XML with broken encoding declaration + str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>' + || convert_from('\xf2', 'windows-1250') + || '</remark></vino></enprimeur>'; + + -- should to work + RAISE NOTICE '%', xpath('/enprimeur/vino/id', str::xml); + RAISE NOTICE '%', (SELECT id FROM xmltable('/enprimeur/vino' PASSING (str::xml) COLUMNS id int)); +END; $$; +NOTICE: {<id>909</id>} +NOTICE: 909 diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql index eb4687fb09..97a3aa9de2 100644 --- a/src/test/regress/sql/xml.sql +++ b/src/test/regress/sql/xml.sql @@ -558,3 +558,23 @@ INSERT INTO xmltest2 VALUES('<d><r><dc>2</dc></r></d>', 'D'); SELECT xmltable.* FROM xmltest2, LATERAL xmltable('/d/r' PASSING x COLUMNS a int PATH '' || lower(_path) || 'c'); SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c') PASSING x COLUMNS a int PATH '.'); SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c') PASSING x COLUMNS a int PATH 'x' DEFAULT ascii(_path) - 54); + +-- XML is saved in database encoding with original encoding declaration. +-- There can be incosistency based on wrong user input, different server/client +-- encoding or reading XML with recv function. All XML functions should to +-- work with this partially broken XML. 
+DO $$ +DECLARE str text; +BEGIN + -- leave early without error, when we are not sure about result of conversion + IF current_setting('server_encoding') NOT IN ('UTF8', 'LATIN2') THEN return; END IF; + + -- build valid UTF8 XML with broken encoding declaration + str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>' + || convert_from('\xf2', 'windows-1250') + || '</remark></vino></enprimeur>'; + + -- should to work + RAISE NOTICE '%', xpath('/enprimeur/vino/id', str::xml); + RAISE NOTICE '%', (SELECT id FROM xmltable('/enprimeur/vino' PASSING (str::xml) COLUMNS id int)); +END; $$;
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers