> > Isn't the most correct solution to call xml_parse function? > I am reply to self. Probably not.
Now, I am thinking so I found a reason of this issue. The document processed in xpath_internal is passed to libXML2 by doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0); We don't pass a encoding parameter so libXML2 expecting "UTF8" or expecting correct encoding decl in XML document. When we pass incorrect document - XML is in database encoding, but encoding decl is original, then it should to fail. the regress test can looks like your (but all chars are valid there) postgres=# do $$ declare str text; begin if current_setting('server_encoding') <> 'UTF8' then return; end if; str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>' || convert_from('\xc588', 'UTF8') || '</remark></vino></enprimeur>'; raise notice '%', xpath('/enprimeur/vino/id', str::xml); end; $$; ERROR: could not parse XML document DETAIL: input conversion failed due to input error, bytes 0x88 0x3C 0x2F 0x72 line 1: switching encoding: encoder error �</remark></vino></enprimeur> ^ CONTEXT: PL/pgSQL function inline_code_block line 8 at RAISE After correct fix: doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, pg_encoding_to_char(GetDatabaseEncoding()), 0); It is working postgres=# do $$ declare str text; begin if current_setting('server_encoding') <> 'UTF8' then return; end if; str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>' || convert_from('\xc588', 'UTF8') || '</remark></vino></enprimeur>'; raise notice '%', xpath('/enprimeur/vino/id', str::xml); end; $$; NOTICE: {<id>909</id>} DO This fix should be apply to xmltable function too. patch attached It doesn't fix xpath and xmltable functions issues when server encoding is not UTF8. Looks so XPath functions from libXML2 requires UTF8 encoded strings and the result is in UTF8 too - so result should be recoded to server encoding. I didn't find any info how to enable libXML2 XPath functions for other encoding than UTF8 :( ?? Regards Pavel > Regards > > Pavel >
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index c47624eff6..b09ce23cdb 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -3911,7 +3911,9 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, if (ctxt == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); - doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0); + doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, + pg_encoding_to_char(GetDatabaseEncoding()), 0); + if (doc == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "could not parse XML document"); @@ -4242,18 +4244,14 @@ XmlTableSetDocument(TableFuncScanState *state, Datum value) xtCxt = GetXmlTableBuilderPrivateData(state, "XmlTableSetDocument"); - /* - * Use out function for casting to string (remove encoding property). See - * comment in xml_out. - */ - str = xml_out_internal(xmlval, 0); - - length = strlen(str); + str = VARDATA(xmlval); + length = VARSIZE(xmlval) - VARHDRSZ; xstr = pg_xmlCharStrndup(str, length); PG_TRY(); { - doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, 0); + doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, + pg_encoding_to_char(GetDatabaseEncoding()), 0); if (doc == NULL || xtCxt->xmlerrcxt->err_occurred) xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "could not parse XML document"); diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out index bcc585d427..6a43896d40 100644 --- a/src/test/regress/expected/xml.out +++ b/src/test/regress/expected/xml.out @@ -1452,3 +1452,24 @@ SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c 14 (4 rows) +-- XML is saved in database encoding with original encoding declaration. +-- There can be incosistency based on wrong user input, different server/client +-- encoding or reading XML with recv function. All XML functions should to +-- work with this partially broken XML. +DO $$ +DECLARE str text; +BEGIN + -- leave early without error, when we are not sure about result of conversion + IF current_setting('server_encoding') NOT IN ('UTF8', 'LATIN2') THEN return; END IF; + + -- build valid UTF8 XML with broken encoding declaration + str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>' + || convert_from('\xf2', 'windows-1250') + || '</remark></vino></enprimeur>'; + + -- should to work + RAISE NOTICE '%', xpath('/enprimeur/vino/id', str::xml); + RAISE NOTICE '%', (SELECT id FROM xmltable('/enprimeur/vino' PASSING (str::xml) COLUMNS id int)); +END; $$; +NOTICE: {<id>909</id>} +NOTICE: 909 diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql index eb4687fb09..97a3aa9de2 100644 --- a/src/test/regress/sql/xml.sql +++ b/src/test/regress/sql/xml.sql @@ -558,3 +558,23 @@ INSERT INTO xmltest2 VALUES('<d><r><dc>2</dc></r></d>', 'D'); SELECT xmltable.* FROM xmltest2, LATERAL xmltable('/d/r' PASSING x COLUMNS a int PATH '' || lower(_path) || 'c'); SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c') PASSING x COLUMNS a int PATH '.'); SELECT xmltable.* FROM xmltest2, LATERAL xmltable(('/d/r/' || lower(_path) || 'c') PASSING x COLUMNS a int PATH 'x' DEFAULT ascii(_path) - 54); + +-- XML is saved in database encoding with original encoding declaration. +-- There can be incosistency based on wrong user input, different server/client +-- encoding or reading XML with recv function. All XML functions should to +-- work with this partially broken XML. +DO $$ +DECLARE str text; +BEGIN + -- leave early without error, when we are not sure about result of conversion + IF current_setting('server_encoding') NOT IN ('UTF8', 'LATIN2') THEN return; END IF; + + -- build valid UTF8 XML with broken encoding declaration + str = '<?xml version="1.0" encoding="windows-1250"?><enprimeur><vino><id>909</id><remark>' + || convert_from('\xf2', 'windows-1250') + || '</remark></vino></enprimeur>'; + + -- should to work + RAISE NOTICE '%', xpath('/enprimeur/vino/id', str::xml); + RAISE NOTICE '%', (SELECT id FROM xmltable('/enprimeur/vino' PASSING (str::xml) COLUMNS id int)); +END; $$;
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers