On 10.09.24 19:43, Tom Lane wrote: > How about instead introducing a plain function along the lines of > "xml_canonicalize(xml, bool keep_comments) returns text" ? The SQL > committee will certainly never do that, but we won't regret having > created a plain function whenever they get around to doing something > in the same space. A second function to serialize xml documents may sound a bit redundant, but I totally understand the concern of possibly conflicting with the SQL/XML spec in the future. I guess we can always come back here and extend xmlserialize when the SQL committee moves in this direction.
v14 attached adds the function xmlcanonicalize, as suggested. Thanks -- Jim
From 08850417c9f0e1504a5e0cfbbd815c3a7aaaf7e8 Mon Sep 17 00:00:00 2001 From: Jim Jones <jim.jo...@uni-muenster.de> Date: Thu, 12 Sep 2024 12:23:34 +0200 Subject: [PATCH v14] Add xmlcanonicalize function This patch introduces the function xmlcanonicalize, which serializes xml documents in their canonical form - as described in the W3C Canonical XML Version 1.1 specification. xmlcanonicalize(doc xml, keep_comments boolean) -> xml doc: the XML document to be canonicalized keep_comments: keeps or removes xml comments from doc This feature is based on the function xmlC14NDocDumpMemory from the C14N module of libxml2. --- doc/src/sgml/func.sgml | 48 ++++++++++++++++++++ src/backend/utils/adt/xml.c | 40 +++++++++++++++++ src/include/catalog/pg_proc.dat | 3 ++ src/test/regress/expected/xml.out | 70 +++++++++++++++++++++++++++++ src/test/regress/expected/xml_1.out | 69 ++++++++++++++++++++++++++++ src/test/regress/expected/xml_2.out | 70 +++++++++++++++++++++++++++++ src/test/regress/sql/xml.sql | 49 ++++++++++++++++++++ 7 files changed, 349 insertions(+) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 1bde4091ca..f63787d633 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -14325,6 +14325,54 @@ SELECT xmltext('< foo & bar >'); </para> </sect3> +<sect3 id="functions-producing-xml-xmlcanonicalize"> + <title><literal>xmlcanonicalize</literal></title> + + <indexterm> + <primary>xmlcanonicalize</primary> + </indexterm> + +<synopsis> +<function>xmlcanonicalize</function> ( <parameter>doc</parameter> <type>xml</type>, <parameter>keep_comments</parameter> <type>boolean</type> ) <returnvalue>xml</returnvalue> +</synopsis> + + <para> + This function converts a given XML document into its <ulink url="https://www.w3.org/TR/xml-c14n11/#Terminology">canonical form</ulink> + based on the <ulink url="https://www.w3.org/TR/xml-c14n11/">W3C Canonical XML 1.1 Specification</ulink>. 
+ It is basically designed to provide applications the ability to compare xml documents or test if they + have been changed. The parameter <parameter>keep_comments</parameter> specifies if the XML comments from the given document should be kept or not. + </para> + + <para> + Example: +<screen><![CDATA[ +SELECT + xmlcanonicalize( + '<foo> + <!-- a comment --> + <bar c="3" b="2" a="1">42</bar> + <empty/> + </foo>'::xml, true); + xmlcanonicalize +----------------------------------------------------------------------------- + <foo><!-- a comment --><bar a="1" b="2" c="3">42</bar><empty></empty></foo> +(1 row) + +SELECT + xmlcanonicalize( + '<foo> + <!-- a comment --> + <bar c="3" b="2" a="1">42</bar> + <empty/> + </foo>'::xml, false); + xmlcanonicalize +----------------------------------------------------------- + <foo><bar a="1" b="2" c="3">42</bar><empty></empty></foo> +(1 row) +]]></screen> + </para> + </sect3> + <sect3 id="functions-producing-xml-xmlcomment"> <title><literal>xmlcomment</literal></title> diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 1a07876cd5..3d9ca2e040 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -58,6 +58,7 @@ #include <libxml/xmlwriter.h> #include <libxml/xpath.h> #include <libxml/xpathInternals.h> +#include <libxml/c14n.h> /* * We used to check for xmlStructuredErrorContext via a configure test; but @@ -545,6 +546,45 @@ xmltext(PG_FUNCTION_ARGS) } +Datum +xmlcanonicalize(PG_FUNCTION_ARGS) +{ +#ifdef USE_LIBXML + xmltype *arg = PG_GETARG_XML_P(0); + bool keep_comments = PG_GETARG_BOOL(1); + text *result; + int nbytes; + xmlDocPtr doc; + xmlChar *xmlbuf = NULL; + + doc = xml_parse(arg, XMLOPTION_DOCUMENT, false, + GetDatabaseEncoding(), NULL, NULL, NULL); + + /* + * This dumps the canonicalized XML doc into the xmlChar* buffer. + * mode = 2 means the doc will be canonicalized using the C14N 1.1 standard.
+ */ + nbytes = xmlC14NDocDumpMemory(doc, NULL, 2, NULL, keep_comments, &xmlbuf); + + if(doc) + xmlFreeDoc(doc); + + if(nbytes < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not canonicalize the given XML document"))); + + result = cstring_to_text_with_len((const char *) xmlbuf, nbytes); + + xmlFree(xmlbuf); + + PG_RETURN_XML_P(result); +#else + NO_XML_SUPPORT(); + return 0; +#endif /* not USE_LIBXML */ +} + /* * TODO: xmlconcat needs to merge the notations and unparsed entities * of the argument values. Not very important in practice, though. diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index ff5436acac..1a177647ef 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8905,6 +8905,9 @@ { oid => '3813', descr => 'generate XML text node', proname => 'xmltext', prorettype => 'xml', proargtypes => 'text', prosrc => 'xmltext' }, +{ oid => '3814', descr => 'generate the canonical form of an XML document', + proname => 'xmlcanonicalize', prorettype => 'xml', proargtypes => 'xml bool', + prosrc => 'xmlcanonicalize' }, { oid => '2923', descr => 'map table contents to XML', proname => 'table_to_xml', procost => '100', provolatile => 's', diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out index 361a6f9b27..4994f31778 100644 --- a/src/test/regress/expected/xml.out +++ b/src/test/regress/expected/xml.out @@ -1866,3 +1866,73 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char); x<P>73</P>0.42truej (1 row) +-- xmlserialize: canonical +CREATE TABLE xmlcanonicalize_test (doc xml); +INSERT INTO xmlcanonicalize_test VALUES + ('<?xml version="1.0" encoding="ISO-8859-1"?> + <!DOCTYPE doc SYSTEM "doc.dtd" [ + <!ENTITY val "42"> + <!ATTLIST xyz attr CDATA "default"> + ]> + + <!-- attributes and namespces will be sorted --> + <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am" + xmlns:b="http://www.ietf.org" + xmlns:a="http://www.w3.org" + 
xmlns="http://example.org"> + + <!-- Normalization of whitespace in start and end tags --> + <!-- Elimination of superfluous namespace declarations, as already declared in <foo> --> + <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar > + + <!-- empty element will be converted to start-end tag pair --> + <empty/> + + <!-- text will be transcoded to UTF-8 --> + <transcode>1</transcode> + + <!-- whitespace inside tag will be preserved --> + <whitespace> 321 </whitespace> + + <!-- empty namespace will be removed of child tag --> + <emptyns xmlns="" > + <emptyns_child xmlns=""></emptyns_child> + </emptyns> + + <!-- CDATA section will be replaced by its value --> + <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute> + </foo> <!-- comment outside root element --> '); +SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test; + xmlcanonicalize +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + <!-- attributes and namespces will be sorted --> + + <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> 
--><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo>+ + <!-- comment outside root element --> +(1 row) + +SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test; + xmlcanonicalize +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo> +(1 row) + +SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test; + xmlcanonicalize +----------------- + +(1 row) + +SELECT xmlcanonicalize(NULL, true); + xmlcanonicalize +----------------- + +(1 row) + +\set VERBOSITY terse +SELECT xmlcanonicalize('', true); +ERROR: invalid XML document +SELECT xmlcanonicalize(' ', true); +ERROR: invalid XML document +SELECT xmlcanonicalize('foo', true); +ERROR: invalid XML document +\set VERBOSITY default diff --git a/src/test/regress/expected/xml_1.out b/src/test/regress/expected/xml_1.out index d26e10441e..640db2c05c 100644 --- a/src/test/regress/expected/xml_1.out +++ 
b/src/test/regress/expected/xml_1.out @@ -1477,3 +1477,72 @@ ERROR: unsupported XML feature LINE 1: SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j':... ^ DETAIL: This functionality requires the server to be built with libxml support. +-- xmlserialize: canonical +CREATE TABLE xmlcanonicalize_test (doc xml); +INSERT INTO xmlcanonicalize_test VALUES + ('<?xml version="1.0" encoding="ISO-8859-1"?> + <!DOCTYPE doc SYSTEM "doc.dtd" [ + <!ENTITY val "42"> + <!ATTLIST xyz attr CDATA "default"> + ]> + + <!-- attributes and namespces will be sorted --> + <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am" + xmlns:b="http://www.ietf.org" + xmlns:a="http://www.w3.org" + xmlns="http://example.org"> + + <!-- Normalization of whitespace in start and end tags --> + <!-- Elimination of superfluous namespace declarations, as already declared in <foo> --> + <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar > + + <!-- empty element will be converted to start-end tag pair --> + <empty/> + + <!-- text will be transcoded to UTF-8 --> + <transcode>1</transcode> + + <!-- whitespace inside tag will be preserved --> + <whitespace> 321 </whitespace> + + <!-- empty namespace will be removed of child tag --> + <emptyns xmlns="" > + <emptyns_child xmlns=""></emptyns_child> + </emptyns> + + <!-- CDATA section will be replaced by its value --> + <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute> + </foo> <!-- comment outside root element --> '); +ERROR: unsupported XML feature +LINE 2: ('<?xml version="1.0" encoding="ISO-8859-1"?> + ^ +DETAIL: This functionality requires the server to be built with libxml support. 
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test; + xmlcanonicalize +----------------- +(0 rows) + +SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test; + xmlcanonicalize +----------------- +(0 rows) + +SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test; + xmlcanonicalize +----------------- +(0 rows) + +SELECT xmlcanonicalize(NULL, true); + xmlcanonicalize +----------------- + +(1 row) + +\set VERBOSITY terse +SELECT xmlcanonicalize('', true); +ERROR: unsupported XML feature at character 24 +SELECT xmlcanonicalize(' ', true); +ERROR: unsupported XML feature at character 24 +SELECT xmlcanonicalize('foo', true); +ERROR: unsupported XML feature at character 24 +\set VERBOSITY default diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out index 73c2851d3f..89835430d5 100644 --- a/src/test/regress/expected/xml_2.out +++ b/src/test/regress/expected/xml_2.out @@ -1852,3 +1852,73 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char); x<P>73</P>0.42truej (1 row) +-- xmlserialize: canonical +CREATE TABLE xmlcanonicalize_test (doc xml); +INSERT INTO xmlcanonicalize_test VALUES + ('<?xml version="1.0" encoding="ISO-8859-1"?> + <!DOCTYPE doc SYSTEM "doc.dtd" [ + <!ENTITY val "42"> + <!ATTLIST xyz attr CDATA "default"> + ]> + + <!-- attributes and namespces will be sorted --> + <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am" + xmlns:b="http://www.ietf.org" + xmlns:a="http://www.w3.org" + xmlns="http://example.org"> + + <!-- Normalization of whitespace in start and end tags --> + <!-- Elimination of superfluous namespace declarations, as already declared in <foo> --> + <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar > + + <!-- empty element will be converted to start-end tag pair --> + <empty/> + + <!-- text will be transcoded to UTF-8 --> + <transcode>1</transcode> + + <!-- whitespace inside tag will be preserved --> + <whitespace> 321 </whitespace> + + <!-- empty namespace will be 
removed of child tag --> + <emptyns xmlns="" > + <emptyns_child xmlns=""></emptyns_child> + </emptyns> + + <!-- CDATA section will be replaced by its value --> + <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute> + </foo> <!-- comment outside root element --> '); +SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test; + xmlcanonicalize +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + <!-- attributes and namespces will be sorted --> + + <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> --><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo>+ + <!-- comment outside root element --> 
+(1 row) + +SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test; + xmlcanonicalize +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo> +(1 row) + +SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test; + xmlcanonicalize +----------------- + +(1 row) + +SELECT xmlcanonicalize(NULL, true); + xmlcanonicalize +----------------- + +(1 row) + +\set VERBOSITY terse +SELECT xmlcanonicalize('', true); +ERROR: invalid XML document +SELECT xmlcanonicalize(' ', true); +ERROR: invalid XML document +SELECT xmlcanonicalize('foo', true); +ERROR: invalid XML document +\set VERBOSITY default diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql index f752ecb142..02ea3834ab 100644 --- a/src/test/regress/sql/xml.sql +++ b/src/test/regress/sql/xml.sql @@ -673,3 +673,52 @@ SELECT xmltext(' '); SELECT xmltext('foo `$_-+?=*^%!|/\()[]{}'); SELECT xmltext('foo & <"bar">'); SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char); + +-- xmlserialize: canonical +CREATE TABLE xmlcanonicalize_test (doc xml); +INSERT INTO xmlcanonicalize_test VALUES + ('<?xml version="1.0" encoding="ISO-8859-1"?> + <!DOCTYPE doc SYSTEM "doc.dtd" [ + <!ENTITY val "42"> + <!ATTLIST xyz attr CDATA "default"> + ]> + + <!-- attributes and namespces will be sorted --> + <foo a:attr="out" b:attr="sorted" 
attr2="all" attr="I am" + xmlns:b="http://www.ietf.org" + xmlns:a="http://www.w3.org" + xmlns="http://example.org"> + + <!-- Normalization of whitespace in start and end tags --> + <!-- Elimination of superfluous namespace declarations, as already declared in <foo> --> + <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar > + + <!-- empty element will be converted to start-end tag pair --> + <empty/> + + <!-- text will be transcoded to UTF-8 --> + <transcode>1</transcode> + + <!-- whitespace inside tag will be preserved --> + <whitespace> 321 </whitespace> + + <!-- empty namespace will be removed of child tag --> + <emptyns xmlns="" > + <emptyns_child xmlns=""></emptyns_child> + </emptyns> + + <!-- CDATA section will be replaced by its value --> + <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute> + </foo> <!-- comment outside root element --> '); + +SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test; +SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test; + +SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test; +SELECT xmlcanonicalize(NULL, true); + +\set VERBOSITY terse +SELECT xmlcanonicalize('', true); +SELECT xmlcanonicalize(' ', true); +SELECT xmlcanonicalize('foo', true); +\set VERBOSITY default \ No newline at end of file -- 2.34.1