Re: [PATCH] Add CANONICAL option to xmlserialize

Jim Jones Thu, 12 Sep 2024 03:57:13 -0700

On 10.09.24 19:43, Tom Lane wrote:
> How about instead introducing a plain function along the lines of
> "xml_canonicalize(xml, bool keep_comments) returns text" ?  The SQL
> committee will certainly never do that, but we won't regret having
> created a plain function whenever they get around to doing something
> in the same space.
A second function to serialize XML documents may sound a bit redundant,
but I totally understand the concern about possibly conflicting with
the SQL/XML spec in the future. I guess we can always come back here and
extend xmlserialize when the SQL committee moves in this direction.


v14 attached adds the function xmlcanonicalize, as suggested.

Thanks

-- 
Jim

From 08850417c9f0e1504a5e0cfbbd815c3a7aaaf7e8 Mon Sep 17 00:00:00 2001
From: Jim Jones <jim.jo...@uni-muenster.de>
Date: Thu, 12 Sep 2024 12:23:34 +0200
Subject: [PATCH v14] Add xmlcanonicalize function

This patch introduces the function xmlcanonicalize, which
serializes xml documents in their canonical form - as described in
the W3C Canonical XML Version 1.1 specification.

xmlcanonicalize(doc xml, keep_comments boolean) -> xml

doc: the XML document to be canonicalized
keep_comments: keeps or removes xml comments from doc

This feature is based on the function xmlC14NDocDumpMemory from the
C14N module of libxml2.
---
 doc/src/sgml/func.sgml              | 48 ++++++++++++++++++++
 src/backend/utils/adt/xml.c         | 40 +++++++++++++++++
 src/include/catalog/pg_proc.dat     |  3 ++
 src/test/regress/expected/xml.out   | 70 +++++++++++++++++++++++++++++
 src/test/regress/expected/xml_1.out | 69 ++++++++++++++++++++++++++++
 src/test/regress/expected/xml_2.out | 70 +++++++++++++++++++++++++++++
 src/test/regress/sql/xml.sql        | 49 ++++++++++++++++++++
 7 files changed, 349 insertions(+)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 1bde4091ca..f63787d633 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -14325,6 +14325,54 @@ SELECT xmltext('< foo & bar >');
     </para>
    </sect3>
 
+<sect3 id="functions-producing-xml-xmlcanonicalize">
+    <title><literal>xmlcanonicalize</literal></title>
+
+    <indexterm>
+     <primary>xmlcanonicalize</primary>
+    </indexterm>
+
+<synopsis>
+<function>xmlcanonicalize</function> ( <parameter>doc</parameter> <type>xml</type>, <parameter>keep_comments</parameter> <type>boolean</type> ) <returnvalue>xml</returnvalue>
+</synopsis>
+
+    <para>
+     This function converts a given XML document into its <ulink url="https://www.w3.org/TR/xml-c14n11/#Terminology">canonical form</ulink>
+     based on the <ulink url="https://www.w3.org/TR/xml-c14n11/">W3C Canonical XML 1.1 Specification</ulink>.
+     It is primarily intended to let applications compare XML documents or test whether they
+     have been changed. The parameter <parameter>keep_comments</parameter> specifies whether the XML comments in the given document are kept.
+    </para>
+
+    <para>
+     Example:
+<screen><![CDATA[
+SELECT
+  xmlcanonicalize(
+    '<foo>
+       <!-- a comment -->
+       <bar c="3" b="2" a="1">42</bar>
+       <empty/>
+     </foo>'::xml, true);
+                               xmlcanonicalize
+-----------------------------------------------------------------------------
+ <foo><!-- a comment --><bar a="1" b="2" c="3">42</bar><empty></empty></foo>
+(1 row)
+
+SELECT
+  xmlcanonicalize(
+    '<foo>
+       <!-- a comment -->
+       <bar c="3" b="2" a="1">42</bar>
+       <empty/>
+     </foo>'::xml, false);
+                      xmlcanonicalize
+-----------------------------------------------------------
+ <foo><bar a="1" b="2" c="3">42</bar><empty></empty></foo>
+(1 row)
+]]></screen>
+    </para>
+   </sect3>
+
    <sect3 id="functions-producing-xml-xmlcomment">
     <title><literal>xmlcomment</literal></title>
 
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 1a07876cd5..3d9ca2e040 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -58,6 +58,7 @@
 #include <libxml/xmlwriter.h>
 #include <libxml/xpath.h>
 #include <libxml/xpathInternals.h>
+#include <libxml/c14n.h>
 
 /*
  * We used to check for xmlStructuredErrorContext via a configure test; but
@@ -545,6 +546,45 @@ xmltext(PG_FUNCTION_ARGS)
 }
 
 
+/*
+ * xmlcanonicalize: serialize an XML document in its canonical form as
+ * defined by W3C Canonical XML 1.1, optionally keeping its comments.
+ */
+Datum
+xmlcanonicalize(PG_FUNCTION_ARGS)
+{
+#ifdef USE_LIBXML
+	xmltype    *arg = PG_GETARG_XML_P(0);
+	bool		keep_comments = PG_GETARG_BOOL(1);
+	text	   *result;
+	int			nbytes;
+	xmlDocPtr	doc;
+	xmlChar    *xmlbuf = NULL;
+
+	doc = xml_parse(arg, XMLOPTION_DOCUMENT, false,
+					GetDatabaseEncoding(), NULL, NULL, NULL);
+
+	/* Dump the whole document (no node set, no inclusive ns prefixes). */
+	nbytes = xmlC14NDocDumpMemory(doc, NULL, XML_C14N_1_1, NULL,
+								  keep_comments, &xmlbuf);
+	/* The parsed tree is no longer needed; xmlFreeDoc tolerates NULL. */
+	xmlFreeDoc(doc);
+
+	if (nbytes < 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("could not canonicalize the given XML document")));
+
+	result = cstring_to_text_with_len((const char *) xmlbuf, nbytes);
+	xmlFree(xmlbuf);
+
+	PG_RETURN_XML_P(result);
+#else
+	NO_XML_SUPPORT();
+	return 0;
+#endif							/* not USE_LIBXML */
+}
+
 /*
  * TODO: xmlconcat needs to merge the notations and unparsed entities
  * of the argument values.  Not very important in practice, though.
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index ff5436acac..1a177647ef 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -8905,6 +8905,9 @@
 { oid => '3813', descr => 'generate XML text node',
   proname => 'xmltext', prorettype => 'xml', proargtypes => 'text',
   prosrc => 'xmltext' },
+{ oid => '3814', descr => 'generate the canonical form of an XML document',
+  proname => 'xmlcanonicalize', prorettype => 'xml', proargtypes => 'xml bool',
+  prosrc => 'xmlcanonicalize' },
 
 { oid => '2923', descr => 'map table contents to XML',
   proname => 'table_to_xml', procost => '100', provolatile => 's',
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index 361a6f9b27..4994f31778 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -1866,3 +1866,73 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
  x&lt;P&gt;73&lt;/P&gt;0.42truej
 (1 row)
 
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+  ('<?xml version="1.0" encoding="ISO-8859-1"?>
+  <!DOCTYPE doc SYSTEM "doc.dtd" [
+                  <!ENTITY val "42">
+      <!ATTLIST xyz attr CDATA "default">
+  ]>
+
+  <!-- attributes and namespces will be sorted -->
+  <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+      xmlns:b="http://www.ietf.org";
+      xmlns:a="http://www.w3.org";
+      xmlns="http://example.org";>
+
+    <!-- Normalization of whitespace in start and end tags -->
+    <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+    <bar     xmlns="" xmlns:a="http://www.w3.org";     >&val;</bar     >
+
+    <!-- empty element will be converted to start-end tag pair -->
+    <empty/>
+
+    <!-- text will be transcoded to UTF-8 -->
+    <transcode>&#49;</transcode>
+
+    <!-- whitespace inside tag will be preserved -->
+    <whitespace> 321 </whitespace>
+
+    <!-- empty namespace will be removed of child tag -->
+    <emptyns  xmlns="" >
+       <emptyns_child xmlns=""></emptyns_child>
+    </emptyns>
+
+    <!-- CDATA section will be replaced by its value -->
+    <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+  </foo>      <!-- comment outside root element -->          ');
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+                                                                                                                                                                                                                                                                                                                                                                                            xmlcanonicalize                                                                                                                                                                                                                                                                                                                                                                                             
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <!-- attributes and namespces will be sorted -->                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      +
+ <foo xmlns="http://example.org"; xmlns:a="http://www.w3.org"; xmlns:b="http://www.ietf.org"; attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> --><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute></foo>+
+ <!-- comment outside root element -->
+(1 row)
+
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+                                                                                                                                                                                   xmlcanonicalize                                                                                                                                                                                    
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <foo xmlns="http://example.org"; xmlns:a="http://www.w3.org"; xmlns:b="http://www.ietf.org"; attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute></foo>
+(1 row)
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+ xmlcanonicalize 
+-----------------
+ 
+(1 row)
+
+SELECT xmlcanonicalize(NULL, true);
+ xmlcanonicalize 
+-----------------
+ 
+(1 row)
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+ERROR:  invalid XML document
+SELECT xmlcanonicalize('  ', true);
+ERROR:  invalid XML document
+SELECT xmlcanonicalize('foo', true);
+ERROR:  invalid XML document
+\set VERBOSITY default
diff --git a/src/test/regress/expected/xml_1.out b/src/test/regress/expected/xml_1.out
index d26e10441e..640db2c05c 100644
--- a/src/test/regress/expected/xml_1.out
+++ b/src/test/regress/expected/xml_1.out
@@ -1477,3 +1477,72 @@ ERROR:  unsupported XML feature
 LINE 1: SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j':...
                              ^
 DETAIL:  This functionality requires the server to be built with libxml support.
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+  ('<?xml version="1.0" encoding="ISO-8859-1"?>
+  <!DOCTYPE doc SYSTEM "doc.dtd" [
+                  <!ENTITY val "42">
+      <!ATTLIST xyz attr CDATA "default">
+  ]>
+
+  <!-- attributes and namespces will be sorted -->
+  <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+      xmlns:b="http://www.ietf.org";
+      xmlns:a="http://www.w3.org";
+      xmlns="http://example.org";>
+
+    <!-- Normalization of whitespace in start and end tags -->
+    <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+    <bar     xmlns="" xmlns:a="http://www.w3.org";     >&val;</bar     >
+
+    <!-- empty element will be converted to start-end tag pair -->
+    <empty/>
+
+    <!-- text will be transcoded to UTF-8 -->
+    <transcode>&#49;</transcode>
+
+    <!-- whitespace inside tag will be preserved -->
+    <whitespace> 321 </whitespace>
+
+    <!-- empty namespace will be removed of child tag -->
+    <emptyns  xmlns="" >
+       <emptyns_child xmlns=""></emptyns_child>
+    </emptyns>
+
+    <!-- CDATA section will be replaced by its value -->
+    <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+  </foo>      <!-- comment outside root element -->          ');
+ERROR:  unsupported XML feature
+LINE 2:   ('<?xml version="1.0" encoding="ISO-8859-1"?>
+           ^
+DETAIL:  This functionality requires the server to be built with libxml support.
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+ xmlcanonicalize 
+-----------------
+(0 rows)
+
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+ xmlcanonicalize 
+-----------------
+(0 rows)
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+ xmlcanonicalize 
+-----------------
+(0 rows)
+
+SELECT xmlcanonicalize(NULL, true);
+ xmlcanonicalize 
+-----------------
+ 
+(1 row)
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+ERROR:  unsupported XML feature at character 24
+SELECT xmlcanonicalize('  ', true);
+ERROR:  unsupported XML feature at character 24
+SELECT xmlcanonicalize('foo', true);
+ERROR:  unsupported XML feature at character 24
+\set VERBOSITY default
diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out
index 73c2851d3f..89835430d5 100644
--- a/src/test/regress/expected/xml_2.out
+++ b/src/test/regress/expected/xml_2.out
@@ -1852,3 +1852,73 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
  x&lt;P&gt;73&lt;/P&gt;0.42truej
 (1 row)
 
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+  ('<?xml version="1.0" encoding="ISO-8859-1"?>
+  <!DOCTYPE doc SYSTEM "doc.dtd" [
+                  <!ENTITY val "42">
+      <!ATTLIST xyz attr CDATA "default">
+  ]>
+
+  <!-- attributes and namespces will be sorted -->
+  <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+      xmlns:b="http://www.ietf.org";
+      xmlns:a="http://www.w3.org";
+      xmlns="http://example.org";>
+
+    <!-- Normalization of whitespace in start and end tags -->
+    <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+    <bar     xmlns="" xmlns:a="http://www.w3.org";     >&val;</bar     >
+
+    <!-- empty element will be converted to start-end tag pair -->
+    <empty/>
+
+    <!-- text will be transcoded to UTF-8 -->
+    <transcode>&#49;</transcode>
+
+    <!-- whitespace inside tag will be preserved -->
+    <whitespace> 321 </whitespace>
+
+    <!-- empty namespace will be removed of child tag -->
+    <emptyns  xmlns="" >
+       <emptyns_child xmlns=""></emptyns_child>
+    </emptyns>
+
+    <!-- CDATA section will be replaced by its value -->
+    <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+  </foo>      <!-- comment outside root element -->          ');
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+                                                                                                                                                                                                                                                                                                                                                                                            xmlcanonicalize                                                                                                                                                                                                                                                                                                                                                                                             
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <!-- attributes and namespces will be sorted -->                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      +
+ <foo xmlns="http://example.org"; xmlns:a="http://www.w3.org"; xmlns:b="http://www.ietf.org"; attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> --><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute></foo>+
+ <!-- comment outside root element -->
+(1 row)
+
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+                                                                                                                                                                                   xmlcanonicalize                                                                                                                                                                                    
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <foo xmlns="http://example.org"; xmlns:a="http://www.w3.org"; xmlns:b="http://www.ietf.org"; attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute></foo>
+(1 row)
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+ xmlcanonicalize 
+-----------------
+ 
+(1 row)
+
+SELECT xmlcanonicalize(NULL, true);
+ xmlcanonicalize 
+-----------------
+ 
+(1 row)
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+ERROR:  invalid XML document
+SELECT xmlcanonicalize('  ', true);
+ERROR:  invalid XML document
+SELECT xmlcanonicalize('foo', true);
+ERROR:  invalid XML document
+\set VERBOSITY default
diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql
index f752ecb142..02ea3834ab 100644
--- a/src/test/regress/sql/xml.sql
+++ b/src/test/regress/sql/xml.sql
@@ -673,3 +673,52 @@ SELECT xmltext('  ');
 SELECT xmltext('foo `$_-+?=*^%!|/\()[]{}');
 SELECT xmltext('foo & <"bar">');
 SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
+
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+  ('<?xml version="1.0" encoding="ISO-8859-1"?>
+  <!DOCTYPE doc SYSTEM "doc.dtd" [
+                  <!ENTITY val "42">
+      <!ATTLIST xyz attr CDATA "default">
+  ]>
+
+  <!-- attributes and namespces will be sorted -->
+  <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+      xmlns:b="http://www.ietf.org";
+      xmlns:a="http://www.w3.org";
+      xmlns="http://example.org";>
+
+    <!-- Normalization of whitespace in start and end tags -->
+    <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+    <bar     xmlns="" xmlns:a="http://www.w3.org";     >&val;</bar     >
+
+    <!-- empty element will be converted to start-end tag pair -->
+    <empty/>
+
+    <!-- text will be transcoded to UTF-8 -->
+    <transcode>&#49;</transcode>
+
+    <!-- whitespace inside tag will be preserved -->
+    <whitespace> 321 </whitespace>
+
+    <!-- empty namespace will be removed of child tag -->
+    <emptyns  xmlns="" >
+       <emptyns_child xmlns=""></emptyns_child>
+    </emptyns>
+
+    <!-- CDATA section will be replaced by its value -->
+    <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+  </foo>      <!-- comment outside root element -->          ');
+
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+SELECT xmlcanonicalize(NULL, true);
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+SELECT xmlcanonicalize('  ', true);
+SELECT xmlcanonicalize('foo', true);
+\set VERBOSITY default
\ No newline at end of file
-- 
2.34.1

Re: [PATCH] Add CANONICAL option to xmlserialize

Reply via email to