Hi Pavel,

Thanks for taking the time to review my patch. Attached is a new version addressing your concerns.

On 29/07/10 14:21, Pavel Stehule wrote:
I have a few issues:
* broken regress test (fedora 13 - xmllint: using libxml version 20707)

postgres=# SELECT xml_is_well_formed('<pg:foo
xmlns:pg="http://postgresql.org/stuff";;>bar</pg:foo>');
  xml_is_well_formed
--------------------
  f
(1 row)

this xml is broken - but in regress tests is ok

[pa...@pavel-stehule ~]$ xmllint xxx
xxx:1: parser error : error parsing attribute name
<pg:foo xmlns:pg="http://postgresql.org/stuff";;>bar</pg:foo>

This is a problem that was observered recently by Thom Brown - the commit fest webapp adds the semicolon after the quote. If you look at the attachment I sent in a email client you'll find there is no semicolon. The patch attached here will also not have the semicolon.

* xml_is_well_formed returns true for simple text

postgres=# SELECT xml_is_well_formed('ssss');
  xml_is_well_formed
--------------------
  t
(1 row)

it is probably wrong result - is it ok??

Yes this is OK, pure text is valid XML content.

* I don't understand to this fragment

        PG_TRY();
+       {
+               size_t          count;
+               xmlChar    *version = NULL;
+               int                     standalone = -1;
+.
+               res_code = parse_xml_decl(string,&count,&version,
NULL,&standalone);
+               if (res_code != 0)
+                       xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT,
+                                                 "invalid XML
content: invalid XML declaration",
+                                                       res_code);
+.
+               doc = xmlNewDoc(version);
+               doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
+               doc->standalone = 1;

why? This function can raise exception when declaration is wrong. It
is wrong - I think, this function should returns false instead
exception.


You're quite right, I should be returning false here and not causing an exception. I have corrected this in the attached patch.

Regards,

--
Mike Fowler
Registered Linux user: 379787

*** a/contrib/xml2/xpath.c
--- b/contrib/xml2/xpath.c
***************
*** 27,33 **** PG_MODULE_MAGIC;
  
  /* externally accessible functions */
  
- Datum		xml_is_well_formed(PG_FUNCTION_ARGS);
  Datum		xml_encode_special_chars(PG_FUNCTION_ARGS);
  Datum		xpath_nodeset(PG_FUNCTION_ARGS);
  Datum		xpath_string(PG_FUNCTION_ARGS);
--- 27,32 ----
***************
*** 70,97 **** pgxml_parser_init(void)
  	xmlLoadExtDtdDefaultValue = 1;
  }
  
- 
- /* Returns true if document is well-formed */
- 
- PG_FUNCTION_INFO_V1(xml_is_well_formed);
- 
- Datum
- xml_is_well_formed(PG_FUNCTION_ARGS)
- {
- 	text	   *t = PG_GETARG_TEXT_P(0);		/* document buffer */
- 	int32		docsize = VARSIZE(t) - VARHDRSZ;
- 	xmlDocPtr	doctree;
- 
- 	pgxml_parser_init();
- 
- 	doctree = xmlParseMemory((char *) VARDATA(t), docsize);
- 	if (doctree == NULL)
- 		PG_RETURN_BOOL(false);	/* i.e. not well-formed */
- 	xmlFreeDoc(doctree);
- 	PG_RETURN_BOOL(true);
- }
- 
- 
  /* Encodes special characters (<, >, &, " and \r) as XML entities */
  
  PG_FUNCTION_INFO_V1(xml_encode_special_chars);
--- 69,74 ----
*** a/doc/src/sgml/func.sgml
--- b/doc/src/sgml/func.sgml
***************
*** 8554,8562 **** SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab;
  ]]></screen>
      </para>
     </sect3>
  
     <sect3>
!     <title>XML Predicates</title>
  
      <indexterm>
       <primary>IS DOCUMENT</primary>
--- 8554,8566 ----
  ]]></screen>
      </para>
     </sect3>
+   </sect2>
+ 
+   <sect2>
+    <title>XML Predicates</title>
  
     <sect3>
!     <title>IS DOCUMENT</title>
  
      <indexterm>
       <primary>IS DOCUMENT</primary>
***************
*** 8574,8579 **** SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab;
--- 8578,8675 ----
       between documents and content fragments.
      </para>
     </sect3>
+ 
+    <sect3>
+     <title>xml_is_well_formed</title>
+ 
+     <indexterm>
+      <primary>xml_is_well_formed</primary>
+      <secondary>well formed</secondary>
+     </indexterm>
+ 
+ <synopsis>
+ <function>xml_is_well_formed</function>(<replaceable>text</replaceable>)
+ </synopsis>
+ 
+     <para>
+      The function <function>xml_is_well_formed</function> evaluates whether
+      the <replaceable>text</replaceable> is well formed XML content, returning
+      a boolean.
+     </para>
+     <para>
+     Example:
+ <screen><![CDATA[
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+  xml_is_well_formed
+ --------------------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo');
+  xml_is_well_formed
+ --------------------
+  f
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo><bar>stuff</foo>');
+  xml_is_well_formed
+ --------------------
+  f
+ (1 row)
+ ]]></screen>
+     </para>
+     <para>
+     In addition to the structure checks, the function ensures that namespaces are correcty matched.
+ <screen><![CDATA[
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff";>bar</my:foo>');
+  xml_is_well_formed
+ --------------------
+  f
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff";>bar</pg:foo>');
+  xml_is_well_formed
+ --------------------
+  t
+ (1 row)
+ ]]></screen>
+     </para>
+     <para>
+     This function can be combined with the IS DOCUMENT predicate to prevent
+     invalid XML content errors from occuring in queries. For example, given a
+     table that may have rows with invalid XML mixed in with rows of valid
+     XML, <function>xml_is_well_formed</function> can be used to filter out all
+     the invalid rows.
+     </para>
+     <para>
+     Example:
+ <screen><![CDATA[
+ SELECT * FROM mixed;
+              data
+ ------------------------------
+  <foo>bar</foo>
+  <foo>bar</foo
+  <foo>bar</foo><bar>foo</bar>
+  <foo>bar</foo><bar>foo</bar
+ (4 rows)
+ 
+ SELECT COUNT(data) FROM mixed WHERE data::xml IS DOCUMENT;
+ ERROR:  invalid XML content
+ DETAIL:  Entity: line 1: parser error : expected '>'
+ <foo>bar</foo
+              ^
+ Entity: line 1: parser error : chunk is not well balanced
+ <foo>bar</foo
+              ^
+ 
+ SELECT COUNT(data) FROM mixed WHERE xml_is_well_formed(data) AND data::xml IS DOCUMENT;
+  count
+ -------
+      1
+ (1 row)
+ ]]></screen>
+     </para>
+    </sect3>
    </sect2>
  
    <sect2 id="functions-xml-processing">
*** a/src/backend/utils/adt/xml.c
--- b/src/backend/utils/adt/xml.c
***************
*** 3293,3298 **** xml_xmlnodetoxmltype(xmlNodePtr cur)
--- 3293,3366 ----
  }
  #endif
  
+ Datum
+ xml_is_well_formed(PG_FUNCTION_ARGS)
+ {
+ #ifdef USE_LIBXML
+ 	text				*data = PG_GETARG_TEXT_P(0);
+ 	bool				result;
+ 	int					res_code;
+ 	int32				len;
+ 	const xmlChar		*string;
+ 	xmlParserCtxtPtr	ctxt;
+ 	xmlDocPtr			doc = NULL;
+ 
+ 	len = VARSIZE(data) - VARHDRSZ;
+ 	string = xml_text2xmlChar(data);
+ 
+ 	/* Start up libxml and its parser (no-ops if already done) */
+ 	pg_xml_init();
+ 	xmlInitParser();
+ 
+ 	ctxt = xmlNewParserCtxt();
+ 	if (ctxt == NULL)
+ 		xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY,
+ 					"could not allocate parser context");
+ 
+ 	PG_TRY();
+ 	{
+ 		size_t		count;
+ 		xmlChar    *version = NULL;
+ 		int			standalone = -1;
+ 
+ 		res_code = parse_xml_decl(string, &count, &version, NULL, &standalone);
+ 		if (res_code != 0)
+ 			result = 0;
+ 		else
+ 		{
+ 
+ 			doc = xmlNewDoc(version);
+ 			doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
+ 			doc->standalone = 1;
+ 
+ 			res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, string + count, NULL);
+ 
+ 			result = !res_code;
+ 		}
+ 	}
+ 	PG_CATCH();
+ 	{
+ 		if (doc)
+ 			xmlFreeDoc(doc);
+ 		if (ctxt)
+ 			xmlFreeParserCtxt(ctxt);
+ 
+ 		PG_RE_THROW();
+ 	}
+ 	PG_END_TRY();
+ 
+ 	if (doc)
+ 		xmlFreeDoc(doc);
+ 	if (ctxt)
+ 		xmlFreeParserCtxt(ctxt);
+ 
+ 	return result;
+ #else
+ 	NO_XML_SUPPORT();
+ 	return 0;
+ #endif
+ }
+ 
  
  /*
   * Evaluate XPath expression and return array of XML values.
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 4391,4396 **** DESCR("evaluate XPath expression, with namespaces support");
--- 4391,4399 ----
  DATA(insert OID = 2932 (  xpath		 PGNSP PGUID 14 1 0 0 f f f t f i 2 0 143 "25 142" _null_ _null_ _null_ _null_ "select pg_catalog.xpath($1, $2, ''{}''::pg_catalog.text[])" _null_ _null_ _null_ ));
  DESCR("evaluate XPath expression");
  
+ DATA(insert OID = 3037 (  xml_is_well_formed	PGNSP PGUID 12 1 0 0 f f f t f i 1 0 16 "25" _null_ _null_ _null_ _null_ xml_is_well_formed _null_ _null_ _null_ ));
+ DESCR("determine if a text fragment is well formed XML");
+ 
  /* uuid */
  DATA(insert OID = 2952 (  uuid_in		   PGNSP PGUID 12 1 0 0 f f f t f i 1 0 2950 "2275" _null_ _null_ _null_ _null_ uuid_in _null_ _null_ _null_ ));
  DESCR("I/O");
*** a/src/include/utils/xml.h
--- b/src/include/utils/xml.h
***************
*** 46,51 **** extern Datum query_to_xmlschema(PG_FUNCTION_ARGS);
--- 46,52 ----
  extern Datum cursor_to_xmlschema(PG_FUNCTION_ARGS);
  extern Datum table_to_xml_and_xmlschema(PG_FUNCTION_ARGS);
  extern Datum query_to_xml_and_xmlschema(PG_FUNCTION_ARGS);
+ extern Datum xml_is_well_formed(PG_FUNCTION_ARGS);
  
  extern Datum schema_to_xml(PG_FUNCTION_ARGS);
  extern Datum schema_to_xmlschema(PG_FUNCTION_ARGS);
*** a/src/test/regress/expected/xml.out
--- b/src/test/regress/expected/xml.out
***************
*** 502,504 **** SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>');
--- 502,577 ----
   {<b>two</b>,<b>etc</b>}
  (1 row)
  
+ -- Test xml_is_well_formed
+ SELECT xml_is_well_formed('<>');
+  xml_is_well_formed 
+ --------------------
+  f
+ (1 row)
+ 
+ SELECT xml_is_well_formed('abc');
+  xml_is_well_formed 
+ --------------------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<abc/>');
+  xml_is_well_formed 
+ --------------------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+  xml_is_well_formed 
+ --------------------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo');
+  xml_is_well_formed 
+ --------------------
+  f
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo><bar>baz</foo>');
+  xml_is_well_formed 
+ --------------------
+  f
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<local:data xmlns:local="http://127.0.0.1";><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>');
+  xml_is_well_formed 
+ --------------------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo>') AND '<foo>bar</foo>' IS DOCUMENT;
+  ?column? 
+ ----------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo>baz') AND '<foo>bar</foo>baz' IS NOT DOCUMENT;
+  ?column? 
+ ----------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<foo>bar</foo><bar>foo</bar>') AND '<foo>bar</foo><bar>foo</bar>' IS NOT DOCUMENT;
+  ?column? 
+ ----------
+  t
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff";>bar</my:foo>');
+  xml_is_well_formed 
+ --------------------
+  f
+ (1 row)
+ 
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff";>bar</pg:foo>');
+  xml_is_well_formed 
+ --------------------
+  t
+ (1 row)
+ 
*** a/src/test/regress/sql/xml.sql
--- b/src/test/regress/sql/xml.sql
***************
*** 163,165 **** SELECT xpath('', '<!-- error -->');
--- 163,180 ----
  SELECT xpath('//text()', '<local:data xmlns:local="http://127.0.0.1";><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>');
  SELECT xpath('//loc:piece/@id', '<local:data xmlns:local="http://127.0.0.1";><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>', ARRAY[ARRAY['loc', 'http://127.0.0.1']]);
  SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>');
+ 
+ -- Test xml_is_well_formed
+ 
+ SELECT xml_is_well_formed('<>');
+ SELECT xml_is_well_formed('abc');
+ SELECT xml_is_well_formed('<abc/>');
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+ SELECT xml_is_well_formed('<foo>bar</foo');
+ SELECT xml_is_well_formed('<foo><bar>baz</foo>');
+ SELECT xml_is_well_formed('<local:data xmlns:local="http://127.0.0.1";><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>');
+ SELECT xml_is_well_formed('<foo>bar</foo>') AND '<foo>bar</foo>' IS DOCUMENT;
+ SELECT xml_is_well_formed('<foo>bar</foo>baz') AND '<foo>bar</foo>baz' IS NOT DOCUMENT;
+ SELECT xml_is_well_formed('<foo>bar</foo><bar>foo</bar>') AND '<foo>bar</foo><bar>foo</bar>' IS NOT DOCUMENT;
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff";>bar</my:foo>');
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff";>bar</pg:foo>');
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to