Re: [HACKERS] proposal: function parse_ident

Pavel Stehule Tue, 16 Feb 2016 20:54:21 -0800

2016-02-17 1:38 GMT+01:00 Jim Nasby <[email protected]>:

> On 2/11/16 1:27 AM, Pavel Stehule wrote:
>
>>     I editorialized the docs and some of the comments. In particular, I
>>     documented behavior of not truncating, and recommended casting to
>>     name[] if user cares about that. Added a unit test to verify that
>>     works. BTW, I saw mention in the thread about not truncated spaces,
>>     but the function *does* truncate them, unless they're inside quotes,
>>     where they're legitimate.
>>
>>
>> ok
>>
>
> I missed some of my edits. Updated patch with those in place attached.
>
>     Also added test for invalid characters.
>>
>>     I think "strict" would be more in line with other uses in code.
>>     There are currently no other occurrences of 'strictmode' in the
>>     code. There are loads of references to 'strict', but I didn't go
>>     through all of them to see if any were used as externally visible
>>     function parameter names.
>>
>>
>> I am sorry, I don't understand this point. Do you dislike the name of the
>> parameter "strictmode"? Do you have any proposal? Maybe "restrictive"?
>>
>
> I would just call it strict. There's precedent for that in the code.
>


+1

fixed in attached patch


>
> Almost all of the code is more or less related to human-readable error messages. We
>> can move some code to separate static functions - read_quoted_ident,
>> read_unquoted_ident - but there will be some magic around the parameters, and
>> the code will not be much better than it is now.
>>
>
> What I'm saying is that most places that need to do de-quoting or similar
> just run a simple while loop and use an in_quote variable to track whether
> they're inside a quote or not. See backend/utils/adt/rowtypes.c line 199
> for an example.
>
> As I said, I don't have a strong opinion on it, so if you prefer it this
> way that's fine with me.


Yes, I don't see strong differences between for(;;) with break and
while(var). I prefer the current state.

Regards

Pavel


>
> --
> Jim Nasby, Data Architect, Blue Treble Consulting, Austin TX
> Experts in Analytics, Data Architecture and PostgreSQL
> Data in Trouble? Get it in Treble! http://BlueTreble.com
>

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
new file mode 100644
index f9eea76..9eed19a
*** a/doc/src/sgml/func.sgml
--- b/doc/src/sgml/func.sgml
***************
*** 1821,1826 ****
--- 1821,1847 ----
        <row>
         <entry>
          <indexterm>
+          <primary>parse_ident</primary>
+         </indexterm>
+         <literal><function>parse_ident(<parameter>str</parameter> <type>text</type>,
+            [ <parameter>strict</parameter> <type>boolean</type> DEFAULT true ] )</function></literal>
+        </entry>
+        <entry><type>text[]</type></entry>
+        <entry>Split <parameter>qualified identifier</parameter> into array <parameter>parts</parameter>.
+        When <parameter>strict</parameter> is false, extra characters after the identifier are ignored.
+        This is useful for parsing identifiers for objects like functions and arrays that may have trailing
+        characters. By default, extra characters after the last identifier are considered an error.
+        Note that this function does not truncate quoted identifiers. If you care about
+        that, you should cast the result of this function to
+        name[].
+        </entry>
+        <entry><literal>parse_ident('"SomeSchema".someTable')</literal></entry>
+        <entry><literal>{SomeSchema,sometable}</literal></entry>
+       </row>
+ 
+       <row>
+        <entry>
+         <indexterm>
           <primary>pg_client_encoding</primary>
          </indexterm>
          <literal><function>pg_client_encoding()</function></literal>
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
new file mode 100644
index 923fe58..9a65bc9
*** a/src/backend/catalog/system_views.sql
--- b/src/backend/catalog/system_views.sql
*************** RETURNS jsonb
*** 965,967 ****
--- 965,974 ----
  LANGUAGE INTERNAL
  STRICT IMMUTABLE
  AS 'jsonb_set';
+ 
+ CREATE OR REPLACE FUNCTION
+   parse_ident(str text, strict boolean DEFAULT true)
+ RETURNS text[]
+ LANGUAGE INTERNAL
+ STRICT IMMUTABLE
+ AS 'parse_ident';
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
new file mode 100644
index 2b4ab20..7aa5b76
*** a/src/backend/parser/scansup.c
--- b/src/backend/parser/scansup.c
*************** scanstr(const char *s)
*** 130,135 ****
--- 130,144 ----
  char *
  downcase_truncate_identifier(const char *ident, int len, bool warn)
  {
+ 	return downcase_identifier(ident, len, warn, true);
+ }
+ 
+ /*
+  * a workhorse for downcase_truncate_identifier
+  */
+ char *
+ downcase_identifier(const char *ident, int len, bool warn, bool truncate)
+ {
  	char	   *result;
  	int			i;
  	bool		enc_is_single_byte;
*************** downcase_truncate_identifier(const char
*** 158,169 ****
  	}
  	result[i] = '\0';
  
! 	if (i >= NAMEDATALEN)
  		truncate_identifier(result, i, warn);
  
  	return result;
  }
  
  /*
   * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
   *
--- 167,179 ----
  	}
  	result[i] = '\0';
  
! 	if (i >= NAMEDATALEN && truncate)
  		truncate_identifier(result, i, warn);
  
  	return result;
  }
  
+ 
  /*
   * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
   *
diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c
new file mode 100644
index 43f36db..3a8d9f5
*** a/src/backend/utils/adt/misc.c
--- b/src/backend/utils/adt/misc.c
***************
*** 21,32 ****
--- 21,35 ----
  #include <unistd.h>
  
  #include "access/sysattr.h"
+ #include "access/htup_details.h"
  #include "catalog/catalog.h"
+ #include "catalog/namespace.h"
  #include "catalog/pg_tablespace.h"
  #include "catalog/pg_type.h"
  #include "commands/dbcommands.h"
  #include "funcapi.h"
  #include "miscadmin.h"
+ #include "parser/scansup.h"
  #include "parser/keywords.h"
  #include "postmaster/syslogger.h"
  #include "rewrite/rewriteHandler.h"
***************
*** 38,43 ****
--- 41,47 ----
  #include "utils/ruleutils.h"
  #include "tcop/tcopprot.h"
  #include "utils/acl.h"
+ #include "utils/array.h"
  #include "utils/builtins.h"
  #include "utils/timestamp.h"
  
*************** pg_column_is_updatable(PG_FUNCTION_ARGS)
*** 719,721 ****
--- 723,896 ----
  
  	PG_RETURN_BOOL((events & REQ_EVENTS) == REQ_EVENTS);
  }
+ 
+ 
+ /*
+  * These simple parser utilities are compatible with the lexer implementation;
+  * they are used only by the parse_ident function.
+  */
+ static bool
+ is_ident_start(unsigned char c)
+ {
+ 	if (c == '_')
+ 		return true;
+ 	if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+ 		return true;
+ 
+ 	if (c >= 0200 && c <= 0377)
+ 		return true;
+ 
+ 	return false;
+ }
+ 
+ static bool
+ is_ident_cont(unsigned char c)
+ {
+ 	if (c >= '0' && c <= '9')
+ 		return true;
+ 
+ 	return is_ident_start(c);
+ }
+ 
+ /*
+  * parse_ident - parse a qualified SQL identifier into its separate identifiers.
+  * When strict mode is active (second parameter), any characters after
+  * the last identifier are disallowed.
+  */
+ Datum
+ parse_ident(PG_FUNCTION_ARGS)
+ {
+ 	text		*qualname;
+ 	char		*qualname_str;
+ 	bool		strict;
+ 	ArrayBuildState *astate = NULL;
+ 	char	*nextp;
+ 	bool		after_dot = false;
+ 
+ 	qualname = PG_GETARG_TEXT_PP(0);
+ 	qualname_str = text_to_cstring(qualname);
+ 	strict = PG_GETARG_BOOL(1);
+ 
+ 	nextp = qualname_str;
+ 
+ 	/* skip leading whitespace */
+ 	while (isspace((unsigned char) *nextp))
+ 		nextp++;
+ 
+ 	for (;;)
+ 	{
+ 		char		*curname;
+ 		char		*endp;
+ 		bool		missing_ident;
+ 
+ 		missing_ident = true;
+ 
+ 		if (*nextp == '\"')
+ 		{
+ 			curname = nextp + 1;
+ 			for (;;)
+ 			{
+ 				endp = strchr(nextp + 1, '\"');
+ 				if (endp == NULL)
+ 					ereport(ERROR,
+ 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 						 errmsg("unclosed double quotes"),
+ 						 errdetail("string \"%s\" is not valid identifier",
+ 													qualname_str)));
+ 				if (endp[1] != '\"')
+ 					break;
+ 				memmove(endp, endp + 1, strlen(endp));
+ 				nextp = endp;
+ 			}
+ 			nextp = endp + 1;
+ 			*endp = '\0';
+ 
+ 			if (endp - curname == 0)
+ 				ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("identifier should not be empty: \"%s\"",
+ 												qualname_str)));
+ 
+ 			astate = accumArrayResult(astate,
+ 				CStringGetTextDatum(curname), false,
+ 						    TEXTOID, CurrentMemoryContext);
+ 			missing_ident = false;
+ 		}
+ 		else
+ 		{
+ 			if (is_ident_start((unsigned char) *nextp))
+ 			{
+ 				char *downname;
+ 				int	len;
+ 				text	*part;
+ 
+ 				curname = nextp++;
+ 				while (is_ident_cont((unsigned char) *nextp))
+ 					nextp++;
+ 
+ 				len = nextp - curname;
+ 
+ 				/*
+ 				 * Unlike name, we don't implicitly truncate identifiers. This
+ 				 * is useful for allowing the user to check for specific parts
+ 				 * of the identifier being too long. It's easy enough for the
+ 				 * user to get the truncated names by casting our output to
+ 				 * name[].
+ 				 */
+ 				downname = downcase_identifier(curname, len, false, false);
+ 				part = cstring_to_text_with_len(downname, len);
+ 				astate = accumArrayResult(astate,
+ 					PointerGetDatum(part), false,
+ 							    TEXTOID, CurrentMemoryContext);
+ 				missing_ident = false;
+ 			}
+ 		}
+ 
+ 		if (missing_ident)
+ 		{
+ 			/* Different error messages based on where we failed. */
+ 			if (*nextp == '.')
+ 				ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("missing identifier before \".\" symbol: \"%s\"",
+ 												qualname_str)));
+ 			else if (after_dot)
+ 				ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("missing identifier after \".\" symbol: \"%s\"",
+ 												qualname_str)));
+ 			else
+ 				ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("missing identifier: \"%s\"",
+ 												qualname_str)));
+ 		}
+ 
+ 		while (isspace((unsigned char) *nextp))
+ 			nextp++;
+ 
+ 		if (*nextp == '.')
+ 		{
+ 			after_dot = true;
+ 			nextp++;
+ 			while (isspace((unsigned char) *nextp))
+ 				nextp++;
+ 			continue;
+ 		}
+ 		else if (*nextp == '\0')
+ 		{
+ 			break;
+ 		}
+ 		else
+ 		{
+ 			if (strict)
+ 				ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("identifier contains disallowed characters: \"%s\"",
+ 												qualname_str)));
+ 			break;
+ 		}
+ 	}
+ 
+ 	PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
+ }
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
new file mode 100644
index b24e434..8354a82
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
*************** DESCR("I/O");
*** 3457,3462 ****
--- 3457,3465 ----
  DATA(insert OID = 4086 (  to_regnamespace	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 4089 "25" _null_ _null_ _null_ _null_ _null_ to_regnamespace _null_ _null_ _null_ ));
  DESCR("convert namespace name to regnamespace");
  
+ DATA(insert OID = 3318 (  parse_ident		PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 1009 "25 16" _null_ _null_ "{str,strict}" _null_ _null_ parse_ident _null_ _null_ _null_ ));
+ DESCR("parse qualified identifier to array of identifiers");
+ 
  DATA(insert OID = 2246 ( fmgr_internal_validator PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 2278 "26" _null_ _null_ _null_ _null_ _null_ fmgr_internal_validator _null_ _null_ _null_ ));
  DESCR("(internal)");
  DATA(insert OID = 2247 ( fmgr_c_validator	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 2278 "26" _null_ _null_ _null_ _null_ _null_ fmgr_c_validator _null_ _null_ _null_ ));
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
new file mode 100644
index 4f4164b..4f95c81
*** a/src/include/parser/scansup.h
--- b/src/include/parser/scansup.h
*************** extern char *scanstr(const char *s);
*** 20,25 ****
--- 20,28 ----
  extern char *downcase_truncate_identifier(const char *ident, int len,
  							 bool warn);
  
+ extern char *downcase_identifier(const char *ident, int len,
+ 							 bool warn, bool truncate);
+ 
  extern void truncate_identifier(char *ident, int len, bool warn);
  
  extern bool scanner_isspace(char ch);
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
new file mode 100644
index affcc01..b22a6b8
*** a/src/include/utils/builtins.h
--- b/src/include/utils/builtins.h
*************** extern Datum pg_typeof(PG_FUNCTION_ARGS)
*** 508,513 ****
--- 508,514 ----
  extern Datum pg_collation_for(PG_FUNCTION_ARGS);
  extern Datum pg_relation_is_updatable(PG_FUNCTION_ARGS);
  extern Datum pg_column_is_updatable(PG_FUNCTION_ARGS);
+ extern Datum parse_ident(PG_FUNCTION_ARGS);
  
  /* oid.c */
  extern Datum oidin(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/name.out b/src/test/regress/expected/name.out
new file mode 100644
index b359d52..2a8876f
*** a/src/test/regress/expected/name.out
--- b/src/test/regress/expected/name.out
*************** SELECT '' AS two, c.f1 FROM NAME_TBL c W
*** 124,126 ****
--- 124,178 ----
  (2 rows)
  
  DROP TABLE NAME_TBL;
+ DO $$
+ DECLARE r text[];
+ BEGIN
+   r := parse_ident('Schemax.Tabley');
+   RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+   r := parse_ident('"SchemaX"."TableY"');
+   RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+ END;
+ $$;
+ NOTICE:  schemax.tabley
+ NOTICE:  "SchemaX"."TableY"
+ SELECT parse_ident('foo.boo');
+  parse_ident 
+ -------------
+  {foo,boo}
+ (1 row)
+ 
+ SELECT parse_ident('foo.boo[]'); -- should fail
+ ERROR:  identifier contains disallowed characters: "foo.boo[]"
+ SELECT parse_ident('foo.boo[]', strict => false); -- ok
+  parse_ident 
+ -------------
+  {foo,boo}
+ (1 row)
+ 
+ -- should fail
+ SELECT parse_ident(' ');
+ ERROR:  missing identifier: " "
+ SELECT parse_ident(' .aaa');
+ ERROR:  missing identifier before "." symbol: " .aaa"
+ SELECT parse_ident(' aaa . ');
+ ERROR:  missing identifier after "." symbol: " aaa . "
+ SELECT parse_ident('aaa.a%b');
+ ERROR:  identifier contains disallowed characters: "aaa.a%b"
+ SELECT length(a[1]), length(a[2]) from parse_ident('"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx".yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy') as a ;
+  length | length 
+ --------+--------
+     414 |    289
+ (1 row)
+ 
+ SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || repeat('x',66) || '"');
+                                                 parse_ident                                                
+ -----------------------------------------------------------------------------------------------------------
+  {first,"  second  ","   third   ","  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
+ (1 row)
+ 
+ SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || repeat('x',66) || '"')::name[];
+                                              parse_ident                                              
+ ------------------------------------------------------------------------------------------------------
+  {first,"  second  ","   third   ","  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
+ (1 row)
+ 
diff --git a/src/test/regress/sql/name.sql b/src/test/regress/sql/name.sql
new file mode 100644
index 1c7a671..90b7ea0
*** a/src/test/regress/sql/name.sql
--- b/src/test/regress/sql/name.sql
*************** SELECT '' AS three, c.f1 FROM NAME_TBL c
*** 52,54 ****
--- 52,79 ----
  SELECT '' AS two, c.f1 FROM NAME_TBL c WHERE c.f1 ~ '.*asdf.*';
  
  DROP TABLE NAME_TBL;
+ 
+ DO $$
+ DECLARE r text[];
+ BEGIN
+   r := parse_ident('Schemax.Tabley');
+   RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+   r := parse_ident('"SchemaX"."TableY"');
+   RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+ END;
+ $$;
+ 
+ SELECT parse_ident('foo.boo');
+ SELECT parse_ident('foo.boo[]'); -- should fail
+ SELECT parse_ident('foo.boo[]', strict => false); -- ok
+ 
+ -- should fail
+ SELECT parse_ident(' ');
+ SELECT parse_ident(' .aaa');
+ SELECT parse_ident(' aaa . ');
+ SELECT parse_ident('aaa.a%b');
+ 
+ SELECT length(a[1]), length(a[2]) from parse_ident('"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx".yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy') as a ;
+ 
+ SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || repeat('x',66) || '"');
+ SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || repeat('x',66) || '"')::name[];

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] proposal: function parse_ident

Reply via email to