[HACKERS] allow COPY routines to read arbitrary numbers of fields

Andrew Dunstan Sun, 05 Dec 2010 09:52:56 -0800

Attached is a patch that allows CopyReadAttributesText() and CopyReadAttributesCSV() to read arbitrary numbers of attributes. Underflowing attributes are recorded as null, and space is made for overflowing attributes on a line.

This patch doesn't result in any change in user-visible behavior. The current calling code will fail if the number of attributes read is not what is expected, as happens now. But it will allow the API to be used (when exposed) by a foreign data wrapper that can accept arbitrary numbers of attributes. My aim here is to get to something like:


   CREATE FOREIGN TABLE my_csv (
        t text[]
   )
   SERVER file_server
   OPTIONS (format 'csv', filename '/path/to/my/data.csv', textarray
   'true',
             header 'true', delimiter ';', quote '@', escape '"', null '');

   SELECT t[3] as f1, t[1] as f2, t[9999] as probably_null
   FROM my_csv;

It would probably be nice to apply this before we start exposing the COPY API to FDW routines, as discussed earlier today.


cheers

andrew

*** a/src/backend/commands/copy.c
--- b/src/backend/commands/copy.c
***************
*** 141,146 **** typedef struct CopyStateData
--- 141,151 ----
  	 */
  	StringInfoData attribute_buf;
  
+ 	/* field raw data pointers found by COPY FROM */
+ 
+ 	int max_fields;
+ 	char ** raw_fields;
+ 
  	/*
  	 * Similarly, line_buf holds the whole input line being processed. The
  	 * input cycle is first to read the whole line into line_buf, convert it
***************
*** 250,259 **** static void CopyOneRowTo(CopyState cstate, Oid tupleOid,
  static void CopyFrom(CopyState cstate);
  static bool CopyReadLine(CopyState cstate);
  static bool CopyReadLineText(CopyState cstate);
! static int CopyReadAttributesText(CopyState cstate, int maxfields,
! 					   char **fieldvals);
! static int CopyReadAttributesCSV(CopyState cstate, int maxfields,
! 					  char **fieldvals);
  static Datum CopyReadBinaryAttribute(CopyState cstate,
  						int column_no, FmgrInfo *flinfo,
  						Oid typioparam, int32 typmod,
--- 255,262 ----
  static void CopyFrom(CopyState cstate);
  static bool CopyReadLine(CopyState cstate);
  static bool CopyReadLineText(CopyState cstate);
! static int CopyReadAttributesText(CopyState cstate, int maxfields);
! static int CopyReadAttributesCSV(CopyState cstate, int maxfields);
  static Datum CopyReadBinaryAttribute(CopyState cstate,
  						int column_no, FmgrInfo *flinfo,
  						Oid typioparam, int32 typmod,
***************
*** 1679,1685 **** CopyFrom(CopyState cstate)
  	Oid			in_func_oid;
  	Datum	   *values;
  	bool	   *nulls;
! 	int			nfields;
  	char	  **field_strings;
  	bool		done = false;
  	bool		isnull;
--- 1682,1688 ----
  	Oid			in_func_oid;
  	Datum	   *values;
  	bool	   *nulls;
! 	int			nfields = 0;
  	char	  **field_strings;
  	bool		done = false;
  	bool		isnull;
***************
*** 1920,1927 **** CopyFrom(CopyState cstate)
  	nulls = (bool *) palloc(num_phys_attrs * sizeof(bool));
  
  	/* create workspace for CopyReadAttributes results */
! 	nfields = file_has_oids ? (attr_count + 1) : attr_count;
! 	field_strings = (char **) palloc(nfields * sizeof(char *));
  
  	/* Initialize state variables */
  	cstate->fe_eof = false;
--- 1923,1934 ----
  	nulls = (bool *) palloc(num_phys_attrs * sizeof(bool));
  
  	/* create workspace for CopyReadAttributes results */
! 	if (! cstate->binary)
! 	{
! 		nfields = file_has_oids ? (attr_count + 1) : attr_count;
! 		cstate->max_fields = nfields;
! 		cstate->raw_fields = (char **) palloc(nfields * sizeof(char *));
! 	}
  
  	/* Initialize state variables */
  	cstate->fe_eof = false;
***************
*** 1985,1994 **** CopyFrom(CopyState cstate)
  
  			/* Parse the line into de-escaped field values */
  			if (cstate->csv_mode)
! 				fldct = CopyReadAttributesCSV(cstate, nfields, field_strings);
  			else
! 				fldct = CopyReadAttributesText(cstate, nfields, field_strings);
  			fieldno = 0;
  
  			/* Read the OID field if present */
  			if (file_has_oids)
--- 1992,2009 ----
  
  			/* Parse the line into de-escaped field values */
  			if (cstate->csv_mode)
! 				fldct = CopyReadAttributesCSV(cstate, nfields);
  			else
! 				fldct = CopyReadAttributesText(cstate, nfields);
! 
! 			/* check for overflowing fields */
! 			if (nfields > 0 && fldct > nfields)
! 				ereport(ERROR,
! 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 						 errmsg("extra data after last expected column")));
! 
  			fieldno = 0;
+ 			field_strings = cstate->raw_fields;
  
  			/* Read the OID field if present */
  			if (file_has_oids)
***************
*** 2218,2224 **** CopyFrom(CopyState cstate)
  
  	pfree(values);
  	pfree(nulls);
! 	pfree(field_strings);
  
  	pfree(in_functions);
  	pfree(typioparams);
--- 2233,2240 ----
  
  	pfree(values);
  	pfree(nulls);
! 	if (! cstate->binary)
! 		pfree(cstate->raw_fields);
  
  	pfree(in_functions);
  	pfree(typioparams);
***************
*** 2717,2737 **** GetDecimalFromHex(char hex)
   * performing de-escaping as needed.
   *
   * The input is in line_buf.  We use attribute_buf to hold the result
!  * strings.  fieldvals[k] is set to point to the k'th attribute string,
!  * or NULL when the input matches the null marker string.  (Note that the
!  * caller cannot check for nulls since the returned string would be the
!  * post-de-escaping equivalent, which may look the same as some valid data
!  * string.)
   *
   * delim is the column delimiter string (must be just one byte for now).
   * null_print is the null marker string.  Note that this is compared to
   * the pre-de-escaped input string.
   *
!  * The return value is the number of fields actually read.	(We error out
!  * if this would exceed maxfields, which is the length of fieldvals[].)
   */
  static int
! CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
  {
  	char		delimc = cstate->delim[0];
  	int			fieldno;
--- 2733,2754 ----
   * performing de-escaping as needed.
   *
   * The input is in line_buf.  We use attribute_buf to hold the result
!  * strings.  cstate->raw_fields[k] is set to point to the k'th attribute
!  * string, or NULL when the input matches the null marker string.
!  * This array is expanded as necessary.
!  *
!  * (Note that the caller cannot check for nulls since the returned 
!  * string would be the post-de-escaping equivalent, which may look 
!  * the same as some valid data string.)
   *
   * delim is the column delimiter string (must be just one byte for now).
   * null_print is the null marker string.  Note that this is compared to
   * the pre-de-escaped input string.
   *
!  * The return value is the number of fields actually read.
   */
  static int
! CopyReadAttributesText(CopyState cstate, int maxfields)
  {
  	char		delimc = cstate->delim[0];
  	int			fieldno;
***************
*** 2759,2765 **** CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
  	 * data line, so we can just force attribute_buf to be large enough and
  	 * then transfer data without any checks for enough space.	We need to do
  	 * it this way because enlarging attribute_buf mid-stream would invalidate
! 	 * pointers already stored into fieldvals[].
  	 */
  	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
  		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
--- 2776,2782 ----
  	 * data line, so we can just force attribute_buf to be large enough and
  	 * then transfer data without any checks for enough space.	We need to do
  	 * it this way because enlarging attribute_buf mid-stream would invalidate
! 	 * pointers already stored into cstate->raw_fields[].
  	 */
  	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
  		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
***************
*** 2779,2793 **** CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
  		int			input_len;
  		bool		saw_non_ascii = false;
  
! 		/* Make sure space remains in fieldvals[] */
! 		if (fieldno >= maxfields)
! 			ereport(ERROR,
! 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 					 errmsg("extra data after last expected column")));
  
  		/* Remember start of field on both input and output sides */
  		start_ptr = cur_ptr;
! 		fieldvals[fieldno] = output_ptr;
  
  		/* Scan data for field */
  		for (;;)
--- 2796,2813 ----
  		int			input_len;
  		bool		saw_non_ascii = false;
  
! 		/* Make sure there is enough space for the next value */
! 		if (fieldno >= cstate->max_fields)
! 		{
! 			cstate->max_fields *= 2;
! 			cstate->raw_fields = 
! 				repalloc(cstate->raw_fields, cstate->max_fields*sizeof(char *));
! 		}
  
  		/* Remember start of field on both input and output sides */
  		start_ptr = cur_ptr;
! 		cstate->raw_fields[fieldno] = output_ptr;
! 
  
  		/* Scan data for field */
  		for (;;)
***************
*** 2912,2918 **** CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
  		 */
  		if (saw_non_ascii)
  		{
! 			char	   *fld = fieldvals[fieldno];
  
  			pg_verifymbstr(fld, output_ptr - (fld + 1), false);
  		}
--- 2932,2938 ----
  		 */
  		if (saw_non_ascii)
  		{
! 			char	   *fld = cstate->raw_fields[fieldno];
  
  			pg_verifymbstr(fld, output_ptr - (fld + 1), false);
  		}
***************
*** 2921,2927 **** CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
  		input_len = end_ptr - start_ptr;
  		if (input_len == cstate->null_print_len &&
  			strncmp(start_ptr, cstate->null_print, input_len) == 0)
! 			fieldvals[fieldno] = NULL;
  
  		fieldno++;
  		/* Done if we hit EOL instead of a delim */
--- 2941,2947 ----
  		input_len = end_ptr - start_ptr;
  		if (input_len == cstate->null_print_len &&
  			strncmp(start_ptr, cstate->null_print, input_len) == 0)
! 			cstate->raw_fields[fieldno] = NULL;
  
  		fieldno++;
  		/* Done if we hit EOL instead of a delim */
***************
*** 2944,2950 **** CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
   * "standard" (i.e. common) CSV usage.
   */
  static int
! CopyReadAttributesCSV(CopyState cstate, int maxfields, char **fieldvals)
  {
  	char		delimc = cstate->delim[0];
  	char		quotec = cstate->quote[0];
--- 2964,2970 ----
   * "standard" (i.e. common) CSV usage.
   */
  static int
! CopyReadAttributesCSV(CopyState cstate, int maxfields)
  {
  	char		delimc = cstate->delim[0];
  	char		quotec = cstate->quote[0];
***************
*** 2974,2980 **** CopyReadAttributesCSV(CopyState cstate, int maxfields, char **fieldvals)
  	 * data line, so we can just force attribute_buf to be large enough and
  	 * then transfer data without any checks for enough space.	We need to do
  	 * it this way because enlarging attribute_buf mid-stream would invalidate
! 	 * pointers already stored into fieldvals[].
  	 */
  	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
  		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
--- 2994,3000 ----
  	 * data line, so we can just force attribute_buf to be large enough and
  	 * then transfer data without any checks for enough space.	We need to do
  	 * it this way because enlarging attribute_buf mid-stream would invalidate
! 	 * pointers already stored into cstate->raw_fields[].
  	 */
  	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
  		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
***************
*** 2994,3008 **** CopyReadAttributesCSV(CopyState cstate, int maxfields, char **fieldvals)
  		char	   *end_ptr;
  		int			input_len;
  
! 		/* Make sure space remains in fieldvals[] */
! 		if (fieldno >= maxfields)
! 			ereport(ERROR,
! 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 					 errmsg("extra data after last expected column")));
  
  		/* Remember start of field on both input and output sides */
  		start_ptr = cur_ptr;
! 		fieldvals[fieldno] = output_ptr;
  
  		/*
  		 * Scan data for field,
--- 3014,3030 ----
  		char	   *end_ptr;
  		int			input_len;
  
! 		/* Make sure there is enough space for the next value */
! 		if (fieldno >= cstate->max_fields)
! 		{
! 			cstate->max_fields *= 2;
! 			cstate->raw_fields = 
! 				repalloc(cstate->raw_fields, cstate->max_fields*sizeof(char *));
! 		}
  
  		/* Remember start of field on both input and output sides */
  		start_ptr = cur_ptr;
! 		cstate->raw_fields[fieldno] = output_ptr;
  
  		/*
  		 * Scan data for field,
***************
*** 3090,3096 **** endfield:
  		input_len = end_ptr - start_ptr;
  		if (!saw_quote && input_len == cstate->null_print_len &&
  			strncmp(start_ptr, cstate->null_print, input_len) == 0)
! 			fieldvals[fieldno] = NULL;
  
  		fieldno++;
  		/* Done if we hit EOL instead of a delim */
--- 3112,3118 ----
  		input_len = end_ptr - start_ptr;
  		if (!saw_quote && input_len == cstate->null_print_len &&
  			strncmp(start_ptr, cstate->null_print, input_len) == 0)
! 			cstate->raw_fields[fieldno] = NULL;
  
  		fieldno++;
  		/* Done if we hit EOL instead of a delim */

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] allow COPY routines to read arbitrary numbers of fields

Reply via email to