Tom Lane wrote:
Josh Berkus <j...@agliodbs.com> writes:
It's not as if we don't have the ability to measure performance impact.
 It's reasonable to make a requirement that new options to COPY
shouldn't slow it down noticeably if those options aren't used.  And we
can test that, and even make such testing part of the patch review.

Really?  Where is your agreed-on, demonstrated-to-be-reproducible
benchmark for COPY speed?

My experience is that reliably measuring performance costs in the
percent-or-so range is *hard*.  It's only after you've added a few of
them and they start to mount up that it becomes obvious that all those
insignificant additions really did cost something.

Well, I strongly suspect that the cost of the patch I'm working with is not in the percent-or-so range, and much more likely to be in the tiny-fraction-of-a-percent range. The total overhead in the non-ragged case is one extra test per field, plus one per null field, plus two tests per line.

But since you raise the question I'll conduct some tests and then you can criticize those. Ruling out tests a priori seems a bit extreme.

The current patch is attached for information (and in case anyone else wants to try some testing).

cheers

andrew
Index: src/backend/commands/copy.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.316
diff -c -r1.316 copy.c
*** src/backend/commands/copy.c	29 Jul 2009 20:56:18 -0000	1.316
--- src/backend/commands/copy.c	13 Sep 2009 02:57:16 -0000
***************
*** 116,121 ****
--- 116,122 ----
  	char	   *escape;			/* CSV escape char (must be 1 byte) */
  	bool	   *force_quote_flags;		/* per-column CSV FQ flags */
  	bool	   *force_notnull_flags;	/* per-column CSV FNN flags */
+ 	bool        ragged;         /* allow ragged CSV input? */
  
  	/* these are just for error messages, see copy_in_error_callback */
  	const char *cur_relname;	/* table name for error messages */
***************
*** 822,827 ****
--- 823,836 ----
  						 errmsg("conflicting or redundant options")));
  			force_notnull = (List *) defel->arg;
  		}
+ 		else if (strcmp(defel->defname, "ragged") == 0)
+ 		{
+ 			if (cstate->ragged)
+ 				ereport(ERROR,
+ 						(errcode(ERRCODE_SYNTAX_ERROR),
+ 						 errmsg("conflicting or redundant options")));
+ 			cstate->ragged = intVal(defel->arg);
+ 		}
  		else
  			elog(ERROR, "option \"%s\" not recognized",
  				 defel->defname);
***************
*** 948,953 ****
--- 957,972 ----
  				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
  			  errmsg("COPY force not null only available using COPY FROM")));
  
+ 	/* Check ragged */
+ 	if (!cstate->csv_mode && cstate->ragged)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ 				 errmsg("COPY ragged available only in CSV mode")));
+ 	if (cstate->ragged && !is_from)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ 			  errmsg("COPY  ragged only available using COPY FROM")));
+ 
  	/* Don't allow the delimiter to appear in the null string. */
  	if (strchr(cstate->null_print, cstate->delim[0]) != NULL)
  		ereport(ERROR,
***************
*** 2951,2964 ****
  		int			input_len;
  
  		/* Make sure space remains in fieldvals[] */
! 		if (fieldno >= maxfields)
  			ereport(ERROR,
  					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
  					 errmsg("extra data after last expected column")));
  
  		/* Remember start of field on both input and output sides */
  		start_ptr = cur_ptr;
! 		fieldvals[fieldno] = output_ptr;
  
  		/*
  		 * Scan data for field,
--- 2970,2984 ----
  		int			input_len;
  
  		/* Make sure space remains in fieldvals[] */
! 		if (fieldno >= maxfields && ! cstate->ragged)
  			ereport(ERROR,
  					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
  					 errmsg("extra data after last expected column")));
  
  		/* Remember start of field on both input and output sides */
  		start_ptr = cur_ptr;
! 		if (fieldno < maxfields)
! 			fieldvals[fieldno] = output_ptr;
  
  		/*
  		 * Scan data for field,
***************
*** 3045,3051 ****
  		/* Check whether raw input matched null marker */
  		input_len = end_ptr - start_ptr;
  		if (!saw_quote && input_len == cstate->null_print_len &&
! 			strncmp(start_ptr, cstate->null_print, input_len) == 0)
  			fieldvals[fieldno] = NULL;
  
  		fieldno++;
--- 3065,3072 ----
  		/* Check whether raw input matched null marker */
  		input_len = end_ptr - start_ptr;
  		if (!saw_quote && input_len == cstate->null_print_len &&
! 			strncmp(start_ptr, cstate->null_print, input_len) == 0 &&
! 			fieldno < maxfields)
  			fieldvals[fieldno] = NULL;
  
  		fieldno++;
***************
*** 3059,3065 ****
  	Assert(*output_ptr == '\0');
  	cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
  
! 	return fieldno;
  }
  
  
--- 3080,3092 ----
  	Assert(*output_ptr == '\0');
  	cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
  
! 	/* for ragged input, set field null for underflowed fields */
! 	if (cstate->ragged)
! 		while (fieldno  < maxfields)
! 			fieldvals[fieldno++] = NULL;
! 
! 
! 	return cstate->ragged ? maxfields  : fieldno;
  }
  
  
Index: src/backend/parser/gram.y
===================================================================
RCS file: /cvsroot/pgsql/src/backend/parser/gram.y,v
retrieving revision 2.677
diff -c -r2.677 gram.y
*** src/backend/parser/gram.y	18 Aug 2009 23:40:20 -0000	2.677
--- src/backend/parser/gram.y	13 Sep 2009 02:57:17 -0000
***************
*** 504,510 ****
  
  	QUOTE
  
! 	RANGE READ REAL REASSIGN RECHECK RECURSIVE REFERENCES REINDEX
  	RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA RESET RESTART
  	RESTRICT RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROW ROWS RULE
  
--- 504,510 ----
  
  	QUOTE
  
! 	RAGGED RANGE READ REAL REASSIGN RECHECK RECURSIVE REFERENCES REINDEX
  	RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA RESET RESTART
  	RESTRICT RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROW ROWS RULE
  
***************
*** 2050,2055 ****
--- 2050,2059 ----
  				{
  					$$ = makeDefElem("force_notnull", (Node *)$4);
  				}
+ 			| RAGGED
+ 				{
+ 					$$ = makeDefElem("ragged",(Node *)makeInteger(TRUE));
+ 				}
  		;
  
  /* The following exist for backward compatibility */
***************
*** 10373,10378 ****
--- 10377,10383 ----
  			| PROCEDURAL
  			| PROCEDURE
  			| QUOTE
+ 			| RAGGED
  			| RANGE
  			| READ
  			| REASSIGN
Index: src/bin/psql/copy.c
===================================================================
RCS file: /cvsroot/pgsql/src/bin/psql/copy.c,v
retrieving revision 1.82
diff -c -r1.82 copy.c
*** src/bin/psql/copy.c	7 Aug 2009 20:16:11 -0000	1.82
--- src/bin/psql/copy.c	13 Sep 2009 02:57:17 -0000
***************
*** 34,40 ****
   * The documented syntax is:
   *	\copy tablename [(columnlist)] from|to filename
   *	  [ with ] [ binary ] [ oids ] [ delimiter [as] char ] [ null [as] string ]
!  *	  [ csv  [ header ] [ quote [ AS ] string ]  escape [as] string
   *		[ force not null column [, ...] | force quote column [, ...] | * ] ]
   *
   *	\copy ( select stmt ) to filename
--- 34,40 ----
   * The documented syntax is:
   *	\copy tablename [(columnlist)] from|to filename
   *	  [ with ] [ binary ] [ oids ] [ delimiter [as] char ] [ null [as] string ]
!  *	  [ csv  [ header ] [ quote [ AS ] string ]  escape [as] string [ ragged ]
   *		[ force not null column [, ...] | force quote column [, ...] | * ] ]
   *
   *	\copy ( select stmt ) to filename
***************
*** 69,74 ****
--- 69,75 ----
  	char	   *escape;
  	char	   *force_quote_list;
  	char	   *force_notnull_list;
+ 	bool        ragged;
  };
  
  
***************
*** 268,273 ****
--- 269,276 ----
  				result->csv_mode = true;
  			else if (pg_strcasecmp(token, "header") == 0)
  				result->header = true;
+ 			else if (pg_strcasecmp(token, "ragged") == 0)
+ 				result->ragged = true;
  			else if (pg_strcasecmp(token, "delimiter") == 0)
  			{
  				if (result->delim)
***************
*** 477,482 ****
--- 480,488 ----
  	if (options->header)
  		appendPQExpBuffer(&query, " HEADER");
  
+ 	if (options->ragged)
+ 		appendPQExpBuffer(&query, " RAGGED");
+ 
  	if (options->quote)
  		emit_copy_option(&query, " QUOTE AS ", options->quote);
  
Index: src/bin/psql/tab-complete.c
===================================================================
RCS file: /cvsroot/pgsql/src/bin/psql/tab-complete.c,v
retrieving revision 1.185
diff -c -r1.185 tab-complete.c
*** src/bin/psql/tab-complete.c	2 Aug 2009 22:14:52 -0000	1.185
--- src/bin/psql/tab-complete.c	13 Sep 2009 02:57:18 -0000
***************
*** 1249,1255 ****
  			  pg_strcasecmp(prev3_wd, "TO") == 0))
  	{
  		static const char *const list_CSV[] =
! 		{"HEADER", "QUOTE", "ESCAPE", "FORCE QUOTE", NULL};
  
  		COMPLETE_WITH_LIST(list_CSV);
  	}
--- 1249,1255 ----
  			  pg_strcasecmp(prev3_wd, "TO") == 0))
  	{
  		static const char *const list_CSV[] =
! 			{"HEADER", "QUOTE", "ESCAPE", "FORCE QUOTE", "RAGGED", NULL};
  
  		COMPLETE_WITH_LIST(list_CSV);
  	}
Index: src/include/parser/kwlist.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/parser/kwlist.h,v
retrieving revision 1.2
diff -c -r1.2 kwlist.h
*** src/include/parser/kwlist.h	6 Apr 2009 08:42:53 -0000	1.2
--- src/include/parser/kwlist.h	13 Sep 2009 02:57:18 -0000
***************
*** 294,299 ****
--- 294,300 ----
  PG_KEYWORD("procedural", PROCEDURAL, UNRESERVED_KEYWORD)
  PG_KEYWORD("procedure", PROCEDURE, UNRESERVED_KEYWORD)
  PG_KEYWORD("quote", QUOTE, UNRESERVED_KEYWORD)
+ PG_KEYWORD("ragged", RAGGED, UNRESERVED_KEYWORD)
  PG_KEYWORD("range", RANGE, UNRESERVED_KEYWORD)
  PG_KEYWORD("read", READ, UNRESERVED_KEYWORD)
  PG_KEYWORD("real", REAL, COL_NAME_KEYWORD)
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to