[PATCH v2] Allow COPY "text" to output a header and add header matching mode to COPY FROM

Rémi Lapeyre Mon, 09 Mar 2020 10:14:53 -0700

Here's a rebased version that should apply cleanly on HEAD.

---
 contrib/file_fdw/input/file_fdw.source  |  7 +-
 contrib/file_fdw/output/file_fdw.source | 13 ++--
 doc/src/sgml/ref/copy.sgml              |  9 ++-
 src/backend/commands/copy.c             | 93 ++++++++++++++++++++++---
 src/test/regress/input/copy.source      | 71 ++++++++++++++-----
 src/test/regress/output/copy.source     | 58 ++++++++++-----
 6 files changed, 202 insertions(+), 49 deletions(-)

diff --git a/contrib/file_fdw/input/file_fdw.source b/contrib/file_fdw/input/file_fdw.source
index 45b728eeb3..7a3983c785 100644
--- a/contrib/file_fdw/input/file_fdw.source
+++ b/contrib/file_fdw/input/file_fdw.source
@@ -37,7 +37,6 @@ CREATE USER MAPPING FOR regress_no_priv_user SERVER file_server;

 -- validator tests
 CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'xml');  -- ERROR
-CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'text', header 'true');      -- ERROR
 CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'text', quote ':');          -- ERROR
 CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'text', escape ':');         -- ERROR
 CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'binary', header 'true');    -- ERROR
@@ -80,6 +79,12 @@ CREATE FOREIGN TABLE agg_bad (
 OPTIONS (format 'csv', filename '@abs_srcdir@/data/agg.bad', header 'true', delimiter ';', quote '@', escape '"', null '');
 ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);

+-- test header matching
+CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename '@abs_srcdir@/data/list1.csv', delimiter ',', header 'match');
+CREATE FOREIGN TABLE header_dont_match (a int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename '@abs_srcdir@/data/list1.csv', delimiter ',', header 'match');	-- ERROR
+
 -- per-column options tests
 CREATE FOREIGN TABLE text_csv (
     word1 text OPTIONS (force_not_null 'true'),
diff --git a/contrib/file_fdw/output/file_fdw.source b/contrib/file_fdw/output/file_fdw.source
index 52b4d5f1df..d76a3dc6f8 100644
--- a/contrib/file_fdw/output/file_fdw.source
+++ b/contrib/file_fdw/output/file_fdw.source
@@ -33,14 +33,12 @@ CREATE USER MAPPING FOR regress_no_priv_user SERVER file_server;
 -- validator tests
 CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'xml');  -- ERROR
 ERROR:  COPY format "xml" not recognized
-CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'text', header 'true');      -- ERROR
-ERROR:  COPY HEADER available only in CSV mode
 CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'text', quote ':');          -- ERROR
 ERROR:  COPY quote available only in CSV mode
 CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'text', escape ':');         -- ERROR
 ERROR:  COPY escape available only in CSV mode
 CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'binary', header 'true');    -- ERROR
-ERROR:  COPY HEADER available only in CSV mode
+ERROR:  COPY HEADER available only in CSV and text mode
 CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'binary', quote ':');        -- ERROR
 ERROR:  COPY quote available only in CSV mode
 CREATE FOREIGN TABLE tbl () SERVER file_server OPTIONS (format 'binary', escape ':');       -- ERROR
@@ -95,6 +93,11 @@ CREATE FOREIGN TABLE agg_bad (
 ) SERVER file_server
 OPTIONS (format 'csv', filename '@abs_srcdir@/data/agg.bad', header 'true', delimiter ';', quote '@', escape '"', null '');
 ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
+-- test header matching
+CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename '/Users/remi/src/postgresql/contrib/file_fdw/data/list1.csv', delimiter ',', header 'match');
+CREATE FOREIGN TABLE header_dont_match (a int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename '/Users/remi/src/postgresql/contrib/file_fdw/data/list1.csv', delimiter ',', header 'match');	-- ERROR
 -- per-column options tests
 CREATE FOREIGN TABLE text_csv (
     word1 text OPTIONS (force_not_null 'true'),
@@ -441,12 +444,14 @@ SET ROLE regress_file_fdw_superuser;
 -- cleanup
 RESET ROLE;
 DROP EXTENSION file_fdw CASCADE;
-NOTICE:  drop cascades to 7 other objects
+NOTICE:  drop cascades to 9 other objects
 DETAIL:  drop cascades to server file_server
 drop cascades to user mapping for regress_file_fdw_superuser on server file_server
 drop cascades to user mapping for regress_no_priv_user on server file_server
 drop cascades to foreign table agg_text
 drop cascades to foreign table agg_csv
 drop cascades to foreign table agg_bad
+drop cascades to foreign table header_match
+drop cascades to foreign table header_dont_match
 drop cascades to foreign table text_csv
 DROP ROLE regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml
index a99f8155e4..36bdd87726 100644
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -37,7 +37,7 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
     FREEZE [ <replaceable class="parameter">boolean</replaceable> ]
     DELIMITER '<replaceable class="parameter">delimiter_character</replaceable>'
     NULL '<replaceable class="parameter">null_string</replaceable>'
-    HEADER [ <replaceable class="parameter">boolean</replaceable> ]
+    HEADER { <literal>match</literal> | <literal>true</literal> | <literal>false</literal> }
     QUOTE '<replaceable class="parameter">quote_character</replaceable>'
     ESCAPE '<replaceable class="parameter">escape_character</replaceable>'
     FORCE_QUOTE { ( <replaceable class="parameter">column_name</replaceable> [, ...] ) | * }
@@ -269,8 +269,11 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
      <para>
       Specifies that the file contains a header line with the names of each
       column in the file.  On output, the first line contains the column
-      names from the table, and on input, the first line is ignored.
-      This option is allowed only when using <literal>CSV</literal> format.
+      names from the table, and on input, the first line is required to match
+      the column names if set to <literal>match</literal> or discarded when set
+      to <literal>true</literal>.
+      This option is allowed only when using <literal>CSV</literal> or
+      <literal>text</literal> format.
      </para>
     </listitem>
    </varlistentry>
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index e79ede4cb8..f1bbbf841b 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -94,6 +94,16 @@ typedef enum CopyInsertMethod
 	CIM_MULTI_CONDITIONAL		/* use table_multi_insert only if valid */
 } CopyInsertMethod;

+/*
+ * Represents whether the head must be absent, present or present and match.
+ */
+typedef enum CopyHeader
+{
+	COPY_HEADER_ABSENT,
+	COPY_HEADER_PRESENT,
+	COPY_HEADER_MATCH
+} CopyHeader;
+
 /*
  * This struct contains all the state variables used throughout a COPY
  * operation. For simplicity, we use the same struct for all variants of COPY,
@@ -135,7 +145,7 @@ typedef struct CopyStateData
 	bool		binary;			/* binary format? */
 	bool		freeze;			/* freeze rows on loading? */
 	bool		csv_mode;		/* Comma Separated Value format? */
-	bool		header_line;	/* CSV header line? */
+	CopyHeader  header_line;	/* CSV or text header line? */
 	char	   *null_print;		/* NULL marker string (server encoding!) */
 	int			null_print_len; /* length of same */
 	char	   *null_print_client;	/* same converted to file encoding */
@@ -1184,7 +1194,28 @@ ProcessCopyOptions(ParseState *pstate,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("conflicting or redundant options"),
 						 parser_errposition(pstate, defel->location)));
-			cstate->header_line = defGetBoolean(defel);
+
+			PG_TRY();
+			{
+				if (defGetBoolean(defel))
+					cstate->header_line = COPY_HEADER_PRESENT;
+				else
+					cstate->header_line = COPY_HEADER_ABSENT;
+			}
+			PG_CATCH();
+			{
+				if (!cstate->is_copy_from)
+					PG_RE_THROW();
+
+				char	   *sval = defGetString(defel);
+				if (pg_strcasecmp(sval, "match") == 0)
+					cstate->header_line = COPY_HEADER_MATCH;
+				else
+					ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("header requires a boolean or \"match\"")));
+			}
+			PG_END_TRY();
 		}
 		else if (strcmp(defel->defname, "quote") == 0)
 		{
@@ -1365,10 +1396,10 @@ ProcessCopyOptions(ParseState *pstate,
 				 errmsg("COPY delimiter cannot be \"%s\"", cstate->delim)));

 	/* Check header */
-	if (!cstate->csv_mode && cstate->header_line)
+	if (cstate->binary && cstate->header_line)
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("COPY HEADER available only in CSV mode")));
+				 errmsg("COPY HEADER available only in CSV and text mode")));

 	/* Check quote */
 	if (!cstate->csv_mode && cstate->quote != NULL)
@@ -2100,8 +2131,11 @@ CopyTo(CopyState cstate)

 				colname = NameStr(TupleDescAttr(tupDesc, attnum - 1)->attname);

-				CopyAttributeOutCSV(cstate, colname, false,
-									list_length(cstate->attnumlist) == 1);
+				if (cstate->csv_mode)
+					CopyAttributeOutCSV(cstate, colname, false,
+										list_length(cstate->attnumlist) == 1);
+				else
+					CopyAttributeOutText(cstate, colname);
 			}

 			CopySendEndOfRow(cstate);
@@ -3639,12 +3673,53 @@ NextCopyFromRawFields(CopyState cstate, char ***fields, int *nfields)
 	/* only available for text or csv input */
 	Assert(!cstate->binary);

-	/* on input just throw the header line away */
+	/* on input check that the header line is correct if needed */
 	if (cstate->cur_lineno == 0 && cstate->header_line)
 	{
+		ListCell   *cur;
+		TupleDesc   tupDesc;
+
+		tupDesc = RelationGetDescr(cstate->rel);
+
 		cstate->cur_lineno++;
-		if (CopyReadLine(cstate))
-			return false;		/* done */
+		done = CopyReadLine(cstate);
+
+		if (cstate->header_line == COPY_HEADER_MATCH)
+		{
+			if (cstate->csv_mode)
+				fldct = CopyReadAttributesCSV(cstate);
+			else
+				fldct = CopyReadAttributesText(cstate);
+
+			if (fldct < list_length(cstate->attnumlist))
+				ereport(ERROR,
+						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+						errmsg("missing header")));
+			else if (fldct > list_length(cstate->attnumlist))
+				ereport(ERROR,
+					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+					errmsg("extra data after last expected header")));
+
+			foreach(cur, cstate->attnumlist)
+			{
+				int				attnum = lfirst_int(cur);
+				char		*colName = cstate->raw_fields[attnum - 1];
+				Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
+
+				if (colName == NULL)
+					colName = cstate->null_print;
+
+				if (namestrcmp(&attr->attname, colName) != 0) {
+					ereport(ERROR,
+						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+						errmsg("wrong header for column \"%s\": got \"%s\"",
+								NameStr(attr->attname), colName)));
+				}
+			}
+		}
+
+		if (done)
+			return false;
 	}

 	cstate->cur_lineno++;
diff --git a/src/test/regress/input/copy.source b/src/test/regress/input/copy.source
index a1d529ad36..dc7341529f 100644
--- a/src/test/regress/input/copy.source
+++ b/src/test/regress/input/copy.source
@@ -87,52 +87,66 @@ ANALYZE bt_f8_heap;
 ANALYZE array_op_test;
 ANALYZE array_index_op_test;

+-- test header line feature
+
+create temp table copytest (
+	c1 int,
+	"col with tabulation: 	" text);
+
+copy copytest from stdin (header);
+this is just a line full of junk that would error out if parsed
+1	a
+2	b
+\.
+
+copy copytest to stdout (header);
+
 --- test copying in CSV mode with various styles
 --- of embedded line ending characters

-create temp table copytest (
+create temp table copytest2 (
 	style	text,
 	test 	text,
 	filler	int);

-insert into copytest values('DOS',E'abc\r\ndef',1);
-insert into copytest values('Unix',E'abc\ndef',2);
-insert into copytest values('Mac',E'abc\rdef',3);
-insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4);
+insert into copytest2 values('DOS',E'abc\r\ndef',1);
+insert into copytest2 values('Unix',E'abc\ndef',2);
+insert into copytest2 values('Mac',E'abc\rdef',3);
+insert into copytest2 values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4);

-copy copytest to '@abs_builddir@/results/copytest.csv' csv;
+copy copytest2 to '@abs_builddir@/results/copytest.csv' csv;

-create temp table copytest2 (like copytest);
+create temp table copytest3 (like copytest2);

-copy copytest2 from '@abs_builddir@/results/copytest.csv' csv;
+copy copytest3 from '@abs_builddir@/results/copytest.csv' csv;

-select * from copytest except select * from copytest2;
+select * from copytest2 except select * from copytest3;

-truncate copytest2;
+truncate copytest3;

 --- same test but with an escape char different from quote char

-copy copytest to '@abs_builddir@/results/copytest.csv' csv quote '''' escape E'\\';
+copy copytest2 to '@abs_builddir@/results/copytest.csv' csv quote '''' escape E'\\';

-copy copytest2 from '@abs_builddir@/results/copytest.csv' csv quote '''' escape E'\\';
+copy copytest3 from '@abs_builddir@/results/copytest.csv' csv quote '''' escape E'\\';

-select * from copytest except select * from copytest2;
+select * from copytest2 except select * from copytest3;


 -- test header line feature

-create temp table copytest3 (
+create temp table copytest4 (
 	c1 int,
 	"col with , comma" text,
 	"col with "" quote"  int);

-copy copytest3 from stdin csv header;
+copy copytest4 from stdin csv header;
 this is just a line full of junk that would error out if parsed
 1,a,1
 2,b,2
 \.

-copy copytest3 to stdout csv header;
+copy copytest4 to stdout csv header;

 -- test copy from with a partitioned table
 create table parted_copytest (
@@ -201,3 +215,28 @@ select * from parted_copytest where b = 1;
 select * from parted_copytest where b = 2;

 drop table parted_copytest;
+
+-- Test header matching feature
+create table header_copytest (
+	a int,
+	b int,
+	c text
+);
+copy header_copytest from stdin with (header wrong_choice);
+copy header_copytest from stdin with (header match);
+a	b	c
+1	2	foo
+\.
+copy header_copytest from stdin with (header match);
+a	b
+1	2
+\.
+copy header_copytest from stdin with (header match);
+a	b	c	d
+1	2	foo	bar
+\.
+copy header_copytest from stdin with (header match, format csv);
+a,b,c
+1,2,foo
+\.
+drop table header_copytest;
diff --git a/src/test/regress/output/copy.source b/src/test/regress/output/copy.source
index 938d3551da..c50a2f092c 100644
--- a/src/test/regress/output/copy.source
+++ b/src/test/regress/output/copy.source
@@ -58,40 +58,49 @@ ANALYZE bt_txt_heap;
 ANALYZE bt_f8_heap;
 ANALYZE array_op_test;
 ANALYZE array_index_op_test;
+-- test header line feature
+create temp table copytest (
+	c1 int,
+	"col with tabulation: 	" text);
+copy copytest from stdin (header);
+copy copytest to stdout (header);
+c1	col with tabulation: \t
+1	a
+2	b
 --- test copying in CSV mode with various styles
 --- of embedded line ending characters
-create temp table copytest (
+create temp table copytest2 (
 	style	text,
 	test 	text,
 	filler	int);
-insert into copytest values('DOS',E'abc\r\ndef',1);
-insert into copytest values('Unix',E'abc\ndef',2);
-insert into copytest values('Mac',E'abc\rdef',3);
-insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4);
-copy copytest to '@abs_builddir@/results/copytest.csv' csv;
-create temp table copytest2 (like copytest);
-copy copytest2 from '@abs_builddir@/results/copytest.csv' csv;
-select * from copytest except select * from copytest2;
+insert into copytest2 values('DOS',E'abc\r\ndef',1);
+insert into copytest2 values('Unix',E'abc\ndef',2);
+insert into copytest2 values('Mac',E'abc\rdef',3);
+insert into copytest2 values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4);
+copy copytest2 to '@abs_builddir@/results/copytest.csv' csv;
+create temp table copytest3 (like copytest2);
+copy copytest3 from '@abs_builddir@/results/copytest.csv' csv;
+select * from copytest2 except select * from copytest3;
  style | test | filler
 -------+------+--------
 (0 rows)

-truncate copytest2;
+truncate copytest3;
 --- same test but with an escape char different from quote char
-copy copytest to '@abs_builddir@/results/copytest.csv' csv quote '''' escape E'\\';
-copy copytest2 from '@abs_builddir@/results/copytest.csv' csv quote '''' escape E'\\';
-select * from copytest except select * from copytest2;
+copy copytest2 to '@abs_builddir@/results/copytest.csv' csv quote '''' escape E'\\';
+copy copytest3 from '@abs_builddir@/results/copytest.csv' csv quote '''' escape E'\\';
+select * from copytest2 except select * from copytest3;
  style | test | filler
 -------+------+--------
 (0 rows)

 -- test header line feature
-create temp table copytest3 (
+create temp table copytest4 (
 	c1 int,
 	"col with , comma" text,
 	"col with "" quote"  int);
-copy copytest3 from stdin csv header;
-copy copytest3 to stdout csv header;
+copy copytest4 from stdin csv header;
+copy copytest4 to stdout csv header;
 c1,"col with , comma","col with "" quote"
 1,a,1
 2,b,2
@@ -165,3 +174,20 @@ select * from parted_copytest where b = 2;
 (1 row)

 drop table parted_copytest;
+-- Test header matching feature
+create table header_copytest (
+	a int,
+	b int,
+	c text
+);
+copy header_copytest from stdin with (header wrong_choice);
+ERROR:  header requires a boolean or "match"
+copy header_copytest from stdin with (header match);
+copy header_copytest from stdin with (header match);
+ERROR:  missing header
+CONTEXT:  COPY header_copytest, line 1: "a	b"
+copy header_copytest from stdin with (header match);
+ERROR:  extra data after last expected header
+CONTEXT:  COPY header_copytest, line 1: "a	b	c	d"
+copy header_copytest from stdin with (header match, format csv);
+drop table header_copytest;

[PATCH v2] Allow COPY "text" to output a header and add header matching mode to COPY FROM

Reply via email to