From 1fbfd87a4070b91a4ac3630b36a1ff1dadecaa5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Lapeyre?= <remi.lapeyre@lenstra.fr>
Date: Tue, 13 Oct 2020 14:45:56 +0200
Subject: [PATCH v12] Add header matching mode to "COPY FROM"
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.35.0"

This is a multi-part message in MIME format.
--------------2.35.0
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit


COPY FROM supports the HEADER option to silently discard the header from
a CSV or text file. It is possible to load by mistake a file that
matches the expected format, for example if two text columns have been
swapped, resulting in garbage in the database.

This patch adds a "match" value for the HEADER option that checks the header
line against the expected column names and aborts immediately if they differ.

Discussion: https://www.postgresql.org/message-id/flat/CAF1-J-0PtCWMeLtswwGV2M70U26n4g33gpe1rcKQqe6wVQDrFA@mail.gmail.com
---
 contrib/file_fdw/expected/file_fdw.out | 10 +++-
 contrib/file_fdw/sql/file_fdw.sql      |  7 +++
 doc/src/sgml/ref/copy.sgml             | 11 +++--
 src/backend/commands/copy.c            | 65 ++++++++++++++++++++++++--
 src/backend/commands/copyfromparse.c   | 52 +++++++++++++++++++--
 src/backend/commands/copyto.c          |  2 +-
 src/include/commands/copy.h            | 12 ++++-
 src/test/regress/expected/copy.out     | 20 ++++++++
 src/test/regress/sql/copy.sql          | 29 ++++++++++++
 9 files changed, 195 insertions(+), 13 deletions(-)


--------------2.35.0
Content-Type: text/x-patch; name="v12-0001-Add-header-matching-mode-to-COPY-FROM.patch"
Content-Transfer-Encoding: 8bit
Content-Disposition: attachment; filename="v12-0001-Add-header-matching-mode-to-COPY-FROM.patch"

diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out
index 0ac6e4e0d7..b1617640d8 100644
--- a/contrib/file_fdw/expected/file_fdw.out
+++ b/contrib/file_fdw/expected/file_fdw.out
@@ -113,6 +113,12 @@ CREATE FOREIGN TABLE agg_bad (
 ) SERVER file_server
 OPTIONS (format 'csv', filename :'filename', header 'true', delimiter ';', quote '@', escape '"', null '');
 ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
+-- test header matching
+\set filename :abs_srcdir '/data/list1.csv'
+CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
+CREATE FOREIGN TABLE header_doesnt_match (a int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');	-- ERROR
 -- per-column options tests
 \set filename :abs_srcdir '/data/text.csv'
 CREATE FOREIGN TABLE text_csv (
@@ -464,12 +470,14 @@ SET ROLE regress_file_fdw_superuser;
 -- cleanup
 RESET ROLE;
 DROP EXTENSION file_fdw CASCADE;
-NOTICE:  drop cascades to 7 other objects
+NOTICE:  drop cascades to 9 other objects
 DETAIL:  drop cascades to server file_server
 drop cascades to user mapping for regress_file_fdw_superuser on server file_server
 drop cascades to user mapping for regress_no_priv_user on server file_server
 drop cascades to foreign table agg_text
 drop cascades to foreign table agg_csv
 drop cascades to foreign table agg_bad
+drop cascades to foreign table header_match
+drop cascades to foreign table header_doesnt_match
 drop cascades to foreign table text_csv
 DROP ROLE regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql
index 86f876d565..9f3c7219d0 100644
--- a/contrib/file_fdw/sql/file_fdw.sql
+++ b/contrib/file_fdw/sql/file_fdw.sql
@@ -103,6 +103,13 @@ CREATE FOREIGN TABLE agg_bad (
 OPTIONS (format 'csv', filename :'filename', header 'true', delimiter ';', quote '@', escape '"', null '');
 ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
 
+-- test header matching
+\set filename :abs_srcdir '/data/list1.csv'
+CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
+CREATE FOREIGN TABLE header_doesnt_match (a int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');	-- ERROR
+
 -- per-column options tests
 \set filename :abs_srcdir '/data/text.csv'
 CREATE FOREIGN TABLE text_csv (
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml
index 1b7d001963..f36eca02fb 100644
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -36,7 +36,7 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
     FREEZE [ <replaceable class="parameter">boolean</replaceable> ]
     DELIMITER '<replaceable class="parameter">delimiter_character</replaceable>'
     NULL '<replaceable class="parameter">null_string</replaceable>'
-    HEADER [ <replaceable class="parameter">boolean</replaceable> ]
+    HEADER [ <literal>match</literal> | <literal>true</literal> | <literal>false</literal> ]
     QUOTE '<replaceable class="parameter">quote_character</replaceable>'
     ESCAPE '<replaceable class="parameter">escape_character</replaceable>'
     FORCE_QUOTE { ( <replaceable class="parameter">column_name</replaceable> [, ...] ) | * }
@@ -276,8 +276,13 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
      <para>
       Specifies that the file contains a header line with the names of each
       column in the file.  On output, the first line contains the column
-      names from the table, and on input, the first line is ignored.
-      This option is not allowed when using <literal>binary</literal> format.
+      names from the table.  On input, the first line is discarded when this
+      option is set to <literal>true</literal> (or an equivalent Boolean
+      value).  If it is set to <literal>match</literal>, the header line must
+      match the column names exactly: if the number of columns is wrong,
+      their order differs from the one expected, or a name differs (including
+      by case), the copy is aborted with an error.  This option is allowed
+      only when using <literal>CSV</literal> or <literal>text</literal> format.
      </para>
     </listitem>
    </varlistentry>
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 7da7105d44..830bd9f762 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -314,7 +314,66 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt,
 }
 
 /*
- * Process the statement option list for COPY.
+ * Extract a CopyHeader value from the DefElem of a HEADER option.
+ *
+ * Accepts 0, 1, "true", "false", "on", "off" or "match" (strings are compared
+ * case-insensitively); a missing argument means "true".  Any other value
+ * raises a syntax error that names the option.
+ */
+static CopyHeader
+DefGetCopyHeader(DefElem *def)
+{
+	/* If no parameter given, assume "true" is meant. */
+	if (def->arg == NULL)
+		return COPY_HEADER_PRESENT;
+
+	/*
+	 * Allow 0, 1, "true", "false", "on", "off" or "match".
+	 */
+	switch (nodeTag(def->arg))
+	{
+		case T_Integer:
+			switch (intVal(def->arg))
+			{
+				case 0:
+					return COPY_HEADER_ABSENT;
+				case 1:
+					return COPY_HEADER_PRESENT;
+				default:
+					/* otherwise, error out below */
+					break;
+			}
+			break;
+		default:
+			{
+				char	   *sval = defGetString(def);
+
+				/*
+				 * The set of strings accepted here should match up with the
+				 * grammar's opt_boolean_or_string production.
+				 */
+				if (pg_strcasecmp(sval, "true") == 0 ||
+					pg_strcasecmp(sval, "on") == 0)
+					return COPY_HEADER_PRESENT;
+				if (pg_strcasecmp(sval, "false") == 0 ||
+					pg_strcasecmp(sval, "off") == 0)
+					return COPY_HEADER_ABSENT;
+				if (pg_strcasecmp(sval, "match") == 0)
+					return COPY_HEADER_MATCH;
+			}
+			break;
+	}
+
+	/* Unrecognized value: report it against the option's own name. */
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("%s requires a boolean or \"match\"",
+					def->defname)));
+	return COPY_HEADER_ABSENT;	/* keep compiler quiet */
+}
+
+/*
+ * Process the statement option list for COPY.
  *
  * Scan the options list (a list of DefElem) and transpose the information
  * into *opts_out, applying appropriate error checking.
@@ -394,7 +453,7 @@ ProcessCopyOptions(ParseState *pstate,
 			if (header_specified)
 				errorConflictingDefElem(defel, pstate);
 			header_specified = true;
-			opts_out->header_line = defGetBoolean(defel);
+			opts_out->header_line = DefGetCopyHeader(defel);
 		}
 		else if (strcmp(defel->defname, "quote") == 0)
 		{
@@ -555,7 +614,7 @@ ProcessCopyOptions(ParseState *pstate,
 				 errmsg("COPY delimiter cannot be \"%s\"", opts_out->delim)));
 
 	/* Check header */
-	if (opts_out->binary && opts_out->header_line)
+	if (opts_out->binary && opts_out->header_line != COPY_HEADER_ABSENT)
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("cannot specify HEADER in BINARY mode")));
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index baf328b620..5ab59c0631 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -72,6 +72,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "port/pg_bswap.h"
+#include "utils/builtins.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -758,12 +759,55 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
 	/* only available for text or csv input */
 	Assert(!cstate->opts.binary);
 
-	/* on input just throw the header line away */
-	if (cstate->cur_lineno == 0 && cstate->opts.header_line)
+	/* on input, skip the header line, checking it first if so requested */
+	if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_ABSENT)
 	{
+		ListCell   *cur;
+		TupleDesc	tupDesc = RelationGetDescr(cstate->rel);
+		int			fldnum = 0;	/* position of the current header field */
+
 		cstate->cur_lineno++;
-		if (CopyReadLine(cstate))
-			return false;		/* done */
+		done = CopyReadLine(cstate);
+
+		if (cstate->opts.header_line == COPY_HEADER_MATCH)
+		{
+			if (cstate->opts.csv_mode)
+				fldct = CopyReadAttributesCSV(cstate);
+			else
+				fldct = CopyReadAttributesText(cstate);
+
+			if (fldct < list_length(cstate->attnumlist))
+				ereport(ERROR,
+						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+						 errmsg("incomplete header, expected %d columns but got %d",
+								list_length(cstate->attnumlist), fldct)));
+			else if (fldct > list_length(cstate->attnumlist))
+				ereport(ERROR,
+						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+						 errmsg("extra data after last expected header, expected %d columns but got %d",
+								list_length(cstate->attnumlist), fldct)));
+
+			/* fields are in attnumlist order: index raw_fields by position */
+			foreach(cur, cstate->attnumlist)
+			{
+				int			attnum = lfirst_int(cur);
+				char	   *colName = cstate->raw_fields[fldnum++];
+				Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
+
+				/* a field read as NULL is compared as the NULL marker text */
+				if (colName == NULL)
+					colName = cstate->opts.null_print;
+
+				if (namestrcmp(&attr->attname, colName) != 0)
+					ereport(ERROR,
+							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+							 errmsg("wrong header for column \"%s\": got \"%s\"",
+									NameStr(attr->attname), colName)));
+			}
+		}
+
+		if (done)
+			return false;
 	}
 
 	cstate->cur_lineno++;
diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c
index 3283ef50d0..6c611c0277 100644
--- a/src/backend/commands/copyto.c
+++ b/src/backend/commands/copyto.c
@@ -846,7 +846,7 @@ DoCopyTo(CopyToState cstate)
 															  cstate->file_encoding);
 
 		/* if a header has been requested send the line */
-		if (cstate->opts.header_line)
+		if (cstate->opts.header_line != COPY_HEADER_ABSENT)
 		{
 			bool		hdr_delim = false;
 
diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h
index 8694da5004..6fb2cade6b 100644
--- a/src/include/commands/copy.h
+++ b/src/include/commands/copy.h
@@ -19,6 +19,16 @@
 #include "parser/parse_node.h"
 #include "tcop/dest.h"
 
+/*
+ * Represents whether the header must match, be absent or be present.
+ */
+typedef enum CopyHeader
+{
+	COPY_HEADER_ABSENT,			/* no header line */
+	COPY_HEADER_PRESENT,		/* header line present; skipped on input */
+	COPY_HEADER_MATCH			/* header must match column names on input */
+} CopyHeader;
+
 /*
  * A struct to hold COPY options, in a parsed form. All of these are related
  * to formatting, except for 'freeze', which doesn't really belong here, but
@@ -32,7 +42,7 @@ typedef struct CopyFormatOptions
 	bool		binary;			/* binary format? */
 	bool		freeze;			/* freeze rows on loading? */
 	bool		csv_mode;		/* Comma Separated Value format? */
-	bool		header_line;	/* header line? */
+	CopyHeader	header_line;	/* CSV or text header line? */
 	char	   *null_print;		/* NULL marker string (server encoding!) */
 	int			null_print_len; /* length of same */
 	char	   *null_print_client;	/* same converted to file encoding */
diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out
index 851b9a4a2d..77b9a70b74 100644
--- a/src/test/regress/expected/copy.out
+++ b/src/test/regress/expected/copy.out
@@ -254,3 +254,23 @@ INFO:  progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progre
 drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
 drop function notice_after_tab_progress_reporting();
 drop table tab_progress_reporting;
+-- Test header matching feature
+create table header_copytest (
+	a int,
+	b int,
+	c text
+);
+copy header_copytest from stdin with (header wrong_choice);
+ERROR:  header requires a boolean or "match"
+copy header_copytest from stdin with (header match);
+copy header_copytest from stdin with (header match);
+ERROR:  incomplete header, expected 3 columns but got 2
+CONTEXT:  COPY header_copytest, line 1: "a	b"
+copy header_copytest from stdin with (header match);
+ERROR:  extra data after last expected header, expected 3 columns but got 4
+CONTEXT:  COPY header_copytest, line 1: "a	b	c	d"
+copy header_copytest from stdin with (header match);
+ERROR:  wrong header for column "c": got "d"
+CONTEXT:  COPY header_copytest, line 1: "a	b	d"
+copy header_copytest from stdin with (header match, format csv);
+drop table header_copytest;
diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql
index 016fedf675..5e192428a3 100644
--- a/src/test/regress/sql/copy.sql
+++ b/src/test/regress/sql/copy.sql
@@ -303,3 +303,32 @@ copy tab_progress_reporting from :'filename'
 drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
 drop function notice_after_tab_progress_reporting();
 drop table tab_progress_reporting;
+
+-- Test header matching feature
+create table header_copytest (
+	a int,
+	b int,
+	c text
+);
+copy header_copytest from stdin with (header wrong_choice);
+copy header_copytest from stdin with (header match);
+a	b	c
+1	2	foo
+\.
+copy header_copytest from stdin with (header match);
+a	b
+1	2
+\.
+copy header_copytest from stdin with (header match);
+a	b	c	d
+1	2	foo	bar
+\.
+copy header_copytest from stdin with (header match);
+a	b	d
+1	2	foo
+\.
+copy header_copytest from stdin with (header match, format csv);
+a,b,c
+1,2,foo
+\.
+drop table header_copytest;

--------------2.35.0--


