Andrew Dunstan wrote:
In fact, in the patch I sent in, no quoted string is marked as null when being read (so even if you use \N as the null marker, "\N" will be that literal and not a null marker). And the null marker, whatever it is, should be made quote safe by us throwing an error if it contains the quote marker, just as we now make sure that the null marker is delimiter-safe.
What value does an int column get if the input file has ',,'. Don't
tell me zero? :-) Error?
If the null marker is not an empty string, it gets an error, of course - if it is it gets a null:
[EMAIL PROTECTED] pginst]$ echo ',,' | bin/psql -c "create temp table foo (a int, b text, c text); copy foo from stdin delimiter ',\"' null '\\\\N';"
ERROR: invalid input syntax for integer: ""
CONTEXT: COPY foo, line 1, column a: ""
[EMAIL PROTECTED] pginst]$ echo ',,' | bin/psql -c "create temp table foo (a int, b text, c text); copy foo from stdin delimiter ',\"' ;"
[EMAIL PROTECTED] pginst]$
I hope that is expected behaviour - it's what *I* expect, at least.
I will check on the write behaviour - it might need ammending too.
I'll submit a revised patch based on the original syntax scheme, and then you (Bruce) can make the syntax/psql changes that seem to be agreed on now - is that ok?
OK, go as far as you want and post it. I will turn around a new patch in a few hours after you post.
The default NULL value issue can be determined at the end of any exhaustive debate we have - in the end it's a one line code change ;-)
Agreed.
Attached patch has these additions to previously posted patch:
. quote character may not appear in NULL marker
. any non-null value that matches the NULL marker is forced to be quoted when written.
cheers
andrew
Index: src/backend/commands/copy.c =================================================================== RCS file: /projects/cvsroot/pgsql-server/src/backend/commands/copy.c,v retrieving revision 1.219 diff -c -r1.219 copy.c *** src/backend/commands/copy.c 6 Apr 2004 13:21:33 -0000 1.219 --- src/backend/commands/copy.c 12 Apr 2004 16:21:33 -0000 *************** *** 70,76 **** typedef enum CopyReadResult { NORMAL_ATTR, ! END_OF_LINE } CopyReadResult; /* --- 70,77 ---- typedef enum CopyReadResult { NORMAL_ATTR, ! END_OF_LINE, ! UNTERMINATED_FIELD } CopyReadResult; /* *************** *** 136,144 **** --- 137,148 ---- static bool CopyReadLine(void); static char *CopyReadAttribute(const char *delim, const char *null_print, CopyReadResult *result, bool *isnull); + static char *CopyReadAttributeCSV(const char *delim, const char *null_print, + CopyReadResult *result, bool *isnull); static Datum CopyReadBinaryAttribute(int column_no, FmgrInfo *flinfo, Oid typelem, bool *isnull); static void CopyAttributeOut(char *string, char *delim); + static void CopyAttributeOutCSV(char *string, char *delim, bool force_quote); static List *CopyGetAttnums(Relation rel, List *attnamelist); static void limit_printout_length(StringInfo buf); *************** *** 682,687 **** --- 686,692 ---- List *attnumlist; bool binary = false; bool oids = false; + bool csv_mode = false; char *delim = NULL; char *null_print = NULL; Relation rel; *************** *** 744,751 **** if (!delim) delim = "\t"; if (!null_print) ! null_print = "\\N"; /* * Open and lock the relation, using the appropriate lock type. --- 749,759 ---- if (!delim) delim = "\t"; + if (strlen(delim) > 1) + csv_mode = true; + if (!null_print) ! null_print = csv_mode ? "" : "\\N"; /* * Open and lock the relation, using the appropriate lock type. *************** *** 772,783 **** "psql's \\copy command also works for anyone."))); /* ! * Presently, only single-character delimiter strings are supported. */ ! if (strlen(delim) != 1) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("COPY delimiter must be a single character"))); /* * Don't allow the delimiter to appear in the null string. --- 780,806 ---- "psql's \\copy command also works for anyone."))); /* ! * Only single-character delimiter strings are supported, ! * except in CSV mode, where the string must be ! * delimiter-char quote-char [escape-char] */ ! if (!csv_mode && strlen(delim) != 1) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("COPY delimiter must be a single character"))); + else if (csv_mode) + { + if(strlen(delim) > 3) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("COPY delimiters for CSV must be a 2 or 3 characters"))); + if (delim[0] == delim[1] || + (strlen(delim) == 3 && delim[0] == delim[2])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("CSV delimiter character must not be same as quote character or escape character"))); + + } /* * Don't allow the delimiter to appear in the null string. *************** *** 788,793 **** --- 811,833 ---- errmsg("COPY delimiter must not appear in the NULL specification"))); /* + * Don't allow the csv quote char to appear in the null string. + */ + if (csv_mode && strchr(null_print, delim[1]) != NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("CSV quote character must not appear in the NULL specification"))); + + /* + * Don't allow OIDs in CSV mode + */ + + if (csv_mode && oids) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Cannot specify OIDS in CSV mode "))); + + /* * Don't allow COPY w/ OIDs to or from a table without them */ if (oids && !rel->rd_rel->relhasoids) *************** *** 969,974 **** --- 1009,1015 ---- FmgrInfo *out_functions; Oid *elements; bool *isvarlena; + bool csv_mode; char *string; Snapshot mySnapshot; List *cur; *************** *** 979,984 **** --- 1020,1026 ---- attr = tupDesc->attrs; num_phys_attrs = tupDesc->natts; attr_count = length(attnumlist); + csv_mode = (strlen(delim) > 1); /* * Get info about the columns we need to process. *************** *** 1051,1057 **** while ((tuple = heap_getnext(scandesc, ForwardScanDirection)) != NULL) { bool need_delim = false; - CHECK_FOR_INTERRUPTS(); MemoryContextReset(mycontext); --- 1093,1098 ---- *************** *** 1113,1119 **** value, ObjectIdGetDatum(elements[attnum - 1]), Int32GetDatum(attr[attnum - 1]->atttypmod))); ! CopyAttributeOut(string, delim); } else { --- 1154,1167 ---- value, ObjectIdGetDatum(elements[attnum - 1]), Int32GetDatum(attr[attnum - 1]->atttypmod))); ! if (csv_mode) ! { ! bool force_quote = (strcmp(string,null_print) == 0); ! CopyAttributeOutCSV(string, delim, force_quote); ! } ! else ! CopyAttributeOut(string, delim); ! } else { *************** *** 1263,1268 **** --- 1311,1317 ---- Datum *values; char *nulls; bool done = false; + bool csv_mode; bool isnull; ResultRelInfo *resultRelInfo; EState *estate = CreateExecutorState(); /* for ExecConstraints() */ *************** *** 1280,1285 **** --- 1329,1335 ---- num_phys_attrs = tupDesc->natts; attr_count = length(attnumlist); num_defaults = 0; + csv_mode = (strlen(delim) > 1); /* * We need a ResultRelInfo so we can use the regular executor's *************** *** 1499,1504 **** --- 1549,1555 ---- if (file_has_oids) { + /* can't be in CSV mode here */ string = CopyReadAttribute(delim, null_print, &result, &isnull); *************** *** 1537,1544 **** errmsg("missing data for column \"%s\"", NameStr(attr[m]->attname)))); ! string = CopyReadAttribute(delim, null_print, ! &result, &isnull); if (isnull) { --- 1588,1608 ---- errmsg("missing data for column \"%s\"", NameStr(attr[m]->attname)))); ! if (csv_mode) ! { ! string = CopyReadAttributeCSV(delim, null_print, ! &result, &isnull); ! if (result == UNTERMINATED_FIELD) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("unterminated CSV quoted field"))); ! } ! else ! { ! string = CopyReadAttribute(delim, null_print, ! &result, &isnull); ! } ! if (isnull) { *************** *** 2069,2074 **** --- 2133,2288 ---- return attribute_buf.data; } + + /* + * Read the value of a single attribute in CSV mode, + * performing de-escaping as needed. Escaping does not follow the normal + * PostgreSQL text mode, but instead "standard" (i.e. common) CSV usage. + * + * Quoted fields can span lines, in which case the line end is embedded + * in the returned string. + * + * delim is a 2- or 3-character string. The first character is the + * field delimiter, the second the quote character, the third is the + * escape character indise quotes, and defaults to the quote character. + * + * null_print is the null marker string. Note that this is compared to + * the pre-de-escaped input string (thus if it is quoted it is not a NULL). + * + * *result is set to indicate what terminated the read: + * NORMAL_ATTR: column delimiter + * END_OF_LINE: end of line + * UNTERMINATED_FIELD no quote detected at end of a quoted field + * + * In any case, the string read up to the terminator (or end of file) + * is returned. + * + * *isnull is set true or false depending on whether the input matched + * the null marker. Note that the caller cannot check this since the + * returned string will be the post-de-escaping equivalent, which may + * look the same as some valid data string. + *---------- + */ + + static char * + CopyReadAttributeCSV(const char *delim, const char *null_print, + CopyReadResult *result, bool *isnull) + { + char delimc = delim[0]; + char quotec = delim[1]; + char escapec = delim[2] ? delim[2] : delim[1]; + char c; + int start_cursor = line_buf.cursor; + int end_cursor = start_cursor;; + int input_len; + bool in_quote = false; + bool saw_quote = false; + + /* reset attribute_buf to empty */ + attribute_buf.len = 0; + attribute_buf.data[0] = '\0'; + + /* set default status */ + *result = END_OF_LINE; + + for (;;) + { + /* handle multiline quoted fields */ + if (in_quote && line_buf.cursor >= line_buf.len) + { + bool done; + + switch(eol_type) + { + case EOL_NL: + appendStringInfoString(&attribute_buf,"\n"); + break; + case EOL_CR: + appendStringInfoString(&attribute_buf,"\r"); + break; + case EOL_CRNL: + appendStringInfoString(&attribute_buf,"\r\n"); + break; + case EOL_UNKNOWN: + /* shouldn't happen - just keep going */ + break; + } + + copy_lineno++; + done = CopyReadLine(); + if (done && line_buf.len == 0) + break; + start_cursor = line_buf.cursor; + } + + end_cursor = line_buf.cursor; + if (line_buf.cursor >= line_buf.len) + break; + c = line_buf.data[line_buf.cursor++]; + /* + * unquoted field delimiter + */ + if (!in_quote && c == delimc) + { + *result = NORMAL_ATTR; + break; + } + /* + * start of quoted field (or part of field) + */ + if (!in_quote && c == quotec) + { + saw_quote = true; + in_quote = true; + continue; + } + /* + * escape within a quoted field + */ + if (in_quote && c == escapec) + { + /* + * peek at the next char if available, and escape it if it + * is an escape char or a quote char + */ + if (line_buf.cursor <= line_buf.len) + { + char nextc = line_buf.data[line_buf.cursor]; + if (nextc == escapec || nextc == quotec) + { + appendStringInfoCharMacro(&attribute_buf, nextc); + line_buf.cursor++; + continue; + } + } + } + /* + * end of quoted field. + * Must do this test after testing for escape in case quote char + * and escape char are the same (which is the common case). + */ + if(in_quote && c == quotec) + { + in_quote = false; + continue; + } + appendStringInfoCharMacro(&attribute_buf, c); + } + + if (in_quote) + *result = UNTERMINATED_FIELD; + + /* check whether raw input matched null marker */ + input_len = end_cursor - start_cursor; + if (!saw_quote && input_len == strlen(null_print) && + strncmp(&line_buf.data[start_cursor], null_print, input_len) == 0) + *isnull = true; + else + *isnull = false; + + return attribute_buf.data; + } + /* * Read a binary attribute */ *************** *** 2192,2197 **** --- 2406,2479 ---- break; } } + } + + /* + * Send CSV representation of one attribute, with conversion and + * CSV type escaping + */ + static void + CopyAttributeOutCSV(char *server_string, char *delim, bool force_quote) + { + char *string; + char c; + char delimc = delim[0]; + char quotec = delim[1]; + char escapec = delim[2] ? delim[2] : delim[1]; + bool need_quote = force_quote; + char *test_string; + bool same_encoding; + int mblen; + int i; + + same_encoding = (server_encoding == client_encoding); + if (!same_encoding) + string = (char *) pg_server_to_client((unsigned char *) server_string, + strlen(server_string)); + else + string = server_string; + + /* have to run through the string twice, + * first time to see if it needs quoting, second to actually send it + */ + + for(test_string = string; + !need_quote && (c = *test_string) != '\0'; + test_string += mblen) + { + if (c == delimc || c == quotec || c == '\n' || c == '\r') + { + need_quote = true; + } + if (!same_encoding) + mblen = pg_encoding_mblen(client_encoding, test_string); + else + mblen = 1; + } + + if (need_quote) + CopySendChar(quotec); + + for (; (c = *string) != '\0'; string += mblen) + { + if (c == quotec || c == escapec) + CopySendChar(escapec); + + CopySendChar(c); + + if (!same_encoding) + { + /* send additional bytes of the char, if any */ + mblen = pg_encoding_mblen(client_encoding, string); + for (i = 1; i < mblen; i++) + CopySendChar(string[i]); + } + else + mblen = 1; + } + + if (need_quote) + CopySendChar(quotec); } /*
---------------------------(end of broadcast)--------------------------- TIP 4: Don't 'kill -9' the postmaster