FYI, this CVS is turning into quite a job, but doing it right takes this
kind of effort.

---------------------------------------------------------------------------

Andrew Dunstan wrote:
> Bruce Momjian wrote:
> 
> >Andrew Dunstan wrote:
> >  
> >
> >>In fact, in the patch I sent in, no quoted string is marked as null when 
> >>being read (so even if you use \N as the null marker, "\N" will be that 
> >>literal and not a null marker). And the null marker, whatever it is, 
> >>should be made quote safe by us throwing an error if it contains the 
> >>quote marker, just as we now make sure that the null marker is 
> >>delimiter-safe.
> >>    
> >>
> >
> >What value does an int column get if the input file has ',,'.  Don't
> >tell me zero?  :-)  Error?
> >  
> >
> 
> If the null marker is not an empty string, it gets an error, of course - 
> if it is it gets a null:
> 
> [EMAIL PROTECTED] pginst]$ echo ',,' | bin/psql -c "create temp table 
> foo (a int, b text, c text); copy foo from stdin delimiter ',\"' null 
> '\\\\N';"
> ERROR:  invalid input syntax for integer: ""
> CONTEXT:  COPY foo, line 1, column a: ""
> [EMAIL PROTECTED] pginst]$ echo ',,' | bin/psql -c "create temp table 
> foo (a int, b text, c text); copy foo from stdin delimiter ',\"' ;"
> [EMAIL PROTECTED] pginst]$
> 
> 
> I hope that is expected behaviour - it's what *I* expect, at least.
> 
> 
> >  
> >
> >>I will check on the write behaviour - it might need ammending too.
> >>
> >>I'll submit a revised patch based on the original syntax scheme, and 
> >>then you (Bruce) can make the syntax/psql changes that seem to be agreed 
> >>on now - is that ok?
> >>    
> >>
> >
> >OK, go as far as you want and post it.  I will turn around a new patch
> >in a few hours after you post.
> >
> >  
> >
> >>The default NULL value issue can be determined at the end of any 
> >>exhaustive debate we have - in the end it's a one line code change ;-)
> >>    
> >>
> >
> >Agreed.
> >
> >  
> >
> 
> Attached patch has these additions to previously posted patch:
> . quote character may not appear in NULL marker
> . any non-null value that matches the NULL marker is forced to be quoted 
> when written.
> 
> 
> cheers
> 
> andrew
> 

> Index: src/backend/commands/copy.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql-server/src/backend/commands/copy.c,v
> retrieving revision 1.219
> diff -c -r1.219 copy.c
> *** src/backend/commands/copy.c       6 Apr 2004 13:21:33 -0000       1.219
> --- src/backend/commands/copy.c       12 Apr 2004 16:21:33 -0000
> ***************
> *** 70,76 ****
>   typedef enum CopyReadResult
>   {
>       NORMAL_ATTR,
> !     END_OF_LINE
>   } CopyReadResult;
>   
>   /*
> --- 70,77 ----
>   typedef enum CopyReadResult
>   {
>       NORMAL_ATTR,
> !     END_OF_LINE,
> !     UNTERMINATED_FIELD
>   } CopyReadResult;
>   
>   /*
> ***************
> *** 136,144 ****
> --- 137,148 ----
>   static bool CopyReadLine(void);
>   static char *CopyReadAttribute(const char *delim, const char *null_print,
>                                                          CopyReadResult *result, 
> bool *isnull);
> + static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
> +                                                        CopyReadResult *result, 
> bool *isnull);
>   static Datum CopyReadBinaryAttribute(int column_no, FmgrInfo *flinfo,
>                                               Oid typelem, bool *isnull);
>   static void CopyAttributeOut(char *string, char *delim);
> + static void CopyAttributeOutCSV(char *string, char *delim, bool force_quote);
>   static List *CopyGetAttnums(Relation rel, List *attnamelist);
>   static void limit_printout_length(StringInfo buf);
>   
> ***************
> *** 682,687 ****
> --- 686,692 ----
>       List       *attnumlist;
>       bool            binary = false;
>       bool            oids = false;
> +     bool        csv_mode = false;
>       char       *delim = NULL;
>       char       *null_print = NULL;
>       Relation        rel;
> ***************
> *** 744,751 ****
>       if (!delim)
>               delim = "\t";
>   
>       if (!null_print)
> !             null_print = "\\N";
>   
>       /*
>        * Open and lock the relation, using the appropriate lock type.
> --- 749,759 ----
>       if (!delim)
>               delim = "\t";
>   
> +     if (strlen(delim) > 1)
> +             csv_mode = true;
> + 
>       if (!null_print)
> !             null_print = csv_mode ? "" : "\\N";
>   
>       /*
>        * Open and lock the relation, using the appropriate lock type.
> ***************
> *** 772,783 ****
>                                          "psql's \\copy command also works for 
> anyone.")));
>   
>       /*
> !      * Presently, only single-character delimiter strings are supported.
>        */
> !     if (strlen(delim) != 1)
>               ereport(ERROR,
>                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
>                                errmsg("COPY delimiter must be a single character")));
>   
>       /*
>        * Don't allow the delimiter to appear in the null string.
> --- 780,806 ----
>                                          "psql's \\copy command also works for 
> anyone.")));
>   
>       /*
> !      * Only single-character delimiter strings are supported,
> !      * except in CSV mode, where the string must be 
> !      * delimiter-char quote-char [escape-char]
>        */
> !     if (!csv_mode && strlen(delim) != 1)
>               ereport(ERROR,
>                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
>                                errmsg("COPY delimiter must be a single character")));
> +     else if (csv_mode)
> +     {
> +             if(strlen(delim) > 3)
> +                     ereport(ERROR,
> +                             (errcode(ERRCODE_SYNTAX_ERROR),
> +                              errmsg("COPY delimiters for CSV must be a 2 or 3 
> characters")));
> +             if (delim[0] == delim[1] || 
> +                     (strlen(delim) == 3 && delim[0] == delim[2]))
> +                     ereport(ERROR,
> +                             (errcode(ERRCODE_SYNTAX_ERROR),
> +                              errmsg("CSV delimiter character must not be same as 
> quote character or escape character")));
> +                     
> +     }
>   
>       /*
>        * Don't allow the delimiter to appear in the null string.
> ***************
> *** 788,793 ****
> --- 811,833 ----
>                                errmsg("COPY delimiter must not appear in the NULL 
> specification")));
>   
>       /*
> +      * Don't allow the csv quote char to appear in the null string.
> +      */
> +     if (csv_mode && strchr(null_print, delim[1]) != NULL)
> +             ereport(ERROR,
> +                             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
> +                              errmsg("CSV quote character  must not appear in the 
> NULL specification")));
> + 
> +     /*
> +      * Don't allow OIDs in CSV mode
> +      */
> + 
> +     if (csv_mode && oids)
> +             ereport(ERROR,
> +                             (errcode(ERRCODE_SYNTAX_ERROR),
> +                              errmsg("Cannot specify OIDS in CSV mode ")));
> + 
> +     /*
>        * Don't allow COPY w/ OIDs to or from a table without them
>        */
>       if (oids && !rel->rd_rel->relhasoids)
> ***************
> *** 969,974 ****
> --- 1009,1015 ----
>       FmgrInfo   *out_functions;
>       Oid                *elements;
>       bool       *isvarlena;
> +     bool       csv_mode;
>       char       *string;
>       Snapshot        mySnapshot;
>       List       *cur;
> ***************
> *** 979,984 ****
> --- 1020,1026 ----
>       attr = tupDesc->attrs;
>       num_phys_attrs = tupDesc->natts;
>       attr_count = length(attnumlist);
> +     csv_mode = (strlen(delim) > 1);
>   
>       /*
>        * Get info about the columns we need to process.
> ***************
> *** 1051,1057 ****
>       while ((tuple = heap_getnext(scandesc, ForwardScanDirection)) != NULL)
>       {
>               bool            need_delim = false;
> - 
>               CHECK_FOR_INTERRUPTS();
>   
>               MemoryContextReset(mycontext);
> --- 1093,1098 ----
> ***************
> *** 1113,1119 ****
>                                                                                      
>                             value,
>                                                                 
> ObjectIdGetDatum(elements[attnum - 1]),
>                                                       Int32GetDatum(attr[attnum - 
> 1]->atttypmod)));
> !                                     CopyAttributeOut(string, delim);
>                               }
>                               else
>                               {
> --- 1154,1167 ----
>                                                                                      
>                             value,
>                                                                 
> ObjectIdGetDatum(elements[attnum - 1]),
>                                                       Int32GetDatum(attr[attnum - 
> 1]->atttypmod)));
> !                                     if (csv_mode)
> !                                     {
> !                                             bool force_quote = 
> (strcmp(string,null_print) == 0);
> !                                             CopyAttributeOutCSV(string, delim, 
> force_quote);
> !                                     }
> !                                     else
> !                                             CopyAttributeOut(string, delim);
> ! 
>                               }
>                               else
>                               {
> ***************
> *** 1263,1268 ****
> --- 1311,1317 ----
>       Datum      *values;
>       char       *nulls;
>       bool            done = false;
> +     bool        csv_mode;
>       bool            isnull;
>       ResultRelInfo *resultRelInfo;
>       EState     *estate = CreateExecutorState(); /* for ExecConstraints() */
> ***************
> *** 1280,1285 ****
> --- 1329,1335 ----
>       num_phys_attrs = tupDesc->natts;
>       attr_count = length(attnumlist);
>       num_defaults = 0;
> +     csv_mode = (strlen(delim) > 1);
>   
>       /*
>        * We need a ResultRelInfo so we can use the regular executor's
> ***************
> *** 1499,1504 ****
> --- 1549,1555 ----
>   
>                       if (file_has_oids)
>                       {
> +                             /* can't be in CSV mode here */
>                               string = CopyReadAttribute(delim, null_print,
>                                                                                  
> &result, &isnull);
>   
> ***************
> *** 1537,1544 ****
>                                                        errmsg("missing data for 
> column \"%s\"",
>                                                                       
> NameStr(attr[m]->attname))));
>   
> !                             string = CopyReadAttribute(delim, null_print,
> !                                                                                
> &result, &isnull);
>   
>                               if (isnull)
>                               {
> --- 1588,1608 ----
>                                                        errmsg("missing data for 
> column \"%s\"",
>                                                                       
> NameStr(attr[m]->attname))));
>   
> !                             if (csv_mode)
> !                             {
> !                                     string = CopyReadAttributeCSV(delim, 
> null_print, 
> !                                                                                    
>            &result, &isnull);
> !                                     if (result == UNTERMINATED_FIELD)
> !                                             ereport(ERROR,
> !                                                             
> (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
> !                                                              errmsg("unterminated 
> CSV quoted field")));
> !                             }
> !                             else
> !                             {
> !                                     string = CopyReadAttribute(delim, null_print, 
> !                                                                                    
>     &result, &isnull);
> !                             }
> !                                     
>   
>                               if (isnull)
>                               {
> ***************
> *** 2069,2074 ****
> --- 2133,2288 ----
>       return attribute_buf.data;
>   }
>   
> + 
> + /*
> +  * Read the value of a single attribute in CSV mode, 
> +  * performing de-escaping as needed. Escaping does not follow the normal
> +  * PostgreSQL text mode, but instead "standard" (i.e. common) CSV usage.
> +  *
> +  * Quoted fields can span lines, in which case the line end is embedded
> +  * in the returned string.
> +  *
> +  * delim is a 2- or 3-character string. The first character is the
> +  * field delimiter, the second the quote character, the third is the
> +  * escape character indise quotes, and defaults to the quote character.
> +  *
> +  * null_print is the null marker string.  Note that this is compared to
> +  * the pre-de-escaped input string (thus if it is quoted it is not a NULL).
> +  *
> +  * *result is set to indicate what terminated the read:
> +  *          NORMAL_ATTR:    column delimiter
> +  *          END_OF_LINE:    end of line
> +  *      UNTERMINATED_FIELD no quote detected at end of a quoted field
> +  *
> +  * In any case, the string read up to the terminator (or end of file)
> +  * is returned.
> +  *
> +  * *isnull is set true or false depending on whether the input matched
> +  * the null marker.  Note that the caller cannot check this since the
> +  * returned string will be the post-de-escaping equivalent, which may
> +  * look the same as some valid data string.
> +  *----------
> +  */
> + 
> + static char *
> + CopyReadAttributeCSV(const char *delim, const char *null_print,
> +                               CopyReadResult *result, bool *isnull)
> + {
> +     char        delimc = delim[0];
> +     char        quotec = delim[1];
> +     char        escapec = delim[2] ? delim[2] : delim[1];
> +     char            c;
> +     int                     start_cursor = line_buf.cursor;
> +     int                     end_cursor = start_cursor;;
> +     int                     input_len;
> +     bool        in_quote = false;
> +     bool        saw_quote = false;
> + 
> +     /* reset attribute_buf to empty */
> +     attribute_buf.len = 0;
> +     attribute_buf.data[0] = '\0';
> + 
> +     /* set default status */
> +     *result = END_OF_LINE;
> + 
> +     for (;;)
> +     {
> +             /* handle multiline quoted fields */
> +             if (in_quote && line_buf.cursor >= line_buf.len)
> +             {
> +                     bool done;
> + 
> +                     switch(eol_type)
> +                     {
> +                             case EOL_NL:
> +                                     appendStringInfoString(&attribute_buf,"\n");
> +                                     break;
> +                             case EOL_CR:
> +                                     appendStringInfoString(&attribute_buf,"\r");
> +                                     break;
> +                             case EOL_CRNL:
> +                                     appendStringInfoString(&attribute_buf,"\r\n");
> +                                     break;
> +                             case EOL_UNKNOWN:
> +                                     /* shouldn't happen - just keep going */
> +                                     break;
> +                     }
> + 
> +                     copy_lineno++;
> +                     done = CopyReadLine();
> +                     if (done && line_buf.len == 0)
> +                             break;
> +                     start_cursor = line_buf.cursor;
> +             }
> + 
> +             end_cursor = line_buf.cursor;
> +             if (line_buf.cursor >= line_buf.len)
> +                     break;
> +             c = line_buf.data[line_buf.cursor++];
> +             /* 
> +              * unquoted field delimiter 
> +              */
> +             if (!in_quote && c == delimc)
> +             {
> +                     *result = NORMAL_ATTR;
> +                     break;
> +             }
> +             /* 
> +              * start of quoted field (or part of field) 
> +              */
> +             if (!in_quote && c == quotec)
> +             {
> +                     saw_quote = true;
> +                     in_quote = true;
> +                     continue;
> +             }
> +             /* 
> +              * escape within a quoted field
> +              */
> +             if (in_quote && c == escapec)
> +             {
> +                     /* 
> +                      * peek at the next char if available, and escape it if it
> +                      * is an escape char or a quote char
> +                      */
> +                     if (line_buf.cursor <= line_buf.len)
> +                     {
> +                             char nextc = line_buf.data[line_buf.cursor];
> +                             if (nextc == escapec || nextc == quotec)
> +                             {
> +                                     appendStringInfoCharMacro(&attribute_buf, 
> nextc);
> +                                     line_buf.cursor++;
> +                                     continue;
> +                             }
> +                     }
> +             } 
> +             /*
> +              * end of quoted field. 
> +              * Must do this test after testing for escape in case quote char
> +              * and escape char are the same (which is the common case).
> +              */
> +             if(in_quote && c == quotec)
> +             {
> +                     in_quote = false;
> +                     continue;
> +             }
> +             appendStringInfoCharMacro(&attribute_buf, c);
> +     }
> + 
> +     if (in_quote)
> +             *result = UNTERMINATED_FIELD;
> + 
> +     /* check whether raw input matched null marker */
> +     input_len = end_cursor - start_cursor;
> +     if (!saw_quote && input_len == strlen(null_print) &&
> +             strncmp(&line_buf.data[start_cursor], null_print, input_len) == 0)
> +             *isnull = true;
> +     else
> +             *isnull = false;
> + 
> +     return attribute_buf.data;
> + }
> + 
>   /*
>    * Read a binary attribute
>    */
> ***************
> *** 2192,2197 ****
> --- 2406,2479 ----
>                               break;
>               }
>       }
> + }
> + 
> + /*
> +  * Send CSV representation of one attribute, with conversion and 
> +  * CSV type escaping
> +  */
> + static void
> + CopyAttributeOutCSV(char *server_string, char *delim, bool force_quote)
> + {
> +     char       *string;
> +     char            c;
> +     char            delimc = delim[0];
> +     char        quotec = delim[1];
> +     char        escapec = delim[2] ? delim[2] : delim[1];
> +     bool        need_quote = force_quote;
> +     char        *test_string;
> +     bool            same_encoding;
> +     int                     mblen;
> +     int                     i;
> + 
> +     same_encoding = (server_encoding == client_encoding);
> +     if (!same_encoding)
> +             string = (char *) pg_server_to_client((unsigned char *) server_string,
> +                                                                                    
>    strlen(server_string));
> +     else
> +             string = server_string;
> + 
> +     /* have to run through the string twice,
> +      * first time to see if it needs quoting, second to actually send it
> +      */
> + 
> +     for(test_string = string; 
> +             !need_quote && (c = *test_string) != '\0'; 
> +             test_string += mblen)
> +     {
> +             if (c == delimc || c == quotec || c == '\n' || c == '\r')
> +             {
> +                     need_quote = true;
> +             }
> +             if (!same_encoding)
> +                     mblen = pg_encoding_mblen(client_encoding, test_string);
> +             else
> +                     mblen = 1;
> +     }
> + 
> +     if (need_quote)
> +             CopySendChar(quotec);
> + 
> +     for (; (c = *string) != '\0'; string += mblen)
> +     {
> +             if (c == quotec || c == escapec)
> +                     CopySendChar(escapec);
> + 
> +             CopySendChar(c);
> + 
> +             if (!same_encoding)
> +             {
> +                     /* send additional bytes of the char, if any */
> +                     mblen = pg_encoding_mblen(client_encoding, string);
> +                     for (i = 1; i < mblen; i++)
> +                             CopySendChar(string[i]);
> +             }
> +             else
> +                     mblen = 1;
> +     }
> + 
> +     if (need_quote)
> +             CopySendChar(quotec);
>   }
>   
>   /*

> 
> ---------------------------(end of broadcast)---------------------------
> TIP 4: Don't 'kill -9' the postmaster

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  [EMAIL PROTECTED]               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073

---------------------------(end of broadcast)---------------------------
TIP 1: subscribe and unsubscribe commands go to [EMAIL PROTECTED]

Reply via email to