[PATCHES] CSV multiline final fix

Andrew Dunstan Sun, 20 Feb 2005 17:09:20 -0800

Well, in response to the huge number (0) of comments on my POC patch to fix this, I prepared the attached patch, which improves on my previous effort a bit (there was one obscure failure case which is now handled).

Basically, all the required logic is in a new function CopyReadLineCSV() which is almost but not quite like CopyReadLine(). The new function keeps just enough state to know if a line ending sequence (CR, CRLF, or LF) is part of a quoted field or not. This gets rid of the need for special casing embedded line endings on input elsewhere, so that is removed, as is the warning about them on output that we added back in december (as we then thought just before release). Lastly, the docs are also patched.

Also attached is my tiny test file - maybe we need to cover this in regression tests?

cheers

andrew

diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml 
./doc/src/sgml/ref/copy.sgml
*** ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml      Mon Jan 
 3 19:39:53 2005
--- ./doc/src/sgml/ref/copy.sgml        Sun Feb 20 19:18:54 2005
***************
*** 496,508 ****
      <para>
       CSV mode will both recognize and produce CSV files with quoted
       values containing embedded carriage returns and line feeds. Thus
!      the files are not strictly one line per table row like text-mode
!      files. However, <productname>PostgreSQL</productname> will reject
!      <command>COPY</command> input if any fields contain embedded line
!      end character sequences that do not match the line ending
!      convention used in the CSV file itself. It is generally safer to
!      import data containing embedded line end characters using the
!      text or binary formats rather than CSV.
      </para>
     </note>
  
--- 496,503 ----
      <para>
       CSV mode will both recognize and produce CSV files with quoted
       values containing embedded carriage returns and line feeds. Thus
!      the files are not strictly one line per table row as are text-mode
!      files.
      </para>
     </note>
  
***************
*** 513,518 ****
--- 508,515 ----
       might encounter some files that cannot be imported using this
       mechanism, and <command>COPY</> might produce files that other
       programs cannot process.
+      It is generally safer to import data using the text or binary formats,
+        if possible, rather than using CSV format.
      </para>
     </note>
      
diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c 
./src/backend/commands/copy.c
*** ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c     Fri Dec 
31 16:59:41 2004
--- ./src/backend/commands/copy.c       Sun Feb 20 13:40:56 2005
***************
*** 98,104 ****
  static EolType eol_type;              /* EOL type of input */
  static int    client_encoding;        /* remote side's character encoding */
  static int    server_encoding;        /* local encoding */
- static bool embedded_line_warning;
  
  /* these are just for error messages, see copy_in_error_callback */
  static bool copy_binary;              /* is it a binary copy? */
--- 98,103 ----
***************
*** 140,145 ****
--- 139,145 ----
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
                 List *force_notnull_atts);
  static bool CopyReadLine(void);
+ static bool CopyReadLineCSV(char * quote, char * escape);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
                                  CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***************
*** 1191,1197 ****
        attr = tupDesc->attrs;
        num_phys_attrs = tupDesc->natts;
        attr_count = list_length(attnumlist);
-       embedded_line_warning = false;
  
        /*
         * Get info about the columns we need to process.
--- 1191,1196 ----
***************
*** 1718,1724 ****
                        ListCell   *cur;
  
                        /* Actually read the line into memory here */
!                       done = CopyReadLine();
  
                        /*
                         * EOF at start of line means we're done.  If we see 
EOF after
--- 1717,1723 ----
                        ListCell   *cur;
  
                        /* Actually read the line into memory here */
!                       done = csv_mode ? CopyReadLineCSV(quote, escape) : 
CopyReadLine();
  
                        /*
                         * EOF at start of line means we're done.  If we see 
EOF after
***************
*** 2194,2199 ****
--- 2193,2448 ----
        return result;
  }
  
+ /*
+  * Read a line for CSV copy mode. Differences from standard mode:
+  * . CR an NL are not special inside quoted fields - they just get added
+  *   to the buffer.
+  * . \ is not magical except as the start of the end of data marker.
+  *
+  */
+ 
+ static bool
+ CopyReadLineCSV(char * quote, char * escape)
+ {
+       bool            result;
+       bool            change_encoding = (client_encoding != server_encoding);
+       int                     c;
+       int                     mblen;
+       int                     j;
+       unsigned char s[2];
+       char       *cvt;
+       bool        in_quote = false, last_was_esc = false;
+       char        quotec = quote[0];
+       char        escapec = escape[0];
+ 
+       s[1] = 0;
+ 
+       /* ignore special escape processing if it's the same as quote */
+       if (quotec == escapec)
+               escapec = '\0';
+ 
+       /* reset line_buf to empty */
+       line_buf.len = 0;
+       line_buf.data[0] = '\0';
+       line_buf.cursor = 0;
+ 
+       /* mark that encoding conversion hasn't occurred yet */
+       line_buf_converted = false;
+ 
+       /* set default status */
+       result = false;
+ 
+       /*
+        * In this loop we only care for detecting newlines (\r and/or \n)
+        * and the end-of-copy marker (\.).  These four
+        * characters, and only these four, are assumed the same in frontend
+        * and backend encodings.  We do not assume that second and later bytes
+        * of a frontend multibyte character couldn't look like ASCII 
characters.
+        *
+        * What about the encoding implications of the quote / excape chars?
+        *
+        * However, CR and NL characters that are inside a quoted field are
+        * not special, and are simply a part of the data value. The parsing 
rule
+        * used is a bit rough and ready, but probably adequate for our 
purposes.
+        */
+ 
+       for (;;)
+       {
+               c = CopyGetChar();
+               if (c == EOF)
+               {
+                       result = true;
+                       break;
+               }
+ 
+               /*
+                * Dealing with quotes and escapes here is mildly tricky. If the
+                * quote char is also the escape char, there's no problem - we
+                * just use the char as a toggle. If they are different, we need
+                * to ensure that we only take account of an escape inside a 
quoted
+                * field and immediately preceding a quote char, and not the
+                * second in a escape-escape sequence.
+                */
+ 
+               if (in_quote && c == escapec)
+                       last_was_esc = ! last_was_esc;
+ 
+               if (c == quotec && ! last_was_esc)
+                       in_quote = ! in_quote;
+ 
+               if (c != escapec)
+                       last_was_esc = false;
+ 
+               /* 
+                * updating the line count for embedded CR and/or LF chars is
+                * necessarily a little fragile - this test is probably about
+                * the best we can do.
+                */
+               if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
+                       copy_lineno++;
+               
+               if (!in_quote && c == '\r')
+               {
+                       if (eol_type == EOL_NL)
+                               ereport(ERROR,
+                                               
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                errmsg("unquoted carriage 
return found in CSV data"),
+                                 errhint("Use quoted CSV field to represent 
carriage return.")));
+                       /* Check for \r\n on first line, _and_ handle \r\n. */
+                       if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
+                       {
+                               int                     c2 = CopyPeekChar();
+ 
+                               if (c2 == '\n')
+                               {
+                                       CopyDonePeek(c2, true);         /* eat 
newline */
+                                       eol_type = EOL_CRNL;
+                               }
+                               else
+                               {
+                                       /* found \r, but no \n */
+                                       if (eol_type == EOL_CRNL)
+                                               ereport(ERROR,
+                                                               
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                errmsg("unquoted carriage 
return found in CSV data"),
+                                                                errhint("Use 
quoted CSV field to represent carriage return.")));
+ 
+                                       /*
+                                        * if we got here, it is the first line 
and we didn't
+                                        * get \n, so put it back
+                                        */
+                                       CopyDonePeek(c2, false);
+                                       eol_type = EOL_CR;
+                               }
+                       }
+                       break;
+               }
+               if (!in_quote && c == '\n')
+               {
+                       if (eol_type == EOL_CR || eol_type == EOL_CRNL)
+                               ereport(ERROR,
+                                               
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                errmsg("unquoted newline found 
in CSV data"),
+                                                errhint("Use quoted CSV field 
to represent newline.")));
+                       eol_type = EOL_NL;
+                       break;
+               }
+ 
+               /* \ is only potentially magical at the start of a line */
+               if (line_buf.len == 0 && c == '\\')
+               {
+                       int                     c2 = CopyPeekChar();
+ 
+                       if (c2 == EOF)
+                       {
+                               result = true;
+ 
+                               CopyDonePeek(c2, true); /* eat it - do we need 
to? */
+ 
+                               break;
+                       }
+                       if (c2 == '.')
+                       {
+ 
+                               CopyDonePeek(c2, true); /* so we can keep 
calling GetChar() */
+ 
+                               if (eol_type == EOL_CRNL)
+                               {
+                                       c = CopyGetChar();
+                                       if (c == '\n')
+                                               ereport(ERROR,
+                                                               
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                                
errmsg("end-of-copy marker does not match previous newline style")));
+                                       if (c != '\r')
+                                               ereport(ERROR,
+                                                               
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                                
errmsg("end-of-copy marker corrupt")));
+                               }
+                               c = CopyGetChar();
+                               if (c != '\r' && c != '\n')
+                                       ereport(ERROR,
+                                                       
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                        errmsg("end-of-copy 
marker corrupt")));
+                               if ((eol_type == EOL_NL && c != '\n') ||
+                                       (eol_type == EOL_CRNL && c != '\n') ||
+                                       (eol_type == EOL_CR && c != '\r'))
+                                       ereport(ERROR,
+                                                       
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                                        errmsg("end-of-copy 
marker does not match previous newline style")));
+ 
+                               /*
+                                * In protocol version 3, we should ignore 
anything
+                                * after \. up to the protocol end of copy 
data.  (XXX
+                                * maybe better not to treat \. as special?)
+                                */
+                               if (copy_dest == COPY_NEW_FE)
+                               {
+                                       while (c != EOF)
+                                               c = CopyGetChar();
+                               }
+                               result = true;  /* report EOF */
+                               break;
+                       }
+ 
+                       CopyDonePeek(c2, false); /* not a dot, so put it back */
+ 
+               }
+ 
+               appendStringInfoCharMacro(&line_buf, c);
+ 
+               /*
+                * When client encoding != server, must be careful to read the
+                * extra bytes of a multibyte character exactly, since the 
encoding
+                * might not ensure they don't look like ASCII.  When the 
encodings
+                * are the same, we need not do this, since no server encoding 
we
+                * use has ASCII-like following bytes.
+                */
+               if (change_encoding)
+               {
+                       s[0] = c;
+                       mblen = pg_encoding_mblen(client_encoding, s);
+                       for (j = 1; j < mblen; j++)
+                       {
+                               c = CopyGetChar();
+                               if (c == EOF)
+                               {
+                                       result = true;
+                                       break;
+                               }
+                               appendStringInfoCharMacro(&line_buf, c);
+                       }
+                       if (result)
+                               break;                  /* out of outer loop */
+               }
+       } /* end of outer loop */
+ 
+       /*
+        * Done reading the line.  Convert it to server encoding.
+        *
+        * Note: set line_buf_converted to true *before* attempting conversion;
+        * this prevents infinite recursion during error reporting should
+        * pg_client_to_server() issue an error, due to copy_in_error_callback
+        * again attempting the same conversion.  We'll end up issuing the 
message
+        * without conversion, which is bad but better than nothing ...
+        */
+       line_buf_converted = true;
+ 
+       if (change_encoding)
+       {
+               cvt = (char *) pg_client_to_server((unsigned char *) 
line_buf.data,
+                                                                               
   line_buf.len);
+               if (cvt != line_buf.data)
+               {
+                       /* transfer converted data back to line_buf */
+                       line_buf.len = 0;
+                       line_buf.data[0] = '\0';
+                       appendBinaryStringInfo(&line_buf, cvt, strlen(cvt));
+               }
+       }
+ 
+       return result;
+ }
+ 
  /*----------
   * Read the value of a single attribute, performing de-escaping as needed.
   *
***************
*** 2369,2402 ****
  
        for (;;)
        {
-               /* handle multiline quoted fields */
-               if (in_quote && line_buf.cursor >= line_buf.len)
-               {
-                       bool            done;
- 
-                       switch (eol_type)
-                       {
-                               case EOL_NL:
-                                       appendStringInfoString(&attribute_buf, 
"\n");
-                                       break;
-                               case EOL_CR:
-                                       appendStringInfoString(&attribute_buf, 
"\r");
-                                       break;
-                               case EOL_CRNL:
-                                       appendStringInfoString(&attribute_buf, 
"\r\n");
-                                       break;
-                               case EOL_UNKNOWN:
-                                       /* shouldn't happen - just keep going */
-                                       break;
-                       }
- 
-                       copy_lineno++;
-                       done = CopyReadLine();
-                       if (done && line_buf.len == 0)
-                               break;
-                       start_cursor = line_buf.cursor;
-               }
- 
                end_cursor = line_buf.cursor;
                if (line_buf.cursor >= line_buf.len)
                        break;
--- 2618,2623 ----
***************
*** 2629,2653 ****
                 !use_quote && (c = *test_string) != '\0';
                 test_string += mblen)
        {
-               /*
-                * We don't know here what the surrounding line end characters
-                * might be. It might not even be under postgres' control. So
-                * we simple warn on ANY embedded line ending character.
-                *
-                * This warning will disappear when we make line parsing 
field-aware,
-                * so that we can reliably read in embedded line ending 
characters
-                * regardless of the file's line-end context.
-                *
-                */
- 
-               if (!embedded_line_warning  && (c == '\n' || c == '\r') )
-               {
-                       embedded_line_warning = true;
-                       elog(WARNING,
-                                "CSV fields with embedded linefeed or carriage 
return "
-                                "characters might not be able to be 
reimported");
-               }
- 
                if (c == delimc || c == quotec || c == '\n' || c == '\r')
                        use_quote = true;
                if (!same_encoding)
--- 2850,2855 ----


create temp table copytest (
	style	text,
	test 	text,
	filler	int);

insert into copytest values('DOS','abc\r\ndef',1);
insert into copytest values('Unix','abc\ndef',2);
insert into copytest values('Mac','abc\rdef',3);
insert into copytest values('esc\\ape','a\\r\\\r\\\n\\nb',4);

copy copytest to '/tmp/copytest.csv' csv;

create temp table copytest2 (like copytest);

copy copytest2 from '/tmp/copytest.csv' csv;

select * from copytest except select * from copytest2;

truncate copytest2;

copy copytest to '/tmp/copytest.csv' csv quote '\'' escape '\\';

copy copytest2 from '/tmp/copytest.csv' csv quote '\'' escape '\\';

select * from copytest except select * from copytest2;

create or replace function nice_quote(text) returns text language plperl as $$

  my $txt = shift;
  foreach ($txt) 
  {	
	s/\r/\\r/g;
	s/\n/\\n/g;
  };
  return $txt;

$$;

select nice_quote(quote_literal(style)) as style, 
       nice_quote(quote_literal(test)) as test 
from copytest;

---------------------------(end of broadcast)---------------------------
TIP 6: Have you searched our list archives?

               http://archives.postgresql.org

[PATCHES] CSV multiline final fix

Reply via email to