Re: [PATCHES] CSV multiline final fix

2005-02-21 Thread Bruce Momjian

Wow, that is significantly different.  Thanks.

Your patch has been added to the PostgreSQL unapplied patches list at:

http://momjian.postgresql.org/cgi-bin/pgpatches

It will be applied as soon as one of the PostgreSQL committers reviews
and approves it.

---


Andrew Dunstan wrote:
> 
> 
> Andrew Dunstan wrote:
> 
> >Bruce Momjian said:
> >  
> >
> >>Shame we had to duplicate CopyReadLine() in a sense.
> >>
> >>
> >>
> >>
> >
> >
> >If you can find a clean way to merge them please do - I'll be very grateful.
> > My head started to spin when I tried. In general I dislike having more than
> >2 or 2 levels of logic in a given piece of code.
> >
> >
> >  
> >
> 
> Previous comment courtesy clumsy fingers and the Department of 
> Redundancy Department (of course, I meant 2 or 3).
> 
> Anyway, please review this patch for copy.c - it's possibly more to your 
> taste. It's less redundant, but I'm not sure it's more clear.
> 
> cheers
> 
> andrew


-- 
  Bruce Momjian|  http://candle.pha.pa.us
  pgman@candle.pha.pa.us   |  (610) 359-1001
  +  If your life is a hard drive, |  13 Roberts Road
  +  Christ can be your backup.|  Newtown Square, Pennsylvania 19073

---(end of broadcast)---
TIP 4: Don't 'kill -9' the postmaster


Re: [PATCHES] CSV multiline final fix

2005-02-21 Thread Andrew Dunstan

Andrew Dunstan wrote:
Bruce Momjian said:
 

Shame we had to duplicate CopyReadLine() in a sense.
   


If you can find a clean way to merge them please do - I'll be very grateful.
My head started to spin when I tried. In general I dislike having more than
2 or 2 levels of logic in a given piece of code.
 

Previous comment courtesy clumsy fingers and the Department of 
Redundancy Department (of course, I meant 2 or 3).

Anyway, please review this patch for copy.c - it's possibly more to your 
taste. It's less redundant, but I'm not sure it's more clear.

cheers
andrew
*** copy.c.orig	Mon Feb 21 23:12:41 2005
--- copy.c	Mon Feb 21 23:35:22 2005
***
*** 98,104 
  static EolType eol_type;		/* EOL type of input */
  static int	client_encoding;	/* remote side's character encoding */
  static int	server_encoding;	/* local encoding */
- static bool embedded_line_warning;
  
  /* these are just for error messages, see copy_in_error_callback */
  static bool copy_binary;		/* is it a binary copy? */
--- 98,103 
***
*** 139,145 
  static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
  		 List *force_notnull_atts);
! static bool CopyReadLine(void);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
    CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
--- 138,144 
  static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
  		 List *force_notnull_atts);
! static bool CopyReadLine(char * quote, char * escape);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
    CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***
*** 1191,1197 
  	attr = tupDesc->attrs;
  	num_phys_attrs = tupDesc->natts;
  	attr_count = list_length(attnumlist);
- 	embedded_line_warning = false;
  
  	/*
  	 * Get info about the columns we need to process.
--- 1190,1195 
***
*** 1718,1724 
  			ListCell   *cur;
  
  			/* Actually read the line into memory here */
! 			done = CopyReadLine();
  
  			/*
  			 * EOF at start of line means we're done.  If we see EOF after
--- 1716,1723 
  			ListCell   *cur;
  
  			/* Actually read the line into memory here */
! 			done = csv_mode ? 
! CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL);
  
  			/*
  			 * EOF at start of line means we're done.  If we see EOF after
***
*** 2006,2012 
   * by newline.
   */
  static bool
! CopyReadLine(void)
  {
  	bool		result;
  	bool		change_encoding = (client_encoding != server_encoding);
--- 2005,2011 
   * by newline.
   */
  static bool
! CopyReadLine(char * quote, char * escape)
  {
  	bool		result;
  	bool		change_encoding = (client_encoding != server_encoding);
***
*** 2015,2020 
--- 2014,2032 
  	int			j;
  	unsigned char s[2];
  	char	   *cvt;
+ 	boolin_quote = false, last_was_esc = false, csv_mode = false;
+ 	charquotec = '\0', escapec = '\0';
+ 
+ 	if (quote)
+ 	{
+ 		csv_mode = true;
+ 		quotec = quote[0];
+ 		escapec = escape[0];
+ 		/* ignore special escape processing if it's the same as quotec */
+ 		if (quotec == escapec)
+ 			escapec = '\0';
+ 	}
+ 
  
  	s[1] = 0;
  
***
*** 2031,2041 
  
  	/*
  	 * In this loop we only care for detecting newlines (\r and/or \n) and
! 	 * the end-of-copy marker (\.).  For backwards compatibility we allow
  	 * backslashes to escape newline characters.  Backslashes other than
  	 * the end marker get put into the line_buf, since CopyReadAttribute
! 	 * does its own escape processing.	These four characters, and only
! 	 * these four, are assumed the same in frontend and backend encodings.
  	 * We do not assume that second and later bytes of a frontend
  	 * multibyte character couldn't look like ASCII characters.
  	 */
--- 2043,2062 
  
  	/*
  	 * In this loop we only care for detecting newlines (\r and/or \n) and
! 	 * the end-of-copy marker (\.).  
! 	 *
! 	 * In Text mode, for backwards compatibility we allow
  	 * backslashes to escape newline characters.  Backslashes other than
  	 * the end marker get put into the line_buf, since CopyReadAttribute
! 	 * does its own escape processing.	
! 	 *
! 	 * In CSV mode, CR and NL inside q quoted field are just part of the
! 	 * data value and are put in line_buf. We keep just enough state
! 	 * to know if we are currently in a quoted field or not.
! 	 *
! 	 * These four characters, and only these four, are assumed the same in 
! 	 * frontend and backend encodings.
! 	 *
  	 * We do not assume that second and later bytes of a frontend
  	 * multibyte character couldn't look like ASCII characters.
  	 */
***
***

Re: [PATCHES] CSV multiline final fix

2005-02-21 Thread Bruce Momjian
Andrew Dunstan wrote:
> Bruce Momjian said:
> >
> > Shame we had to duplicate CopyReadLine() in a sense.
> >
> >
> 
> 
> If you can find a clean way to merge them please do - I'll be very grateful.
>  My head started to spin when I tried. In general I dislike having more than
> 2 or 2 levels of logic in a given piece of code.

OK, will look at that.  Thanks.

-- 
  Bruce Momjian|  http://candle.pha.pa.us
  pgman@candle.pha.pa.us   |  (610) 359-1001
  +  If your life is a hard drive, |  13 Roberts Road
  +  Christ can be your backup.|  Newtown Square, Pennsylvania 19073

---(end of broadcast)---
TIP 1: subscribe and unsubscribe commands go to [EMAIL PROTECTED]


Re: [PATCHES] CSV multiline final fix

2005-02-21 Thread Andrew Dunstan
Bruce Momjian said:
>
> Shame we had to duplicate CopyReadLine() in a sense.
>
>


If you can find a clean way to merge them please do - I'll be very grateful.
 My head started to spin when I tried. In general I dislike having more than
2 or 2 levels of logic in a given piece of code.

cheers

andrew



---(end of broadcast)---
TIP 1: subscribe and unsubscribe commands go to [EMAIL PROTECTED]


Re: [PATCHES] CSV multiline final fix

2005-02-21 Thread Bruce Momjian

Shame we had to duplicate CopyReadLine() in a sense.


Your patch has been added to the PostgreSQL unapplied patches list at:

http://momjian.postgresql.org/cgi-bin/pgpatches

It will be applied as soon as one of the PostgreSQL committers reviews
and approves it.

---


Andrew Dunstan wrote:
> 
> Well, in response to the huge number (0) of comments on my POC patch to 
> fix this, I prepared the attached patch, which improves on my previous 
> effort a bit (there was one obscure failure case which is now handled).
> 
> Basically, all the required logic is in a new function CopyReadLineCSV() 
> which is almost but not quite like CopyReadLine(). The new function 
> keeps just enough state to know if a line ending sequence (CR, CRLF, or 
> LF) is part of a quoted field or not. This gets rid of the need for 
> special casing embedded line endings on input elsewhere, so that is 
> removed, as is the warning about them on output that we added back in 
> december (as we then thought just before release). Lastly, the docs are 
> also patched.
> 
> Also attached is my tiny test file - maybe we need to cover this in 
> regression tests?
> 
> cheers
> 
> andrew

> diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml 
> ./doc/src/sgml/ref/copy.sgml
> *** ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgmlMon Jan 
>  3 19:39:53 2005
> --- ./doc/src/sgml/ref/copy.sgml  Sun Feb 20 19:18:54 2005
> ***
> *** 496,508 
>   
>CSV mode will both recognize and produce CSV files with quoted
>values containing embedded carriage returns and line feeds. Thus
> !  the files are not strictly one line per table row like text-mode
> !  files. However, PostgreSQL will reject
> !  COPY input if any fields contain embedded line
> !  end character sequences that do not match the line ending
> !  convention used in the CSV file itself. It is generally safer to
> !  import data containing embedded line end characters using the
> !  text or binary formats rather than CSV.
>   
>  
>   
> --- 496,503 
>   
>CSV mode will both recognize and produce CSV files with quoted
>values containing embedded carriage returns and line feeds. Thus
> !  the files are not strictly one line per table row as are text-mode
> !  files.
>   
>  
>   
> ***
> *** 513,518 
> --- 508,515 
>might encounter some files that cannot be imported using this
>mechanism, and COPY might produce files that other
>programs cannot process.
> +  It is generally safer to import data using the text or binary formats,
> +  if possible, rather than using CSV format.
>   
>  
>   
> diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c 
> ./src/backend/commands/copy.c
> *** ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c   Fri Dec 
> 31 16:59:41 2004
> --- ./src/backend/commands/copy.c Sun Feb 20 13:40:56 2005
> ***
> *** 98,104 
>   static EolType eol_type;/* EOL type of input */
>   static int  client_encoding;/* remote side's character encoding */
>   static int  server_encoding;/* local encoding */
> - static bool embedded_line_warning;
>   
>   /* these are just for error messages, see copy_in_error_callback */
>   static bool copy_binary;/* is it a binary copy? */
> --- 98,103 
> ***
> *** 140,145 
> --- 139,145 
>char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
>List *force_notnull_atts);
>   static bool CopyReadLine(void);
> + static bool CopyReadLineCSV(char * quote, char * escape);
>   static char *CopyReadAttribute(const char *delim, const char *null_print,
> CopyReadResult *result, bool *isnull);
>   static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
> ***
> *** 1191,1197 
>   attr = tupDesc->attrs;
>   num_phys_attrs = tupDesc->natts;
>   attr_count = list_length(attnumlist);
> - embedded_line_warning = false;
>   
>   /*
>* Get info about the columns we need to process.
> --- 1191,1196 
> ***
> *** 1718,1724 
>   ListCell   *cur;
>   
>   /* Actually read the line into memory here */
> ! done = CopyReadLine();
>   
>   /*
>* EOF at start of line means we're done.  If we see 
> EOF after
> --- 1717,1723 
>   ListCell   *cur;
>   
>   /* Actually read the line into memory here */
> ! done = csv_mode ? CopyReadLineCSV(quote, escape) : 
> CopyReadLine();
>   
>   /*
>* EOF at start of line means 

[PATCHES] CSV multiline final fix

2005-02-20 Thread Andrew Dunstan
Well, in response to the huge number (0) of comments on my POC patch to 
fix this, I prepared the attached patch, which improves on my previous 
effort a bit (there was one obscure failure case which is now handled).

Basically, all the required logic is in a new function CopyReadLineCSV() 
which is almost but not quite like CopyReadLine(). The new function 
keeps just enough state to know if a line ending sequence (CR, CRLF, or 
LF) is part of a quoted field or not. This gets rid of the need for 
special casing embedded line endings on input elsewhere, so that is 
removed, as is the warning about them on output that we added back in 
december (as we then thought just before release). Lastly, the docs are 
also patched.

Also attached is my tiny test file - maybe we need to cover this in 
regression tests?

cheers
andrew
diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml 
./doc/src/sgml/ref/copy.sgml
*** ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml  Mon Jan 
 3 19:39:53 2005
--- ./doc/src/sgml/ref/copy.sgmlSun Feb 20 19:18:54 2005
***
*** 496,508 
  
   CSV mode will both recognize and produce CSV files with quoted
   values containing embedded carriage returns and line feeds. Thus
!  the files are not strictly one line per table row like text-mode
!  files. However, PostgreSQL will reject
!  COPY input if any fields contain embedded line
!  end character sequences that do not match the line ending
!  convention used in the CSV file itself. It is generally safer to
!  import data containing embedded line end characters using the
!  text or binary formats rather than CSV.
  
 
  
--- 496,503 
  
   CSV mode will both recognize and produce CSV files with quoted
   values containing embedded carriage returns and line feeds. Thus
!  the files are not strictly one line per table row as are text-mode
!  files.
  
 
  
***
*** 513,518 
--- 508,515 
   might encounter some files that cannot be imported using this
   mechanism, and COPY might produce files that other
   programs cannot process.
+  It is generally safer to import data using the text or binary formats,
+if possible, rather than using CSV format.
  
 
  
diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c 
./src/backend/commands/copy.c
*** ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c Fri Dec 
31 16:59:41 2004
--- ./src/backend/commands/copy.c   Sun Feb 20 13:40:56 2005
***
*** 98,104 
  static EolType eol_type;  /* EOL type of input */
  static intclient_encoding;/* remote side's character encoding */
  static intserver_encoding;/* local encoding */
- static bool embedded_line_warning;
  
  /* these are just for error messages, see copy_in_error_callback */
  static bool copy_binary;  /* is it a binary copy? */
--- 98,103 
***
*** 140,145 
--- 139,145 
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
 List *force_notnull_atts);
  static bool CopyReadLine(void);
+ static bool CopyReadLineCSV(char * quote, char * escape);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
  CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***
*** 1191,1197 
attr = tupDesc->attrs;
num_phys_attrs = tupDesc->natts;
attr_count = list_length(attnumlist);
-   embedded_line_warning = false;
  
/*
 * Get info about the columns we need to process.
--- 1191,1196 
***
*** 1718,1724 
ListCell   *cur;
  
/* Actually read the line into memory here */
!   done = CopyReadLine();
  
/*
 * EOF at start of line means we're done.  If we see 
EOF after
--- 1717,1723 
ListCell   *cur;
  
/* Actually read the line into memory here */
!   done = csv_mode ? CopyReadLineCSV(quote, escape) : 
CopyReadLine();
  
/*
 * EOF at start of line means we're done.  If we see 
EOF after
***
*** 2194,2199 
--- 2193,2448 
return result;
  }
  
+ /*
+  * Read a line for CSV copy mode. Differences from standard mode:
+  * . CR an NL are not special inside quoted fields - they just get added
+  *   to the buffer.
+  * . \ is not magical except as the start of the end of data marker.
+  *
+  */
+ 
+ static bool
+ CopyReadLineCSV(char * quote, char * escape)
+ {
+   boolresult;
+   boolchange_encoding = (client_encoding != server_encoding);
+