Ok, I've just added support to embedded new lines and partial support to escaped double-quotes. By partial support, I mean they are not unescaped yet...
Index: ext/csv/csv.c =================================================================== --- ext/csv/csv.c +++ ext/csv/csv.c @@ -120,10 +120,11 @@ */ static char *csv_getline( CSV *pCSV ){ int n = 0; int bEol = 0; int bShrink = 0; + int bQuotedCol = 0; /* allocate initial row buffer */ if( pCSV->maxRow < 1 ){ pCSV->zRow = sqlite3_malloc( 100 ); if( pCSV->zRow ){ @@ -150,19 +151,32 @@ pCSV->zRow[n] = '\0'; bEol = -1; break; } /* look for line delimiter */ - while( pCSV->zRow[n] ){ n++; } - if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){ + while( pCSV->zRow[n] ){ + if( pCSV->zRow[n]=='\"' ){ + if( bQuotedCol ) { + if( pCSV->zRow[n+1]=='\"' ) { /* escaped */ + n++; + }else{ + bQuotedCol = 0; + } + }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){ + bQuotedCol = 1; + } + } + n++; + } + if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) && !bQuotedCol ){ pCSV->zRow[n-1] = '\n'; /* uniform line ending */ pCSV->zRow[n] = '\0'; bEol = -1; } } if( bShrink ){ - pCSV->zRow = realloc( pCSV->zRow, n+1 ); + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 ); pCSV->maxRow = n+1; } return bEol ? pCSV->zRow : 0; } @@ -332,17 +346,23 @@ do{ /* if it begins with a quote, assume it's a quoted col */ if( *s=='\"' ){ s++; /* skip quote */ pCSV->aCols[nCol] = s; /* save pointer for this col */ - /* TBD: handle escaped quotes "" */ /* find closing quote */ - s = strchr(s, '\"'); - if( !s ){ - /* no closing quote */ - pCSV->eof = -1; - return SQLITE_ERROR; + while( 1 ){ + s = strchr(s, '\"'); + if( !s ){ + /* no closing quote */ + pCSV->eof = -1; + return SQLITE_ERROR; + }else if ( *(s+1)=='\"' ){ + /* TBD: replace all escaped quotes by a single one */ + s+=2; + }else{ + break; + } } *s = '\0'; /* null terminate this col */ /* fall through and look for following ",\n" */ s++; }else{ @@ -608,11 +628,11 @@ *pzErr = sqlite3_mprintf("%s", aErrMsg[4]); sqlite3_free(zSql); csvRelease( pCSV ); return SQLITE_ERROR; } - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail); + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail); }else{ zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail); } sqlite3_free(zTmp); } On Sat, May 8, 2010 at 3:45 PM, gwenn <gwenn.k...@gmail.com> wrote: > While looking in csv1.test, I found a solution to the case when header row > contains spaces: just wrap the column name with double quotes. > > Index: ext/csv/csv.c > =================================================================== > --- ext/csv/csv.c > +++ ext/csv/csv.c > @@ -158,11 +158,11 @@ > pCSV->zRow[n] = '\0'; > bEol = -1; > } > } > if( bShrink ){ > - pCSV->zRow = realloc( pCSV->zRow, n+1 ); > + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 ); > pCSV->maxRow = n+1; > } > return bEol ? pCSV->zRow : 0; > } > > @@ -608,11 +608,11 @@ > *pzErr = sqlite3_mprintf("%s", aErrMsg[4]); > sqlite3_free(zSql); > csvRelease( pCSV ); > return SQLITE_ERROR; > } > - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail); > + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail); > }else{ > zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail); > } > sqlite3_free(zTmp); > } > > Index: ext/csv/csv1.test > =================================================================== > --- ext/csv/csv1.test > +++ ext/csv/csv1.test > @@ -23,10 +23,11 @@ > # > # csv-1.*: Creating/destroying csv tables. > # csv-2.*: Linear scans of csv data. > # csv-3.*: Test renaming an csv table. > # csv-4.*: CREATE errors > +# csv-5.*: Dirty header and long line. > # > > ifcapable !csv { > finish_test > return > @@ -36,10 +37,12 @@ > set test1csv [file join [file dirname [info script]] test1.csv] > # This file is delimited by '|' and has quoted fields. > set test2csv [file join [file dirname [info script]] test2.csv] > # This file is delimited by '|'. It does NOT have quoted fields. > set test3csv [file join [file dirname [info script]] test3.csv] > +# This file contains a dirty header and one long line. > +set test4csv [file join [file dirname [info script]] test4.csv] > > > #---------------------------------------------------------------------------- > # Test cases csv-1.* test CREATE and DROP table statements. > # > > @@ -249,5 +252,14 @@ > catchsql " CREATE VIRTUAL TABLE t1 USING csv('foo') " > } {1 {Error opening CSV file: 'foo'}} > do_test csv-4.1.3 { > catchsql " CREATE VIRTUAL TABLE t1 USING csv(foo foo) " > } {1 {Error opening CSV file: 'foo foo'}} > + > > +#---------------------------------------------------------------------------- > +# Test cases csv-5.* test file with dirty header and long line. > +# > + > +do_test csv-5.1.1 { > + execsql " CREATE VIRTUAL TABLE t1 USING csv('$test4csv') " > + execsql " CREATE VIRTUAL TABLE t2 USING csv('$test4csv', ',', > USE_HEADER_ROW) " > +} {} > > ADDED ext/csv/test4.csv > col 1,col.2,col-3,col!4,c...@5,col;6,col%7,col*8,col=9,col'10 > > 123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789 > > I tried to handle double-quoted fields with embedded line breaks without > success but I am not stuck yet. > Regards. > > On Wed, Apr 21, 2010 at 8:22 PM, gwenn <gwenn.k...@gmail.com> wrote: > >> Thanks for this great extension. >> It works smoothly with 500Mo files. >> >> And it's a workaround to some shortcomings of the '.import' command: >> - no need to create a table before, >> - no need to delete the header row before/after, >> - no error if the number of columns is not homogeneous, >> - ... >> It's a nightmare to work with the CSV format but I have to. >> >> I made a quick and dirty fix to the USE_HEADER_ROW mode to replace >> whitespaces, slashes or hyphens by underscores. >> But I look for a better solution. Is there any way to make sure a string >> is a valid column name? >> >> Regards >> >> On Mon Apr 19 17:32:13 GMT 2010, Shane Harrelson <shane at sqlite.org> >> wrote: >> > Thanks for the report. The extension is still very a much a >> > work-in-progress and any feedback is greatly appreciated. >> > >> > -Shane >> >> > >> > >> > On Sun, Apr 18, 2010 at 12:51 PM, gwenn <gwenn.kahz at gmail.com> >> wrote: >> > > Hello, >> > > There is a little bug/typo in the csv extension when lines exceed 100 >> > > characters: >> > > *** glibc detected *** sqlite3: realloc(): invalid pointer: >> > > 0x0000000000ad1a78 *** >> > > ======= Backtrace: ========= >> > > /lib/libc.so.6[0x7f6dab009d16] >> > > /lib/libc.so.6(realloc+0x321)[0x7f6dab00fda1] >> > > ./libSqliteCsv.so[0x7f6da9ef9dbf] >> > > >> > > A possible patch is: >> > > --- /tmp/SQLite-d474195a997b9d94/ext/csv/csv.c 2009-11-05 >> 05:14:30.000000000 >> > > +0100 >> > > +++ csv.c 2010-04-18 18:48:04.000000000 +0200 >> > > @@ -160,7 +160,7 @@ >> > > } >> > > } >> > > if( bShrink ){ >> > > - pCSV->zRow = realloc( pCSV->zRow, n+1 ); >> > > + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 ); >> > > pCSV->maxRow = n+1; >> > > } >> > > return bEol ? pCSV->zRow : 0; >> > > >> > > Regards. >> > > _______________________________________________ >> > > sqlite-users mailing list >> > > sqlite-users at sqlite.org >> > > http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users >> > > >> > > _______________________________________________ sqlite-users mailing list sqlite-users@sqlite.org http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users