Here is some tests: --- csv1.test +++ csv1.test @@ -23,10 +23,11 @@ # # csv-1.*: Creating/destroying csv tables. # csv-2.*: Linear scans of csv data. # csv-3.*: Test renaming an csv table. # csv-4.*: CREATE errors +# csv-5.*: Dirty header, long line, escaped quotes, escaped newlines. #
ifcapable !csv { finish_test return @@ -36,10 +37,13 @@ set test1csv [file join [file dirname [info script]] test1.csv] # This file is delimited by '|' and has quoted fields. set test2csv [file join [file dirname [info script]] test2.csv] # This file is delimited by '|'. It does NOT have quoted fields. set test3csv [file join [file dirname [info script]] test3.csv] +# This file contains a dirty header, one long line, escaped quotes, escaped +# new lines. +set test4csv [file join [file dirname [info script]] test4.csv] #---------------------------------------------------------------------------- # Test cases csv-1.* test CREATE and DROP table statements. # @@ -249,5 +253,40 @@ catchsql " CREATE VIRTUAL TABLE t1 USING csv('foo') " } {1 {Error opening CSV file: 'foo'}} do_test csv-4.1.3 { catchsql " CREATE VIRTUAL TABLE t1 USING csv(foo foo) " } {1 {Error opening CSV file: 'foo foo'}} + +#---------------------------------------------------------------------------- +# Test cases csv-5.* test file with dirty header and long line. +# + +do_test csv-5.1.1 { + execsql " CREATE VIRTUAL TABLE t1 USING csv('$test4csv') " + execsql " CREATE VIRTUAL TABLE t2 USING csv('$test4csv', ',', USE_HEADER_ROW) " +} {} +do_test csv-5.1.2 { + execsql { + SELECT col1 FROM t1 limit 1 offset 1; + } +} {123456789} +do_test csv-5.1.3 { + execsql { + SELECT * FROM t1 limit 1 offset 3; + } +} {{123456789 +} { +} {} {123456789 +} {1234\\567'89 +} {123456"789 +} {123""456789 +} 1234\"5678\"9 123456789\" {}} +do_test csv-5.1.4 { + execsql { + SELECT col1,col2,col3 FROM t1 limit 1 offset 4; + } +} {{} ' {}} +do_test csv-5.1.5 { + execsql { + SELECT col1 FROM t1 limit 1 offset 5; + } +} {'} And the test file (test4.csv): col 1,col.2,col-3,col!4,c...@5,col;6,col%7,col*8,col=9,col'10 123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 "123456789 "," ","","123456789 ","1234\\567'89 ","123456""789 ","123""""456789 ",1234"5678"9,123456789","" "",', ' On Thu, May 13, 2010 at 9:28 PM, gwenn <gwenn.k...@gmail.com> wrote: > Done! > > Index: ext/csv/csv.c > =================================================================== > --- ext/csv/csv.c > +++ ext/csv/csv.c > @@ -60,10 +60,11 @@ > char *zRow; /* Buffer for current CSV row */ > char cDelim; /* Character to use for delimiting columns > */ > int nCol; /* Number of columns in current row */ > int maxCol; /* Size of aCols array */ > char **aCols; /* Array of parsed columns */ > + int *aEscapedQuotes; /* Number of escaped quotes for each column > in aCols */ > }; > > > /* > ** An CSV cursor object. > @@ -120,10 +121,11 @@ > */ > static char *csv_getline( CSV *pCSV ){ > int n = 0; > int bEol = 0; > int bShrink = 0; > + int bQuotedCol = 0; > > /* allocate initial row buffer */ > if( pCSV->maxRow < 1 ){ > pCSV->zRow = sqlite3_malloc( 100 ); > if( pCSV->zRow ){ > @@ -135,10 +137,13 @@ > /* read until eol */ > while( !bEol ){ > /* grow row buffer as needed */ > if( n+100>pCSV->maxRow ){ > int newSize = pCSV->maxRow*2 + 100; > + if( newSize>=pCSV->db->aLimit[SQLITE_LIMIT_LENGTH] ){ > + return 0; > + } > char *p = sqlite3_realloc(pCSV->zRow, newSize); > if( !p ) return 0; > pCSV->maxRow = newSize; > pCSV->zRow = p; > bShrink = -1; > @@ -150,19 +155,32 @@ > pCSV->zRow[n] = '\0'; > bEol = -1; > break; > } > /* look for line delimiter */ > - while( pCSV->zRow[n] ){ n++; } > - if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){ > + while( pCSV->zRow[n] ){ > + if( pCSV->zRow[n]=='\"' ){ > + if( bQuotedCol ) { > + if( pCSV->zRow[n+1]=='\"' ) { /* escaped */ > + n++; > + }else{ > + bQuotedCol = 0; > + } > + }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){ > + bQuotedCol = 1; > + } > + } > + n++; > + } > + if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) && > !bQuotedCol ){ > pCSV->zRow[n-1] = '\n'; /* uniform line ending */ > pCSV->zRow[n] = '\0'; > bEol = -1; > } > } > if( bShrink ){ > - pCSV->zRow = realloc( pCSV->zRow, n+1 ); > + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 ); > pCSV->maxRow = n+1; > } > return bEol ? pCSV->zRow : 0; > } > > @@ -317,15 +335,16 @@ > /* allocate initial space for the column pointers */ > if( pCSV->maxCol < 1 ){ > /* take a guess */ > int maxCol = (int)(strlen(pCSV->zRow) / 5 + 1); > pCSV->aCols = (char **)sqlite3_malloc( sizeof(char*) * maxCol ); > + pCSV->aEscapedQuotes = (int *)sqlite3_malloc( sizeof(int) * maxCol ); > if( pCSV->aCols ){ > pCSV->maxCol = maxCol; > } > } > - if( !pCSV->aCols ) return SQLITE_NOMEM; > + if( !pCSV->aCols || !pCSV->aEscapedQuotes ) return SQLITE_NOMEM; > > /* add custom delim character */ > zDelims[0] = pCSV->cDelim; > > /* parse the zRow into individual columns */ > @@ -332,23 +351,32 @@ > do{ > /* if it begins with a quote, assume it's a quoted col */ > if( *s=='\"' ){ > s++; /* skip quote */ > pCSV->aCols[nCol] = s; /* save pointer for this col */ > - /* TBD: handle escaped quotes "" */ > + pCSV->aEscapedQuotes[nCol] = 0; > /* find closing quote */ > - s = strchr(s, '\"'); > - if( !s ){ > - /* no closing quote */ > - pCSV->eof = -1; > - return SQLITE_ERROR; > + while( 1 ){ > + s = strchr(s, '\"'); > + if( !s ){ > + /* no closing quote */ > + pCSV->eof = -1; > + return SQLITE_ERROR; > + }else if ( *(s+1)=='\"' ){ > + /* escaped quote */ > + pCSV->aEscapedQuotes[nCol]++; > + s+=2; > + }else{ > + break; > + } > } > *s = '\0'; /* null terminate this col */ > /* fall through and look for following ",\n" */ > s++; > }else{ > pCSV->aCols[nCol] = s; /* save pointer for this col */ > + pCSV->aEscapedQuotes[nCol] = 0; > } > s = strpbrk(s, zDelims); > if( !s ){ > /* no col delimiter */ > pCSV->eof = -1; > @@ -362,18 +390,27 @@ > if( cDelim == '\n' ) break; > /* move to start of next col */ > s++; /* skip delimiter */ > > if(nCol >= pCSV->maxCol ){ > + if( nCol>=pCSV->db->aLimit[SQLITE_LIMIT_COLUMN] ){ > + return SQLITE_ERROR; > + } > /* we need to grow our col pointer array */ > char **p = (char **)sqlite3_realloc( pCSV->aCols, sizeof(char*) * > (nCol+5) ); > if( !p ){ > /* out of memory */ > return SQLITE_ERROR; > } > pCSV->maxCol = nCol + 5; > pCSV->aCols = p; > + int *p1 = (int *)sqlite3_realloc( pCSV->aEscapedQuotes, sizeof(int) > * (nCol+5) ); > + if( !p1 ){ > + /* out of memory */ > + return SQLITE_ERROR; > + } > + pCSV->aEscapedQuotes = p1; > } > > }while( *s ); > > pCSV->nCol = nCol; > @@ -402,13 +439,38 @@ > CSV *pCSV = (CSV *)pVtabCursor->pVtab; > > if( i<0 || i>=pCSV->nCol ){ > sqlite3_result_null( ctx ); > }else{ > - char *col = pCSV->aCols[i]; > + const char *col = pCSV->aCols[i]; > if( !col ){ > sqlite3_result_null( ctx ); > + }else if( pCSV->aEscapedQuotes[i] ){ > + char *z; > + > + int nByte = (int)(strlen(col) - pCSV->aEscapedQuotes[i]); > + if( nByte>pCSV->db->aLimit[SQLITE_LIMIT_LENGTH] ){ > + sqlite3_result_error_toobig( ctx ); > + z = 0; > + }else{ > + z = sqlite3_malloc( nByte ); > + if( !z ){ > + sqlite3_result_error_nomem( ctx ); > + } > + } > + if( z ){ > + int j,k; > + for(j=0, k=0; col[j]; j++){ > + z[k++] = col[j]; > + if( col[j]=='\"' ){ > + /* unescape quote */ > + j++; > + } > + } > + z[k] = 0; > + sqlite3_result_text( ctx, z, k, sqlite3_free ); > + } > }else{ > sqlite3_result_text( ctx, col, -1, SQLITE_TRANSIENT ); > } > } > > @@ -473,10 +535,11 @@ > /* finalize any prepared statements here */ > > csv_close( pCSV ); > if( pCSV->zRow ) sqlite3_free( pCSV->zRow ); > if( pCSV->aCols ) sqlite3_free( pCSV->aCols ); > + if( pCSV->aEscapedQuotes ) sqlite3_free( pCSV->aEscapedQuotes ); > sqlite3_free( pCSV ); > } > return 0; > } > > @@ -539,10 +602,11 @@ > return SQLITE_NOMEM; > } > > /* intialize virtual table object */ > memset(pCSV, 0, sizeof(CSV)+nDb+nName+nFile+3); > + pCSV->db = db; > pCSV->nBusy = 1; > pCSV->base.pModule = &csvModule; > pCSV->cDelim = cDelim; > pCSV->zDb = (char *)&pCSV[1]; > pCSV->zName = &pCSV->zDb[nDb+1]; > @@ -608,11 +672,11 @@ > *pzErr = sqlite3_mprintf("%s", aErrMsg[4]); > sqlite3_free(zSql); > csvRelease( pCSV ); > return SQLITE_ERROR; > } > - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail); > + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail); > }else{ > zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail); > } > sqlite3_free(zTmp); > } > > I'll add some testcases... > > > On Sat, May 8, 2010 at 9:44 PM, gwenn <gwenn.k...@gmail.com> wrote: > >> Ok, >> I've just added support to embedded new lines and partial support to >> escaped double-quotes. >> By partial support, I mean they are not unescaped yet... >> >> Index: ext/csv/csv.c >> =================================================================== >> --- ext/csv/csv.c >> +++ ext/csv/csv.c >> @@ -120,10 +120,11 @@ >> */ >> static char *csv_getline( CSV *pCSV ){ >> int n = 0; >> int bEol = 0; >> int bShrink = 0; >> + int bQuotedCol = 0; >> >> /* allocate initial row buffer */ >> if( pCSV->maxRow < 1 ){ >> pCSV->zRow = sqlite3_malloc( 100 ); >> if( pCSV->zRow ){ >> @@ -150,19 +151,32 @@ >> pCSV->zRow[n] = '\0'; >> bEol = -1; >> break; >> } >> /* look for line delimiter */ >> - while( pCSV->zRow[n] ){ n++; } >> - if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){ >> + while( pCSV->zRow[n] ){ >> + if( pCSV->zRow[n]=='\"' ){ >> + if( bQuotedCol ) { >> + if( pCSV->zRow[n+1]=='\"' ) { /* escaped */ >> + n++; >> + }else{ >> + bQuotedCol = 0; >> + } >> + }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){ >> + bQuotedCol = 1; >> + } >> + } >> + n++; >> + } >> + if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) && >> !bQuotedCol ){ >> pCSV->zRow[n-1] = '\n'; /* uniform line ending */ >> pCSV->zRow[n] = '\0'; >> bEol = -1; >> } >> } >> if( bShrink ){ >> - pCSV->zRow = realloc( pCSV->zRow, n+1 ); >> + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 ); >> pCSV->maxRow = n+1; >> } >> return bEol ? pCSV->zRow : 0; >> } >> >> @@ -332,17 +346,23 @@ >> do{ >> /* if it begins with a quote, assume it's a quoted col */ >> if( *s=='\"' ){ >> s++; /* skip quote */ >> pCSV->aCols[nCol] = s; /* save pointer for this col */ >> - /* TBD: handle escaped quotes "" */ >> /* find closing quote */ >> - s = strchr(s, '\"'); >> - if( !s ){ >> - /* no closing quote */ >> - pCSV->eof = -1; >> - return SQLITE_ERROR; >> + while( 1 ){ >> + s = strchr(s, '\"'); >> + if( !s ){ >> + /* no closing quote */ >> + pCSV->eof = -1; >> + return SQLITE_ERROR; >> + }else if ( *(s+1)=='\"' ){ >> + /* TBD: replace all escaped quotes by a single one */ >> + s+=2; >> + }else{ >> + break; >> + } >> } >> *s = '\0'; /* null terminate this col */ >> /* fall through and look for following ",\n" */ >> s++; >> }else{ >> @@ -608,11 +628,11 @@ >> *pzErr = sqlite3_mprintf("%s", aErrMsg[4]); >> sqlite3_free(zSql); >> csvRelease( pCSV ); >> return SQLITE_ERROR; >> } >> - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail); >> + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail); >> }else{ >> zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail); >> } >> sqlite3_free(zTmp); >> } >> >> >> On Sat, May 8, 2010 at 3:45 PM, gwenn <gwenn.k...@gmail.com> wrote: >> >>> While looking in csv1.test, I found a solution to the case when header >>> row contains spaces: just wrap the column name with double quotes. >>> >>> Index: ext/csv/csv.c >>> =================================================================== >>> --- ext/csv/csv.c >>> +++ ext/csv/csv.c >>> @@ -158,11 +158,11 @@ >>> pCSV->zRow[n] = '\0'; >>> bEol = -1; >>> } >>> } >>> if( bShrink ){ >>> - pCSV->zRow = realloc( pCSV->zRow, n+1 ); >>> + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 ); >>> pCSV->maxRow = n+1; >>> } >>> return bEol ? pCSV->zRow : 0; >>> } >>> >>> @@ -608,11 +608,11 @@ >>> *pzErr = sqlite3_mprintf("%s", aErrMsg[4]); >>> sqlite3_free(zSql); >>> csvRelease( pCSV ); >>> return SQLITE_ERROR; >>> } >>> - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail); >>> + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail); >>> }else{ >>> zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail); >>> } >>> sqlite3_free(zTmp); >>> } >>> >>> Index: ext/csv/csv1.test >>> =================================================================== >>> --- ext/csv/csv1.test >>> +++ ext/csv/csv1.test >>> @@ -23,10 +23,11 @@ >>> # >>> # csv-1.*: Creating/destroying csv tables. >>> # csv-2.*: Linear scans of csv data. >>> # csv-3.*: Test renaming an csv table. >>> # csv-4.*: CREATE errors >>> +# csv-5.*: Dirty header and long line. >>> # >>> >>> ifcapable !csv { >>> finish_test >>> return >>> @@ -36,10 +37,12 @@ >>> set test1csv [file join [file dirname [info script]] test1.csv] >>> # This file is delimited by '|' and has quoted fields. >>> set test2csv [file join [file dirname [info script]] test2.csv] >>> # This file is delimited by '|'. It does NOT have quoted fields. >>> set test3csv [file join [file dirname [info script]] test3.csv] >>> +# This file contains a dirty header and one long line. >>> +set test4csv [file join [file dirname [info script]] test4.csv] >>> >>> >>> >>> #---------------------------------------------------------------------------- >>> # Test cases csv-1.* test CREATE and DROP table statements. >>> # >>> >>> @@ -249,5 +252,14 @@ >>> catchsql " CREATE VIRTUAL TABLE t1 USING csv('foo') " >>> } {1 {Error opening CSV file: 'foo'}} >>> do_test csv-4.1.3 { >>> catchsql " CREATE VIRTUAL TABLE t1 USING csv(foo foo) " >>> } {1 {Error opening CSV file: 'foo foo'}} >>> + >>> >>> +#---------------------------------------------------------------------------- >>> +# Test cases csv-5.* test file with dirty header and long line. >>> +# >>> + >>> +do_test csv-5.1.1 { >>> + execsql " CREATE VIRTUAL TABLE t1 USING csv('$test4csv') " >>> + execsql " CREATE VIRTUAL TABLE t2 USING csv('$test4csv', ',', >>> USE_HEADER_ROW) " >>> +} {} >>> >>> ADDED ext/csv/test4.csv >>> col 1,col.2,col-3,col!4,c...@5,col;6,col%7,col*8,col=9,col'10 >>> >>> 123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789 >>> >>> I tried to handle double-quoted fields with embedded line breaks without >>> success but I am not stuck yet. >>> Regards. >>> >>> On Wed, Apr 21, 2010 at 8:22 PM, gwenn <gwenn.k...@gmail.com> wrote: >>> >>>> Thanks for this great extension. >>>> It works smoothly with 500Mo files. >>>> >>>> And it's a workaround to some shortcomings of the '.import' command: >>>> - no need to create a table before, >>>> - no need to delete the header row before/after, >>>> - no error if the number of columns is not homogeneous, >>>> - ... >>>> It's a nightmare to work with the CSV format but I have to. >>>> >>>> I made a quick and dirty fix to the USE_HEADER_ROW mode to replace >>>> whitespaces, slashes or hyphens by underscores. >>>> But I look for a better solution. Is there any way to make sure a string >>>> is a valid column name? >>>> >>>> Regards >>>> >>>> On Mon Apr 19 17:32:13 GMT 2010, Shane Harrelson <shane at sqlite.org> >>>> wrote: >>>> > Thanks for the report. The extension is still very a much a >>>> > work-in-progress and any feedback is greatly appreciated. >>>> > >>>> > -Shane >>>> >>>> > >>>> > >>>> > On Sun, Apr 18, 2010 at 12:51 PM, gwenn <gwenn.kahz at gmail.com> >>>> wrote: >>>> > > Hello, >>>> > > There is a little bug/typo in the csv extension when lines exceed >>>> 100 >>>> > > characters: >>>> > > *** glibc detected *** sqlite3: realloc(): invalid pointer: >>>> > > 0x0000000000ad1a78 *** >>>> > > ======= Backtrace: ========= >>>> > > /lib/libc.so.6[0x7f6dab009d16] >>>> > > /lib/libc.so.6(realloc+0x321)[0x7f6dab00fda1] >>>> > > ./libSqliteCsv.so[0x7f6da9ef9dbf] >>>> > > >>>> > > A possible patch is: >>>> > > --- /tmp/SQLite-d474195a997b9d94/ext/csv/csv.c 2009-11-05 >>>> 05:14:30.000000000 >>>> > > +0100 >>>> > > +++ csv.c 2010-04-18 18:48:04.000000000 +0200 >>>> > > @@ -160,7 +160,7 @@ >>>> > > } >>>> > > } >>>> > > if( bShrink ){ >>>> > > - pCSV->zRow = realloc( pCSV->zRow, n+1 ); >>>> > > + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 ); >>>> > > pCSV->maxRow = n+1; >>>> > > } >>>> > > return bEol ? pCSV->zRow : 0; >>>> > > >>>> > > Regards. >>>> > > _______________________________________________ >>>> > > sqlite-users mailing list >>>> > > sqlite-users at sqlite.org >>>> > > http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users >>>> > > >>>> >>> >>> >> > _______________________________________________ sqlite-users mailing list sqlite-users@sqlite.org http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users