Done!
Index: ext/csv/csv.c
===================================================================
--- ext/csv/csv.c
+++ ext/csv/csv.c
@@ -60,10 +60,11 @@
char *zRow; /* Buffer for current CSV row */
char cDelim; /* Character to use for delimiting columns
*/
int nCol; /* Number of columns in current row */
int maxCol; /* Size of aCols array */
char **aCols; /* Array of parsed columns */
+ int *aEscapedQuotes; /* Number of escaped quotes for each column
in aCols */
};
/*
** An CSV cursor object.
@@ -120,10 +121,11 @@
*/
static char *csv_getline( CSV *pCSV ){
int n = 0;
int bEol = 0;
int bShrink = 0;
+ int bQuotedCol = 0;
/* allocate initial row buffer */
if( pCSV->maxRow < 1 ){
pCSV->zRow = sqlite3_malloc( 100 );
if( pCSV->zRow ){
@@ -135,10 +137,13 @@
/* read until eol */
while( !bEol ){
/* grow row buffer as needed */
if( n+100>pCSV->maxRow ){
int newSize = pCSV->maxRow*2 + 100;
+ if( newSize>=pCSV->db->aLimit[SQLITE_LIMIT_LENGTH] ){
+ return 0;
+ }
char *p = sqlite3_realloc(pCSV->zRow, newSize);
if( !p ) return 0;
pCSV->maxRow = newSize;
pCSV->zRow = p;
bShrink = -1;
@@ -150,19 +155,32 @@
pCSV->zRow[n] = '\0';
bEol = -1;
break;
}
/* look for line delimiter */
- while( pCSV->zRow[n] ){ n++; }
- if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){
+ while( pCSV->zRow[n] ){
+ if( pCSV->zRow[n]=='\"' ){
+ if( bQuotedCol ) {
+ if( pCSV->zRow[n+1]=='\"' ) { /* escaped */
+ n++;
+ }else{
+ bQuotedCol = 0;
+ }
+ }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){
+ bQuotedCol = 1;
+ }
+ }
+ n++;
+ }
+ if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) &&
!bQuotedCol ){
pCSV->zRow[n-1] = '\n'; /* uniform line ending */
pCSV->zRow[n] = '\0';
bEol = -1;
}
}
if( bShrink ){
- pCSV->zRow = realloc( pCSV->zRow, n+1 );
+ pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
pCSV->maxRow = n+1;
}
return bEol ? pCSV->zRow : 0;
}
@@ -317,15 +335,16 @@
/* allocate initial space for the column pointers */
if( pCSV->maxCol < 1 ){
/* take a guess */
int maxCol = (int)(strlen(pCSV->zRow) / 5 + 1);
pCSV->aCols = (char **)sqlite3_malloc( sizeof(char*) * maxCol );
+ pCSV->aEscapedQuotes = (int *)sqlite3_malloc( sizeof(int) * maxCol );
if( pCSV->aCols ){
pCSV->maxCol = maxCol;
}
}
- if( !pCSV->aCols ) return SQLITE_NOMEM;
+ if( !pCSV->aCols || !pCSV->aEscapedQuotes ) return SQLITE_NOMEM;
/* add custom delim character */
zDelims[0] = pCSV->cDelim;
/* parse the zRow into individual columns */
@@ -332,23 +351,32 @@
do{
/* if it begins with a quote, assume it's a quoted col */
if( *s=='\"' ){
s++; /* skip quote */
pCSV->aCols[nCol] = s; /* save pointer for this col */
- /* TBD: handle escaped quotes "" */
+ pCSV->aEscapedQuotes[nCol] = 0;
/* find closing quote */
- s = strchr(s, '\"');
- if( !s ){
- /* no closing quote */
- pCSV->eof = -1;
- return SQLITE_ERROR;
+ while( 1 ){
+ s = strchr(s, '\"');
+ if( !s ){
+ /* no closing quote */
+ pCSV->eof = -1;
+ return SQLITE_ERROR;
+ }else if ( *(s+1)=='\"' ){
+ /* escaped quote */
+ pCSV->aEscapedQuotes[nCol]++;
+ s+=2;
+ }else{
+ break;
+ }
}
*s = '\0'; /* null terminate this col */
/* fall through and look for following ",\n" */
s++;
}else{
pCSV->aCols[nCol] = s; /* save pointer for this col */
+ pCSV->aEscapedQuotes[nCol] = 0;
}
s = strpbrk(s, zDelims);
if( !s ){
/* no col delimiter */
pCSV->eof = -1;
@@ -362,18 +390,27 @@
if( cDelim == '\n' ) break;
/* move to start of next col */
s++; /* skip delimiter */
if(nCol >= pCSV->maxCol ){
+ if( nCol>=pCSV->db->aLimit[SQLITE_LIMIT_COLUMN] ){
+ return SQLITE_ERROR;
+ }
/* we need to grow our col pointer array */
char **p = (char **)sqlite3_realloc( pCSV->aCols, sizeof(char*) *
(nCol+5) );
if( !p ){
/* out of memory */
return SQLITE_ERROR;
}
pCSV->maxCol = nCol + 5;
pCSV->aCols = p;
+ int *p1 = (int *)sqlite3_realloc( pCSV->aEscapedQuotes, sizeof(int) *
(nCol+5) );
+ if( !p1 ){
+ /* out of memory */
+ return SQLITE_ERROR;
+ }
+ pCSV->aEscapedQuotes = p1;
}
}while( *s );
pCSV->nCol = nCol;
@@ -402,13 +439,38 @@
CSV *pCSV = (CSV *)pVtabCursor->pVtab;
if( i<0 || i>=pCSV->nCol ){
sqlite3_result_null( ctx );
}else{
- char *col = pCSV->aCols[i];
+ const char *col = pCSV->aCols[i];
if( !col ){
sqlite3_result_null( ctx );
+ }else if( pCSV->aEscapedQuotes[i] ){
+ char *z;
+
+ int nByte = (int)(strlen(col) - pCSV->aEscapedQuotes[i]);
+ if( nByte>pCSV->db->aLimit[SQLITE_LIMIT_LENGTH] ){
+ sqlite3_result_error_toobig( ctx );
+ z = 0;
+ }else{
+ z = sqlite3_malloc( nByte );
+ if( !z ){
+ sqlite3_result_error_nomem( ctx );
+ }
+ }
+ if( z ){
+ int j,k;
+ for(j=0, k=0; col[j]; j++){
+ z[k++] = col[j];
+ if( col[j]=='\"' ){
+ /* unescape quote */
+ j++;
+ }
+ }
+ z[k] = 0;
+ sqlite3_result_text( ctx, z, k, sqlite3_free );
+ }
}else{
sqlite3_result_text( ctx, col, -1, SQLITE_TRANSIENT );
}
}
@@ -473,10 +535,11 @@
/* finalize any prepared statements here */
csv_close( pCSV );
if( pCSV->zRow ) sqlite3_free( pCSV->zRow );
if( pCSV->aCols ) sqlite3_free( pCSV->aCols );
+ if( pCSV->aEscapedQuotes ) sqlite3_free( pCSV->aEscapedQuotes );
sqlite3_free( pCSV );
}
return 0;
}
@@ -539,10 +602,11 @@
return SQLITE_NOMEM;
}
/* intialize virtual table object */
memset(pCSV, 0, sizeof(CSV)+nDb+nName+nFile+3);
+ pCSV->db = db;
pCSV->nBusy = 1;
pCSV->base.pModule = &csvModule;
pCSV->cDelim = cDelim;
pCSV->zDb = (char *)&pCSV[1];
pCSV->zName = &pCSV->zDb[nDb+1];
@@ -608,11 +672,11 @@
*pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
sqlite3_free(zSql);
csvRelease( pCSV );
return SQLITE_ERROR;
}
- zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
+ zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
}else{
zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
}
sqlite3_free(zTmp);
}
I'll add some testcases...
On Sat, May 8, 2010 at 9:44 PM, gwenn <[email protected]> wrote:
> Ok,
> I've just added support to embedded new lines and partial support to
> escaped double-quotes.
> By partial support, I mean they are not unescaped yet...
>
> Index: ext/csv/csv.c
> ===================================================================
> --- ext/csv/csv.c
> +++ ext/csv/csv.c
> @@ -120,10 +120,11 @@
> */
> static char *csv_getline( CSV *pCSV ){
> int n = 0;
> int bEol = 0;
> int bShrink = 0;
> + int bQuotedCol = 0;
>
> /* allocate initial row buffer */
> if( pCSV->maxRow < 1 ){
> pCSV->zRow = sqlite3_malloc( 100 );
> if( pCSV->zRow ){
> @@ -150,19 +151,32 @@
> pCSV->zRow[n] = '\0';
> bEol = -1;
> break;
> }
> /* look for line delimiter */
> - while( pCSV->zRow[n] ){ n++; }
> - if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){
> + while( pCSV->zRow[n] ){
> + if( pCSV->zRow[n]=='\"' ){
> + if( bQuotedCol ) {
> + if( pCSV->zRow[n+1]=='\"' ) { /* escaped */
> + n++;
> + }else{
> + bQuotedCol = 0;
> + }
> + }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){
> + bQuotedCol = 1;
> + }
> + }
> + n++;
> + }
> + if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) &&
> !bQuotedCol ){
> pCSV->zRow[n-1] = '\n'; /* uniform line ending */
> pCSV->zRow[n] = '\0';
> bEol = -1;
> }
> }
> if( bShrink ){
> - pCSV->zRow = realloc( pCSV->zRow, n+1 );
> + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
> pCSV->maxRow = n+1;
> }
> return bEol ? pCSV->zRow : 0;
> }
>
> @@ -332,17 +346,23 @@
> do{
> /* if it begins with a quote, assume it's a quoted col */
> if( *s=='\"' ){
> s++; /* skip quote */
> pCSV->aCols[nCol] = s; /* save pointer for this col */
> - /* TBD: handle escaped quotes "" */
> /* find closing quote */
> - s = strchr(s, '\"');
> - if( !s ){
> - /* no closing quote */
> - pCSV->eof = -1;
> - return SQLITE_ERROR;
> + while( 1 ){
> + s = strchr(s, '\"');
> + if( !s ){
> + /* no closing quote */
> + pCSV->eof = -1;
> + return SQLITE_ERROR;
> + }else if ( *(s+1)=='\"' ){
> + /* TBD: replace all escaped quotes by a single one */
> + s+=2;
> + }else{
> + break;
> + }
> }
> *s = '\0'; /* null terminate this col */
> /* fall through and look for following ",\n" */
> s++;
> }else{
> @@ -608,11 +628,11 @@
> *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
> sqlite3_free(zSql);
> csvRelease( pCSV );
> return SQLITE_ERROR;
> }
> - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
> + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
> }else{
> zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
> }
> sqlite3_free(zTmp);
> }
>
>
> On Sat, May 8, 2010 at 3:45 PM, gwenn <[email protected]> wrote:
>
>> While looking in csv1.test, I found a solution to the case when header row
>> contains spaces: just wrap the column name with double quotes.
>>
>> Index: ext/csv/csv.c
>> ===================================================================
>> --- ext/csv/csv.c
>> +++ ext/csv/csv.c
>> @@ -158,11 +158,11 @@
>> pCSV->zRow[n] = '\0';
>> bEol = -1;
>> }
>> }
>> if( bShrink ){
>> - pCSV->zRow = realloc( pCSV->zRow, n+1 );
>> + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>> pCSV->maxRow = n+1;
>> }
>> return bEol ? pCSV->zRow : 0;
>> }
>>
>> @@ -608,11 +608,11 @@
>> *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
>> sqlite3_free(zSql);
>> csvRelease( pCSV );
>> return SQLITE_ERROR;
>> }
>> - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
>> + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
>> }else{
>> zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
>> }
>> sqlite3_free(zTmp);
>> }
>>
>> Index: ext/csv/csv1.test
>> ===================================================================
>> --- ext/csv/csv1.test
>> +++ ext/csv/csv1.test
>> @@ -23,10 +23,11 @@
>> #
>> # csv-1.*: Creating/destroying csv tables.
>> # csv-2.*: Linear scans of csv data.
>> # csv-3.*: Test renaming an csv table.
>> # csv-4.*: CREATE errors
>> +# csv-5.*: Dirty header and long line.
>> #
>>
>> ifcapable !csv {
>> finish_test
>> return
>> @@ -36,10 +37,12 @@
>> set test1csv [file join [file dirname [info script]] test1.csv]
>> # This file is delimited by '|' and has quoted fields.
>> set test2csv [file join [file dirname [info script]] test2.csv]
>> # This file is delimited by '|'. It does NOT have quoted fields.
>> set test3csv [file join [file dirname [info script]] test3.csv]
>> +# This file contains a dirty header and one long line.
>> +set test4csv [file join [file dirname [info script]] test4.csv]
>>
>>
>>
>> #----------------------------------------------------------------------------
>> # Test cases csv-1.* test CREATE and DROP table statements.
>> #
>>
>> @@ -249,5 +252,14 @@
>> catchsql " CREATE VIRTUAL TABLE t1 USING csv('foo') "
>> } {1 {Error opening CSV file: 'foo'}}
>> do_test csv-4.1.3 {
>> catchsql " CREATE VIRTUAL TABLE t1 USING csv(foo foo) "
>> } {1 {Error opening CSV file: 'foo foo'}}
>> +
>>
>> +#----------------------------------------------------------------------------
>> +# Test cases csv-5.* test file with dirty header and long line.
>> +#
>> +
>> +do_test csv-5.1.1 {
>> + execsql " CREATE VIRTUAL TABLE t1 USING csv('$test4csv') "
>> + execsql " CREATE VIRTUAL TABLE t2 USING csv('$test4csv', ',',
>> USE_HEADER_ROW) "
>> +} {}
>>
>> ADDED ext/csv/test4.csv
>> col 1,col.2,col-3,col!4,c...@5,col;6,col%7,col*8,col=9,col'10
>>
>> 123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789
>>
>> I tried to handle double-quoted fields with embedded line breaks without
>> success but I am not stuck yet.
>> Regards.
>>
>> On Wed, Apr 21, 2010 at 8:22 PM, gwenn <[email protected]> wrote:
>>
>>> Thanks for this great extension.
>>> It works smoothly with 500Mo files.
>>>
>>> And it's a workaround to some shortcomings of the '.import' command:
>>> - no need to create a table before,
>>> - no need to delete the header row before/after,
>>> - no error if the number of columns is not homogeneous,
>>> - ...
>>> It's a nightmare to work with the CSV format but I have to.
>>>
>>> I made a quick and dirty fix to the USE_HEADER_ROW mode to replace
>>> whitespaces, slashes or hyphens by underscores.
>>> But I look for a better solution. Is there any way to make sure a string
>>> is a valid column name?
>>>
>>> Regards
>>>
>>> On Mon Apr 19 17:32:13 GMT 2010, Shane Harrelson <shane at sqlite.org>
>>> wrote:
>>> > Thanks for the report. The extension is still very a much a
>>> > work-in-progress and any feedback is greatly appreciated.
>>> >
>>> > -Shane
>>>
>>> >
>>> >
>>> > On Sun, Apr 18, 2010 at 12:51 PM, gwenn <gwenn.kahz at gmail.com>
>>> wrote:
>>> > > Hello,
>>> > > There is a little bug/typo in the csv extension when lines exceed 100
>>> > > characters:
>>> > > *** glibc detected *** sqlite3: realloc(): invalid pointer:
>>> > > 0x0000000000ad1a78 ***
>>> > > ======= Backtrace: =========
>>> > > /lib/libc.so.6[0x7f6dab009d16]
>>> > > /lib/libc.so.6(realloc+0x321)[0x7f6dab00fda1]
>>> > > ./libSqliteCsv.so[0x7f6da9ef9dbf]
>>> > >
>>> > > A possible patch is:
>>> > > --- /tmp/SQLite-d474195a997b9d94/ext/csv/csv.c 2009-11-05
>>> 05:14:30.000000000
>>> > > +0100
>>> > > +++ csv.c 2010-04-18 18:48:04.000000000 +0200
>>> > > @@ -160,7 +160,7 @@
>>> > > }
>>> > > }
>>> > > if( bShrink ){
>>> > > - pCSV->zRow = realloc( pCSV->zRow, n+1 );
>>> > > + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>>> > > pCSV->maxRow = n+1;
>>> > > }
>>> > > return bEol ? pCSV->zRow : 0;
>>> > >
>>> > > Regards.
>>> > > _______________________________________________
>>> > > sqlite-users mailing list
>>> > > sqlite-users at sqlite.org
>>> > > http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users
>>> > >
>>>
>>
>>
>
_______________________________________________
sqlite-users mailing list
[email protected]
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users