Here is some tests:
--- csv1.test
+++ csv1.test
@@ -23,10 +23,11 @@
#
# csv-1.*: Creating/destroying csv tables.
# csv-2.*: Linear scans of csv data.
# csv-3.*: Test renaming an csv table.
# csv-4.*: CREATE errors
+# csv-5.*: Dirty header, long line, escaped quotes, escaped newlines.
#
ifcapable !csv {
finish_test
return
@@ -36,10 +37,13 @@
set test1csv [file join [file dirname [info script]] test1.csv]
# This file is delimited by '|' and has quoted fields.
set test2csv [file join [file dirname [info script]] test2.csv]
# This file is delimited by '|'. It does NOT have quoted fields.
set test3csv [file join [file dirname [info script]] test3.csv]
+# This file contains a dirty header, one long line, escaped quotes, escaped
+# new lines.
+set test4csv [file join [file dirname [info script]] test4.csv]
#----------------------------------------------------------------------------
# Test cases csv-1.* test CREATE and DROP table statements.
#
@@ -249,5 +253,40 @@
catchsql " CREATE VIRTUAL TABLE t1 USING csv('foo') "
} {1 {Error opening CSV file: 'foo'}}
do_test csv-4.1.3 {
catchsql " CREATE VIRTUAL TABLE t1 USING csv(foo foo) "
} {1 {Error opening CSV file: 'foo foo'}}
+
+#----------------------------------------------------------------------------
+# Test cases csv-5.* test file with dirty header and long line.
+#
+
+do_test csv-5.1.1 {
+ execsql " CREATE VIRTUAL TABLE t1 USING csv('$test4csv') "
+ execsql " CREATE VIRTUAL TABLE t2 USING csv('$test4csv', ',',
USE_HEADER_ROW) "
+} {}
+do_test csv-5.1.2 {
+ execsql {
+ SELECT col1 FROM t1 limit 1 offset 1;
+ }
+} {123456789}
+do_test csv-5.1.3 {
+ execsql {
+ SELECT * FROM t1 limit 1 offset 3;
+ }
+} {{123456789
+} {
+} {} {123456789
+} {1234\\567'89
+} {123456"789
+} {123""456789
+} 1234\"5678\"9 123456789\" {}}
+do_test csv-5.1.4 {
+ execsql {
+ SELECT col1,col2,col3 FROM t1 limit 1 offset 4;
+ }
+} {{} ' {}}
+do_test csv-5.1.5 {
+ execsql {
+ SELECT col1 FROM t1 limit 1 offset 5;
+ }
+} {'}
And the test file (test4.csv):
col 1,col.2,col-3,col!4,c...@5,col;6,col%7,col*8,col=9,col'10
123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
"123456789
","
","","123456789
","1234\\567'89
","123456""789
","123""""456789
",1234"5678"9,123456789",""
"",',
'
On Thu, May 13, 2010 at 9:28 PM, gwenn <[email protected]> wrote:
> Done!
>
> Index: ext/csv/csv.c
> ===================================================================
> --- ext/csv/csv.c
> +++ ext/csv/csv.c
> @@ -60,10 +60,11 @@
> char *zRow; /* Buffer for current CSV row */
> char cDelim; /* Character to use for delimiting columns
> */
> int nCol; /* Number of columns in current row */
> int maxCol; /* Size of aCols array */
> char **aCols; /* Array of parsed columns */
> + int *aEscapedQuotes; /* Number of escaped quotes for each column
> in aCols */
> };
>
>
> /*
> ** An CSV cursor object.
> @@ -120,10 +121,11 @@
> */
> static char *csv_getline( CSV *pCSV ){
> int n = 0;
> int bEol = 0;
> int bShrink = 0;
> + int bQuotedCol = 0;
>
> /* allocate initial row buffer */
> if( pCSV->maxRow < 1 ){
> pCSV->zRow = sqlite3_malloc( 100 );
> if( pCSV->zRow ){
> @@ -135,10 +137,13 @@
> /* read until eol */
> while( !bEol ){
> /* grow row buffer as needed */
> if( n+100>pCSV->maxRow ){
> int newSize = pCSV->maxRow*2 + 100;
> + if( newSize>=pCSV->db->aLimit[SQLITE_LIMIT_LENGTH] ){
> + return 0;
> + }
> char *p = sqlite3_realloc(pCSV->zRow, newSize);
> if( !p ) return 0;
> pCSV->maxRow = newSize;
> pCSV->zRow = p;
> bShrink = -1;
> @@ -150,19 +155,32 @@
> pCSV->zRow[n] = '\0';
> bEol = -1;
> break;
> }
> /* look for line delimiter */
> - while( pCSV->zRow[n] ){ n++; }
> - if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){
> + while( pCSV->zRow[n] ){
> + if( pCSV->zRow[n]=='\"' ){
> + if( bQuotedCol ) {
> + if( pCSV->zRow[n+1]=='\"' ) { /* escaped */
> + n++;
> + }else{
> + bQuotedCol = 0;
> + }
> + }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){
> + bQuotedCol = 1;
> + }
> + }
> + n++;
> + }
> + if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) &&
> !bQuotedCol ){
> pCSV->zRow[n-1] = '\n'; /* uniform line ending */
> pCSV->zRow[n] = '\0';
> bEol = -1;
> }
> }
> if( bShrink ){
> - pCSV->zRow = realloc( pCSV->zRow, n+1 );
> + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
> pCSV->maxRow = n+1;
> }
> return bEol ? pCSV->zRow : 0;
> }
>
> @@ -317,15 +335,16 @@
> /* allocate initial space for the column pointers */
> if( pCSV->maxCol < 1 ){
> /* take a guess */
> int maxCol = (int)(strlen(pCSV->zRow) / 5 + 1);
> pCSV->aCols = (char **)sqlite3_malloc( sizeof(char*) * maxCol );
> + pCSV->aEscapedQuotes = (int *)sqlite3_malloc( sizeof(int) * maxCol );
> if( pCSV->aCols ){
> pCSV->maxCol = maxCol;
> }
> }
> - if( !pCSV->aCols ) return SQLITE_NOMEM;
> + if( !pCSV->aCols || !pCSV->aEscapedQuotes ) return SQLITE_NOMEM;
>
> /* add custom delim character */
> zDelims[0] = pCSV->cDelim;
>
> /* parse the zRow into individual columns */
> @@ -332,23 +351,32 @@
> do{
> /* if it begins with a quote, assume it's a quoted col */
> if( *s=='\"' ){
> s++; /* skip quote */
> pCSV->aCols[nCol] = s; /* save pointer for this col */
> - /* TBD: handle escaped quotes "" */
> + pCSV->aEscapedQuotes[nCol] = 0;
> /* find closing quote */
> - s = strchr(s, '\"');
> - if( !s ){
> - /* no closing quote */
> - pCSV->eof = -1;
> - return SQLITE_ERROR;
> + while( 1 ){
> + s = strchr(s, '\"');
> + if( !s ){
> + /* no closing quote */
> + pCSV->eof = -1;
> + return SQLITE_ERROR;
> + }else if ( *(s+1)=='\"' ){
> + /* escaped quote */
> + pCSV->aEscapedQuotes[nCol]++;
> + s+=2;
> + }else{
> + break;
> + }
> }
> *s = '\0'; /* null terminate this col */
> /* fall through and look for following ",\n" */
> s++;
> }else{
> pCSV->aCols[nCol] = s; /* save pointer for this col */
> + pCSV->aEscapedQuotes[nCol] = 0;
> }
> s = strpbrk(s, zDelims);
> if( !s ){
> /* no col delimiter */
> pCSV->eof = -1;
> @@ -362,18 +390,27 @@
> if( cDelim == '\n' ) break;
> /* move to start of next col */
> s++; /* skip delimiter */
>
> if(nCol >= pCSV->maxCol ){
> + if( nCol>=pCSV->db->aLimit[SQLITE_LIMIT_COLUMN] ){
> + return SQLITE_ERROR;
> + }
> /* we need to grow our col pointer array */
> char **p = (char **)sqlite3_realloc( pCSV->aCols, sizeof(char*) *
> (nCol+5) );
> if( !p ){
> /* out of memory */
> return SQLITE_ERROR;
> }
> pCSV->maxCol = nCol + 5;
> pCSV->aCols = p;
> + int *p1 = (int *)sqlite3_realloc( pCSV->aEscapedQuotes, sizeof(int)
> * (nCol+5) );
> + if( !p1 ){
> + /* out of memory */
> + return SQLITE_ERROR;
> + }
> + pCSV->aEscapedQuotes = p1;
> }
>
> }while( *s );
>
> pCSV->nCol = nCol;
> @@ -402,13 +439,38 @@
> CSV *pCSV = (CSV *)pVtabCursor->pVtab;
>
> if( i<0 || i>=pCSV->nCol ){
> sqlite3_result_null( ctx );
> }else{
> - char *col = pCSV->aCols[i];
> + const char *col = pCSV->aCols[i];
> if( !col ){
> sqlite3_result_null( ctx );
> + }else if( pCSV->aEscapedQuotes[i] ){
> + char *z;
> +
> + int nByte = (int)(strlen(col) - pCSV->aEscapedQuotes[i]);
> + if( nByte>pCSV->db->aLimit[SQLITE_LIMIT_LENGTH] ){
> + sqlite3_result_error_toobig( ctx );
> + z = 0;
> + }else{
> + z = sqlite3_malloc( nByte );
> + if( !z ){
> + sqlite3_result_error_nomem( ctx );
> + }
> + }
> + if( z ){
> + int j,k;
> + for(j=0, k=0; col[j]; j++){
> + z[k++] = col[j];
> + if( col[j]=='\"' ){
> + /* unescape quote */
> + j++;
> + }
> + }
> + z[k] = 0;
> + sqlite3_result_text( ctx, z, k, sqlite3_free );
> + }
> }else{
> sqlite3_result_text( ctx, col, -1, SQLITE_TRANSIENT );
> }
> }
>
> @@ -473,10 +535,11 @@
> /* finalize any prepared statements here */
>
> csv_close( pCSV );
> if( pCSV->zRow ) sqlite3_free( pCSV->zRow );
> if( pCSV->aCols ) sqlite3_free( pCSV->aCols );
> + if( pCSV->aEscapedQuotes ) sqlite3_free( pCSV->aEscapedQuotes );
> sqlite3_free( pCSV );
> }
> return 0;
> }
>
> @@ -539,10 +602,11 @@
> return SQLITE_NOMEM;
> }
>
> /* intialize virtual table object */
> memset(pCSV, 0, sizeof(CSV)+nDb+nName+nFile+3);
> + pCSV->db = db;
> pCSV->nBusy = 1;
> pCSV->base.pModule = &csvModule;
> pCSV->cDelim = cDelim;
> pCSV->zDb = (char *)&pCSV[1];
> pCSV->zName = &pCSV->zDb[nDb+1];
> @@ -608,11 +672,11 @@
> *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
> sqlite3_free(zSql);
> csvRelease( pCSV );
> return SQLITE_ERROR;
> }
> - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
> + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
> }else{
> zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
> }
> sqlite3_free(zTmp);
> }
>
> I'll add some testcases...
>
>
> On Sat, May 8, 2010 at 9:44 PM, gwenn <[email protected]> wrote:
>
>> Ok,
>> I've just added support to embedded new lines and partial support to
>> escaped double-quotes.
>> By partial support, I mean they are not unescaped yet...
>>
>> Index: ext/csv/csv.c
>> ===================================================================
>> --- ext/csv/csv.c
>> +++ ext/csv/csv.c
>> @@ -120,10 +120,11 @@
>> */
>> static char *csv_getline( CSV *pCSV ){
>> int n = 0;
>> int bEol = 0;
>> int bShrink = 0;
>> + int bQuotedCol = 0;
>>
>> /* allocate initial row buffer */
>> if( pCSV->maxRow < 1 ){
>> pCSV->zRow = sqlite3_malloc( 100 );
>> if( pCSV->zRow ){
>> @@ -150,19 +151,32 @@
>> pCSV->zRow[n] = '\0';
>> bEol = -1;
>> break;
>> }
>> /* look for line delimiter */
>> - while( pCSV->zRow[n] ){ n++; }
>> - if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){
>> + while( pCSV->zRow[n] ){
>> + if( pCSV->zRow[n]=='\"' ){
>> + if( bQuotedCol ) {
>> + if( pCSV->zRow[n+1]=='\"' ) { /* escaped */
>> + n++;
>> + }else{
>> + bQuotedCol = 0;
>> + }
>> + }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){
>> + bQuotedCol = 1;
>> + }
>> + }
>> + n++;
>> + }
>> + if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) &&
>> !bQuotedCol ){
>> pCSV->zRow[n-1] = '\n'; /* uniform line ending */
>> pCSV->zRow[n] = '\0';
>> bEol = -1;
>> }
>> }
>> if( bShrink ){
>> - pCSV->zRow = realloc( pCSV->zRow, n+1 );
>> + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>> pCSV->maxRow = n+1;
>> }
>> return bEol ? pCSV->zRow : 0;
>> }
>>
>> @@ -332,17 +346,23 @@
>> do{
>> /* if it begins with a quote, assume it's a quoted col */
>> if( *s=='\"' ){
>> s++; /* skip quote */
>> pCSV->aCols[nCol] = s; /* save pointer for this col */
>> - /* TBD: handle escaped quotes "" */
>> /* find closing quote */
>> - s = strchr(s, '\"');
>> - if( !s ){
>> - /* no closing quote */
>> - pCSV->eof = -1;
>> - return SQLITE_ERROR;
>> + while( 1 ){
>> + s = strchr(s, '\"');
>> + if( !s ){
>> + /* no closing quote */
>> + pCSV->eof = -1;
>> + return SQLITE_ERROR;
>> + }else if ( *(s+1)=='\"' ){
>> + /* TBD: replace all escaped quotes by a single one */
>> + s+=2;
>> + }else{
>> + break;
>> + }
>> }
>> *s = '\0'; /* null terminate this col */
>> /* fall through and look for following ",\n" */
>> s++;
>> }else{
>> @@ -608,11 +628,11 @@
>> *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
>> sqlite3_free(zSql);
>> csvRelease( pCSV );
>> return SQLITE_ERROR;
>> }
>> - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
>> + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
>> }else{
>> zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
>> }
>> sqlite3_free(zTmp);
>> }
>>
>>
>> On Sat, May 8, 2010 at 3:45 PM, gwenn <[email protected]> wrote:
>>
>>> While looking in csv1.test, I found a solution to the case when header
>>> row contains spaces: just wrap the column name with double quotes.
>>>
>>> Index: ext/csv/csv.c
>>> ===================================================================
>>> --- ext/csv/csv.c
>>> +++ ext/csv/csv.c
>>> @@ -158,11 +158,11 @@
>>> pCSV->zRow[n] = '\0';
>>> bEol = -1;
>>> }
>>> }
>>> if( bShrink ){
>>> - pCSV->zRow = realloc( pCSV->zRow, n+1 );
>>> + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>>> pCSV->maxRow = n+1;
>>> }
>>> return bEol ? pCSV->zRow : 0;
>>> }
>>>
>>> @@ -608,11 +608,11 @@
>>> *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
>>> sqlite3_free(zSql);
>>> csvRelease( pCSV );
>>> return SQLITE_ERROR;
>>> }
>>> - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
>>> + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
>>> }else{
>>> zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
>>> }
>>> sqlite3_free(zTmp);
>>> }
>>>
>>> Index: ext/csv/csv1.test
>>> ===================================================================
>>> --- ext/csv/csv1.test
>>> +++ ext/csv/csv1.test
>>> @@ -23,10 +23,11 @@
>>> #
>>> # csv-1.*: Creating/destroying csv tables.
>>> # csv-2.*: Linear scans of csv data.
>>> # csv-3.*: Test renaming an csv table.
>>> # csv-4.*: CREATE errors
>>> +# csv-5.*: Dirty header and long line.
>>> #
>>>
>>> ifcapable !csv {
>>> finish_test
>>> return
>>> @@ -36,10 +37,12 @@
>>> set test1csv [file join [file dirname [info script]] test1.csv]
>>> # This file is delimited by '|' and has quoted fields.
>>> set test2csv [file join [file dirname [info script]] test2.csv]
>>> # This file is delimited by '|'. It does NOT have quoted fields.
>>> set test3csv [file join [file dirname [info script]] test3.csv]
>>> +# This file contains a dirty header and one long line.
>>> +set test4csv [file join [file dirname [info script]] test4.csv]
>>>
>>>
>>>
>>> #----------------------------------------------------------------------------
>>> # Test cases csv-1.* test CREATE and DROP table statements.
>>> #
>>>
>>> @@ -249,5 +252,14 @@
>>> catchsql " CREATE VIRTUAL TABLE t1 USING csv('foo') "
>>> } {1 {Error opening CSV file: 'foo'}}
>>> do_test csv-4.1.3 {
>>> catchsql " CREATE VIRTUAL TABLE t1 USING csv(foo foo) "
>>> } {1 {Error opening CSV file: 'foo foo'}}
>>> +
>>>
>>> +#----------------------------------------------------------------------------
>>> +# Test cases csv-5.* test file with dirty header and long line.
>>> +#
>>> +
>>> +do_test csv-5.1.1 {
>>> + execsql " CREATE VIRTUAL TABLE t1 USING csv('$test4csv') "
>>> + execsql " CREATE VIRTUAL TABLE t2 USING csv('$test4csv', ',',
>>> USE_HEADER_ROW) "
>>> +} {}
>>>
>>> ADDED ext/csv/test4.csv
>>> col 1,col.2,col-3,col!4,c...@5,col;6,col%7,col*8,col=9,col'10
>>>
>>> 123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789
>>>
>>> I tried to handle double-quoted fields with embedded line breaks without
>>> success but I am not stuck yet.
>>> Regards.
>>>
>>> On Wed, Apr 21, 2010 at 8:22 PM, gwenn <[email protected]> wrote:
>>>
>>>> Thanks for this great extension.
>>>> It works smoothly with 500Mo files.
>>>>
>>>> And it's a workaround to some shortcomings of the '.import' command:
>>>> - no need to create a table before,
>>>> - no need to delete the header row before/after,
>>>> - no error if the number of columns is not homogeneous,
>>>> - ...
>>>> It's a nightmare to work with the CSV format but I have to.
>>>>
>>>> I made a quick and dirty fix to the USE_HEADER_ROW mode to replace
>>>> whitespaces, slashes or hyphens by underscores.
>>>> But I look for a better solution. Is there any way to make sure a string
>>>> is a valid column name?
>>>>
>>>> Regards
>>>>
>>>> On Mon Apr 19 17:32:13 GMT 2010, Shane Harrelson <shane at sqlite.org>
>>>> wrote:
>>>> > Thanks for the report. The extension is still very a much a
>>>> > work-in-progress and any feedback is greatly appreciated.
>>>> >
>>>> > -Shane
>>>>
>>>> >
>>>> >
>>>> > On Sun, Apr 18, 2010 at 12:51 PM, gwenn <gwenn.kahz at gmail.com>
>>>> wrote:
>>>> > > Hello,
>>>> > > There is a little bug/typo in the csv extension when lines exceed
>>>> 100
>>>> > > characters:
>>>> > > *** glibc detected *** sqlite3: realloc(): invalid pointer:
>>>> > > 0x0000000000ad1a78 ***
>>>> > > ======= Backtrace: =========
>>>> > > /lib/libc.so.6[0x7f6dab009d16]
>>>> > > /lib/libc.so.6(realloc+0x321)[0x7f6dab00fda1]
>>>> > > ./libSqliteCsv.so[0x7f6da9ef9dbf]
>>>> > >
>>>> > > A possible patch is:
>>>> > > --- /tmp/SQLite-d474195a997b9d94/ext/csv/csv.c 2009-11-05
>>>> 05:14:30.000000000
>>>> > > +0100
>>>> > > +++ csv.c 2010-04-18 18:48:04.000000000 +0200
>>>> > > @@ -160,7 +160,7 @@
>>>> > > }
>>>> > > }
>>>> > > if( bShrink ){
>>>> > > - pCSV->zRow = realloc( pCSV->zRow, n+1 );
>>>> > > + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>>>> > > pCSV->maxRow = n+1;
>>>> > > }
>>>> > > return bEol ? pCSV->zRow : 0;
>>>> > >
>>>> > > Regards.
>>>> > > _______________________________________________
>>>> > > sqlite-users mailing list
>>>> > > sqlite-users at sqlite.org
>>>> > > http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users
>>>> > >
>>>>
>>>
>>>
>>
>
_______________________________________________
sqlite-users mailing list
[email protected]
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users