Ok,
I've just added support to embedded new lines and partial support to escaped
double-quotes.
By partial support, I mean they are not unescaped yet...
Index: ext/csv/csv.c
===================================================================
--- ext/csv/csv.c
+++ ext/csv/csv.c
@@ -120,10 +120,11 @@
*/
static char *csv_getline( CSV *pCSV ){
int n = 0;
int bEol = 0;
int bShrink = 0;
+ int bQuotedCol = 0;
/* allocate initial row buffer */
if( pCSV->maxRow < 1 ){
pCSV->zRow = sqlite3_malloc( 100 );
if( pCSV->zRow ){
@@ -150,19 +151,32 @@
pCSV->zRow[n] = '\0';
bEol = -1;
break;
}
/* look for line delimiter */
- while( pCSV->zRow[n] ){ n++; }
- if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){
+ while( pCSV->zRow[n] ){
+ if( pCSV->zRow[n]=='\"' ){
+ if( bQuotedCol ) {
+ if( pCSV->zRow[n+1]=='\"' ) { /* escaped */
+ n++;
+ }else{
+ bQuotedCol = 0;
+ }
+ }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){
+ bQuotedCol = 1;
+ }
+ }
+ n++;
+ }
+ if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) &&
!bQuotedCol ){
pCSV->zRow[n-1] = '\n'; /* uniform line ending */
pCSV->zRow[n] = '\0';
bEol = -1;
}
}
if( bShrink ){
- pCSV->zRow = realloc( pCSV->zRow, n+1 );
+ pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
pCSV->maxRow = n+1;
}
return bEol ? pCSV->zRow : 0;
}
@@ -332,17 +346,23 @@
do{
/* if it begins with a quote, assume it's a quoted col */
if( *s=='\"' ){
s++; /* skip quote */
pCSV->aCols[nCol] = s; /* save pointer for this col */
- /* TBD: handle escaped quotes "" */
/* find closing quote */
- s = strchr(s, '\"');
- if( !s ){
- /* no closing quote */
- pCSV->eof = -1;
- return SQLITE_ERROR;
+ while( 1 ){
+ s = strchr(s, '\"');
+ if( !s ){
+ /* no closing quote */
+ pCSV->eof = -1;
+ return SQLITE_ERROR;
+ }else if ( *(s+1)=='\"' ){
+ /* TBD: replace all escaped quotes by a single one */
+ s+=2;
+ }else{
+ break;
+ }
}
*s = '\0'; /* null terminate this col */
/* fall through and look for following ",\n" */
s++;
}else{
@@ -608,11 +628,11 @@
*pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
sqlite3_free(zSql);
csvRelease( pCSV );
return SQLITE_ERROR;
}
- zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
+ zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
}else{
zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
}
sqlite3_free(zTmp);
}
On Sat, May 8, 2010 at 3:45 PM, gwenn <[email protected]> wrote:
> While looking in csv1.test, I found a solution to the case when header row
> contains spaces: just wrap the column name with double quotes.
>
> Index: ext/csv/csv.c
> ===================================================================
> --- ext/csv/csv.c
> +++ ext/csv/csv.c
> @@ -158,11 +158,11 @@
> pCSV->zRow[n] = '\0';
> bEol = -1;
> }
> }
> if( bShrink ){
> - pCSV->zRow = realloc( pCSV->zRow, n+1 );
> + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
> pCSV->maxRow = n+1;
> }
> return bEol ? pCSV->zRow : 0;
> }
>
> @@ -608,11 +608,11 @@
> *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
> sqlite3_free(zSql);
> csvRelease( pCSV );
> return SQLITE_ERROR;
> }
> - zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
> + zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
> }else{
> zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
> }
> sqlite3_free(zTmp);
> }
>
> Index: ext/csv/csv1.test
> ===================================================================
> --- ext/csv/csv1.test
> +++ ext/csv/csv1.test
> @@ -23,10 +23,11 @@
> #
> # csv-1.*: Creating/destroying csv tables.
> # csv-2.*: Linear scans of csv data.
> # csv-3.*: Test renaming an csv table.
> # csv-4.*: CREATE errors
> +# csv-5.*: Dirty header and long line.
> #
>
> ifcapable !csv {
> finish_test
> return
> @@ -36,10 +37,12 @@
> set test1csv [file join [file dirname [info script]] test1.csv]
> # This file is delimited by '|' and has quoted fields.
> set test2csv [file join [file dirname [info script]] test2.csv]
> # This file is delimited by '|'. It does NOT have quoted fields.
> set test3csv [file join [file dirname [info script]] test3.csv]
> +# This file contains a dirty header and one long line.
> +set test4csv [file join [file dirname [info script]] test4.csv]
>
>
> #----------------------------------------------------------------------------
> # Test cases csv-1.* test CREATE and DROP table statements.
> #
>
> @@ -249,5 +252,14 @@
> catchsql " CREATE VIRTUAL TABLE t1 USING csv('foo') "
> } {1 {Error opening CSV file: 'foo'}}
> do_test csv-4.1.3 {
> catchsql " CREATE VIRTUAL TABLE t1 USING csv(foo foo) "
> } {1 {Error opening CSV file: 'foo foo'}}
> +
>
> +#----------------------------------------------------------------------------
> +# Test cases csv-5.* test file with dirty header and long line.
> +#
> +
> +do_test csv-5.1.1 {
> + execsql " CREATE VIRTUAL TABLE t1 USING csv('$test4csv') "
> + execsql " CREATE VIRTUAL TABLE t2 USING csv('$test4csv', ',',
> USE_HEADER_ROW) "
> +} {}
>
> ADDED ext/csv/test4.csv
> col 1,col.2,col-3,col!4,c...@5,col;6,col%7,col*8,col=9,col'10
>
> 123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789
>
> I tried to handle double-quoted fields with embedded line breaks without
> success but I am not stuck yet.
> Regards.
>
> On Wed, Apr 21, 2010 at 8:22 PM, gwenn <[email protected]> wrote:
>
>> Thanks for this great extension.
>> It works smoothly with 500Mo files.
>>
>> And it's a workaround to some shortcomings of the '.import' command:
>> - no need to create a table before,
>> - no need to delete the header row before/after,
>> - no error if the number of columns is not homogeneous,
>> - ...
>> It's a nightmare to work with the CSV format but I have to.
>>
>> I made a quick and dirty fix to the USE_HEADER_ROW mode to replace
>> whitespaces, slashes or hyphens by underscores.
>> But I look for a better solution. Is there any way to make sure a string
>> is a valid column name?
>>
>> Regards
>>
>> On Mon Apr 19 17:32:13 GMT 2010, Shane Harrelson <shane at sqlite.org>
>> wrote:
>> > Thanks for the report. The extension is still very a much a
>> > work-in-progress and any feedback is greatly appreciated.
>> >
>> > -Shane
>>
>> >
>> >
>> > On Sun, Apr 18, 2010 at 12:51 PM, gwenn <gwenn.kahz at gmail.com>
>> wrote:
>> > > Hello,
>> > > There is a little bug/typo in the csv extension when lines exceed 100
>> > > characters:
>> > > *** glibc detected *** sqlite3: realloc(): invalid pointer:
>> > > 0x0000000000ad1a78 ***
>> > > ======= Backtrace: =========
>> > > /lib/libc.so.6[0x7f6dab009d16]
>> > > /lib/libc.so.6(realloc+0x321)[0x7f6dab00fda1]
>> > > ./libSqliteCsv.so[0x7f6da9ef9dbf]
>> > >
>> > > A possible patch is:
>> > > --- /tmp/SQLite-d474195a997b9d94/ext/csv/csv.c 2009-11-05
>> 05:14:30.000000000
>> > > +0100
>> > > +++ csv.c 2010-04-18 18:48:04.000000000 +0200
>> > > @@ -160,7 +160,7 @@
>> > > }
>> > > }
>> > > if( bShrink ){
>> > > - pCSV->zRow = realloc( pCSV->zRow, n+1 );
>> > > + pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>> > > pCSV->maxRow = n+1;
>> > > }
>> > > return bEol ? pCSV->zRow : 0;
>> > >
>> > > Regards.
>> > > _______________________________________________
>> > > sqlite-users mailing list
>> > > sqlite-users at sqlite.org
>> > > http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users
>> > >
>>
>
>
_______________________________________________
sqlite-users mailing list
[email protected]
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users