Re: [sqlite] [csv extension] Error while reading long lines

gwenn Sat, 08 May 2010 12:44:59 -0700

Ok,
I've just added support to embedded new lines and partial support to escaped
double-quotes.
By partial support, I mean they are not unescaped yet...


Index: ext/csv/csv.c
===================================================================
--- ext/csv/csv.c
+++ ext/csv/csv.c
@@ -120,10 +120,11 @@
 */
 static char *csv_getline( CSV *pCSV ){
   int n = 0;
   int bEol = 0;
   int bShrink = 0;
+  int bQuotedCol = 0;

   /* allocate initial row buffer */
   if( pCSV->maxRow < 1 ){
     pCSV->zRow = sqlite3_malloc( 100 );
     if( pCSV->zRow ){
@@ -150,19 +151,32 @@
       pCSV->zRow[n] = '\0';
       bEol = -1;
       break;
     }
     /* look for line delimiter */
-    while( pCSV->zRow[n] ){ n++; }
-    if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){
+    while( pCSV->zRow[n] ){
+      if( pCSV->zRow[n]=='\"' ){
+        if( bQuotedCol ) {
+          if( pCSV->zRow[n+1]=='\"' ) { /* escaped */
+            n++;
+          }else{
+            bQuotedCol = 0;
+          }
+        }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){
+          bQuotedCol = 1;
+        }
+      }
+      n++;
+    }
+    if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) &&
!bQuotedCol ){
       pCSV->zRow[n-1] = '\n'; /* uniform line ending */
       pCSV->zRow[n] = '\0';
       bEol = -1;
     }
   }
   if( bShrink ){
-    pCSV->zRow = realloc( pCSV->zRow, n+1 );
+    pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
     pCSV->maxRow = n+1;
   }
   return bEol ? pCSV->zRow : 0;
 }

@@ -332,17 +346,23 @@
   do{
     /* if it begins with a quote, assume it's a quoted col */
     if( *s=='\"' ){
       s++;  /* skip quote */
       pCSV->aCols[nCol] = s; /* save pointer for this col */
-      /* TBD: handle escaped quotes "" */
       /* find closing quote */
-      s = strchr(s, '\"');
-      if( !s ){
-        /* no closing quote */
-        pCSV->eof = -1;
-        return SQLITE_ERROR;
+      while( 1 ){
+        s = strchr(s, '\"');
+        if( !s ){
+          /* no closing quote */
+          pCSV->eof = -1;
+          return SQLITE_ERROR;
+        }else if ( *(s+1)=='\"' ){
+          /* TBD: replace all escaped quotes by a single one */
+          s+=2;
+        }else{
+          break;
+        }
       }
       *s = '\0'; /* null terminate this col */
       /* fall through and look for following ",\n" */
       s++;
     }else{
@@ -608,11 +628,11 @@
         *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
         sqlite3_free(zSql);
         csvRelease( pCSV );
         return SQLITE_ERROR;
       }
-      zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
+      zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
     }else{
       zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
     }
     sqlite3_free(zTmp);
   }


On Sat, May 8, 2010 at 3:45 PM, gwenn <gwenn.k...@gmail.com> wrote:

> While looking in csv1.test, I found a solution to the case when header row
> contains spaces: just wrap the column name with double quotes.
>
> Index: ext/csv/csv.c
> ===================================================================
> --- ext/csv/csv.c
> +++ ext/csv/csv.c
> @@ -158,11 +158,11 @@
>        pCSV->zRow[n] = '\0';
>        bEol = -1;
>      }
>    }
>    if( bShrink ){
> -    pCSV->zRow = realloc( pCSV->zRow, n+1 );
> +    pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>      pCSV->maxRow = n+1;
>    }
>    return bEol ? pCSV->zRow : 0;
>  }
>
> @@ -608,11 +608,11 @@
>          *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
>          sqlite3_free(zSql);
>          csvRelease( pCSV );
>          return SQLITE_ERROR;
>        }
> -      zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
> +      zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
>      }else{
>        zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
>      }
>      sqlite3_free(zTmp);
>    }
>
> Index: ext/csv/csv1.test
> ===================================================================
> --- ext/csv/csv1.test
> +++ ext/csv/csv1.test
> @@ -23,10 +23,11 @@
>  #
>  #   csv-1.*: Creating/destroying csv tables.
>  #   csv-2.*: Linear scans of csv data.
>  #   csv-3.*: Test renaming an csv table.
>  #   csv-4.*: CREATE errors
> +#   csv-5.*: Dirty header and long line.
>  #
>
>  ifcapable !csv {
>    finish_test
>    return
> @@ -36,10 +37,12 @@
>  set test1csv [file join [file dirname [info script]] test1.csv]
>  # This file is delimited by '|' and has quoted fields.
>  set test2csv [file join [file dirname [info script]] test2.csv]
>  # This file is delimited by '|'.  It does NOT have quoted fields.
>  set test3csv [file join [file dirname [info script]] test3.csv]
> +# This file contains a dirty header and one long line.
> +set test4csv [file join [file dirname [info script]] test4.csv]
>
>
>  #----------------------------------------------------------------------------
>  # Test cases csv-1.* test CREATE and DROP table statements.
>  #
>
> @@ -249,5 +252,14 @@
>    catchsql " CREATE VIRTUAL TABLE t1 USING csv('foo') "
>  } {1 {Error opening CSV file: 'foo'}}
>  do_test csv-4.1.3 {
>    catchsql " CREATE VIRTUAL TABLE t1 USING csv(foo foo) "
>  } {1 {Error opening CSV file: 'foo foo'}}
> +
>
> +#----------------------------------------------------------------------------
> +# Test cases csv-5.* test file with dirty header and long line.
> +#
> +
> +do_test csv-5.1.1 {
> +  execsql " CREATE VIRTUAL TABLE t1 USING csv('$test4csv') "
> +  execsql " CREATE VIRTUAL TABLE t2 USING csv('$test4csv', ',',
> USE_HEADER_ROW) "
> +} {}
>
> ADDED    ext/csv/test4.csv
> col 1,col.2,col-3,col!4,c...@5,col;6,col%7,col*8,col=9,col'10
>
> 123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789
>
> I tried to handle double-quoted fields with embedded line breaks without
> success but I am not stuck yet.
> Regards.
>
> On Wed, Apr 21, 2010 at 8:22 PM, gwenn <gwenn.k...@gmail.com> wrote:
>
>> Thanks for this great extension.
>> It works smoothly with 500Mo files.
>>
>> And it's a workaround to some shortcomings of the '.import' command:
>>  - no need to create a table before,
>>  - no need to delete the header row before/after,
>>  - no error if the number of columns is not homogeneous,
>>  - ...
>> It's a nightmare to work with the CSV format but I have to.
>>
>> I made a quick and dirty fix to the USE_HEADER_ROW mode to replace
>> whitespaces, slashes or hyphens by underscores.
>> But I look for a better solution. Is there any way to make sure a string
>> is a valid column name?
>>
>> Regards
>>
>> On Mon Apr 19 17:32:13 GMT 2010, Shane Harrelson <shane at sqlite.org>
>> wrote:
>> > Thanks for the report.  The extension is still very a much a
>> > work-in-progress and any feedback is greatly appreciated.
>> >
>> > -Shane
>>
>> >
>> >
>> > On Sun, Apr 18, 2010 at 12:51 PM, gwenn <gwenn.kahz at gmail.com>
>> wrote:
>> > > Hello,
>> > > There is a little bug/typo in the csv extension when lines exceed 100
>> > > characters:
>> > > *** glibc detected *** sqlite3: realloc(): invalid pointer:
>> > > 0x0000000000ad1a78 ***
>> > > ======= Backtrace: =========
>> > > /lib/libc.so.6[0x7f6dab009d16]
>> > > /lib/libc.so.6(realloc+0x321)[0x7f6dab00fda1]
>> > > ./libSqliteCsv.so[0x7f6da9ef9dbf]
>> > >
>> > > A possible patch is:
>> > > --- /tmp/SQLite-d474195a997b9d94/ext/csv/csv.c 2009-11-05
>> 05:14:30.000000000
>> > > +0100
>> > > +++ csv.c 2010-04-18 18:48:04.000000000 +0200
>> > > @@ -160,7 +160,7 @@
>> > >     }
>> > >   }
>> > >   if( bShrink ){
>> > > -    pCSV->zRow = realloc( pCSV->zRow, n+1 );
>> > > +    pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>> > >     pCSV->maxRow = n+1;
>> > >   }
>> > >   return bEol ? pCSV->zRow : 0;
>> > >
>> > > Regards.
>> > > _______________________________________________
>> > > sqlite-users mailing list
>> > > sqlite-users at sqlite.org
>> > > http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users
>> > >
>>
>
>
_______________________________________________
sqlite-users mailing list
sqlite-users@sqlite.org
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users

Re: [sqlite] [csv extension] Error while reading long lines

Reply via email to