I modified my csvimport to allow for quoted fields. Still automaticallhy
detects the # of columns.
If quoted fields also contain internal quotes they must be escaped by doubling
them.
e.g.
col1,"this is, a ""quoted"" field",col3
It's not fully RFC4180 compliant as it doesn't allow for CF/LF inside quotes --
does anybody really need/use that?
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "sqlite3.h"
#define BUFSIZE 1000000
#define MOD 100000
#define MAXTOK 100000
char *nextToken(char *line) {
static char token[MAXTOK];
static char *p;
int n=0;
if (line) {
p = line;
}
while(*p && *p != ',' && *p!='\r' && *p!= '\n') {
if (*p == '"') { // quoted field
++p; // move past first quote
while((*p && *p!='"') || *(p+1)=='"') {
if (*p == '"' && *(p+1)=='"') { // escaped quote?
token[n++]='"';
p+=2;
continue;
}
token[n++] = *p;
++p;
}
token[n++]=0;
++p;
if (*p == ',') ++p;
return token;
}
token[n++]=*p;
++p;
}
if (n > MAXTOK) {
fprintf(stderr,"Token too big??\n");
exit(1);
}
token[n]=0;
++p;
if (*p == ',') ++p;
if (n > 0) {
return token;
}
return NULL;
}
// Add comma delimited file to exisiting database/table
// Quoted strings are accepted
int main(int argc, char *argv[]) {
sqlite3 *db;
sqlite3_stmt *stmt;
int rc;
int ncol=0;
int nline=0;
char *buf=malloc(BUFSIZE);
char sql[8192];
FILE *fp;
char *filename;
char *databasename;
char *tablename;
if (argc != 4) {
fprintf(stderr,"Usage: %s filename databasename tablename\n",argv[0]);
exit(1);
}
filename = argv[1];
databasename = argv[2];
tablename = argv[3];
rc = sqlite3_open_v2(databasename,&db,SQLITE_OPEN_READWRITE,NULL);
if (rc) {
fprintf(stderr,"Error opening database '%s':
%s\n",databasename,sqlite3_errmsg(db));
exit(1);
}
sprintf(sql,"insert into %s values (",tablename);
fp=fopen(filename,"r");
if (fp == NULL) {
perror(filename);
exit(1);
}
buf[BUFSIZE-1] = '*';
fgets(buf,BUFSIZE,fp);
if (buf[BUFSIZE-1] != '*') {
fprintf(stderr,"BUFSIZE not big enough...aborting\n");
exit(1);
}
// count the columns
char *p=nextToken(buf);
ncol=0;
while(p) {
++ncol;
strcat(sql,ncol==1?"":",");
strcat(sql,"?");
p=nextToken(NULL);
}
printf("%d columns detected\n",ncol);
strcat(sql,")");
rewind(fp);
// Let's wrap things in a transaction
rc = sqlite3_exec(db,"BEGIN",NULL,NULL,NULL);
if (rc) {
fprintf(stderr,"BEGIN failed on '%s': %s\n",sql,sqlite3_errmsg(db));
exit(1);
}
// prepare our statement
rc = sqlite3_prepare(db,sql,strlen(sql),&stmt,NULL);
if (rc) {
fprintf(stderr,"Prepare failed on '%s': %s\n",sql,sqlite3_errmsg(db));
exit(1);
}
// Loop over file file
while(fgets(buf,BUFSIZE,fp)) {
char *p=nextToken(buf);
int i=1;
++nline;
if ((nline % MOD)==0) {
printf("%d\r",nline);
fflush(stdout);
}
while(p) { // bind the columns as text, table will take care of conversion
to column types
rc=sqlite3_bind_text(stmt,i,p,-1,SQLITE_TRANSIENT);
if (rc) {
fprintf(stderr,"bind_text failed on '%s': %s\n",sql,sqlite3_errmsg(db));
exit(1);
}
++i;
p=nextToken(NULL);
}
if (--i != ncol) {
fprintf(stderr,"expected %d cols, got %d cols on line#%d\n",ncol,i,nline);
} else {
rc = sqlite3_step(stmt);
if (rc != SQLITE_DONE) {
fprintf(stderr,"Insert failed on '%s': %s\n",sql,sqlite3_errmsg(db));
exit(1);
}
rc = sqlite3_reset(stmt);
if (rc) {
fprintf(stderr,"Reset failed on '%s': %s\n",sql,sqlite3_errmsg(db));
exit(1);
}
}
}
rc=sqlite3_finalize(stmt);
if (rc) {
fprintf(stderr,"Finalize failed: %s\n",sqlite3_errmsg(db));
exit(1);
}
printf("%d inserts, committing...\n",nline);
rc = sqlite3_exec(db,"COMMIT",NULL,NULL,NULL);
if (rc) {
fprintf(stderr,"COMMIT failed on '%s': %s\n",sql,sqlite3_errmsg(db));
exit(1);
}
rc=sqlite3_close(db);
if (rc) {
fprintf(stderr,"Close failed on '%s': %s\n",argv[2],sqlite3_errmsg(db));
exit(1);
}
fclose(fp);
return 0;
}
Michael D. Black
Senior Scientist
Advanced Analytics Directorate
Advanced GEOINT Solutions Operating Unit
Northrop Grumman Information Systems
From: [email protected] [[email protected]] on
behalf of Oliver Schneider [[email protected]]
Sent: Friday, May 04, 2012 12:40 PM
To: [email protected]
Subject: EXT :Re: [sqlite] is SQLite the right tool to analyze a 44GB file
On 2012-05-01 20:41, Baruch Burstein wrote:
> It is already wrapped in a transaction.
> I seem to remember seeing somewhere that the .import command doesn't
> understand escaping, e.g.
>
> "one","two,three"
>
> will get imported as
>
> "one" | "two | three"
>
> (the quotes are part of the data, and the second column was split into two
> by the comma)
> Just a point to be aware of.
That could be mitigated by writing a small script that "converts" the
CSV contents line-wise to SQL-statements, right?
// Oliver
PS: re-sending after the mailing lists chokes on my message with S/MIME
signature.
_______________________________________________
sqlite-users mailing list
[email protected]
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users
_______________________________________________
sqlite-users mailing list
[email protected]
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users