Next iteration of my csvimport utility. I think it handles Simon's test cases
adequately (or it gives an error).
Now allows for multiline fields
Correctly(?) handles fields with escaped quotes.
Checks the entire file to ensure all records have the correct field count
before processing.
Determines # of columns from the 1st record in the file.
Can now change delimiters for fields, records, and field enclosures.
Aborts on malformed records.
Usage: csvimport [-d] [-e] [-r] filename databasename tablename
-d Change field delimiter, default -d,
-e Change field enclosure char, default -e\"
-r Change record delimiter, default -r
On my system using 1M records generated with this:
main() {
int i;
for(i=0; i<1000000; ++i) {
printf("%d,%d,%d,%d\n",i,i+1,i+2,i+3);
}
}
Takes 4.1 seconds to import the 1M records.
// csvimport -- At least somewhat RFC4180 compliant
// quoted fields can span multiple lines
// quotes can be inserted by escaping with another quote
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "sqlite3.h"
#define BUFSIZE 1000000
#define MOD 100000
#define MAXTOKEN 100000
int nline=0;
int ncol;
char buf[BUFSIZE];
int peek;
char delim=',';
char quote='"';
char eol='\n';
inline int nextChar(FILE *fp) {
static char *ptr=buf;
if (*ptr == 0) {
fgets(buf,BUFSIZE,fp);
if (feof(fp)) return EOF;
++nline;
ptr=buf;
}
peek = *(ptr+1);
return *ptr++;
}
int countColumns(FILE *fp) {
// 1st line(s) of the file determine how many columns we have
int p;
int ncol = 0;
while((p=nextChar(fp))!=EOF) {
if (p == quote) { // start quote
p=nextChar(fp);
if (p==quote) continue; // escaped quote
while((p=nextChar(fp)) && p!=quote); // skip to next quote
}
if (p == delim) {
++ncol;
}
if (p == eol) {
break;
}
}
if (feof(fp)) {
return 0;
}
return ++ncol;
}
inline char *nextToken(FILE *fp) {
// 1st line(s) of the file determine how many columns we have
int p;
int n=0;
static char *token;
int inquote=0;
if (token == NULL) {
token=malloc(MAXTOKEN);
if (token==NULL) {
fprintf(stderr,"malloc error: %%m\n");
exit(1);
}
}
token[0]=0;
while((p=nextChar(fp))!=EOF) {
if (p == quote) { // start quote
p=nextChar(fp);
if (p==quote) { // escaped quote
token[n++]=p;
token[n]=0;
continue;
}
token[n++]=p;
token[n]=0;
inquote = 1;
while(inquote) {
p=nextChar(fp);
if (p==quote && peek==quote) {
p=nextChar(fp); // skip on escaped quote
token[n++]=p;
token[n]=0;
continue;
}
if (p==quote) {
inquote=0;
} else {
token[n++]=p;
token[n]=0;
}
}
continue;
}
if (p == delim) {
token[n]=0;
return token;
}
if (p == eol) {
break;
}
token[n++]=p;
token[n]=0;
}
if (feof(fp)) {
return NULL;
}
token[n]=0;
return token;
}
void checkrc(sqlite3 *db,int rc,int checkrc,int flag,char *msg,char *str) {
if (rc != checkrc) {
fprintf(stderr,"\nLine#%d: ",nline);
fprintf(stderr,msg,str);
fprintf(stderr,"%s\n",sqlite3_errmsg(db));
if (flag) { // then fatal
exit(1);
}
}
}
char escape(char *s) {
if (*s != '\\') return *s;
++s;
switch(*s) {
case 't':
return '\t';
case 'n':
return '\n';
case '"':
return '"';
}
fprintf(stderr,"Unknown escape sequence=%s\n",--s);
return '\0';
}
// Add comma delimited file to exisiting database/table
// Quoted strings are accepted
int main(int argc, char *argv[]) {
sqlite3 *db;
sqlite3_stmt *stmt;
int rc;
int ninsert=0;
char sql[32768];
FILE *fp;
char *filename;
char *databasename;
char *tablename;
while(argv[1] && argv[1][0]=='-') {
switch(argv[1][1]) {
case 'd':
delim=escape(&argv[1][2]);
break;
case 'e':
eol=escape(&argv[1][2]);
break;
default:
fprintf(stderr,"Bad option: %s\n",argv[1]);
}
++argv;
--argc;
}
if (argc != 4) {
fprintf(stderr,"Usage: csvimport [-d] [-e] [-r] filename databasename
tablename\n");
fprintf(stderr," -d Change field delimiter, default -d,\n");
fprintf(stderr," -e Change field enclosure char, default -e\\\"\n");
fprintf(stderr," -r Change record delimiter, default -r\\n\n");
exit(1);
}
filename = argv[1];
databasename = argv[2];
tablename = argv[3];
rc = sqlite3_open_v2(databasename,&db,SQLITE_OPEN_READWRITE,NULL);
checkrc(db,SQLITE_OK,rc,1,"Error opening database '%s': ",databasename);
fp=fopen(filename,"r");
if (fp == NULL) {
perror(filename);
exit(1);
}
// count the columns
ncol = countColumns(fp);
fprintf(stderr,"%d columns detected...checking file contents\n",ncol);
rewind(fp);
int mycol;
nline=0;
while((mycol=countColumns(fp)) == ncol);
if (!feof(fp)) {
fprintf(stderr,"Error at line#%d, expected %d cols, got %d
cols...aborting\n",nline,ncol,mycol);
exit(1);
}
rewind(fp);
printf("importing %d lines...\n",nline);
sprintf(sql,"insert into %s values (",tablename);
for(mycol=0; mycol<ncol; ++mycol) {
strcat(sql,mycol==0?"":",");
strcat(sql,"?");
}
strcat(sql,")");
// prepare our statement
rc = sqlite3_prepare(db,sql,strlen(sql),&stmt,NULL);
checkrc(db,SQLITE_OK,rc,1,"Prepare failed on '%s':",sql);
// Let's wrap things in a transaction
rc = sqlite3_exec(db,"BEGIN",NULL,NULL,NULL);
checkrc(db,SQLITE_OK,rc,1,"BEGIN failed on '%s': ",sql);
// Loop over file
rewind(fp);
nline=0;
while(!feof(fp)) {
int i;
for(i=1; i<=ncol; ++i) { // bind the columns as text, table will take care
of conversion to column types
char *p=nextToken(fp);
if (feof(fp)) break;
rc=sqlite3_bind_text(stmt,i,p,-1,SQLITE_TRANSIENT);
checkrc(db,SQLITE_OK,rc,0,"Bind_text failed on '%s': ",sql);
}
if (feof(fp)) break;
rc = sqlite3_step(stmt);
checkrc(db,SQLITE_DONE,rc,1,"Insert failed on '%s': ",databasename);
++ninsert;
if ((ninsert % MOD)==0) {
printf("%d\r",ninsert);
fflush(stdout);
}
rc = sqlite3_reset(stmt);
checkrc(db,SQLITE_OK,rc,1,"Reset failed on '%s': ",sql);
}
rc=sqlite3_finalize(stmt);
checkrc(db,SQLITE_OK,rc,1,"Finalize failed: ",NULL);
printf("%d lines, %d inserts, committing...\n",nline,ninsert);
rc = sqlite3_exec(db,"COMMIT",NULL,NULL,NULL);
checkrc(db,SQLITE_OK,rc,1,"COMMIT failed on '%s': ",sql);
rc=sqlite3_close(db);
checkrc(db,SQLITE_OK,rc,1,"Close failed on '%s': ",argv[2]);
fclose(fp);
return 0;
}
Michael D. Black
Senior Scientist
Advanced Analytics Directorate
Advanced GEOINT Solutions Operating Unit
Northrop Grumman Information Systems
_______________________________________________
sqlite-users mailing list
[email protected]
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users