Next iteration of my csvimport utility. I think it handles Simon's test cases adequately (or it gives an error).
Now allows for multiline fields Correctly(?) handles fields with escaped quotes. Checks the entire file to ensure all records have the correct field count before processing. Determines # of columns from the 1st record in the file. Can now change delimiters for fields, records, and field enclosures. Aborts on malformed records. Usage: csvimport [-d] [-e] [-r] filename databasename tablename -d Change field delimiter, default -d, -e Change field enclosure char, default -e\" -r Change record delimiter, default -r On my system using 1M records generated with this: main() { int i; for(i=0; i<1000000; ++i) { printf("%d,%d,%d,%d\n",i,i+1,i+2,i+3); } } Takes 4.1 seconds to import the 1M records. // csvimport -- At least somewhat RFC4180 compliant // quoted fields can span multiple lines // quotes can be inserted by escaping with another quote #include <stdio.h> #include <string.h> #include <stdlib.h> #include "sqlite3.h" #define BUFSIZE 1000000 #define MOD 100000 #define MAXTOKEN 100000 int nline=0; int ncol; char buf[BUFSIZE]; int peek; char delim=','; char quote='"'; char eol='\n'; inline int nextChar(FILE *fp) { static char *ptr=buf; if (*ptr == 0) { fgets(buf,BUFSIZE,fp); if (feof(fp)) return EOF; ++nline; ptr=buf; } peek = *(ptr+1); return *ptr++; } int countColumns(FILE *fp) { // 1st line(s) of the file determine how many columns we have int p; int ncol = 0; while((p=nextChar(fp))!=EOF) { if (p == quote) { // start quote p=nextChar(fp); if (p==quote) continue; // escaped quote while((p=nextChar(fp)) && p!=quote); // skip to next quote } if (p == delim) { ++ncol; } if (p == eol) { break; } } if (feof(fp)) { return 0; } return ++ncol; } inline char *nextToken(FILE *fp) { // 1st line(s) of the file determine how many columns we have int p; int n=0; static char *token; int inquote=0; if (token == NULL) { token=malloc(MAXTOKEN); if (token==NULL) { fprintf(stderr,"malloc error: %%m\n"); exit(1); } } token[0]=0; while((p=nextChar(fp))!=EOF) { if (p == quote) { // start quote p=nextChar(fp); if (p==quote) { // escaped quote token[n++]=p; token[n]=0; continue; } token[n++]=p; token[n]=0; inquote = 1; while(inquote) { p=nextChar(fp); if (p==quote && peek==quote) { p=nextChar(fp); // skip on escaped quote token[n++]=p; token[n]=0; continue; } if (p==quote) { inquote=0; } else { token[n++]=p; token[n]=0; } } continue; } if (p == delim) { token[n]=0; return token; } if (p == eol) { break; } token[n++]=p; token[n]=0; } if (feof(fp)) { return NULL; } token[n]=0; return token; } void checkrc(sqlite3 *db,int rc,int checkrc,int flag,char *msg,char *str) { if (rc != checkrc) { fprintf(stderr,"\nLine#%d: ",nline); fprintf(stderr,msg,str); fprintf(stderr,"%s\n",sqlite3_errmsg(db)); if (flag) { // then fatal exit(1); } } } char escape(char *s) { if (*s != '\\') return *s; ++s; switch(*s) { case 't': return '\t'; case 'n': return '\n'; case '"': return '"'; } fprintf(stderr,"Unknown escape sequence=%s\n",--s); return '\0'; } // Add comma delimited file to exisiting database/table // Quoted strings are accepted int main(int argc, char *argv[]) { sqlite3 *db; sqlite3_stmt *stmt; int rc; int ninsert=0; char sql[32768]; FILE *fp; char *filename; char *databasename; char *tablename; while(argv[1] && argv[1][0]=='-') { switch(argv[1][1]) { case 'd': delim=escape(&argv[1][2]); break; case 'e': eol=escape(&argv[1][2]); break; default: fprintf(stderr,"Bad option: %s\n",argv[1]); } ++argv; --argc; } if (argc != 4) { fprintf(stderr,"Usage: csvimport [-d] [-e] [-r] filename databasename tablename\n"); fprintf(stderr," -d Change field delimiter, default -d,\n"); fprintf(stderr," -e Change field enclosure char, default -e\\\"\n"); fprintf(stderr," -r Change record delimiter, default -r\\n\n"); exit(1); } filename = argv[1]; databasename = argv[2]; tablename = argv[3]; rc = sqlite3_open_v2(databasename,&db,SQLITE_OPEN_READWRITE,NULL); checkrc(db,SQLITE_OK,rc,1,"Error opening database '%s': ",databasename); fp=fopen(filename,"r"); if (fp == NULL) { perror(filename); exit(1); } // count the columns ncol = countColumns(fp); fprintf(stderr,"%d columns detected...checking file contents\n",ncol); rewind(fp); int mycol; nline=0; while((mycol=countColumns(fp)) == ncol); if (!feof(fp)) { fprintf(stderr,"Error at line#%d, expected %d cols, got %d cols...aborting\n",nline,ncol,mycol); exit(1); } rewind(fp); printf("importing %d lines...\n",nline); sprintf(sql,"insert into %s values (",tablename); for(mycol=0; mycol<ncol; ++mycol) { strcat(sql,mycol==0?"":","); strcat(sql,"?"); } strcat(sql,")"); // prepare our statement rc = sqlite3_prepare(db,sql,strlen(sql),&stmt,NULL); checkrc(db,SQLITE_OK,rc,1,"Prepare failed on '%s':",sql); // Let's wrap things in a transaction rc = sqlite3_exec(db,"BEGIN",NULL,NULL,NULL); checkrc(db,SQLITE_OK,rc,1,"BEGIN failed on '%s': ",sql); // Loop over file rewind(fp); nline=0; while(!feof(fp)) { int i; for(i=1; i<=ncol; ++i) { // bind the columns as text, table will take care of conversion to column types char *p=nextToken(fp); if (feof(fp)) break; rc=sqlite3_bind_text(stmt,i,p,-1,SQLITE_TRANSIENT); checkrc(db,SQLITE_OK,rc,0,"Bind_text failed on '%s': ",sql); } if (feof(fp)) break; rc = sqlite3_step(stmt); checkrc(db,SQLITE_DONE,rc,1,"Insert failed on '%s': ",databasename); ++ninsert; if ((ninsert % MOD)==0) { printf("%d\r",ninsert); fflush(stdout); } rc = sqlite3_reset(stmt); checkrc(db,SQLITE_OK,rc,1,"Reset failed on '%s': ",sql); } rc=sqlite3_finalize(stmt); checkrc(db,SQLITE_OK,rc,1,"Finalize failed: ",NULL); printf("%d lines, %d inserts, committing...\n",nline,ninsert); rc = sqlite3_exec(db,"COMMIT",NULL,NULL,NULL); checkrc(db,SQLITE_OK,rc,1,"COMMIT failed on '%s': ",sql); rc=sqlite3_close(db); checkrc(db,SQLITE_OK,rc,1,"Close failed on '%s': ",argv[2]); fclose(fp); return 0; } Michael D. Black Senior Scientist Advanced Analytics Directorate Advanced GEOINT Solutions Operating Unit Northrop Grumman Information Systems _______________________________________________ sqlite-users mailing list sqlite-users@sqlite.org http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users