Next iteration of my csvimport utility.  I think it handles Simon's test cases 
adequately (or it gives an error).

Now allows for multiline fields
Correctly(?) handles  fields with escaped quotes.
Checks the entire file to ensure all records have the correct field count 
before processing.
Determines # of columns from the 1st record in the file.
Can now change delimiters for fields, records, and field enclosures.

Aborts on malformed records.

Usage: csvimport [-d] [-e] [-r] filename databasename tablename
    -d   Change field delimiter, default -d,
    -e   Change field enclosure char, default -e\"
    -r   Change record delimiter, default -r

On my system using 1M records generated with this:
main() {
  int i;
  for(i=0; i<1000000; ++i) {
    printf("%d,%d,%d,%d\n",i,i+1,i+2,i+3);
  }
}

Takes 4.1 seconds to import the 1M records.


// csvimport -- At least somewhat RFC4180 compliant
// quoted fields can span multiple lines
// quotes can be inserted by escaping with another quote
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "sqlite3.h"

#define BUFSIZE 1000000
#define MOD 100000
#define MAXTOKEN 100000

int nline=0;
int ncol;
char buf[BUFSIZE];
int peek;
char delim=',';
char quote='"';
char eol='\n';

inline int nextChar(FILE *fp) {
  static char *ptr=buf;
  if (*ptr == 0) {
    fgets(buf,BUFSIZE,fp);
    if (feof(fp)) return EOF;
    ++nline;
    ptr=buf;
  }
  peek = *(ptr+1);
  return *ptr++;
}

int countColumns(FILE *fp) {
  // 1st line(s) of the file determine how many columns we have
  int p;
  int ncol = 0;
  while((p=nextChar(fp))!=EOF) {
    if (p == quote) { // start quote
      p=nextChar(fp);
      if (p==quote) continue; // escaped quote
      while((p=nextChar(fp)) && p!=quote); // skip to next quote
    }
    if (p == delim) {
      ++ncol;
    }
    if (p == eol) {
      break;
    }
  }
  if (feof(fp)) {
    return 0;
  }
  return ++ncol;
}

inline char *nextToken(FILE *fp) {
  // 1st line(s) of the file determine how many columns we have
  int p;
  int n=0;
  static char *token;
  int inquote=0;
  if (token == NULL) {
    token=malloc(MAXTOKEN);
    if (token==NULL) {
      fprintf(stderr,"malloc error: %%m\n");
      exit(1);
    }
  }
  token[0]=0;
  while((p=nextChar(fp))!=EOF) {
    if (p == quote) { // start quote
      p=nextChar(fp);
      if (p==quote) { // escaped quote
        token[n++]=p;
        token[n]=0;
        continue;
      }
      token[n++]=p;
      token[n]=0;
      inquote = 1;
      while(inquote) {
        p=nextChar(fp);
        if (p==quote && peek==quote) {
          p=nextChar(fp); // skip on escaped quote
          token[n++]=p;
          token[n]=0;
          continue;
        }
        if (p==quote) {
          inquote=0;
        } else {
          token[n++]=p;
          token[n]=0;
        }
      }
      continue;
    }
    if (p == delim) {
      token[n]=0;
      return token;
    }
    if (p == eol) {
      break;
    }
    token[n++]=p;
    token[n]=0;
  }
  if (feof(fp)) {
    return NULL;
  }
  token[n]=0;
  return token;
}

void checkrc(sqlite3 *db,int rc,int checkrc,int flag,char *msg,char *str) {
  if (rc != checkrc) {
    fprintf(stderr,"\nLine#%d: ",nline);
    fprintf(stderr,msg,str);
    fprintf(stderr,"%s\n",sqlite3_errmsg(db));
    if (flag) { // then fatal
      exit(1);
    }
  }
}

char escape(char *s) {
  if (*s != '\\') return *s;
  ++s;
  switch(*s) {
  case 't':
    return '\t';
  case 'n':
    return '\n';
  case '"':
    return '"';
  }
  fprintf(stderr,"Unknown escape sequence=%s\n",--s);
  return '\0';
}

// Add comma delimited file to exisiting database/table
// Quoted strings are accepted
int main(int argc, char *argv[]) {
  sqlite3 *db;
  sqlite3_stmt *stmt;
  int rc;
  int ninsert=0;
  char sql[32768];
  FILE *fp;
  char *filename;
  char *databasename;
  char *tablename;

  while(argv[1] && argv[1][0]=='-') {
    switch(argv[1][1]) {
    case 'd':
      delim=escape(&argv[1][2]);
      break;
    case 'e':
      eol=escape(&argv[1][2]);
      break;
    default:
      fprintf(stderr,"Bad option: %s\n",argv[1]);
    }
    ++argv;
    --argc;
  }
  if (argc != 4) {
    fprintf(stderr,"Usage: csvimport [-d] [-e] [-r] filename databasename 
tablename\n");
    fprintf(stderr,"    -d   Change field delimiter, default -d,\n");
    fprintf(stderr,"    -e   Change field enclosure char, default -e\\\"\n");
    fprintf(stderr,"    -r   Change record delimiter, default -r\\n\n");
    exit(1);
  }
  filename = argv[1];
  databasename = argv[2];
  tablename = argv[3];
  rc = sqlite3_open_v2(databasename,&db,SQLITE_OPEN_READWRITE,NULL);
  checkrc(db,SQLITE_OK,rc,1,"Error opening database '%s': ",databasename);
  fp=fopen(filename,"r");
  if (fp == NULL) {
    perror(filename);
    exit(1);
  }
  // count the columns
  ncol = countColumns(fp);

  fprintf(stderr,"%d columns detected...checking file contents\n",ncol);
  rewind(fp);
  int mycol;
  nline=0;
  while((mycol=countColumns(fp)) == ncol);
  if (!feof(fp)) {
    fprintf(stderr,"Error at line#%d, expected %d cols, got %d 
cols...aborting\n",nline,ncol,mycol);
    exit(1);
  }
  rewind(fp);
  printf("importing %d lines...\n",nline);
  sprintf(sql,"insert into %s values (",tablename);
  for(mycol=0; mycol<ncol; ++mycol) {
    strcat(sql,mycol==0?"":",");
    strcat(sql,"?");
  }
  strcat(sql,")");
  // prepare our statement
  rc = sqlite3_prepare(db,sql,strlen(sql),&stmt,NULL);
  checkrc(db,SQLITE_OK,rc,1,"Prepare failed on '%s':",sql);
  // Let's wrap things in a transaction
  rc = sqlite3_exec(db,"BEGIN",NULL,NULL,NULL);
  checkrc(db,SQLITE_OK,rc,1,"BEGIN failed on '%s': ",sql);
  // Loop over file
  rewind(fp);
  nline=0;
  while(!feof(fp)) {
    int i;
    for(i=1; i<=ncol; ++i) { // bind the columns as text, table will take care 
of conversion to column types
      char *p=nextToken(fp);
      if (feof(fp)) break;
      rc=sqlite3_bind_text(stmt,i,p,-1,SQLITE_TRANSIENT);
      checkrc(db,SQLITE_OK,rc,0,"Bind_text failed on '%s': ",sql);
    }
    if (feof(fp)) break;
    rc = sqlite3_step(stmt);
    checkrc(db,SQLITE_DONE,rc,1,"Insert failed on '%s': ",databasename);
    ++ninsert;
    if ((ninsert % MOD)==0) {
      printf("%d\r",ninsert);
      fflush(stdout);
    }
    rc = sqlite3_reset(stmt);
    checkrc(db,SQLITE_OK,rc,1,"Reset failed on '%s': ",sql);
  }
  rc=sqlite3_finalize(stmt);
  checkrc(db,SQLITE_OK,rc,1,"Finalize failed: ",NULL);
  printf("%d lines, %d inserts, committing...\n",nline,ninsert);
  rc = sqlite3_exec(db,"COMMIT",NULL,NULL,NULL);
  checkrc(db,SQLITE_OK,rc,1,"COMMIT failed on '%s': ",sql);
  rc=sqlite3_close(db);
  checkrc(db,SQLITE_OK,rc,1,"Close failed on '%s': ",argv[2]);
  fclose(fp);
  return 0;
}

Michael D. Black
Senior Scientist
Advanced Analytics Directorate
Advanced GEOINT Solutions Operating Unit
Northrop Grumman Information Systems
_______________________________________________
sqlite-users mailing list
sqlite-users@sqlite.org
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users

Reply via email to