Hello Richard !

I'm trying to create a sqlite3 database with some pubic data (the sqlite3 dump gziped is 9184MB) that took a very long time to build (2 days tested on linux and os x). I was trying to vacuum it to see if I could get better performance but it take too long so I did a ".dump" (2 hours) and now I'm trying to restore but when it reaches around 1GB it slowdown a lot (only grow 1MB every minute).

See attached schema. It doesn't use foreign keys or heavy constraints.

So after several tests I could isolate a test case (see attached, the sample data is at https://www.endad.eu/tst-sqlite3/favorecidos-nis.txt.xz 41MB 25M lines, it also show stats of work done) it seems that the distribution of some fields cause a lot of btree partitions.

On this test case the data was get from the mentioned database, it's an int64 field used to unique identify individuals, there is 25M and sqlite3 can insert 80% of then in 15% of the total time and then take 75% of the time to insert 20% of the final data.

I usually test it on a linux and os x and it behaves the same on both, the io activity is insane in the last 20% of the data. The full database is not usable (35GB) most operations take too long (minutes and hours).

After several tests to try overcome this problem, I've tried to prefill the problematic table with data known beforehand and update it later instead of insert, thinking this would move away the btree balancing slowdown on insertion but for my surprise the slowdown remain there it seems that sqlite first delete then reinsert the whole record (because I can not explain it other way, I even filled all fields with excess data so the update would have plenty of space to do it's job).


I understand that you are busy but I think that this can be another good test to check performance bottleneck, as I mentioned the data is public so I can provide it or you can grab yourself.

Sample data like:

20902122112
15032539272
12282967455
16129244577
20730772521
10262168283
13789627770
17021574841
16405626831
17044198056

Thank you so much for your attention and great work !

sql schema
==========
create table if not exists processed(
    id integer primary key,
    count integer
);

create table if not exists municipios(
    id integer primary key,
    uf varchar collate nocase_slna,
    nome varchar collate nocase_slna
);

create table if not exists programas(
    id integer primary key,
    codigo_funcao integer not null,
    codigo_subfuncao integer not null,
    codigo_programa integer not null,
    codigo_acao integer not null
);
create unique index if not exists programas_all_idx on programas(
    codigo_funcao, codigo_subfuncao, codigo_programa, codigo_acao);

create table if not exists fontes_finalidade(
    id integer primary key,
    nome varchar collate nocase_slna unique
);

create table if not exists favorecidos(
    id integer primary key,
    nome varchar collate nocase_slna,
    municipio_id integer,
    programa_id integer not null,
    fonte_finalidade_id integer not null
);

create table if not exists bolsas_familia(
    id integer primary key,
    favorecido_id integer not null,
    valor_parcela numeric,
    mes_competencia integer
);
--create index if not exists bolsas_familia_favorecidos_idx on bolsas_familia(favorecido_id);

--insert or ignore into favorecidos(id, nome,programa_id,fonte_finalidade_id) select nis, nome,1,1 from adb.favorecidos order by nis;

==========

main.cpp


#include <string>
#include <stdio.h>
#include <sys/timeb.h>

#include "sqlite3.h"

using namespace std;

namespace os {

int getmillicount()
{
  // Something like GetTickCount but portable
  // It rolls over every ~ 12.1 days (0x100000/24/60/60)
  // Use GetMilliSpan to correct for rollover
  timeb tb;
  ftime( &tb );
  int nCount = tb.millitm + (tb.time & 0xfffff) * 1000;
  return nCount;
}
};

struct SQLite3Statement
{
    sqlite3_stmt *stmt;
    SQLite3Statement(){stmt=NULL;}
    ~SQLite3Statement()
    {
        if(stmt) sqlite3_finalize(stmt);
    }
    int bind_exec(std::string &p1, sqlite3_int64 p2)
    {
        if(stmt)
        {
            sqlite3_bind_text(stmt, 1, p1.c_str(), p1.size(), NULL);
            sqlite3_bind_int64(stmt, 2, p2);
            int rc = sqlite3_step(stmt);
            sqlite3_reset(stmt);
            return rc;
        }
        return -1;
    }
};

struct SQLite3
{
    sqlite3 *db;
    SQLite3(const char *dbname)
    {
        db = NULL;
        sqlite3_open_v2(dbname, &db, SQLITE_OPEN_CREATE | 
SQLITE_OPEN_READWRITE, NULL);
    }
    ~SQLite3()
    {
        sqlite3_close(db);
    }

    int exec_dml(const char *szSql)
    {
        return sqlite3_exec(db, szSql, NULL, NULL, NULL);
    }

    int prepare(SQLite3Statement &stmt, const char *szSql)
    {
        return sqlite3_prepare_v2(db, szSql, -1, &stmt.stmt, NULL);
    }
};

struct file
{
    FILE *fp;
    file(const char *fname, const char *mode)
    {
        fp = fopen(fname, mode);
    }
    int tell()
    {
        return ftell(fp);
    }
    std::string read_line()
    {
        char line[32];
        if(!fgets(line, sizeof(line),fp)) line[0] = '\0';;
        return line;
    }
};

int main(int argc, char *argv[])
{
    int total_milli_param = 0;
    if(argc > 1)
    {
        total_milli_param = atoi(argv[1]);
    }

    SQLite3 db("bf-favorecidos-nis.db");
    db.exec_dml("PRAGMA synchronous = 0;");
    db.exec_dml("drop table if exists bf_fav_nis;");
    db.exec_dml("create table if not exists bf_fav_nis(id integer primary key, 
offset integer);");
    SQLite3Statement stmt;
    db.prepare(stmt, "insert into bf_fav_nis(id, offset) values(?,?)");
    file fd("favorecidos-nis.txt","r");

    db.exec_dml("begin;");

    std::string line;
    auto count = 0, offset = 0,
        total_count = 22501426, //total number of line in fd
        total_milli = (total_milli_param ? total_milli_param : 190020); //total 
miliseconds spent to process all lines

    auto step_count = 100000,
        start_milli = os::getmillicount(), last_milli = os::getmillicount();

    
printf("rec_count\tcount_rate\tmilli_rate\tstep_rate\tstep_count\tspent_milli\ttill_now_milli\n");
    while( (line = fd.read_line()).size() )
    {
        stmt.bind_exec(line, offset);
        ++count;
        offset = fd.tell();

        if( (count % step_count) == 0)
        {
            auto now_milli = os::getmillicount();
            auto till_now_milli = now_milli - start_milli;
            auto spent_milli = now_milli - last_milli;
            auto step_rate = step_count / spent_milli;
            auto total_count_rate = ((count + 0.0) / total_count) * 100;
            auto total_milli_rate = ((till_now_milli + 0.0) / total_milli) * 
100;
            last_milli = now_milli;
            printf("%d\t%f\t%f\t%d\t%d\t%d\t%d\n", count, total_count_rate, 
total_milli_rate, step_rate * 1000, step_count, spent_milli, till_now_milli);
            db.exec_dml("commit;begin;");
        }
    }

    db.exec_dml("commit;");

    auto milli_spent = os::getmillicount() - start_milli;
    printf("%d\t%d\t%d\t%f\n", count, (count / milli_spent) * 1000, 
milli_spent, milli_spent / 60.0);
    return 0;
}


mk-it


g++ -std=c++11 -O2 -o sqlite3-insert main.cpp -lsqlite3

_______________________________________________
sqlite-users mailing list
sqlite-users@mailinglists.sqlite.org
http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users

Reply via email to