The problem that I see is that with multiple threads all attempting to "BEGIN EXCLUSIVE", they will occasionally *all* fail, calling the busy handler repeatedly until it finally returns SQL_BUSY. Let me re-state for clarity's sake: 10 threads all try "BEGIN EXCLUSIVE" at the same time. One succeeds, processes, and COMMITs. The other 9 will sometimes repeatedly call the busy handler over and over until they fail with SQLITE_BUSY, even though the database *should be* available to start a new exclusive transaction.
I have tried to eliminate as many variables as I could in my testing and in the sample program:
* No SQL queries other than BEGIN and COMMIT are being used on an empty database.
* No threads are accessing the database in any place other than the pthread_cleanup handler.
* I have tested on Fedora Core 1,2,3, SUSE and Gentoo, and with 32 and 64 bit systems.
The remaining variables I am working with are system load and CPU count:
* On a hyperthreaded (2 virtual CPUs) computer it fails at least 50% of the time when unloaded and < 5% with the load around 1, and then climbs up again as the load increases.
* On a multi-CPU computer it fails about 10% of the time when unloaded and < 1% when both CPUs are otherwise occupied.
* I have occasional access to a single-CPU Pentium 3 where I have not been able to duplicate the problem. This pc is running FC2 just like our chief offender the hyperthreaded PC who fails 50% of the time when unloaded.
In order to demonstrate this problem I have attached a program called sqlthreads.c. It builds under Linux with:
gcc -Wall -g sqlthreads.c -o sqlthreads -lpthread -lsqlite3.
sqlthreads takes two arguments: the number of threads to spawn, and the type of busy handler to use. 0 is the standard "sqlite3_busy_timeout" handler with a 20 second timeout. 1 is a custom sqlite3_busy_handler with a fixed sleep time. -1 is a similar sqlite3_busy_handler with a random sleep time. When run, the test launches N threads, waits a second, then cancels all of them. At this point, each thread will sqlite3_exec a BEGIN EXCLUSIVE. The one that succeeds will sleep for 1/2 second and then call COMMIT, at which time another thread *should* be free to begin. If at the end you see "It appears as though at least x threads failed to obtain an exclusive lock on the database!" then you have experienced what I consider to be the problem.
If I could get any independant corroboration of this problem, that would be fantastic. If you skimmed through my long email, it appears most likely to occur on a hyperthreaded PC, aka Pentium 4 2.8 ghz or higher.
-Eli
/* * sqlthreads.c -- sqlite3 locking / thread concurrency tester */
#include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <pthread.h> #include <signal.h> #include <sqlite3.h> // global variables int g_failed = 0; int g_threadCnt = 0; pthread_t g_pool[256] = {0}; pthread_key_t g_pkeyDb = 0; pthread_mutex_t g_logMutex = PTHREAD_MUTEX_INITIALIZER; // // log output wrapper, adds timestamp and thread id // void sqlog(char * pattern, ...) { va_list varArgs; char timeBuf[16]; time_t t = time(0); struct tm tmStruct; // determine the current time localtime_r(&t, &tmStruct); strftime(timeBuf, sizeof(timeBuf), "%H:%M:%S", &tmStruct); // init the variable argument processor va_start(varArgs, pattern); // obtain a mutex lock and write out the log entry pthread_mutex_lock(&g_logMutex); fprintf(stderr, "%s [%u] ", timeBuf, (unsigned int)pthread_self()); vfprintf(stderr, pattern, varArgs); pthread_mutex_unlock(&g_logMutex); // clean up and exit va_end(varArgs); } // // sqlite3 alternative busy handler // int busyHandler(void *pArg1, int iPriorCalls) { sqlog("!! dbBusyHandler %d\n", iPriorCalls); // sleep if handler has been called less than threshold value if (iPriorCalls < 20) { // adding a random value here greatly reduces locking if (pArg1 < 0) usleep((rand() % 500000) + 400000); else usleep(500000); return 1; } // have sqlite3_exec immediately return SQLITE_BUSY return 0; } // // each thread calls this when it is canceled // void destroySingleThread(void *ptr) { int rc; sqlite3 * pDb; // get our db handle from thread local storage pDb = (sqlite3 *) pthread_getspecific(g_pkeyDb); // we want to get EXCLUSIVE access to the DB rc = sqlite3_exec(pDb, "BEGIN EXCLUSIVE", NULL, NULL, NULL); if (rc != SQLITE_OK) { sqlog("destroySingleThread: Exclusive Transaction failed! %s\n", sqlite3_errmsg(pDb)); g_failed++; } else { // sleep for a bit to simulate some SQL queries usleep(500000); sqlog("destroySingleThread: thread %u terminated!!\n", pthread_self()); // commit the transaction, which should free it up for other threads rc = sqlite3_exec(pDb, "COMMIT", NULL, NULL, NULL); if (rc != SQLITE_OK) { sqlog("destroySingleThread: Commit failed! %s\n", sqlite3_errmsg(pDb)); } } sqlite3_close(pDb); return; } // // each thread calls this to open the database and then spin // void * startSingleThread(void * sleepMode) { sqlite3 * pDb = NULL; char * szDb = "./sqlthreadsDb"; sqlog("startSingleThread: new thread %u launched\n", pthread_self()); // open handle to the database and save it in thread local storage if (SQLITE_OK != sqlite3_open(szDb, &pDb)) { sqlog("startSingleThread: Failed to open database %s: %s\n", szDb, sqlite3_errmsg(pDb)); return NULL; } // install a timeout or busy handler to be called if database is locked if (* (int *)sleepMode == 0) sqlite3_busy_timeout(pDb, 20000); else sqlite3_busy_handler(pDb, busyHandler, sleepMode); // put db handle in thread local storage so thread destructor can close it if (0 != pthread_setspecific(g_pkeyDb, pDb)) { sqlog("startSingleThread: pthread_setspecific failed to save pDb\n"); return NULL; } // push the node destructor onto the thread's cleanup stack pthread_cleanup_push(destroySingleThread, 0); // sit in a loop, doing nothing much while (1) { pthread_testcancel(); usleep(100000); } // (never gets here, but...) remove the cleanup handler from the stack pthread_cleanup_pop(1); return NULL; } // // launch all threads // int startThreads(int threadCnt, void * sleepMode) { int rc, idx; pthread_attr_t attr; // create a pthread_attr object for our pthreads pthread_attr_init(&attr); pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); // spawn the threads for (idx = 0; idx < threadCnt; idx++) { rc = pthread_create(&g_pool[idx], &attr, startSingleThread, sleepMode); if (rc) { sqlog("startThreads: Failed to start thread %d: %d\n", idx, rc); break; } g_threadCnt = idx+1; } // destroy the pthread_attr object we used to create our pthreads pthread_attr_destroy(&attr); return 0; } // // start the main server process // int main(int argc, char **argv) { int idx, threadCnt, sleepMode; // parse cmd line args if (argc != 3) { printf("\nUsage: %s [thread count] [sleep mode]\n", argv[0]); printf( "\tSleep Mode is one of:\n" "\t\t 0 - Standard sqlite3_busy_timeout handler\n" "\t\t-1 - sqlite3_busy_handler with random usleep time\n" "\t\t 1 - sqlite3_busy_handler with fixed usleep time\n"); return 0; } // get the thread count and sleep mode threadCnt = atoi(argv[1]); sleepMode = atoi(argv[2]); if (threadCnt < 1 || threadCnt > 50) { printf("\nBe reasonable! Thread count should be between 1 and 50\n\n"); return 0; } // print out a banner sqlog("sqlite3 thread test starting...\n"); if (sleepMode == 0) sqlog("Using Sqlite3 standard sqlite3_busy_timeout handler\n"); else if (sleepMode == 1) sqlog("Using custom sqlite3_busy_handler /w fixed usleep\n"); else if (sleepMode == -1) sqlog("Using custom sqlite3_busy_handler /w random usleep\n"); else { sqlog("\nInvalid sleep mode '%s' specified!\n\n", argv[2]); exit(0); } // create a thread local storage key and start up our threads pthread_key_create(&g_pkeyDb, NULL); startThreads(threadCnt, &sleepMode); // loop through the thread array and wait on each sqlog("main: now sleep for 1 sec and then cancel all threads\n"); sleep(1); sqlog("\nNote that if you are testing sqlite3_timeout_handler (0)" "\nit may not produce output for up to 20 seconds\n"); // loop and cancel threads for (idx = 0; idx < g_threadCnt; idx++) { sqlog("main: Sending cancel to thread %u\n", g_pool[idx]); pthread_cancel(g_pool[idx]); } sqlog("main: pthread_cancel sent to all threads!\n"); // loop and wait for all threads to exit for (idx = 0; idx < g_threadCnt; idx++) { if (g_pool[idx]) pthread_join(g_pool[idx], NULL); } sqlog("Concluding the sqlite3 thread test...\n"); if (g_failed) sqlog("It appears as though at least %d threads failed to obtain an " "exclusive lock on the database!\n", g_failed); else sqlog("Success! Now try again! :-)\n"); // delete thread local storage and exit pthread_key_delete(g_pkeyDb); pthread_exit(0); return 0; }