The branch, master has been updated via a69916d0687309766b0014dc9cee6a966aaa89da (commit) via 9ec0009443a0ac4187ce5212a5143689daa58a02 (commit) via a5db1122ec48d7e7384066848457c850c1a6cf3c (commit) via 52a87e608d0406aee9df99f7ac3ce16e834b520b (commit) via 27ba0e5a6681063225df7244a85aa304c51c6948 (commit) from eededd592c92c59b435f0046989b2327fcc280b1 (commit)
http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master - Log ----------------------------------------------------------------- commit a69916d0687309766b0014dc9cee6a966aaa89da Author: Volker Lendecke <v...@samba.org> Date: Fri Aug 6 10:12:13 2010 +0200 Correctly set docdir commit 9ec0009443a0ac4187ce5212a5143689daa58a02 Author: Rusty Russell <ru...@rustcorp.com.au> Date: Mon Aug 16 10:22:21 2010 +0930 tdb: workaround starvation problem in locking entire database. (Imported from SAMBA 11ab43084b10cf53b530cdc3a6036c898b79ca38) We saw tdb_lockall() take 71 seconds under heavy load; this is because Linux (at least) doesn't prevent new small locks being obtained while we're waiting for a big log. The workaround is to do divide and conquer using non-blocking chainlocks: if we get down to a single chain we block. Using a simple test program where children did "hold lock for 100ms, sleep for 1 second" the time to do tdb_lockall() dropped signifiantly. There are ln(hashsize) locks taken in the contended case, but that's slow anyway. More analysis is given in my blog at http://rusty.ozlabs.org/?p=120 This may also help transactions, though in that case it's the initial read lock which uses this gradual locking routine; the update-to-write-lock code is separate and still tries to update in one go. Even though ABI doesn't change, minor version bumped so behavior change can be easily detected. CQ:S1018154 Signed-off-by: Rusty Russell <ru...@rustcorp.com.au> commit a5db1122ec48d7e7384066848457c850c1a6cf3c Author: Rusty Russell <ru...@rustcorp.com.au> Date: Mon Aug 16 10:13:32 2010 +0930 tdb: Fix tdb_check() to work with read-only tdb databases. (Import from SAMBA bc1c82ea137e1bf6cb55139a666c56ebb2226b23) The function tdb_lockall() uses F_WRLCK internally, which doesn't work on a fd opened with O_RDONLY. Use tdb_lockall_read() instead. commit 52a87e608d0406aee9df99f7ac3ce16e834b520b Author: Rusty Russell <ru...@rustcorp.com.au> Date: Mon Aug 16 10:12:02 2010 +0930 tdb: remove unused variable in tdb_new_database(). (Imported from SAMBA 2eab1d7fdcb54f9ec27431ca4858eb64cb1bd835) commit 27ba0e5a6681063225df7244a85aa304c51c6948 Author: Rusty Russell <ru...@rustcorp.com.au> Date: Mon Aug 16 10:20:19 2010 +0930 tdb: fix short write logic in tdb_new_database Commit 207a213c/24fed55d purported to fix the problem of signals during tdb_new_database (which could cause a spurious short write, hence a failure). However, the code is wrong: newdb+written is not correct. Fix this by introducing a general tdb_write_all() and using it here and in the tracing code. Cc: Stefan Metzmacher <me...@samba.org> Signed-off-by: Rusty Russell <ru...@rustcorp.com.au> ----------------------------------------------------------------------- Summary of changes: Makefile.in | 2 +- lib/tdb/common/check.c | 6 +- lib/tdb/common/lock.c | 86 +++++++++++++++++++++++++++++++++-------- lib/tdb/common/open.c | 17 +-------- lib/tdb/common/tdb.c | 16 +++++++- lib/tdb/common/tdb_private.h | 2 +- lib/tdb/configure.ac | 2 +- 7 files changed, 91 insertions(+), 40 deletions(-) Changeset truncated at 500 lines: diff --git a/Makefile.in b/Makefile.in index d2e0dea..15231b6 100755 --- a/Makefile.in +++ b/Makefile.in @@ -9,7 +9,7 @@ prefix = @prefix@ exec_prefix = @exec_prefix@ datarootdir = @datarootdir@ includedir = @includedir@ -docdir = /usr/share/doc +docdir = @docdir@ libdir = @libdir@ bindir = @bindir@ sbindir = @sbindir@ diff --git a/lib/tdb/common/check.c b/lib/tdb/common/check.c index 6bbfd7d..2c64043 100644 --- a/lib/tdb/common/check.c +++ b/lib/tdb/common/check.c @@ -311,7 +311,7 @@ int tdb_check(struct tdb_context *tdb, struct tdb_record rec; bool found_recovery = false; - if (tdb_lockall(tdb) == -1) + if (tdb_lockall_read(tdb) == -1) return -1; /* Make sure we know true size of the underlying file. */ @@ -412,12 +412,12 @@ int tdb_check(struct tdb_context *tdb, } free(hashes); - tdb_unlockall(tdb); + tdb_unlockall_read(tdb); return 0; free: free(hashes); unlock: - tdb_unlockall(tdb); + tdb_unlockall_read(tdb); return -1; } diff --git a/lib/tdb/common/lock.c b/lib/tdb/common/lock.c index 285b7a3..803feee 100644 --- a/lib/tdb/common/lock.c +++ b/lib/tdb/common/lock.c @@ -152,14 +152,6 @@ int tdb_brlock(struct tdb_context *tdb, return -1; } - /* Sanity check */ - if (tdb->transaction && offset >= lock_offset(-1) && len != 0) { - tdb->ecode = TDB_ERR_RDONLY; - TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_brlock attempted in transaction at offset %d rw_type=%d flags=%d len=%d\n", - offset, rw_type, flags, (int)len)); - return -1; - } - do { ret = fcntl_lock(tdb, rw_type, offset, len, flags & TDB_LOCK_WAIT); @@ -486,11 +478,9 @@ int tdb_transaction_unlock(struct tdb_context *tdb, int ltype) return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false); } - -/* lock/unlock entire database. It can only be upgradable if you have some - * other way of guaranteeing exclusivity (ie. transaction write lock). */ -int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, - enum tdb_lock_flags flags, bool upgradable) +/* Returns 0 if all done, -1 if error, 1 if ok. */ +static int tdb_allrecord_check(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags flags, bool upgradable) { /* There are no locks on read-only dbs */ if (tdb->read_only || tdb->traverse_read) { @@ -520,11 +510,73 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, tdb->ecode = TDB_ERR_LOCK; return -1; } + return 1; +} - if (tdb_brlock(tdb, ltype, FREELIST_TOP, 0, flags)) { - if (flags & TDB_LOCK_WAIT) { - TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lockall failed (%s)\n", strerror(errno))); - } +/* We only need to lock individual bytes, but Linux merges consecutive locks + * so we lock in contiguous ranges. */ +static int tdb_chainlock_gradual(struct tdb_context *tdb, + int ltype, enum tdb_lock_flags flags, + size_t off, size_t len) +{ + int ret; + enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT); + + if (len <= 4) { + /* Single record. Just do blocking lock. */ + return tdb_brlock(tdb, ltype, off, len, flags); + } + + /* First we try non-blocking. */ + ret = tdb_brlock(tdb, ltype, off, len, nb_flags); + if (ret == 0) { + return 0; + } + + /* Try locking first half, then second. */ + ret = tdb_chainlock_gradual(tdb, ltype, flags, off, len / 2); + if (ret == -1) + return -1; + + ret = tdb_chainlock_gradual(tdb, ltype, flags, + off + len / 2, len - len / 2); + if (ret == -1) { + tdb_brunlock(tdb, ltype, off, len / 2); + return -1; + } + return 0; +} + +/* lock/unlock entire database. It can only be upgradable if you have some + * other way of guaranteeing exclusivity (ie. transaction write lock). + * We do the locking gradually to avoid being starved by smaller locks. */ +int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags flags, bool upgradable) +{ + switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) { + case -1: + return -1; + case 0: + return 0; + } + + /* We cover two kinds of locks: + * 1) Normal chain locks. Taken for almost all operations. + * 3) Individual records locks. Taken after normal or free + * chain locks. + * + * It is (1) which cause the starvation problem, so we're only + * gradual for that. */ + if (tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP, + tdb->header.hash_size * 4) == -1) { + return -1; + } + + /* Grab individual record locks. */ + if (tdb_brlock(tdb, ltype, lock_offset(tdb->header.hash_size), 0, + flags) == -1) { + tdb_brunlock(tdb, ltype, FREELIST_TOP, + tdb->header.hash_size * 4); return -1; } diff --git a/lib/tdb/common/open.c b/lib/tdb/common/open.c index dfe780d..7687ff6 100644 --- a/lib/tdb/common/open.c +++ b/lib/tdb/common/open.c @@ -51,7 +51,6 @@ static int tdb_new_database(struct tdb_context *tdb, int hash_size) struct tdb_header *newdb; size_t size; int ret = -1; - ssize_t written; /* We make it up in memory, then write it out if not internal */ size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off_t); @@ -83,22 +82,8 @@ static int tdb_new_database(struct tdb_context *tdb, int hash_size) /* Don't endian-convert the magic food! */ memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1); /* we still have "ret == -1" here */ - written = write(tdb->fd, newdb, size); - if (written == size) { + if (tdb_write_all(tdb->fd, newdb, size)) ret = 0; - } else if (written != -1) { - /* call write once again, this usually should return -1 and - * set errno appropriately */ - size -= written; - written = write(tdb->fd, newdb+written, size); - if (written == size) { - ret = 0; - } else if (written >= 0) { - /* a second incomplete write - we give up. - * guessing the errno... */ - errno = ENOSPC; - } - } fail: SAFE_FREE(newdb); diff --git a/lib/tdb/common/tdb.c b/lib/tdb/common/tdb.c index dac3f4e..4d8c5fc 100644 --- a/lib/tdb/common/tdb.c +++ b/lib/tdb/common/tdb.c @@ -989,10 +989,24 @@ int tdb_repack(struct tdb_context *tdb) return 0; } +/* Even on files, we can get partial writes due to signals. */ +bool tdb_write_all(int fd, const void *buf, size_t count) +{ + while (count) { + size_t ret; + ret = write(fd, buf, count); + if (ret < 0) + return false; + buf = (const char *)buf + ret; + count -= ret; + } + return true; +} + #ifdef TDB_TRACE static void tdb_trace_write(struct tdb_context *tdb, const char *str) { - if (write(tdb->tracefd, str, strlen(str)) != strlen(str)) { + if (!tdb_write_alltdb->tracefd, str, strlen(str)) { close(tdb->tracefd); tdb->tracefd = -1; } diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h index e216713..9d0f3bc 100644 --- a/lib/tdb/common/tdb_private.h +++ b/lib/tdb/common/tdb_private.h @@ -266,5 +266,5 @@ void tdb_io_init(struct tdb_context *tdb); int tdb_expand(struct tdb_context *tdb, tdb_off_t size); int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct tdb_record *rec); - +bool tdb_write_all(int fd, const void *buf, size_t count); int tdb_transaction_recover(struct tdb_context *tdb); diff --git a/lib/tdb/configure.ac b/lib/tdb/configure.ac index 686b0a6..9b87227 100644 --- a/lib/tdb/configure.ac +++ b/lib/tdb/configure.ac @@ -2,7 +2,7 @@ AC_PREREQ(2.50) AC_DEFUN([SMB_MODULE_DEFAULT], [echo -n ""]) AC_DEFUN([SMB_LIBRARY_ENABLE], [echo -n ""]) AC_DEFUN([SMB_ENABLE], [echo -n ""]) -AC_INIT(tdb, 1.2.2) +AC_INIT(tdb, 1.2.3) AC_CONFIG_SRCDIR([common/tdb.c]) AC_CONFIG_HEADER(include/config.h) AC_LIBREPLACE_ALL_CHECKS -- CTDB repository