The branch, v4-0-test has been updated via ed0c3a0f74c305b3b8554b05c3f97cf79db8296a (commit) from 84dc6bc19c166e42c5eca2949e4ef5024c80f513 (commit)
http://gitweb.samba.org/?p=samba.git;a=shortlog;h=v4-0-test - Log ----------------------------------------------------------------- commit ed0c3a0f74c305b3b8554b05c3f97cf79db8296a Author: Andrew Tridgell <[EMAIL PROTECTED]> Date: Tue Jan 15 14:05:47 2008 +1100 merged tdb from ctdb bzr tree ----------------------------------------------------------------------- Summary of changes: source/lib/tdb/common/freelist.c | 101 +++++++--- source/lib/tdb/common/io.c | 20 +- source/lib/tdb/common/lock.c | 7 + source/lib/tdb/common/open.c | 11 +- source/lib/tdb/common/tdb.c | 119 ++++++++++- source/lib/tdb/common/tdb_private.h | 7 +- source/lib/tdb/common/transaction.c | 408 ++++++++++++++++++++-------------- source/lib/tdb/common/traverse.c | 3 + source/lib/tdb/include/tdb.h | 5 + 9 files changed, 471 insertions(+), 210 deletions(-) Changeset truncated at 500 lines: diff --git a/source/lib/tdb/common/freelist.c b/source/lib/tdb/common/freelist.c index b109643..c086c15 100644 --- a/source/lib/tdb/common/freelist.c +++ b/source/lib/tdb/common/freelist.c @@ -27,6 +27,12 @@ #include "tdb_private.h" +/* 'right' merges can involve O(n^2) cost when combined with a + traverse, so they are disabled until we find a way to do them in + O(1) time +*/ +#define USE_RIGHT_MERGES 0 + /* read a freelist record and check for simple errors */ int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct list_struct *rec) { @@ -56,7 +62,7 @@ int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct list_struct } - +#if USE_RIGHT_MERGES /* Remove an element from the freelist. Must have alloc lock. */ static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_t next) { @@ -75,6 +81,7 @@ static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_ TDB_LOG((tdb, TDB_DEBUG_FATAL,"remove_from_freelist: not on list at off=%d\n", off)); return TDB_ERRCODE(TDB_ERR_CORRUPT, -1); } +#endif /* update a record tailer (must hold allocation lock) */ @@ -93,8 +100,6 @@ static int update_tailer(struct tdb_context *tdb, tdb_off_t offset, neccessary. */ int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec) { - tdb_off_t right, left; - /* Allocation and tailer lock */ if (tdb_lock(tdb, -1, F_WRLCK) != 0) return -1; @@ -105,9 +110,10 @@ int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec) goto fail; } +#if USE_RIGHT_MERGES /* Look right first (I'm an Australian, dammit) */ - right = offset + sizeof(*rec) + rec->rec_len; - if (right + sizeof(*rec) <= tdb->map_size) { + if (offset + sizeof(*rec) + rec->rec_len + sizeof(*rec) <= tdb->map_size) { + tdb_off_t right = offset + sizeof(*rec) + rec->rec_len; struct list_struct r; if (tdb->methods->tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) { @@ -122,13 +128,18 @@ int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec) goto left; } rec->rec_len += sizeof(r) + r.rec_len; + if (update_tailer(tdb, offset, rec) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset)); + goto fail; + } } } - left: +#endif + /* Look left */ - left = offset - sizeof(tdb_off_t); - if (left > TDB_DATA_START(tdb->header.hash_size)) { + if (offset - sizeof(tdb_off_t) > TDB_DATA_START(tdb->header.hash_size)) { + tdb_off_t left = offset - sizeof(tdb_off_t); struct list_struct l; tdb_off_t leftsize; @@ -145,7 +156,12 @@ left: left = offset - leftsize; - /* Now read in record */ + if (leftsize > offset || + left < TDB_DATA_START(tdb->header.hash_size)) { + goto update; + } + + /* Now read in the left record */ if (tdb->methods->tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) { TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left read failed at %u (%u)\n", left, leftsize)); goto update; @@ -153,21 +169,24 @@ left: /* If it's free, expand to include it. */ if (l.magic == TDB_FREE_MAGIC) { - if (remove_from_freelist(tdb, left, l.next) == -1) { - TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left free failed at %u\n", left)); - goto update; - } else { - offset = left; - rec->rec_len += leftsize; + /* we now merge the new record into the left record, rather than the other + way around. This makes the operation O(1) instead of O(n). This change + prevents traverse from being O(n^2) after a lot of deletes */ + l.rec_len += sizeof(*rec) + rec->rec_len; + if (tdb_rec_write(tdb, left, &l) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_left failed at %u\n", left)); + goto fail; } + if (update_tailer(tdb, left, &l) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset)); + goto fail; + } + tdb_unlock(tdb, -1, F_WRLCK); + return 0; } } update: - if (update_tailer(tdb, offset, rec) == -1) { - TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset)); - goto fail; - } /* Now, prepend to free list */ rec->magic = TDB_FREE_MAGIC; @@ -261,6 +280,7 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_st tdb_off_t rec_ptr, last_ptr; tdb_len_t rec_len; } bestfit; + float multiplier = 1.0; if (tdb_lock(tdb, -1, F_WRLCK) == -1) return 0; @@ -295,18 +315,27 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_st bestfit.rec_len = rec->rec_len; bestfit.rec_ptr = rec_ptr; bestfit.last_ptr = last_ptr; - /* consider a fit to be good enough if - we aren't wasting more than half - the space */ - if (bestfit.rec_len < 2*length) { - break; - } } } /* move to the next record */ last_ptr = rec_ptr; rec_ptr = rec->next; + + /* if we've found a record that is big enough, then + stop searching if its also not too big. The + definition of 'too big' changes as we scan + through */ + if (bestfit.rec_len > 0 && + bestfit.rec_len < length * multiplier) { + break; + } + + /* this multiplier means we only extremely rarely + search more than 50 or so records. At 50 records we + accept records up to 11 times larger than what we + want */ + multiplier *= 1.05; } if (bestfit.rec_ptr != 0) { @@ -328,3 +357,25 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_st return 0; } + + +/* + return the size of the freelist - used to decide if we should repack +*/ +int tdb_freelist_size(struct tdb_context *tdb) +{ + tdb_off_t ptr; + int count=0; + + if (tdb_lock(tdb, -1, F_RDLCK) == -1) { + return -1; + } + + ptr = FREELIST_TOP; + while (tdb_ofs_read(tdb, ptr, &ptr) == 0 && ptr != 0) { + count++; + } + + tdb_unlock(tdb, -1, F_RDLCK); + return count; +} diff --git a/source/lib/tdb/common/io.c b/source/lib/tdb/common/io.c index 8ab0768..172ab69 100644 --- a/source/lib/tdb/common/io.c +++ b/source/lib/tdb/common/io.c @@ -101,8 +101,8 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off, off+written); } if (written == -1) { - /* Ensure ecode is set for log fn. */ - tdb->ecode = TDB_ERR_IO; + /* Ensure ecode is set for log fn. */ + tdb->ecode = TDB_ERR_IO; TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d " "len=%d (%s)\n", off, len, strerror(errno))); return TDB_ERRCODE(TDB_ERR_IO, -1); @@ -111,8 +111,8 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off, "write %d bytes at %d in two attempts\n", len, off)); errno = ENOSPC; - return TDB_ERRCODE(TDB_ERR_IO, -1); - } + return TDB_ERRCODE(TDB_ERR_IO, -1); + } } return 0; } @@ -230,7 +230,7 @@ void tdb_mmap(struct tdb_context *tdb) says to use for mmap expansion */ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition) { - char buf[1024]; + char buf[8192]; if (tdb->read_only || tdb->traverse_read) { tdb->ecode = TDB_ERR_RDONLY; @@ -294,7 +294,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t ad int tdb_expand(struct tdb_context *tdb, tdb_off_t size) { struct list_struct rec; - tdb_off_t offset; + tdb_off_t offset, new_size; if (tdb_lock(tdb, -1, F_WRLCK) == -1) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n")); @@ -304,9 +304,11 @@ int tdb_expand(struct tdb_context *tdb, tdb_off_t size) /* must know about any previous expansions by another process */ tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1); - /* always make room for at least 10 more records, and round - the database up to a multiple of the page size */ - size = TDB_ALIGN(tdb->map_size + size*10, tdb->page_size) - tdb->map_size; + /* always make room for at least 100 more records, and at + least 25% more space. Round the database up to a multiple + of the page size */ + new_size = MAX(tdb->map_size + size*100, tdb->map_size * 1.25); + size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size; if (!(tdb->flags & TDB_INTERNAL)) tdb_munmap(tdb); diff --git a/source/lib/tdb/common/lock.c b/source/lib/tdb/common/lock.c index e3fe888..f156c0f 100644 --- a/source/lib/tdb/common/lock.c +++ b/source/lib/tdb/common/lock.c @@ -505,6 +505,9 @@ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key) /* record lock stops delete underneath */ int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off) { + if (tdb->global_lock.count) { + return 0; + } return off ? tdb->methods->tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0, 1) : 0; } @@ -537,6 +540,10 @@ int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off) struct tdb_traverse_lock *i; uint32_t count = 0; + if (tdb->global_lock.count) { + return 0; + } + if (off == 0) return 0; for (i = &tdb->travlocks; i; i = i->next) diff --git a/source/lib/tdb/common/open.c b/source/lib/tdb/common/open.c index 0bd1c91..6bd8fda 100644 --- a/source/lib/tdb/common/open.c +++ b/source/lib/tdb/common/open.c @@ -35,7 +35,7 @@ static struct tdb_context *tdbs = NULL; static unsigned int default_tdb_hash(TDB_DATA *key) { uint32_t value; /* Used to compute the hash value. */ - uint32_t i; /* Used to cycle through random values. */ + uint32_t i; /* Used to cycle through random values. */ /* Set the initial value from the key size. */ for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++) @@ -90,7 +90,7 @@ static int tdb_new_database(struct tdb_context *tdb, int hash_size) size -= written; written = write(tdb->fd, newdb+written, size); if (written == size) { - ret = 0; + ret = 0; } else if (written >= 0) { /* a second incomplete write - we give up. * guessing the errno... */ @@ -152,6 +152,7 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, int rev = 0, locked = 0; unsigned char *vp; uint32_t vertest; + unsigned v; if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) { /* Can't log this */ @@ -215,6 +216,10 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, goto fail; /* errno set by open(2) */ } + /* on exec, don't inherit the fd */ + v = fcntl(tdb->fd, F_GETFD, 0); + fcntl(tdb->fd, F_SETFD, v | FD_CLOEXEC); + /* ensure there is only one process initialising at once */ if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) { TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to get global lock on %s: %s\n", @@ -242,7 +247,7 @@ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags, /* its not a valid database - possibly initialise it */ if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) { if (errno == 0) { - errno = EIO; /* ie bad format or something */ + errno = EIO; /* ie bad format or something */ } goto fail; } diff --git a/source/lib/tdb/common/tdb.c b/source/lib/tdb/common/tdb.c index 0e9d1db..fd4e1cc 100644 --- a/source/lib/tdb/common/tdb.c +++ b/source/lib/tdb/common/tdb.c @@ -102,8 +102,7 @@ static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, } /* As tdb_find, but if you succeed, keep the lock */ -tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, - uint32_t hash, int locktype, +tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype, struct list_struct *rec) { uint32_t rec_ptr; @@ -237,14 +236,15 @@ int tdb_exists(struct tdb_context *tdb, TDB_DATA key) } /* actually delete an entry in the database given the offset */ -int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct*rec) +int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct *rec) { tdb_off_t last_ptr, i; struct list_struct lastrec; if (tdb->read_only || tdb->traverse_read) return -1; - if (tdb_write_lock_record(tdb, rec_ptr) == -1) { + if (tdb->traverse_write != 0 || + tdb_write_lock_record(tdb, rec_ptr) == -1) { /* Someone traversing here: mark it as dead */ rec->magic = TDB_DEAD_MAGIC; return tdb_rec_write(tdb, rec_ptr, rec); @@ -666,6 +666,16 @@ int tdb_get_flags(struct tdb_context *tdb) return tdb->flags; } +void tdb_add_flags(struct tdb_context *tdb, unsigned flags) +{ + tdb->flags |= flags; +} + +void tdb_remove_flags(struct tdb_context *tdb, unsigned flags) +{ + tdb->flags &= ~flags; +} + /* enable sequence number handling on an open tdb @@ -674,3 +684,104 @@ void tdb_enable_seqnum(struct tdb_context *tdb) { tdb->flags |= TDB_SEQNUM; } + + +/* + wipe the entire database, deleting all records. This can be done + very fast by using a global lock. The entire data portion of the + file becomes a single entry in the freelist. + */ +int tdb_wipe_all(struct tdb_context *tdb) +{ + int i; + tdb_off_t offset = 0; + ssize_t data_len; + + if (tdb_lockall(tdb) != 0) { + return -1; + } + + /* wipe the hashes */ + for (i=0;i<tdb->header.hash_size;i++) { + if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i)); + goto failed; + } + } + + /* wipe the freelist */ + if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n")); + goto failed; + } + + if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &offset) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write recovery head\n")); + goto failed; + } + + /* add all the rest of the file to the freelist */ + data_len = (tdb->map_size - TDB_DATA_START(tdb->header.hash_size)) - sizeof(struct list_struct); + if (data_len > 0) { + struct list_struct rec; + memset(&rec,'\0',sizeof(rec)); + rec.rec_len = data_len; + if (tdb_free(tdb, TDB_DATA_START(tdb->header.hash_size), &rec) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to add free record\n")); + goto failed; + } + } + + if (tdb_unlockall(tdb) != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n")); + goto failed; + } + + return 0; + +failed: + tdb_unlockall(tdb); + return -1; +} + + +/* + validate the integrity of all tdb hash chains. Useful when debugging + */ +int tdb_validate(struct tdb_context *tdb) +{ + int h; + for (h=-1;h<(int)tdb->header.hash_size;h++) { + tdb_off_t rec_ptr; + uint32_t count = 0; + if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &rec_ptr) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_validate: failed ofs_read at top of hash %d\n", h)); + return -1; + } + while (rec_ptr) { + struct list_struct r; + tdb_off_t size; + + if (tdb_rec_read(tdb, rec_ptr, &r) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_validate: failed rec_read h=%d rec_ptr=%u count=%u\n", + h, rec_ptr, count)); + return -1; + } + if (tdb_ofs_read(tdb, rec_ptr + sizeof(r) + r.rec_len - sizeof(tdb_off_t), &size) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_validate: failed ofs_read h=%d rec_ptr=%u count=%u\n", + h, rec_ptr, count)); + return -1; + } + if (size != r.rec_len + sizeof(r)) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_validate: failed size check size=%u h=%d rec_ptr=%u count=%u\n", + size, h, rec_ptr, count)); + return -1; + } + rec_ptr = r.next; + count++; + } + } + return 0; +} + + diff --git a/source/lib/tdb/common/tdb_private.h b/source/lib/tdb/common/tdb_private.h index 00bd0eb..63a6d04 100644 --- a/source/lib/tdb/common/tdb_private.h +++ b/source/lib/tdb/common/tdb_private.h @@ -38,6 +38,10 @@ typedef uint32_t tdb_len_t; typedef uint32_t tdb_off_t; +#ifndef offsetof +#define offsetof(t,f) ((unsigned int)&((t *)0)->f) +#endif + #define TDB_MAGIC_FOOD "TDB file\n" #define TDB_VERSION (0x26011967 + 6) #define TDB_MAGIC (0x26011999U) @@ -54,7 +58,7 @@ typedef uint32_t tdb_off_t; #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r)) -- Samba Shared Repository