From: Mike Day <ncm...@ncultra.org> Allow "unlocked" reads of the ram_list by using an RCU-enabled DQ. Most readers of the list no longer require holding the list mutex.
The ram_list now uses a QLIST instead of a QTAILQ. The difference is minimal. Signed-off-by: Mike Day <ncm...@ncultra.org> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com> --- arch_init.c | 62 +++++++++++++++++--------- exec.c | 118 +++++++++++++++++++++++++++++-------------------- include/exec/cpu-all.h | 7 +-- 3 files changed, 113 insertions(+), 74 deletions(-) diff --git a/arch_init.c b/arch_init.c index 757632b..2051673 100644 --- a/arch_init.c +++ b/arch_init.c @@ -52,6 +52,7 @@ #include "exec/ram_addr.h" #include "hw/acpi/acpi.h" #include "qemu/host-utils.h" +#include "qemu/rcu_queue.h" #ifdef DEBUG_ARCH_INIT #define DPRINTF(fmt, ...) \ @@ -523,9 +524,12 @@ static void migration_bitmap_sync(void) trace_migration_bitmap_sync_start(); address_space_sync_dirty_bitmap(&address_space_memory); - QLIST_FOREACH(block, &ram_list.blocks, next) { + rcu_read_lock(); + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { migration_bitmap_sync_range(block->mr->ram_addr, block->used_length); } + rcu_read_unlock(); + trace_migration_bitmap_sync_end(migration_dirty_pages - num_dirty_pages_init); num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init; @@ -648,6 +652,8 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset, /* * ram_find_and_save_block: Finds a page to send and sends it to f * + * Called within an RCU critical section. + * * Returns: The number of bytes written. * 0 means no dirty pages */ @@ -661,7 +667,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage) MemoryRegion *mr; if (!block) - block = QLIST_FIRST(&ram_list.blocks); + block = QLIST_FIRST_RCU(&ram_list.blocks); while (true) { mr = block->mr; @@ -672,9 +678,9 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage) } if (offset >= block->used_length) { offset = 0; - block = QLIST_NEXT(block, next); + block = QLIST_NEXT_RCU(block, next); if (!block) { - block = QLIST_FIRST(&ram_list.blocks); + block = QLIST_FIRST_RCU(&ram_list.blocks); complete_round = true; ram_bulk_stage = false; } @@ -728,10 +734,10 @@ uint64_t ram_bytes_total(void) RAMBlock *block; uint64_t total = 0; - QLIST_FOREACH(block, &ram_list.blocks, next) { + rcu_read_lock(); + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) total += block->used_length; - } - + rcu_read_unlock(); return total; } @@ -777,6 +783,13 @@ static void reset_ram_globals(void) #define MAX_WAIT 50 /* ms, half buffered_file limit */ + +/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has + * long-running RCU critical section. When rcu-reclaims in the code + * start to become numerous it will be necessary to reduce the + * granularity of these critical sections. + */ + static int ram_save_setup(QEMUFile *f, void *opaque) { RAMBlock *block; @@ -818,7 +831,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } qemu_mutex_lock_iothread(); - qemu_mutex_lock_ramlist(); + rcu_read_lock(); bytes_transferred = 0; reset_ram_globals(); @@ -831,7 +844,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) * gaps due to alignment or unplugs. */ migration_dirty_pages = 0; - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { uint64_t block_pages; block_pages = block->used_length >> TARGET_PAGE_BITS; @@ -844,13 +857,13 @@ static int ram_save_setup(QEMUFile *f, void *opaque) qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE); - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { qemu_put_byte(f, strlen(block->idstr)); qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); qemu_put_be64(f, block->used_length); } - qemu_mutex_unlock_ramlist(); + rcu_read_unlock(); ram_control_before_iterate(f, RAM_CONTROL_SETUP); ram_control_after_iterate(f, RAM_CONTROL_SETUP); @@ -867,9 +880,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) int64_t t0; int total_sent = 0; - qemu_mutex_lock_ramlist(); - - if (ram_list.version != last_version) { + rcu_read_lock(); + if (atomic_rcu_read(&ram_list.version) != last_version) { reset_ram_globals(); } @@ -903,8 +915,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) } i++; } - - qemu_mutex_unlock_ramlist(); + rcu_read_unlock(); /* * Must occur before EOS (or any QEMUFile operation) @@ -931,7 +942,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) static int ram_save_complete(QEMUFile *f, void *opaque) { - qemu_mutex_lock_ramlist(); + rcu_read_lock(); + migration_bitmap_sync(); ram_control_before_iterate(f, RAM_CONTROL_FINISH); @@ -953,7 +965,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) ram_control_after_iterate(f, RAM_CONTROL_FINISH); migration_end(); - qemu_mutex_unlock_ramlist(); + rcu_read_unlock(); qemu_put_be64(f, RAM_SAVE_FLAG_EOS); return 0; @@ -1009,6 +1021,9 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) return 0; } +/* Must be called from within a rcu critical section. + * Returns a pointer from within the RCU-protected ram_list. + */ static inline void *host_from_stream_offset(QEMUFile *f, ram_addr_t offset, int flags) @@ -1030,7 +1045,7 @@ static inline void *host_from_stream_offset(QEMUFile *f, qemu_get_buffer(f, (uint8_t *)id, len); id[len] = 0; - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { if (!strncmp(id, block->idstr, sizeof(id)) && block->max_length > offset) { return memory_region_get_ram_ptr(block->mr) + offset; @@ -1063,6 +1078,12 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) ret = -EINVAL; } + /* This RCU critical section can be very long running. + * When RCU reclaims in the code start to become numerous, + * it will be necessary to reduce the granularity of this + * critical section. + */ + rcu_read_lock(); while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { ram_addr_t addr, total_ram_bytes; void *host; @@ -1087,7 +1108,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) id[len] = 0; length = qemu_get_be64(f); - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { if (!strncmp(id, block->idstr, sizeof(id))) { if (length != block->used_length) { Error *local_err = NULL; @@ -1161,6 +1182,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) } } + rcu_read_unlock(); DPRINTF("Completed load of VM with exit code %d seq iteration " "%" PRIu64 "\n", ret, seq_iter); return ret; diff --git a/exec.c b/exec.c index 6f88a86..1c70662 100644 --- a/exec.c +++ b/exec.c @@ -44,7 +44,7 @@ #include "trace.h" #endif #include "exec/cpu-all.h" - +#include "qemu/rcu_queue.h" #include "exec/cputlb.h" #include "translate-all.h" @@ -58,6 +58,9 @@ #if !defined(CONFIG_USER_ONLY) static bool in_migration; +/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes + * are protected by a lock, currently the iothread lock. + */ RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) }; static MemoryRegion *system_memory; @@ -426,7 +429,6 @@ address_space_translate_for_iotlb(CPUState *cpu, hwaddr addr, void cpu_exec_init_all(void) { #if !defined(CONFIG_USER_ONLY) - qemu_mutex_init(&ram_list.mutex); memory_map_init(); io_mem_init(); #endif @@ -806,16 +808,16 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...) } #if !defined(CONFIG_USER_ONLY) +/* Called from RCU critical section */ static RAMBlock *qemu_get_ram_block(ram_addr_t addr) { RAMBlock *block; - /* The list is protected by the iothread lock here. */ block = atomic_rcu_read(&ram_list.mru_block); if (block && addr - block->offset < block->max_length) { goto found; } - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { if (addr - block->offset < block->max_length) { goto found; } @@ -854,10 +856,12 @@ static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length) end = TARGET_PAGE_ALIGN(start + length); start &= TARGET_PAGE_MASK; + rcu_read_lock(); block = qemu_get_ram_block(start); assert(block == qemu_get_ram_block(end - 1)); start1 = (uintptr_t)ramblock_ptr(block, start - block->offset); cpu_tlb_reset_dirty_all(start1, length); + rcu_read_unlock(); } /* Note: start and end must be within the same ram block. */ @@ -1061,16 +1065,6 @@ void qemu_flush_coalesced_mmio_buffer(void) kvm_flush_coalesced_mmio_buffer(); } -void qemu_mutex_lock_ramlist(void) -{ - qemu_mutex_lock(&ram_list.mutex); -} - -void qemu_mutex_unlock_ramlist(void) -{ - qemu_mutex_unlock(&ram_list.mutex); -} - #ifdef __linux__ #include <sys/vfs.h> @@ -1190,6 +1184,7 @@ error: } #endif +/* Called with the iothread lock held, to protect ram_list. */ static ram_addr_t find_ram_offset(ram_addr_t size) { RAMBlock *block, *next_block; @@ -1197,16 +1192,16 @@ static ram_addr_t find_ram_offset(ram_addr_t size) assert(size != 0); /* it would hand out same offset multiple times */ - if (QLIST_EMPTY(&ram_list.blocks)) { + if (QLIST_EMPTY_RCU(&ram_list.blocks)) { return 0; } - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { ram_addr_t end, next = RAM_ADDR_MAX; end = block->offset + block->max_length; - QLIST_FOREACH(next_block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) { if (next_block->offset >= end) { next = MIN(next, next_block->offset); } @@ -1231,9 +1226,11 @@ ram_addr_t last_ram_offset(void) RAMBlock *block; ram_addr_t last = 0; - QLIST_FOREACH(block, &ram_list.blocks, next) { + rcu_read_lock(); + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { last = MAX(last, block->offset + block->max_length); } + rcu_read_unlock(); return last; } @@ -1253,11 +1250,14 @@ static void qemu_ram_setup_dump(void *addr, ram_addr_t size) } } +/* Called within an RCU critical section, or while the iothread lock + * is held. + */ static RAMBlock *find_ram_block(ram_addr_t addr) { RAMBlock *block; - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { if (block->offset == addr) { return block; } @@ -1266,6 +1266,7 @@ static RAMBlock *find_ram_block(ram_addr_t addr) return NULL; } +/* Called with the iothread lock held, to protect ram_list. */ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev) { RAMBlock *new_block = find_ram_block(addr); @@ -1283,18 +1284,16 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev) } pstrcat(new_block->idstr, sizeof(new_block->idstr), name); - /* This assumes the iothread lock is taken here too. */ - qemu_mutex_lock_ramlist(); - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { if (block != new_block && !strcmp(block->idstr, new_block->idstr)) { fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n", new_block->idstr); abort(); } } - qemu_mutex_unlock_ramlist(); } +/* Called with the iothread lock held, to protect ram_list. */ void qemu_ram_unset_idstr(ram_addr_t addr) { RAMBlock *block = find_ram_block(addr); @@ -1357,6 +1356,7 @@ int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp) return 0; } +/* Called with the iothread lock held, to protect ram_list. */ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp) { RAMBlock *block; @@ -1365,8 +1365,6 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp) old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS; - /* This assumes the iothread lock is taken here too. */ - qemu_mutex_lock_ramlist(); new_block->offset = find_ram_offset(new_block->max_length); if (!new_block->host) { @@ -1380,7 +1378,6 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp) error_setg_errno(errp, errno, "cannot set up guest memory '%s'", memory_region_name(new_block->mr)); - qemu_mutex_unlock_ramlist(); return -1; } memory_try_enable_merging(new_block->host, new_block->max_length); @@ -1391,24 +1388,22 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp) * QLIST (which has an RCU-friendly variant) does not have insertion at * tail, so save the last element in last_block. */ - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { last_block = block; if (block->max_length < new_block->max_length) { break; } } if (block) { - QLIST_INSERT_BEFORE(block, new_block, next); + QLIST_INSERT_BEFORE_RCU(block, new_block, next); } else if (last_block) { - QLIST_INSERT_AFTER(last_block, new_block, next); + QLIST_INSERT_AFTER_RCU(last_block, new_block, next); } else { /* list is empty */ - QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next); + QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next); } ram_list.mru_block = NULL; atomic_rcu_set(&ram_list.version, ram_list.version + 1); - qemu_mutex_unlock_ramlist(); - new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS; if (new_ram_size > old_ram_size) { @@ -1436,6 +1431,7 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp) } #ifdef __linux__ +/* Called with the iothread lock held, to protect ram_list. */ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, bool share, const char *mem_path, Error **errp) @@ -1483,6 +1479,7 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, } #endif +/* Called with the iothread lock held, to protect ram_list. */ static ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size, void (*resized)(const char*, @@ -1520,12 +1517,14 @@ ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size, return addr; } +/* Called with the iothread lock held, to protect ram_list. */ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, MemoryRegion *mr, Error **errp) { return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp); } +/* Called with the iothread lock held, to protect ram_list. */ ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp) { return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp); @@ -1540,15 +1539,14 @@ ram_addr_t qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz, return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp); } +/* Called with the iothread lock held, to protect ram_list. */ void qemu_ram_free_from_ptr(ram_addr_t addr) { RAMBlock *block; - /* This assumes the iothread lock is taken here too. */ - qemu_mutex_lock_ramlist(); - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { if (addr == block->offset) { - QLIST_REMOVE(block, next); + QLIST_REMOVE_RCU(block, next); ram_list.mru_block = NULL; atomic_rcu_set(&ram_list.version, ram_list.version + 1); call_rcu(block, (void (*)(struct RAMBlock *))g_free, rcu); @@ -1579,19 +1577,15 @@ void qemu_ram_free(ram_addr_t addr) { RAMBlock *block; - /* This assumes the iothread lock is taken here too. */ - qemu_mutex_lock_ramlist(); - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { if (addr == block->offset) { - QLIST_REMOVE(block, next); + QLIST_REMOVE_RCU(block, next); ram_list.mru_block = NULL; atomic_rcu_set(&ram_list.version, ram_list.version + 1); call_rcu(block, reclaim_ramblock, rcu); break; } } - qemu_mutex_unlock_ramlist(); - } #ifndef _WIN32 @@ -1602,7 +1596,7 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) int flags; void *area, *vaddr; - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { offset = addr - block->offset; if (offset < block->max_length) { vaddr = ramblock_ptr(block, offset); @@ -1649,8 +1643,10 @@ int qemu_get_ram_fd(ram_addr_t addr) RAMBlock *block; int fd; + rcu_read_lock(); block = qemu_get_ram_block(addr); fd = atomic_rcu_read(&block->fd); + rcu_read_unlock(); return fd; } @@ -1659,8 +1655,10 @@ void *qemu_get_ram_block_host_ptr(ram_addr_t addr) RAMBlock *block; void *ptr; + rcu_read_lock(); block = qemu_get_ram_block(addr); ptr = ramblock_ptr(block, 0); + rcu_read_unlock(); return ptr; } @@ -1668,12 +1666,19 @@ void *qemu_get_ram_block_host_ptr(ram_addr_t addr) * This should not be used for general purpose DMA. Use address_space_map * or address_space_rw instead. For local memory (e.g. video ram) that the * device owns, use memory_region_get_ram_ptr. + * + * By the time this function returns, the returned pointer is not protected + * by RCU anymore. If the caller is not within an RCU critical section and + * does not hold the iothread lock, it must have other means of protecting the + * pointer, such as a reference to the region that includes the incoming + * ram_addr_t. */ void *qemu_get_ram_ptr(ram_addr_t addr) { RAMBlock *block; void *ptr; + rcu_read_lock(); block = qemu_get_ram_block(addr); if (xen_enabled() && block->host == NULL) { @@ -1683,19 +1688,26 @@ void *qemu_get_ram_ptr(ram_addr_t addr) */ if (block->offset == 0) { ptr = xen_map_cache(addr, 0, 0); - goto done; + goto unlock; } block->host = xen_map_cache(block->offset, block->max_length, 1); } ptr = ramblock_ptr(block, addr - block->offset); -done: +unlock: + rcu_read_unlock(); return ptr; } /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr * but takes a size argument. + * + * By the time this function returns, the returned pointer is not protected + * by RCU anymore. If the caller is not within an RCU critical section and + * does not hold the iothread lock, it must have other means of protecting the + * pointer, such as a reference to the region that includes the incoming + * ram_addr_t. */ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size) { @@ -1707,11 +1719,13 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size) return xen_map_cache(addr, *size, 1); } else { RAMBlock *block; - QLIST_FOREACH(block, &ram_list.blocks, next) { + rcu_read_lock(); + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { if (addr - block->offset < block->max_length) { if (addr - block->offset + *size > block->max_length) *size = block->max_length - addr + block->offset; ptr = ramblock_ptr(block, addr - block->offset); + rcu_read_unlock(); return ptr; } } @@ -1737,17 +1751,20 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr) MemoryRegion *mr; if (xen_enabled()) { + rcu_read_lock(); *ram_addr = xen_ram_addr_from_mapcache(ptr); mr = qemu_get_ram_block(*ram_addr)->mr; + rcu_read_unlock(); return mr; } - block = ram_list.mru_block; + rcu_read_lock(); + block = atomic_rcu_read(&ram_list.mru_block); if (block && block->host && host - block->host < block->max_length) { goto found; } - QLIST_FOREACH(block, &ram_list.blocks, next) { + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { /* This case append when the block is not mapped. */ if (block->host == NULL) { continue; @@ -1762,6 +1779,7 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr) found: *ram_addr = block->offset + (host - block->host); mr = block->mr; + rcu_read_unlock(); return mr; } @@ -3015,8 +3033,10 @@ void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque) { RAMBlock *block; - QLIST_FOREACH(block, &ram_list.blocks, next) { + rcu_read_lock(); + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { func(block->host, block->offset, block->used_length, opaque); } + rcu_read_unlock(); } #endif diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index b38e03f..70f0f20 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -278,9 +278,7 @@ struct RAMBlock { void (*resized)(const char*, uint64_t length, void *host); uint32_t flags; char idstr[256]; - /* Reads can take either the iothread or the ramlist lock. - * Writes must take both locks. - */ + /* Writes must hold the iothread lock */ QLIST_ENTRY(RAMBlock) next; int fd; }; @@ -293,11 +291,10 @@ static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset) } typedef struct RAMList { - QemuMutex mutex; /* Protected by the iothread lock. */ unsigned long *dirty_memory[DIRTY_MEMORY_NUM]; RAMBlock *mru_block; - /* Protected by the ramlist lock. */ + /* RCU-enabled, writes protected by the iothread lock. */ QLIST_HEAD(, RAMBlock) blocks; uint32_t version; } RAMList; -- 1.8.3.1