On Fri, 02/06 17:55, Paolo Bonzini wrote:
> From: Mike Day <ncm...@ncultra.org>
>
> Allow "unlocked" reads of the ram_list by using an RCU-enabled QLIST.
>
> The ramlist mutex is kept, because call_rcu callbacks are not run within
> the iothread lock.  Thus, writers still need to take the ramlist mutex,
> but they no longer need to assume that the iothread lock is taken.
>
> Readers of the list, instead, no longer require either the iothread
> or ramlist mutex, but they need to use rcu_read_lock() and
> rcu_read_unlock().
>
> One place in arch_init.c was downgrading from write side to read side
> like this:
>
>     qemu_mutex_lock_iothread()
>     qemu_mutex_lock_ramlist()
>     ...
>     qemu_mutex_unlock_iothread()
>     ...
>     qemu_mutex_unlock_ramlist()
>
> and the equivalent idiom is:
>
>     qemu_mutex_lock_ramlist()
>     rcu_read_lock()
>     ...
>     qemu_mutex_unlock_ramlist()
>     ...
>     rcu_read_unlock()
>
> Signed-off-by: Mike Day <ncm...@ncultra.org>
> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
> ---
>  arch_init.c            |  65 ++++++++++++++++++++++---------
>  exec.c                 | 101 ++++++++++++++++++++++++++++++++++---------------
>  include/exec/cpu-all.h |   6 +--
>  3 files changed, 120 insertions(+), 52 deletions(-)
>
> diff --git a/arch_init.c b/arch_init.c
> index 1ee2e35..5fc6fc3 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -52,6 +52,7 @@
>  #include "exec/ram_addr.h"
>  #include "hw/acpi/acpi.h"
>  #include "qemu/host-utils.h"
> +#include "qemu/rcu_queue.h"
>
>  #ifdef DEBUG_ARCH_INIT
>  #define DPRINTF(fmt, ...) \
> @@ -523,9 +524,12 @@ static void migration_bitmap_sync(void)
>      trace_migration_bitmap_sync_start();
>      address_space_sync_dirty_bitmap(&address_space_memory);
>
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    rcu_read_lock();
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          migration_bitmap_sync_range(block->mr->ram_addr, block->used_length);
>      }
> +    rcu_read_unlock();
> +
>      trace_migration_bitmap_sync_end(migration_dirty_pages
>                                      - num_dirty_pages_init);
>      num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
> @@ -648,6 +652,8 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
>  /*
>   * ram_find_and_save_block: Finds a page to send and sends it to f
>   *
> + * Called within an RCU critical section.
> + *
>   * Returns: The number of bytes written.
>   *          0 means no dirty pages
>   */
> @@ -661,7 +667,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
>      MemoryRegion *mr;
>
>      if (!block)
> -        block = QLIST_FIRST(&ram_list.blocks);
> +        block = QLIST_FIRST_RCU(&ram_list.blocks);
>
>      while (true) {
>          mr = block->mr;
> @@ -672,9 +678,9 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
>          }
>          if (offset >= block->used_length) {
>              offset = 0;
> -            block = QLIST_NEXT(block, next);
> +            block = QLIST_NEXT_RCU(block, next);
>              if (!block) {
> -                block = QLIST_FIRST(&ram_list.blocks);
> +                block = QLIST_FIRST_RCU(&ram_list.blocks);
>                  complete_round = true;
>                  ram_bulk_stage = false;
>              }
> @@ -728,10 +734,10 @@ uint64_t ram_bytes_total(void)
>      RAMBlock *block;
>      uint64_t total = 0;
>
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    rcu_read_lock();
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
>          total += block->used_length;
> -    }
> -
> +    rcu_read_unlock();
>      return total;
>  }
>
> @@ -777,6 +783,13 @@ static void reset_ram_globals(void)
>
>  #define MAX_WAIT 50 /* ms, half buffered_file limit */
>
> +
> +/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
> + * long-running RCU critical section.  When rcu-reclaims in the code
> + * start to become numerous it will be necessary to reduce the
> + * granularity of these critical sections.
> + */
> +
>  static int ram_save_setup(QEMUFile *f, void *opaque)
>  {
>      RAMBlock *block;
> @@ -820,6 +833,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>      /* iothread lock needed for ram_list.dirty_memory[] */
>      qemu_mutex_lock_iothread();
>      qemu_mutex_lock_ramlist();
> +    rcu_read_lock();
>      bytes_transferred = 0;
>      reset_ram_globals();
>
> @@ -832,7 +846,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>       * gaps due to alignment or unplugs.
>       */
>      migration_dirty_pages = 0;
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          uint64_t block_pages;
>
>          block_pages = block->used_length >> TARGET_PAGE_BITS;
> @@ -841,17 +855,18 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>
>      memory_global_dirty_log_start();
>      migration_bitmap_sync();
> +    qemu_mutex_unlock_ramlist();
>      qemu_mutex_unlock_iothread();
>
>      qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
>
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          qemu_put_byte(f, strlen(block->idstr));
>          qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
>          qemu_put_be64(f, block->used_length);
>      }
>
> -    qemu_mutex_unlock_ramlist();
> +    rcu_read_unlock();
>
>      ram_control_before_iterate(f, RAM_CONTROL_SETUP);
>      ram_control_after_iterate(f, RAM_CONTROL_SETUP);
> @@ -868,12 +883,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>      int64_t t0;
>      int total_sent = 0;
>
> -    qemu_mutex_lock_ramlist();
> -
> +    rcu_read_lock();
>      if (ram_list.version != last_version) {
>          reset_ram_globals();
>      }
>
> +    /* Read version before ram_list.blocks */
> +    smp_rmb();
> +
>      ram_control_before_iterate(f, RAM_CONTROL_ROUND);
>
>      t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> @@ -904,8 +921,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>          }
>          i++;
>      }
> -
> -    qemu_mutex_unlock_ramlist();
> +    rcu_read_unlock();
>
>      /*
>       * Must occur before EOS (or any QEMUFile operation)
> @@ -933,7 +949,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>  /* Called with iothread lock */
>  static int ram_save_complete(QEMUFile *f, void *opaque)
>  {
> -    qemu_mutex_lock_ramlist();
> +    rcu_read_lock();
> +
>      migration_bitmap_sync();
>
>      ram_control_before_iterate(f, RAM_CONTROL_FINISH);
> @@ -955,7 +972,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
>      ram_control_after_iterate(f, RAM_CONTROL_FINISH);
>      migration_end();
>
> -    qemu_mutex_unlock_ramlist();
> +    rcu_read_unlock();
>      qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
>
>      return 0;
> @@ -969,7 +986,9 @@ static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
>
>      if (remaining_size < max_size) {
>          qemu_mutex_lock_iothread();
> +        rcu_read_lock();
>          migration_bitmap_sync();
> +        rcu_read_unlock();
>          qemu_mutex_unlock_iothread();
>          remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
>      }
> @@ -1011,6 +1030,9 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
>      return 0;
>  }
>
> +/* Must be called from within a rcu critical section.
> + * Returns a pointer from within the RCU-protected ram_list.
> + */
>  static inline void *host_from_stream_offset(QEMUFile *f,
>                                              ram_addr_t offset,
>                                              int flags)
> @@ -1032,7 +1054,7 @@ static inline void *host_from_stream_offset(QEMUFile *f,
>      qemu_get_buffer(f, (uint8_t *)id, len);
>      id[len] = 0;
>
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          if (!strncmp(id, block->idstr, sizeof(id)) &&
>              block->max_length > offset) {
>              return memory_region_get_ram_ptr(block->mr) + offset;
> @@ -1065,6 +1087,12 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
>          ret = -EINVAL;
>      }
>
> +    /* This RCU critical section can be very long running.
> +     * When RCU reclaims in the code start to become numerous,
> +     * it will be necessary to reduce the granularity of this
> +     * critical section.
> +     */
> +    rcu_read_lock();
>      while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
>          ram_addr_t addr, total_ram_bytes;
>          void *host;
> @@ -1089,7 +1117,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
>              id[len] = 0;
>              length = qemu_get_be64(f);
>
> -            QLIST_FOREACH(block, &ram_list.blocks, next) {
> +            QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>                  if (!strncmp(id, block->idstr, sizeof(id))) {
>                      if (length != block->used_length) {
>                          Error *local_err = NULL;
> @@ -1163,6 +1191,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
>          }
>      }
>
> +    rcu_read_unlock();
>      DPRINTF("Completed load of VM with exit code %d seq iteration "
>              "%" PRIu64 "\n", ret, seq_iter);
>      return ret;
> diff --git a/exec.c b/exec.c
> index ea1f6c4..8273c3f 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -44,7 +44,7 @@
>  #include "trace.h"
>  #endif
>  #include "exec/cpu-all.h"
> -
> +#include "qemu/rcu_queue.h"
>  #include "exec/cputlb.h"
>  #include "translate-all.h"
>
> @@ -58,6 +58,9 @@
>  #if !defined(CONFIG_USER_ONLY)
>  static bool in_migration;
>
> +/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
> + * are protected by the ramlist lock.
> + */
>  RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
>
>  static MemoryRegion *system_memory;
> @@ -806,16 +809,16 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...)
>  }
>
>  #if !defined(CONFIG_USER_ONLY)
> +/* Called from RCU critical section */
>  static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
>  {
>      RAMBlock *block;
>
> -    /* The list is protected by the iothread lock here. */
>      block = atomic_rcu_read(&ram_list.mru_block);
>      if (block && addr - block->offset < block->max_length) {
>          goto found;
>      }
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          if (addr - block->offset < block->max_length) {
>              goto found;
>          }
> @@ -854,10 +857,12 @@ static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
>      end = TARGET_PAGE_ALIGN(start + length);
>      start &= TARGET_PAGE_MASK;
>
> +    rcu_read_lock();
>      block = qemu_get_ram_block(start);
>      assert(block == qemu_get_ram_block(end - 1));
>      start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
>      cpu_tlb_reset_dirty_all(start1, length);
> +    rcu_read_unlock();
>  }
>
>  /* Note: start and end must be within the same ram block. */
> @@ -1190,6 +1195,7 @@ error:
>  }
>  #endif
>
> +/* Called with the ramlist lock held. */
>  static ram_addr_t find_ram_offset(ram_addr_t size)
>  {
>      RAMBlock *block, *next_block;
> @@ -1197,16 +1203,16 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
>
>      assert(size != 0); /* it would hand out same offset multiple times */
>
> -    if (QLIST_EMPTY(&ram_list.blocks)) {
> +    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
>          return 0;
>      }
>
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          ram_addr_t end, next = RAM_ADDR_MAX;
>
>          end = block->offset + block->max_length;
>
> -        QLIST_FOREACH(next_block, &ram_list.blocks, next) {
> +        QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) {
>              if (next_block->offset >= end) {
>                  next = MIN(next, next_block->offset);
>              }
> @@ -1231,9 +1237,11 @@ ram_addr_t last_ram_offset(void)
>      RAMBlock *block;
>      ram_addr_t last = 0;
>
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    rcu_read_lock();
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          last = MAX(last, block->offset + block->max_length);
>      }
> +    rcu_read_unlock();
>      return last;
>  }
>
> @@ -1253,11 +1261,14 @@ static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
>      }
>  }
>
> +/* Called within an RCU critical section, or while the ramlist lock
> + * is held.
> + */
>  static RAMBlock *find_ram_block(ram_addr_t addr)
>  {
>      RAMBlock *block;
>
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          if (block->offset == addr) {
>              return block;
>          }
> @@ -1271,6 +1282,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
>  {
>      RAMBlock *new_block, *block;
>
> +    rcu_read_lock();
>      new_block = find_ram_block(addr);
>      assert(new_block);
>      assert(!new_block->idstr[0]);
> @@ -1284,15 +1296,14 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
>      }
>      pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
>
> -    qemu_mutex_lock_ramlist();
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          if (block != new_block && !strcmp(block->idstr, new_block->idstr)) {
>              fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
>                      new_block->idstr);
>              abort();
>          }
>      }
> -    qemu_mutex_unlock_ramlist();
> +    rcu_read_unlock();
>  }
>
>  /* Called with iothread lock held. */
> @@ -1305,10 +1316,12 @@ void qemu_ram_unset_idstr(ram_addr_t addr)
>       * does not work anyway.
>       */
>
> +    rcu_read_lock();
>      block = find_ram_block(addr);
>      if (block) {
>          memset(block->idstr, 0, sizeof(block->idstr));
>      }
> +    rcu_read_unlock();
>  }
>
>  static int memory_try_enable_merging(void *addr, size_t len)
> @@ -1372,7 +1385,6 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
>
>      old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
>
> -    /* This assumes the iothread lock is taken here too. */
>      qemu_mutex_lock_ramlist();
>      new_block->offset = find_ram_offset(new_block->max_length);
>
> @@ -1398,21 +1410,23 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
>       * QLIST (which has an RCU-friendly variant) does not have insertion at
>       * tail, so save the last element in last_block.
>       */
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          last_block = block;
>          if (block->max_length < new_block->max_length) {
>              break;
>          }
>      }
>      if (block) {
> -        QLIST_INSERT_BEFORE(block, new_block, next);
> +        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
>      } else if (last_block) {
> -        QLIST_INSERT_AFTER(last_block, new_block, next);
> +        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
>      } else { /* list is empty */
> -        QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
> +        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
>      }
>      ram_list.mru_block = NULL;
>
> +    /* Write list before version */
> +    smp_wmb();
>      ram_list.version++;
>      qemu_mutex_unlock_ramlist();
>
> @@ -1552,12 +1566,13 @@ void qemu_ram_free_from_ptr(ram_addr_t addr)
>  {
>      RAMBlock *block;
>
> -    /* This assumes the iothread lock is taken here too. */
>      qemu_mutex_lock_ramlist();
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          if (addr == block->offset) {
> -            QLIST_REMOVE(block, next);
> +            QLIST_REMOVE_RCU(block, next);
>              ram_list.mru_block = NULL;
> +            /* Write list before version */
> +            smp_wmb();
>              ram_list.version++;
>              call_rcu(block, (void (*)(struct RAMBlock *))g_free, rcu);
>              break;
> @@ -1583,17 +1598,17 @@ static void reclaim_ramblock(RAMBlock *block)
>      g_free(block);
>  }
>
> -/* Called with the iothread lock held */
>  void qemu_ram_free(ram_addr_t addr)
>  {
>      RAMBlock *block;
>
> -    /* This assumes the iothread lock is taken here too. */
>      qemu_mutex_lock_ramlist();
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          if (addr == block->offset) {
> -            QLIST_REMOVE(block, next);
> +            QLIST_REMOVE_RCU(block, next);
>              ram_list.mru_block = NULL;
> +            /* Write list before version */
> +            smp_wmb();
>              ram_list.version++;
>              call_rcu(block, reclaim_ramblock, rcu);
>              break;
> @@ -1610,7 +1625,7 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
>      int flags;
>      void *area, *vaddr;
>
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          offset = addr - block->offset;
>          if (offset < block->max_length) {
>              vaddr = ramblock_ptr(block, offset);
> @@ -1657,8 +1672,10 @@ int qemu_get_ram_fd(ram_addr_t addr)
>      RAMBlock *block;
>      int fd;
>
> +    rcu_read_lock();
>      block = qemu_get_ram_block(addr);
>      fd = block->fd;
> +    rcu_read_unlock();
>      return fd;
>  }
>
> @@ -1667,8 +1684,10 @@ void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
>      RAMBlock *block;
>      void *ptr;
>
> +    rcu_read_lock();
>      block = qemu_get_ram_block(addr);
>      ptr = ramblock_ptr(block, 0);
> +    rcu_read_unlock();
>      return ptr;
>  }
>
> @@ -1676,12 +1695,19 @@ void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
>   * This should not be used for general purpose DMA. Use address_space_map
>   * or address_space_rw instead. For local memory (e.g. video ram) that the
>   * device owns, use memory_region_get_ram_ptr.
> + *
> + * By the time this function returns, the returned pointer is not protected
> + * by RCU anymore. If the caller is not within an RCU critical section and
> + * does not hold the iothread lock, it must have other means of protecting the
> + * pointer, such as a reference to the region that includes the incoming
> + * ram_addr_t.
>   */
>  void *qemu_get_ram_ptr(ram_addr_t addr)
>  {
>      RAMBlock *block;
>      void *ptr;
>
> +    rcu_read_lock();
>      block = qemu_get_ram_block(addr);
>
>      if (xen_enabled() && block->host == NULL) {
> @@ -1691,19 +1717,26 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>           */
>          if (block->offset == 0) {
>              ptr = xen_map_cache(addr, 0, 0);
> -            goto done;
> +            goto unlock;
>          }
>
>          block->host = xen_map_cache(block->offset, block->max_length, 1);
>      }
>      ptr = ramblock_ptr(block, addr - block->offset);
>
> -done:
> +unlock:
> +    rcu_read_unlock();
>      return ptr;
>  }
>
>  /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
>   * but takes a size argument.
> + *
> + * By the time this function returns, the returned pointer is not protected
> + * by RCU anymore. If the caller is not within an RCU critical section and
> + * does not hold the iothread lock, it must have other means of protecting the
> + * pointer, such as a reference to the region that includes the incoming
> + * ram_addr_t.
>   */
>  static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
>  {
> @@ -1715,11 +1748,13 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
>          return xen_map_cache(addr, *size, 1);
>      } else {
>          RAMBlock *block;
> -        QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        rcu_read_lock();
> +        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>              if (addr - block->offset < block->max_length) {
>                  if (addr - block->offset + *size > block->max_length)
>                      *size = block->max_length - addr + block->offset;
>                  ptr = ramblock_ptr(block, addr - block->offset);
> +                rcu_read_unlock();
>                  return ptr;
>              }
>          }

The lines that follow are just "fprintf(stderr, ...)" and "abort()", so the
return above is the only exit point of the function after taking the RCU read
lock; putting the unlock inside the loop is OK.

> @@ -1745,17 +1780,20 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
>      MemoryRegion *mr;
>
>      if (xen_enabled()) {
> +        rcu_read_lock();
>          *ram_addr = xen_ram_addr_from_mapcache(ptr);
>          mr = qemu_get_ram_block(*ram_addr)->mr;
> +        rcu_read_unlock();
>          return mr;
>      }
>
> -    block = ram_list.mru_block;
> +    rcu_read_lock();
> +    block = atomic_rcu_read(&ram_list.mru_block);
>      if (block && block->host && host - block->host < block->max_length) {
>          goto found;
>      }
>
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          /* This case append when the block is not mapped. */
>          if (block->host == NULL) {
>              continue;
        }

        if (host - block->host < block->max_length) {
            goto found;
        }
    }

////
    return NULL;

Should we call rcu_read_unlock() at "////"? (A rough sketch of what I mean is
at the bottom of this mail.)

Fam

> @@ -1770,6 +1808,7 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
>  found:
>      *ram_addr = block->offset + (host - block->host);
>      mr = block->mr;
> +    rcu_read_unlock();
>      return mr;
>  }
>
> @@ -3023,8 +3062,10 @@ void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
>  {
>      RAMBlock *block;
>
> -    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +    rcu_read_lock();
> +    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
>          func(block->host, block->offset, block->used_length, opaque);
>      }
> +    rcu_read_unlock();
>  }
>  #endif
> diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
> index 87b8658..ac06c67 100644
> --- a/include/exec/cpu-all.h
> +++ b/include/exec/cpu-all.h
> @@ -279,9 +279,7 @@ struct RAMBlock {
>      uint32_t flags;
>      /* Protected by iothread lock. */
>      char idstr[256];
> -    /* Reads can take either the iothread or the ramlist lock.
> -     * Writes must take both locks.
> -     */
> +    /* RCU-enabled, writes protected by the ramlist lock */
>      QLIST_ENTRY(RAMBlock) next;
>      int fd;
>  };
> @@ -298,7 +296,7 @@ typedef struct RAMList {
>      /* Protected by the iothread lock. */
>      unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
>      RAMBlock *mru_block;
> -    /* Protected by the ramlist lock. */
> +    /* RCU-enabled, writes protected by the ramlist lock. */
>      QLIST_HEAD(, RAMBlock) blocks;
>      uint32_t version;
>  } RAMList;
> --
> 1.8.3.1
>
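
P.S. To make the question above concrete, here is a rough sketch (mine, not
part of the patch) of what I have in mind for qemu_ram_addr_from_host(): the
not-found path drops the RCU read lock before returning NULL. Everything in it
comes from the hunks quoted above except the extra rcu_read_unlock() before
"return NULL" and the local declarations, which I reconstructed from context:

    MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
    {
        RAMBlock *block;
        MemoryRegion *mr;
        uint8_t *host = ptr;

        if (xen_enabled()) {
            rcu_read_lock();
            *ram_addr = xen_ram_addr_from_mapcache(ptr);
            mr = qemu_get_ram_block(*ram_addr)->mr;
            rcu_read_unlock();
            return mr;
        }

        rcu_read_lock();
        block = atomic_rcu_read(&ram_list.mru_block);
        if (block && block->host && host - block->host < block->max_length) {
            goto found;
        }

        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
            /* Skip blocks that are not mapped. */
            if (block->host == NULL) {
                continue;
            }
            if (host - block->host < block->max_length) {
                goto found;
            }
        }

        /* Not found: leave the RCU critical section before returning. */
        rcu_read_unlock();
        return NULL;

    found:
        *ram_addr = block->offset + (host - block->host);
        mr = block->mr;
        rcu_read_unlock();
        return mr;
    }

The alternative would be to keep a single exit point (set mr = NULL and jump to
a common unlock label), but either way every path that runs after
rcu_read_lock() needs a matching rcu_read_unlock().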