On 01.03.2011, at 19:35, anthony.per...@citrix.com wrote:

> From: Jun Nakajima <jun.nakaj...@intel.com>
>
> On a 32-bit host (IA32 or IA32 PAE), we currently cannot create an HVM
> guest with more than 2G of memory, because it is almost impossible for
> QEMU to find a large enough contiguous virtual address space to map the
> guest's whole physical address space. The attached patch fixes this
> issue by using dynamic mapping based on small blocks of memory.
>
> Each call to qemu_get_ram_ptr makes a call to qemu_map_cache with the
> lock option, so the mapcache will not unmap these pointers.
>
> Signed-off-by: Jun Nakajima <jun.nakaj...@intel.com>
> Signed-off-by: Anthony PERARD <anthony.per...@citrix.com>
> Signed-off-by: Stefano Stabellini <stefano.stabell...@eu.citrix.com>
> ---
>  Makefile.target     |    3 +
>  configure           |    3 +
>  exec.c              |   40 ++++++-
>  hw/xen.h            |   13 ++
>  hw/xen_common.h     |    9 ++
>  xen-all.c           |   64 +++++++++++
>  xen-mapcache-stub.c |   40 +++++++
>  xen-mapcache.c      |  310 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  xen-mapcache.h      |   22 ++++
>  xen-stub.c          |    4 +
>  10 files changed, 504 insertions(+), 4 deletions(-)
>  create mode 100644 xen-mapcache-stub.c
>  create mode 100644 xen-mapcache.c
>  create mode 100644 xen-mapcache.h
>
> diff --git a/Makefile.target b/Makefile.target
> index c539b1e..dcdd51d 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -214,8 +214,11 @@ else
>  CONFIG_NO_XEN = y
>  endif
>  # xen support
> +CONFIG_NO_XEN_MAPCACHE = $(if $(subst n,,$(CONFIG_XEN_MAPCACHE)),n,y)
>  obj-i386-$(CONFIG_XEN) += xen-all.o
>  obj-$(CONFIG_NO_XEN) += xen-stub.o
> +obj-i386-$(CONFIG_XEN_MAPCACHE) += xen-mapcache.o
> +obj-$(CONFIG_NO_XEN_MAPCACHE) += xen-mapcache-stub.o
>
>  obj-i386-$(CONFIG_XEN) += xen_platform.o
>
> diff --git a/configure b/configure
> index a84d974..4fd8696 100755
> --- a/configure
> +++ b/configure
> @@ -3172,6 +3172,9 @@ case "$target_arch2" in
>    i386|x86_64)
>      if test "$xen" = "yes" -a "$target_softmmu" = "yes" ; then
>        echo "CONFIG_XEN=y" >> $config_target_mak
> +      if test "$cpu" = "i386" -o "$cpu" = "x86_64"; then
> +        echo "CONFIG_XEN_MAPCACHE=y" >> $config_target_mak
> +      fi
>      fi
>  esac
>  case "$target_arch2" in
> diff --git a/exec.c b/exec.c
> index d611100..558595a 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -32,6 +32,7 @@
>  #include "hw/qdev.h"
>  #include "osdep.h"
>  #include "kvm.h"
> +#include "hw/xen.h"
>  #include "qemu-timer.h"
>  #if defined(CONFIG_USER_ONLY)
>  #include <qemu.h>
> @@ -51,6 +52,8 @@
>  #include <libutil.h>
>  #endif
>  #endif
> +#else /* !CONFIG_USER_ONLY */
> +#include "xen-mapcache.h"
>  #endif
>
>  //#define DEBUG_TB_INVALIDATE
> @@ -2865,6 +2868,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
>          }
>      }
>
> +    new_block->offset = find_ram_offset(size);
>      if (host) {
>          new_block->host = host;
>      } else {
> @@ -2886,13 +2890,15 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
>                                     PROT_EXEC|PROT_READ|PROT_WRITE,
>                                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
>  #else
> -            new_block->host = qemu_vmalloc(size);
> +            if (xen_mapcache_enabled()) {
> +                xen_ram_alloc(new_block->offset, size);
> +            } else {
> +                new_block->host = qemu_vmalloc(size);
> +            }
>  #endif
>              qemu_madvise(new_block->host, size, QEMU_MADV_MERGEABLE);
>          }
>      }
> -
> -    new_block->offset = find_ram_offset(size);
>      new_block->length = size;
>
>      QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
> @@ -2933,7 +2939,11 @@ void qemu_ram_free(ram_addr_t addr)
>  #if defined(TARGET_S390X) && defined(CONFIG_KVM)
>              munmap(block->host, block->length);
>  #else
> -            qemu_vfree(block->host);
> +            if (xen_mapcache_enabled()) {
> +                qemu_invalidate_entry(block->host);
> +            } else {
> +                qemu_vfree(block->host);
> +            }
>  #endif
>          }
>          qemu_free(block);
> @@ -2959,6 +2969,15 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>          if (addr - block->offset < block->length) {
>              QLIST_REMOVE(block, next);
>              QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
> +            if (xen_mapcache_enabled()) {
> +                /* We need to check if the requested address is in the RAM
> +                 * because we don't want to map the entire memory in QEMU.
> +                 */
> +                if (block->offset == 0) {
> +                    return qemu_map_cache(addr, 0, 1);
> +                }
> +                block->host = qemu_map_cache(block->offset, block->length, 1);
> +            }
>              return block->host + (addr - block->offset);
>          }
>      }
> @@ -2994,11 +3013,21 @@ int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
>      uint8_t *host = ptr;
>
>      QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        /* This case happens when the block is not mapped. */
> +        if (block->host == NULL) {
> +            continue;
> +        }
>          if (host - block->host < block->length) {
>              *ram_addr = block->offset + (host - block->host);
>              return 0;
>          }
>      }
> +
> +    if (xen_mapcache_enabled()) {
> +        *ram_addr = qemu_ram_addr_from_mapcache(ptr);
> +        return 0;
> +    }
> +
>      return -1;
>  }
>
> @@ -3909,6 +3938,9 @@ void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len,
>          if (is_write) {
>              cpu_physical_memory_write(bounce.addr, bounce.buffer, access_len);
>          }
> +        if (xen_enabled()) {
> +            qemu_invalidate_entry(buffer);
> +        }
>          qemu_vfree(bounce.buffer);
>          bounce.buffer = NULL;
>          cpu_notify_map_clients();
> diff --git a/hw/xen.h b/hw/xen.h
> index 12d4e5f..e26d061 100644
> --- a/hw/xen.h
> +++ b/hw/xen.h
> @@ -31,6 +31,15 @@ static inline int xen_enabled(void)
>  #endif
>  }
>
> +static inline int xen_mapcache_enabled(void)
> +{
> +#ifdef CONFIG_XEN_MAPCACHE
> +    return xen_enabled();
> +#else
> +    return 0;
> +#endif
> +}
> +
>  int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num);
>  void xen_piix3_set_irq(void *opaque, int irq_num, int level);
>  void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len);
> @@ -41,6 +50,10 @@ void pci_xen_platform_init(PCIBus *bus);
>
>  int xen_init(void);
>
> +#if defined(NEED_CPU_H) && !defined(CONFIG_USER_ONLY)
> +void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size);
> +#endif
> +
>  #if defined(CONFIG_XEN) && CONFIG_XEN_CTRL_INTERFACE_VERSION < 400
>  #  define HVM_MAX_VCPUS 32
>  #endif
> diff --git a/hw/xen_common.h b/hw/xen_common.h
> index 7e123ec..5a36642 100644
> --- a/hw/xen_common.h
> +++ b/hw/xen_common.h
> @@ -50,6 +50,15 @@ static inline int xc_fd(int xen_xc)
>  }
>
>
> +static inline int xc_domain_populate_physmap_exact
> +    (XenXC xc_handle, uint32_t domid, unsigned long nr_extents,
> +     unsigned int extent_order, unsigned int mem_flags, xen_pfn_t *extent_start)
> +{
> +    return xc_domain_memory_populate_physmap
> +        (xc_handle, domid, nr_extents, extent_order, mem_flags, extent_start);
> +}
> +
> +
>  /* Xen 4.1 */
>  #else
>
> diff --git a/xen-all.c b/xen-all.c
> index 761f2a0..03d1e90 100644
> --- a/xen-all.c
> +++ b/xen-all.c
> @@ -10,6 +10,8 @@
>  #include "hw/xen_common.h"
>  #include "hw/xen_backend.h"
>
> +#include "xen-mapcache.h"
> +
>  /* Xen specific function for piix pci */
>
>  int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
> @@ -52,6 +54,64 @@ qemu_irq *xen_interrupt_controller_init(void)
>      return qemu_allocate_irqs(xen_set_irq, NULL, 16);
>  }
>
> +
> +/* Memory Ops */
> +
> +static void xen_ram_init(ram_addr_t ram_size)
> +{
> +    RAMBlock *new_block;
> +    ram_addr_t below_4g_mem_size, above_4g_mem_size = 0;
> +
> +    new_block = qemu_mallocz(sizeof (*new_block));
> +    pstrcpy(new_block->idstr, sizeof (new_block->idstr), "xen.ram");
> +    new_block->host = NULL;
> +    new_block->offset = 0;
> +    new_block->length = ram_size;
> +
> +    QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
> +
> +    ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty,
> +                                       new_block->length >> TARGET_PAGE_BITS);
> +    memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS),
> +           0xff, new_block->length >> TARGET_PAGE_BITS);
> +
> +    if (ram_size >= 0xe0000000 ) {
> +        above_4g_mem_size = ram_size - 0xe0000000;
> +        below_4g_mem_size = 0xe0000000;
> +    } else {
> +        below_4g_mem_size = ram_size;
> +    }
> +
> +    cpu_register_physical_memory(0, below_4g_mem_size, new_block->offset);
> +#if TARGET_PHYS_ADDR_BITS > 32
> +    if (above_4g_mem_size > 0) {
> +        cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size,
> +                                     new_block->offset + below_4g_mem_size);
> +    }
> +#endif
> +}
> +
> +void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size)
> +{
> +    unsigned long nr_pfn;
> +    xen_pfn_t *pfn_list;
> +    int i;
> +
> +    nr_pfn = size >> TARGET_PAGE_BITS;
> +    pfn_list = qemu_malloc(sizeof (*pfn_list) * nr_pfn);
> +
> +    for (i = 0; i < nr_pfn; i++) {
> +        pfn_list[i] = (ram_addr >> TARGET_PAGE_BITS) + i;
> +    }
> +
> +    if (xc_domain_populate_physmap_exact(xen_xc, xen_domid, nr_pfn, 0, 0, pfn_list)) {
> +        hw_error("xen: failed to populate ram at %lx", ram_addr);
> +    }
> +
> +    qemu_free(pfn_list);
> +}
> +
> +
>  /* Initialise Xen */
>
>  int xen_init(void)
> @@ -62,5 +122,9 @@ int xen_init(void)
>          return -1;
>      }
>
> +    /* Init RAM management */
> +    qemu_map_cache_init();
> +    xen_ram_init(ram_size);
> +
>      return 0;
>  }
> diff --git a/xen-mapcache-stub.c b/xen-mapcache-stub.c
> new file mode 100644
> index 0000000..541bee6
> --- /dev/null
> +++ b/xen-mapcache-stub.c
> @@ -0,0 +1,40 @@
> +/*
> + * Copyright (C) 2011 Citrix Ltd.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2. See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "config.h"
> +
> +#include "exec-all.h"
> +#include "qemu-common.h"
> +#include "cpu-common.h"
> +#include "xen-mapcache.h"
> +
> +void qemu_map_cache_init(void)
> +{
> +}
> +
> +uint8_t *qemu_map_cache(target_phys_addr_t phys_addr, target_phys_addr_t size, uint8_t lock)
> +{
> +    return qemu_get_ram_ptr(phys_addr);
> +}
> +
> +void qemu_map_cache_unlock(void *buffer)
> +{
> +}
> +
> +ram_addr_t qemu_ram_addr_from_mapcache(void *ptr)
> +{
> +    return -1;
> +}
> +
> +void qemu_invalidate_map_cache(void)
> +{
> +}
> +
> +void qemu_invalidate_entry(uint8_t *buffer)
> +{
> +}
> diff --git a/xen-mapcache.c b/xen-mapcache.c
> new file mode 100644
> index 0000000..d7f44a7
> --- /dev/null
> +++ b/xen-mapcache.c
> @@ -0,0 +1,310 @@
> +/*
> + * Copyright (C) 2011 Citrix Ltd.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2. See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "config.h"
> +
> +#include <sys/resource.h>
> +
> +#include "hw/xen_backend.h"
> +#include "blockdev.h"
> +
> +#include <xen/hvm/params.h>
> +#include <sys/mman.h>
> +
> +#include "xen-mapcache.h"
> +
> +
> +//#define MAPCACHE_DEBUG
> +
> +#ifdef MAPCACHE_DEBUG
> +#  define DPRINTF(fmt, ...) do { \
> +    fprintf(stderr, "xen_mapcache: " fmt, ## __VA_ARGS__); \
> +} while (0)
> +#else
> +#  define DPRINTF(fmt, ...) do { } while (0)
> +#endif
> +
> +#if defined(__i386__)
> +#  define MCACHE_BUCKET_SHIFT 16
> +#elif defined(__x86_64__)
> +#  define MCACHE_BUCKET_SHIFT 20
> +#endif
> +#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)
> +
> +#define BITS_PER_LONG (sizeof(long) * 8)
> +#define BITS_TO_LONGS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG)
> +#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]
> +
> +typedef struct MapCacheEntry {
> +    target_phys_addr_t paddr_index;
> +    uint8_t *vaddr_base;
> +    DECLARE_BITMAP(valid_mapping, MCACHE_BUCKET_SIZE >> XC_PAGE_SHIFT);
> +    uint8_t lock;
> +    struct MapCacheEntry *next;
> +} MapCacheEntry;
> +
> +typedef struct MapCacheRev {
> +    uint8_t *vaddr_req;
> +    target_phys_addr_t paddr_index;
> +    QTAILQ_ENTRY(MapCacheRev) next;
> +} MapCacheRev;
> +
> +typedef struct MapCache {
> +    MapCacheEntry *entry;
> +    unsigned long nr_buckets;
> +    QTAILQ_HEAD(map_cache_head, MapCacheRev) locked_entries;
> +
> +    /* For most cases (>99.9%), the page address is the same. */
> +    target_phys_addr_t last_address_index;
> +    uint8_t *last_address_vaddr;
> +    unsigned long max_mcache_size;
> +    unsigned int mcache_bucket_shift;
> +} MapCache;
> +
> +static MapCache *mapcache;
> +
> +static inline int test_bit(unsigned int bit, const unsigned long *map)
> +{
> +    return !!((map)[(bit) / BITS_PER_LONG] & (1UL << ((bit) % BITS_PER_LONG)));
> +}

We have a bitmap framework in qemu now. Please use that :). See bitmap.h / bitops.h / bitops.c.


Alex
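
For reference, a minimal sketch of what that conversion could look like, assuming the DECLARE_BITMAP macro from qemu's bitmap.h and the set_bit()/test_bit() helpers from bitops.h are used in place of the patch's local definitions; the mark_page_valid()/page_is_valid() wrappers below are hypothetical names, added only to show the intended usage:

    /* Sketch only: drop the local BITS_PER_LONG / BITS_TO_LONGS /
     * DECLARE_BITMAP macros and the local test_bit() and rely on the
     * generic headers instead. */
    #include "bitmap.h"   /* DECLARE_BITMAP */
    #include "bitops.h"   /* set_bit(), test_bit() */

    typedef struct MapCacheEntry {
        target_phys_addr_t paddr_index;
        uint8_t *vaddr_base;
        DECLARE_BITMAP(valid_mapping, MCACHE_BUCKET_SIZE >> XC_PAGE_SHIFT);
        uint8_t lock;
        struct MapCacheEntry *next;
    } MapCacheEntry;

    /* Hypothetical helpers illustrating how a page of a bucket would be
     * marked and checked with the generic bit operations. */
    static void mark_page_valid(MapCacheEntry *entry, unsigned long page)
    {
        set_bit(page, entry->valid_mapping);
    }

    static int page_is_valid(MapCacheEntry *entry, unsigned long page)
    {
        return test_bit(page, entry->valid_mapping);
    }

Besides removing the duplicated macros, this avoids any clash with the identically named definitions that the generic headers already provide.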