This call allows a guest to ask the hypervisor which of its (guest physical) memory ranges were already zeroed out by the hypervisor, which means there's no need for the guest to zero them out again at boot.
To do so, we have to track which memory is no longer zero. The conservative estimate is to treat all memory that we have ever touched as no longer zero. To track this, we introduce a new bitmap, independent of the existing dirty tracking maps, for two reasons: * Dirty memory tracking bitmaps are always allocated. This would be wasteful when we only want this bitmap for one specific use case. * Dirty memory tracking is tightly aligned with target page sizes. We want to track at a much coarser granularity, which reduces the bitmap size and also makes our hypercall reply less fragmented, without losing too much of the benefit. Note that there are limitations to what we track: we only track initial memory. Memory hotplugged later will not be tracked, and so is never reported as pre-zeroed. We also don't track non-zero memory across migrations. Both of these are fine with respect to the use case: Windows only enquires about zeroed memory relatively early during boot, when both of these operations are uncommon. For those cases, we simply report "memory is not pre-zeroed". Finally, we currently don't track memory that the VM itself touched. For this, we will need KVM support, and likely a new ioctl that helps us track memory mapped by the guest.
Signed-off-by: Florian Schmidt <[email protected]> --- docs/system/i386/hyperv.rst | 7 ++ hw/hyperv/hyperv.c | 121 +++++++++++++++++++++++++++++++ include/hw/hyperv/hyperv-proto.h | 11 +++ include/hw/hyperv/hyperv.h | 7 ++ include/system/physmem.h | 8 ++ system/memory.c | 7 +- system/physmem.c | 52 +++++++++++++ target/i386/cpu.c | 2 + target/i386/cpu.h | 1 + target/i386/kvm/hyperv-proto.h | 5 ++ target/i386/kvm/hyperv.c | 8 ++ target/i386/kvm/kvm.c | 17 +++++ 12 files changed, 244 insertions(+), 2 deletions(-) diff --git a/docs/system/i386/hyperv.rst b/docs/system/i386/hyperv.rst index c5ca25067e..71ee456921 100644 --- a/docs/system/i386/hyperv.rst +++ b/docs/system/i386/hyperv.rst @@ -263,6 +263,13 @@ Existing enlightenments provide any useful new functionality, but it's it's required to be enabled to use any extended hypercalls. +``hv-boot-zeroed-mem`` + Enables the HvExtGetBootZeroedMemory hypercall. This allows a Windows guest to + inquire which memory has already been zeroed out by the host and thus doesn't + need to be zeroed out at boot again. 
+ + Requires: ``hv-ext-query-caps`` + Supplementary features ---------------------- diff --git a/hw/hyperv/hyperv.c b/hw/hyperv/hyperv.c index 1764203d3c..c99b247bc8 100644 --- a/hw/hyperv/hyperv.c +++ b/hw/hyperv/hyperv.c @@ -13,6 +13,8 @@ #include "qapi/error.h" #include "system/address-spaces.h" #include "system/memory.h" +#include "system/physmem.h" +#include "system/runstate.h" #include "exec/target_page.h" #include "exec/cpu-common.h" #include "linux/kvm.h" @@ -23,10 +25,13 @@ #include "qemu/queue.h" #include "qemu/rcu.h" #include "qemu/rcu_queue.h" +#include "hw/core/boards.h" #include "hw/hyperv/hyperv.h" #include "qom/object.h" #include "target/i386/kvm/hyperv-proto.h" +#define HV_BOOT_ZEROED_PAGE_SHIFT 9 + struct SynICState { DeviceState parent_obj; @@ -731,6 +736,98 @@ cleanup: return ret; } +struct boot_zero_opaque { + struct hyperv_get_boot_zeroed_memory_output *zr; + const unsigned long *zero_blocks; + unsigned long num; + unsigned int order; + unsigned int count; +}; + +static bool bootzero_mem_cb(Int128 istart, Int128 ilen, const MemoryRegion *mr, + hwaddr offset_in_region, void *opaque) +{ + struct boot_zero_opaque *p = opaque; + ram_addr_t r_start, r_pfn_start, r_pfn_end; + hwaddr ram_gpa_pfn_offset; + unsigned long b_start, b_end; + unsigned long begin, end, idx; + uint64_t pfn_start, pfn_end; + + if (!memory_region_is_ram(mr) + || memory_region_is_rom(mr) + || memory_region_is_ram_device(mr) + || (int128_get64(ilen) == 0)) { + return false; + } + + r_start = memory_region_get_ram_addr(mr) + offset_in_region; + r_pfn_start = r_start >> TARGET_PAGE_BITS; + r_pfn_end = (r_start + int128_get64(ilen) - 1) >> TARGET_PAGE_BITS; + ram_gpa_pfn_offset = (int128_get64(istart) - r_start) >> TARGET_PAGE_BITS; + b_start = r_pfn_start >> p->order; + b_end = MIN((r_pfn_end >> p->order) + 1, p->num); + + idx = b_start; + while (idx < b_end && p->count < ARRAY_SIZE(p->zr->ranges)) { + begin = find_next_zero_bit(p->zero_blocks, b_end, idx); + if (begin == b_end) { 
+ break; + } + end = find_next_bit(p->zero_blocks, b_end, begin); + + pfn_start = MAX(begin << p->order, r_pfn_start); + pfn_end = MIN((end << p->order) - 1, r_pfn_end); + + p->zr->ranges[p->count].start_pfn = pfn_start + ram_gpa_pfn_offset; + p->zr->ranges[p->count].page_count = pfn_end - pfn_start + 1; + p->count++; + idx = end; + } + + return p->count == ARRAY_SIZE(p->zr->ranges); +} + +uint16_t hyperv_ext_hcall_get_boot_zeroed_memory(uint64_t outgpa, bool fast) +{ + uint16_t ret; + hwaddr len; + struct boot_zero_opaque priv = { 0 }; + hwaddr write_len = 0; + + if (fast) { + ret = HV_STATUS_INVALID_HYPERCALL_CODE; + goto cleanup; + } + + len = sizeof(*priv.zr); + priv.zr = cpu_physical_memory_map(outgpa, &len, 1); + if (!priv.zr || len < sizeof(*priv.zr)) { + ret = HV_STATUS_INSUFFICIENT_MEMORY; + goto cleanup; + } + + priv.zero_blocks = physical_memory_get_mapped_ranges(&priv.num, &priv.order); + priv.num *= BITS_PER_LONG; + + priv.zr->range_count = 0; + if (priv.zero_blocks) { + RCU_READ_LOCK_GUARD(); + flatview_for_each_range(address_space_to_flatview(&address_space_memory), + bootzero_mem_cb, &priv); + priv.zr->range_count = priv.count; + } + write_len = sizeof(priv.zr->range_count) + + priv.count * sizeof(priv.zr->ranges[0]); + ret = HV_STATUS_SUCCESS; + +cleanup: + if (priv.zr) { + cpu_physical_memory_unmap(priv.zr, len, 1, write_len); + } + return ret; +} + uint16_t hyperv_hcall_signal_event(uint64_t param, bool fast) { EventFlagHandler *handler; @@ -1014,6 +1111,30 @@ uint64_t hyperv_syndbg_query_options(void) return msg.u.query_options.options; } +void hyperv_boot_zeroed_setup(void) +{ + static bool initialized; + + if (initialized) { + return; + } + + initialized = true; + + if (runstate_check(RUN_STATE_INMIGRATE)) { + /* + * We do not track zeroed memory across migrations. 
+ * The hypercall is only issued early during boot, so we don't lose + * much by not dealing with the complication of moving the zeroed + * state of guest memory to the migrated instance. + */ + return; + } + + physical_memory_init_mapped_tracker(current_machine->ram_size >> TARGET_PAGE_BITS, + HV_BOOT_ZEROED_PAGE_SHIFT); +} + static bool vmbus_recommended_features_enabled; bool hyperv_are_vmbus_recommended_features_enabled(void) diff --git a/include/hw/hyperv/hyperv-proto.h b/include/hw/hyperv/hyperv-proto.h index f1d1d2eb26..5bf5684d11 100644 --- a/include/hw/hyperv/hyperv-proto.h +++ b/include/hw/hyperv/hyperv-proto.h @@ -36,6 +36,7 @@ #define HV_RETRIEVE_DEBUG_DATA 0x006a #define HV_RESET_DEBUG_SESSION 0x006b #define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 +#define HV_EXT_CALL_GET_BOOT_ZEROED_MEMORY 0x8002 #define HV_HYPERCALL_FAST (1u << 16) /* @@ -192,4 +193,14 @@ struct hyperv_retrieve_debug_data_output { uint32_t retrieved_count; uint32_t remaining_count; } __attribute__ ((__packed__)); + +struct hyperv_get_boot_zeroed_memory_range { + uint64_t start_pfn; + uint64_t page_count; +} __attribute__ ((__packed__)); + +struct hyperv_get_boot_zeroed_memory_output { + uint64_t range_count; + struct hyperv_get_boot_zeroed_memory_range ranges[255]; +} __attribute__ ((__packed__)); #endif diff --git a/include/hw/hyperv/hyperv.h b/include/hw/hyperv/hyperv.h index e29d60f565..8e7fec8e4a 100644 --- a/include/hw/hyperv/hyperv.h +++ b/include/hw/hyperv/hyperv.h @@ -102,11 +102,18 @@ uint16_t hyperv_hcall_post_dbg_data(uint64_t ingpa, uint64_t outgpa, bool fast); */ uint16_t hyperv_ext_hcall_query_caps(uint64_t sup, uint64_t outgpa, bool fast); +/* + * Process HVCALL_EXT_GET_BOOT_ZEROED_MEMORY hypercall. 
+ */ +uint16_t hyperv_ext_hcall_get_boot_zeroed_memory(uint64_t outgpa, bool fast); + uint32_t hyperv_syndbg_send(uint64_t ingpa, uint32_t count); uint32_t hyperv_syndbg_recv(uint64_t ingpa, uint32_t count); void hyperv_syndbg_set_pending_page(uint64_t ingpa); uint64_t hyperv_syndbg_query_options(void); +void hyperv_boot_zeroed_setup(void); + typedef enum HvSynthDbgMsgType { HV_SYNDBG_MSG_CONNECTION_INFO, HV_SYNDBG_MSG_SEND, diff --git a/include/system/physmem.h b/include/system/physmem.h index da91b77bd9..7f5063b0af 100644 --- a/include/system/physmem.h +++ b/include/system/physmem.h @@ -53,4 +53,12 @@ bool physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap, ram_addr_t length); int ram_block_rebind(Error **errp); +void physical_memory_init_mapped_tracker(unsigned long num_pages, + unsigned int order); + +void physical_memory_set_mapped_range(ram_addr_t addr, ram_addr_t length); + +const unsigned long *physical_memory_get_mapped_ranges(unsigned long *len, + unsigned int *order); + #endif diff --git a/system/memory.c b/system/memory.c index 739ba11da6..21d14489b9 100644 --- a/system/memory.c +++ b/system/memory.c @@ -2188,9 +2188,12 @@ void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) void memory_region_set_dirty(MemoryRegion *mr, hwaddr addr, hwaddr size) { + ram_addr_t ramaddr; + assert(mr->ram_block); - physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr, - size, + ramaddr = memory_region_get_ram_addr(mr); + physical_memory_set_mapped_range(ramaddr + addr, size); + physical_memory_set_dirty_range(ramaddr + addr, size, memory_region_get_dirty_log_mask(mr)); } diff --git a/system/physmem.c b/system/physmem.c index c58d940e80..0728367035 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -180,6 +180,56 @@ struct DirtyBitmapSnapshot { unsigned long dirty[]; }; +/** + * TODO: Or as part of RAMBlock, but we don't intend to extend once set up at + * init time. 
+ * @mapped_blocks: pointer to the bitmap itself, may be NULL if no tracking. + * @mapped_blocks_num: the length of the bitmap, in sizeof(*mapped_blocks) + * @mapped_blocks_order: the order, in pages, i.e., 0 = 1 bit per page. + */ +static unsigned long *mapped_blocks; +static unsigned long mapped_blocks_num; +static unsigned int mapped_blocks_order; + +void physical_memory_init_mapped_tracker(unsigned long num_pages, + unsigned int order) +{ + mapped_blocks_order = order; + mapped_blocks_num = DIV_ROUND_UP(DIV_ROUND_UP(num_pages, 1ULL << order), + BITS_PER_LONG); + mapped_blocks = g_malloc0(sizeof(*mapped_blocks) * mapped_blocks_num); +} + +void physical_memory_set_mapped_range(ram_addr_t addr, ram_addr_t length) +{ + unsigned long first_bit, last_bit; + unsigned long max_bits = mapped_blocks_num * BITS_PER_LONG; + + if (mapped_blocks == NULL || length == 0) { + return; + } + + /* + * Since we don't track hotplugged memory, we may get requests to + * (partially or fully) set a region we don't track. + */ + first_bit = addr >> (TARGET_PAGE_BITS + mapped_blocks_order); + if (first_bit >= max_bits) { + return; + } + last_bit = MIN((addr + length - 1) >> (TARGET_PAGE_BITS + mapped_blocks_order), + max_bits - 1); + bitmap_set_atomic(mapped_blocks, first_bit, last_bit - first_bit + 1); +} + +const unsigned long *physical_memory_get_mapped_ranges(unsigned long *len, + unsigned int *order) +{ + *len = mapped_blocks_num; + *order = mapped_blocks_order; + return mapped_blocks; +} + static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes) { static unsigned alloc_hint = 16; @@ -3131,6 +3181,8 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr, assert(ramaddr != RAM_ADDR_INVALID); addr += ramaddr; + physical_memory_set_mapped_range(addr, length); + /* No early return if dirty_log_mask is or becomes 0, because * physical_memory_set_dirty_range will still call * xen_modified_memory. 
diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 4d77de52b4..0a47dc376d 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -10762,6 +10762,8 @@ static const Property x86_cpu_properties[] = { HYPERV_FEAT_TLBFLUSH_DIRECT, 0), DEFINE_PROP_BIT64("hv-ext-query-caps", X86CPU, hyperv_features, HYPERV_FEAT_EXT_CALLS, 0), + DEFINE_PROP_BIT64("hv-boot-zeroed-mem", X86CPU, hyperv_features, + HYPERV_FEAT_BOOT_ZEROED_MEMORY, 0), DEFINE_PROP_ON_OFF_AUTO("hv-no-nonarch-coresharing", X86CPU, hyperv_no_nonarch_cs, ON_OFF_AUTO_OFF), #ifdef CONFIG_SYNDBG diff --git a/target/i386/cpu.h b/target/i386/cpu.h index eb870fa015..9e64c78eaf 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1481,6 +1481,7 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); #define HYPERV_FEAT_TLBFLUSH_EXT 19 #define HYPERV_FEAT_TLBFLUSH_DIRECT 20 #define HYPERV_FEAT_EXT_CALLS 21 +#define HYPERV_FEAT_BOOT_ZEROED_MEMORY 22 #ifndef HYPERV_SPINLOCK_NEVER_NOTIFY #define HYPERV_SPINLOCK_NEVER_NOTIFY 0xFFFFFFFF diff --git a/target/i386/kvm/hyperv-proto.h b/target/i386/kvm/hyperv-proto.h index 4eb2955ac5..ec38b717e4 100644 --- a/target/i386/kvm/hyperv-proto.h +++ b/target/i386/kvm/hyperv-proto.h @@ -94,6 +94,11 @@ #define HV_NESTED_DIRECT_FLUSH (1u << 17) #define HV_NESTED_MSR_BITMAP (1u << 19) +/* + * HV_EXT_CALL_QUERY_CAPABILITIES bits + */ +#define HV_EXT_CAP_GET_BOOT_ZEROED_MEMORY (1u << 0) + /* * Basic virtualized MSRs */ diff --git a/target/i386/kvm/hyperv.c b/target/i386/kvm/hyperv.c index 807acaf6b1..dc226b4419 100644 --- a/target/i386/kvm/hyperv.c +++ b/target/i386/kvm/hyperv.c @@ -123,6 +123,14 @@ int kvm_hv_handle_exit(X86CPU *cpu, struct kvm_hyperv_exit *exit) hyperv_ext_hcall_query_caps(hv_build_ext_call_caps(CPU(cpu)), out_param, fast); break; + case HV_EXT_CALL_GET_BOOT_ZEROED_MEMORY: + if (!hyperv_feat_enabled(cpu, HYPERV_FEAT_BOOT_ZEROED_MEMORY)) { + exit->u.hcall.result = HV_STATUS_INVALID_HYPERCALL_CODE; + } else { + exit->u.hcall.result = + 
hyperv_ext_hcall_get_boot_zeroed_memory(out_param, fast); + } + break; default: exit->u.hcall.result = HV_STATUS_INVALID_HYPERCALL_CODE; } diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 586656258d..70ac56624f 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1143,6 +1143,14 @@ static struct { .bits = HV_ENABLE_EXT_HYPERCALLS} } }, + [HYPERV_FEAT_BOOT_ZEROED_MEMORY] = { + .desc = "enlighten guest about pre-zeroed memory (hv-boot-zeroed-mem)", + .flags = { + {.func = HV_EXT_CALL_QUERY_CAPABILITIES, .reg = 0, + .bits = HV_EXT_CAP_GET_BOOT_ZEROED_MEMORY} + }, + .dependencies = BIT(HYPERV_FEAT_EXT_CALLS) + }, }; static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max, @@ -1378,6 +1386,11 @@ static bool hyperv_feature_supported(CPUState *cs, int feature) continue; } + if (func == HV_EXT_CALL_QUERY_CAPABILITIES) { + /* These do not correspond to host CPUID feature bits. */ + return true; + } + if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) { return false; } @@ -1819,6 +1832,10 @@ static int hyperv_init_vcpu(X86CPU *cpu) hyperv_x86_set_vmbus_recommended_features_enabled(); } + if (hyperv_feat_enabled(cpu, HYPERV_FEAT_BOOT_ZEROED_MEMORY)) { + hyperv_boot_zeroed_setup(); + } + return 0; } -- 2.47.3
