This adds the necessary pieces to implement savevm / migration for the pseries machine. The most complex part here is migrating the hash table - for the paravirtualized pseries machine the guest's hash page table is not stored within guest memory, but externally and the guest accesses it via hypercalls.
This patch uses a hypervisor reserved bit of the HPTE as a dirty bit (tracking changes to the HPTE itself, not the page it references). This is used to implement a live migration style incremental save and restore of the hash table contents. In addition it adds VMStateDescription information to save and restore the (few) remaining pieces of state information needed by the pseries machine. Signed-off-by: David Gibson <da...@gibson.dropbear.id.au> --- hw/spapr.c | 268 +++++++++++++++++++++++++++++++++++++++++++++++++++++- hw/spapr.h | 52 ++++++++++- hw/spapr_hcall.c | 41 +-------- 3 files changed, 319 insertions(+), 42 deletions(-) diff --git a/hw/spapr.c b/hw/spapr.c index e8dbd97..2b05fb4 100644 --- a/hw/spapr.c +++ b/hw/spapr.c @@ -648,7 +648,7 @@ static void spapr_cpu_reset(void *opaque) env->spr[SPR_HIOR] = 0; - env->external_htab = spapr->htab; + env->external_htab = (uint8_t *)spapr->htab; env->htab_base = -1; env->htab_mask = HTAB_SIZE(spapr) - 1; env->spr[SPR_SDR1] = (unsigned long)spapr->htab | @@ -670,6 +670,268 @@ static int spapr_vga_init(PCIBus *pci_bus) } } +static const VMStateDescription vmstate_spapr = { + .name = "spapr", + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .fields = (VMStateField []) { + /* Sanity check */ + VMSTATE_UINT32_EQUAL(next_irq, sPAPREnvironment), + + /* RTC offset */ + VMSTATE_UINT64(rtc_offset, sPAPREnvironment), + + VMSTATE_END_OF_LIST() + }, +}; + +#define HPTE(_table, _i) (void *)(((uint64_t *)(_table)) + ((_i) * 2)) +#define HPTE_VALID(_hpte) (tswap64(*((uint64_t *)(_hpte))) & HPTE_V_VALID) +#define HPTE_DIRTY(_hpte) (tswap64(*((uint64_t *)(_hpte))) & HPTE_V_HPTE_DIRTY) +#define CLEAN_HPTE(_hpte) ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE_V_HPTE_DIRTY)) + +static int htab_save_setup(QEMUFile *f, void *opaque) +{ + sPAPREnvironment *spapr = opaque; + + spapr->htab_save_index = 0; + spapr->htab_first_pass = true; + + /* "Iteration" header */ + qemu_put_be32(f, spapr->htab_shift); + + return 0; +} + +#define MAX_ITERATION_NS 5000000 /* 5 ms */ + +static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr, + int64_t max_ns) +{ + int htabslots = HTAB_SIZE(spapr) / HPTE_SIZE; + int index = spapr->htab_save_index; + int64_t starttime = qemu_get_clock_ns(rt_clock); + + assert(spapr->htab_first_pass); + + do { + int chunkstart; + + /* Consume invalid HPTEs */ + while ((index < htabslots) + && !HPTE_VALID(HPTE(spapr->htab, index))) { + index++; + CLEAN_HPTE(HPTE(spapr->htab, index)); + } + + /* Consume valid HPTEs */ + chunkstart = index; + while ((index < htabslots) + && HPTE_VALID(HPTE(spapr->htab, index))) { + index++; + CLEAN_HPTE(HPTE(spapr->htab, index)); + } + + if (index > chunkstart) { + int n_valid = index - chunkstart; + + qemu_put_be32(f, chunkstart); + qemu_put_be16(f, n_valid); + qemu_put_be16(f, 0); + qemu_put_buffer(f, HPTE(spapr->htab, chunkstart), + HPTE_SIZE * n_valid); + + if ((qemu_get_clock_ns(rt_clock) - starttime) > max_ns) { + break; + } + } + } while ((index < htabslots) && !qemu_file_rate_limit(f)); + + if (index >= htabslots) { + assert(index == htabslots); + index = 0; + spapr->htab_first_pass = false; + } + spapr->htab_save_index = index; +} + +static bool htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr, + int64_t max_ns) +{ + bool final = max_ns < 0; + int htabslots = HTAB_SIZE(spapr) / HPTE_SIZE; + int examined = 0, sent = 0; + int index = spapr->htab_save_index; + int64_t starttime = qemu_get_clock_ns(rt_clock); + + assert(!spapr->htab_first_pass); + + do { + int chunkstart, invalidstart; + + /* Consume non-dirty HPTEs */ + while ((index < htabslots) + && !HPTE_DIRTY(HPTE(spapr->htab, index))) { + index++; + examined++; + } + + chunkstart = index; + /* Consume valid dirty HPTEs */ + while ((index < htabslots) + && HPTE_DIRTY(HPTE(spapr->htab, index)) + && HPTE_VALID(HPTE(spapr->htab, index))) { + CLEAN_HPTE(HPTE(spapr->htab, index)); + index++; + examined++; + } + + invalidstart = index; + /* Consume invalid dirty HPTEs */ + while ((index < htabslots) + && HPTE_DIRTY(HPTE(spapr->htab, index)) + && !HPTE_VALID(HPTE(spapr->htab, index))) { + CLEAN_HPTE(HPTE(spapr->htab, index)); + index++; + examined++; + } + + if (index > chunkstart) { + int n_valid = invalidstart - chunkstart; + int n_invalid = index - invalidstart; + + qemu_put_be32(f, chunkstart); + qemu_put_be16(f, n_valid); + qemu_put_be16(f, n_invalid); + qemu_put_buffer(f, HPTE(spapr->htab, chunkstart), + HPTE_SIZE * n_valid); + sent += index - chunkstart; + + if (!final && (qemu_get_clock_ns(rt_clock) - starttime) > max_ns) { + break; + } + } + + if (examined >= htabslots) { + break; + } + + if (index >= htabslots) { + assert(index == htabslots); + index = 0; + } + } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final)); + + if (index >= htabslots) { + assert(index == htabslots); + index = 0; + } + + spapr->htab_save_index = index; + + return (examined >= htabslots) && (sent == 0); +} + +static int htab_save_iterate(QEMUFile *f, void *opaque) +{ + sPAPREnvironment *spapr = opaque; + bool nothingleft = false;; + + /* Iteration header */ + qemu_put_be32(f, 0); + + if (spapr->htab_first_pass) { + htab_save_first_pass(f, spapr, MAX_ITERATION_NS); + } else { + nothingleft = htab_save_later_pass(f, spapr, MAX_ITERATION_NS); + } + + /* End marker */ + qemu_put_be32(f, 0); + qemu_put_be16(f, 0); + qemu_put_be16(f, 0); + + return nothingleft ? 1 : 0; +} + +static int htab_save_complete(QEMUFile *f, void *opaque) +{ + sPAPREnvironment *spapr = opaque; + + /* Iteration header */ + qemu_put_be32(f, 0); + + htab_save_later_pass(f, spapr, -1); + + /* End marker */ + qemu_put_be32(f, 0); + qemu_put_be16(f, 0); + qemu_put_be16(f, 0); + + return 0; +} + +static int htab_load(QEMUFile *f, void *opaque, int version_id) +{ + sPAPREnvironment *spapr = opaque; + uint32_t section_hdr; + + if (version_id < 1 || version_id > 1) { + fprintf(stderr, "htab_load() bad version\n"); + return -EINVAL; + } + + section_hdr = qemu_get_be32(f); + + if (section_hdr) { + /* First section, just the hash shift */ + if (spapr->htab_shift != section_hdr) { + return -EINVAL; + } + return 0; + } + + while (true) { + uint32_t index; + uint16_t n_valid, n_invalid; + + index = qemu_get_be32(f); + n_valid = qemu_get_be16(f); + n_invalid = qemu_get_be16(f); + + if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) { + /* End of Stream */ + break; + } + + if ((index + n_valid + n_invalid) >= + (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) { + /* Bad index in stream */ + fprintf(stderr, "htab_load() bad index %d (%hd+%hd entries) " + "in htab stream\n", index, n_valid, n_invalid); + return -EINVAL; + } + + if (n_valid) { + qemu_get_buffer(f, HPTE(spapr->htab, index), HPTE_SIZE * n_valid); + } + if (n_invalid) { + memset(HPTE(spapr->htab, index + n_valid), 0, + HPTE_SIZE * n_invalid); + } + } + + return 0; +} + +static SaveVMHandlers savevm_htab_handlers = { + .save_live_setup = htab_save_setup, + .save_live_iterate = htab_save_iterate, + .save_live_complete = htab_save_complete, + .load_state = htab_load, +}; + /* pSeries LPAR / sPAPR hardware init */ static void ppc_spapr_init(ram_addr_t ram_size, const char *boot_device, @@ -910,6 +1172,10 @@ static void ppc_spapr_init(ram_addr_t ram_size, spapr->entry_point = 0x100; + vmstate_register(NULL, 0, &vmstate_spapr, spapr); + register_savevm_live(NULL, "spapr/htab", -1, 1, + &savevm_htab_handlers, spapr); + /* Prepare the device tree */ spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, initrd_base, initrd_size, diff --git a/hw/spapr.h b/hw/spapr.h index f6864da..c306c83 100644 --- a/hw/spapr.h +++ b/hw/spapr.h @@ -8,27 +8,71 @@ struct VIOsPAPRBus; struct sPAPRPHBState; struct icp_state; +#define HPTE_SIZE 16 +#define HPTES_PER_GROUP 8 + +#define HPTE_V_SSIZE_SHIFT 62 +#define HPTE_V_AVPN_SHIFT 7 +#define HPTE_V_AVPN 0x3fffffffffffff80ULL +#define HPTE_V_AVPN_VAL(x) (((x) & HPTE_V_AVPN) >> HPTE_V_AVPN_SHIFT) +#define HPTE_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff80UL)) +#define HPTE_V_BOLTED 0x0000000000000010ULL +#define HPTE_V_LOCK 0x0000000000000008ULL +#define HPTE_V_LARGE 0x0000000000000004ULL +#define HPTE_V_SECONDARY 0x0000000000000002ULL +#define HPTE_V_VALID 0x0000000000000001ULL + +#define HPTE_R_PP0 0x8000000000000000ULL +#define HPTE_R_TS 0x4000000000000000ULL +#define HPTE_R_KEY_HI 0x3000000000000000ULL +#define HPTE_R_RPN_SHIFT 12 +#define HPTE_R_RPN 0x3ffffffffffff000ULL +#define HPTE_R_FLAGS 0x00000000000003ffULL +#define HPTE_R_PP 0x0000000000000003ULL +#define HPTE_R_N 0x0000000000000004ULL +#define HPTE_R_G 0x0000000000000008ULL +#define HPTE_R_M 0x0000000000000010ULL +#define HPTE_R_I 0x0000000000000020ULL +#define HPTE_R_W 0x0000000000000040ULL +#define HPTE_R_WIMG 0x0000000000000078ULL +#define HPTE_R_C 0x0000000000000080ULL +#define HPTE_R_R 0x0000000000000100ULL +#define HPTE_R_KEY_LO 0x0000000000000e00ULL + +#define HPTE_V_1TB_SEG 0x4000000000000000ULL +#define HPTE_V_VRMA_MASK 0x4001ffffff000000ULL + +#define HPTE_V_HPTE_DIRTY 0x0000000000000040ULL + +typedef struct { + uint64_t pte0, pte1; +} HashPTE; + typedef struct sPAPREnvironment { struct VIOsPAPRBus *vio_bus; QLIST_HEAD(, sPAPRPHBState) phbs; struct icp_state *icp; target_phys_addr_t ram_limit; - void *htab; - long htab_shift; + HashPTE *htab; + uint32_t htab_shift; target_phys_addr_t rma_size; int vrma_adjust; target_phys_addr_t fdt_addr, rtas_addr; long rtas_size; void *fdt_skel; target_ulong entry_point; - int next_irq; - int rtc_offset; + uint32_t next_irq; + uint64_t rtc_offset; char *cpu_model; bool has_graphics; uint32_t epow_irq; Notifier epow_notifier; + + /* Migration state */ + int htab_save_index; + bool htab_first_pass; } sPAPREnvironment; #define H_SUCCESS 0 diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c index 621dabd..0d7675b 100644 --- a/hw/spapr_hcall.c +++ b/hw/spapr_hcall.c @@ -6,39 +6,6 @@ #include "helper_regs.h" #include "hw/spapr.h" -#define HPTES_PER_GROUP 8 - -#define HPTE_V_SSIZE_SHIFT 62 -#define HPTE_V_AVPN_SHIFT 7 -#define HPTE_V_AVPN 0x3fffffffffffff80ULL -#define HPTE_V_AVPN_VAL(x) (((x) & HPTE_V_AVPN) >> HPTE_V_AVPN_SHIFT) -#define HPTE_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff80UL)) -#define HPTE_V_BOLTED 0x0000000000000010ULL -#define HPTE_V_LOCK 0x0000000000000008ULL -#define HPTE_V_LARGE 0x0000000000000004ULL -#define HPTE_V_SECONDARY 0x0000000000000002ULL -#define HPTE_V_VALID 0x0000000000000001ULL - -#define HPTE_R_PP0 0x8000000000000000ULL -#define HPTE_R_TS 0x4000000000000000ULL -#define HPTE_R_KEY_HI 0x3000000000000000ULL -#define HPTE_R_RPN_SHIFT 12 -#define HPTE_R_RPN 0x3ffffffffffff000ULL -#define HPTE_R_FLAGS 0x00000000000003ffULL -#define HPTE_R_PP 0x0000000000000003ULL -#define HPTE_R_N 0x0000000000000004ULL -#define HPTE_R_G 0x0000000000000008ULL -#define HPTE_R_M 0x0000000000000010ULL -#define HPTE_R_I 0x0000000000000020ULL -#define HPTE_R_W 0x0000000000000040ULL -#define HPTE_R_WIMG 0x0000000000000078ULL -#define HPTE_R_C 0x0000000000000080ULL -#define HPTE_R_R 0x0000000000000100ULL -#define HPTE_R_KEY_LO 0x0000000000000e00ULL - -#define HPTE_V_1TB_SEG 0x4000000000000000ULL -#define HPTE_V_VRMA_MASK 0x4001ffffff000000ULL - static target_ulong compute_tlbie_rb(target_ulong v, target_ulong r, target_ulong pte_index) { @@ -149,7 +116,7 @@ static target_ulong h_enter(CPUPPCState *env, sPAPREnvironment *spapr, } stq_p(hpte + (HASH_PTE_SIZE_64/2), ptel); /* eieio(); FIXME: need some sort of barrier for smp? */ - stq_p(hpte, pteh); + stq_p(hpte, pteh | HPTE_V_HPTE_DIRTY); args[0] = pte_index + i; return H_SUCCESS; @@ -186,7 +153,7 @@ static target_ulong remove_hpte(CPUPPCState *env, target_ulong ptex, } *vp = v; *rp = r; - stq_p(hpte, 0); + stq_p(hpte, HPTE_V_HPTE_DIRTY); rb = compute_tlbie_rb(v, r, ptex); ppc_tlb_invalidate_one(env, rb); return REMOVE_SUCCESS; @@ -313,11 +280,11 @@ static target_ulong h_protect(CPUPPCState *env, sPAPREnvironment *spapr, r |= (flags << 48) & HPTE_R_KEY_HI; r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); rb = compute_tlbie_rb(v, r, pte_index); - stq_p(hpte, v & ~HPTE_V_VALID); + stq_p(hpte, (v & ~HPTE_V_VALID) | HPTE_V_HPTE_DIRTY); ppc_tlb_invalidate_one(env, rb); stq_p(hpte + (HASH_PTE_SIZE_64/2), r); /* Don't need a memory barrier, due to qemu's global lock */ - stq_p(hpte, v); + stq_p(hpte, v | HPTE_V_HPTE_DIRTY); return H_SUCCESS; } -- 1.7.10.4