On 05.09.2013, at 06:54, Alexey Kardashevskiy wrote: > On 09/05/2013 02:30 PM, David Gibson wrote: >> On Tue, Sep 03, 2013 at 05:31:42PM +1000, Alexey Kardashevskiy wrote: >>> This allows guests to have a different timebase origin from the host. >>> >>> This is needed for migration, where a guest can migrate from one host >>> to another and the two hosts might have a different timebase origin. >>> However, the timebase seen by the guest must not go backwards, and >>> should go forwards only by a small amount corresponding to the time >>> taken for the migration. >>> >>> This is only supported for recent POWER hardware which has the TBU40 >>> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not >>> 970. >>> >>> This adds kvm_access_one_reg() to access a special register which is not >>> in env->spr. >>> >>> The feature must be present in the host kernel. >>> >>> Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> >>> --- >>> >>> This is an RFC but not a final patch. Can break something but I just do not >>> see what. >>> >>> --- >>> hw/ppc/ppc.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ >>> include/hw/ppc/ppc.h | 4 ++++ >>> target-ppc/kvm.c | 23 +++++++++++++++++++++++ >>> target-ppc/machine.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ >>> trace-events | 3 +++ >>> 5 files changed, 123 insertions(+) >>> >>> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c >>> index 1e3cab3..7d08c9a 100644 >>> --- a/hw/ppc/ppc.c >>> +++ b/hw/ppc/ppc.c >>> @@ -31,6 +31,7 @@ >>> #include "hw/loader.h" >>> #include "sysemu/kvm.h" >>> #include "kvm_ppc.h" >>> +#include "trace.h" >>> >>> //#define PPC_DEBUG_IRQ >>> #define PPC_DEBUG_TB >>> @@ -796,6 +797,54 @@ static void cpu_ppc_set_tb_clk (void *opaque, uint32_t >>> freq) >>> cpu_ppc_store_purr(cpu, 0x0000000000000000ULL); >>> } >>> >>> +/* >>> + * Calculate timebase on the destination side of migration >> >>> + * We calculate new timebase offset as shown below: >>> + * 1) Gtb2 = Gtb1 + max(tod2 - tod1, 0) >>> + * Gtb2 = tb2 + off2 >>> + * Gtb1 = tb1 + off1 >>> + * 2) tb2 + off2 = tb1 + off1 + max(tod2 - tod1, 0) >>> + * 3) off2 = tb1 - tb2 + off1 + max(tod2 - tod1, 0) >>> + * >>> + * where: >>> + * Gtb2 - destination guest timebase >>> + * tb2 - destination host timebase >>> + * off2 - destination timebase offset >>> + * tod2 - destination time of the day >>> + * Gtb1 - source guest timebase >>> + * tb1 - source host timebase >>> + * off1 - source timebase offset >>> + * tod1 - source time of the day >>> + * >>> + * The result we want is in @off2 >>> + * >>> + * Two conditions must be met for @off2: >>> + * 1) off2 must be multiple of 2^24 ticks as it will be set via TBU40 SPR >>> + * 2) Gtb2 >= Gtb1 >> >> What about the TCG case, where there is not host timebase, only a a >> host system clock? > > > cpu_get_real_ticks() returns ticks, this is what the patch cares about. > What is the difference between KVM and TCG here? > > >>> + */ >>> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env) >>> +{ >>> + uint64_t tb2, tod2, off2; >>> + int ratio = tb_env->tb_freq / 1000000; >>> + struct timeval tv; >>> + >>> + tb2 = cpu_get_real_ticks(); >>> + gettimeofday(&tv, NULL); >>> + tod2 = tv.tv_sec * 1000000 + tv.tv_usec; >>> + >>> + off2 = tb_env->timebase - tb2 + tb_env->tb_offset; >>> + if (tod2 > tb_env->time_of_the_day) { >>> + off2 += (tod2 - tb_env->time_of_the_day) * ratio; >>> + } >>> + off2 = ROUND_UP(off2, 1 << 24); >>> + >>> + trace_ppc_tb_adjust(tb_env->tb_offset, off2, >>> + (int64_t)off2 - tb_env->tb_offset); >>> + >>> + tb_env->tb_offset = off2; >>> +} >>> + >>> /* Set up (once) timebase frequency (in Hz) */ >>> clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq) >>> { >>> diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h >>> index 132ab97..235871c 100644 >>> --- a/include/hw/ppc/ppc.h >>> +++ b/include/hw/ppc/ppc.h >>> @@ -32,6 +32,9 @@ struct ppc_tb_t { >>> uint64_t purr_start; >>> void *opaque; >>> uint32_t flags; >>> + /* Cached values for live migration purposes */ >>> + uint64_t timebase; >>> + uint64_t time_of_the_day; >> >> How is the time of day encoded here? > > > Microseconds. I'll put a comment here, I just thought it is quite obvious > as gettimeofday() returns microseconds. > > >>> }; >>> >>> /* PPC Timers flags */ >>> @@ -46,6 +49,7 @@ struct ppc_tb_t { >>> */ >>> >>> uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t >>> tb_offset); >>> +void cpu_ppc_adjust_tb_offset(ppc_tb_t *tb_env); >>> clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq); >>> /* Embedded PowerPC DCR management */ >>> typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn); >>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c >>> index 7af9e3d..93df955 100644 >>> --- a/target-ppc/kvm.c >>> +++ b/target-ppc/kvm.c >>> @@ -35,6 +35,7 @@ >>> #include "hw/sysbus.h" >>> #include "hw/ppc/spapr.h" >>> #include "hw/ppc/spapr_vio.h" >>> +#include "hw/ppc/ppc.h" >>> #include "sysemu/watchdog.h" >>> >>> //#define DEBUG_KVM >>> @@ -761,6 +762,22 @@ static int kvm_put_vpa(CPUState *cs) >>> } >>> #endif /* TARGET_PPC64 */ >>> >>> +static int kvm_access_one_reg(CPUState *cs, bool set, __u64 id, >>> void *addr) >> >> I think it would be nicer to have seperate set_one_reg and get_one_reg >> functions, rather than using a direction parameter. > > > I am regularly getting requests from Alex to merge functions which look > almost the same. Please do not make it worse :)
The reason I ask you to merge function is to reduce code duplication. The API should still look sane and I think what David suggests makes a lot of sense. In normal QEMU fashion, that translates to: static int kvm_access_one_reg(CPUState *cs, uint64_t id, void *addr, bool set) { .... } int kvm_set_one_reg(CPUState *cs, uint64_t id, void *addr) { return kvm_access_one_reg(cs, id, addr, true); } int kvm_get_one_reg(CPUState *cs, uint64_t id, void *addr) { return kvm_access_one_reg(cs, id, addr, false); } Also, this code should live in generic KVM code that can be used by more than just the PPC target. > > >>> +{ >>> + struct kvm_one_reg reg = { >>> + .id = id, >>> + .addr = (uintptr_t)addr, >>> + }; >>> + int ret = kvm_vcpu_ioctl(cs, set ? KVM_SET_ONE_REG : KVM_GET_ONE_REG, >>> ®); >>> + >>> + if (ret) { >>> + DPRINTF("Unable to %s time base offset to KVM: %s\n", >>> + set ? "set" : "get", strerror(errno)); >>> + } >>> + >>> + return ret; >>> +} >>> + >>> int kvm_arch_put_registers(CPUState *cs, int level) >>> { >>> PowerPCCPU *cpu = POWERPC_CPU(cs); >>> @@ -873,6 +890,9 @@ int kvm_arch_put_registers(CPUState *cs, int level) >>> DPRINTF("Warning: Unable to set VPA information to KVM\n"); >>> } >>> } >>> + >>> + kvm_access_one_reg(cs, 1, KVM_REG_PPC_TB_OFFSET, >>> + &env->tb_env->tb_offset); >> >> Hrm. It may be too late, but would it make more sense for qemu to >> just calculate the "ideal" offset - not 24-bit aligned, and have the >> kernel round that up to a value suitable for TBU40. I'm thinking that >> might be more robust if someone decides that POWER10 should have a >> TBU50 or a TBU30 instead of TBU40. > > > No, not late, the kernel patchset has not been noticed yet by anyone :) > Just a little remark here - QEMU sets one value to the kernel as an offset > but when it will be about to migrate again and read this offset from the > kernel, it will be different (rounded up) from what QEMU set. Not a problem > though. > > >>> #endif /* TARGET_PPC64 */ >>> } >>> >>> @@ -1082,6 +1102,9 @@ int kvm_arch_get_registers(CPUState *cs) >>> DPRINTF("Warning: Unable to get VPA information from >>> KVM\n"); >>> } >>> } >>> + >>> + kvm_access_one_reg(cs, 0, KVM_REG_PPC_TB_OFFSET, >>> + &env->tb_env->tb_offset); >>> #endif >>> } >>> >>> diff --git a/target-ppc/machine.c b/target-ppc/machine.c >>> index 12e1512..d1ffc7f 100644 >>> --- a/target-ppc/machine.c >>> +++ b/target-ppc/machine.c >>> @@ -1,5 +1,6 @@ >>> #include "hw/hw.h" >>> #include "hw/boards.h" >>> +#include "hw/ppc/ppc.h" >>> #include "sysemu/kvm.h" >>> #include "helper_regs.h" >>> >>> @@ -459,6 +460,45 @@ static const VMStateDescription vmstate_tlbmas = { >>> } >>> }; >>> >>> +static void timebase_pre_save(void *opaque) >>> +{ >>> + ppc_tb_t *tb_env = opaque; >>> + struct timeval tv; >>> + >>> + gettimeofday(&tv, NULL); >>> + tb_env->time_of_the_day = tv.tv_sec * 1000000 + tv.tv_usec; >>> + tb_env->timebase = cpu_get_real_ticks(); >>> +} >>> + >>> +static int timebase_post_load(void *opaque, int version_id) >>> +{ >>> + ppc_tb_t *tb_env = opaque; >>> + >>> + if (!tb_env) { >>> + printf("NO TB!\n"); >>> + return -1; >>> + } >>> + cpu_ppc_adjust_tb_offset(tb_env); >>> + >>> + return 0; >>> +} >>> + >>> +static const VMStateDescription vmstate_timebase = { >>> + .name = "cpu/timebase", >>> + .version_id = 1, >>> + .minimum_version_id = 1, >>> + .minimum_version_id_old = 1, >>> + .pre_save = timebase_pre_save, >>> + .post_load = timebase_post_load, >>> + .fields = (VMStateField []) { >>> + VMSTATE_UINT64(timebase, ppc_tb_t), >>> + VMSTATE_INT64(tb_offset, ppc_tb_t), >> >> tb_offset is inherently a local concept, since it depends on the host >> timebase. So how can it belong in the migration stream? > > > I do not have pure guest timebase in QEMU and I need it on the destination. > But I have host timebase + offset to calculate it. And tb_offset is already > in ppc_tb_t. It looked logical to me to send the existing field and add > only the missing part. I still don't understand. You want the guest visible timebase in the migration stream, no? Alex