From: Ross Lagerwall <ross.lagerw...@citrix.com>

Implement support for the apply, revert and replace actions.

To perform an action on a payload, the hypercall sets up a data structure
to schedule the work. A hook is added in all the return-to-guest paths to
check for work to do and execute it if needed. In this way, patches can be
applied with all CPUs idle and without stacks. The first CPU to enter
do_xsplice() becomes the master and raises a reschedule softirq to pull
all the other CPUs into do_xsplice() with no stack. Once all CPUs have
rendezvoused, all CPUs disable IRQs and NMIs are ignored. The system is
then quiescent and the master performs the action. After this, all CPUs
enable IRQs and NMIs are re-enabled.

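In outline, the rendezvous behaves as in the following sketch. This is a
simplified rendering for review, not the exact code from this patch: the
30ms timeouts, error paths and NMI masking are left out, and wait_for()
is shorthand for the polling loops:

    void do_xsplice(void)
    {
        if ( likely(!xsplice_work.do_work) )      /* Fast path. */
            return;

        if ( atomic_inc_and_test(&xsplice_work.semaphore) )
        {
            /* Master: pull every other CPU into do_xsplice(). */
            smp_call_function(reschedule_fn, NULL, 0);
            wait_for(semaphore == num_online_cpus() - 1);
            xsplice_work.ready = 1;               /* Tell others to IRQ-off. */
            wait_for(irq_semaphore == num_online_cpus() - 1);
            local_irq_disable();
            /* ... apply/revert/replace ... */
            local_irq_enable();
            xsplice_work.do_work = 0;             /* Release the others. */
        }
        else
        {
            /* Everyone else: spin with no stack in use. */
            while ( !xsplice_work.ready )
                cpu_relax();
            local_irq_disable();
            atomic_inc(&xsplice_work.irq_semaphore);
            while ( xsplice_work.do_work )
                cpu_relax();
            local_irq_enable();
        }
    }
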
The action to perform is one of:
- APPLY: For each function in the module, store the first 5 bytes of the
  old function and replace it with a jump to the new function.
- REVERT: Copy the previously stored bytes into the first 5 bytes of the
  old function.
- REPLACE: Revert each applied module and then apply the new module.

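To make the x86 APPLY step concrete, the core of xsplice_apply_jmp()
amounts to the following (a sketch only; the real code below keeps the
displacement in a uint32_t, which is equivalent modulo 2^32):

    uint8_t *old = (uint8_t *)func->old_addr;
    int32_t rel = func->new_addr - func->old_addr - PATCH_INSN_SIZE;

    memcpy(func->undo, old, PATCH_INSN_SIZE); /* Saved for REVERT. */
    old[0] = 0xe9;                            /* Opcode: JMP rel32. */
    memcpy(&old[1], &rel, sizeof(rel));       /* Little-endian displacement. */

With made-up example addresses, old_addr=0x1000 and new_addr=0x2000 give
rel32 = 0x2000 - 0x1000 - 5 = 0xffb, so the bytes written are
e9 fb 0f 00 00.
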
To prevent a deadlock with any other barrier in the system, the master
will wait for up to 30ms before timing out.

I've taken some measurements and found the patch application to take
about 100 μs on a 72 CPU system, whether idle or fully loaded.

Signed-off-by: Ross Lagerwall <ross.lagerw...@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
--
v2: - Pluck the 'struct xsplice_patch_func' in this patch.
    - Modify code per review comments.
    - Add more data in the keyboard handler.
    - Redo the patching code, split it in functions.
v3: - Add return_ macro for debug builds.
    - Move s/payload_list_lock/payload_list/ to earlier patch.
    - Remove const and use ELF types for xsplice_patch_func.
v4: - Add check routine to do simple sanity checks for various sections.
v5: - s/%p/PRIx64/ as ARM builds complain.
---
 xen/arch/arm/xsplice.c      |  10 +-
 xen/arch/x86/domain.c       |   4 +
 xen/arch/x86/hvm/svm/svm.c  |   2 +
 xen/arch/x86/hvm/vmx/vmcs.c |   2 +
 xen/arch/x86/xsplice.c      |  19 +++
 xen/common/xsplice.c        | 372 ++++++++++++++++++++++++++++++++++++++++++--
 xen/common/xsplice_elf.c    |   8 +-
 xen/include/asm-arm/nmi.h   |  13 ++
 xen/include/xen/xsplice.h   |  21 +++
 9 files changed, 432 insertions(+), 19 deletions(-)

diff --git a/xen/arch/arm/xsplice.c b/xen/arch/arm/xsplice.c
index 8d85fa9..06f6875 100644
--- a/xen/arch/arm/xsplice.c
+++ b/xen/arch/arm/xsplice.c
@@ -3,7 +3,15 @@
 #include <xen/xsplice_elf.h>
 #include <xen/xsplice.h>
 
-int xsplice_verify_elf(uint8_t *data, ssize_t len)
+void xsplice_apply_jmp(struct xsplice_patch_func *func)
+{
+}
+
+void xsplice_revert_jmp(struct xsplice_patch_func *func)
+{
+}
+
+int xsplice_verify_elf(struct xsplice_elf *elf, uint8_t *data)
 {
     return -ENOSYS;
 }
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 9d43f7b..b5995b9 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -36,6 +36,7 @@
 #include <xen/cpu.h>
 #include <xen/wait.h>
 #include <xen/guest_access.h>
+#include <xen/xsplice.h>
 #include <public/sysctl.h>
 #include <public/hvm/hvm_vcpu.h>
 #include <asm/regs.h>
@@ -121,6 +122,7 @@ static void idle_loop(void)
         (*pm_idle)();
         do_tasklet();
         do_softirq();
+        do_xsplice(); /* Must be last. */
     }
 }
@@ -137,6 +139,7 @@ void startup_cpu_idle_loop(void)
 
 static void noreturn continue_idle_domain(struct vcpu *v)
 {
+    do_xsplice();
     reset_stack_and_jump(idle_loop);
 }
 
@@ -144,6 +147,7 @@ static void noreturn continue_nonidle_domain(struct vcpu *v)
 {
     check_wakeup_from_wait();
     mark_regs_dirty(guest_cpu_user_regs());
+    do_xsplice();
     reset_stack_and_jump(ret_from_intr);
 }
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index e62dfa1..340f23b 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -26,6 +26,7 @@
 #include <xen/hypercall.h>
 #include <xen/domain_page.h>
 #include <xen/xenoprof.h>
+#include <xen/xsplice.h>
 #include <asm/current.h>
 #include <asm/io.h>
 #include <asm/paging.h>
@@ -1108,6 +1109,7 @@ static void noreturn svm_do_resume(struct vcpu *v)
 
     hvm_do_resume(v);
 
+    do_xsplice();
     reset_stack_and_jump(svm_asm_do_resume);
 }
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 5bc3c74..1008163 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -25,6 +25,7 @@
 #include <xen/kernel.h>
 #include <xen/keyhandler.h>
 #include <xen/vm_event.h>
+#include <xen/xsplice.h>
 #include <asm/current.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -1716,6 +1717,7 @@ void vmx_do_resume(struct vcpu *v)
     }
 
     hvm_do_resume(v);
+    do_xsplice();
     reset_stack_and_jump(vmx_asm_do_vmentry);
 }
diff --git a/xen/arch/x86/xsplice.c b/xen/arch/x86/xsplice.c
index 814dd52..ae35e91 100644
--- a/xen/arch/x86/xsplice.c
+++ b/xen/arch/x86/xsplice.c
@@ -10,6 +10,25 @@
                 __func__,__LINE__, x); return x; }
 #endif
 
+#define PATCH_INSN_SIZE 5
+
+void xsplice_apply_jmp(struct xsplice_patch_func *func)
+{
+    uint32_t val;
+    uint8_t *old_ptr;
+
+    old_ptr = (uint8_t *)func->old_addr;
+    memcpy(func->undo, old_ptr, PATCH_INSN_SIZE);
+    *old_ptr++ = 0xe9; /* Relative jump */
+    val = func->new_addr - func->old_addr - PATCH_INSN_SIZE;
+    memcpy(old_ptr, &val, sizeof val);
+}
+
+void xsplice_revert_jmp(struct xsplice_patch_func *func)
+{
+    memcpy((void *)func->old_addr, func->undo, PATCH_INSN_SIZE);
+}
+
 int xsplice_verify_elf(struct xsplice_elf *elf, uint8_t *data)
 {
diff --git a/xen/common/xsplice.c b/xen/common/xsplice.c
index fbd6129..b854c0a 100644
--- a/xen/common/xsplice.c
+++ b/xen/common/xsplice.c
@@ -3,6 +3,7 @@
  *
  */
 
+#include <xen/cpu.h>
 #include <xen/guest_access.h>
 #include <xen/keyhandler.h>
 #include <xen/lib.h>
@@ -10,16 +11,25 @@
 #include <xen/mm.h>
 #include <xen/sched.h>
 #include <xen/smp.h>
+#include <xen/softirq.h>
 #include <xen/spinlock.h>
+#include <xen/wait.h>
 #include <xen/xsplice_elf.h>
 #include <xen/xsplice.h>
 
 #include <asm/event.h>
+#include <asm/nmi.h>
 #include <public/sysctl.h>
 
+/*
+ * Protects against payload_list operations and also allows only one
+ * caller in schedule_work.
+ */
 static DEFINE_SPINLOCK(payload_lock);
 static LIST_HEAD(payload_list);
 
+static LIST_HEAD(applied_list);
+
 static unsigned int payload_cnt;
 static unsigned int payload_version = 1;
 
@@ -29,6 +39,9 @@ struct payload {
     struct list_head list;               /* Linked to 'payload_list'. */
     void *payload_address;               /* Virtual address mapped. */
     size_t payload_pages;                /* Nr of the pages. */
+    struct list_head applied_list;       /* Linked to 'applied_list'. */
+    struct xsplice_patch_func *funcs;    /* The array of functions to patch. */
+    unsigned int nfuncs;                 /* Nr of functions to patch. */
     char name[XEN_XSPLICE_NAME_SIZE + 1];/* Name of it. */
 };
 
@@ -36,6 +49,24 @@ struct payload {
 static int load_payload_data(struct payload *payload, uint8_t *raw, ssize_t len);
 static void free_payload_data(struct payload *payload);
 
+/* Defines an outstanding patching action. */
+struct xsplice_work
+{
+    atomic_t semaphore;          /* Used for rendezvous. First to grab it will
+                                    do the patching. */
+    atomic_t irq_semaphore;      /* Used to signal all IRQs disabled. */
+    uint32_t timeout;            /* Timeout to do the operation. */
+    struct payload *data;        /* The payload on which to act. */
+    volatile bool_t do_work;     /* Signals work to do. */
+    volatile bool_t ready;       /* Signals all CPUs synchronized. */
+    uint32_t cmd;                /* Action request: XSPLICE_ACTION_* */
+};
+
+/* There can be only one outstanding patching action. */
+static struct xsplice_work xsplice_work;
+
+static int schedule_work(struct payload *data, uint32_t cmd, uint32_t timeout);
+
 static const char *state2str(int32_t state)
 {
 #define STATE(x) [XSPLICE_STATE_##x] = #x
@@ -61,14 +92,23 @@ static const char *state2str(int32_t state)
 static void xsplice_printall(unsigned char key)
 {
     struct payload *data;
+    unsigned int i;
 
     spin_lock(&payload_lock);
 
     list_for_each_entry ( data, &payload_list, list )
-        printk(" name=%s state=%s(%d) %p using %zu pages.\n", data->name,
+    {
+        printk(" name=%s state=%s(%d) %p using %zu pages:\n", data->name,
                state2str(data->state), data->state, data->payload_address,
                data->payload_pages);
 
+        for ( i = 0; i < data->nfuncs; i++ )
+        {
+            struct xsplice_patch_func *f = &(data->funcs[i]);
+
+            printk("    %s patch 0x%"PRIx64"(%u) with 0x%"PRIx64"(%u)\n",
+                   f->name, f->old_addr, f->old_size, f->new_addr, f->new_size);
+        }
+    }
 
     spin_unlock(&payload_lock);
 }
@@ -327,28 +367,22 @@ static int xsplice_action(xen_sysctl_xsplice_action_t *action)
     case XSPLICE_ACTION_REVERT:
         if ( data->state == XSPLICE_STATE_APPLIED )
         {
-            /* No implementation yet. */
-            data->state = XSPLICE_STATE_CHECKED;
-            data->rc = 0;
-            rc = 0;
+            data->rc = -EAGAIN;
+            rc = schedule_work(data, action->cmd, action->timeout);
         }
         break;
     case XSPLICE_ACTION_APPLY:
         if ( (data->state == XSPLICE_STATE_CHECKED) )
         {
-            /* No implementation yet. */
-            data->state = XSPLICE_STATE_APPLIED;
-            data->rc = 0;
-            rc = 0;
+            data->rc = -EAGAIN;
+            rc = schedule_work(data, action->cmd, action->timeout);
         }
         break;
     case XSPLICE_ACTION_REPLACE:
         if ( data->state == XSPLICE_STATE_CHECKED )
         {
-            /* No implementation yet. */
-            data->state = XSPLICE_STATE_CHECKED;
-            data->rc = 0;
-            rc = 0;
+            data->rc = -EAGAIN;
+            rc = schedule_work(data, action->cmd, action->timeout);
         }
         break;
     default:
@@ -576,6 +610,62 @@ static int move_payload(struct payload *payload, struct xsplice_elf *elf)
     return 0;
 }
 
+static int check_special_sections(struct payload *payload,
+                                  struct xsplice_elf *elf)
+{
+    unsigned int i;
+    static const char *const names[] = { ".xsplice.funcs" };
+
+    for ( i = 0; i < ARRAY_SIZE(names); i++ )
+    {
+        struct xsplice_elf_sec *sec;
+
+        sec = xsplice_elf_sec_by_name(elf, names[i]);
+        if ( !sec )
+        {
+            printk(XENLOG_ERR "%s: %s is missing!\n", elf->name, names[i]);
+            return -EINVAL;
+        }
+        if ( !sec->sec->sh_size )
+            return -EINVAL;
+    }
+    return 0;
+}
+
+static int find_special_sections(struct payload *payload,
+                                 struct xsplice_elf *elf)
+{
+    struct xsplice_elf_sec *sec;
+    unsigned int i;
+    struct xsplice_patch_func *f;
+
+    sec = xsplice_elf_sec_by_name(elf, ".xsplice.funcs");
+    if ( sec->sec->sh_size % sizeof *payload->funcs )
+        return -EINVAL;
+
+    payload->funcs = (struct xsplice_patch_func *)sec->load_addr;
+    payload->nfuncs = sec->sec->sh_size / (sizeof *payload->funcs);
+
+    for ( i = 0; i < payload->nfuncs; i++ )
+    {
+        unsigned int j;
+
+        f = &(payload->funcs[i]);
+
+        if ( !f->new_addr || !f->old_addr || !f->old_size || !f->new_size )
+            return -EINVAL;
+
+        for ( j = 0; j < 8; j++ )
+            if ( f->undo[j] )
+                return -EINVAL;
+
+        for ( j = 0; j < 24; j++ )
+            if ( f->pad[j] )
+                return -EINVAL;
+    }
+    return 0;
+}
+
 static int load_payload_data(struct payload *payload, uint8_t *raw, ssize_t len)
 {
     struct xsplice_elf elf;
@@ -605,7 +695,14 @@ static int load_payload_data(struct payload *payload, uint8_t *raw, ssize_t len)
     if ( rc )
         goto err_payload;
 
-    /* Free our temporary data structure. */
+    rc = check_special_sections(payload, &elf);
+    if ( rc )
+        goto err_payload;
+
+    rc = find_special_sections(payload, &elf);
+    if ( rc )
+        goto err_payload;
+
     xsplice_elf_free(&elf);
     return 0;
 
@@ -617,6 +714,253 @@ static int load_payload_data(struct payload *payload, uint8_t *raw, ssize_t len)
     return rc;
 }
 
+
+/*
+ * The following functions get the CPUs into an appropriate state and
+ * apply (or revert) each of the module's functions.
+ */
+
+/*
+ * This function is executed having all other CPUs with no stack (we may
+ * have cpu_idle on it) and IRQs disabled. We guard against NMI by temporarily
+ * installing our NOP NMI handler.
+ */
+static int apply_payload(struct payload *data)
+{
+    unsigned int i;
+
+    printk(XENLOG_DEBUG "%s: Applying %u functions.\n", data->name,
+           data->nfuncs);
+
+    for ( i = 0; i < data->nfuncs; i++ )
+        xsplice_apply_jmp(data->funcs + i);
+
+    list_add_tail(&data->applied_list, &applied_list);
+
+    return 0;
+}
+
+/*
+ * This function is executed having all other CPUs with no stack (we may
+ * have cpu_idle on it) and IRQs disabled.
+ */
+static int revert_payload(struct payload *data)
+{
+    unsigned int i;
+
+    printk(XENLOG_DEBUG "%s: Reverting.\n", data->name);
+
+    for ( i = 0; i < data->nfuncs; i++ )
+        xsplice_revert_jmp(data->funcs + i);
+
+    list_del(&data->applied_list);
+
+    return 0;
+}
+
+/* Must be holding the payload_lock. */
+static int schedule_work(struct payload *data, uint32_t cmd, uint32_t timeout)
+{
+    /* Fail if an operation is already scheduled. */
+    if ( xsplice_work.do_work )
+        return -EBUSY;
+
+    xsplice_work.cmd = cmd;
+    xsplice_work.data = data;
+    xsplice_work.timeout = timeout ? timeout : MILLISECS(30);
+
+    printk(XENLOG_DEBUG "%s: timeout is %"PRI_stime"ms\n", data->name,
+           xsplice_work.timeout / MILLISECS(1));
+
+    atomic_set(&xsplice_work.semaphore, -1);
+    atomic_set(&xsplice_work.irq_semaphore, -1);
+
+    xsplice_work.ready = 0;
+    smp_wmb();
+    xsplice_work.do_work = 1;
+    smp_wmb();
+
+    return 0;
+}
+
+/*
+ * Note that because of this NOP code the do_nmi is not safely patchable.
+ * Also if we do receive 'real' NMIs we have lost them.
+ */
+static int mask_nmi_callback(const struct cpu_user_regs *regs, int cpu)
+{
+    return 1;
+}
+
+static void reschedule_fn(void *unused)
+{
+    smp_mb(); /* Synchronize with setting do_work. */
+    raise_softirq(SCHEDULE_SOFTIRQ);
+}
+
+static int xsplice_do_wait(atomic_t *counter, s_time_t timeout,
+                           unsigned int total_cpus, const char *s)
+{
+    int rc = 0;
+
+    while ( atomic_read(counter) != total_cpus && NOW() < timeout )
+        cpu_relax();
+
+    /* Log & abort. */
+    if ( atomic_read(counter) != total_cpus )
+    {
+        printk(XENLOG_DEBUG "%s: %s %u/%u\n", xsplice_work.data->name,
+               s, atomic_read(counter), total_cpus);
+        rc = -EBUSY;
+        xsplice_work.data->rc = rc;
+        xsplice_work.do_work = 0;
+        smp_wmb();
+        return rc;
+    }
+    return rc;
+}
+
+static void xsplice_do_single(unsigned int total_cpus)
+{
+    nmi_callback_t saved_nmi_callback;
+    struct payload *data, *tmp;
+    s_time_t timeout;
+    int rc = 0;
+
+    data = xsplice_work.data;
+    timeout = xsplice_work.timeout + NOW();
+    if ( xsplice_do_wait(&xsplice_work.semaphore, timeout, total_cpus,
+                         "Timed out on CPU semaphore") )
+        return;
+
+    /* "Mask" NMIs. */
+    saved_nmi_callback = set_nmi_callback(mask_nmi_callback);
+
+    /* All CPUs are waiting, now signal to disable IRQs. */
+    xsplice_work.ready = 1;
+    smp_wmb();
+
+    atomic_inc(&xsplice_work.irq_semaphore);
+    if ( xsplice_do_wait(&xsplice_work.irq_semaphore, timeout, total_cpus,
+                         "Timed out on IRQ semaphore.") )
+        return;
+
+    local_irq_disable();
+    /*
+     * Now this function should be the only one on any stack.
+     * No need to lock the payload list or applied list.
+     */
+    switch ( xsplice_work.cmd )
+    {
+    case XSPLICE_ACTION_APPLY:
+        rc = apply_payload(data);
+        if ( rc == 0 )
+            data->state = XSPLICE_STATE_APPLIED;
+        break;
+    case XSPLICE_ACTION_REVERT:
+        rc = revert_payload(data);
+        if ( rc == 0 )
+            data->state = XSPLICE_STATE_CHECKED;
+        break;
+    case XSPLICE_ACTION_REPLACE:
+        list_for_each_entry_safe_reverse ( data, tmp, &applied_list, list )
+        {
+            data->rc = revert_payload(data);
+            if ( data->rc == 0 )
+                data->state = XSPLICE_STATE_CHECKED;
+            else
+            {
+                rc = -EINVAL;
+                break;
+            }
+        }
+        if ( rc != -EINVAL )
+        {
+            rc = apply_payload(xsplice_work.data);
+            if ( rc == 0 )
+                xsplice_work.data->state = XSPLICE_STATE_APPLIED;
+        }
+        break;
+    default:
+        rc = -EINVAL;
+        break;
+    }
+
+    xsplice_work.data->rc = rc;
+
+    local_irq_enable();
+    set_nmi_callback(saved_nmi_callback);
+
+    xsplice_work.do_work = 0;
+    smp_wmb(); /* Synchronize with waiting CPUs. */
+}
+
+/*
+ * The main function which manages the work of quiescing the system and
+ * patching code.
+ */
+void do_xsplice(void)
+{
+    struct payload *p = xsplice_work.data;
+    unsigned int cpu = smp_processor_id();
+
+    /* Fast path: no work to do. */
+    if ( likely(!xsplice_work.do_work) )
+        return;
+
+    ASSERT(local_irq_is_enabled());
+
+    /* Set at -1, so will go up to num_online_cpus - 1. */
+    if ( atomic_inc_and_test(&xsplice_work.semaphore) )
+    {
+        unsigned int total_cpus;
+
+        if ( !get_cpu_maps() )
+        {
+            printk(XENLOG_DEBUG "%s: CPU%u - unable to get cpu_maps lock.\n",
+                   p->name, cpu);
+            xsplice_work.data->rc = -EBUSY;
+            xsplice_work.do_work = 0;
+            return;
+        }
+
+        barrier(); /* MUST do it after get_cpu_maps. */
+        total_cpus = num_online_cpus() - 1;
+
+        if ( total_cpus )
+        {
+            printk(XENLOG_DEBUG "%s: CPU%u - IPIing the %u CPUs.\n", p->name,
+                   cpu, total_cpus);
+            smp_call_function(reschedule_fn, NULL, 0);
+        }
+
+        (void)xsplice_do_single(total_cpus);
+
+        ASSERT(local_irq_is_enabled());
+
+        put_cpu_maps();
+
+        printk(XENLOG_DEBUG "%s finished with rc=%d\n", p->name, p->rc);
+    }
+    else
+    {
+        /* Wait for all CPUs to rendezvous. */
+        while ( xsplice_work.do_work && !xsplice_work.ready )
+        {
+            cpu_relax();
+            smp_rmb();
+        }
+
+        /* Disable IRQs and signal. */
+        local_irq_disable();
+        atomic_inc(&xsplice_work.irq_semaphore);
+
+        /* Wait for patching to complete. */
+        while ( xsplice_work.do_work )
+        {
+            cpu_relax();
+            smp_rmb();
+        }
+        local_irq_enable();
+    }
+}
+
 static int __init xsplice_init(void)
 {
     register_keyhandler('x', xsplice_printall, "print xsplicing info", 1);
diff --git a/xen/common/xsplice_elf.c b/xen/common/xsplice_elf.c
index 0717263..ad70797 100644
--- a/xen/common/xsplice_elf.c
+++ b/xen/common/xsplice_elf.c
@@ -228,18 +228,18 @@ int xsplice_elf_resolve_symbols(struct xsplice_elf *elf)
                 return_(-ENOENT);
             break;
         case SHN_ABS:
-            printk(XENLOG_DEBUG "%s: Absolute symbol: %s => 0x%p\n",
+            printk(XENLOG_DEBUG "%s: Absolute symbol: %s => 0x%"PRIx64"\n",
                    elf->name, elf->sym[i].name,
-                   (void *)elf->sym[i].sym->st_value);
+                   elf->sym[i].sym->st_value);
             break;
         default:
             if ( elf->sec[elf->sym[i].sym->st_shndx].sec->sh_flags & SHF_ALLOC )
             {
                 elf->sym[i].sym->st_value +=
                     (unsigned long)elf->sec[elf->sym[i].sym->st_shndx].load_addr;
-                printk(XENLOG_DEBUG "%s: Symbol resolved: %s => 0x%p\n",
+                printk(XENLOG_DEBUG "%s: Symbol resolved: %s => 0x%"PRIx64"\n",
                        elf->name, elf->sym[i].name,
-                       (void *)elf->sym[i].sym->st_value);
+                       elf->sym[i].sym->st_value);
             }
         }
     }
diff --git a/xen/include/asm-arm/nmi.h b/xen/include/asm-arm/nmi.h
index a60587e..82aff35 100644
--- a/xen/include/asm-arm/nmi.h
+++ b/xen/include/asm-arm/nmi.h
@@ -4,6 +4,19 @@
 #define register_guest_nmi_callback(a)  (-ENOSYS)
 #define unregister_guest_nmi_callback() (-ENOSYS)
 
+typedef int (*nmi_callback_t)(const struct cpu_user_regs *regs, int cpu);
+
+/**
+ * set_nmi_callback
+ *
+ * Set a handler for an NMI. Only one handler may be
+ * set. Return the old nmi callback handler.
+ */
+static inline nmi_callback_t set_nmi_callback(nmi_callback_t callback)
+{
+    return NULL;
+}
+
 #endif /* ASM_NMI_H */
 /*
  * Local variables:
diff --git a/xen/include/xen/xsplice.h b/xen/include/xen/xsplice.h
index d71c898..d6db1c2 100644
--- a/xen/include/xen/xsplice.h
+++ b/xen/include/xen/xsplice.h
@@ -6,8 +6,26 @@ struct xsplice_elf_sec;
 struct xsplice_elf_sym;
 struct xen_sysctl_xsplice_op;
 
+#include <xen/elfstructs.h>
+
+/*
+ * The structure which defines the patching. This is what the hypervisor
+ * expects in the '.xsplice.funcs' section of the ELF file.
+ *
+ * This MUST be in sync with what the tools generate.
+ */
+struct xsplice_patch_func {
+    const char *name;
+    Elf64_Xword new_addr;
+    Elf64_Xword old_addr;
+    Elf64_Word new_size;
+    Elf64_Word old_size;
+    uint8_t undo[8];
+    uint8_t pad[24];
+};
+
 #ifdef CONFIG_XSPLICE
 int xsplice_control(struct xen_sysctl_xsplice_op *);
+void do_xsplice(void);
 
 /* Arch hooks */
 int xsplice_verify_elf(struct xsplice_elf *elf, uint8_t *data);
@@ -17,11 +35,14 @@ int xsplice_perform_rel(struct xsplice_elf *elf,
 int xsplice_perform_rela(struct xsplice_elf *elf,
                          struct xsplice_elf_sec *base,
                          struct xsplice_elf_sec *rela);
+void xsplice_apply_jmp(struct xsplice_patch_func *func);
+void xsplice_revert_jmp(struct xsplice_patch_func *func);
 #else
 #include <xen/errno.h> /* For -ENOSYS */
 static inline int xsplice_control(struct xen_sysctl_xsplice_op *op) { return -ENOSYS; }
+static inline void do_xsplice(void) { };
 #endif
 
 #endif /* __XEN_XSPLICE_H__ */
--
2.1.0