On Mon, Sep 17, 2007 at 02:42:28PM -0400, Mathieu Desnoyers wrote:
> i386 optimization of the immediate values which uses a movl with code patching
> to set/unset the value used to populate the register used as variable source.
> 
> Changelog:
> - Use text_poke_early with cr0 WP save/restore to patch the bypass. We are 
> doing
>   non atomic writes to a code region only touched by us (nobody can execute it
>   since we are protected by the immediate_mutex).
> - Put immediate_set and _immediate_set in the architecture independent header.
> 
> Signed-off-by: Mathieu Desnoyers <[EMAIL PROTECTED]>
> CC: Andi Kleen <[EMAIL PROTECTED]>
> CC: "H. Peter Anvin" <[EMAIL PROTECTED]>
> CC: Chuck Ebbert <[EMAIL PROTECTED]>
> CC: Christoph Hellwig <[EMAIL PROTECTED]>
> ---
>  arch/i386/kernel/Makefile    |    1 
>  arch/i386/kernel/immediate.c |  307 
> +++++++++++++++++++++++++++++++++++++++++++
>  arch/i386/kernel/traps.c     |    8 -
>  include/asm-i386/immediate.h |   82 +++++++++++
>  4 files changed, 394 insertions(+), 4 deletions(-)
> 
> Index: linux-2.6-lttng/include/asm-i386/immediate.h
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6-lttng/include/asm-i386/immediate.h      2007-09-17 
> 13:38:52.000000000 -0400
> @@ -0,0 +1,82 @@
> +#ifndef _ASM_I386_IMMEDIATE_H
> +#define _ASM_I386_IMMEDIATE_H
> +
> +/*
> + * Immediate values. i386 architecture optimizations.
> + *
> + * (C) Copyright 2006 Mathieu Desnoyers <[EMAIL PROTECTED]>
> + *
> + * This file is released under the GPLv2.
> + * See the file COPYING for more details.
> + */
> +
> +struct __immediate {
> +     long var;               /* Pointer to the identifier variable of the
> +                              * immediate value
> +                              */
> +     long immediate;         /*
> +                              * Pointer to the memory location of the
> +                              * immediate value within the instruction.
> +                              */
> +     long size;              /* Type size. */
> +};
> +
> +/**
> + * immediate_read - read immediate variable
> + * @name: immediate value name
> + *
> + * Reads the value of @var.
s/var/name/ ?
> + * Optimized version of the immediate.
> + * Do not use in __init and __exit functions. Use _immediate_read() instead.
> + * Makes sure the 2 and 4 bytes update will be atomic by aligning the 
> immediate
> + * value. 2 bytes (short) uses a 66H prefix. If size is bigger than 4 bytes,
> + * fall back on a memory read.
> + */
> +#define immediate_read(name)                                         \
> +     ({                                                              \
> +             __typeof__(name##__immediate) value;                    \
> +             switch (sizeof(value)) {                                \
> +             case 1:                                                 \
> +                     asm (   ".section __immediate, \"a\", @progbits;\n\t" \
> +                                     ".long %1, (0f)+1, 1;\n\t"      \
> +                                     ".previous;\n\t"                \
> +                                     "0:\n\t"                        \
> +                                     "mov %2,%0;\n\t"                \
> +                             : "=r" (value)                          \
> +                             : "m" (name##__immediate),              \
> +                               "i" (0));                             \
> +                     break;                                          \
> +             case 2:                                                 \
> +                     asm (   ".section __immediate, \"a\", @progbits;\n\t" \
> +                                     ".long %1, (0f)+2, 2;\n\t"      \
> +                                     ".previous;\n\t"                \
> +                                     "1:\n\t"                        \
> +                                     ".align 2;\n\t"                 \
> +                                     "0:\n\t"                        \
> +                                     "mov %2,%0;\n\t"                \
> +                             : "=r" (value)                          \
> +                             : "m" (name##__immediate),              \
> +                               "i" (0));                             \
> +                     break;                                          \
> +             case 4:                                                 \
> +                     asm (   ".section __immediate, \"a\", @progbits;\n\t" \
> +                                     ".long %1, (0f)+1, 4;\n\t"      \
> +                                     ".previous;\n\t"                \
> +                                     "1:\n\t"                        \
> +                                     ".org (1b)+(3-((1b)%%4)), 0x90;\n\t" \
> +                                     "0:\n\t"                        \
> +                                     "mov %2,%0;\n\t"                \
> +                             : "=r" (value)                          \
> +                             : "m" (name##__immediate),              \
> +                               "i" (0));                             \
> +                     break;                                          \
> +             default:value = name##__immediate;                      \
> +                     break;                                          \
> +             };                                                      \
> +             value;                                                  \
> +     })
> +
> +extern int arch_immediate_update(const struct __immediate *immediate);
> +extern void arch_immediate_update_early(const struct __immediate *immediate);
> +
> +#endif /* _ASM_I386_IMMEDIATE_H */
> Index: linux-2.6-lttng/arch/i386/kernel/Makefile
> ===================================================================
> --- linux-2.6-lttng.orig/arch/i386/kernel/Makefile    2007-09-17 
> 13:37:58.000000000 -0400
> +++ linux-2.6-lttng/arch/i386/kernel/Makefile 2007-09-17 13:38:44.000000000 
> -0400
> @@ -37,6 +37,7 @@ obj-$(CONFIG_MODULES)               += module.o
>  obj-y                                += sysenter.o vsyscall.o
>  obj-$(CONFIG_ACPI_SRAT)      += srat.o
>  obj-$(CONFIG_EFI)            += efi.o efi_stub.o
> +obj-$(CONFIG_IMMEDIATE)              += immediate.o
>  obj-$(CONFIG_DOUBLEFAULT)    += doublefault.o
>  obj-$(CONFIG_KGDB)           += kgdb.o kgdb-jmp.o
>  obj-$(CONFIG_VM86)           += vm86.o
> Index: linux-2.6-lttng/arch/i386/kernel/immediate.c
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6-lttng/arch/i386/kernel/immediate.c      2007-09-17 
> 13:38:44.000000000 -0400
> @@ -0,0 +1,307 @@
> +/*
> + * Immediate Value - i386 architecture specific code.
> + *
> + * Rationale
> + *
> + * Required because of :
> + * - Erratum 49 fix for Intel PIII.
> + * - Still present on newer processors : Intel Core 2 Duo Processor for Intel
> + *   Centrino Duo Processor Technology Specification Update, AH33.
> + *   Unsynchronized Cross-Modifying Code Operations Can Cause Unexpected
> + *   Instruction Execution Results.
> + *
> + * Permits immediate value modification by XMC with correct serialization.
> + *
> + * Reentrant for NMI and trap handler instrumentation. Permits XMC to a
> + * location that has preemption enabled because it involves no temporary or
> + * reused data structure.
> + *
> + * Quoting Richard J Moore, source of the information motivating this
> + * implementation which differs from the one proposed by Intel which is not
> + * suitable for kernel context (does not support NMI and would require 
> disabling
> + * interrupts on every CPU for a long period) :
> + *
> + * "There is another issue to consider when looking into using probes other
> + * then int3:
> + *
> + * Intel erratum 54 - Unsynchronized Cross-modifying code - refers to the
> + * practice of modifying code on one processor where another has prefetched
> + * the unmodified version of the code. Intel states that unpredictable 
> general
> + * protection faults may result if a synchronizing instruction (iret, int,
> + * int3, cpuid, etc ) is not executed on the second processor before it
> + * executes the pre-fetched out-of-date copy of the instruction.
> + *
> + * When we became aware of this I had a long discussion with Intel's
> + * microarchitecture guys. It turns out that the reason for this erratum
> + * (which incidentally Intel does not intend to fix) is because the trace
> + * cache - the stream of micorops resulting from instruction interpretation -
> + * cannot guaranteed to be valid. Reading between the lines I assume this
> + * issue arises because of optimization done in the trace cache, where it is
> + * no longer possible to identify the original instruction boundaries. If the
> + * CPU discoverers that the trace cache has been invalidated because of
> + * unsynchronized cross-modification then instruction execution will be
> + * aborted with a GPF. Further discussion with Intel revealed that replacing
> + * the first opcode byte with an int3 would not be subject to this erratum.
> + *
> + * So, is cmpxchg reliable? One has to guarantee more than mere atomicity."
> + *
> + * Overall design
> + *
> + * The algorithm proposed by Intel applies not so well in kernel context: it
> + * would imply disabling interrupts and looping on every CPUs while modifying
> + * the code and would not support instrumentation of code called from 
> interrupt
> + * sources that cannot be disabled.
> + *
> + * Therefore, we use a different algorithm to respect Intel's erratum (see 
> the
> + * quoted discussion above). We make sure that no CPU sees an out-of-date 
> copy
> + * of a pre-fetched instruction by 1 - using a breakpoint, which skips the
> + * instruction that is going to be modified, 2 - issuing an IPI to every CPU 
> to
> + * execute a sync_core(), to make sure that even when the breakpoint is 
> removed,
> + * no cpu could possibly still have the out-of-date copy of the instruction,
> + * modify the now unused 2nd byte of the instruction, and then put back the
> + * original 1st byte of the instruction.
> + *
> + * It has exactly the same intent as the algorithm proposed by Intel, but
> + * it has less side-effects, scales better and supports NMI, SMI and MCE.
> + *
> + * Mathieu Desnoyers <[EMAIL PROTECTED]>
> + */
> +
> +#include <linux/notifier.h>
> +#include <linux/preempt.h>
> +#include <linux/smp.h>
> +#include <linux/notifier.h>
> +#include <linux/module.h>
> +#include <linux/immediate.h>
> +#include <linux/kdebug.h>
> +#include <linux/rcupdate.h>
> +#include <linux/kprobes.h>
> +
> +#include <asm/cacheflush.h>
> +
> +#define BREAKPOINT_INSTRUCTION  0xcc
> +#define BREAKPOINT_INS_LEN   1
> +#define NR_NOPS                      8
> +
> +static long target_after_int3;               /* EIP of the target after the 
> int3 */
> +static long bypass_eip;                      /* EIP of the bypass. */
> +static long bypass_after_int3;               /* EIP after the end-of-bypass 
> int3 */
> +static long after_immediate;         /*
> +                                      * EIP where to resume after the
> +                                      * single-stepping.
> +                                      */
> +
> +/*
> + * Size of the movl instruction (without the immediate value) in bytes.
> + * The 2 bytes load immediate has a 66H prefix, which makes the opcode 2 
> bytes
> + * wide.
> + */
> +static inline size_t _immediate_get_insn_size(long size)
> +{
> +     switch (size) {
> +             case 1: return 1;
> +             case 2: return 2;
> +             case 4: return 1;
> +             default: BUG();
> +     };
> +}
> +
> +/*
> + * Internal bypass used during value update. The bypass is skipped by the
> + * function in which it is inserted.
> + * No need to be aligned because we exclude readers from the site during
> + * update.
> + * Layout is:
> + * nop nop nop nop nop nop nop nop int3
> + * The nops are the target replaced by the instruction to single-step.
> + */
> +static inline void _immediate_bypass(long *bypassaddr, long *breaknextaddr)
> +{
> +             asm volatile (  "jmp 2f;\n\t"
> +                             "0:\n\t"
> +                             ".space 8, 0x90;\n\t"
> +                             "1:\n\t"
> +                             "int3;\n\t"
> +                             "2:\n\t"
> +                             "movl $(0b),%0;\n\t"
> +                             "movl $((1b)+1),%1;\n\t"
> +                             : "=r" (*bypassaddr),
> +                               "=r" (*breaknextaddr) : );
> +}
> +
> +static void immediate_synchronize_core(void *info)
> +{
> +     sync_core();    /* use cpuid to stop speculative execution */
> +}
> +
> +/*
> + * The eip value points right after the breakpoint instruction, in the second
> + * byte of the movl.
> + * Disable preemption in the bypass to make sure no thread will be preempted 
> in
> + * it. We can then use synchronize_sched() to make sure every bypass users 
> have
> + * ended.
> + */
> +static int immediate_notifier(struct notifier_block *nb,
> +     unsigned long val, void *data)
> +{
> +     enum die_val die_val = (enum die_val) val;
> +     struct die_args *args = data;
> +
> +     if (!args->regs || user_mode_vm(args->regs))
> +             return NOTIFY_DONE;
> +
> +     if (die_val == DIE_INT3) {
> +             if (args->regs->eip == target_after_int3) {
> +                     preempt_disable();
> +                     args->regs->eip = bypass_eip;
> +                     return NOTIFY_STOP;
> +             } else if (args->regs->eip == bypass_after_int3) {
> +                     args->regs->eip = after_immediate;
> +                     preempt_enable();
> +                     return NOTIFY_STOP;
> +             }
> +     }
> +     return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block immediate_notify = {
> +     .notifier_call = immediate_notifier,
> +     .priority = 0x7fffffff, /* we need to be notified first */
> +};
> +
> +
> +/**
> + * arch_immediate_update - update one immediate value
> + * @immediate: pointer of type const struct __immediate to update
> + *
> + * Update one immediate value. Must be called with immediate_mutex held.
> + */
> +__kprobes int arch_immediate_update(const struct __immediate *immediate)
> +{
> +     int ret;
> +     size_t insn_size = _immediate_get_insn_size(immediate->size);
> +     long insn = immediate->immediate - insn_size;
> +     long len;
> +     unsigned long cr0;
> +
> +#ifdef CONFIG_KPROBES
> +     /*
> +      * Fail if a kprobe has been set on this instruction.
> +      * (TODO: we could eventually do better and modify all the (possibly
> +      * nested) kprobes for this site if kprobes had an API for this.
> +      */
> +     if (unlikely(*(unsigned char*)insn == BREAKPOINT_INSTRUCTION)) {
> +             printk(KERN_WARNING "Immediate value in conflict with kprobe. "
> +                                 "Variable at %p, "
> +                                 "instruction at %p, size %lu\n",
> +                                 (void*)immediate->immediate,
> +                                 (void*)immediate->var, immediate->size);
> +             return -EBUSY;
> +     }
> +#endif
> +
> +     /*
> +      * If the variable and the instruction have the same value, there is
> +      * nothing to do.
> +      */
> +     switch (immediate->size) {
> +             case 1: if (*(uint8_t*)immediate->immediate
> +                                     == *(uint8_t*)immediate->var)
> +                             return 0;
> +                     break;
> +             case 2: if (*(uint16_t*)immediate->immediate
> +                                     == *(uint16_t*)immediate->var)
> +                             return 0;
> +                     break;
> +             case 4: if (*(uint32_t*)immediate->immediate
> +                                     == *(uint32_t*)immediate->var)
> +                             return 0;
> +                     break;
> +             default:return -EINVAL;
> +     }
> +
> +     _immediate_bypass(&bypass_eip, &bypass_after_int3);
> +
> +     after_immediate = immediate->immediate + immediate->size;
> +
> +     /*
> +      * Using the _early variants because nobody is executing the
> +      * bypass code while we patch it. It is protected by the
> +      * immediate_mutex. Since we modify the instructions non atomically (for
> +      * nops), we have to use the _early variant.
> +      * We must however deal with the WP flag in cr0 by ourself.
> +      */
> +     kernel_wp_save(cr0);
> +     text_poke_early((void*)bypass_eip, (void*)insn,
> +                     insn_size + immediate->size);
> +     /*
> +      * Fill the rest with nops.
> +      */
> +     len = NR_NOPS - immediate->size - insn_size;
> +     add_nops((void*)(bypass_eip + insn_size + immediate->size), len);
> +     kernel_wp_restore(cr0);
> +
> +     target_after_int3 = insn + BREAKPOINT_INS_LEN;
> +     /* register_die_notifier has memory barriers */
> +     register_die_notifier(&immediate_notify);
> +     /* The breakpoint will single-step the bypass */
> +     text_poke((void*)insn,
> +             INIT_ARRAY(unsigned char, BREAKPOINT_INSTRUCTION, 1), 1);
> +     wmb();
> +     /*
> +      * Execute serializing instruction on each CPU.
> +      * Acts as a memory barrier.
> +      */
> +     ret = on_each_cpu(immediate_synchronize_core, NULL, 1, 1);
> +     BUG_ON(ret != 0);
> +
> +     text_poke((void*)(insn + insn_size), (void*)immediate->var,
> +                     immediate->size);
> +     wmb();
> +     text_poke((void*)insn, (unsigned char*)bypass_eip, 1);
> +             /*
> +              * Wait for all int3 handlers to end
> +              * (interrupts are disabled in int3).
> +              * This CPU is clearly not in a int3 handler,
> +              * because int3 handler is not preemptible and
> +              * there cannot be any more int3 handler called
> +              * for this site, because we placed the original
> +              * instruction back.
> +              * synchronize_sched has memory barriers.
> +              */
> +     synchronize_sched();
> +     unregister_die_notifier(&immediate_notify);
> +     /* unregister_die_notifier has memory barriers */
> +     return 0;
> +}
> +
> +/**
> + * arch_immediate_update_early - update one immediate value at boot time
> + * @immediate: pointer of type const struct __immediate to update
> + *
> + * Update one immediate value at boot time.
> + */
> +void __init arch_immediate_update_early(const struct __immediate *immediate)
> +{
> +     /*
> +      * If the variable and the instruction have the same value, there is
> +      * nothing to do.
> +      */
> +     switch (immediate->size) {
> +             case 1: if (*(uint8_t*)immediate->immediate
> +                                     == *(uint8_t*)immediate->var)
> +                             return;
> +                     break;
> +             case 2: if (*(uint16_t*)immediate->immediate
> +                                     == *(uint16_t*)immediate->var)
> +                             return;
> +                     break;
> +             case 4: if (*(uint32_t*)immediate->immediate
> +                                     == *(uint32_t*)immediate->var)
> +                             return;
> +                     break;
> +             default:return;
> +     }
> +     memcpy((void*)immediate->immediate, (void*)immediate->var,
> +                     immediate->size);
> +}
> Index: linux-2.6-lttng/arch/i386/kernel/traps.c
> ===================================================================
> --- linux-2.6-lttng.orig/arch/i386/kernel/traps.c     2007-09-17 
> 13:37:59.000000000 -0400
> +++ linux-2.6-lttng/arch/i386/kernel/traps.c  2007-09-17 13:38:44.000000000 
> -0400
> @@ -601,7 +601,7 @@ fastcall void do_##name(struct pt_regs *
>  }
>  
>  DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, 
> regs->eip)
> -#ifndef CONFIG_KPROBES
> +#if !defined(CONFIG_KPROBES) && !defined(CONFIG_IMMEDIATE)
>  DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
>  #endif
>  DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
> @@ -843,14 +843,14 @@ void restart_nmi(void)
>       acpi_nmi_enable();
>  }
>  
> -#ifdef CONFIG_KPROBES
> +#if defined(CONFIG_KPROBES) || defined(CONFIG_IMMEDIATE)
>  fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
>  {
>       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
>                       == NOTIFY_STOP)
>               return;
> -     /* This is an interrupt gate, because kprobes wants interrupts
> -     disabled.  Normal trap handlers don't. */
> +     /* This is an interrupt gate, because kprobes and immediate values wants
> +      * interrupts disabled. Normal trap handlers don't. */
>       restore_interrupts(regs);
>       do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
>  }
> 
> -- 
> Mathieu Desnoyers
> Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
> OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Regards/Gruß,
    Boris.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to