On Sat, Dec 22, 2007 at 09:13:44PM +0200, Avi Kivity wrote:
> Unfortunately, this fails badly on Intel i386:

Hmm, ok, there is a definite bug: I forgot that an int1 kernel->kernel
switch on i386 has no special debug stack like it does on x86-64. This
version should have a better chance of working; I hope I got all the
offsets right from memory... At least the offset "32" in the leal and
the eax + fastcall calling convention should be right, or I doubt it
could survive the double dereferencing. The one-more-dereference likely
didn't oops there because you probably have >=1G of RAM, so there was a
25% chance of crashing due to the lack of sched-in and a 75% chance of
crashing in the one-more-dereference in a more meaningful way.

Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>
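
(Not for the changelog.) To make the "32" and the fastcall reasoning
easier to review, here is my reconstruction of the i386 int1 frame as a
sketch. The SAVE_REGS layout (8 general-purpose registers, 8 * 4 = 32
bytes) and the trigger_sketch name are assumptions for illustration,
not code from the patch:

/*
 * Sketch only, not part of the patch.  Assumes SAVE_REGS pushes the
 * 8 general-purpose registers (8 * 4 = 32 bytes) and that the int1
 * happened kernel->kernel on i386, where there is no debug-stack
 * switch: the 3-word trap frame sits directly on the interrupted
 * kernel stack.
 *
 * Stack at the "call preempt_notifier_trigger" point:
 *
 *	%esp +  0..28	registers saved by SAVE_REGS
 *	%esp + 32	saved %eip   <-- "leal 32(%esp),%eax"
 *	%esp + 36	saved %cs
 *	%esp + 40	saved %eflags
 *	%esp + 44	top of the interrupted stack; when the db0
 *			breakpoint fired on schedule()'s first insn,
 *			this word is schedule()'s return address,
 *			i.e. the sched_in address -- hence
 *			"(void *) *(ip+3)" on i386, while x86-64
 *			switches to the debug stack, stores the old
 *			%rsp in the frame, and needs "**(ip+3)".
 */
#include <linux/linkage.h>	/* fastcall: regparm, first arg in %eax */
#include <linux/sched.h>	/* schedule() */

static void fastcall trigger_sketch(void ***ip)
{
	/* *ip is the saved %eip of the interrupted context */
	if ((void *) *ip == schedule)
		/* breakpoint hit schedule(): sched_out path */;
	else
		/* breakpoint hit the armed return address: sched_in path */;
}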

diff --git a/kernel/hack-module.awk b/kernel/hack-module.awk
index 7993aa2..5187c96 100644
--- a/kernel/hack-module.awk
+++ b/kernel/hack-module.awk
@@ -24,32 +24,6 @@
     printf("MODULE_INFO(version, \"%s\");\n", version)
 }
 
-/^static unsigned long vmcs_readl/ {
-    in_vmcs_read = 1
-}
-
-/ASM_VMX_VMREAD_RDX_RAX/ && in_vmcs_read {
-    printf("\tstart_special_insn();\n")
-}
-
-/return/ && in_vmcs_read {
-    printf("\tend_special_insn();\n");
-    in_vmcs_read = 0
-}
-
-/^static void vmcs_writel/ {
-    in_vmcs_write = 1
-}
-
-/ASM_VMX_VMWRITE_RAX_RDX/ && in_vmcs_write {
-    printf("\tstart_special_insn();\n")
-}
-
-/if/ && in_vmcs_write {
-    printf("\tend_special_insn();\n");
-    in_vmcs_write = 0
-}
-
 /^static void vmx_load_host_state/ {
     vmx_load_host_state = 1
 }
@@ -74,15 +48,6 @@
     print "\tspecial_reload_dr7();"
 }
 
-/static void vcpu_put|static int __vcpu_run|static struct kvm_vcpu \*vmx_create_vcpu/ {
-    in_tricky_func = 1
-}
-
-/preempt_disable|get_cpu/ && in_tricky_func {
-    printf("\tin_special_section();\n");
-    in_tricky_func = 0
-}
-
 /unsigned long flags;/ &&  vmx_load_host_state {
     print "\tunsigned long gsbase;"
 }
@@ -90,4 +55,3 @@
 /local_irq_save/ &&  vmx_load_host_state {
     print "\t\tgsbase = vmcs_readl(HOST_GS_BASE);"
 }
-
diff --git a/kernel/preempt.c b/kernel/preempt.c
index 8bb0405..fd6f8dc 100644
--- a/kernel/preempt.c
+++ b/kernel/preempt.c
@@ -6,8 +6,6 @@
 
 static DEFINE_SPINLOCK(pn_lock);
 static LIST_HEAD(pn_list);
-static DEFINE_PER_CPU(int, notifier_enabled);
-static DEFINE_PER_CPU(struct task_struct *, last_tsk);
 
 #define dprintk(fmt) do {                                              \
                if (0)                                                  \
@@ -15,59 +13,95 @@ static DEFINE_PER_CPU(struct task_struct *, last_tsk);
                               current->pid, raw_smp_processor_id());   \
        } while (0)
 
-static void preempt_enable_notifiers(void)
+static void preempt_enable_sched_out_notifiers(void)
 {
-       int cpu = raw_smp_processor_id();
-
-       if (per_cpu(notifier_enabled, cpu))
-               return;
-
-       dprintk("\n");
-       per_cpu(notifier_enabled, cpu) = 1;
        asm volatile ("mov %0, %%db0" : : "r"(schedule));
-       asm volatile ("mov %0, %%db7" : : "r"(0x702ul));
+       asm volatile ("mov %0, %%db7" : : "r"(0x701ul));
+#ifdef CONFIG_X86_64
+       current->thread.debugreg7 = 0ul;
+#else
+       current->thread.debugreg[7] = 0ul;
+#endif
+#ifdef TIF_DEBUG
+       clear_tsk_thread_flag(current, TIF_DEBUG);
+#endif
+}
+
+static void preempt_enable_sched_in_notifiers(void * addr)
+{
+       asm volatile ("mov %0, %%db0" : : "r"(addr));
+       asm volatile ("mov %0, %%db7" : : "r"(0x701ul));
+#ifdef CONFIG_X86_64
+       current->thread.debugreg0 = (unsigned long) addr;
+       current->thread.debugreg7 = 0x701ul;
+#else
+       current->thread.debugreg[0] = (unsigned long) addr;
+       current->thread.debugreg[7] = 0x701ul;
+#endif
+#ifdef TIF_DEBUG
+       set_tsk_thread_flag(current, TIF_DEBUG);
+#endif
 }
 
 void special_reload_dr7(void)
 {
-       asm volatile ("mov %0, %%db7" : : "r"(0x702ul));
+       asm volatile ("mov %0, %%db7" : : "r"(0x701ul));
 }
 EXPORT_SYMBOL_GPL(special_reload_dr7);
 
-static void preempt_disable_notifiers(void)
+static void __preempt_disable_notifiers(void)
 {
-       int cpu = raw_smp_processor_id();
-
-       if (!per_cpu(notifier_enabled, cpu))
-               return;
+       asm volatile ("mov %0, %%db7" : : "r"(0ul));
+}
 
-       dprintk("\n");
-       per_cpu(notifier_enabled, cpu) = 0;
-       asm volatile ("mov %0, %%db7" : : "r"(0x400ul));
+static void preempt_disable_notifiers(void)
+{
+       __preempt_disable_notifiers();
+#ifdef CONFIG_X86_64
+       current->thread.debugreg7 = 0ul;
+#else
+       current->thread.debugreg[7] = 0ul;
+#endif
+#ifdef TIF_DEBUG
+       clear_tsk_thread_flag(current, TIF_DEBUG);
+#endif
 }
 
-static void  __attribute__((used)) preempt_notifier_trigger(void)
+static void fastcall  __attribute__((used)) preempt_notifier_trigger(void *** ip)
 {
        struct preempt_notifier *pn;
        int cpu = raw_smp_processor_id();
        int found = 0;
-       unsigned long flags;
 
        dprintk(" - in\n");
        //dump_stack();
-       spin_lock_irqsave(&pn_lock, flags);
+       spin_lock(&pn_lock);
        list_for_each_entry(pn, &pn_list, link)
                if (pn->tsk == current) {
                        found = 1;
                        break;
                }
-       spin_unlock_irqrestore(&pn_lock, flags);
-       preempt_disable_notifiers();
+       spin_unlock(&pn_lock);
+
        if (found) {
-               dprintk("sched_out\n");
-               pn->ops->sched_out(pn, NULL);
-               per_cpu(last_tsk, cpu) = NULL;
-       }
+               if ((void *) *ip != schedule) {
+                       dprintk("sched_in\n");
+                       preempt_enable_sched_out_notifiers();
+                       pn->ops->sched_in(pn, cpu);
+               } else {
+                       void * sched_in_addr;
+                       dprintk("sched_out\n");
+#ifdef CONFIG_X86_64
+                       sched_in_addr = **(ip+3);
+#else
+                       /* no special debug stack switch on x86 */
+                       sched_in_addr = (void *) *(ip+3);
+#endif
+                       preempt_enable_sched_in_notifiers(sched_in_addr);
+                       pn->ops->sched_out(pn, NULL);
+               }
+       } else
+               __preempt_disable_notifiers();
        dprintk(" - out\n");
 }
 
@@ -104,6 +138,11 @@ asm ("pn_int1_handler:  \n\t"
      "pop "  TMP " \n\t"
      "jz .Lnotme \n\t"
      SAVE_REGS "\n\t"
+#ifdef CONFIG_X86_64
+     "leaq 120(%rsp),%rdi\n\t"
+#else
+     "leal 32(%esp),%eax\n\t"
+#endif
      "call preempt_notifier_trigger \n\t"
      RESTORE_REGS "\n\t"
 #ifdef CONFIG_X86_64
@@ -121,75 +160,28 @@ asm ("pn_int1_handler:  \n\t"
 #endif
        );
 
-void in_special_section(void)
-{
-       struct preempt_notifier *pn;
-       int cpu = raw_smp_processor_id();
-       int found = 0;
-       unsigned long flags;
-
-       if (per_cpu(last_tsk, cpu) == current)
-               return;
-
-       dprintk(" - in\n");
-       spin_lock_irqsave(&pn_lock, flags);
-       list_for_each_entry(pn, &pn_list, link)
-               if (pn->tsk == current) {
-                       found = 1;
-                       break;
-               }
-       spin_unlock_irqrestore(&pn_lock, flags);
-       if (found) {
-               dprintk("\n");
-               per_cpu(last_tsk, cpu) = current;
-               pn->ops->sched_in(pn, cpu);
-               preempt_enable_notifiers();
-       }
-       dprintk(" - out\n");
-}
-EXPORT_SYMBOL_GPL(in_special_section);
-
-void start_special_insn(void)
-{
-       preempt_disable();
-       in_special_section();
-}
-EXPORT_SYMBOL_GPL(start_special_insn);
-
-void end_special_insn(void)
-{
-       preempt_enable();
-}
-EXPORT_SYMBOL_GPL(end_special_insn);
-
 void preempt_notifier_register(struct preempt_notifier *notifier)
 {
-       int cpu = get_cpu();
        unsigned long flags;
 
        dprintk(" - in\n");
        spin_lock_irqsave(&pn_lock, flags);
-       preempt_enable_notifiers();
+       preempt_enable_sched_out_notifiers();
        notifier->tsk = current;
        list_add(&notifier->link, &pn_list);
        spin_unlock_irqrestore(&pn_lock, flags);
-       per_cpu(last_tsk, cpu) = current;
-       put_cpu();
        dprintk(" - out\n");
 }
 
 void preempt_notifier_unregister(struct preempt_notifier *notifier)
 {
-       int cpu = get_cpu();
        unsigned long flags;
 
        dprintk(" - in\n");
        spin_lock_irqsave(&pn_lock, flags);
        list_del(&notifier->link);
        spin_unlock_irqrestore(&pn_lock, flags);
-       per_cpu(last_tsk, cpu) = NULL;
        preempt_disable_notifiers();
-       put_cpu();
        dprintk(" - out\n");
 }
 
@@ -238,7 +230,16 @@ void preempt_notifier_sys_init(void)
 
 static void do_disable(void *blah)
 {
-       preempt_disable_notifiers();
+#ifdef TIF_DEBUG
+       if (!test_tsk_thread_flag(current, TIF_DEBUG))
+#else
+#ifdef CONFIG_X86_64
+       if (!current->thread.debugreg7)
+#else
+       if (!current->thread.debugreg[7])
+#endif
+#endif
+               __preempt_disable_notifiers();
 }
 
 void preempt_notifier_sys_exit(void)
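
For reviewers who want to see the other side of the API: a minimal
sketch of a consumer, inferred from the accesses in preempt.c above
(pn->tsk, pn->link, pn->ops->sched_in/sched_out). The struct layouts
and the my_vcpu example are assumptions for illustration; only
preempt_notifier_register()/unregister() are taken from the patch:

#include <linux/list.h>
#include <linux/sched.h>

/* layout inferred from the field accesses in preempt.c above */
struct preempt_notifier;

struct preempt_ops {
	void (*sched_in)(struct preempt_notifier *notifier, int cpu);
	void (*sched_out)(struct preempt_notifier *notifier,
			  struct task_struct *next);
};

struct preempt_notifier {
	struct list_head link;
	struct task_struct *tsk;
	struct preempt_ops *ops;
};

extern void preempt_notifier_register(struct preempt_notifier *notifier);
extern void preempt_notifier_unregister(struct preempt_notifier *notifier);

/* hypothetical consumer, in the spirit of kvm's vcpu_load/vcpu_put */
struct my_vcpu {
	struct preempt_notifier pn;
};

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
	/* back on a (possibly different) cpu: reload per-cpu state --
	 * this is the event whose absence caused the vmwrite error in
	 * the report below */
}

static void my_sched_out(struct preempt_notifier *pn,
			 struct task_struct *next)
{
	/* about to lose the cpu: stash per-cpu state */
}

static struct preempt_ops my_ops = {
	.sched_in  = my_sched_in,
	.sched_out = my_sched_out,
};

static void my_vcpu_load(struct my_vcpu *vcpu)
{
	vcpu->pn.ops = &my_ops;
	preempt_notifier_register(&vcpu->pn);	/* arms db0 on schedule() */
}

static void my_vcpu_put(struct my_vcpu *vcpu)
{
	preempt_notifier_unregister(&vcpu->pn);	/* disarms the breakpoint */
}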


> 
> > kvm: emulating preempt notifiers; do not benchmark on this machine
> > loaded kvm module (kvm-56-127-g433be51)
> > vmwrite error: reg c08 value d8 (err 3080)
> >  [<f8baf9e2>] vmx_save_host_state+0x4f/0x162 [kvm_intel]
> >  [<c0425803>] __cond_resched+0x25/0x3c
> >  [<f91a22a4>] kvm_arch_vcpu_ioctl_run+0x16f/0x3a7 [kvm]
> >  [<f919f244>] kvm_vcpu_ioctl+0xcb/0x28f [kvm]
> >  [<c0421987>] enqueue_entity+0x2c0/0x2ea
> >  [<c05a8340>] skb_dequeue+0x39/0x3f
> >  [<c0604b6d>] unix_stream_recvmsg+0x3a2/0x4c3
> >  [<c0425c82>] scheduler_tick+0x1a1/0x274
> >  [<c0487329>] core_sys_select+0x21f/0x2fa
> >  [<c043e9e6>] clockevents_program_event+0xb5/0xbc
> >  [<c04c6853>] avc_has_perm+0x4e/0x58
> >  [<c04c7174>] inode_has_perm+0x66/0x6e
> >  [<c0430bed>] recalc_sigpending+0xb/0x1d
> >  [<c043231d>] dequeue_signal+0xa9/0x12a
> >  [<c043cb95>] getnstimeofday+0x30/0xbf
> >  [<c04c7205>] file_has_perm+0x89/0x91
> >  [<f919f179>] kvm_vcpu_ioctl+0x0/0x28f [kvm]
> >  [<c04861b9>] do_ioctl+0x21/0xa0
> >  [<c048646f>] vfs_ioctl+0x237/0x249
> >  [<c04864cd>] sys_ioctl+0x4c/0x67
> >  [<c0404f26>] sysenter_past_esp+0x5f/0x85
> >  =======================
> 
> vmwrite error means the vmcs pointer was not loaded, probably because
> the sched_in event did not fire after a vcpu migration.
> 
> -- 
> Do not meddle in the internals of kernels, for they are subtle and quick to panic.
