On Wed, Oct 17, 2012 at 04:51:17PM -0700, H. Peter Anvin wrote:
> On 10/17/2012 06:49 AM, Konrad Rzeszutek Wilk wrote:
> >In the past it used to point to 'sidt' (native_store_idt) operation
> >which is a non-privileged operation. This resulted in the
> >'struct desc_ptr' value containing the address of Xen's IDT table,
> >instead of the IDT table that Linux thinks its using. The end result
> >is that doing:
> >
> >   store_idt(&desc);
> >   load_idt(&desc);
> >
> >would blow up b/c xen_load_idt would try to parse the IDT contents
> >(desc) and de-reference a virtual address that is outside Linux's
> >__va (it is in Xen's virtual address).
> >
> >With this patch we are providing the last written IDT address.
> >
> 
> OK... this seems like another excellent set of pvops calls that
> should be nukable to Kingdom Come.  There is no reason, ever, to
> read the IDT and GDT from the kernel... the kernel already knows
> what they should be!

The code that uses these is "__save_processor_state" and 
"__restore_processor_state". To test the viability of removing
them, I did a very simple patch (see attached) and found
out that omitting those calls on AMD machines at least (hadn't
tried Intel yet) crashes the machine.

Interestingly enough, skipping the cr3, cr2, and cr0 loads in
arch/x86/kernel/acpi/wakeup_64.S worked fine!?


>From 7a7b1ea335f852f2a5ced1eb7f305fa329f7615a Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Date: Wed, 16 Jan 2013 16:16:46 -0500
Subject: [PATCH] x86: ignore_idt, ignore_gdt, tss, cr3 knobs.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
---
 arch/x86/include/asm/suspend_64.h |  1 +
 arch/x86/kernel/acpi/sleep.c      | 19 +++++++++++++++-
 arch/x86/kernel/acpi/wakeup_64.S  |  4 ++++
 arch/x86/kernel/asm-offsets_64.c  |  1 +
 arch/x86/power/cpu.c              | 47 ++++++++++++++++++++++++++++++---------
 arch/x86/xen/enlighten.c          |  4 +++-
 drivers/xen/acpi.c                |  6 ++++-
 7 files changed, 69 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/suspend_64.h 
b/arch/x86/include/asm/suspend_64.h
index 09b0bf1..15603b1 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -36,6 +36,7 @@ struct saved_context {
        unsigned long tr;
        unsigned long safety;
        unsigned long return_address;
+       u8 skip;
 } __attribute__((packed));
 
 #define loaddebug(thread,register) \
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index d5e0d71..13c71e6 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -24,6 +24,9 @@ unsigned long acpi_realmode_flags;
 #if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
 static char temp_stack[4096];
 #endif
+#include <asm/suspend_64.h>
+bool skip_cr3;
+extern struct saved_context saved_context;
 
 /**
  * acpi_suspend_lowlevel - save kernel state
@@ -82,10 +85,16 @@ int acpi_suspend_lowlevel(void)
        saved_magic = 0x123456789abcdef0L;
 #endif /* CONFIG_64BIT */
 
+       if (skip_cr3) {
+               printk(KERN_INFO "Skipping cr3\n");
+               saved_context.skip = 1;
+       }
        do_suspend_lowlevel();
        return 0;
 }
-
+bool ignore_idt;
+bool ignore_gdt;
+bool alternative_tss;
 static int __init acpi_sleep_setup(char *str)
 {
        while ((str != NULL) && (*str != '\0')) {
@@ -105,6 +114,14 @@ static int __init acpi_sleep_setup(char *str)
                        acpi_nvs_nosave_s3();
                if (strncmp(str, "old_ordering", 12) == 0)
                        acpi_old_suspend_ordering();
+               if (strncmp(str, "ignore_idt", 10) == 0)
+                       ignore_idt = true;
+               if (strncmp(str, "ignore_gdt", 10) == 0)
+                       ignore_idt = true;
+               if (strncmp(str, "tss", 3) == 0)
+                       alternative_tss = true;
+               if (strncmp(str, "cr3", 3) == 0)
+                       skip_cr3 = true;
                str = strchr(str, ',');
                if (str != NULL)
                        str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 8ea5164..c7e41c5 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -83,12 +83,16 @@ resume_point:
        movq    $saved_context, %rax
        movq    saved_context_cr4(%rax), %rbx
        movq    %rbx, %cr4
+
+       cmp     $0x0, saved_context_skip(%rax)
+       jne     skip
        movq    saved_context_cr3(%rax), %rbx
        movq    %rbx, %cr3
        movq    saved_context_cr2(%rax), %rbx
        movq    %rbx, %cr2
        movq    saved_context_cr0(%rax), %rbx
        movq    %rbx, %cr0
+skip:
        pushq   pt_regs_flags(%rax)
        popfq
        movq    pt_regs_sp(%rax), %rsp
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1b4754f..b7b7808 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -73,6 +73,7 @@ int main(void)
        ENTRY(cr3);
        ENTRY(cr4);
        ENTRY(cr8);
+       ENTRY(skip);
        BLANK();
 #undef ENTRY
 
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index c8ece7d..b4b2436 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -35,6 +35,10 @@ unsigned long saved_context_eflags;
 struct saved_context saved_context;
 #endif
 
+extern bool ignore_idt;
+extern bool ignore_gdt;
+extern bool alternative_tss;
+
 /**
  *     __save_processor_state - save CPU registers before creating a
  *             hibernation image and before restoring the memory state from it
@@ -61,12 +65,16 @@ static void __save_processor_state(struct saved_context 
*ctxt)
         * descriptor tables
         */
 #ifdef CONFIG_X86_32
-       store_gdt(&ctxt->gdt);
-       store_idt(&ctxt->idt);
+       if (!ignore_gdt)
+               store_gdt(&ctxt->gdt);
+       if (!ignore_idt)
+               store_idt(&ctxt->idt);
 #else
 /* CONFIG_X86_64 */
-       store_gdt((struct desc_ptr *)&ctxt->gdt_limit);
-       store_idt((struct desc_ptr *)&ctxt->idt_limit);
+       if (!ignore_gdt)
+               store_gdt((struct desc_ptr *)&ctxt->gdt_limit);
+       if (!ignore_idt)
+               store_idt((struct desc_ptr *)&ctxt->idt_limit);
 #endif
        store_tr(ctxt->tr);
 
@@ -136,6 +144,7 @@ static void fix_processor_context(void)
        struct tss_struct *t = &per_cpu(init_tss, cpu);
 #ifdef CONFIG_X86_64
        struct desc_struct *desc = get_cpu_gdt_table(cpu);
+       tss_desc tss;
 #endif
        set_tss_desc(cpu, t);   /*
                                 * This just modifies memory; should not be
@@ -145,13 +154,23 @@ static void fix_processor_context(void)
                                 */
 
 #ifdef CONFIG_X86_64
-       if (!desc_empty(&desc[GDT_ENTRY_TSS]))
-               desc[GDT_ENTRY_TSS].type = DESC_TSS;
+       if (alternative_tss) {
+               memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
+               tss.type = DESC_TSS;
+               write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
+       } else {
+               if (!desc_empty(&desc[GDT_ENTRY_TSS])) {
+                       desc[GDT_ENTRY_TSS].type = DESC_TSS;
+               }
+       }
 
        syscall_init();                         /* This sets MSR_*STAR and 
related */
 #endif
+       outb('G', 0xe9);
        load_TR_desc();                         /* This does ltr */
+       outb('H', 0xe9);
        load_LDT(&current->active_mm->context); /* This does lldt */
+       outb('I', 0xe9);
 }
 
 /**
@@ -185,12 +204,16 @@ static void __restore_processor_state(struct 
saved_context *ctxt)
         * ltr is done i fix_processor_context().
         */
 #ifdef CONFIG_X86_32
-       load_gdt(&ctxt->gdt);
-       load_idt(&ctxt->idt);
+       if (!ignore_gdt)
+               load_gdt(&ctxt->gdt);
+       if (!ignore_idt)
+               load_idt(&ctxt->idt);
 #else
 /* CONFIG_X86_64 */
-       load_gdt((const struct desc_ptr *)&ctxt->gdt_limit);
-       load_idt((const struct desc_ptr *)&ctxt->idt_limit);
+       if (!ignore_gdt)
+               load_gdt((const struct desc_ptr *)&ctxt->gdt_limit);
+       if (!ignore_idt)
+               load_idt((const struct desc_ptr *)&ctxt->idt_limit);
 #endif
 
        /*
@@ -228,9 +251,13 @@ static void __restore_processor_state(struct saved_context 
*ctxt)
 
        fix_processor_context();
 
+       outb('P', 0xe9);
        do_fpu_end();
+       outb('Q', 0xe9);
        x86_platform.restore_sched_clock_state();
+       outb('R', 0xe9);
        mtrr_bp_restore();
+       outb('S', 0xe9);
 }
 
 /* Needed by apm.c */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index dfe1332..8be90ea 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1035,6 +1035,7 @@ static void xen_write_cr4(unsigned long cr4)
        native_write_cr4(cr4);
 }
 #ifdef CONFIG_X86_64
+extern bool skip_cr3;
 static inline unsigned long xen_read_cr8(void)
 {
        return 0;
@@ -1094,7 +1095,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned 
low, unsigned high)
                 * xen_read_cr3 will try to find the cr3 for the user-space
                 * case - and feed it to the hypercall (which would fail).
                 */
-               xen_acpi_reload_cr3_value();
+               if (!skip_cr3)
+                       xen_acpi_reload_cr3_value();
 #endif
        default:
                ret = native_write_msr_safe(msr, low, high);
diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c
index a97414d..2ac4aac 100644
--- a/drivers/xen/acpi.c
+++ b/drivers/xen/acpi.c
@@ -56,6 +56,8 @@ void xen_acpi_reload_cr3_value(void)
        saved_context.cr3 = read_cr3();
 }
 #endif
+extern bool skip_cr3;
+
 int xen_acpi_notify_hypervisor_state(u8 sleep_state,
                                     u32 pm1a_cnt, u32 pm1b_cnt)
 {
@@ -98,7 +100,9 @@ int xen_acpi_notify_hypervisor_state(u8 sleep_state,
                mfn = pfn_to_mfn(PFN_DOWN(saved_context.cr3));
                /* and back to a physical address */
                mfn = PFN_PHYS(mfn);
-               saved_context.cr3 = mfn;
+
+               if (!skip_cr3)
+                       saved_context.cr3 = mfn;
 
                /* Sadly, this has the end result that if we the resume code
                 * does the movq X, %cr3 and then later uses the X value to do
-- 
1.8.0.2

> 
>       -hpa
> 
> 
> -- 
> H. Peter Anvin, Intel Open Source Technology Center
> I work for Intel.  I don't speak on their behalf.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to