kvmclock defines few static variables which are shared with hypervisor
during the kvmclock initialization.

When SEV is active, memory is encrypted with a guest-specific key, and
if guest OS wants to share the memory region with hypervisor then it must
clear the C-bit before sharing it.

The '__decrypted' can be used to define a shared variables; the variables
will be put in the .data.decryption section. This section is mapped with
C=0 early in the boot, we also ensure that the initialized values are
updated to match with C=0 (i.e peform an in-place decryption). The
.data..decrypted section is PMD aligned and sized so that we avoid the
need for spliting the pages when map with C=0.

Signed-off-by: Brijesh Singh <[email protected]>
Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
Cc: [email protected]
Cc: Tom Lendacky <[email protected]>
Cc: [email protected]
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: [email protected]
Cc: Paolo Bonzini <[email protected]>
Cc: Sean Christopherson <[email protected]>
Cc: "Radim Krčmář" <[email protected]>
---
 arch/x86/include/asm/mem_encrypt.h |   4 +
 arch/x86/kernel/head64.c           |  12 ++
 arch/x86/kernel/vmlinux.lds.S      |  18 +++
 arch/x86/mm/mem_encrypt_identity.c | 220 +++++++++++++++++++++++++++----------
 4 files changed, 197 insertions(+), 57 deletions(-)

diff --git a/arch/x86/include/asm/mem_encrypt.h 
b/arch/x86/include/asm/mem_encrypt.h
index c064383..3f7d9d3 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -52,6 +52,8 @@ void __init mem_encrypt_init(void);
 bool sme_active(void);
 bool sev_active(void);
 
+#define __decrypted __attribute__((__section__(".data..decrypted")))
+
 #else  /* !CONFIG_AMD_MEM_ENCRYPT */
 
 #define sme_me_mask    0ULL
@@ -77,6 +79,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long 
size) { return 0;
 static inline int __init
 early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 
0; }
 
+#define __decrypted
+
 #endif /* CONFIG_AMD_MEM_ENCRYPT */
 
 /*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 8047379..6a18297 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -43,6 +43,9 @@ extern pmd_t 
early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
 static unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 
+/* To clear memory encryption mask from the decrypted section */
+extern char __start_data_decrypted[], __end_data_decrypted[];
+
 #ifdef CONFIG_X86_5LEVEL
 unsigned int __pgtable_l5_enabled __ro_after_init;
 unsigned int pgdir_shift __ro_after_init = 39;
@@ -112,6 +115,7 @@ static bool __head check_la57_support(unsigned long 
physaddr)
 unsigned long __head __startup_64(unsigned long physaddr,
                                  struct boot_params *bp)
 {
+       unsigned long vaddr, vaddr_end;
        unsigned long load_delta, *p;
        unsigned long pgtable_flags;
        pgdval_t *pgd;
@@ -234,6 +238,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
        /* Encrypt the kernel and related (if SME is active) */
        sme_encrypt_kernel(bp);
 
+       /* Clear the memory encryption mask from the decrypted section */
+       vaddr = (unsigned long)__start_data_decrypted;
+       vaddr_end = (unsigned long)__end_data_decrypted;
+       for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+               i = pmd_index(vaddr);
+               pmd[i] -= sme_get_me_mask();
+       }
+
        /*
         * Return the SME encryption mask (if SME is active) to be used as a
         * modifier for the initial pgdir entry programmed into CR3.
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8bde0a4..511b875 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -89,6 +89,22 @@ PHDRS {
        note PT_NOTE FLAGS(0);          /* ___ */
 }
 
+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis. But we make this section a pmd
+ * aligned to avoid spliting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define DATA_DECRYPTED_SECTION                                         \
+       . = ALIGN(PMD_SIZE);                                            \
+       __start_data_decrypted = .;                                     \
+       *(.data..decrypted);                                            \
+       __end_data_decrypted = .;                                       \
+       . = ALIGN(PMD_SIZE);                                            \
+
+
 SECTIONS
 {
 #ifdef CONFIG_X86_32
@@ -171,6 +187,8 @@ SECTIONS
                /* rarely changed data like cpu maps */
                READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
 
+               DATA_DECRYPTED_SECTION
+
                /* End of data section */
                _edata = .;
        } :data
diff --git a/arch/x86/mm/mem_encrypt_identity.c 
b/arch/x86/mm/mem_encrypt_identity.c
index 7ae3686..ccf6e2b 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -59,6 +59,8 @@
                                 (_PAGE_PAT | _PAGE_PWT))
 
 #define PTE_FLAGS_ENC          (PTE_FLAGS | _PAGE_ENC)
+#define PTE_FLAGS_ENC_WP       ((PTE_FLAGS_ENC & ~_PAGE_CACHE_MASK) | \
+                                (_PAGE_PAT | _PAGE_PWT))
 
 struct sme_populate_pgd_data {
        void    *pgtable_area;
@@ -72,10 +74,28 @@ struct sme_populate_pgd_data {
        unsigned long vaddr_end;
 };
 
+struct sme_workarea_data {
+       unsigned long kernel_start;
+       unsigned long kernel_end;
+       unsigned long kernel_len;
+
+       unsigned long initrd_start;
+       unsigned long initrd_end;
+       unsigned long initrd_len;
+
+       unsigned long workarea_start;
+       unsigned long workarea_end;
+       unsigned long workarea_len;
+
+       unsigned long decrypted_base;
+};
+
 static char sme_cmdline_arg[] __initdata = "mem_encrypt";
 static char sme_cmdline_on[]  __initdata = "on";
 static char sme_cmdline_off[] __initdata = "off";
 
+extern char __start_data_decrypted[], __end_data_decrypted[];
+
 static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
 {
        unsigned long pgd_start, pgd_end, pgd_size;
@@ -219,6 +239,11 @@ static void __init sme_map_range_encrypted(struct 
sme_populate_pgd_data *ppd)
        __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
 }
 
+static void __init sme_map_range_encrypted_wp(struct sme_populate_pgd_data 
*ppd)
+{
+       __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC_WP);
+}
+
 static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
 {
        __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
@@ -266,19 +291,17 @@ static unsigned long __init sme_pgtable_calc(unsigned 
long len)
        return entries + tables;
 }
 
-void __init sme_encrypt_kernel(struct boot_params *bp)
+static void __init build_workarea_map(struct boot_params *bp,
+                                     struct sme_workarea_data *wa,
+                                     struct sme_populate_pgd_data *ppd)
 {
        unsigned long workarea_start, workarea_end, workarea_len;
        unsigned long execute_start, execute_end, execute_len;
        unsigned long kernel_start, kernel_end, kernel_len;
        unsigned long initrd_start, initrd_end, initrd_len;
-       struct sme_populate_pgd_data ppd;
        unsigned long pgtable_area_len;
        unsigned long decrypted_base;
 
-       if (!sme_active())
-               return;
-
        /*
         * Prepare for encrypting the kernel and initrd by building new
         * pagetables with the necessary attributes needed to encrypt the
@@ -358,17 +381,17 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
         * pagetables and when the new encrypted and decrypted kernel
         * mappings are populated.
         */
-       ppd.pgtable_area = (void *)execute_end;
+       ppd->pgtable_area = (void *)execute_end;
 
        /*
         * Make sure the current pagetable structure has entries for
         * addressing the workarea.
         */
-       ppd.pgd = (pgd_t *)native_read_cr3_pa();
-       ppd.paddr = workarea_start;
-       ppd.vaddr = workarea_start;
-       ppd.vaddr_end = workarea_end;
-       sme_map_range_decrypted(&ppd);
+       ppd->pgd = (pgd_t *)native_read_cr3_pa();
+       ppd->paddr = workarea_start;
+       ppd->vaddr = workarea_start;
+       ppd->vaddr_end = workarea_end;
+       sme_map_range_decrypted(ppd);
 
        /* Flush the TLB - no globals so cr3 is enough */
        native_write_cr3(__native_read_cr3());
@@ -379,9 +402,9 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
         * then be populated with new PUDs and PMDs as the encrypted and
         * decrypted kernel mappings are created.
         */
-       ppd.pgd = ppd.pgtable_area;
-       memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
-       ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
+       ppd->pgd = ppd->pgtable_area;
+       memset(ppd->pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
+       ppd->pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
 
        /*
         * A different PGD index/entry must be used to get different
@@ -399,75 +422,158 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
        decrypted_base <<= PGDIR_SHIFT;
 
        /* Add encrypted kernel (identity) mappings */
-       ppd.paddr = kernel_start;
-       ppd.vaddr = kernel_start;
-       ppd.vaddr_end = kernel_end;
-       sme_map_range_encrypted(&ppd);
+       ppd->paddr = kernel_start;
+       ppd->vaddr = kernel_start;
+       ppd->vaddr_end = kernel_end;
+       sme_map_range_encrypted(ppd);
 
        /* Add decrypted, write-protected kernel (non-identity) mappings */
-       ppd.paddr = kernel_start;
-       ppd.vaddr = kernel_start + decrypted_base;
-       ppd.vaddr_end = kernel_end + decrypted_base;
-       sme_map_range_decrypted_wp(&ppd);
+       ppd->paddr = kernel_start;
+       ppd->vaddr = kernel_start + decrypted_base;
+       ppd->vaddr_end = kernel_end + decrypted_base;
+       sme_map_range_decrypted_wp(ppd);
 
        if (initrd_len) {
                /* Add encrypted initrd (identity) mappings */
-               ppd.paddr = initrd_start;
-               ppd.vaddr = initrd_start;
-               ppd.vaddr_end = initrd_end;
-               sme_map_range_encrypted(&ppd);
+               ppd->paddr = initrd_start;
+               ppd->vaddr = initrd_start;
+               ppd->vaddr_end = initrd_end;
+               sme_map_range_encrypted(ppd);
                /*
                 * Add decrypted, write-protected initrd (non-identity) mappings
                 */
-               ppd.paddr = initrd_start;
-               ppd.vaddr = initrd_start + decrypted_base;
-               ppd.vaddr_end = initrd_end + decrypted_base;
-               sme_map_range_decrypted_wp(&ppd);
+               ppd->paddr = initrd_start;
+               ppd->vaddr = initrd_start + decrypted_base;
+               ppd->vaddr_end = initrd_end + decrypted_base;
+               sme_map_range_decrypted_wp(ppd);
        }
 
-       /* Add decrypted workarea mappings to both kernel mappings */
-       ppd.paddr = workarea_start;
-       ppd.vaddr = workarea_start;
-       ppd.vaddr_end = workarea_end;
-       sme_map_range_decrypted(&ppd);
+       /*
+        * When SEV is active, kernel is already encrypted hence mapping
+        * the initial workarea_start as encrypted. When SME is active,
+        * the kernel is not encrypted hence add a decrypted workarea
+        * mappings to both kernel mappings
+        */
+       ppd->paddr = workarea_start;
+       ppd->vaddr = workarea_start;
+       ppd->vaddr_end = workarea_end;
+       if (sev_active())
+               sme_map_range_encrypted(ppd);
+       else
+               sme_map_range_decrypted(ppd);
+
+       ppd->paddr = workarea_start;
+       ppd->vaddr = workarea_start + decrypted_base;
+       ppd->vaddr_end = workarea_end + decrypted_base;
+       sme_map_range_decrypted(ppd);
 
-       ppd.paddr = workarea_start;
-       ppd.vaddr = workarea_start + decrypted_base;
-       ppd.vaddr_end = workarea_end + decrypted_base;
-       sme_map_range_decrypted(&ppd);
+       wa->kernel_start = kernel_start;
+       wa->kernel_end = kernel_end;
+       wa->kernel_len = kernel_len;
 
-       /* Perform the encryption */
-       sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
-                           kernel_len, workarea_start, (unsigned long)ppd.pgd);
+       wa->initrd_start = initrd_start;
+       wa->initrd_end = initrd_end;
+       wa->initrd_len = initrd_len;
 
-       if (initrd_len)
-               sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
-                                   initrd_len, workarea_start,
-                                   (unsigned long)ppd.pgd);
+       wa->workarea_start = workarea_start;
+       wa->workarea_end = workarea_end;
+       wa->workarea_len = workarea_len;
 
+       wa->decrypted_base = decrypted_base;
+}
+
+static void __init remove_workarea_map(struct sme_workarea_data *wa,
+                                      struct sme_populate_pgd_data *ppd)
+{
        /*
         * At this point we are running encrypted.  Remove the mappings for
         * the decrypted areas - all that is needed for this is to remove
         * the PGD entry/entries.
         */
-       ppd.vaddr = kernel_start + decrypted_base;
-       ppd.vaddr_end = kernel_end + decrypted_base;
-       sme_clear_pgd(&ppd);
-
-       if (initrd_len) {
-               ppd.vaddr = initrd_start + decrypted_base;
-               ppd.vaddr_end = initrd_end + decrypted_base;
-               sme_clear_pgd(&ppd);
+       ppd->vaddr = wa->kernel_start + wa->decrypted_base;
+       ppd->vaddr_end = wa->kernel_end + wa->decrypted_base;
+       sme_clear_pgd(ppd);
+
+       if (wa->initrd_len) {
+               ppd->vaddr = wa->initrd_start + wa->decrypted_base;
+               ppd->vaddr_end = wa->initrd_end + wa->decrypted_base;
+               sme_clear_pgd(ppd);
        }
 
-       ppd.vaddr = workarea_start + decrypted_base;
-       ppd.vaddr_end = workarea_end + decrypted_base;
-       sme_clear_pgd(&ppd);
+       ppd->vaddr = wa->workarea_start + wa->decrypted_base;
+       ppd->vaddr_end = wa->workarea_end + wa->decrypted_base;
+       sme_clear_pgd(ppd);
 
        /* Flush the TLB - no globals so cr3 is enough */
        native_write_cr3(__native_read_cr3());
 }
 
+static void __init decrypt_data_decrypted_section(struct sme_workarea_data *wa,
+                                                 struct sme_populate_pgd_data 
*ppd)
+{
+       unsigned long decrypted_start, decrypted_end, decrypted_len;
+
+       /* Physical addresses of decrypted data section */
+       decrypted_start = __pa_symbol(__start_data_decrypted);
+       decrypted_end = __pa_symbol(__end_data_decrypted);
+       decrypted_len = decrypted_end - decrypted_start;
+
+       if (!decrypted_len)
+               return;
+
+       /* Add decrypted mapping for the section (identity) */
+       ppd->paddr = decrypted_start;
+       ppd->vaddr = decrypted_start;
+       ppd->vaddr_end = decrypted_end;
+       sme_map_range_decrypted(ppd);
+
+       /* Add encrypted-wp mapping for the section (non-identity) */
+       ppd->paddr = decrypted_start;
+       ppd->vaddr = decrypted_start + wa->decrypted_base;
+       ppd->vaddr_end = decrypted_end + wa->decrypted_base;
+       sme_map_range_encrypted_wp(ppd);
+
+       /* Perform in-place decryption */
+       sme_encrypt_execute(decrypted_start + wa->decrypted_base,
+                           decrypted_start,
+                           decrypted_len, wa->workarea_start,
+                           (unsigned long)ppd->pgd);
+
+       ppd->vaddr = decrypted_start + wa->decrypted_base;
+       ppd->vaddr_end = decrypted_end + wa->decrypted_base;
+       sme_clear_pgd(ppd);
+}
+
+void __init sme_encrypt_kernel(struct boot_params *bp)
+{
+       struct sme_populate_pgd_data ppd;
+       struct sme_workarea_data wa;
+
+       if (!mem_encrypt_active())
+               return;
+
+       build_workarea_map(bp, &wa, &ppd);
+
+       /* When SEV is active, encrypt kernel and initrd */
+       if (sme_active()) {
+               sme_encrypt_execute(wa.kernel_start,
+                                   wa.kernel_start + wa.decrypted_base,
+                                   wa.kernel_len, wa.workarea_start,
+                                   (unsigned long)ppd.pgd);
+
+               if (wa.initrd_len)
+                       sme_encrypt_execute(wa.initrd_start,
+                                           wa.initrd_start + wa.decrypted_base,
+                                           wa.initrd_len, wa.workarea_start,
+                                           (unsigned long)ppd.pgd);
+       }
+
+       /* Decrypt the contents of .data..decrypted section */
+       decrypt_data_decrypted_section(&wa, &ppd);
+
+       remove_workarea_map(&wa, &ppd);
+}
+
 void __init sme_enable(struct boot_params *bp)
 {
        const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
-- 
2.7.4

Reply via email to