On Thu, Feb 16, 2017 at 09:48:08AM -0600, Tom Lendacky wrote:
> This patch adds the support to encrypt the kernel in-place. This is
> done by creating new page mappings for the kernel - a decrypted
> write-protected mapping and an encrypted mapping. The kernel is encyrpted

s/encyrpted/encrypted/

> by copying the kernel through a temporary buffer.

"... by copying it... "

> 
> Signed-off-by: Tom Lendacky <thomas.lenda...@amd.com>
> ---

...

> +ENTRY(sme_encrypt_execute)
> +
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> +     /*
> +      * Entry parameters:
> +      *   RDI - virtual address for the encrypted kernel mapping
> +      *   RSI - virtual address for the decrypted kernel mapping
> +      *   RDX - length of kernel
> +      *   RCX - address of the encryption workarea

                                                     , including:

> +      *     - stack page (PAGE_SIZE)
> +      *     - encryption routine page (PAGE_SIZE)
> +      *     - intermediate copy buffer (PMD_PAGE_SIZE)
> +      *    R8 - address of the pagetables to use for encryption
> +      */
> +
> +     /* Set up a one page stack in the non-encrypted memory area */
> +     movq    %rcx, %rax
> +     addq    $PAGE_SIZE, %rax
> +     movq    %rsp, %rbp

%rbp is callee-saved and you're overwriting it here. You need to push it
first.

> +     movq    %rax, %rsp
> +     push    %rbp
> +
> +     push    %r12
> +     push    %r13

In general, just do all pushes on function entry and the pops on exit,
like the compiler does.

> +     movq    %rdi, %r10
> +     movq    %rsi, %r11
> +     movq    %rdx, %r12
> +     movq    %rcx, %r13
> +
> +     /* Copy encryption routine into the workarea */
> +     movq    %rax, %rdi
> +     leaq    .Lencrypt_start(%rip), %rsi
> +     movq    $(.Lencrypt_stop - .Lencrypt_start), %rcx
> +     rep     movsb
> +
> +     /* Setup registers for call */
> +     movq    %r10, %rdi
> +     movq    %r11, %rsi
> +     movq    %r8, %rdx
> +     movq    %r12, %rcx
> +     movq    %rax, %r8
> +     addq    $PAGE_SIZE, %r8
> +
> +     /* Call the encryption routine */
> +     call    *%rax
> +
> +     pop     %r13
> +     pop     %r12
> +
> +     pop     %rsp                    /* Restore original stack pointer */
> +.Lencrypt_exit:

Please put side comments like this here:

ENTRY(sme_encrypt_execute)

#ifdef CONFIG_AMD_MEM_ENCRYPT
        /*
         * Entry parameters:
         *   RDI - virtual address for the encrypted kernel mapping
         *   RSI - virtual address for the decrypted kernel mapping
         *   RDX - length of kernel
         *   RCX - address of the encryption workarea
         *     - stack page (PAGE_SIZE)
         *     - encryption routine page (PAGE_SIZE)
         *     - intermediate copy buffer (PMD_PAGE_SIZE)
         *    R8 - address of the pagetables to use for encryption
         */

        /* Set up a one page stack in the non-encrypted memory area */
        movq    %rcx, %rax                      # %rax = workarea
        addq    $PAGE_SIZE, %rax                # %rax += 4096
        movq    %rsp, %rbp                      # stash stack ptr
        movq    %rax, %rsp                      # set new stack
        push    %rbp                            # needs to happen before the mov %rsp, %rbp

        push    %r12
        push    %r13

        movq    %rdi, %r10                      # encrypted kernel
        movq    %rsi, %r11                      # decrypted kernel
        movq    %rdx, %r12                      # kernel length
        movq    %rcx, %r13                      # workarea
        ...

and so on.

...

> diff --git a/arch/x86/kernel/mem_encrypt_init.c 
> b/arch/x86/kernel/mem_encrypt_init.c
> index 25af15d..07cbb90 100644
> --- a/arch/x86/kernel/mem_encrypt_init.c
> +++ b/arch/x86/kernel/mem_encrypt_init.c
> @@ -16,9 +16,200 @@
>  #ifdef CONFIG_AMD_MEM_ENCRYPT
>  
>  #include <linux/mem_encrypt.h>
> +#include <linux/mm.h>
> +
> +#include <asm/sections.h>
> +
> +extern void sme_encrypt_execute(unsigned long, unsigned long, unsigned long,
> +                             void *, pgd_t *);

This belongs into mem_encrypt.h. And I think it already came up: please
use names for those params.

> +
> +#define PGD_FLAGS    _KERNPG_TABLE_NOENC
> +#define PUD_FLAGS    _KERNPG_TABLE_NOENC
> +#define PMD_FLAGS    __PAGE_KERNEL_LARGE_EXEC
> +
> +static void __init *sme_pgtable_entry(pgd_t *pgd, void *next_page,
> +                                   void *vaddr, pmdval_t pmd_val)
> +{

sme_populate() or so sounds better.

> +     pud_t *pud;
> +     pmd_t *pmd;
> +
> +     pgd += pgd_index((unsigned long)vaddr);
> +     if (pgd_none(*pgd)) {
> +             pud = next_page;
> +             memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
> +             native_set_pgd(pgd,
> +                            native_make_pgd((unsigned long)pud + PGD_FLAGS));

Let it stick out, no need for those "stairs" in the vertical alignment :)

> +             next_page += sizeof(*pud) * PTRS_PER_PUD;
> +     } else {
> +             pud = (pud_t *)(native_pgd_val(*pgd) & ~PTE_FLAGS_MASK);
> +     }
> +
> +     pud += pud_index((unsigned long)vaddr);
> +     if (pud_none(*pud)) {
> +             pmd = next_page;
> +             memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
> +             native_set_pud(pud,
> +                            native_make_pud((unsigned long)pmd + PUD_FLAGS));
> +             next_page += sizeof(*pmd) * PTRS_PER_PMD;
> +     } else {
> +             pmd = (pmd_t *)(native_pud_val(*pud) & ~PTE_FLAGS_MASK);
> +     }
> +
> +     pmd += pmd_index((unsigned long)vaddr);
> +     if (pmd_none(*pmd) || !pmd_large(*pmd))
> +             native_set_pmd(pmd, native_make_pmd(pmd_val));
> +
> +     return next_page;
> +}
> +
> +static unsigned long __init sme_pgtable_calc(unsigned long start,
> +                                          unsigned long end)
> +{
> +     unsigned long addr, total;
> +
> +     total = 0;
> +     addr = start;
> +     while (addr < end) {
> +             unsigned long pgd_end;
> +
> +             pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
> +             if (pgd_end > end)
> +                     pgd_end = end;
> +
> +             total += sizeof(pud_t) * PTRS_PER_PUD * 2;
> +
> +             while (addr < pgd_end) {
> +                     unsigned long pud_end;
> +
> +                     pud_end = (addr & PUD_MASK) + PUD_SIZE;
> +                     if (pud_end > end)
> +                             pud_end = end;
> +
> +                     total += sizeof(pmd_t) * PTRS_PER_PMD * 2;

That "* 2" is?

> +
> +                     addr = pud_end;

So                      addr += PUD_SIZE;

?

> +             }
> +
> +             addr = pgd_end;

So              addr += PGDIR_SIZE;

?

> +     total += sizeof(pgd_t) * PTRS_PER_PGD;
> +
> +     return total;
> +}
>  
>  void __init sme_encrypt_kernel(void)
>  {
> +     pgd_t *pgd;
> +     void *workarea, *next_page, *vaddr;
> +     unsigned long kern_start, kern_end, kern_len;
> +     unsigned long index, paddr, pmd_flags;
> +     unsigned long exec_size, full_size;
> +
> +     /* If SME is not active then no need to prepare */

That comment is obvious.

> +     if (!sme_active())
> +             return;
> +
> +     /* Set the workarea to be after the kernel */
> +     workarea = (void *)ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
> +
> +     /*
> +      * Prepare for encrypting the kernel by building new pagetables with
> +      * the necessary attributes needed to encrypt the kernel in place.
> +      *
> +      *   One range of virtual addresses will map the memory occupied
> +      *   by the kernel as encrypted.
> +      *
> +      *   Another range of virtual addresses will map the memory occupied
> +      *   by the kernel as decrypted and write-protected.
> +      *
> +      *     The use of write-protect attribute will prevent any of the
> +      *     memory from being cached.
> +      */
> +
> +     /* Physical address gives us the identity mapped virtual address */
> +     kern_start = __pa_symbol(_text);
> +     kern_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE) - 1;

So
        kern_end = (unsigned long)workarea - 1;

?

Also, you can make that workarea be unsigned long and cast it to void *
only when needed so that you don't need to cast it in here for the
calculations.

> +     kern_len = kern_end - kern_start + 1;
> +
> +     /*
> +      * Calculate required number of workarea bytes needed:
> +      *   executable encryption area size:
> +      *     stack page (PAGE_SIZE)
> +      *     encryption routine page (PAGE_SIZE)
> +      *     intermediate copy buffer (PMD_PAGE_SIZE)
> +      *   pagetable structures for workarea (in case not currently mapped)
> +      *   pagetable structures for the encryption of the kernel
> +      */
> +     exec_size = (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
> +
> +     full_size = exec_size;
> +     full_size += ALIGN(exec_size, PMD_PAGE_SIZE) / PMD_PAGE_SIZE *
> +                  sizeof(pmd_t) * PTRS_PER_PMD;
> +     full_size += sme_pgtable_calc(kern_start, kern_end + exec_size);
> +
> +     next_page = workarea + exec_size;

So next_page is the next free page after the workarea, correct? Because
of all things, *that* certainly needs a comment. It took me a while to
decipher what's going on here and I'm still not 100% clear.

> +     /* Make sure the current pagetables have entries for the workarea */
> +     pgd = (pgd_t *)native_read_cr3();
> +     paddr = (unsigned long)workarea;
> +     while (paddr < (unsigned long)workarea + full_size) {
> +             vaddr = (void *)paddr;
> +             next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +                                           paddr + PMD_FLAGS);
> +
> +             paddr += PMD_PAGE_SIZE;
> +     }
> +     native_write_cr3(native_read_cr3());

Why not

        native_write_cr3((unsigned long)pgd);

?

Now you can actually acknowledge that the code block in between changed
the hierarchy in pgd and you're reloading it.

> +     /* Calculate a PGD index to be used for the decrypted mapping */
> +     index = (pgd_index(kern_end + full_size) + 1) & (PTRS_PER_PGD - 1);
> +     index <<= PGDIR_SHIFT;

So call it decrypt_mapping_pgd or so. index doesn't say anything. Also,
move it right above where it is being used. This function is very hard
to follow as it is.

> +     /* Set and clear the PGD */

This needs more text: we're building a new temporary pagetable which
will have A, B and C mapped into it and blablabla...

> +     pgd = next_page;
> +     memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
> +     next_page += sizeof(*pgd) * PTRS_PER_PGD;
> +
> +     /* Add encrypted (identity) mappings for the kernel */
> +     pmd_flags = PMD_FLAGS | _PAGE_ENC;
> +     paddr = kern_start;
> +     while (paddr < kern_end) {
> +             vaddr = (void *)paddr;
> +             next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +                                           paddr + pmd_flags);
> +
> +             paddr += PMD_PAGE_SIZE;
> +     }
> +
> +     /* Add decrypted (non-identity) mappings for the kernel */
> +     pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
> +     paddr = kern_start;
> +     while (paddr < kern_end) {
> +             vaddr = (void *)(paddr + index);
> +             next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +                                           paddr + pmd_flags);
> +
> +             paddr += PMD_PAGE_SIZE;
> +     }
> +
> +     /* Add the workarea to both mappings */
> +     paddr = kern_end + 1;

        paddr = (unsigned long)workarea;

Now this makes sense when I read the comment above it.

> +     while (paddr < (kern_end + exec_size)) {

... which actually wants that exec_size to be called workarea_size. Then
it'll make more sense.

And then the thing above:

        next_page = workarea + exec_size;

would look like:

        next_page = workarea + workarea_size;

which would make even more sense. And since you have stuff called _start
and _end, you can do:

        next_page = workarea_start + workarea_size;

and now it would make the most sense. Voila! :-)

> +             vaddr = (void *)paddr;
> +             next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +                                           paddr + PMD_FLAGS);
> +
> +             vaddr = (void *)(paddr + index);
> +             next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +                                           paddr + PMD_FLAGS);
> +
> +             paddr += PMD_PAGE_SIZE;
> +     }
> +
> +     /* Perform the encryption */
> +     sme_encrypt_execute(kern_start, kern_start + index, kern_len,
> +                         workarea, pgd);
> +

Phew, that's one tough patch to review. I'd like to review it again in
your next submission.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

Reply via email to