Re: [RFC PATCH v2 14/32] x86: mm: Provide support to use memblock when splitting large pages

2017-03-16 Thread Borislav Petkov
On Fri, Mar 10, 2017 at 04:41:56PM -0600, Brijesh Singh wrote:
> I can take a look at fixing those warnings. My initial attempt was to create
> a new function to clear the encryption bit, but it ended up looking very similar
> to __change_page_attr_set_clr(), hence I decided to extend the existing function
> to use memblock_alloc().

... except that having all that SEV-specific code in main code paths is
yucky and I'd like to avoid it, if possible.

> Early in the boot process, the guest kernel allocates some structures (either
> statically allocated, or dynamically allocated via memblock_alloc()) and shares
> the physical addresses of these structures with the hypervisor. Since the entire
> guest memory area is mapped as encrypted, those structures are mapped as
> encrypted as well, so we need a method to clear the encryption bit. Sometimes
> these structures may be part of a 2M page that then needs to be split into
> smaller pages.

So how hard would it be if the hypervisor allocated that memory for the
guest instead? It would allocate it decrypted and the guest would need to
access it decrypted too. All in preparation for SEV-ES, which will need a
block of unencrypted memory for the guest anyway...

> In most cases, guest and hypervisor communication starts as soon as the guest
> provides the physical address to the hypervisor. So we must map the pages as
> decrypted before sharing the physical address with the hypervisor.

See above: so purely theoretically speaking, the hypervisor could prep
that decrypted range for the guest. I'd look in Paolo's direction,
though, for the feasibility of something like that.

Thanks.

-- 
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)


Re: [RFC PATCH v2 14/32] x86: mm: Provide support to use memblock when splitting large pages

2017-03-16 Thread Paolo Bonzini


On 10/03/2017 23:41, Brijesh Singh wrote:
>> Maybe there's a reason this fires:
>>
>> WARNING: modpost: Found 2 section mismatch(es).
>> To see full details build your kernel with:
>> 'make CONFIG_DEBUG_SECTION_MISMATCH=y'
>>
>> WARNING: vmlinux.o(.text+0x48edc): Section mismatch in reference from
>> the function __change_page_attr() to the function
>> .init.text:memblock_alloc()
>> The function __change_page_attr() references
>> the function __init memblock_alloc().
>> This is often because __change_page_attr lacks a __init
>> annotation or the annotation of memblock_alloc is wrong.
>>
>> WARNING: vmlinux.o(.text+0x491d1): Section mismatch in reference from
>> the function __change_page_attr() to the function
>> .meminit.text:memblock_free()
>> The function __change_page_attr() references
>> the function __meminit memblock_free().
>> This is often because __change_page_attr lacks a __meminit
>> annotation or the annotation of memblock_free is wrong.
>> 
>> But maybe Paolo might have an even better idea...
> 
> I am sure he will have a better idea :)

Not sure if it's better or worse, but an alternative idea is to turn
__change_page_attr and __change_page_attr_set_clr inside out, so that:
1) the alloc_pages/__free_page happens in __change_page_attr_set_clr;
2) __change_page_attr_set_clr overall does not become more complex.

Then you can introduce __early_change_page_attr_set_clr and/or
early_kernel_map_pages_in_pgd, for use in your next patches.  They use
the memblock allocator instead of alloc/free_page.
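
The early variant would then be little more than the following; this is
not in the attached patch, just the shape I have in mind, and
__early_cpa_one_page() is an invented name for the refactored worker:

	static int __init __early_change_page_attr_set_clr(struct cpa_data *cpa)
	{
		/* Page-table page for a possible split comes from memblock,
		 * since this can only run before the page allocator is up. */
		phys_addr_t phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

		if (!phys)
			return -ENOMEM;

		return __early_cpa_one_page(cpa, (pte_t *)__va(phys),
					    phys >> PAGE_SHIFT);
	}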

The attached patch is compile-tested only and almost certainly has some
thinko in it.  But it even shaves a few lines off the code, so the idea
might have some merit.

Paolo
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 28d42130243c..953c8e697562 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -490,11 +490,12 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 }
 
 static int
-try_preserve_large_page(pte_t *kpte, unsigned long address,
+try_preserve_large_page(pte_t **p_kpte, unsigned long address,
 			struct cpa_data *cpa)
 {
 	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
-	pte_t new_pte, old_pte, *tmp;
+	pte_t *kpte = *p_kpte;
+	pte_t new_pte, old_pte;
 	pgprot_t old_prot, new_prot, req_prot;
 	int i, do_split = 1;
 	enum pg_level level;
@@ -507,8 +508,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * Check for races, another CPU might have split this page
 	 * up already:
 	 */
-	tmp = _lookup_address_cpa(cpa, address, &level);
-	if (tmp != kpte)
+	*p_kpte = _lookup_address_cpa(cpa, address, &level);
+	if (*p_kpte != kpte)
 		goto out_unlock;
 
 	switch (level) {
@@ -634,17 +635,18 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 	unsigned int i, level;
 	pte_t *tmp;
 	pgprot_t ref_prot;
+	int retry = 1;
 
+	if (!debug_pagealloc_enabled())
+		spin_lock(&cpa_lock);
 	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
 	 */
 	tmp = _lookup_address_cpa(cpa, address, &level);
-	if (tmp != kpte) {
-		spin_unlock(&pgd_lock);
-		return 1;
-	}
+	if (tmp != kpte)
+		goto out;
 
 	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
 
@@ -671,10 +673,11 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 		break;
 
 	default:
-		spin_unlock(&pgd_lock);
-		return 1;
+		goto out;
 	}
 
+	retry = 0;
+
 	/*
 	 * Set the GLOBAL flags only if the PRESENT flag is set
 	 * otherwise pmd/pte_present will return true even on a non
@@ -718,28 +721,34 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 	 * going on.
 	 */
 	__flush_tlb_all();
-	spin_unlock(&pgd_lock);
 
-	return 0;
-}
-
-static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
-			unsigned long address)
-{
-	struct page *base;
+out:
+	spin_unlock(&pgd_lock);
 
+	/*
+	 * Do a global flush tlb after splitting the large page
+	 * and before we do the actual change page attribute in the PTE.
+	 *
+	 * Without this, we violate the TLB application note, that says
+	 * "The TLBs may contain both ordinary and large-page
+	 *  translations for a 4-KByte range of linear addresses. This
+	 *  may occur if software modifies the paging structures so that
+	 *  the page size used for the address range changes. If the two
+	 *  translations differ with respect to page frame or attributes
+	 *  (e.g., permissions), processor behavior is undefined and may
+	 *  be implementation-specific."
+	 *
+	 * We do this global tlb flush inside the cpa_lock, so that we
+	 * don't allow any other cpu, with stale tlb entries change the
+	 * page attribute in parallel, that also falls into the
+	 * just split large page entry.
+	 */
+	if (!retry)
+		flush_tlb_all();
 	if (!debug_pagealloc_enabled())
 		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
-	if (!debug_pagealloc_enabled())
-

Re: [RFC PATCH v2 14/32] x86: mm: Provide support to use memblock when splitting large pages

2017-03-16 Thread Paolo Bonzini


On 02/03/2017 16:15, Brijesh Singh wrote:
> 
>  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
> -struct page *base)
> +   pte_t *pbase, unsigned long new_pfn)
>  {
> - pte_t *pbase = (pte_t *)page_address(base);

Just one comment, and I'll reply to Boris separately: I think you can compute
pbase with pfn_to_kaddr() and avoid adding a new argument.

>*/
> - __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
> + __set_pmd_pte(kpte, address,
> + native_make_pte((new_pfn << PAGE_SHIFT) + _KERNPG_TABLE));

And this is probably better written as:

__set_pmd_pte(kpte, address, pfn_pte(new_pfn, __pgprot(_KERNPG_TABLE)));
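
Together with the pfn_to_kaddr() remark above, the two hunks would boil
down to something like this (untested):

	/* at the top of __split_large_page(), instead of the new pbase argument: */
	pte_t *pbase = (pte_t *)pfn_to_kaddr(new_pfn);

	/* and later, instead of open-coding the pte: */
	__set_pmd_pte(kpte, address, pfn_pte(new_pfn, __pgprot(_KERNPG_TABLE)));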

Paolo


Re: [RFC PATCH v2 14/32] x86: mm: Provide support to use memblock when splitting large pages

2017-03-10 Thread Brijesh Singh

Hi Boris,

On 03/10/2017 05:06 AM, Borislav Petkov wrote:

On Thu, Mar 02, 2017 at 10:15:15AM -0500, Brijesh Singh wrote:

If kernel_maps_pages_in_pgd is called early in the boot process to change the


kernel_map_pages_in_pgd()


memory attributes then it fails to allocate memory when splitting large
pages. The patch extends cpa_data to support using memblock_alloc() when
the slab allocator is not available.

The feature will be used in Secure Encrypted Virtualization (SEV) mode,
where we may need to change memory region attributes early in the boot
process.

Signed-off-by: Brijesh Singh 
---
 arch/x86/mm/pageattr.c |   51 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 46cc89d..9e4ab3b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include <linux/memblock.h>

 #include 
 #include 
@@ -37,6 +38,7 @@ struct cpa_data {
 	int		flags;
 	unsigned long	pfn;
 	unsigned	force_split : 1;
+	unsigned	force_memblock : 1;
 	int		curpage;
 	struct page	**pages;
 };
@@ -627,9 +629,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,

 static int
 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
-  struct page *base)
+ pte_t *pbase, unsigned long new_pfn)
 {
-   pte_t *pbase = (pte_t *)page_address(base);
unsigned long ref_pfn, pfn, pfninc = 1;
unsigned int i, level;
pte_t *tmp;
@@ -646,7 +647,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
return 1;
}

-   paravirt_alloc_pte(&init_mm, page_to_pfn(base));
+   paravirt_alloc_pte(&init_mm, new_pfn);

switch (level) {
case PG_LEVEL_2M:
@@ -707,7 +708,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 * pagetable protections, the actual ptes set above control the
 * primary protection behavior:
 */
-   __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+   __set_pmd_pte(kpte, address,
+   native_make_pte((new_pfn << PAGE_SHIFT) + _KERNPG_TABLE));

/*
 * Intel Atom errata AAH41 workaround.
@@ -723,21 +725,50 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
return 0;
 }

+static pte_t *try_alloc_pte(struct cpa_data *cpa, unsigned long *pfn)
+{
+   unsigned long phys;
+   struct page *base;
+
+   if (cpa->force_memblock) {
+   phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);


Maybe there's a reason this fires:

WARNING: modpost: Found 2 section mismatch(es).
To see full details build your kernel with:
'make CONFIG_DEBUG_SECTION_MISMATCH=y'

WARNING: vmlinux.o(.text+0x48edc): Section mismatch in reference from the 
function __change_page_attr() to the function .init.text:memblock_alloc()
The function __change_page_attr() references
the function __init memblock_alloc().
This is often because __change_page_attr lacks a __init
annotation or the annotation of memblock_alloc is wrong.

WARNING: vmlinux.o(.text+0x491d1): Section mismatch in reference from the 
function __change_page_attr() to the function .meminit.text:memblock_free()
The function __change_page_attr() references
the function __meminit memblock_free().
This is often because __change_page_attr lacks a __meminit
annotation or the annotation of memblock_free is wrong.



I can take a look at fixing those warnings. My initial attempt was to create
a new function to clear the encryption bit, but it ended up looking very similar
to __change_page_attr_set_clr(), hence I decided to extend the existing function
to use memblock_alloc().
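
The usual way out seems to be a __ref annotation on the helper, which
exempts it from modpost's section mismatch checking; that is only safe on
the assumption that the memblock path can never run after init memory is
freed, e.g.:

	/* __ref: the memblock calls are only reachable before
	 * slab_is_available(), so the init-section references are fine. */
	static pte_t * __ref try_alloc_pte(struct cpa_data *cpa, unsigned long *pfn)
	{
		unsigned long phys;
		struct page *base;

		if (cpa->force_memblock) {
			phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
			if (!phys)
				return NULL;
			*pfn = phys >> PAGE_SHIFT;
			return (pte_t *)__va(phys);
		}

		base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
		if (!base)
			return NULL;
		*pfn = page_to_pfn(base);
		return (pte_t *)page_address(base);
	}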



Why do we need this whole early mapping? For the guest? I don't like
that memblock thing at all.


Early in the boot process, the guest kernel allocates some structures (either
statically allocated, or dynamically allocated via memblock_alloc()) and shares
the physical addresses of these structures with the hypervisor. Since the entire
guest memory area is mapped as encrypted, those structures are mapped as
encrypted as well, so we need a method to clear the encryption bit. Sometimes
these structures may be part of a 2M page that then needs to be split into
smaller pages.
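
To make that concrete, the guest-side flow is roughly the following;
everything here except memblock_alloc() is an invented name, just to
illustrate the required ordering:

	/* allocate the structure; like all guest memory it starts out
	 * encrypted (C-bit set in the kernel mapping) */
	phys_addr_t pa = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

	/* split the covering 2M mapping if needed and clear the C-bit;
	 * early_decrypt_range() stands in for whatever helper ends up
	 * wrapping kernel_map_pages_in_pgd() */
	early_decrypt_range((unsigned long)__va(pa), PAGE_SIZE);

	/* only now is it safe to hand the physical address over */
	hv_share_page(pa);	/* invented hypervisor hook */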



So I think the approach with the .data..percpu..hv_shared section is
fine and we should consider SEV-ES

http://support.amd.com/TechDocs/Protecting%20VM%20Register%20State%20with%20SEV-ES.pdf

and do this right from the get-go so that when SEV-ES comes along, we
should simply be ready and extend that mechanism to put the whole Guest
Hypervisor Communication Block in there.




But then the fact that you're mapping those decrypted in init_mm.pgd
makes me think you don't need that early mapping thing at all. Those are
the decrypted mappings of the hypervisor. And that you can do late.

Re: [RFC PATCH v2 14/32] x86: mm: Provide support to use memblock when splitting large pages

2017-03-10 Thread Borislav Petkov
On Thu, Mar 02, 2017 at 10:15:15AM -0500, Brijesh Singh wrote:
> If kernel_maps_pages_in_pgd is called early in the boot process to change the

kernel_map_pages_in_pgd()

> memory attributes then it fails to allocate memory when splitting large
> pages. The patch extends cpa_data to support using memblock_alloc() when
> the slab allocator is not available.
> 
> The feature will be used in Secure Encrypted Virtualization (SEV) mode,
> where we may need to change memory region attributes early in the boot
> process.
> 
> Signed-off-by: Brijesh Singh 
> ---
>  arch/x86/mm/pageattr.c |   51 ++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 42 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> index 46cc89d..9e4ab3b 100644
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include <linux/memblock.h>
>  
>  #include 
>  #include 
> @@ -37,6 +38,7 @@ struct cpa_data {
>  	int		flags;
>  	unsigned long	pfn;
>  	unsigned	force_split : 1;
> +	unsigned	force_memblock : 1;
>  	int		curpage;
>  	struct page	**pages;
>  };
> @@ -627,9 +629,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
>  
>  static int
>  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
> -struct page *base)
> +   pte_t *pbase, unsigned long new_pfn)
>  {
> - pte_t *pbase = (pte_t *)page_address(base);
>   unsigned long ref_pfn, pfn, pfninc = 1;
>   unsigned int i, level;
>   pte_t *tmp;
> @@ -646,7 +647,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>   return 1;
>   }
>  
> - paravirt_alloc_pte(&init_mm, page_to_pfn(base));
> + paravirt_alloc_pte(&init_mm, new_pfn);
>  
>   switch (level) {
>   case PG_LEVEL_2M:
> @@ -707,7 +708,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>* pagetable protections, the actual ptes set above control the
>* primary protection behavior:
>*/
> - __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
> + __set_pmd_pte(kpte, address,
> + native_make_pte((new_pfn << PAGE_SHIFT) + _KERNPG_TABLE));
>  
>   /*
>* Intel Atom errata AAH41 workaround.
> @@ -723,21 +725,50 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>   return 0;
>  }
>  
> +static pte_t *try_alloc_pte(struct cpa_data *cpa, unsigned long *pfn)
> +{
> + unsigned long phys;
> + struct page *base;
> +
> + if (cpa->force_memblock) {
> + phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

Maybe there's a reason this fires:

WARNING: modpost: Found 2 section mismatch(es).
To see full details build your kernel with:
'make CONFIG_DEBUG_SECTION_MISMATCH=y'

WARNING: vmlinux.o(.text+0x48edc): Section mismatch in reference from the 
function __change_page_attr() to the function .init.text:memblock_alloc()
The function __change_page_attr() references
the function __init memblock_alloc().
This is often because __change_page_attr lacks a __init
annotation or the annotation of memblock_alloc is wrong.

WARNING: vmlinux.o(.text+0x491d1): Section mismatch in reference from the 
function __change_page_attr() to the function .meminit.text:memblock_free()
The function __change_page_attr() references
the function __meminit memblock_free().
This is often because __change_page_attr lacks a __meminit
annotation or the annotation of memblock_free is wrong.

Why do we need this whole early mapping? For the guest? I don't like
that memblock thing at all.

So I think the approach with the .data..percpu..hv_shared section is
fine and we should consider SEV-ES

http://support.amd.com/TechDocs/Protecting%20VM%20Register%20State%20with%20SEV-ES.pdf

and do this right from the get-go so that when SEV-ES comes along, we
should simply be ready and extend that mechanism to put the whole Guest
Hypervisor Communication Block in there.

But then the fact that you're mapping those decrypted in init_mm.pgd
makes me think you don't need that early mapping thing at all. Those are
the decrypted mappings of the hypervisor. And that you can do late.

Now, what would be better, IMHO (and I have no idea about virtualization
design so take with a grain of salt) is if the guest would allocate
enough memory for the GHCB and mark it decrypted from the very
beginning. It will be the communication vehicle with the hypervisor
anyway.
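
Something like this, say; completely made up, only to show the shape, with
the section name simply reusing the .data..percpu..hv_shared idea from
above:

	/* A statically reserved, page-aligned GHCB; an early boot helper
	 * would clear the C-bit over this range before any hypervisor
	 * traffic happens. */
	static u8 ghcb_page[PAGE_SIZE] __aligned(PAGE_SIZE)
		__attribute__((__section__(".data..percpu..hv_shared")));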

And we already do similar things in sme_map_bootdata() for the baremetal
kernel, to map boot_data, the initrd, EFI structures and so on decrypted.

And we should extend that mechanism to map the GHCB in the guest too, and
then we can get rid of all that need for ->force_memblock, which makes
the crazy mess in pageattr.c even crazier. And it would be lovely if we
can do it wi

[RFC PATCH v2 14/32] x86: mm: Provide support to use memblock when splitting large pages

2017-03-02 Thread Brijesh Singh
If kernel_maps_pages_in_pgd is called early in the boot process to change the
memory attributes then it fails to allocate memory when splitting large
pages. The patch extends cpa_data to support using memblock_alloc() when
the slab allocator is not available.

The feature will be used in Secure Encrypted Virtualization (SEV) mode,
where we may need to change memory region attributes early in the boot
process.

Signed-off-by: Brijesh Singh 
---
 arch/x86/mm/pageattr.c |   51 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 46cc89d..9e4ab3b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include <linux/memblock.h>
 
 #include 
 #include 
@@ -37,6 +38,7 @@ struct cpa_data {
 	int		flags;
 	unsigned long	pfn;
 	unsigned	force_split : 1;
+	unsigned	force_memblock : 1;
 	int		curpage;
 	struct page	**pages;
 };
@@ -627,9 +629,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 
 static int
 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
-  struct page *base)
+ pte_t *pbase, unsigned long new_pfn)
 {
-   pte_t *pbase = (pte_t *)page_address(base);
unsigned long ref_pfn, pfn, pfninc = 1;
unsigned int i, level;
pte_t *tmp;
@@ -646,7 +647,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
return 1;
}
 
-   paravirt_alloc_pte(&init_mm, page_to_pfn(base));
+   paravirt_alloc_pte(&init_mm, new_pfn);
 
switch (level) {
case PG_LEVEL_2M:
@@ -707,7 +708,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 * pagetable protections, the actual ptes set above control the
 * primary protection behavior:
 */
-   __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+   __set_pmd_pte(kpte, address,
+   native_make_pte((new_pfn << PAGE_SHIFT) + _KERNPG_TABLE));
 
/*
 * Intel Atom errata AAH41 workaround.
@@ -723,21 +725,50 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
return 0;
 }
 
+static pte_t *try_alloc_pte(struct cpa_data *cpa, unsigned long *pfn)
+{
+   unsigned long phys;
+   struct page *base;
+
+   if (cpa->force_memblock) {
+   phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+   if (!phys)
+   return NULL;
+   *pfn = phys >> PAGE_SHIFT;
+   return (pte_t *)__va(phys);
+   }
+
+   base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+   if (!base)
+   return NULL;
+   *pfn = page_to_pfn(base);
+   return (pte_t *)page_address(base);
+}
+
+static void try_free_pte(struct cpa_data *cpa, pte_t *pte)
+{
+   if (cpa->force_memblock)
+   memblock_free(__pa(pte), PAGE_SIZE);
+   else
+   __free_page(virt_to_page(pte));
+}
+
 static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
unsigned long address)
 {
-   struct page *base;
+   pte_t *new_pte;
+   unsigned long new_pfn;
 
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
-   base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+   new_pte = try_alloc_pte(cpa, &new_pfn);
if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
-   if (!base)
+   if (!new_pte)
return -ENOMEM;
 
-   if (__split_large_page(cpa, kpte, address, base))
-   __free_page(base);
+   if (__split_large_page(cpa, kpte, address, new_pte, new_pfn))
+   try_free_pte(cpa, new_pte);
 
return 0;
 }
@@ -2035,6 +2066,7 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags)
 {
int retval = -EINVAL;
+   int use_memblock = !slab_is_available();
 
struct cpa_data cpa = {
.vaddr = &address,
@@ -2044,6 +2076,7 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(0),
.flags = 0,
+   .force_memblock = use_memblock,
};
 
if (!(__supported_pte_mask & _PAGE_NX))