Re: [Xen-devel] [PATCHv4 18/33] x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d

2017-03-06 Thread Boris Ostrovsky

> +static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
> + int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
> + bool last, unsigned long limit)
> +{
> + int i, nr, flush = 0;
> +
> + nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
> + for (i = 0; i < nr; i++) {
> + pud_t *pud;
> +
> + if (p4d_none(p4d[i]))
> + continue;
> +
> + pud = pud_offset(&p4d[i], 0);
> + if (PTRS_PER_PUD > 1)
> + flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
> + xen_pud_walk(mm, pud, func, last && i == nr - 1, limit);
> + }
> + return flush;
> +}

..

> + p4d = p4d_offset(&pgd[i], 0);
> + if (PTRS_PER_P4D > 1)
> + flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
> + xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);


We are losing flush status at all levels so we need something like

flush |= xen_XXX_walk(...)



>   }
>  
> -out:
>   /* Do the top level last, so that the callbacks can use it as
>  a cue to do final things like tlb flushes. */
>   flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
> @@ -1150,57 +1161,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void 
> *pgtbl, bool unpin)
>   xen_free_ro_pages(pa, PAGE_SIZE);
>  }
>  
> +static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
> +{
> + unsigned long pa;
> + pte_t *pte_tbl;
> + int i;
> +
> + if (pmd_large(*pmd)) {
> + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> + xen_free_ro_pages(pa, PMD_SIZE);
> + return;
> + }
> +
> + pte_tbl = pte_offset_kernel(pmd, 0);
> + for (i = 0; i < PTRS_PER_PTE; i++) {
> + if (pte_none(pte_tbl[i]))
> + continue;
> + pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
> + xen_free_ro_pages(pa, PAGE_SIZE);
> + }
> + set_pmd(pmd, __pmd(0));
> + xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
> +}
> +
> +static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
> +{
> + unsigned long pa;
> + pmd_t *pmd_tbl;
> + int i;
> +
> + if (pud_large(*pud)) {
> + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> + xen_free_ro_pages(pa, PUD_SIZE);
> + return;
> + }
> +
> + pmd_tbl = pmd_offset(pud, 0);
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + if (pmd_none(pmd_tbl[i]))
> + continue;
> + xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
> + }
> + set_pud(pud, __pud(0));
> + xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
> +}
> +
> +static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
> +{
> + unsigned long pa;
> + pud_t *pud_tbl;
> + int i;
> +
> + if (p4d_large(*p4d)) {
> + pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
> + xen_free_ro_pages(pa, P4D_SIZE);
> + return;
> + }
> +
> + pud_tbl = pud_offset(p4d, 0);
> + for (i = 0; i < PTRS_PER_PUD; i++) {
> + if (pud_none(pud_tbl[i]))
> + continue;
> + xen_cleanmfnmap_pud(pud_tbl + i, unpin);
> + }
> + set_p4d(p4d, __p4d(0));
> + xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
> +}
> +
>  /*
>   * Since it is well isolated we can (and since it is perhaps large we should)
>   * also free the page tables mapping the initial P->M table.
>   */
>  static void __init xen_cleanmfnmap(unsigned long vaddr)
>  {
> - unsigned long va = vaddr & PMD_MASK;
> - unsigned long pa;
> - pgd_t *pgd = pgd_offset_k(va);
> - pud_t *pud_page = pud_offset(pgd, 0);
> - pud_t *pud;
> - pmd_t *pmd;
> - pte_t *pte;
> + pgd_t *pgd;
> + p4d_t *p4d;
>   unsigned int i;
>   bool unpin;
>  
>   unpin = (vaddr == 2 * PGDIR_SIZE);
> - set_pgd(pgd, __pgd(0));
> - do {
> - pud = pud_page + pud_index(va);
> - if (pud_none(*pud)) {
> - va += PUD_SIZE;
> - } else if (pud_large(*pud)) {
> - pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> - xen_free_ro_pages(pa, PUD_SIZE);
> - va += PUD_SIZE;
> - } else {
> - pmd = pmd_offset(pud, va);
> - if (pmd_large(*pmd)) {
> - pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> - xen_free_ro_pages(pa, PMD_SIZE);
> - } else if (!pmd_none(*pmd)) {
> - pte = pte_offset_kernel(pmd, va);
> - set_pmd(pmd, __pmd(0));
> - for (i = 0; i < PTRS_PER_PTE; ++i) {
> - if (pte_none(pte[i]))
> - break;
> - pa = pte_pfn(pte[i]) << PAGE_SHIFT;
> -  

Re: [Xen-devel] [PATCHv4 18/33] x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d

2017-03-07 Thread Kirill A. Shutemov
On Mon, Mar 06, 2017 at 03:48:24PM -0500, Boris Ostrovsky wrote:
> 
> > +static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
> > +   int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
> > +   bool last, unsigned long limit)
> > +{
> > +   int i, nr, flush = 0;
> > +
> > +   nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
> > +   for (i = 0; i < nr; i++) {
> > +   pud_t *pud;
> > +
> > +   if (p4d_none(p4d[i]))
> > +   continue;
> > +
> > +   pud = pud_offset(&p4d[i], 0);
> > +   if (PTRS_PER_PUD > 1)
> > +   flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
> > +   xen_pud_walk(mm, pud, func, last && i == nr - 1, limit);
> > +   }
> > +   return flush;
> > +}
> 
> ..
> 
> > +   p4d = p4d_offset(&pgd[i], 0);
> > +   if (PTRS_PER_P4D > 1)
> > +   flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
> > +   xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
> 
> 
> We are losing flush status at all levels so we need something like
> 
> flush |= xen_XXX_walk(...)

+ Xiong.

Thanks for noticing this. The fixup is below.

Please test, I don't have a setup for this.

> 
> 
> 
> > }
> >  
> > -out:
> > /* Do the top level last, so that the callbacks can use it as
> >a cue to do final things like tlb flushes. */
> > flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
> > @@ -1150,57 +1161,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void 
> > *pgtbl, bool unpin)
> > xen_free_ro_pages(pa, PAGE_SIZE);
> >  }
> >  
> > +static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
> > +{
> > +   unsigned long pa;
> > +   pte_t *pte_tbl;
> > +   int i;
> > +
> > +   if (pmd_large(*pmd)) {
> > +   pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> > +   xen_free_ro_pages(pa, PMD_SIZE);
> > +   return;
> > +   }
> > +
> > +   pte_tbl = pte_offset_kernel(pmd, 0);
> > +   for (i = 0; i < PTRS_PER_PTE; i++) {
> > +   if (pte_none(pte_tbl[i]))
> > +   continue;
> > +   pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
> > +   xen_free_ro_pages(pa, PAGE_SIZE);
> > +   }
> > +   set_pmd(pmd, __pmd(0));
> > +   xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
> > +}
> > +
> > +static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
> > +{
> > +   unsigned long pa;
> > +   pmd_t *pmd_tbl;
> > +   int i;
> > +
> > +   if (pud_large(*pud)) {
> > +   pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> > +   xen_free_ro_pages(pa, PUD_SIZE);
> > +   return;
> > +   }
> > +
> > +   pmd_tbl = pmd_offset(pud, 0);
> > +   for (i = 0; i < PTRS_PER_PMD; i++) {
> > +   if (pmd_none(pmd_tbl[i]))
> > +   continue;
> > +   xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
> > +   }
> > +   set_pud(pud, __pud(0));
> > +   xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
> > +}
> > +
> > +static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
> > +{
> > +   unsigned long pa;
> > +   pud_t *pud_tbl;
> > +   int i;
> > +
> > +   if (p4d_large(*p4d)) {
> > +   pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
> > +   xen_free_ro_pages(pa, P4D_SIZE);
> > +   return;
> > +   }
> > +
> > +   pud_tbl = pud_offset(p4d, 0);
> > +   for (i = 0; i < PTRS_PER_PUD; i++) {
> > +   if (pud_none(pud_tbl[i]))
> > +   continue;
> > +   xen_cleanmfnmap_pud(pud_tbl + i, unpin);
> > +   }
> > +   set_p4d(p4d, __p4d(0));
> > +   xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
> > +}
> > +
> >  /*
> >   * Since it is well isolated we can (and since it is perhaps large we 
> > should)
> >   * also free the page tables mapping the initial P->M table.
> >   */
> >  static void __init xen_cleanmfnmap(unsigned long vaddr)
> >  {
> > -   unsigned long va = vaddr & PMD_MASK;
> > -   unsigned long pa;
> > -   pgd_t *pgd = pgd_offset_k(va);
> > -   pud_t *pud_page = pud_offset(pgd, 0);
> > -   pud_t *pud;
> > -   pmd_t *pmd;
> > -   pte_t *pte;
> > +   pgd_t *pgd;
> > +   p4d_t *p4d;
> > unsigned int i;
> > bool unpin;
> >  
> > unpin = (vaddr == 2 * PGDIR_SIZE);
> > -   set_pgd(pgd, __pgd(0));
> > -   do {
> > -   pud = pud_page + pud_index(va);
> > -   if (pud_none(*pud)) {
> > -   va += PUD_SIZE;
> > -   } else if (pud_large(*pud)) {
> > -   pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> > -   xen_free_ro_pages(pa, PUD_SIZE);
> > -   va += PUD_SIZE;
> > -   } else {
> > -   pmd = pmd_offset(pud, va);
> > -   if (pmd_large(*pmd)) {
> > -   pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> > -   xen_free_ro_pages(pa, PMD_SIZE);
> > -   } else if (!pmd_none(*pmd)) {
> > -   pte = pte_offset_kernel(pmd, va);
> > -   set_pmd(pmd, __pmd(0));
> > 

Re: [Xen-devel] [PATCHv4 18/33] x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d

2017-03-07 Thread Boris Ostrovsky

>> Don't we need to pass vaddr down to all routines so that they select
>> appropriate tables? You seem to always be choosing the first one.
> IIUC, we clear whole page table subtree covered by one pgd entry.
> So, no, there's no need to pass vaddr down. Just pointer to page table
> entry is enough.
>
> But I know virtually nothing about Xen. Please re-check my reasoning.

Yes, we effectively remove the whole page table for vaddr so I guess
it's OK.

>
> I would also appreciate help with getting x86 Xen code work with 5-level
> paging enabled. For now I make CONFIG_XEN dependent on !CONFIG_X86_5LEVEL.

Hmmm... that's a problem since this requires changes in the hypervisor
and even if/when these changes are made, older versions of the hypervisor
still will not be able to run those guests.

This affects only PV guests and there is a series under review that
provides clean code separation with CONFIG_XEN_PV but because, for
example, dom0 (Xen control domain) is PV this will significantly limit
availability of dom0-capable kernels (because I assume distros will want
to have CONFIG_X86_5LEVEL).


>
> Fixup:

Yes, that works. (But then it worked even without this change because
problems caused by missing the flush would be intermittent. And a joy to
debug).

-boris


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCHv4 18/33] x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d

2017-03-07 Thread Andrew Cooper
On 07/03/17 18:18, Boris Ostrovsky wrote:
>>> Don't we need to pass vaddr down to all routines so that they select
>>> appropriate tables? You seem to always be choosing the first one.
>> IIUC, we clear whole page table subtree covered by one pgd entry.
>> So, no, there's no need to pass vaddr down. Just pointer to page table
>> entry is enough.
>>
>> But I know virtually nothing about Xen. Please re-check my reasoning.
> Yes, we effectively remove the whole page table for vaddr so I guess
> it's OK.
>
>> I would also appreciate help with getting x86 Xen code work with 5-level
>> paging enabled. For now I make CONFIG_XEN dependent on !CONFIG_X86_5LEVEL.
> Hmmm... that's a problem since this requires changes in the hypervisor
> and even if/when these changes are made older version of hypervisor
> still will not be able to run those guests.
>
> This affects only PV guests and there is a series under review that
> provides clean code separation with CONFIG_XEN_PV but because, for
> example, dom0 (Xen control domain) is PV this will significantly limit
> availability of dom0-capable kernels (because I assume distros will want
> to have CONFIG_X86_5LEVEL).

Wasn't the plan to be able to automatically detect 4 vs 5 level support,
and cope either way, so distros didn't have to ship two different builds
of Linux?

If so, all we need to do is get things to compile sensibly, and have the
PV entry code in Linux configure the rest of the kernel appropriately.

(If not, please ignore me.)

~Andrew

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCHv4 18/33] x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d

2017-03-07 Thread Boris Ostrovsky
On 03/07/2017 01:26 PM, Andrew Cooper wrote:
> On 07/03/17 18:18, Boris Ostrovsky wrote:
>>>> Don't we need to pass vaddr down to all routines so that they select
>>>> appropriate tables? You seem to always be choosing the first one.
>>> IIUC, we clear whole page table subtree covered by one pgd entry.
>>> So, no, there's no need to pass vaddr down. Just pointer to page table
>>> entry is enough.
>>>
>>> But I know virtually nothing about Xen. Please re-check my reasoning.
>> Yes, we effectively remove the whole page table for vaddr so I guess
>> it's OK.
>>
>>> I would also appreciate help with getting x86 Xen code work with 5-level
>>> paging enabled. For now I make CONFIG_XEN dependent on !CONFIG_X86_5LEVEL.
>> Hmmm... that's a problem since this requires changes in the hypervisor
>> and even if/when these changes are made older version of hypervisor
>> still will not be able to run those guests.
>>
>> This affects only PV guests and there is a series under review that
>> provides clean code separation with CONFIG_XEN_PV but because, for
>> example, dom0 (Xen control domain) is PV this will significantly limit
>> availability of dom0-capable kernels (because I assume distros will want
>> to have CONFIG_X86_5LEVEL).
> Wasn't the plan to be able to automatically detect 4 vs 5 level support,
> and cope either way, so distros didn't have to ship two different builds
> of Linux?
>
> If so, all we need to do is get things to compile sensibly, and have the
> PV entry code in Linux configure the rest of the kernel appropriately.

I am not aware of any plans but this would obviously be the preferred route.

-boris


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel