Re: [PATCH v2 5/6] x86/iommu: switch hwdom IOMMU to use a rangeset

2023-12-05 Thread Roger Pau Monné
On Tue, Dec 05, 2023 at 04:47:15PM +0100, Jan Beulich wrote:
> On 05.12.2023 16:43, Roger Pau Monné wrote:
> > On Tue, Dec 05, 2023 at 04:27:05PM +0100, Jan Beulich wrote:
> >> On 04.12.2023 10:43, Roger Pau Monne wrote:
> >>> @@ -476,58 +406,55 @@ void __hwdom_init arch_iommu_hwdom_init(struct 
> >>> domain *d)
> >>>  if ( !map )
> >>>  panic("IOMMU init: unable to allocate rangeset\n");
> >>>  
> >>> -max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
> >>> -top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);
> >>> +if ( iommu_hwdom_inclusive )
> >>> +{
> >>> +/* Add the whole range below 4GB, UNUSABLE regions will be 
> >>> removed. */
> >>> +rc = rangeset_add_range(map, 0, max_pfn);
> >>> +if ( rc )
> >>> +panic("IOMMU inclusive mappings can't be added: %d\n",
> >>> +  rc);
> >>> +}
> >>>  
> >>> -for ( i = 0, start = 0, count = 0; i < top; )
> >>> +for ( i = 0; i < e820.nr_map; i++ )
> >>>  {
> >>> -unsigned long pfn = pdx_to_pfn(i);
> >>> -unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);
> >>> +struct e820entry entry = e820.map[i];
> >>>  
> >>> -if ( !perms )
> >>> -/* nothing */;
> >>> -else if ( paging_mode_translate(d) )
> >>> +switch ( entry.type )
> >>>  {
> >>> -int rc;
> >>> +case E820_UNUSABLE:
> >>> +if ( !iommu_hwdom_inclusive || PFN_DOWN(entry.addr) > 
> >>> max_pfn )
> >>> +continue;
> >>
> >> The !iommu_hwdom_inclusive part isn't really needed here, is it? The ...
> > 
> > Nor the PFN_DOWN(entry.addr) > max_pfn.
> 
> Hmm, I couldn't convince myself that could also be dropped.

We never map unusable regions, so it's always fine to remove them from
the rangeset.  The condition was just a way to exit early and avoid
the rangeset_remove_range() call.

> >>> -rc = p2m_add_identity_entry(d, pfn,
> >>> -perms & IOMMUF_writable ? 
> >>> p2m_access_rw
> >>> -: 
> >>> p2m_access_r,
> >>> -0);
> >>> +rc = rangeset_remove_range(map, PFN_DOWN(entry.addr),
> >>> +   PFN_DOWN(entry.addr + entry.size 
> >>> - 1));
> >>
> >> ... call here would then simply be a no-op, as it looks. And things would
> >> overall look more safe if the removal was skipped for fewer reasons.
> > 
> > OK, the removal can be done unconditionally if so desired.
> > 
> >>> @@ -605,7 +532,7 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain 
> >>> *d)
> >>>  rangeset_destroy(map);
> >>>  
> >>>  /* Use if to avoid compiler warning */
> >>> -if ( iommu_iotlb_flush_all(d, flush_flags) )
> >>> +if ( iommu_iotlb_flush_all(d, map_data.flush_flags) )
> >>>  return;
> >>>  }
> >>
> >> Ah yes, here is said change. But I think for correctness this wants
> >> moving to the earlier patch.
> > 
> > OK, so something like:
> > 
> > map_data.flush_flags |= flush_flags;
> 
> Or simply drop flush_flags here right away (read: replace by map.flush_flags).

Right, OK, that will lead to some small changes to existing code which
I wanted to avoid in the context of just adding new code to deal with
a rangeset, but anyway, if that's preferred I will adjust.

Thanks, Roger.



Re: [PATCH v2 5/6] x86/iommu: switch hwdom IOMMU to use a rangeset

2023-12-05 Thread Jan Beulich
On 05.12.2023 16:43, Roger Pau Monné wrote:
> On Tue, Dec 05, 2023 at 04:27:05PM +0100, Jan Beulich wrote:
>> On 04.12.2023 10:43, Roger Pau Monne wrote:
>>> @@ -476,58 +406,55 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain 
>>> *d)
>>>  if ( !map )
>>>  panic("IOMMU init: unable to allocate rangeset\n");
>>>  
>>> -max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
>>> -top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);
>>> +if ( iommu_hwdom_inclusive )
>>> +{
>>> +/* Add the whole range below 4GB, UNUSABLE regions will be 
>>> removed. */
>>> +rc = rangeset_add_range(map, 0, max_pfn);
>>> +if ( rc )
>>> +panic("IOMMU inclusive mappings can't be added: %d\n",
>>> +  rc);
>>> +}
>>>  
>>> -for ( i = 0, start = 0, count = 0; i < top; )
>>> +for ( i = 0; i < e820.nr_map; i++ )
>>>  {
>>> -unsigned long pfn = pdx_to_pfn(i);
>>> -unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);
>>> +struct e820entry entry = e820.map[i];
>>>  
>>> -if ( !perms )
>>> -/* nothing */;
>>> -else if ( paging_mode_translate(d) )
>>> +switch ( entry.type )
>>>  {
>>> -int rc;
>>> +case E820_UNUSABLE:
>>> +if ( !iommu_hwdom_inclusive || PFN_DOWN(entry.addr) > max_pfn )
>>> +continue;
>>
>> The !iommu_hwdom_inclusive part isn't really needed here, is it? The ...
> 
> Nor the PFN_DOWN(entry.addr) > max_pfn.

Hmm, I couldn't convince myself that could also be dropped.

>>> -rc = p2m_add_identity_entry(d, pfn,
>>> -perms & IOMMUF_writable ? 
>>> p2m_access_rw
>>> -: 
>>> p2m_access_r,
>>> -0);
>>> +rc = rangeset_remove_range(map, PFN_DOWN(entry.addr),
>>> +   PFN_DOWN(entry.addr + entry.size - 
>>> 1));
>>
>> ... call here would then simply be a no-op, as it looks. And things would
>> overall look more safe if the removal was skipped for fewer reasons.
> 
> OK, the removal can be done unconditionally if so desired.
> 
>>> @@ -605,7 +532,7 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain 
>>> *d)
>>>  rangeset_destroy(map);
>>>  
>>>  /* Use if to avoid compiler warning */
>>> -if ( iommu_iotlb_flush_all(d, flush_flags) )
>>> +if ( iommu_iotlb_flush_all(d, map_data.flush_flags) )
>>>  return;
>>>  }
>>
>> Ah yes, here is said change. But I think for correctness this wants
>> moving to the earlier patch.
> 
> OK, so something like:
> 
> map_data.flush_flags |= flush_flags;

Or simply drop flush_flags here right away (read: replace by map.flush_flags).

Jan

> And adjusting the iommu_iotlb_flush_all() would be fine in this patch
> context.
> 
> Thanks, Roger.




Re: [PATCH v2 5/6] x86/iommu: switch hwdom IOMMU to use a rangeset

2023-12-05 Thread Roger Pau Monné
On Tue, Dec 05, 2023 at 04:27:05PM +0100, Jan Beulich wrote:
> On 04.12.2023 10:43, Roger Pau Monne wrote:
> > @@ -476,58 +406,55 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain 
> > *d)
> >  if ( !map )
> >  panic("IOMMU init: unable to allocate rangeset\n");
> >  
> > -max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
> > -top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);
> > +if ( iommu_hwdom_inclusive )
> > +{
> > +/* Add the whole range below 4GB, UNUSABLE regions will be 
> > removed. */
> > +rc = rangeset_add_range(map, 0, max_pfn);
> > +if ( rc )
> > +panic("IOMMU inclusive mappings can't be added: %d\n",
> > +  rc);
> > +}
> >  
> > -for ( i = 0, start = 0, count = 0; i < top; )
> > +for ( i = 0; i < e820.nr_map; i++ )
> >  {
> > -unsigned long pfn = pdx_to_pfn(i);
> > -unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);
> > +struct e820entry entry = e820.map[i];
> >  
> > -if ( !perms )
> > -/* nothing */;
> > -else if ( paging_mode_translate(d) )
> > +switch ( entry.type )
> >  {
> > -int rc;
> > +case E820_UNUSABLE:
> > +if ( !iommu_hwdom_inclusive || PFN_DOWN(entry.addr) > max_pfn )
> > +continue;
> 
> The !iommu_hwdom_inclusive part isn't really needed here, is it? The ...

Nor the PFN_DOWN(entry.addr) > max_pfn.

> > -rc = p2m_add_identity_entry(d, pfn,
> > -perms & IOMMUF_writable ? 
> > p2m_access_rw
> > -: 
> > p2m_access_r,
> > -0);
> > +rc = rangeset_remove_range(map, PFN_DOWN(entry.addr),
> > +   PFN_DOWN(entry.addr + entry.size - 
> > 1));
> 
> ... call here would then simply be a no-op, as it looks. And things would
> overall look more safe if the removal was skipped for fewer reasons.

OK, the removal can be done unconditionally if so desired.

> > @@ -605,7 +532,7 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain 
> > *d)
> >  rangeset_destroy(map);
> >  
> >  /* Use if to avoid compiler warning */
> > -if ( iommu_iotlb_flush_all(d, flush_flags) )
> > +if ( iommu_iotlb_flush_all(d, map_data.flush_flags) )
> >  return;
> >  }
> 
> Ah yes, here is said change. But I think for correctness this wants
> moving to the earlier patch.

OK, so something like:

map_data.flush_flags |= flush_flags;

And adjusting the iommu_iotlb_flush_all() would be fine in this patch
context.

Thanks, Roger.



Re: [PATCH v2 5/6] x86/iommu: switch hwdom IOMMU to use a rangeset

2023-12-05 Thread Jan Beulich
On 04.12.2023 10:43, Roger Pau Monne wrote:
> @@ -476,58 +406,55 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain 
> *d)
>  if ( !map )
>  panic("IOMMU init: unable to allocate rangeset\n");
>  
> -max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
> -top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);
> +if ( iommu_hwdom_inclusive )
> +{
> +/* Add the whole range below 4GB, UNUSABLE regions will be removed. 
> */
> +rc = rangeset_add_range(map, 0, max_pfn);
> +if ( rc )
> +panic("IOMMU inclusive mappings can't be added: %d\n",
> +  rc);
> +}
>  
> -for ( i = 0, start = 0, count = 0; i < top; )
> +for ( i = 0; i < e820.nr_map; i++ )
>  {
> -unsigned long pfn = pdx_to_pfn(i);
> -unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);
> +struct e820entry entry = e820.map[i];
>  
> -if ( !perms )
> -/* nothing */;
> -else if ( paging_mode_translate(d) )
> +switch ( entry.type )
>  {
> -int rc;
> +case E820_UNUSABLE:
> +if ( !iommu_hwdom_inclusive || PFN_DOWN(entry.addr) > max_pfn )
> +continue;

The !iommu_hwdom_inclusive part isn't really needed here, is it? The ...

> -rc = p2m_add_identity_entry(d, pfn,
> -perms & IOMMUF_writable ? 
> p2m_access_rw
> -: 
> p2m_access_r,
> -0);
> +rc = rangeset_remove_range(map, PFN_DOWN(entry.addr),
> +   PFN_DOWN(entry.addr + entry.size - 
> 1));

... call here would then simply be a no-op, as it looks. And things would
overall look more safe if the removal was skipped for fewer reasons.

> @@ -605,7 +532,7 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
>  rangeset_destroy(map);
>  
>  /* Use if to avoid compiler warning */
> -if ( iommu_iotlb_flush_all(d, flush_flags) )
> +if ( iommu_iotlb_flush_all(d, map_data.flush_flags) )
>  return;
>  }

Ah yes, here is said change. But I think for correctness this wants
moving to the earlier patch.

Jan



[PATCH v2 5/6] x86/iommu: switch hwdom IOMMU to use a rangeset

2023-12-04 Thread Roger Pau Monne
The current loop that iterates from 0 to the maximum RAM address in order to
setup the IOMMU mappings is highly inefficient, and it will get worse as the
amount of RAM increases.  It's also not accounting for any reserved regions
past the last RAM address.

Instead of iterating over memory addresses, iterate over the memory map regions
and use a rangeset in order to keep track of which ranges need to be identity
mapped in the hardware domain physical address space.

On an AMD EPYC 7452 with 512GiB of RAM, the time to execute
arch_iommu_hwdom_init() in nanoseconds is:

x old
+ new
N   Min   MaxMedian   AvgStddev
x   5 2.2364154e+10  2.338244e+10 2.2474685e+10 2.2622409e+10 4.2949869e+08
+   5   1025012   1033036   1026188 1028276.2 3623.1194
Difference at 95.0% confidence
-2.26214e+10 +/- 4.42931e+08
-99.9955% +/- 9.05152e-05%
(Student's t, pooled s = 3.03701e+08)

Execution time of arch_iommu_hwdom_init() goes down from ~22s to ~0.001s.

Note there's a change for HVM domains (ie: PVH dom0) that get switched to
create the p2m mappings using map_mmio_regions() instead of
p2m_add_identity_entry(), so that ranges can be mapped with a single function
call if possible.  Note that the interface of map_mmio_regions() doesn't
allow creating read-only mappings, but so far there are no such mappings
created for PVH dom0 in arch_iommu_hwdom_init().

No change intended in the resulting mappings that a hardware domain gets.

Signed-off-by: Roger Pau Monné 
---
Changes since v1:
 - Split from bigger patch.
 - Remove unneeded default case.
---
 xen/drivers/passthrough/x86/iommu.c | 157 
 1 file changed, 42 insertions(+), 115 deletions(-)

diff --git a/xen/drivers/passthrough/x86/iommu.c 
b/xen/drivers/passthrough/x86/iommu.c
index 7e97805fccec..81d6129189d0 100644
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -300,76 +300,6 @@ void iommu_identity_map_teardown(struct domain *d)
 }
 }
 
-static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d,
- unsigned long pfn,
- unsigned long max_pfn)
-{
-mfn_t mfn = _mfn(pfn);
-unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable;
-
-/*
- * Set up 1:1 mapping for dom0. Default to include only conventional RAM
- * areas and let RMRRs include needed reserved regions. When set, the
- * inclusive mapping additionally maps in every pfn up to 4GB except those
- * that fall in unusable ranges for PV Dom0.
- */
-if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
-return 0;
-
-switch ( type = page_get_ram_type(mfn) )
-{
-case RAM_TYPE_UNUSABLE:
-return 0;
-
-case RAM_TYPE_CONVENTIONAL:
-if ( iommu_hwdom_strict )
-return 0;
-break;
-
-default:
-if ( type & RAM_TYPE_RESERVED )
-{
-if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
-perms = 0;
-}
-else if ( is_hvm_domain(d) )
-return 0;
-else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
-perms = 0;
-}
-
-/* Check that it doesn't overlap with the Interrupt Address Range. */
-if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
-return 0;
-/* ... or the IO-APIC */
-if ( has_vioapic(d) )
-{
-for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
-if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
-return 0;
-}
-else if ( is_pv_domain(d) )
-{
-/*
- * Be consistent with CPU mappings: Dom0 is permitted to establish r/o
- * ones there (also for e.g. HPET in certain cases), so it should also
- * have such established for IOMMUs.
- */
-if ( iomem_access_permitted(d, pfn, pfn) &&
- rangeset_contains_singleton(mmio_ro_ranges, pfn) )
-perms = IOMMUF_readable;
-}
-/*
- * ... or the PCIe MCFG regions.
- * TODO: runtime added MMCFG regions are not checked to make sure they
- * don't overlap with already mapped regions, thus preventing trapping.
- */
-if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
-return 0;
-
-return perms;
-}
-
 static int __hwdom_init cf_check map_subtract(unsigned long s, unsigned long e,
   void *data)
 {
@@ -444,8 +374,8 @@ static int __hwdom_init cf_check identity_map(unsigned long 
s, unsigned long e,
 
 void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
 {
-unsigned long i, top, max_pfn, start, count;
-unsigned int flush_flags = 0, start_perms = 0;
+const unsigned long max_pfn = PFN_DOWN(GB(4)) - 1;
+unsigned int i;
 struct rangeset *map;
 struct map_data map_data = { .d = d };