Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-22 Thread Auger Eric
Hi Alexey,

On 22/03/18 08:45, Alexey Kardashevskiy wrote:
> On 22/3/18 2:29 am, Alex Williamson wrote:
>> On Mon, 19 Mar 2018 21:49:58 +0100
>> Auger Eric  wrote:
>>
>>> Hi Alexey,
>>>
>>> On 09/02/18 08:55, Alexey Kardashevskiy wrote:
 At the moment if vfio_memory_listener is registered in the system memory
 address space, it maps/unmaps every RAM memory region for DMA.
 It expects system page size aligned memory sections so vfio_dma_map
 would not fail and so far this has been the case. A mapping failure
 would be fatal. A side effect of such behavior is that some MMIO pages
 would not be mapped silently.

 However we are going to change MSIX BAR handling so we will end up having
 non-aligned sections in vfio_memory_listener (more details are in
 the next patch) and vfio_dma_map will exit QEMU.

 In order to avoid fatal failures on what previously was not a failure and
 was just silently ignored, this checks the section alignment to
 the smallest supported IOMMU page size and prints an error if not aligned;
 it also prints an error if vfio_dma_map failed despite the page size check.
 Both errors are not fatal; only MMIO RAM regions are checked
 (aka "RAM device" regions).

 If the number of errors printed is overwhelming, the MSIX relocation
 could be used to avoid excessive error output.

 This is unlikely to cause any behavioral change.

 Signed-off-by: Alexey Kardashevskiy 
 ---
  hw/vfio/common.c | 55 
 +--
  1 file changed, 49 insertions(+), 6 deletions(-)

 diff --git a/hw/vfio/common.c b/hw/vfio/common.c
 index f895e3c..736f271 100644
 --- a/hw/vfio/common.c
 +++ b/hw/vfio/common.c
 @@ -544,18 +544,40 @@ static void vfio_listener_region_add(MemoryListener 
 *listener,
  
  llsize = int128_sub(llend, int128_make64(iova));
  
 +if (memory_region_is_ram_device(section->mr)) {
 +hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
 +
 +if ((iova & pgmask) || (llsize & pgmask)) {
 +error_report("Region 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx
 + " is not aligned to 0x%"HWADDR_PRIx
 + " and cannot be mapped for DMA",
 + section->offset_within_region,
 + int128_getlo(section->size),  
>>> Didn't you want to print the section->offset_within_address_space
>>> instead of section->offset_within_region?
> 
> 
> I do, my bad - this offset_within_region is a leftover from an earlier
> version, iova is offset_within_address_space essentially.
> 
> 
>>>
>>> For an 1000e card*, with a 64kB page, it outputs:
>>> "Region 0x50..0xffb0 is not aligned to 0x1 and cannot be mapped for DMA"
>>>
>>> an absolute gpa range would be simpler I think.
>>
>> I agree, the offset within the address space seems like it'd be much
>> easier to map back to the device.  Since we're handling it as
>> non-fatal, perhaps we can even add "MMIO Region" to give the user a bit
>> more context where the issue occurs.  Alexey, do you want to submit a
>> follow-up patch, or Eric, you may have the best resources to tune the
>> message.  Thanks,
> 
> 
> I can if somebody gives a hint about what needs to be tuned in the message.
> I am also fine if Eric does this too :)

I propose to send a follow-up patch as I have the proper HW to test it now.

Thanks

Eric
> 
> 
>>
>> Alex
>>  
>>> * where Region 3: Memory at 100e (32-bit, non-prefetchable)
>>> [size=16K] contains the MSIX table
>>>
>>> Capabilities: [a0] MSI-X: Enable+ Count=5 Masked-
>>> Vector table: BAR=3 offset=
>>> PBA: BAR=3 offset=2000
>>>
>>> Thanks
>>>
>>> Eric
 + pgmask + 1);
 +return;
 +}
 +}
 +
  ret = vfio_dma_map(container, iova, int128_get64(llsize),
 vaddr, section->readonly);
  if (ret) {
  error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
   "0x%"HWADDR_PRIx", %p) = %d (%m)",
   container, iova, int128_get64(llsize), vaddr, ret);
 +if (memory_region_is_ram_device(section->mr)) {
 +/* Allow unexpected mappings not to be fatal for RAM devices 
 */
 +return;
 +}
  goto fail;
  }
  
  return;
  
  fail:
 +if (memory_region_is_ram_device(section->mr)) {
 +error_report("failed to vfio_dma_map. pci p2p may not work");
 +return;
 +}
  /*
   * On the initfn path, store the first error in the container so we
   * can gracefully fail.  Runtime, there's not much we can do other
 @@ -577,6 +599,7 @@ 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-22 Thread Alexey Kardashevskiy
On 22/3/18 2:29 am, Alex Williamson wrote:
> On Mon, 19 Mar 2018 21:49:58 +0100
> Auger Eric  wrote:
> 
>> Hi Alexey,
>>
>> On 09/02/18 08:55, Alexey Kardashevskiy wrote:
>>> At the moment if vfio_memory_listener is registered in the system memory
>>> address space, it maps/unmaps every RAM memory region for DMA.
>>> It expects system page size aligned memory sections so vfio_dma_map
>>> would not fail and so far this has been the case. A mapping failure
>>> would be fatal. A side effect of such behavior is that some MMIO pages
>>> would not be mapped silently.
>>>
>>> However we are going to change MSIX BAR handling so we will end having
>>> non-aligned sections in vfio_memory_listener (more details is in
>>> the next patch) and vfio_dma_map will exit QEMU.
>>>
>>> In order to avoid fatal failures on what previously was not a failure and
>>> was just silently ignored, this checks the section alignment to
>>> the smallest supported IOMMU page size and prints an error if not aligned;
>>> it also prints an error if vfio_dma_map failed despite the page size check.
>>> Both errors are not fatal; only MMIO RAM regions are checked
>>> (aka "RAM device" regions).
>>>
>>> If the amount of errors printed is overwhelming, the MSIX relocation
>>> could be used to avoid excessive error output.
>>>
>>> This is unlikely to cause any behavioral change.
>>>
>>> Signed-off-by: Alexey Kardashevskiy 
>>> ---
>>>  hw/vfio/common.c | 55 
>>> +--
>>>  1 file changed, 49 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>> index f895e3c..736f271 100644
>>> --- a/hw/vfio/common.c
>>> +++ b/hw/vfio/common.c
>>> @@ -544,18 +544,40 @@ static void vfio_listener_region_add(MemoryListener 
>>> *listener,
>>>  
>>>  llsize = int128_sub(llend, int128_make64(iova));
>>>  
>>> +if (memory_region_is_ram_device(section->mr)) {
>>> +hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
>>> +
>>> +if ((iova & pgmask) || (llsize & pgmask)) {
>>> +error_report("Region 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx
>>> + " is not aligned to 0x%"HWADDR_PRIx
>>> + " and cannot be mapped for DMA",
>>> + section->offset_within_region,
>>> + int128_getlo(section->size),  
>> Didn't you want to print the section->offset_within_address_space
>> instead of section->offset_within_region?


I do, my bad - this offset_within_region is a leftover from an earlier
version, iova is offset_within_address_space essentially.


>>
>> For an 1000e card*, with a 64kB page, it outputs:
>> "Region 0x50..0xffb0 is not aligned to 0x1 and cannot be mapped for DMA"
>>
>> an absolute gpa range would be simpler I think.
> 
> I agree, the offset within the address space seems like it'd be much
> easier to map back to the device.  Since we're handling it as
> non-fatal, perhaps we can even add "MMIO Region" to give the user a bit
> more context where the issue occurs.  Alexey, do you want to submit a
> follow-up patch, or Eric, you may have the best resources to tune the
> message.  Thanks,


I can if somebody gives a hint about what needs to be tuned in the message.
I am also fine if Eric does this too :)


> 
> Alex
>  
>> * where  Region 3: Memory at 100e (32-bit, non-prefetchable)
>> [size=16K] contains the MSIX table
>>
>>  Capabilities: [a0] MSI-X: Enable+ Count=5 Masked-
>>  Vector table: BAR=3 offset=
>>  PBA: BAR=3 offset=2000
>>
>> Thanks
>>
>> Eric
>>> + pgmask + 1);
>>> +return;
>>> +}
>>> +}
>>> +
>>>  ret = vfio_dma_map(container, iova, int128_get64(llsize),
>>> vaddr, section->readonly);
>>>  if (ret) {
>>>  error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
>>>   "0x%"HWADDR_PRIx", %p) = %d (%m)",
>>>   container, iova, int128_get64(llsize), vaddr, ret);
>>> +if (memory_region_is_ram_device(section->mr)) {
>>> +/* Allow unexpected mappings not to be fatal for RAM devices */
>>> +return;
>>> +}
>>>  goto fail;
>>>  }
>>>  
>>>  return;
>>>  
>>>  fail:
>>> +if (memory_region_is_ram_device(section->mr)) {
>>> +error_report("failed to vfio_dma_map. pci p2p may not work");
>>> +return;
>>> +}
>>>  /*
>>>   * On the initfn path, store the first error in the container so we
>>>   * can gracefully fail.  Runtime, there's not much we can do other
>>> @@ -577,6 +599,7 @@ static void vfio_listener_region_del(MemoryListener 
>>> *listener,
>>>  hwaddr iova, end;
>>>  Int128 llend, llsize;
>>>  int ret;
>>> +bool try_unmap = true;
>>>  
>>>  if (vfio_listener_skipped_section(section)) {
>>>  trace_vfio_listener_region_del_skip(

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-21 Thread Alex Williamson
On Mon, 19 Mar 2018 21:49:58 +0100
Auger Eric  wrote:

> Hi Alexey,
> 
> On 09/02/18 08:55, Alexey Kardashevskiy wrote:
> > At the moment if vfio_memory_listener is registered in the system memory
> > address space, it maps/unmaps every RAM memory region for DMA.
> > It expects system page size aligned memory sections so vfio_dma_map
> > would not fail and so far this has been the case. A mapping failure
> > would be fatal. A side effect of such behavior is that some MMIO pages
> > would not be mapped silently.
> > 
> > However we are going to change MSIX BAR handling so we will end having
> > non-aligned sections in vfio_memory_listener (more details is in
> > the next patch) and vfio_dma_map will exit QEMU.
> > 
> > In order to avoid fatal failures on what previously was not a failure and
> > was just silently ignored, this checks the section alignment to
> > the smallest supported IOMMU page size and prints an error if not aligned;
> > it also prints an error if vfio_dma_map failed despite the page size check.
> > Both errors are not fatal; only MMIO RAM regions are checked
> > (aka "RAM device" regions).
> > 
> > If the amount of errors printed is overwhelming, the MSIX relocation
> > could be used to avoid excessive error output.
> > 
> > This is unlikely to cause any behavioral change.
> > 
> > Signed-off-by: Alexey Kardashevskiy 
> > ---
> >  hw/vfio/common.c | 55 
> > +--
> >  1 file changed, 49 insertions(+), 6 deletions(-)
> > 
> > diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> > index f895e3c..736f271 100644
> > --- a/hw/vfio/common.c
> > +++ b/hw/vfio/common.c
> > @@ -544,18 +544,40 @@ static void vfio_listener_region_add(MemoryListener 
> > *listener,
> >  
> >  llsize = int128_sub(llend, int128_make64(iova));
> >  
> > +if (memory_region_is_ram_device(section->mr)) {
> > +hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
> > +
> > +if ((iova & pgmask) || (llsize & pgmask)) {
> > +error_report("Region 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx
> > + " is not aligned to 0x%"HWADDR_PRIx
> > + " and cannot be mapped for DMA",
> > + section->offset_within_region,
> > + int128_getlo(section->size),  
> Didn't you want to print the section->offset_within_address_space
> instead of section->offset_within_region?
> 
> For an 1000e card*, with a 64kB page, it outputs:
> "Region 0x50..0xffb0 is not aligned to 0x1 and cannot be mapped for DMA"
> 
> an absolute gpa range would be simpler I think.

I agree, the offset within the address space seems like it'd be much
easier to map back to the device.  Since we're handling it as
non-fatal, perhaps we can even add "MMIO Region" to give the user a bit
more context where the issue occurs.  Alexey, do you want to submit a
follow-up patch, or Eric, you may have the best resources to tune the
message.  Thanks,

Alex
 
> * where   Region 3: Memory at 100e (32-bit, non-prefetchable)
> [size=16K] contains the MSIX table
> 
>   Capabilities: [a0] MSI-X: Enable+ Count=5 Masked-
>   Vector table: BAR=3 offset=
>   PBA: BAR=3 offset=2000
> 
> Thanks
> 
> Eric
> > + pgmask + 1);
> > +return;
> > +}
> > +}
> > +
> >  ret = vfio_dma_map(container, iova, int128_get64(llsize),
> > vaddr, section->readonly);
> >  if (ret) {
> >  error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
> >   "0x%"HWADDR_PRIx", %p) = %d (%m)",
> >   container, iova, int128_get64(llsize), vaddr, ret);
> > +if (memory_region_is_ram_device(section->mr)) {
> > +/* Allow unexpected mappings not to be fatal for RAM devices */
> > +return;
> > +}
> >  goto fail;
> >  }
> >  
> >  return;
> >  
> >  fail:
> > +if (memory_region_is_ram_device(section->mr)) {
> > +error_report("failed to vfio_dma_map. pci p2p may not work");
> > +return;
> > +}
> >  /*
> >   * On the initfn path, store the first error in the container so we
> >   * can gracefully fail.  Runtime, there's not much we can do other
> > @@ -577,6 +599,7 @@ static void vfio_listener_region_del(MemoryListener 
> > *listener,
> >  hwaddr iova, end;
> >  Int128 llend, llsize;
> >  int ret;
> > +bool try_unmap = true;
> >  
> >  if (vfio_listener_skipped_section(section)) {
> >  trace_vfio_listener_region_del_skip(
> > @@ -629,13 +652,33 @@ static void vfio_listener_region_del(MemoryListener 
> > *listener,
> >  
> >  trace_vfio_listener_region_del(iova, end);
> >  
> > -ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
> > +if (memory_region_is_ram_device(section->mr)) {
> > +hwaddr pgmask;
> > +

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-19 Thread Auger Eric
Hi Alexey,

On 09/02/18 08:55, Alexey Kardashevskiy wrote:
> At the moment if vfio_memory_listener is registered in the system memory
> address space, it maps/unmaps every RAM memory region for DMA.
> It expects system page size aligned memory sections so vfio_dma_map
> would not fail and so far this has been the case. A mapping failure
> would be fatal. A side effect of such behavior is that some MMIO pages
> would not be mapped silently.
> 
> However we are going to change MSIX BAR handling so we will end up having
> non-aligned sections in vfio_memory_listener (more details are in
> the next patch) and vfio_dma_map will exit QEMU.
> 
> In order to avoid fatal failures on what previously was not a failure and
> was just silently ignored, this checks the section alignment to
> the smallest supported IOMMU page size and prints an error if not aligned;
> it also prints an error if vfio_dma_map failed despite the page size check.
> Both errors are not fatal; only MMIO RAM regions are checked
> (aka "RAM device" regions).
> 
> If the number of errors printed is overwhelming, the MSIX relocation
> could be used to avoid excessive error output.
> 
> This is unlikely to cause any behavioral change.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  hw/vfio/common.c | 55 +--
>  1 file changed, 49 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index f895e3c..736f271 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -544,18 +544,40 @@ static void vfio_listener_region_add(MemoryListener 
> *listener,
>  
>  llsize = int128_sub(llend, int128_make64(iova));
>  
> +if (memory_region_is_ram_device(section->mr)) {
> +hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
> +
> +if ((iova & pgmask) || (llsize & pgmask)) {
> +error_report("Region 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx
> + " is not aligned to 0x%"HWADDR_PRIx
> + " and cannot be mapped for DMA",
> + section->offset_within_region,
> + int128_getlo(section->size),
Didn't you want to print the section->offset_within_address_space
instead of section->offset_within_region?

For an 1000e card*, with a 64kB page, it outputs:
"Region 0x50..0xffb0 is not aligned to 0x1 and cannot be mapped for DMA"

an absolute gpa range would be simpler I think.


* where Region 3: Memory at 100e (32-bit, non-prefetchable)
[size=16K] contains the MSIX table

Capabilities: [a0] MSI-X: Enable+ Count=5 Masked-
Vector table: BAR=3 offset=
PBA: BAR=3 offset=2000

Thanks

Eric
> + pgmask + 1);
> +return;
> +}
> +}
> +
>  ret = vfio_dma_map(container, iova, int128_get64(llsize),
> vaddr, section->readonly);
>  if (ret) {
>  error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
>   "0x%"HWADDR_PRIx", %p) = %d (%m)",
>   container, iova, int128_get64(llsize), vaddr, ret);
> +if (memory_region_is_ram_device(section->mr)) {
> +/* Allow unexpected mappings not to be fatal for RAM devices */
> +return;
> +}
>  goto fail;
>  }
>  
>  return;
>  
>  fail:
> +if (memory_region_is_ram_device(section->mr)) {
> +error_report("failed to vfio_dma_map. pci p2p may not work");
> +return;
> +}
>  /*
>   * On the initfn path, store the first error in the container so we
>   * can gracefully fail.  Runtime, there's not much we can do other
> @@ -577,6 +599,7 @@ static void vfio_listener_region_del(MemoryListener 
> *listener,
>  hwaddr iova, end;
>  Int128 llend, llsize;
>  int ret;
> +bool try_unmap = true;
>  
>  if (vfio_listener_skipped_section(section)) {
>  trace_vfio_listener_region_del_skip(
> @@ -629,13 +652,33 @@ static void vfio_listener_region_del(MemoryListener 
> *listener,
>  
>  trace_vfio_listener_region_del(iova, end);
>  
> -ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
> +if (memory_region_is_ram_device(section->mr)) {
> +hwaddr pgmask;
> +VFIOHostDMAWindow *hostwin;
> +bool hostwin_found = false;
> +
> +QLIST_FOREACH(hostwin, >hostwin_list, hostwin_next) {
> +if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
> +hostwin_found = true;
> +break;
> +}
> +}
> +assert(hostwin_found); /* or region_add() would have failed */
> +
> +pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
> +try_unmap = !((iova & pgmask) || (llsize & pgmask));
> +}
> +
> +if (try_unmap) {
> +ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
> +if (ret) {
> +

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-14 Thread Alex Williamson
On Wed, 14 Mar 2018 13:40:21 +1100
Alexey Kardashevskiy  wrote:

> On 14/3/18 3:56 am, Alex Williamson wrote:
> > Actually making sure it compiles would have been nice:
> > 
> > qemu.git/hw/vfio/common.c: In function ‘vfio_listener_region_add’:
> > qemu.git/hw/vfio/common.c:550:40: error: invalid operands to binary & (have 
> > ‘Int128 {aka struct Int128}’ and ‘hwaddr {aka long long unsigned int}’)
> >  if ((iova & pgmask) || (llsize & pgmask)) {
> > ^
> > qemu.git/hw/vfio/common.c: In function ‘vfio_listener_region_del’:
> > qemu.git/hw/vfio/common.c:669:50: error: invalid operands to binary & (have 
> > ‘Int128 {aka struct Int128}’ and ‘hwaddr {aka long long unsigned int}’)
> >  try_unmap = !((iova & pgmask) || (llsize & pgmask));
> >   ^
> > Clearly llsize needs to be wrapped in int128_get64() here.  Should I
> > presume that testing of this patch is equally lacking?   
> 
> No.
> 
> I do not know how but this definitely compiles for me with these:
> 
> gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-16)
> gcc (GCC) 6.3.1 20161221 (Red Hat 6.3.1-1)
> 
> I always thought Int128 is a struct but it turns out there are __uint128_t
> and __int128_t and everywhere I compile (including x86_64 laptop) there is
> CONFIG_INT128=y in config-host.mak, hence no error.
> 
> I can see that you fixed it (thanks for that!) but how do you compile it to
> get this error? There is no way to disable gcc's types and even 4.8.5 can
> do these. Thanks,

Hmm, I always try to do a 32bit build before I send a pull request,
that's where I started this time.  I thought an Int128 was always a
structure, so I figured it always failed.  Good to know there was more
than just my testing on this commit.  Since it's in, we can fix it
during the freeze if it turns out to be broken.  Thanks,

Alex



Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-13 Thread Alexey Kardashevskiy
On 14/3/18 3:56 am, Alex Williamson wrote:
> [Cc +Eric]
> 
> On Tue, 13 Mar 2018 15:53:19 +1100
> Alexey Kardashevskiy  wrote:
> 
>> On 7/3/18 1:17 pm, Alexey Kardashevskiy wrote:
>>> On 26/02/18 19:36, Alexey Kardashevskiy wrote:  
 On 19/02/18 13:46, Alexey Kardashevskiy wrote:  
> On 16/02/18 16:28, David Gibson wrote:  
>> On Wed, Feb 14, 2018 at 08:55:41AM -0700, Alex Williamson wrote:  
>>> On Wed, 14 Feb 2018 19:09:16 +1100
>>> Alexey Kardashevskiy  wrote:
>>>  
 On 14/02/18 12:33, David Gibson wrote:  
> On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote: 
>
>> On 13/02/18 16:41, David Gibson wrote:
>>> On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:
 On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy 
 wrote:
> On 13/02/18 03:06, Alex Williamson wrote:
>> On Mon, 12 Feb 2018 18:05:54 +1100
>> Alexey Kardashevskiy  wrote:
>>
>>> On 12/02/18 16:19, David Gibson wrote:
 On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy 
 wrote:  
> At the moment if vfio_memory_listener is registered in the 
> system memory
> address space, it maps/unmaps every RAM memory region for DMA.
> It expects system page size aligned memory sections so 
> vfio_dma_map
> would not fail and so far this has been the case. A mapping 
> failure
> would be fatal. A side effect of such behavior is that some 
> MMIO pages
> would not be mapped silently.
>
> However we are going to change MSIX BAR handling so we will 
> end having
> non-aligned sections in vfio_memory_listener (more details is 
> in
> the next patch) and vfio_dma_map will exit QEMU.
>
> In order to avoid fatal failures on what previously was not a 
> failure and
> was just silently ignored, this checks the section alignment 
> to
> the smallest supported IOMMU page size and prints an error if 
> not aligned;
> it also prints an error if vfio_dma_map failed despite the 
> page size check.
> Both errors are not fatal; only MMIO RAM regions are checked
> (aka "RAM device" regions).
>
> If the amount of errors printed is overwhelming, the MSIX 
> relocation
> could be used to avoid excessive error output.
>
> This is unlikely to cause any behavioral change.
>
> Signed-off-by: Alexey Kardashevskiy   

 There are some relatively superficial problems noted below.

 But more fundamentally, this feels like it's extending an 
 existing
 hack past the point of usefulness.

 The explicit check for is_ram_device() here has always 
 bothered me -
 it's not like a real bus bridge magically knows whether a 
 target
 address maps to RAM or not.

 What I think is really going on is that even for systems 
 without an
 IOMMU, it's not really true to say that the PCI address space 
 maps
 directly onto address_space_memory.  Instead, there's a large, 
 but
 much less than 2^64 sized, "upstream window" at address 0 on 
 the PCI
 bus, which is identity mapped to the system bus.  Details will 
 vary
 with the system, but in practice we expect nothing but RAM to 
 be in
 that window.  Addresses not within that window won't be mapped 
 to the
 system bus but will just be broadcast on the PCI bus and might 
 be
 picked up as a p2p transaction.  
>>>
>>> Currently this p2p works only via the IOMMU, direct p2p is not 
>>> possible as
>>> the guest needs to know physical MMIO addresses to make p2p 
>>> work and it
>>> does not.
>>
>> /me points to the Direct Translated P2P section of the ACS spec, 
>> though
>> it's as prone to spoofing by the device as ATS.  In any case, p2p
>> reflected from the IOMMU is 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-13 Thread Eric Blake

[Making a meta-comment that I've pointed out to others before]

On 03/13/2018 12:13 PM, Auger Eric wrote:

Hi Alex,

...


On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy wrote:

At the moment if vfio_memory_listener is registered in the system memory


17 levels of '>' before I even began my reply.  And several screenfuls 
of content that I'm snipping...



Clearly llsize needs to be wrapped in int128_get64() here.  Should I
presume that testing of this patch is equally lacking?  Eric, have you
done any testing of this?  I think this was fixing a mapping issue you
reported.  Thanks,

not that version unfortunately. I don't have access to the HW on which I
had the issue anymore. Maybe I could by the beginning of next week?


before getting to the 2-line meat of the message.  It's okay to snip 
portions of the quoted content that are no longer relevant to the 
immediate message, as it helps improve the signal-to-noise ratio (we 
have list archives, for when referring to prior context is needed)


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-13 Thread Auger Eric
Hi Alex,

On 13/03/18 17:56, Alex Williamson wrote:
> [Cc +Eric]
> 
> On Tue, 13 Mar 2018 15:53:19 +1100
> Alexey Kardashevskiy  wrote:
> 
>> On 7/3/18 1:17 pm, Alexey Kardashevskiy wrote:
>>> On 26/02/18 19:36, Alexey Kardashevskiy wrote:  
 On 19/02/18 13:46, Alexey Kardashevskiy wrote:  
> On 16/02/18 16:28, David Gibson wrote:  
>> On Wed, Feb 14, 2018 at 08:55:41AM -0700, Alex Williamson wrote:  
>>> On Wed, 14 Feb 2018 19:09:16 +1100
>>> Alexey Kardashevskiy  wrote:
>>>  
 On 14/02/18 12:33, David Gibson wrote:  
> On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote: 
>
>> On 13/02/18 16:41, David Gibson wrote:
>>> On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:
 On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy 
 wrote:
> On 13/02/18 03:06, Alex Williamson wrote:
>> On Mon, 12 Feb 2018 18:05:54 +1100
>> Alexey Kardashevskiy  wrote:
>>
>>> On 12/02/18 16:19, David Gibson wrote:
 On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy 
 wrote:  
> At the moment if vfio_memory_listener is registered in the 
> system memory
> address space, it maps/unmaps every RAM memory region for DMA.
> It expects system page size aligned memory sections so 
> vfio_dma_map
> would not fail and so far this has been the case. A mapping 
> failure
> would be fatal. A side effect of such behavior is that some 
> MMIO pages
> would not be mapped silently.
>
> However we are going to change MSIX BAR handling so we will 
> end having
> non-aligned sections in vfio_memory_listener (more details is 
> in
> the next patch) and vfio_dma_map will exit QEMU.
>
> In order to avoid fatal failures on what previously was not a 
> failure and
> was just silently ignored, this checks the section alignment 
> to
> the smallest supported IOMMU page size and prints an error if 
> not aligned;
> it also prints an error if vfio_dma_map failed despite the 
> page size check.
> Both errors are not fatal; only MMIO RAM regions are checked
> (aka "RAM device" regions).
>
> If the amount of errors printed is overwhelming, the MSIX 
> relocation
> could be used to avoid excessive error output.
>
> This is unlikely to cause any behavioral change.
>
> Signed-off-by: Alexey Kardashevskiy   

 There are some relatively superficial problems noted below.

 But more fundamentally, this feels like it's extending an 
 existing
 hack past the point of usefulness.

 The explicit check for is_ram_device() here has always 
 bothered me -
 it's not like a real bus bridge magically knows whether a 
 target
 address maps to RAM or not.

 What I think is really going on is that even for systems 
 without an
 IOMMU, it's not really true to say that the PCI address space 
 maps
 directly onto address_space_memory.  Instead, there's a large, 
 but
 much less than 2^64 sized, "upstream window" at address 0 on 
 the PCI
 bus, which is identity mapped to the system bus.  Details will 
 vary
 with the system, but in practice we expect nothing but RAM to 
 be in
 that window.  Addresses not within that window won't be mapped 
 to the
 system bus but will just be broadcast on the PCI bus and might 
 be
 picked up as a p2p transaction.  
>>>
>>> Currently this p2p works only via the IOMMU, direct p2p is not 
>>> possible as
>>> the guest needs to know physical MMIO addresses to make p2p 
>>> work and it
>>> does not.
>>
>> /me points to the Direct Translated P2P section of the ACS spec, 
>> though
>> it's as prone to spoofing by the device as ATS.  In any case, p2p
>> reflected from the 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-13 Thread Alex Williamson
[Cc +Eric]

On Tue, 13 Mar 2018 15:53:19 +1100
Alexey Kardashevskiy  wrote:

> On 7/3/18 1:17 pm, Alexey Kardashevskiy wrote:
> > On 26/02/18 19:36, Alexey Kardashevskiy wrote:  
> >> On 19/02/18 13:46, Alexey Kardashevskiy wrote:  
> >>> On 16/02/18 16:28, David Gibson wrote:  
>  On Wed, Feb 14, 2018 at 08:55:41AM -0700, Alex Williamson wrote:  
> > On Wed, 14 Feb 2018 19:09:16 +1100
> > Alexey Kardashevskiy  wrote:
> >  
> >> On 14/02/18 12:33, David Gibson wrote:  
> >>> On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote: 
> >>>
>  On 13/02/18 16:41, David Gibson wrote:
> > On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:
> >> On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy 
> >> wrote:
> >>> On 13/02/18 03:06, Alex Williamson wrote:
>  On Mon, 12 Feb 2018 18:05:54 +1100
>  Alexey Kardashevskiy  wrote:
> 
> > On 12/02/18 16:19, David Gibson wrote:
> >> On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy 
> >> wrote:  
> >>> At the moment if vfio_memory_listener is registered in the 
> >>> system memory
> >>> address space, it maps/unmaps every RAM memory region for DMA.
> >>> It expects system page size aligned memory sections so 
> >>> vfio_dma_map
> >>> would not fail and so far this has been the case. A mapping 
> >>> failure
> >>> would be fatal. A side effect of such behavior is that some 
> >>> MMIO pages
> >>> would not be mapped silently.
> >>>
> >>> However we are going to change MSIX BAR handling so we will 
> >>> end having
> >>> non-aligned sections in vfio_memory_listener (more details is 
> >>> in
> >>> the next patch) and vfio_dma_map will exit QEMU.
> >>>
> >>> In order to avoid fatal failures on what previously was not a 
> >>> failure and
> >>> was just silently ignored, this checks the section alignment 
> >>> to
> >>> the smallest supported IOMMU page size and prints an error if 
> >>> not aligned;
> >>> it also prints an error if vfio_dma_map failed despite the 
> >>> page size check.
> >>> Both errors are not fatal; only MMIO RAM regions are checked
> >>> (aka "RAM device" regions).
> >>>
> >>> If the amount of errors printed is overwhelming, the MSIX 
> >>> relocation
> >>> could be used to avoid excessive error output.
> >>>
> >>> This is unlikely to cause any behavioral change.
> >>>
> >>> Signed-off-by: Alexey Kardashevskiy   
> >>
> >> There are some relatively superficial problems noted below.
> >>
> >> But more fundamentally, this feels like it's extending an 
> >> existing
> >> hack past the point of usefulness.
> >>
> >> The explicit check for is_ram_device() here has always 
> >> bothered me -
> >> it's not like a real bus bridge magically knows whether a 
> >> target
> >> address maps to RAM or not.
> >>
> >> What I think is really going on is that even for systems 
> >> without an
> >> IOMMU, it's not really true to say that the PCI address space 
> >> maps
> >> directly onto address_space_memory.  Instead, there's a large, 
> >> but
> >> much less than 2^64 sized, "upstream window" at address 0 on 
> >> the PCI
> >> bus, which is identity mapped to the system bus.  Details will 
> >> vary
> >> with the system, but in practice we expect nothing but RAM to 
> >> be in
> >> that window.  Addresses not within that window won't be mapped 
> >> to the
> >> system bus but will just be broadcast on the PCI bus and might 
> >> be
> >> picked up as a p2p transaction.  
> >
> > Currently this p2p works only via the IOMMU, direct p2p is not 
> > possible as
> > the guest needs to know physical MMIO addresses to make p2p 
> > work and it
> > does not.
> 
>  /me points to the Direct Translated P2P section of the ACS spec, 
>  though
>  it's as prone to spoofing by the device as ATS.  In any case, p2p
>  reflected from the IOMMU is still p2p and offloads the CPU even 
>  if
> 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-12 Thread Alexey Kardashevskiy
On 7/3/18 1:17 pm, Alexey Kardashevskiy wrote:
> On 26/02/18 19:36, Alexey Kardashevskiy wrote:
>> On 19/02/18 13:46, Alexey Kardashevskiy wrote:
>>> On 16/02/18 16:28, David Gibson wrote:
 On Wed, Feb 14, 2018 at 08:55:41AM -0700, Alex Williamson wrote:
> On Wed, 14 Feb 2018 19:09:16 +1100
> Alexey Kardashevskiy  wrote:
>
>> On 14/02/18 12:33, David Gibson wrote:
>>> On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote:  
 On 13/02/18 16:41, David Gibson wrote:  
> On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:  
>> On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy 
>> wrote:  
>>> On 13/02/18 03:06, Alex Williamson wrote:  
 On Mon, 12 Feb 2018 18:05:54 +1100
 Alexey Kardashevskiy  wrote:
  
> On 12/02/18 16:19, David Gibson wrote:  
>> On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy 
>> wrote:
>>> At the moment if vfio_memory_listener is registered in the 
>>> system memory
>>> address space, it maps/unmaps every RAM memory region for DMA.
>>> It expects system page size aligned memory sections so 
>>> vfio_dma_map
>>> would not fail and so far this has been the case. A mapping 
>>> failure
>>> would be fatal. A side effect of such behavior is that some 
>>> MMIO pages
>>> would not be mapped silently.
>>>
>>> However we are going to change MSIX BAR handling so we will end 
>>> having
>>> non-aligned sections in vfio_memory_listener (more details are in
>>> the next patch) and vfio_dma_map will exit QEMU.
>>>
>>> In order to avoid fatal failures on what previously was not a 
>>> failure and
>>> was just silently ignored, this checks the section alignment to
>>> the smallest supported IOMMU page size and prints an error if 
>>> not aligned;
>>> it also prints an error if vfio_dma_map failed despite the page 
>>> size check.
>>> Both errors are not fatal; only MMIO RAM regions are checked
>>> (aka "RAM device" regions).
>>>
>>> If the amount of errors printed is overwhelming, the MSIX 
>>> relocation
>>> could be used to avoid excessive error output.
>>>
>>> This is unlikely to cause any behavioral change.
>>>
>>> Signed-off-by: Alexey Kardashevskiy 
>>
>> There are some relatively superficial problems noted below.
>>
>> But more fundamentally, this feels like it's extending an 
>> existing
>> hack past the point of usefulness.
>>
>> The explicit check for is_ram_device() here has always bothered 
>> me -
>> it's not like a real bus bridge magically knows whether a target
>> address maps to RAM or not.
>>
>> What I think is really going on is that even for systems without 
>> an
>> IOMMU, it's not really true to say that the PCI address space 
>> maps
>> directly onto address_space_memory.  Instead, there's a large, 
>> but
>> much less than 2^64 sized, "upstream window" at address 0 on the 
>> PCI
>> bus, which is identity mapped to the system bus.  Details will 
>> vary
>> with the system, but in practice we expect nothing but RAM to be 
>> in
>> that window.  Addresses not within that window won't be mapped 
>> to the
>> system bus but will just be broadcast on the PCI bus and might be
>> picked up as a p2p transaction.
>
> Currently this p2p works only via the IOMMU, direct p2p is not 
> possible as
> the guest needs to know physical MMIO addresses to make p2p work 
> and it
> does not.  

 /me points to the Direct Translated P2P section of the ACS spec, 
 though
 it's as prone to spoofing by the device as ATS.  In any case, p2p
 reflected from the IOMMU is still p2p and offloads the CPU even if
 bandwidth suffers vs bare metal depending on if the data doubles 
 back
 over any links.  Thanks,  
>>>
>>> Sure, I was just saying that p2p via IOMMU won't be as simple as 
>>> broadcast
>>> on the PCI bus, IOMMU needs to be programmed in advance to make 
>>> this work,
>>> and currently that broadcast won't work for the passed 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-03-06 Thread Alexey Kardashevskiy
On 26/02/18 19:36, Alexey Kardashevskiy wrote:
> On 19/02/18 13:46, Alexey Kardashevskiy wrote:
>> On 16/02/18 16:28, David Gibson wrote:
>>> On Wed, Feb 14, 2018 at 08:55:41AM -0700, Alex Williamson wrote:
 On Wed, 14 Feb 2018 19:09:16 +1100
 Alexey Kardashevskiy  wrote:

> On 14/02/18 12:33, David Gibson wrote:
>> On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote:  
>>> On 13/02/18 16:41, David Gibson wrote:  
 On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:  
> On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy wrote: 
>  
>> On 13/02/18 03:06, Alex Williamson wrote:  
>>> On Mon, 12 Feb 2018 18:05:54 +1100
>>> Alexey Kardashevskiy  wrote:
>>>  
 On 12/02/18 16:19, David Gibson wrote:  
> On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy 
> wrote:
>> At the moment if vfio_memory_listener is registered in the 
>> system memory
>> address space, it maps/unmaps every RAM memory region for DMA.
>> It expects system page size aligned memory sections so 
>> vfio_dma_map
>> would not fail and so far this has been the case. A mapping 
>> failure
>> would be fatal. A side effect of such behavior is that some MMIO 
>> pages
>> would not be mapped silently.
>>
>> However we are going to change MSIX BAR handling so we will end 
>> having
>> non-aligned sections in vfio_memory_listener (more details are in
>> the next patch) and vfio_dma_map will exit QEMU.
>>
>> In order to avoid fatal failures on what previously was not a 
>> failure and
>> was just silently ignored, this checks the section alignment to
>> the smallest supported IOMMU page size and prints an error if 
>> not aligned;
>> it also prints an error if vfio_dma_map failed despite the page 
>> size check.
>> Both errors are not fatal; only MMIO RAM regions are checked
>> (aka "RAM device" regions).
>>
>> If the amount of errors printed is overwhelming, the MSIX 
>> relocation
>> could be used to avoid excessive error output.
>>
>> This is unlikely to cause any behavioral change.
>>
>> Signed-off-by: Alexey Kardashevskiy 
>
> There are some relatively superficial problems noted below.
>
> But more fundamentally, this feels like it's extending an existing
> hack past the point of usefulness.
>
> The explicit check for is_ram_device() here has always bothered 
> me -
> it's not like a real bus bridge magically knows whether a target
> address maps to RAM or not.
>
> What I think is really going on is that even for systems without 
> an
> IOMMU, it's not really true to say that the PCI address space maps
> directly onto address_space_memory.  Instead, there's a large, but
> much less than 2^64 sized, "upstream window" at address 0 on the 
> PCI
> bus, which is identity mapped to the system bus.  Details will 
> vary
> with the system, but in practice we expect nothing but RAM to be 
> in
> that window.  Addresses not within that window won't be mapped to 
> the
> system bus but will just be broadcast on the PCI bus and might be
> picked up as a p2p transaction.

 Currently this p2p works only via the IOMMU, direct p2p is not 
 possible as
 the guest needs to know physical MMIO addresses to make p2p work 
 and it
 does not.  
>>>
>>> /me points to the Direct Translated P2P section of the ACS spec, 
>>> though
>>> it's as prone to spoofing by the device as ATS.  In any case, p2p
>>> reflected from the IOMMU is still p2p and offloads the CPU even if
>>> bandwidth suffers vs bare metal depending on if the data doubles 
>>> back
>>> over any links.  Thanks,  
>>
>> Sure, I was just saying that p2p via IOMMU won't be as simple as 
>> broadcast
>> on the PCI bus, IOMMU needs to be programmed in advance to make this 
>> work,
>> and currently that broadcast won't work for the passed through 
>> devices.  
>
> Well, sure, p2p in a guest with passthrough devices clearly needs to
> be translated through the IOMMU (and p2p from a passthrough to an

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-26 Thread Alexey Kardashevskiy
On 19/02/18 13:46, Alexey Kardashevskiy wrote:
> On 16/02/18 16:28, David Gibson wrote:
>> On Wed, Feb 14, 2018 at 08:55:41AM -0700, Alex Williamson wrote:
>>> On Wed, 14 Feb 2018 19:09:16 +1100
>>> Alexey Kardashevskiy  wrote:
>>>
 On 14/02/18 12:33, David Gibson wrote:
> On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote:  
>> On 13/02/18 16:41, David Gibson wrote:  
>>> On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:  
 On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy wrote:  
> On 13/02/18 03:06, Alex Williamson wrote:  
>> On Mon, 12 Feb 2018 18:05:54 +1100
>> Alexey Kardashevskiy  wrote:
>>  
>>> On 12/02/18 16:19, David Gibson wrote:  
 On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy 
 wrote:
> At the moment if vfio_memory_listener is registered in the system 
> memory
> address space, it maps/unmaps every RAM memory region for DMA.
> It expects system page size aligned memory sections so 
> vfio_dma_map
> would not fail and so far this has been the case. A mapping 
> failure
> would be fatal. A side effect of such behavior is that some MMIO 
> pages
> would not be mapped silently.
>
> However we are going to change MSIX BAR handling so we will end 
> having
> non-aligned sections in vfio_memory_listener (more details are in
> the next patch) and vfio_dma_map will exit QEMU.
>
> In order to avoid fatal failures on what previously was not a 
> failure and
> was just silently ignored, this checks the section alignment to
> the smallest supported IOMMU page size and prints an error if not 
> aligned;
> it also prints an error if vfio_dma_map failed despite the page 
> size check.
> Both errors are not fatal; only MMIO RAM regions are checked
> (aka "RAM device" regions).
>
> If the amount of errors printed is overwhelming, the MSIX 
> relocation
> could be used to avoid excessive error output.
>
> This is unlikely to cause any behavioral change.
>
> Signed-off-by: Alexey Kardashevskiy 

 There are some relatively superficial problems noted below.

 But more fundamentally, this feels like it's extending an existing
 hack past the point of usefulness.

 The explicit check for is_ram_device() here has always bothered me 
 -
 it's not like a real bus bridge magically knows whether a target
 address maps to RAM or not.

 What I think is really going on is that even for systems without an
 IOMMU, it's not really true to say that the PCI address space maps
 directly onto address_space_memory.  Instead, there's a large, but
 much less than 2^64 sized, "upstream window" at address 0 on the 
 PCI
 bus, which is identity mapped to the system bus.  Details will vary
 with the system, but in practice we expect nothing but RAM to be in
 that window.  Addresses not within that window won't be mapped to 
 the
 system bus but will just be broadcast on the PCI bus and might be
 picked up as a p2p transaction.
>>>
>>> Currently this p2p works only via the IOMMU, direct p2p is not 
>>> possible as
>>> the guest needs to know physical MMIO addresses to make p2p work 
>>> and it
>>> does not.  
>>
>> /me points to the Direct Translated P2P section of the ACS spec, 
>> though
>> it's as prone to spoofing by the device as ATS.  In any case, p2p
>> reflected from the IOMMU is still p2p and offloads the CPU even if
>> bandwidth suffers vs bare metal depending on if the data doubles back
>> over any links.  Thanks,  
>
> Sure, I was just saying that p2p via IOMMU won't be as simple as 
> broadcast
> on the PCI bus, IOMMU needs to be programmed in advance to make this 
> work,
> and currently that broadcast won't work for the passed through devices. 
>  

 Well, sure, p2p in a guest with passthrough devices clearly needs to
 be translated through the IOMMU (and p2p from a passthrough to an
 emulated device is essentially impossible).

 But.. what does that have to do with this code.  This is the memory
 area watcher, looking for memory regions being mapped directly 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-18 Thread Alexey Kardashevskiy
On 16/02/18 16:28, David Gibson wrote:
> On Wed, Feb 14, 2018 at 08:55:41AM -0700, Alex Williamson wrote:
>> On Wed, 14 Feb 2018 19:09:16 +1100
>> Alexey Kardashevskiy  wrote:
>>
>>> On 14/02/18 12:33, David Gibson wrote:
 On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote:  
> On 13/02/18 16:41, David Gibson wrote:  
>> On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:  
>>> On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy wrote:  
 On 13/02/18 03:06, Alex Williamson wrote:  
> On Mon, 12 Feb 2018 18:05:54 +1100
> Alexey Kardashevskiy  wrote:
>  
>> On 12/02/18 16:19, David Gibson wrote:  
>>> On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy 
>>> wrote:
 At the moment if vfio_memory_listener is registered in the system 
 memory
 address space, it maps/unmaps every RAM memory region for DMA.
 It expects system page size aligned memory sections so vfio_dma_map
 would not fail and so far this has been the case. A mapping failure
 would be fatal. A side effect of such behavior is that some MMIO 
 pages
 would not be mapped silently.

 However we are going to change MSIX BAR handling so we will end 
 having
 non-aligned sections in vfio_memory_listener (more details are in
 the next patch) and vfio_dma_map will exit QEMU.

 In order to avoid fatal failures on what previously was not a 
 failure and
 was just silently ignored, this checks the section alignment to
 the smallest supported IOMMU page size and prints an error if not 
 aligned;
 it also prints an error if vfio_dma_map failed despite the page 
 size check.
 Both errors are not fatal; only MMIO RAM regions are checked
 (aka "RAM device" regions).

 If the amount of errors printed is overwhelming, the MSIX 
 relocation
 could be used to avoid excessive error output.

 This is unlikely to cause any behavioral change.

 Signed-off-by: Alexey Kardashevskiy 
>>>
>>> There are some relatively superficial problems noted below.
>>>
>>> But more fundamentally, this feels like it's extending an existing
>>> hack past the point of usefulness.
>>>
>>> The explicit check for is_ram_device() here has always bothered me -
>>> it's not like a real bus bridge magically knows whether a target
>>> address maps to RAM or not.
>>>
>>> What I think is really going on is that even for systems without an
>>> IOMMU, it's not really true to say that the PCI address space maps
>>> directly onto address_space_memory.  Instead, there's a large, but
>>> much less than 2^64 sized, "upstream window" at address 0 on the PCI
>>> bus, which is identity mapped to the system bus.  Details will vary
>>> with the system, but in practice we expect nothing but RAM to be in
>>> that window.  Addresses not within that window won't be mapped to 
>>> the
>>> system bus but will just be broadcast on the PCI bus and might be
>>> picked up as a p2p transaction.
>>
>> Currently this p2p works only via the IOMMU, direct p2p is not 
>> possible as
>> the guest needs to know physical MMIO addresses to make p2p work and 
>> it
>> does not.  
>
> /me points to the Direct Translated P2P section of the ACS spec, 
> though
> it's as prone to spoofing by the device as ATS.  In any case, p2p
> reflected from the IOMMU is still p2p and offloads the CPU even if
> bandwidth suffers vs bare metal depending on if the data doubles back
> over any links.  Thanks,  

 Sure, I was just saying that p2p via IOMMU won't be as simple as 
 broadcast
 on the PCI bus, IOMMU needs to be programmed in advance to make this 
 work,
 and currently that broadcast won't work for the passed through devices.  
>>>
>>> Well, sure, p2p in a guest with passthrough devices clearly needs to
>>> be translated through the IOMMU (and p2p from a passthrough to an
>>> emulated device is essentially impossible).
>>>
>>> But.. what does that have to do with this code.  This is the memory
>>> area watcher, looking for memory regions being mapped directly into
>>> the PCI space.  NOT IOMMU regions, since those are handled separately
>>> by wiring up the IOMMU notifier.  This will only trigger if RAM-like,
>>> non-RAM regions are put into PCI space *not* 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-15 Thread David Gibson
On Wed, Feb 14, 2018 at 08:55:41AM -0700, Alex Williamson wrote:
> On Wed, 14 Feb 2018 19:09:16 +1100
> Alexey Kardashevskiy  wrote:
> 
> > On 14/02/18 12:33, David Gibson wrote:
> > > On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote:  
> > >> On 13/02/18 16:41, David Gibson wrote:  
> > >>> On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:  
> >  On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy wrote:  
> > > On 13/02/18 03:06, Alex Williamson wrote:  
> > >> On Mon, 12 Feb 2018 18:05:54 +1100
> > >> Alexey Kardashevskiy  wrote:
> > >>  
> > >>> On 12/02/18 16:19, David Gibson wrote:  
> >  On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy 
> >  wrote:
> > > At the moment if vfio_memory_listener is registered in the system 
> > > memory
> > > address space, it maps/unmaps every RAM memory region for DMA.
> > > It expects system page size aligned memory sections so 
> > > vfio_dma_map
> > > would not fail and so far this has been the case. A mapping 
> > > failure
> > > would be fatal. A side effect of such behavior is that some MMIO 
> > > pages
> > > would not be mapped silently.
> > >
> > > However we are going to change MSIX BAR handling so we will end 
> > > having
> > > non-aligned sections in vfio_memory_listener (more details are in
> > > the next patch) and vfio_dma_map will exit QEMU.
> > >
> > > In order to avoid fatal failures on what previously was not a 
> > > failure and
> > > was just silently ignored, this checks the section alignment to
> > > the smallest supported IOMMU page size and prints an error if not 
> > > aligned;
> > > it also prints an error if vfio_dma_map failed despite the page 
> > > size check.
> > > Both errors are not fatal; only MMIO RAM regions are checked
> > > (aka "RAM device" regions).
> > >
> > > If the amount of errors printed is overwhelming, the MSIX 
> > > relocation
> > > could be used to avoid excessive error output.
> > >
> > > This is unlikely to cause any behavioral change.
> > >
> > > Signed-off-by: Alexey Kardashevskiy 
> > 
> >  There are some relatively superficial problems noted below.
> > 
> >  But more fundamentally, this feels like it's extending an existing
> >  hack past the point of usefulness.
> > 
> >  The explicit check for is_ram_device() here has always bothered me 
> >  -
> >  it's not like a real bus bridge magically knows whether a target
> >  address maps to RAM or not.
> > 
> >  What I think is really going on is that even for systems without an
> >  IOMMU, it's not really true to say that the PCI address space maps
> >  directly onto address_space_memory.  Instead, there's a large, but
> >  much less than 2^64 sized, "upstream window" at address 0 on the 
> >  PCI
> >  bus, which is identity mapped to the system bus.  Details will vary
> >  with the system, but in practice we expect nothing but RAM to be in
> >  that window.  Addresses not within that window won't be mapped to 
> >  the
> >  system bus but will just be broadcast on the PCI bus and might be
> >  picked up as a p2p transaction.
> > >>>
> > >>> Currently this p2p works only via the IOMMU, direct p2p is not 
> > >>> possible as
> > >>> the guest needs to know physical MMIO addresses to make p2p work 
> > >>> and it
> > >>> does not.  
> > >>
> > >> /me points to the Direct Translated P2P section of the ACS spec, 
> > >> though
> > >> it's as prone to spoofing by the device as ATS.  In any case, p2p
> > >> reflected from the IOMMU is still p2p and offloads the CPU even if
> > >> bandwidth suffers vs bare metal depending on if the data doubles back
> > >> over any links.  Thanks,  
> > >
> > > Sure, I was just saying that p2p via IOMMU won't be as simple as 
> > > broadcast
> > > on the PCI bus, IOMMU needs to be programmed in advance to make this 
> > > work,
> > and currently that broadcast won't work for the passed through devices. 
> > >  
> > 
> >  Well, sure, p2p in a guest with passthrough devices clearly needs to
> >  be translated through the IOMMU (and p2p from a passthrough to an
> >  emulated device is essentially impossible).
> > 
> >  But.. what does that have to do with this code.  This is the memory
> >  area watcher, looking for memory regions being mapped directly into
> >  the PCI space.  NOT IOMMU regions, since those are handled separately
> >  by 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-14 Thread Alex Williamson
On Wed, 14 Feb 2018 19:09:16 +1100
Alexey Kardashevskiy  wrote:

> On 14/02/18 12:33, David Gibson wrote:
> > On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote:  
> >> On 13/02/18 16:41, David Gibson wrote:  
> >>> On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:  
>  On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy wrote:  
> > On 13/02/18 03:06, Alex Williamson wrote:  
> >> On Mon, 12 Feb 2018 18:05:54 +1100
> >> Alexey Kardashevskiy  wrote:
> >>  
> >>> On 12/02/18 16:19, David Gibson wrote:  
>  On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy 
>  wrote:
> > At the moment if vfio_memory_listener is registered in the system 
> > memory
> > address space, it maps/unmaps every RAM memory region for DMA.
> > It expects system page size aligned memory sections so vfio_dma_map
> > would not fail and so far this has been the case. A mapping failure
> > would be fatal. A side effect of such behavior is that some MMIO 
> > pages
> > would not be mapped silently.
> >
> > However we are going to change MSIX BAR handling so we will end 
> > having
> > non-aligned sections in vfio_memory_listener (more details are in
> > the next patch) and vfio_dma_map will exit QEMU.
> >
> > In order to avoid fatal failures on what previously was not a 
> > failure and
> > was just silently ignored, this checks the section alignment to
> > the smallest supported IOMMU page size and prints an error if not 
> > aligned;
> > it also prints an error if vfio_dma_map failed despite the page 
> > size check.
> > Both errors are not fatal; only MMIO RAM regions are checked
> > (aka "RAM device" regions).
> >
> > If the amount of errors printed is overwhelming, the MSIX relocation
> > could be used to avoid excessive error output.
> >
> > This is unlikely to cause any behavioral change.
> >
> > Signed-off-by: Alexey Kardashevskiy 
> 
>  There are some relatively superficial problems noted below.
> 
>  But more fundamentally, this feels like it's extending an existing
>  hack past the point of usefulness.
> 
>  The explicit check for is_ram_device() here has always bothered me -
>  it's not like a real bus bridge magically knows whether a target
>  address maps to RAM or not.
> 
>  What I think is really going on is that even for systems without an
>  IOMMU, it's not really true to say that the PCI address space maps
>  directly onto address_space_memory.  Instead, there's a large, but
>  much less than 2^64 sized, "upstream window" at address 0 on the PCI
>  bus, which is identity mapped to the system bus.  Details will vary
>  with the system, but in practice we expect nothing but RAM to be in
>  that window.  Addresses not within that window won't be mapped to the
>  system bus but will just be broadcast on the PCI bus and might be
>  picked up as a p2p transaction.
> >>>
> >>> Currently this p2p works only via the IOMMU, direct p2p is not 
> >>> possible as
> >>> the guest needs to know physical MMIO addresses to make p2p work and 
> >>> it
> >>> does not.  
> >>
> >> /me points to the Direct Translated P2P section of the ACS spec, though
> >> it's as prone to spoofing by the device as ATS.  In any case, p2p
> >> reflected from the IOMMU is still p2p and offloads the CPU even if
> >> bandwidth suffers vs bare metal depending on if the data doubles back
> >> over any links.  Thanks,  
> >
> > Sure, I was just saying that p2p via IOMMU won't be as simple as 
> > broadcast
> > on the PCI bus, IOMMU needs to be programmed in advance to make this 
> > work,
> > and currently that broadcast won't work for the passed through devices.  
> 
>  Well, sure, p2p in a guest with passthrough devices clearly needs to
>  be translated through the IOMMU (and p2p from a passthrough to an
>  emulated device is essentially impossible).
> 
>  But.. what does that have to do with this code.  This is the memory
>  area watcher, looking for memory regions being mapped directly into
>  the PCI space.  NOT IOMMU regions, since those are handled separately
>  by wiring up the IOMMU notifier.  This will only trigger if RAM-like,
>  non-RAM regions are put into PCI space *not* behind an IOMMMU.  
> >>>
> >>> Duh, sorry, realised I was mixing up host and guest IOMMU.  I guess
> >>> the point here is that this will map RAM-like devices into the host
> >>> IOMMU when there is no guest IOMMU, allowing p2p transactions 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-14 Thread Alexey Kardashevskiy
On 14/02/18 12:33, David Gibson wrote:
> On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote:
>> On 13/02/18 16:41, David Gibson wrote:
>>> On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:
 On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy wrote:
> On 13/02/18 03:06, Alex Williamson wrote:
>> On Mon, 12 Feb 2018 18:05:54 +1100
>> Alexey Kardashevskiy  wrote:
>>
>>> On 12/02/18 16:19, David Gibson wrote:
 On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy wrote:  
> At the moment if vfio_memory_listener is registered in the system 
> memory
> address space, it maps/unmaps every RAM memory region for DMA.
> It expects system page size aligned memory sections so vfio_dma_map
> would not fail and so far this has been the case. A mapping failure
> would be fatal. A side effect of such behavior is that some MMIO pages
> would not be mapped silently.
>
> However we are going to change MSIX BAR handling so we will end having
> non-aligned sections in vfio_memory_listener (more details are in
> the next patch) and vfio_dma_map will exit QEMU.
>
> In order to avoid fatal failures on what previously was not a failure 
> and
> was just silently ignored, this checks the section alignment to
> the smallest supported IOMMU page size and prints an error if not 
> aligned;
> it also prints an error if vfio_dma_map failed despite the page size 
> check.
> Both errors are not fatal; only MMIO RAM regions are checked
> (aka "RAM device" regions).
>
> If the amount of errors printed is overwhelming, the MSIX relocation
> could be used to avoid excessive error output.
>
> This is unlikely to cause any behavioral change.
>
> Signed-off-by: Alexey Kardashevskiy   

 There are some relatively superficial problems noted below.

 But more fundamentally, this feels like it's extending an existing
 hack past the point of usefulness.

 The explicit check for is_ram_device() here has always bothered me -
 it's not like a real bus bridge magically knows whether a target
 address maps to RAM or not.

 What I think is really going on is that even for systems without an
 IOMMU, it's not really true to say that the PCI address space maps
 directly onto address_space_memory.  Instead, there's a large, but
 much less than 2^64 sized, "upstream window" at address 0 on the PCI
 bus, which is identity mapped to the system bus.  Details will vary
 with the system, but in practice we expect nothing but RAM to be in
 that window.  Addresses not within that window won't be mapped to the
 system bus but will just be broadcast on the PCI bus and might be
 picked up as a p2p transaction.  
>>>
>>> Currently this p2p works only via the IOMMU, direct p2p is not possible 
>>> as
>>> the guest needs to know physical MMIO addresses to make p2p work and it
>>> does not.
>>
>> /me points to the Direct Translated P2P section of the ACS spec, though
>> it's as prone to spoofing by the device as ATS.  In any case, p2p
>> reflected from the IOMMU is still p2p and offloads the CPU even if
>> bandwidth suffers vs bare metal depending on if the data doubles back
>> over any links.  Thanks,
>
> Sure, I was just saying that p2p via IOMMU won't be as simple as broadcast
> on the PCI bus, IOMMU needs to be programmed in advance to make this work,
> and currently that broadcast won't work for the passed through devices.

 Well, sure, p2p in a guest with passthrough devices clearly needs to
 be translated through the IOMMU (and p2p from a passthrough to an
 emulated device is essentially impossible).

 But.. what does that have to do with this code.  This is the memory
 area watcher, looking for memory regions being mapped directly into
 the PCI space.  NOT IOMMU regions, since those are handled separately
 by wiring up the IOMMU notifier.  This will only trigger if RAM-like,
 non-RAM regions are put into PCI space *not* behind an IOMMMU.
>>>
>>> Duh, sorry, realised I was mixing up host and guest IOMMU.  I guess
>>> the point here is that this will map RAM-like devices into the host
>>> IOMMU when there is no guest IOMMU, allowing p2p transactions between
>>> passthrough devices (though not from passthrough to emulated devices).
>>
>> Correct.
>>
>>>
>>> The conditions still seem kind of awkward to me, but I guess it makes
>>> sense.
>>
>> Is it the time to split this listener to RAM-listener and PCI bus listener?
> 
> I'm not really sure what you mean by that.
> 
>> On x86 it listens on 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-13 Thread David Gibson
On Tue, Feb 13, 2018 at 07:20:56PM +1100, Alexey Kardashevskiy wrote:
> On 13/02/18 16:41, David Gibson wrote:
> > On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:
> >> On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy wrote:
> >>> On 13/02/18 03:06, Alex Williamson wrote:
>  On Mon, 12 Feb 2018 18:05:54 +1100
>  Alexey Kardashevskiy  wrote:
> 
> > On 12/02/18 16:19, David Gibson wrote:
> >> On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy wrote:  
> >>> At the moment if vfio_memory_listener is registered in the system 
> >>> memory
> >>> address space, it maps/unmaps every RAM memory region for DMA.
> >>> It expects system page size aligned memory sections so vfio_dma_map
> >>> would not fail and so far this has been the case. A mapping failure
> >>> would be fatal. A side effect of such behavior is that some MMIO pages
> >>> would not be mapped silently.
> >>>
> >>> However we are going to change MSIX BAR handling so we will end having
> >>> non-aligned sections in vfio_memory_listener (more details are in
> >>> the next patch) and vfio_dma_map will exit QEMU.
> >>>
> >>> In order to avoid fatal failures on what previously was not a failure 
> >>> and
> >>> was just silently ignored, this checks the section alignment to
> >>> the smallest supported IOMMU page size and prints an error if not 
> >>> aligned;
> >>> it also prints an error if vfio_dma_map failed despite the page size 
> >>> check.
> >>> Both errors are not fatal; only MMIO RAM regions are checked
> >>> (aka "RAM device" regions).
> >>>
> >>> If the amount of errors printed is overwhelming, the MSIX relocation
> >>> could be used to avoid excessive error output.
> >>>
> >>> This is unlikely to cause any behavioral change.
> >>>
> >>> Signed-off-by: Alexey Kardashevskiy   
> >>
> >> There are some relatively superficial problems noted below.
> >>
> >> But more fundamentally, this feels like it's extending an existing
> >> hack past the point of usefulness.
> >>
> >> The explicit check for is_ram_device() here has always bothered me -
> >> it's not like a real bus bridge magically knows whether a target
> >> address maps to RAM or not.
> >>
> >> What I think is really going on is that even for systems without an
> >> IOMMU, it's not really true to say that the PCI address space maps
> >> directly onto address_space_memory.  Instead, there's a large, but
> >> much less than 2^64 sized, "upstream window" at address 0 on the PCI
> >> bus, which is identity mapped to the system bus.  Details will vary
> >> with the system, but in practice we expect nothing but RAM to be in
> >> that window.  Addresses not within that window won't be mapped to the
> >> system bus but will just be broadcast on the PCI bus and might be
> >> picked up as a p2p transaction.  
> >
> > Currently this p2p works only via the IOMMU, direct p2p is not possible 
> > as
> > the guest needs to know physical MMIO addresses to make p2p work and it
> > does not.
> 
>  /me points to the Direct Translated P2P section of the ACS spec, though
>  it's as prone to spoofing by the device as ATS.  In any case, p2p
>  reflected from the IOMMU is still p2p and offloads the CPU even if
>  bandwidth suffers vs bare metal depending on if the data doubles back
>  over any links.  Thanks,
> >>>
> >>> Sure, I was just saying that p2p via IOMMU won't be as simple as broadcast
> >>> on the PCI bus, IOMMU needs to be programmed in advance to make this work,
> >>> and current that broadcast won't work for the passed through devices.
> >>
> >> Well, sure, p2p in a guest with passthrough devices clearly needs to
> >> be translated through the IOMMU (and p2p from a passthrough to an
> >> emulated device is essentially impossible).
> >>
> >> But.. what does that have to do with this code.  This is the memory
> >> area watcher, looking for memory regions being mapped directly into
> >> the PCI space.  NOT IOMMU regions, since those are handled separately
> >> by wiring up the IOMMU notifier.  This will only trigger if RAM-like,
> >> non-RAM regions are put into PCI space *not* behind an IOMMMU.
> > 
> > Duh, sorry, realised I was mixing up host and guest IOMMU.  I guess
> > the point here is that this will map RAM-like devices into the host
> > IOMMU when there is no guest IOMMU, allowing p2p transactions between
> > passthrough devices (though not from passthrough to emulated devices).
> 
> Correct.
> 
> > 
> > The conditions still seem kind of awkward to me, but I guess it makes
> > sense.
> 
> Is it the time to split this listener to RAM-listener and PCI bus listener?

I'm not really sure what you mean by that.

> On x86 it listens on the "memory" AS, on spapr - on the
> 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-13 Thread Alexey Kardashevskiy
On 13/02/18 16:41, David Gibson wrote:
> On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:
>> On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy wrote:
>>> On 13/02/18 03:06, Alex Williamson wrote:
 On Mon, 12 Feb 2018 18:05:54 +1100
 Alexey Kardashevskiy  wrote:

> On 12/02/18 16:19, David Gibson wrote:
>> On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy wrote:  
>>> At the moment if vfio_memory_listener is registered in the system memory
>>> address space, it maps/unmaps every RAM memory region for DMA.
>>> It expects system page size aligned memory sections so vfio_dma_map
>>> would not fail and so far this has been the case. A mapping failure
>>> would be fatal. A side effect of such behavior is that some MMIO pages
>>> would not be mapped silently.
>>>
>>> However we are going to change MSIX BAR handling so we will end having
>>> non-aligned sections in vfio_memory_listener (more details is in
>>> the next patch) and vfio_dma_map will exit QEMU.
>>>
>>> In order to avoid fatal failures on what previously was not a failure 
>>> and
>>> was just silently ignored, this checks the section alignment to
>>> the smallest supported IOMMU page size and prints an error if not 
>>> aligned;
>>> it also prints an error if vfio_dma_map failed despite the page size 
>>> check.
>>> Both errors are not fatal; only MMIO RAM regions are checked
>>> (aka "RAM device" regions).
>>>
>>> If the amount of errors printed is overwhelming, the MSIX relocation
>>> could be used to avoid excessive error output.
>>>
>>> This is unlikely to cause any behavioral change.
>>>
>>> Signed-off-by: Alexey Kardashevskiy   
>>
>> There are some relatively superficial problems noted below.
>>
>> But more fundamentally, this feels like it's extending an existing
>> hack past the point of usefulness.
>>
>> The explicit check for is_ram_device() here has always bothered me -
>> it's not like a real bus bridge magically knows whether a target
>> address maps to RAM or not.
>>
>> What I think is really going on is that even for systems without an
>> IOMMU, it's not really true to say that the PCI address space maps
>> directly onto address_space_memory.  Instead, there's a large, but
>> much less than 2^64 sized, "upstream window" at address 0 on the PCI
>> bus, which is identity mapped to the system bus.  Details will vary
>> with the system, but in practice we expect nothing but RAM to be in
>> that window.  Addresses not within that window won't be mapped to the
>> system bus but will just be broadcast on the PCI bus and might be
>> picked up as a p2p transaction.  
>
> Currently this p2p works only via the IOMMU, direct p2p is not possible as
> the guest needs to know physical MMIO addresses to make p2p work and it
> does not.

 /me points to the Direct Translated P2P section of the ACS spec, though
 it's as prone to spoofing by the device as ATS.  In any case, p2p
 reflected from the IOMMU is still p2p and offloads the CPU even if
 bandwidth suffers vs bare metal depending on if the data doubles back
 over any links.  Thanks,
>>>
>>> Sure, I was just saying that p2p via IOMMU won't be as simple as broadcast
>>> on the PCI bus, IOMMU needs to be programmed in advance to make this work,
>>> and current that broadcast won't work for the passed through devices.
>>
>> Well, sure, p2p in a guest with passthrough devices clearly needs to
>> be translated through the IOMMU (and p2p from a passthrough to an
>> emulated device is essentially impossible).
>>
>> But.. what does that have to do with this code.  This is the memory
>> area watcher, looking for memory regions being mapped directly into
>> the PCI space.  NOT IOMMU regions, since those are handled separately
>> by wiring up the IOMMU notifier.  This will only trigger if RAM-like,
>> non-RAM regions are put into PCI space *not* behind an IOMMMU.
> 
> Duh, sorry, realised I was mixing up host and guest IOMMU.  I guess
> the point here is that this will map RAM-like devices into the host
> IOMMU when there is no guest IOMMU, allowing p2p transactions between
> passthrough devices (though not from passthrough to emulated devices).

Correct.

> 
> The conditions still seem kind of awkward to me, but I guess it makes
> sense.

Is it the time to split this listener to RAM-listener and PCI bus listener?

On x86 it listens on the "memory" AS, on spapr - on the
"pci@8002000" AS, this will just create more confusion over time...



-- 
Alexey



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-12 Thread David Gibson
On Tue, Feb 13, 2018 at 04:36:30PM +1100, David Gibson wrote:
> On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy wrote:
> > On 13/02/18 03:06, Alex Williamson wrote:
> > > On Mon, 12 Feb 2018 18:05:54 +1100
> > > Alexey Kardashevskiy  wrote:
> > > 
> > >> On 12/02/18 16:19, David Gibson wrote:
> > >>> On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy wrote:  
> >  At the moment if vfio_memory_listener is registered in the system 
> >  memory
> >  address space, it maps/unmaps every RAM memory region for DMA.
> >  It expects system page size aligned memory sections so vfio_dma_map
> >  would not fail and so far this has been the case. A mapping failure
> >  would be fatal. A side effect of such behavior is that some MMIO pages
> >  would not be mapped silently.
> > 
> >  However we are going to change MSIX BAR handling so we will end having
> >  non-aligned sections in vfio_memory_listener (more details is in
> >  the next patch) and vfio_dma_map will exit QEMU.
> > 
> >  In order to avoid fatal failures on what previously was not a failure 
> >  and
> >  was just silently ignored, this checks the section alignment to
> >  the smallest supported IOMMU page size and prints an error if not 
> >  aligned;
> >  it also prints an error if vfio_dma_map failed despite the page size 
> >  check.
> >  Both errors are not fatal; only MMIO RAM regions are checked
> >  (aka "RAM device" regions).
> > 
> >  If the amount of errors printed is overwhelming, the MSIX relocation
> >  could be used to avoid excessive error output.
> > 
> >  This is unlikely to cause any behavioral change.
> > 
> >  Signed-off-by: Alexey Kardashevskiy   
> > >>>
> > >>> There are some relatively superficial problems noted below.
> > >>>
> > >>> But more fundamentally, this feels like it's extending an existing
> > >>> hack past the point of usefulness.
> > >>>
> > >>> The explicit check for is_ram_device() here has always bothered me -
> > >>> it's not like a real bus bridge magically knows whether a target
> > >>> address maps to RAM or not.
> > >>>
> > >>> What I think is really going on is that even for systems without an
> > >>> IOMMU, it's not really true to say that the PCI address space maps
> > >>> directly onto address_space_memory.  Instead, there's a large, but
> > >>> much less than 2^64 sized, "upstream window" at address 0 on the PCI
> > >>> bus, which is identity mapped to the system bus.  Details will vary
> > >>> with the system, but in practice we expect nothing but RAM to be in
> > >>> that window.  Addresses not within that window won't be mapped to the
> > >>> system bus but will just be broadcast on the PCI bus and might be
> > >>> picked up as a p2p transaction.  
> > >>
> > >> Currently this p2p works only via the IOMMU, direct p2p is not possible 
> > >> as
> > >> the guest needs to know physical MMIO addresses to make p2p work and it
> > >> does not.
> > > 
> > > /me points to the Direct Translated P2P section of the ACS spec, though
> > > it's as prone to spoofing by the device as ATS.  In any case, p2p
> > > reflected from the IOMMU is still p2p and offloads the CPU even if
> > > bandwidth suffers vs bare metal depending on if the data doubles back
> > > over any links.  Thanks,
> > 
> > Sure, I was just saying that p2p via IOMMU won't be as simple as broadcast
> > on the PCI bus, IOMMU needs to be programmed in advance to make this work,
> > and current that broadcast won't work for the passed through devices.
> 
> Well, sure, p2p in a guest with passthrough devices clearly needs to
> be translated through the IOMMU (and p2p from a passthrough to an
> emulated device is essentially impossible).
> 
> But.. what does that have to do with this code.  This is the memory
> area watcher, looking for memory regions being mapped directly into
> the PCI space.  NOT IOMMU regions, since those are handled separately
> by wiring up the IOMMU notifier.  This will only trigger if RAM-like,
> non-RAM regions are put into PCI space *not* behind an IOMMMU.

Duh, sorry, realised I was mixing up host and guest IOMMU.  I guess
the point here is that this will map RAM-like devices into the host
IOMMU when there is no guest IOMMU, allowing p2p transactions between
passthrough devices (though not from passthrough to emulated devices).

The conditions still seem kind of awkward to me, but I guess it makes
sense.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-12 Thread David Gibson
On Tue, Feb 13, 2018 at 12:15:52PM +1100, Alexey Kardashevskiy wrote:
> On 13/02/18 03:06, Alex Williamson wrote:
> > On Mon, 12 Feb 2018 18:05:54 +1100
> > Alexey Kardashevskiy  wrote:
> > 
> >> On 12/02/18 16:19, David Gibson wrote:
> >>> On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy wrote:  
>  At the moment if vfio_memory_listener is registered in the system memory
>  address space, it maps/unmaps every RAM memory region for DMA.
>  It expects system page size aligned memory sections so vfio_dma_map
>  would not fail and so far this has been the case. A mapping failure
>  would be fatal. A side effect of such behavior is that some MMIO pages
>  would not be mapped silently.
> 
>  However we are going to change MSIX BAR handling so we will end having
>  non-aligned sections in vfio_memory_listener (more details is in
>  the next patch) and vfio_dma_map will exit QEMU.
> 
>  In order to avoid fatal failures on what previously was not a failure and
>  was just silently ignored, this checks the section alignment to
>  the smallest supported IOMMU page size and prints an error if not 
>  aligned;
>  it also prints an error if vfio_dma_map failed despite the page size 
>  check.
>  Both errors are not fatal; only MMIO RAM regions are checked
>  (aka "RAM device" regions).
> 
>  If the amount of errors printed is overwhelming, the MSIX relocation
>  could be used to avoid excessive error output.
> 
>  This is unlikely to cause any behavioral change.
> 
>  Signed-off-by: Alexey Kardashevskiy   
> >>>
> >>> There are some relatively superficial problems noted below.
> >>>
> >>> But more fundamentally, this feels like it's extending an existing
> >>> hack past the point of usefulness.
> >>>
> >>> The explicit check for is_ram_device() here has always bothered me -
> >>> it's not like a real bus bridge magically knows whether a target
> >>> address maps to RAM or not.
> >>>
> >>> What I think is really going on is that even for systems without an
> >>> IOMMU, it's not really true to say that the PCI address space maps
> >>> directly onto address_space_memory.  Instead, there's a large, but
> >>> much less than 2^64 sized, "upstream window" at address 0 on the PCI
> >>> bus, which is identity mapped to the system bus.  Details will vary
> >>> with the system, but in practice we expect nothing but RAM to be in
> >>> that window.  Addresses not within that window won't be mapped to the
> >>> system bus but will just be broadcast on the PCI bus and might be
> >>> picked up as a p2p transaction.  
> >>
> >> Currently this p2p works only via the IOMMU, direct p2p is not possible as
> >> the guest needs to know physical MMIO addresses to make p2p work and it
> >> does not.
> > 
> > /me points to the Direct Translated P2P section of the ACS spec, though
> > it's as prone to spoofing by the device as ATS.  In any case, p2p
> > reflected from the IOMMU is still p2p and offloads the CPU even if
> > bandwidth suffers vs bare metal depending on if the data doubles back
> > over any links.  Thanks,
> 
> Sure, I was just saying that p2p via IOMMU won't be as simple as broadcast
> on the PCI bus, IOMMU needs to be programmed in advance to make this work,
> and current that broadcast won't work for the passed through devices.

Well, sure, p2p in a guest with passthrough devices clearly needs to
be translated through the IOMMU (and p2p from a passthrough to an
emulated device is essentially impossible).

But.. what does that have to do with this code.  This is the memory
area watcher, looking for memory regions being mapped directly into
the PCI space.  NOT IOMMU regions, since those are handled separately
by wiring up the IOMMU notifier.  This will only trigger if RAM-like,
> non-RAM regions are put into PCI space *not* behind an IOMMU.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-12 Thread Alexey Kardashevskiy
On 13/02/18 03:06, Alex Williamson wrote:
> On Mon, 12 Feb 2018 18:05:54 +1100
> Alexey Kardashevskiy  wrote:
> 
>> On 12/02/18 16:19, David Gibson wrote:
>>> On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy wrote:  
 At the moment if vfio_memory_listener is registered in the system memory
 address space, it maps/unmaps every RAM memory region for DMA.
 It expects system page size aligned memory sections so vfio_dma_map
 would not fail and so far this has been the case. A mapping failure
 would be fatal. A side effect of such behavior is that some MMIO pages
 would not be mapped silently.

 However we are going to change MSIX BAR handling so we will end having
 non-aligned sections in vfio_memory_listener (more details is in
 the next patch) and vfio_dma_map will exit QEMU.

 In order to avoid fatal failures on what previously was not a failure and
 was just silently ignored, this checks the section alignment to
 the smallest supported IOMMU page size and prints an error if not aligned;
 it also prints an error if vfio_dma_map failed despite the page size check.
 Both errors are not fatal; only MMIO RAM regions are checked
 (aka "RAM device" regions).

 If the amount of errors printed is overwhelming, the MSIX relocation
 could be used to avoid excessive error output.

 This is unlikely to cause any behavioral change.

 Signed-off-by: Alexey Kardashevskiy   
>>>
>>> There are some relatively superficial problems noted below.
>>>
>>> But more fundamentally, this feels like it's extending an existing
>>> hack past the point of usefulness.
>>>
>>> The explicit check for is_ram_device() here has always bothered me -
>>> it's not like a real bus bridge magically knows whether a target
>>> address maps to RAM or not.
>>>
>>> What I think is really going on is that even for systems without an
>>> IOMMU, it's not really true to say that the PCI address space maps
>>> directly onto address_space_memory.  Instead, there's a large, but
>>> much less than 2^64 sized, "upstream window" at address 0 on the PCI
>>> bus, which is identity mapped to the system bus.  Details will vary
>>> with the system, but in practice we expect nothing but RAM to be in
>>> that window.  Addresses not within that window won't be mapped to the
>>> system bus but will just be broadcast on the PCI bus and might be
>>> picked up as a p2p transaction.  
>>
>> Currently this p2p works only via the IOMMU, direct p2p is not possible as
>> the guest needs to know physical MMIO addresses to make p2p work and it
>> does not.
> 
> /me points to the Direct Translated P2P section of the ACS spec, though
> it's as prone to spoofing by the device as ATS.  In any case, p2p
> reflected from the IOMMU is still p2p and offloads the CPU even if
> bandwidth suffers vs bare metal depending on if the data doubles back
> over any links.  Thanks,

Sure, I was just saying that p2p via IOMMU won't be as simple as broadcast
on the PCI bus, IOMMU needs to be programmed in advance to make this work,
and currently that broadcast won't work for the passed-through devices.



-- 
Alexey



Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-12 Thread Alex Williamson
On Mon, 12 Feb 2018 18:05:54 +1100
Alexey Kardashevskiy  wrote:

> On 12/02/18 16:19, David Gibson wrote:
> > On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy wrote:  
> >> At the moment if vfio_memory_listener is registered in the system memory
> >> address space, it maps/unmaps every RAM memory region for DMA.
> >> It expects system page size aligned memory sections so vfio_dma_map
> >> would not fail and so far this has been the case. A mapping failure
> >> would be fatal. A side effect of such behavior is that some MMIO pages
> >> would not be mapped silently.
> >>
> >> However we are going to change MSIX BAR handling so we will end having
> >> non-aligned sections in vfio_memory_listener (more details is in
> >> the next patch) and vfio_dma_map will exit QEMU.
> >>
> >> In order to avoid fatal failures on what previously was not a failure and
> >> was just silently ignored, this checks the section alignment to
> >> the smallest supported IOMMU page size and prints an error if not aligned;
> >> it also prints an error if vfio_dma_map failed despite the page size check.
> >> Both errors are not fatal; only MMIO RAM regions are checked
> >> (aka "RAM device" regions).
> >>
> >> If the amount of errors printed is overwhelming, the MSIX relocation
> >> could be used to avoid excessive error output.
> >>
> >> This is unlikely to cause any behavioral change.
> >>
> >> Signed-off-by: Alexey Kardashevskiy   
> > 
> > There are some relatively superficial problems noted below.
> > 
> > But more fundamentally, this feels like it's extending an existing
> > hack past the point of usefulness.
> > 
> > The explicit check for is_ram_device() here has always bothered me -
> > it's not like a real bus bridge magically knows whether a target
> > address maps to RAM or not.
> > 
> > What I think is really going on is that even for systems without an
> > IOMMU, it's not really true to say that the PCI address space maps
> > directly onto address_space_memory.  Instead, there's a large, but
> > much less than 2^64 sized, "upstream window" at address 0 on the PCI
> > bus, which is identity mapped to the system bus.  Details will vary
> > with the system, but in practice we expect nothing but RAM to be in
> > that window.  Addresses not within that window won't be mapped to the
> > system bus but will just be broadcast on the PCI bus and might be
> > picked up as a p2p transaction.  
> 
> Currently this p2p works only via the IOMMU, direct p2p is not possible as
> the guest needs to know physical MMIO addresses to make p2p work and it
> does not.

/me points to the Direct Translated P2P section of the ACS spec, though
it's as prone to spoofing by the device as ATS.  In any case, p2p
reflected from the IOMMU is still p2p and offloads the CPU even if
bandwidth suffers vs bare metal depending on if the data doubles back
over any links.  Thanks,

Alex



Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-11 Thread Alexey Kardashevskiy
On 12/02/18 16:19, David Gibson wrote:
> On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy wrote:
>> At the moment if vfio_memory_listener is registered in the system memory
>> address space, it maps/unmaps every RAM memory region for DMA.
>> It expects system page size aligned memory sections so vfio_dma_map
>> would not fail and so far this has been the case. A mapping failure
>> would be fatal. A side effect of such behavior is that some MMIO pages
>> would not be mapped silently.
>>
>> However we are going to change MSIX BAR handling so we will end having
>> non-aligned sections in vfio_memory_listener (more details is in
>> the next patch) and vfio_dma_map will exit QEMU.
>>
>> In order to avoid fatal failures on what previously was not a failure and
>> was just silently ignored, this checks the section alignment to
>> the smallest supported IOMMU page size and prints an error if not aligned;
>> it also prints an error if vfio_dma_map failed despite the page size check.
>> Both errors are not fatal; only MMIO RAM regions are checked
>> (aka "RAM device" regions).
>>
>> If the amount of errors printed is overwhelming, the MSIX relocation
>> could be used to avoid excessive error output.
>>
>> This is unlikely to cause any behavioral change.
>>
>> Signed-off-by: Alexey Kardashevskiy 
> 
> There are some relatively superficial problems noted below.
> 
> But more fundamentally, this feels like it's extending an existing
> hack past the point of usefulness.
> 
> The explicit check for is_ram_device() here has always bothered me -
> it's not like a real bus bridge magically knows whether a target
> address maps to RAM or not.
> 
> What I think is really going on is that even for systems without an
> IOMMU, it's not really true to say that the PCI address space maps
> directly onto address_space_memory.  Instead, there's a large, but
> much less than 2^64 sized, "upstream window" at address 0 on the PCI
> bus, which is identity mapped to the system bus.  Details will vary
> with the system, but in practice we expect nothing but RAM to be in
> that window.  Addresses not within that window won't be mapped to the
> system bus but will just be broadcast on the PCI bus and might be
> picked up as a p2p transaction.

Currently this p2p works only via the IOMMU, direct p2p is not possible as
the guest needs to know physical MMIO addresses to make p2p work and it
does not.


> Actually on classic PC, I suspect
> there may be two windows, below and above the "ISA IO hole".
> 
> With PCIe it gets more complicated, of course.  I still suspect
> there's some sort of upstream window to the host, but whether things
> outside the window get reflected back down the PCIe heirarchy will
> depend on those p2p relevant configuration parameters.
> 
> Maybe it's time we had a detailed look at what really happens in
> physical bridges, rather than faking it with the is_ram_device()
> check?
> 
> Anyway, that said, the patch below might still be a reasonable interim
> hack, once the smaller problems are fixed.
> 
>> ---
>>  hw/vfio/common.c | 55 
>> +--
>>  1 file changed, 49 insertions(+), 6 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index f895e3c..736f271 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -544,18 +544,40 @@ static void vfio_listener_region_add(MemoryListener 
>> *listener,
>>  
>>  llsize = int128_sub(llend, int128_make64(iova));
>>  
>> +if (memory_region_is_ram_device(section->mr)) {
>> +hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
>> +
>> +if ((iova & pgmask) || (llsize & pgmask)) {
>> +error_report("Region 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx
>> + " is not aligned to 0x%"HWADDR_PRIx
>> + " and cannot be mapped for DMA",
>> + section->offset_within_region,
>> + int128_getlo(section->size),
>> + pgmask + 1);
>> +return;
>> +}
>> +}
>> +
>>  ret = vfio_dma_map(container, iova, int128_get64(llsize),
>> vaddr, section->readonly);
>>  if (ret) {
>>  error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
>>   "0x%"HWADDR_PRIx", %p) = %d (%m)",
>>   container, iova, int128_get64(llsize), vaddr, ret);
>> +if (memory_region_is_ram_device(section->mr)) {
>> +/* Allow unexpected mappings not to be fatal for RAM devices */
>> +return;
>> +}
>>  goto fail;
>>  }
>>  
>>  return;
>>  
>>  fail:
>> +if (memory_region_is_ram_device(section->mr)) {
>> +error_report("failed to vfio_dma_map. pci p2p may not work");
> 
> Isn't this logic exactly backwards?  p2p will be affected when a *non
> RAM* device (itself probably a PCI MMIO window) can't be mapped in.


"RAM device MR" 

Re: [Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-11 Thread David Gibson
On Fri, Feb 09, 2018 at 06:55:01PM +1100, Alexey Kardashevskiy wrote:
> At the moment if vfio_memory_listener is registered in the system memory
> address space, it maps/unmaps every RAM memory region for DMA.
> It expects system page size aligned memory sections so vfio_dma_map
> would not fail and so far this has been the case. A mapping failure
> would be fatal. A side effect of such behavior is that some MMIO pages
> would not be mapped silently.
> 
> However we are going to change MSIX BAR handling so we will end having
> non-aligned sections in vfio_memory_listener (more details is in
> the next patch) and vfio_dma_map will exit QEMU.
> 
> In order to avoid fatal failures on what previously was not a failure and
> was just silently ignored, this checks the section alignment to
> the smallest supported IOMMU page size and prints an error if not aligned;
> it also prints an error if vfio_dma_map failed despite the page size check.
> Both errors are not fatal; only MMIO RAM regions are checked
> (aka "RAM device" regions).
> 
> If the amount of errors printed is overwhelming, the MSIX relocation
> could be used to avoid excessive error output.
> 
> This is unlikely to cause any behavioral change.
> 
> Signed-off-by: Alexey Kardashevskiy 

There are some relatively superficial problems noted below.

But more fundamentally, this feels like it's extending an existing
hack past the point of usefulness.

The explicit check for is_ram_device() here has always bothered me -
it's not like a real bus bridge magically knows whether a target
address maps to RAM or not.

What I think is really going on is that even for systems without an
IOMMU, it's not really true to say that the PCI address space maps
directly onto address_space_memory.  Instead, there's a large, but
much less than 2^64 sized, "upstream window" at address 0 on the PCI
bus, which is identity mapped to the system bus.  Details will vary
with the system, but in practice we expect nothing but RAM to be in
that window.  Addresses not within that window won't be mapped to the
system bus but will just be broadcast on the PCI bus and might be
picked up as a p2p transaction.  Actually on classic PC, I suspect
there may be two windows, below and above the "ISA IO hole".

With PCIe it gets more complicated, of course.  I still suspect
there's some sort of upstream window to the host, but whether things
outside the window get reflected back down the PCIe hierarchy will
depend on those p2p relevant configuration parameters.

Maybe it's time we had a detailed look at what really happens in
physical bridges, rather than faking it with the is_ram_device()
check?

Anyway, that said, the patch below might still be a reasonable interim
hack, once the smaller problems are fixed.

> ---
>  hw/vfio/common.c | 55 +--
>  1 file changed, 49 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index f895e3c..736f271 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -544,18 +544,40 @@ static void vfio_listener_region_add(MemoryListener 
> *listener,
>  
>  llsize = int128_sub(llend, int128_make64(iova));
>  
> +if (memory_region_is_ram_device(section->mr)) {
> +hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
> +
> +if ((iova & pgmask) || (llsize & pgmask)) {
> +error_report("Region 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx
> + " is not aligned to 0x%"HWADDR_PRIx
> + " and cannot be mapped for DMA",
> + section->offset_within_region,
> + int128_getlo(section->size),
> + pgmask + 1);
> +return;
> +}
> +}
> +
>  ret = vfio_dma_map(container, iova, int128_get64(llsize),
> vaddr, section->readonly);
>  if (ret) {
>  error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
>   "0x%"HWADDR_PRIx", %p) = %d (%m)",
>   container, iova, int128_get64(llsize), vaddr, ret);
> +if (memory_region_is_ram_device(section->mr)) {
> +/* Allow unexpected mappings not to be fatal for RAM devices */
> +return;
> +}
>  goto fail;
>  }
>  
>  return;
>  
>  fail:
> +if (memory_region_is_ram_device(section->mr)) {
> +error_report("failed to vfio_dma_map. pci p2p may not work");

Isn't this logic exactly backwards?  p2p will be affected when a *non
RAM* device (itself probably a PCI MMIO window) can't be mapped in.

> +return;
> +}
>  /*
>   * On the initfn path, store the first error in the container so we
>   * can gracefully fail.  Runtime, there's not much we can do other
> @@ -577,6 +599,7 @@ static void vfio_listener_region_del(MemoryListener 
> *listener,
>  hwaddr iova, end;
>  Int128 llend, llsize;
>  int ret;
> +bool 

[Qemu-devel] [PATCH qemu v7 2/4] vfio/pci: Relax DMA map errors for MMIO regions

2018-02-08 Thread Alexey Kardashevskiy
At the moment if vfio_memory_listener is registered in the system memory
address space, it maps/unmaps every RAM memory region for DMA.
It expects system page size aligned memory sections so vfio_dma_map
would not fail and so far this has been the case. A mapping failure
would be fatal. A side effect of such behavior is that some MMIO pages
would not be mapped silently.

However we are going to change MSIX BAR handling so we will end up having
non-aligned sections in vfio_memory_listener (more details are in
the next patch) and vfio_dma_map will exit QEMU.

In order to avoid fatal failures on what previously was not a failure and
was just silently ignored, this checks the section alignment to
the smallest supported IOMMU page size and prints an error if not aligned;
it also prints an error if vfio_dma_map failed despite the page size check.
Both errors are not fatal; only MMIO RAM regions are checked
(aka "RAM device" regions).

If the number of errors printed is overwhelming, the MSIX relocation
could be used to avoid excessive error output.

This is unlikely to cause any behavioral change.

Signed-off-by: Alexey Kardashevskiy 
---
 hw/vfio/common.c | 55 +--
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index f895e3c..736f271 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -544,18 +544,40 @@ static void vfio_listener_region_add(MemoryListener *listener,
 
 llsize = int128_sub(llend, int128_make64(iova));
 
+if (memory_region_is_ram_device(section->mr)) {
+hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+
+if ((iova & pgmask) || (llsize & pgmask)) {
+error_report("Region 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx
+ " is not aligned to 0x%"HWADDR_PRIx
+ " and cannot be mapped for DMA",
+ section->offset_within_region,
+ int128_getlo(section->size),
+ pgmask + 1);
+return;
+}
+}
+
 ret = vfio_dma_map(container, iova, int128_get64(llsize),
vaddr, section->readonly);
 if (ret) {
 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
  "0x%"HWADDR_PRIx", %p) = %d (%m)",
  container, iova, int128_get64(llsize), vaddr, ret);
+if (memory_region_is_ram_device(section->mr)) {
+/* Allow unexpected mappings not to be fatal for RAM devices */
+return;
+}
 goto fail;
 }
 
 return;
 
 fail:
+if (memory_region_is_ram_device(section->mr)) {
+error_report("failed to vfio_dma_map. pci p2p may not work");
+return;
+}
 /*
  * On the initfn path, store the first error in the container so we
  * can gracefully fail.  Runtime, there's not much we can do other
@@ -577,6 +599,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
 hwaddr iova, end;
 Int128 llend, llsize;
 int ret;
+bool try_unmap = true;
 
 if (vfio_listener_skipped_section(section)) {
 trace_vfio_listener_region_del_skip(
@@ -629,13 +652,33 @@ static void vfio_listener_region_del(MemoryListener *listener,
 
 trace_vfio_listener_region_del(iova, end);
 
-ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
+if (memory_region_is_ram_device(section->mr)) {
+hwaddr pgmask;
+VFIOHostDMAWindow *hostwin;
+bool hostwin_found = false;
+
+QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
+hostwin_found = true;
+break;
+}
+}
+assert(hostwin_found); /* or region_add() would have failed */
+
+pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+try_unmap = !((iova & pgmask) || (llsize & pgmask));
+}
+
+if (try_unmap) {
+ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
+if (ret) {
+error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+ "0x%"HWADDR_PRIx") = %d (%m)",
+ container, iova, int128_get64(llsize), ret);
+}
+}
+
 memory_region_unref(section->mr);
-if (ret) {
-error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx") = %d (%m)",
- container, iova, int128_get64(llsize), ret);
-}
 
 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
 vfio_spapr_remove_window(container,
-- 
2.11.0