Re: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Joao Martins
On 11/09/2023 19:35, Alex Williamson wrote:
> On Mon, 11 Sep 2023 11:12:55 +0100
> Joao Martins  wrote:
> 
>> On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>>> -Original Message-
>>>> From: Joao Martins 
>>>> Sent: Monday, September 11, 2023 5:07 PM
>>>> Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>>
>>>> On 11/09/2023 09:57, Duan, Zhenzhong wrote:  
>>>>>> -Original Message-
>>>>>> From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org On Behalf Of Joao
>>>>>> Martins
>>>>>> Sent: Friday, September 8, 2023 5:30 PM
>>>>>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>>>>
>>>>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>>>>>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
>>>>>> QEMU includes in the 64-bit range the RAM regions at the lower part
>>>>>> and vfio-pci device RAM regions which are at the top of the address
>>>>>> space. This range contains a large gap and the size can be bigger than
>>>>>> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>>>>>>
>>>>>> To avoid such large ranges, introduce a new PCI range covering the
>>>>>> vfio-pci device RAM regions, but only if the addresses are above 4GB,
>>>>>> to avoid breaking potential SeaBIOS guests.
>>>>>>
>>>>>> Signed-off-by: Joao Martins 
>>>>>> [ clg: - wrote commit log
>>>>>>   - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
>>>>>> Signed-off-by: Cédric Le Goater 
>>>>>> [...]

Re: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Alex Williamson
On Mon, 11 Sep 2023 11:12:55 +0100
Joao Martins  wrote:

> On 11/09/2023 10:48, Duan, Zhenzhong wrote:
> [...]

RE: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Duan, Zhenzhong
>-Original Message-
>From: Joao Martins 
>Sent: Monday, September 11, 2023 6:20 PM
>To: Duan, Zhenzhong ; qemu-devel@nongnu.org
>Cc: Alex Williamson ; Cedric Le Goater
>; Avihai Horon ; Gerd Hoffmann
>
>Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>>>> +                                     VFIOContainer *container)
>>>>> +{
>>>>> +    VFIOPCIDevice *pcidev;
>>>>> +    VFIODevice *vbasedev;
>>>>> +    VFIOGroup *group;
>>>>> +    Object *owner;
>>>>> +
>>>>> +    owner = memory_region_owner(section->mr);
>>>>> +
>>>>> +    QLIST_FOREACH(group, &container->group_list, container_next) {
>>>>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>>>>> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>>>>> +                continue;
>>>>> +            }
>>>>> +            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>>>> +            if (OBJECT(pcidev) == owner) {
>>>>> +                return true;
>>>>> +            }
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    return false;
>>>>> +}
>>>>
>>>> What about simplifying it with memory_region_is_ram_device()?
>>>> This way vdpa devices could also be included.
>>>>
>>>
>>> Note that the check is not interested in RAM (or any other kind of memory
>>> like VGA); that's covered by the 32/64-bit ranges. It is rather interested
>>> in any PCI device RAM that would fall in the 64-bit PCI hole. Would
>>> memory_region_is_ram_device() really cover it? If so, I am all for the
>>> simplification.
>>
>> Ram device is used not only by vfio-pci BARs but also by the host notifiers
>> of vdpa and vhost-user.
>
>My only concern is whether this is all part of the pci-hole64 or not, e.g. if
>we expand to the general memory_region_is_ram_device() check, would we go back
>to the initial bug where we create an enormous range.

OK, I have no idea whether memory_region_is_ram_device() will be expanded for
other usage in the future. Anyway, it looks safer to keep your code.

> The latter that you mentioned should be mainly virtio-net devices as
> presented to the guest (regardless of whether the backend is vdpa or
> vhost-user), and perhaps they are all in the 32-bit PCI hole?
Agreed, that's possible.

Thanks
Zhenzhong
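
For reference, the simplification discussed above would replace the owner walk
with a single predicate on the region type. A minimal sketch, assuming only the
stock memory_region_is_ram_device() helper from QEMU's memory API (the wrapper
name here is hypothetical, not code from the thread):

static bool vfio_section_is_device_ram(MemoryRegionSection *section)
{
    /*
     * ram_device regions back vfio-pci BARs, but also the host notifiers
     * of vdpa and vhost-user -- the over-matching concern raised above.
     */
    return memory_region_is_ram_device(section->mr);
}

As the exchange concludes, the merged patch keeps the per-container owner walk
instead, so only regions owned by a vfio-pci device on the same container are
diverted into the separate PCI range.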


RE: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Duan, Zhenzhong


>-Original Message-
>From: Joao Martins 
>Sent: Monday, September 11, 2023 6:13 PM
>Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>> -Original Message-
>>> From: Joao Martins 
>>> Sent: Monday, September 11, 2023 5:07 PM
>>> Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
...
>> I have another question: previously I thought vfio-pci BARs are device state,
>> saved/restored through the VFIO migration protocol, so we don't need to dirty
>> track them. Do I understand wrong?
>
>The general thinking of device dirty tracking is to track all addressable
>IOVAs. But you do raise a good question. My understanding is that migrating
>the BARs *as is* might be device migration specific (not a guarantee?); the
>save file and precopy interface are the only places we transfer the data
>from/to, and it's just opaque data, not BARs or anything formatted
>specifically -- so if we migrate BARs, it is hidden in whatever the device
>f/w wants to move. It might be that BARs aren't even needed, as they are sort
>of scratch space from the h/w side. Ultimately, the dirty tracker is the one
>reporting the values, and if the device h/w chooses not to report those IOVAs
>as dirty then nothing changes.

Understood, thanks Joao.


Re: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Joao Martins
On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>>> [...]
>>>
>>> What about simplifying it with memory_region_is_ram_device()?
>>> This way vdpa devices could also be included.
>>>
>>
>> Note that the check is not interested in RAM (or any other kind of memory
>> like VGA); that's covered by the 32/64-bit ranges. It is rather interested
>> in any PCI device RAM that would fall in the 64-bit PCI hole. Would
>> memory_region_is_ram_device() really cover it? If so, I am all for the
>> simplification.
> 
> Ram device is used not only by vfio-pci BARs but also by the host notifiers
> of vdpa and vhost-user.

My only concern is whether this is all part of the pci-hole64 or not, e.g. if
we expand to the general memory_region_is_ram_device() check, would we go back
to the initial bug where we create an enormous range. The latter that you
mentioned should be mainly virtio-net devices as presented to the guest
(regardless of whether the backend is vdpa or vhost-user), and perhaps they
are all in the 32-bit PCI hole?

Joao
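
To make the size concern concrete, a back-of-the-envelope sketch with assumed
(not thread-provided) numbers: if guest RAM sits just above 4G and OVMF
relocates the pci-hole64 to the top of a 46-bit address space, one combined
64-bit range must span both:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t min64 = 4ULL << 30;       /* first RAM above 4G (assumed)        */
    uint64_t maxpci64 = 1ULL << 46;    /* top of relocated pci-hole64 (assumed) */
    uint64_t combined = maxpci64 - min64;
    uint64_t mlx5_limit = 1ULL << 42;  /* MLX5 tracking limit per the commit log */

    printf("combined: %llu GiB, limit: %llu GiB\n",
           (unsigned long long)(combined >> 30),
           (unsigned long long)(mlx5_limit >> 30));
    /* ~64 TiB versus 4 TiB: splitting the vfio-pci regions into their own
     * minpci64/maxpci64 range keeps each tracked range within HW limits. */
    return 0;
}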



Re: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Joao Martins
On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>> -Original Message-
>> From: Joao Martins 
>> Sent: Monday, September 11, 2023 5:07 PM
>> Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>
>> On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>>>> -Original Message-
>>>> From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org On Behalf Of Joao
>>>> Martins
>>>> Sent: Friday, September 8, 2023 5:30 PM
>>>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>>
>>>> [...]
>>>
>>> What about simplifying it with memory_region_is_ram_device()?
>>> This way vdpa devices could also be included.
>>>
>>
>> Note that the check is not interested in RAM (or any other kind of memory
>> like VGA); that's covered by the 32/64-bit ranges. It is rather interested
>> in any PCI device RAM that would fall in the 64-bit PCI hole. Would
>> memory_region_is_ram_device() really cover it? If so, I am all for the
>> simplification.
> 
> Ram device is used not only by vfio-pci BARs but also by the host notifiers
> of vdpa and vhost-user.
> 
> I have another question: previously I thought vfio-pci BARs are device state,
> saved/restored through the VFIO migration protocol, so we don't need to dirty
> track them. Do I understand wrong?

The general thinking of device dirty tracking is to track all addressable
IOVAs. But you do raise a good question. My understanding is that migrating
the BARs *as is* might be device migration specific (not a guarantee?); the
save file and precopy interface are the only places we transfer the data
from/to, and it's just opaque data, not BARs or anything formatted
specifically -- so if we migrate BARs, it is hidden in whatever the device
f/w wants to move. It might be that BARs aren't even needed, as they are sort
of scratch space from the h/w side. Ultimately, the dirty tracker is the one
reporting the values, and if the device h/w chooses not to report those IOVAs
as dirty then nothing changes.



RE: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Duan, Zhenzhong


>-Original Message-
>From: Joao Martins 
>Sent: Monday, September 11, 2023 5:07 PM
>Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>>> -Original Message-
>>> From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org On Behalf Of Joao
>>> Martins
>>> Sent: Friday, September 8, 2023 5:30 PM
>>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>
>>> [...]
>>
>> What about simplifying it with memory_region_is_ram_device()?
>> This way vdpa devices could also be included.
>>
>
>Note that the check is not interested in RAM (or any other kind of memory like
>VGA); that's covered by the 32/64-bit ranges. It is rather interested in any
>PCI device RAM that would fall in the 64-bit PCI hole. Would
>memory_region_is_ram_device() really cover it? If so, I am all for the
>simplification.

Ram device is used not only by vfio-pci BARs but also by the host notifiers of
vdpa and vhost-user.

I have another question: previously I thought vfio-pci BARs are device state,
saved/restored through the VFIO migration protocol, so we don't need to dirty
track them. Do I understand wrong?

Thanks
Zhenzhong


Re: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Joao Martins
On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>> -Original Message-
>> From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org On Behalf Of Joao
>> Martins
>> Sent: Friday, September 8, 2023 5:30 PM
>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>
>> [...]
> 
> What about simplifying it with memory_region_is_ram_device()?
> This way vdpa devices could also be included.
> 

Note that the check is not interested in RAM (or any other kind of memory like
VGA); that's covered by the 32/64-bit ranges. It is rather interested in any
PCI device RAM that would fall in the 64-bit PCI hole. Would
memory_region_is_ram_device() really cover it? If so, I am all for the
simplification.

Joao



RE: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Duan, Zhenzhong


>-Original Message-
>From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org On Behalf Of Joao
>Martins
>Sent: Friday, September 8, 2023 5:30 PM
>Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
> [...]
>
>+static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>+                                     VFIOContainer *container)
>+{
>+    VFIOPCIDevice *pcidev;
>+    VFIODevice *vbasedev;
>+    VFIOGroup *group;
>+    Object *owner;
>+
>+    owner = memory_region_owner(section->mr);
>+
>+    QLIST_FOREACH(group, &container->group_list, container_next) {
>+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>+                continue;
>+            }
>+            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>+            if (OBJECT(pcidev) == owner) {
>+                return true;
>+            }
>+        }
>+    }
>+
>+    return false;
>+}

What about simplifying it with memory_region_is_ram_device()?
This way vdpa devices could also be included.

Thanks
Zhenzhong



Re: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-10 Thread Cédric Le Goater

On 9/8/23 11:29, Joao Martins wrote:

QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
QEMU includes in the 64-bit range the RAM regions at the lower part
and vfio-pci device RAM regions which are at the top of the address
space. This range contains a large gap and the size can be bigger than
the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).

To avoid such large ranges, introduce a new PCI range covering the
vfio-pci device RAM regions, but only if the addresses are above 4GB,
to avoid breaking potential SeaBIOS guests.

Signed-off-by: Joao Martins 
[ clg: - wrote commit log
- fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
Signed-off-by: Cédric Le Goater 
---
v2:
* s/minpci/minpci64/
* s/maxpci/maxpci64/
* Expand comment to cover the pci-hole64 and why we don't do special
   handling of pci-hole32.


This is a valuable fix for the latest OVMF, which enables a dynamic MMIO
window. Applied to vfio-next.

Thanks,

C.



[...]

Re: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-09 Thread Joao Martins



On 08/09/2023 10:29, Joao Martins wrote:
> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
> QEMU includes in the 64-bit range the RAM regions at the lower part
> and vfio-pci device RAM regions which are at the top of the address
> space. This range contains a large gap and the size can be bigger than
> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
> 
> To avoid such large ranges, introduce a new PCI range covering the
> vfio-pci device RAM regions, but only if the addresses are above 4GB,
> to avoid breaking potential SeaBIOS guests.
> 
> Signed-off-by: Joao Martins 
> [ clg: - wrote commit log
>   - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
> Signed-off-by: Cédric Le Goater 

This should have a:

Fixes: 5255bbf4ec16 ("vfio/common: Add device dirty page tracking start/stop")

> [...]

[PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-08 Thread Joao Martins
QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
QEMU includes in the 64-bit range the RAM regions at the lower part
and vfio-pci device RAM regions which are at the top of the address
space. This range contains a large gap and the size can be bigger than
the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).

To avoid such large ranges, introduce a new PCI range covering the
vfio-pci device RAM regions, but only if the addresses are above 4GB,
to avoid breaking potential SeaBIOS guests.

Signed-off-by: Joao Martins 
[ clg: - wrote commit log
   - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
Signed-off-by: Cédric Le Goater 
---
v2:
* s/minpci/minpci64/
* s/maxpci/maxpci64/
* Expand comment to cover the pci-hole64 and why we don't do special
  handling of pci-hole32.
---
 hw/vfio/common.c | 71 +---
 hw/vfio/trace-events |  2 +-
 2 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 237101d03844..134649226d43 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -27,6 +27,7 @@
 
 #include "hw/vfio/vfio-common.h"
 #include "hw/vfio/vfio.h"
+#include "hw/vfio/pci.h"
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
 #include "exec/ram_addr.h"
@@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
     hwaddr max32;
     hwaddr min64;
     hwaddr max64;
+    hwaddr minpci64;
+    hwaddr maxpci64;
 } VFIODirtyRanges;
 
 typedef struct VFIODirtyRangesListener {
@@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
     MemoryListener listener;
 } VFIODirtyRangesListener;
 
+static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
+                                     VFIOContainer *container)
+{
+    VFIOPCIDevice *pcidev;
+    VFIODevice *vbasedev;
+    VFIOGroup *group;
+    Object *owner;
+
+    owner = memory_region_owner(section->mr);
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+            if (OBJECT(pcidev) == owner) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
 static void vfio_dirty_tracking_update(MemoryListener *listener,
                                        MemoryRegionSection *section)
 {
@@ -1424,19 +1452,32 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
     }
 
     /*
-     * The address space passed to the dirty tracker is reduced to two ranges:
-     * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
+     * The address space passed to the dirty tracker is reduced to three ranges:
+     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
+     * PCI 64-bit hole.
+     *
      * The underlying reports of dirty will query a sub-interval of each of
      * these ranges.
      *
-     * The purpose of the dual range handling is to handle known cases of big
-     * holes in the address space, like the x86 AMD 1T hole. The alternative
-     * would be an IOVATree but that has a much bigger runtime overhead and
-     * unnecessary complexity.
+     * The purpose of the three range handling is to handle known cases of big
+     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
+     * OVMF) which may relocate the pci-hole64 to the end of the address space.
+     * The latter would otherwise generate large ranges for tracking, stressing
+     * the limits of supported hardware. The pci-hole32 will always be below 4G
+     * (overlapping or not) so it doesn't need special handling and is part of
+     * the 32-bit range.
+     *
+     * The alternative would be an IOVATree but that has a much bigger runtime
+     * overhead and unnecessary complexity.
      */
-    min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
-    max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
-
+    if (vfio_section_is_vfio_pci(section, dirty->container) &&
+        iova >= UINT32_MAX) {
+        min = &range->minpci64;
+        max = &range->maxpci64;
+    } else {
+        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
+        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
+    }
     if (*min > iova) {
         *min = iova;
     }
@@ -1461,6 +1502,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
     memset(&dirty, 0, sizeof(dirty));
     dirty.ranges.min32 = UINT32_MAX;
     dirty.ranges.min64 = UINT64_MAX;
+    dirty.ranges.minpci64 = UINT64_MAX;
     dirty.listener = vfio_dirty_tracking_listener;
     dirty.container = container;
 
@@ -1531,7 +1573,8 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
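
Taken together, the hunks above route every memory section into one of the
three ranges. As a standalone restatement of that selection logic (a sketch
only, with plain C types rather than QEMU's, where is_vfio_pci stands in for
the vfio_section_is_vfio_pci() owner walk and range_update for
vfio_dirty_tracking_update()):

#include <stdbool.h>
#include <stdint.h>

typedef struct {
    uint64_t min32, max32, min64, max64, minpci64, maxpci64;
} Ranges;

static void range_update(Ranges *r, uint64_t iova, uint64_t end,
                         bool is_vfio_pci)
{
    uint64_t *min, *max;

    /* vfio-pci regions above 4G get their own bucket; everything else
     * splits on the 4G boundary, exactly as in the hunk above. */
    if (is_vfio_pci && iova >= UINT32_MAX) {
        min = &r->minpci64;
        max = &r->maxpci64;
    } else {
        min = (end <= UINT32_MAX) ? &r->min32 : &r->min64;
        max = (end <= UINT32_MAX) ? &r->max32 : &r->max64;
    }
    if (*min > iova) {
        *min = iova;
    }
    if (*max < end) {
        *max = end;
    }
}

The min fields start at UINT32_MAX/UINT64_MAX (see vfio_dirty_tracking_init()
above), so a range that no section ever touched remains recognizably empty.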