RE: [PATCH v1] vfio/common: Separate vfio-pci ranges
>-----Original Message-----
>From: Joao Martins
>Sent: Monday, September 11, 2023 6:20 PM
>To: Duan, Zhenzhong ; qemu-devel@nongnu.org
>Cc: Alex Williamson ; Cedric Le Goater ; Avihai Horon ; Gerd Hoffmann
>Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>>>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>>>>> +                                     VFIOContainer *container)
...
>>>>>
>>>>> What about simplifying it with memory_region_is_ram_device()?
>>>>> This way vdpa device could also be included.
>>>>>
>>>>
>>>> Note that the check is not interested in RAM (or any other kind of memory
>>>> like VGA). That's covered in the 32-64 ranges. But rather in any PCI device
>>>> RAM that would fall in the 64-bit PCI hole. Would
>>>> memory_region_is_ram_device() really cover it? If so, I am all for the
>>>> simplification.
>>>
>>> Ram device is used not only by vfio pci bars but also by the host notifiers
>>> of vdpa and vhost-user.
>>
>My only concern is whether this is all part of the pci-hole64 or not, e.g. if we
>expand to the general memory_region_is_ram_device() would we go back to the
>initial bug where we create an enormous range.

Ok, I have no idea if memory_region_is_ram_device() will be expanded for other
usage in the future. Anyway, it looks better to keep your code to be safe.

> The latter that you mentioned should be
>mainly virtio-net devices as presented to the guest (regardless of whether the
>backend is vdpa or vhost-user) and perhaps they are all in the hole32 PCI hole?

Agree, that's possible.

Thanks
Zhenzhong
RE: [PATCH v1] vfio/common: Separate vfio-pci ranges
>-----Original Message-----
>From: Joao Martins
>Sent: Monday, September 11, 2023 6:13 PM
>Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>> -----Original Message-----
>>> From: Joao Martins
>>> Sent: Monday, September 11, 2023 5:07 PM
>>> Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
...
>> I have another question: previously I thought vfio pci BARs were device
>> state, saved/restored through the VFIO migration protocol, so we don't need
>> to dirty track them. Do I understand wrong?
>
>The general thinking of device dirty tracking is to track all addressable IOVAs.
>But you do raise a good question. My understanding is that migrating the BARs
>*as is* might be device migration specific (not a guarantee?); the save file and
>precopy interface are the only places we transfer the data from/to, and it's just
>opaque data, not BARs or anything formatted specifically -- so if we migrate
>BARs it is hidden in whatever the device f/w wants to move. It might be that
>BARs aren't even needed, as they are sort of scratch space from the h/w side.
>Ultimately, the dirty tracker is the one reporting the values, and if the device
>h/w chooses not to report those IOVAs as dirty then nothing changes.

Understood, thanks Joao.
Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>>> +                                     VFIOContainer *container)
...
>>>
>>> What about simplifying it with memory_region_is_ram_device()?
>>> This way vdpa device could also be included.
>>>
>>
>> Note that the check is not interested in RAM (or any other kind of memory like
>> VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
>> would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>> cover it? If so, I am all for the simplification.
>
> Ram device is used not only by vfio pci bars but also by the host notifiers of
> vdpa and vhost-user.

My only concern is whether this is all part of the pci-hole64 or not, e.g. if we
expand to the general memory_region_is_ram_device() would we go back to the
initial bug where we create an enormous range. The latter that you mentioned
should be mainly virtio-net devices as presented to the guest (regardless of
whether the backend is vdpa or vhost-user) and perhaps they are all in the
hole32 PCI hole?

Joao
Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>> -----Original Message-----
>> From: Joao Martins
>> Sent: Monday, September 11, 2023 5:07 PM
>> Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>
>> On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>>>> -----Original Message-----
>>>> From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org On Behalf Of Joao Martins
>>>> Sent: Friday, September 8, 2023 5:30 PM
>>>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>>
>>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>>>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
...
>>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>>> +                                     VFIOContainer *container)
...
>>>
>>> What about simplifying it with memory_region_is_ram_device()?
>>> This way vdpa device could also be included.
>>>
>>
>> Note that the check is not interested in RAM (or any other kind of memory like
>> VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
>> would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>> cover it? If so, I am all for the simplification.
>
> Ram device is used not only by vfio pci bars but also by the host notifiers of
> vdpa and vhost-user.
>
> I have another question: previously I thought vfio pci BARs were device state,
> saved/restored through the VFIO migration protocol, so we don't need to dirty
> track them. Do I understand wrong?

The general thinking of device dirty tracking is to track all addressable IOVAs.
But you do raise a good question. My understanding is that migrating the BARs
*as is* might be device migration specific (not a guarantee?); the save file and
precopy interface are the only places we transfer the data from/to, and it's just
opaque data, not BARs or anything formatted specifically -- so if we migrate
BARs it is hidden in whatever the device f/w wants to move. It might be that
BARs aren't even needed, as they are sort of scratch space from the h/w side.
Ultimately, the dirty tracker is the one reporting the values, and if the device
h/w chooses not to report those IOVAs as dirty then nothing changes.
RE: [PATCH v1] vfio/common: Separate vfio-pci ranges
>-----Original Message-----
>From: Joao Martins
>Sent: Monday, September 11, 2023 5:07 PM
>Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>>> -----Original Message-----
>>> From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org On Behalf Of Joao Martins
>>> Sent: Friday, September 8, 2023 5:30 PM
>>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>
>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
...
>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>> +                                     VFIOContainer *container)
...
>>
>> What about simplifying it with memory_region_is_ram_device()?
>> This way vdpa device could also be included.
>>
>
>Note that the check is not interested in RAM (or any other kind of memory like
>VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
>would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>cover it? If so, I am all for the simplification.

Ram device is used not only by vfio pci bars but also by the host notifiers of
vdpa and vhost-user.

I have another question: previously I thought vfio pci BARs were device state,
saved/restored through the VFIO migration protocol, so we don't need to dirty
track them. Do I understand wrong?

Thanks
Zhenzhong
Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>> -----Original Message-----
>> From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org On Behalf Of Joao Martins
>> Sent: Friday, September 8, 2023 5:30 PM
>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>
>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
...
>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>> +                                     VFIOContainer *container)
...
>
> What about simplifying it with memory_region_is_ram_device()?
> This way vdpa device could also be included.
>

Note that the check is not interested in RAM (or any other kind of memory like
VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
cover it? If so, I am all for the simplification.

Joao
RE: [PATCH v1] vfio/common: Separate vfio-pci ranges
>-----Original Message-----
>From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org On Behalf Of Joao Martins
>Sent: Friday, September 8, 2023 5:30 PM
>Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
>QEMU includes in the 64-bit range the RAM regions at the lower part
>and vfio-pci device RAM regions which are at the top of the address
>space. This range contains a large gap and the size can be bigger than
>the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>
>To avoid such large ranges, introduce a new PCI range covering the
>vfio-pci device RAM regions, this only if the addresses are above 4GB
>to avoid breaking potential SeaBIOS guests.
>
>Signed-off-by: Joao Martins
>[ clg: - wrote commit log
>       - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
>Signed-off-by: Cédric Le Goater
>---
>v2:
>* s/minpci/minpci64/
>* s/maxpci/maxpci64/
>* Expand comment to cover the pci-hole64 and why we don't do special
>  handling of pci-hole32.
>---
> hw/vfio/common.c     | 71 +---
> hw/vfio/trace-events |  2 +-
> 2 files changed, 61 insertions(+), 12 deletions(-)
>
>diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>index 237101d03844..134649226d43 100644
>--- a/hw/vfio/common.c
>+++ b/hw/vfio/common.c
>@@ -27,6 +27,7 @@
>
> #include "hw/vfio/vfio-common.h"
> #include "hw/vfio/vfio.h"
>+#include "hw/vfio/pci.h"
> #include "exec/address-spaces.h"
> #include "exec/memory.h"
> #include "exec/ram_addr.h"
>@@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
>     hwaddr max32;
>     hwaddr min64;
>     hwaddr max64;
>+    hwaddr minpci64;
>+    hwaddr maxpci64;
> } VFIODirtyRanges;
>
> typedef struct VFIODirtyRangesListener {
>@@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
>     MemoryListener listener;
> } VFIODirtyRangesListener;
>
>+static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>+                                     VFIOContainer *container)
>+{
>+    VFIOPCIDevice *pcidev;
>+    VFIODevice *vbasedev;
>+    VFIOGroup *group;
>+    Object *owner;
>+
>+    owner = memory_region_owner(section->mr);
>+
>+    QLIST_FOREACH(group, &container->group_list, container_next) {
>+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>+                continue;
>+            }
>+            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>+            if (OBJECT(pcidev) == owner) {
>+                return true;
>+            }
>+        }
>+    }
>+
>+    return false;
>+}

What about simplifying it with memory_region_is_ram_device()?
This way vdpa device could also be included.

Thanks
Zhenzhong
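A minimal sketch of the suggested alternative, for comparison (the helper name
is hypothetical; memory_region_is_ram_device() is QEMU's existing predicate for
regions backed by device memory, and as noted elsewhere in the thread it also
matches vdpa/vhost-user host notifiers, so it is broader than vfio-pci alone):

#include "exec/memory.h"

/* Hypothetical replacement for vfio_section_is_vfio_pci(): classify a
 * section by whether its region is device memory at all, instead of
 * walking the container's device list looking for a vfio-pci owner. */
static bool vfio_section_is_device_ram(MemoryRegionSection *section)
{
    return memory_region_is_ram_device(section->mr);
}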
Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
On 9/8/23 11:29, Joao Martins wrote:
> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
> QEMU includes in the 64-bit range the RAM regions at the lower part
> and vfio-pci device RAM regions which are at the top of the address
> space. This range contains a large gap and the size can be bigger than
> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>
> To avoid such large ranges, introduce a new PCI range covering the
> vfio-pci device RAM regions, this only if the addresses are above 4GB
> to avoid breaking potential SeaBIOS guests.
>
> Signed-off-by: Joao Martins
> [ clg: - wrote commit log
>        - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
> Signed-off-by: Cédric Le Goater
> ---
> v2:
> * s/minpci/minpci64/
> * s/maxpci/maxpci64/
> * Expand comment to cover the pci-hole64 and why we don't do special
>   handling of pci-hole32.

This is a valuable fix for the latest OVMF, which enables a dynamic MMIO window.

Applied to vfio-next.

Thanks,

C.

> ---
>  hw/vfio/common.c     | 71 +---
>  hw/vfio/trace-events |  2 +-
>  2 files changed, 61 insertions(+), 12 deletions(-)
...
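For a quick illustration of the range selection the patch adds to
vfio_dirty_tracking_update() (the full diff is quoted at the end of the
thread), here is a standalone sketch; the bucket() helper and the example
addresses are illustrative only, not part of the patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the patch's min/max pointer selection: vfio-pci sections above
 * 4G go to the dedicated pci64 range, everything else splits on whether
 * the section ends below 4G. */
static const char *bucket(bool is_vfio_pci, uint64_t iova, uint64_t end)
{
    if (is_vfio_pci && iova >= UINT32_MAX) {
        return "pci64";
    }
    return (end <= UINT32_MAX) ? "32-bit" : "64-bit";
}

int main(void)
{
    printf("%s\n", bucket(false, 0x0, 0x80000000));                     /* RAM below 4G -> 32-bit */
    printf("%s\n", bucket(false, 0x100000000ULL, 0x200000000ULL));      /* RAM above 4G -> 64-bit */
    printf("%s\n", bucket(true, 0x3fff00000000ULL, 0x3fff01000000ULL)); /* BAR in pci-hole64 -> pci64 */
    return 0;
}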
Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
On 08/09/2023 10:29, Joao Martins wrote:
> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
> QEMU includes in the 64-bit range the RAM regions at the lower part
> and vfio-pci device RAM regions which are at the top of the address
> space. This range contains a large gap and the size can be bigger than
> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>
> To avoid such large ranges, introduce a new PCI range covering the
> vfio-pci device RAM regions, this only if the addresses are above 4GB
> to avoid breaking potential SeaBIOS guests.
>
> Signed-off-by: Joao Martins
> [ clg: - wrote commit log
>        - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
> Signed-off-by: Cédric Le Goater

This should have a:

Fixes: 5255bbf4ec16 ("vfio/common: Add device dirty page tracking start/stop")

> ---
> v2:
> * s/minpci/minpci64/
> * s/maxpci/maxpci64/
> * Expand comment to cover the pci-hole64 and why we don't do special
>   handling of pci-hole32.
...
[PATCH v1] vfio/common: Separate vfio-pci ranges
QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
QEMU includes in the 64-bit range the RAM regions at the lower part
and vfio-pci device RAM regions which are at the top of the address
space. This range contains a large gap and the size can be bigger than
the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).

To avoid such large ranges, introduce a new PCI range covering the
vfio-pci device RAM regions, this only if the addresses are above 4GB
to avoid breaking potential SeaBIOS guests.

Signed-off-by: Joao Martins
[ clg: - wrote commit log
       - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
Signed-off-by: Cédric Le Goater
---
v2:
* s/minpci/minpci64/
* s/maxpci/maxpci64/
* Expand comment to cover the pci-hole64 and why we don't do special
  handling of pci-hole32.
---
 hw/vfio/common.c     | 71 +---
 hw/vfio/trace-events |  2 +-
 2 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 237101d03844..134649226d43 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -27,6 +27,7 @@

 #include "hw/vfio/vfio-common.h"
 #include "hw/vfio/vfio.h"
+#include "hw/vfio/pci.h"
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
 #include "exec/ram_addr.h"
@@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
     hwaddr max32;
     hwaddr min64;
     hwaddr max64;
+    hwaddr minpci64;
+    hwaddr maxpci64;
 } VFIODirtyRanges;

 typedef struct VFIODirtyRangesListener {
@@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
     MemoryListener listener;
 } VFIODirtyRangesListener;

+static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
+                                     VFIOContainer *container)
+{
+    VFIOPCIDevice *pcidev;
+    VFIODevice *vbasedev;
+    VFIOGroup *group;
+    Object *owner;
+
+    owner = memory_region_owner(section->mr);
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+            if (OBJECT(pcidev) == owner) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
 static void vfio_dirty_tracking_update(MemoryListener *listener,
                                        MemoryRegionSection *section)
 {
@@ -1424,19 +1452,32 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
     }

     /*
-     * The address space passed to the dirty tracker is reduced to two ranges:
-     * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
+     * The address space passed to the dirty tracker is reduced to three ranges:
+     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
+     * PCI 64-bit hole.
+     *
      * The underlying reports of dirty will query a sub-interval of each of
      * these ranges.
      *
-     * The purpose of the dual range handling is to handle known cases of big
-     * holes in the address space, like the x86 AMD 1T hole. The alternative
-     * would be an IOVATree but that has a much bigger runtime overhead and
-     * unnecessary complexity.
+     * The purpose of the three range handling is to handle known cases of big
+     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
+     * OVMF) which may relocate the pci-hole64 to the end of the address space.
+     * The latter would otherwise generate large ranges for tracking, stressing
+     * the limits of supported hardware. The pci-hole32 will always be below 4G
+     * (overlapping or not) so it doesn't need special handling and is part of
+     * the 32-bit range.
+     *
+     * The alternative would be an IOVATree but that has a much bigger runtime
+     * overhead and unnecessary complexity.
      */
-    min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
-    max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
-
+    if (vfio_section_is_vfio_pci(section, dirty->container) &&
+        iova >= UINT32_MAX) {
+        min = &range->minpci64;
+        max = &range->maxpci64;
+    } else {
+        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
+        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
+    }
     if (*min > iova) {
         *min = iova;
     }
@@ -1461,6 +1502,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
     memset(&dirty, 0, sizeof(dirty));
     dirty.ranges.min32 = UINT32_MAX;
     dirty.ranges.min64 = UINT64_MAX;
+    dirty.ranges.minpci64 = UINT64_MAX;
     dirty.listener = vfio_dirty_tracking_listener;
     dirty.container = container;

@@ -1531,7 +1573,8 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
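To put numbers on the commit message's motivation, a standalone sketch assuming
OVMF relocates the pci-hole64 near the top of a 46-bit IOVA space (the
addresses are illustrative, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Assumed layout: guest RAM starting low, pci-hole64 placed just
     * under the top of a 46-bit address space by the firmware. */
    uint64_t ram_min    = 0x100000;                     /* 1 MiB */
    uint64_t pci64_min  = (1ULL << 46) - (1ULL << 33);  /* hole64 start */
    uint64_t pci64_max  = 1ULL << 46;                   /* hole64 end */
    uint64_t mlx5_limit = 1ULL << 42;                   /* HW tracking limit */

    /* A single 64-bit range must span the gap between RAM and the hole. */
    uint64_t combined  = pci64_max - ram_min;
    /* With the patch, the PCI range covers only the hole itself. */
    uint64_t separated = pci64_max - pci64_min;

    printf("combined:  0x%llx (%s limit)\n", (unsigned long long)combined,
           combined > mlx5_limit ? "exceeds" : "within");
    printf("separated: 0x%llx (%s limit)\n", (unsigned long long)separated,
           separated > mlx5_limit ? "exceeds" : "within");
    return 0;
}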