Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions

2020-09-25 Thread Auger Eric
Hi Fam,

On 9/25/20 5:44 PM, Fam Zheng wrote:
> On Fri, 2020-09-25 at 17:23 +0200, Auger Eric wrote:
 @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
  if (ret) {
  goto fail;
  }
 +g_free(iommu_info);
  return 0;
  fail:
 +g_free(s->usable_iova_ranges);
>>>
>>> Set s->usable_iova_ranges to NULL to avoid double free?
>>
>> I think I did that at the beginning of qemu_vfio_init_pci().
> 
> Yes, but I mean that clearing the pointer will make calling
> qemu_vfio_close() safe, since there is also a g_free() of this
> pointer there.
Oh yes, got it.
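
Something like this in the fail path then? A minimal sketch of the
suggested change, to be confirmed in the respin:

fail:
    g_free(s->usable_iova_ranges);
    s->usable_iova_ranges = NULL;  /* keep qemu_vfio_close() safe */
    s->nb_iova_ranges = 0;
    g_free(iommu_info);
    close(s->group);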

Thank you for the review.

Best Regards

Eric
> 
> Fam
> 
>>
>> Thanks
>>
>> Eric
>>>
 +s->nb_iova_ranges = 0;
 +g_free(iommu_info);
  close(s->group);
  fail_container:
  close(s->container);
 @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
  qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
  }
  ram_block_notifier_remove(&s->ram_notifier);
 +g_free(s->usable_iova_ranges);
 +s->nb_iova_ranges = 0;
  qemu_vfio_reset(s);
  close(s->device);
  close(s->group);
 -- 
 2.21.3


>>>
>>> Fam
>>>
>>
>>
> 




Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions

2020-09-25 Thread Fam Zheng
On Fri, 2020-09-25 at 17:23 +0200, Auger Eric wrote:
> > > @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
> > >  if (ret) {
> > >  goto fail;
> > >  }
> > > +g_free(iommu_info);
> > >  return 0;
> > >  fail:
> > > +g_free(s->usable_iova_ranges);
> > 
> > Set s->usable_iova_ranges to NULL to avoid double free?
> 
> I think I did that at the beginning of qemu_vfio_init_pci().

Yes, but I mean that clearing the pointer will make calling
qemu_vfio_close() safe, since there is also a g_free() of this
pointer there.

Fam

> 
> Thanks
> 
> Eric
> > 
> > > +s->nb_iova_ranges = 0;
> > > +g_free(iommu_info);
> > >  close(s->group);
> > >  fail_container:
> > >  close(s->container);
> > > @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
> > >  qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
> > >  }
> > >  ram_block_notifier_remove(&s->ram_notifier);
> > > +g_free(s->usable_iova_ranges);
> > > +s->nb_iova_ranges = 0;
> > >  qemu_vfio_reset(s);
> > >  close(s->device);
> > >  close(s->group);
> > > -- 
> > > 2.21.3
> > > 
> > > 
> > 
> > Fam
> > 
> 
> 




Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions

2020-09-25 Thread Auger Eric
Hi Fam,

On 9/25/20 4:43 PM, Fam Zheng wrote:
> On 2020-09-25 15:48, Eric Auger wrote:
>> The IOVA allocator currently ignores host reserved regions.
>> As a result, some chosen IOVAs may collide with them,
>> resulting in VFIO MAP_DMA errors later on. This happens on ARM,
>> where the MSI reserved window [0x8000000, 0x8100000] is quickly
>> encountered. Since the 5.4 kernel, VFIO reports the usable IOVA
>> regions, so let's enumerate them with the prospect of avoiding
>> them later on.
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  util/vfio-helpers.c | 75 +++--
>>  1 file changed, 73 insertions(+), 2 deletions(-)
>>
>> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
>> index 583bdfb36f..8e91beba95 100644
>> --- a/util/vfio-helpers.c
>> +++ b/util/vfio-helpers.c
>> @@ -40,6 +40,11 @@ typedef struct {
>>  uint64_t iova;
>>  } IOVAMapping;
>>  
>> +struct IOVARange {
>> +uint64_t start;
>> +uint64_t end;
>> +};
>> +
>>  struct QEMUVFIOState {
>>  QemuMutex lock;
>>  
>> @@ -49,6 +54,8 @@ struct QEMUVFIOState {
>>  int device;
>>  RAMBlockNotifier ram_notifier;
>>  struct vfio_region_info config_region_info, bar_region_info[6];
>> +struct IOVARange *usable_iova_ranges;
>> +uint8_t nb_iova_ranges;
>>  
>>  /* These fields are protected by @lock */
>>  /* VFIO's IO virtual address space is managed by splitting into a few
>> @@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
>>  return ret == size ? 0 : -errno;
>>  }
>>  
>> +static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
>> +{
>> +struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
>> +struct vfio_info_cap_header *cap = first_cap;
>> +void *offset = first_cap;
>> +int i;
>> +
>> +while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
>> +if (cap->next) {
> 
> This check looks reversed.
> 
>> +return;
>> +}
>> +offset += cap->next;
> 
> @offset is unused.
> 
>> +cap = (struct vfio_info_cap_header *)cap;
> 
> This assignment is a no-op.
Ugh, indeed, that loop implementation is totally broken. I will test
the rewrite by adding an extra cap on the kernel side.
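
Something along these lines should walk the chain correctly. This is an
untested sketch (find_iommu_info_cap() is just a made-up helper name);
per the VFIO uAPI, cap->next is an offset from the start of the info
buffer and 0 terminates the chain:

static void *find_iommu_info_cap(struct vfio_iommu_type1_info *info,
                                 uint16_t id)
{
    struct vfio_info_cap_header *cap = (void *)info + info->cap_offset;

    while (cap->id != id) {
        if (!cap->next) {
            return NULL;  /* end of chain, capability not present */
        }
        cap = (void *)info + cap->next;
    }
    return cap;
}

collect_usable_iova_ranges() could then simply bail out when the lookup
returns NULL.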
> 
>> +}
>> +
>> +cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
>> +
>> +s->nb_iova_ranges = cap_iova_range->nr_iovas;
>> +if (s->nb_iova_ranges > 1) {
>> +s->usable_iova_ranges =
>> +g_realloc(s->usable_iova_ranges,
>> +  s->nb_iova_ranges * sizeof(struct IOVARange));
>> +}
>> +
>> +for (i = 0; i <  s->nb_iova_ranges; i++) {
> 
> s/  / /
ok
> 
>> +s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
>> +s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
>> +}
>> +}
>> +
>>  static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>Error **errp)
>>  {
>> @@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>  int i;
>>  uint16_t pci_cmd;
>>  struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
>> -struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
>> +struct vfio_iommu_type1_info *iommu_info = NULL;
>> +size_t iommu_info_size = sizeof(*iommu_info);
>>  struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
>>  char *group_file = NULL;
>>  
>> +s->usable_iova_ranges = NULL;
>> +
>>  /* Create a new container */
>>  s->container = open("/dev/vfio/vfio", O_RDWR);
>>  
>> @@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>  goto fail;
>>  }
>>  
>> +iommu_info = g_malloc0(iommu_info_size);
>> +iommu_info->argsz = iommu_info_size;
>> +
>>  /* Get additional IOMMU info */
>> -if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
>> +if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>>  error_setg_errno(errp, errno, "Failed to get IOMMU info");
>>  ret = -errno;
>>  goto fail;
>>  }
>>  
>> +/*
>> + * if the kernel does not report usable IOVA regions, choose
>> + * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
>> + */
>> +s->nb_iova_ranges = 1;
>> +s->usable_iova_ranges = g_new0(struct IOVARange, 1);
>> +s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
>> +s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
>> +
>> +if (iommu_info->argsz > iommu_info_size) {
>> +void *first_cap;
>> +
>> +iommu_info_size = iommu_info->argsz;
>> +iommu_info = g_realloc(iommu_info, iommu_info_size);
>> +if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>> +ret = -errno;
>> +goto fail;
>> +}
>> +first_cap = (void *)iommu_info + iommu_info->cap_offset;

Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions

2020-09-25 Thread Fam Zheng
On 2020-09-25 15:48, Eric Auger wrote:
> The IOVA allocator currently ignores host reserved regions.
> As a result, some chosen IOVAs may collide with them,
> resulting in VFIO MAP_DMA errors later on. This happens on ARM,
> where the MSI reserved window [0x8000000, 0x8100000] is quickly
> encountered. Since the 5.4 kernel, VFIO reports the usable IOVA
> regions, so let's enumerate them with the prospect of avoiding
> them later on.
> 
> Signed-off-by: Eric Auger 
> ---
>  util/vfio-helpers.c | 75 +++--
>  1 file changed, 73 insertions(+), 2 deletions(-)
> 
> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
> index 583bdfb36f..8e91beba95 100644
> --- a/util/vfio-helpers.c
> +++ b/util/vfio-helpers.c
> @@ -40,6 +40,11 @@ typedef struct {
>  uint64_t iova;
>  } IOVAMapping;
>  
> +struct IOVARange {
> +uint64_t start;
> +uint64_t end;
> +};
> +
>  struct QEMUVFIOState {
>  QemuMutex lock;
>  
> @@ -49,6 +54,8 @@ struct QEMUVFIOState {
>  int device;
>  RAMBlockNotifier ram_notifier;
>  struct vfio_region_info config_region_info, bar_region_info[6];
> +struct IOVARange *usable_iova_ranges;
> +uint8_t nb_iova_ranges;
>  
>  /* These fields are protected by @lock */
>  /* VFIO's IO virtual address space is managed by splitting into a few
> @@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
>  return ret == size ? 0 : -errno;
>  }
>  
> +static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
> +{
> +struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
> +struct vfio_info_cap_header *cap = first_cap;
> +void *offset = first_cap;
> +int i;
> +
> +while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
> +if (cap->next) {

This check looks reversed.

> +return;
> +}
> +offset += cap->next;

@offset is unused.

> +cap = (struct vfio_info_cap_header *)cap;

This assignment is a no-op.

> +}
> +
> +cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
> +
> +s->nb_iova_ranges = cap_iova_range->nr_iovas;
> +if (s->nb_iova_ranges > 1) {
> +s->usable_iova_ranges =
> +g_realloc(s->usable_iova_ranges,
> +  s->nb_iova_ranges * sizeof(struct IOVARange));
> +}
> +
> +for (i = 0; i <  s->nb_iova_ranges; i++) {

s/  / /

> +s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
> +s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
> +}
> +}
> +
>  static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>Error **errp)
>  {
> @@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>  int i;
>  uint16_t pci_cmd;
>  struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
> -struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
> +struct vfio_iommu_type1_info *iommu_info = NULL;
> +size_t iommu_info_size = sizeof(*iommu_info);
>  struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
>  char *group_file = NULL;
>  
> +s->usable_iova_ranges = NULL;
> +
>  /* Create a new container */
>  s->container = open("/dev/vfio/vfio", O_RDWR);
>  
> @@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>  goto fail;
>  }
>  
> +iommu_info = g_malloc0(iommu_info_size);
> +iommu_info->argsz = iommu_info_size;
> +
>  /* Get additional IOMMU info */
> -if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
> +if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>  error_setg_errno(errp, errno, "Failed to get IOMMU info");
>  ret = -errno;
>  goto fail;
>  }
>  
> +/*
> + * if the kernel does not report usable IOVA regions, choose
> + * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
> + */
> +s->nb_iova_ranges = 1;
> +s->usable_iova_ranges = g_new0(struct IOVARange, 1);
> +s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
> +s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
> +
> +if (iommu_info->argsz > iommu_info_size) {
> +void *first_cap;
> +
> +iommu_info_size = iommu_info->argsz;
> +iommu_info = g_realloc(iommu_info, iommu_info_size);
> +if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
> +ret = -errno;
> +goto fail;
> +}
> +first_cap = (void *)iommu_info + iommu_info->cap_offset;
> +collect_usable_iova_ranges(s, first_cap);
> +}
> +
>  s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
>  
>  if (s->device < 0) {
> @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>  if (ret) 

[RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions

2020-09-25 Thread Eric Auger
The IOVA allocator currently ignores host reserved regions.
As a result, some chosen IOVAs may collide with them,
resulting in VFIO MAP_DMA errors later on. This happens on ARM,
where the MSI reserved window [0x8000000, 0x8100000] is quickly
encountered. Since the 5.4 kernel, VFIO reports the usable IOVA
regions, so let's enumerate them with the prospect of avoiding
them later on.

Signed-off-by: Eric Auger 
---
 util/vfio-helpers.c | 75 +++--
 1 file changed, 73 insertions(+), 2 deletions(-)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 583bdfb36f..8e91beba95 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -40,6 +40,11 @@ typedef struct {
 uint64_t iova;
 } IOVAMapping;
 
+struct IOVARange {
+uint64_t start;
+uint64_t end;
+};
+
 struct QEMUVFIOState {
 QemuMutex lock;
 
@@ -49,6 +54,8 @@ struct QEMUVFIOState {
 int device;
 RAMBlockNotifier ram_notifier;
 struct vfio_region_info config_region_info, bar_region_info[6];
+struct IOVARange *usable_iova_ranges;
+uint8_t nb_iova_ranges;
 
 /* These fields are protected by @lock */
 /* VFIO's IO virtual address space is managed by splitting into a few
@@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
 return ret == size ? 0 : -errno;
 }
 
+static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
+{
+struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
+struct vfio_info_cap_header *cap = first_cap;
+void *offset = first_cap;
+int i;
+
+while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
+if (cap->next) {
+return;
+}
+offset += cap->next;
+cap = (struct vfio_info_cap_header *)cap;
+}
+
+cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
+
+s->nb_iova_ranges = cap_iova_range->nr_iovas;
+if (s->nb_iova_ranges > 1) {
+s->usable_iova_ranges =
+g_realloc(s->usable_iova_ranges,
+  s->nb_iova_ranges * sizeof(struct IOVARange));
+}
+
+for (i = 0; i <  s->nb_iova_ranges; i++) {
+s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
+s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
+}
+}
+
 static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
   Error **errp)
 {
@@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
 int i;
 uint16_t pci_cmd;
 struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
-struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
+struct vfio_iommu_type1_info *iommu_info = NULL;
+size_t iommu_info_size = sizeof(*iommu_info);
 struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
 char *group_file = NULL;
 
+s->usable_iova_ranges = NULL;
+
 /* Create a new container */
 s->container = open("/dev/vfio/vfio", O_RDWR);
 
@@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
 goto fail;
 }
 
+iommu_info = g_malloc0(iommu_info_size);
+iommu_info->argsz = iommu_info_size;
+
 /* Get additional IOMMU info */
-if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
+if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
 error_setg_errno(errp, errno, "Failed to get IOMMU info");
 ret = -errno;
 goto fail;
 }
 
+/*
+ * if the kernel does not report usable IOVA regions, choose
+ * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
+ */
+s->nb_iova_ranges = 1;
+s->usable_iova_ranges = g_new0(struct IOVARange, 1);
+s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
+s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+
+if (iommu_info->argsz > iommu_info_size) {
+void *first_cap;
+
+iommu_info_size = iommu_info->argsz;
+iommu_info = g_realloc(iommu_info, iommu_info_size);
+if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+ret = -errno;
+goto fail;
+}
+first_cap = (void *)iommu_info + iommu_info->cap_offset;
+collect_usable_iova_ranges(s, first_cap);
+}
+
 s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
 
 if (s->device < 0) {
@@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
 if (ret) {
 goto fail;
 }
+g_free(iommu_info);
 return 0;
 fail:
+g_free(s->usable_iova_ranges);
+s->nb_iova_ranges = 0;
+g_free(iommu_info);
 close(s->group);
 fail_container:
 close(s->container);
@@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
 qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
 }
 ram_block_notifier_remove(&s->ram_notifier);
+