On Wed, 28 Sep 2016, Andre Przywara wrote: > The number of LPIs on a host can be potentially huge (millions), > although in practise will be mostly reasonable. So prematurely allocating > an array of struct irq_desc's for each LPI is not an option. > However Xen itself does not care about LPIs, as every LPI will be injected > into a guest (Dom0 for now). > Create a dense data structure (8 Bytes) for each LPI which holds just > enough information to determine the virtual IRQ number and the VCPU into > which the LPI needs to be injected. > Also to not artificially limit the number of LPIs, we create a 2-level > table for holding those structures. > This patch introduces functions to initialize these tables and to > create, lookup and destroy entries for a given LPI. > We allocate and access LPI information in a way that does not require > a lock. > > Signed-off-by: Andre Przywara <andre.przyw...@arm.com> > --- > xen/arch/arm/gic-its.c | 154 > ++++++++++++++++++++++++++++++++++++++++++ > xen/include/asm-arm/gic-its.h | 18 +++++ > 2 files changed, 172 insertions(+) > > diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c > index 88397bc..2140e4a 100644 > --- a/xen/arch/arm/gic-its.c > +++ b/xen/arch/arm/gic-its.c > @@ -18,18 +18,31 @@ > > #include <xen/config.h> > #include <xen/lib.h> > +#include <xen/sched.h> > #include <xen/err.h> > #include <xen/device_tree.h> > #include <xen/libfdt/libfdt.h> > #include <asm/p2m.h> > +#include <asm/domain.h> > #include <asm/io.h> > #include <asm/gic.h> > #include <asm/gic_v3_defs.h> > #include <asm/gic-its.h> > > +/* LPIs on the host always go to a guest, so no struct irq_desc for them. */ > +union host_lpi { > + uint64_t data; > + struct { > + uint64_t virt_lpi:32; > + uint64_t dom_id:16; > + uint64_t vcpu_id:16; > + }; > +};
Why not the following? union host_lpi { uint64_t data; struct { uint32_t virt_lpi; uint16_t dom_id; uint16_t vcpu_id; }; }; > /* Global state */ > static struct { > uint8_t *lpi_property; > + union host_lpi **host_lpis; > int host_lpi_bits; > } lpi_data; > > @@ -43,6 +56,26 @@ static DEFINE_PER_CPU(void *, pending_table); > #define MAX_HOST_LPI_BITS \ > min_t(unsigned int, lpi_data.host_lpi_bits, CONFIG_HOST_LPI_BITS) > #define MAX_HOST_LPIS (BIT(MAX_HOST_LPI_BITS) - 8192) > +#define HOST_LPIS_PER_PAGE (PAGE_SIZE / sizeof(union host_lpi)) > + > +static union host_lpi *gic_find_host_lpi(uint32_t lpi, struct domain *d) I take "lpi" is the physical lpi here. Maybe we would rename it to "plpi" for clarity. > +{ > + union host_lpi *hlpi; > + > + if ( lpi < 8192 || lpi >= MAX_HOST_LPIS + 8192 ) > + return NULL; > + > + lpi -= 8192; > + if ( !lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE] ) > + return NULL; > + > + hlpi = &lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE][lpi % > HOST_LPIS_PER_PAGE]; I realize I am sometimes obsessive about this, but division operations are expensive and this is on the hot path, so I would do: #define HOST_LPIS_PER_PAGE (PAGE_SIZE >> 3) unsigned int table = lpi / HOST_LPIS_PER_PAGE; then use table throughout this function. > + if ( d && hlpi->dom_id != d->domain_id ) > + return NULL; I think this function is very useful so I would avoid making any domain checks here: one day we might want to retrieve hlpi even if hlpi->dom_id != d->domain_id. I would move the domain check outside. > + return hlpi; > +} > > #define ITS_COMMAND_SIZE 32 > > @@ -96,6 +129,33 @@ static int its_send_cmd_sync(struct host_its *its, int > cpu) > return its_send_command(its, cmd); > } > > +static int its_send_cmd_discard(struct host_its *its, > + uint32_t deviceid, uint32_t eventid) > +{ > + uint64_t cmd[4]; > + > + cmd[0] = GITS_CMD_DISCARD | ((uint64_t)deviceid << 32); > + cmd[1] = eventid; > + cmd[2] = 0x00; > + cmd[3] = 0x00; > + > + return its_send_command(its, cmd); > +} > + > +static int its_send_cmd_mapti(struct host_its *its, > + uint32_t deviceid, uint32_t eventid, > + uint32_t pintid, uint16_t icid) > +{ > + uint64_t cmd[4]; > + > + cmd[0] = GITS_CMD_MAPTI | ((uint64_t)deviceid << 32); > + cmd[1] = eventid | ((uint64_t)pintid << 32); > + cmd[2] = icid; > + cmd[3] = 0x00; > + > + return its_send_command(its, cmd); > +} > + > static int its_send_cmd_mapc(struct host_its *its, int collection_id, int > cpu) > { > uint64_t cmd[4]; > @@ -330,15 +390,109 @@ uint64_t gicv3_lpi_get_proptable() > return reg; > } > > +/* Allocate the 2nd level array for host LPIs. This one holds pointers > + * to the page with the actual "union host_lpi" entries. Our LPI limit > + * avoids excessive memory usage. > + */ > int gicv3_lpi_init_host_lpis(int lpi_bits) > { > + int nr_lpi_ptrs; > + > lpi_data.host_lpi_bits = lpi_bits; > > + nr_lpi_ptrs = MAX_HOST_LPIS / (PAGE_SIZE / sizeof(union host_lpi)); > + > + lpi_data.host_lpis = xzalloc_array(union host_lpi *, nr_lpi_ptrs); > + if ( !lpi_data.host_lpis ) > + return -ENOMEM; Why are we not allocating the 2nd level right away? To save memory? If so, I would like some numbers in a real use case scenario written either here on in the commit message. > printk("GICv3: using at most %ld LPIs on the host.\n", MAX_HOST_LPIS); > > return 0; > } > > +/* Allocates a new host LPI to be injected as "virt_lpi" into the specified > + * VCPU. Returns the host LPI ID or a negative error value. > + */ > +int gicv3_lpi_allocate_host_lpi(struct host_its *its, > + uint32_t devid, uint32_t eventid, > + struct vcpu *v, int virt_lpi) > +{ > + int chunk, i; > + union host_lpi hlpi, *new_chunk; > + > + /* TODO: handle some kind of preassigned LPI mapping for DomUs */ > + if ( !its ) > + return -EPERM; > + > + /* TODO: This could be optimized by storing some "next available" hint > and > + * only iterate if this one doesn't work. But this function should be > + * called rarely. > + */ Yes please. Even a trivial pointer to last would be far better than this. It would be nice to run some numbers and prove that in realistic scenarios finding an empty plpi doesn't take more than 5-10 ops, which should be the case unless we have to loop over and the initial chucks are still fully populated, causing Xen to scan for 512 units at a time. We defenitely want to avoid that, if not in rare worse case scenarios. > + for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++) > + { > + /* If we hit an unallocated chunk, we initialize it and use entry 0. > */ > + if ( !lpi_data.host_lpis[chunk] ) > + { > + new_chunk = alloc_xenheap_pages(0, 0); > + if ( !new_chunk ) > + return -ENOMEM; > + > + memset(new_chunk, 0, PAGE_SIZE); > + lpi_data.host_lpis[chunk] = new_chunk; > + i = 0; > + } > + else > + { > + /* Find an unallocted entry in this chunk. */ > + for (i = 0; i < HOST_LPIS_PER_PAGE; i++) > + if ( !lpi_data.host_lpis[chunk][i].virt_lpi ) > + break; > + > + /* If this chunk is fully allocted, advance to the next one. */ ^ allocated > + if ( i == HOST_LPIS_PER_PAGE) > + continue; > + } > + > + hlpi.virt_lpi = virt_lpi; > + hlpi.dom_id = v->domain->domain_id; > + hlpi.vcpu_id = v->vcpu_id; > + lpi_data.host_lpis[chunk][i].data = hlpi.data; > + > + if (its) code style > + { > + its_send_cmd_mapti(its, devid, eventid, > + chunk * HOST_LPIS_PER_PAGE + i + 8192, 0); > + its_send_cmd_sync(its, 0); Why hardcode the physical cpu to 0? Should we get the pcpu the vcpu is currently running on? > + } > + > + return chunk * HOST_LPIS_PER_PAGE + i + 8192; > + } > + > + return -ENOSPC; > +} > + > +/* Drops the connection of the given host LPI to a virtual LPI. > + */ > +int gicv3_lpi_drop_host_lpi(struct host_its *its, > + uint32_t devid, uint32_t eventid, uint32_t > host_lpi) > +{ > + union host_lpi *hlpip; > + > + if ( !its ) > + return -EPERM; > + > + hlpip = gic_find_host_lpi(host_lpi, NULL); > + if ( !hlpip ) > + return -1; > + > + hlpip->data = 0; > + > + its_send_cmd_discard(its, devid, eventid); > + > + return 0; > +} > + > void gicv3_its_dt_init(const struct dt_device_node *node) > { > const struct dt_device_node *its = NULL; > diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h > index b49d274..512a388 100644 > --- a/xen/include/asm-arm/gic-its.h > +++ b/xen/include/asm-arm/gic-its.h > @@ -114,6 +114,12 @@ void gicv3_set_redist_addr(paddr_t address, int > redist_id); > /* Map a collection for this host CPU to each host ITS. */ > void gicv3_its_setup_collection(int cpu); > > +int gicv3_lpi_allocate_host_lpi(struct host_its *its, > + uint32_t devid, uint32_t eventid, > + struct vcpu *v, int virt_lpi); > +int gicv3_lpi_drop_host_lpi(struct host_its *its, > + uint32_t devid, uint32_t eventid, > + uint32_t host_lpi); > #else > > static inline void gicv3_its_dt_init(const struct dt_device_node *node) > @@ -141,6 +147,18 @@ static inline void gicv3_set_redist_addr(paddr_t > address, int redist_id) > static inline void gicv3_its_setup_collection(int cpu) > { > } > +static inline int gicv3_lpi_allocate_host_lpi(struct host_its *its, > + uint32_t devid, uint32_t > eventid, > + struct vcpu *v, int virt_lpi) > +{ > + return 0; > +} > +static inline int gicv3_lpi_drop_host_lpi(struct host_its *its, > + uint32_t devid, uint32_t eventid, > + uint32_t host_lpi) > +{ > + return 0; > +} > > #endif /* CONFIG_HAS_ITS */ > > -- > 2.9.0 > _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel