On Thu, Mar 01, 2012 at 04:36:06PM +1100, David Gibson wrote:
> This patch adds the basic infrastructure necessary to emulate an IOMMU
> visible to the guest.  The DMAContext structure is extended with
> information and a callback describing the translation, and the various
> DMA functions used by devices will now perform IOMMU translation using
> this callback.
> 
> Cc: Michael S. Tsirkin <m...@redhat.com>
> Cc: Richard Henderson <r...@twiddle.net>
> 
> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munte...@linux360.ro>
> Signed-off-by: David Gibson <da...@gibson.dropbear.id.au>
> ---
>  configure     |   12 ++++
>  dma-helpers.c |  189 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Why not just dma.c?

>  dma.h         |  126 +++++++++++++++++++++++++++++++-------
>  3 files changed, 305 insertions(+), 22 deletions(-)
> 
> diff --git a/configure b/configure
> index fb0e18e..3eff89c 100755
> --- a/configure
> +++ b/configure
> @@ -138,6 +138,7 @@ linux_aio=""
>  cap_ng=""
>  attr=""
>  libattr=""
> +iommu="yes"
>  xfs=""
>  
>  vhost_net="no"
> @@ -784,6 +785,10 @@ for opt do
>    ;;
>    --enable-vhost-net) vhost_net="yes"
>    ;;
> +  --enable-iommu) iommu="yes"
> +  ;;
> +  --disable-iommu) iommu="no"
> +  ;;
>    --disable-opengl) opengl="no"
>    ;;
>    --enable-opengl) opengl="yes"
> @@ -1085,6 +1090,8 @@ echo "  --enable-docs            enable documentation build"
>  echo "  --disable-docs           disable documentation build"
>  echo "  --disable-vhost-net      disable vhost-net acceleration support"
>  echo "  --enable-vhost-net       enable vhost-net acceleration support"
> +echo "  --disable-iommu          disable IOMMU emulation support"
> +echo "  --enable-iommu           enable IOMMU emulation support"
>  echo "  --enable-trace-backend=B Set trace backend"
>  echo "                           Available backends:" $("$source_path"/scripts/tracetool --list-backends)
>  echo "  --with-trace-file=NAME   Full PATH,NAME of file to store traces"
> @@ -2935,6 +2942,7 @@ echo "posix_madvise     $posix_madvise"
>  echo "uuid support      $uuid"
>  echo "libcap-ng support $cap_ng"
>  echo "vhost-net support $vhost_net"
> +echo "IOMMU support     $iommu"
>  echo "Trace backend     $trace_backend"
>  echo "Trace output file $trace_file-<pid>"
>  echo "spice support     $spice"
> @@ -3812,6 +3820,10 @@ if test "$target_softmmu" = "yes" -a \( \
>    echo "CONFIG_NEED_MMU=y" >> $config_target_mak
>  fi
>  
> +if test "$iommu" = "yes" ; then
> +  echo "CONFIG_IOMMU=y" >> $config_host_mak
> +fi
> +
>  if test "$gprof" = "yes" ; then
>    echo "TARGET_GPROF=yes" >> $config_target_mak
>    if test "$target_linux_user" = "yes" ; then
> diff --git a/dma-helpers.c b/dma-helpers.c
> index 9dcfb2c..8269d60 100644
> --- a/dma-helpers.c
> +++ b/dma-helpers.c
> @@ -10,6 +10,9 @@
>  #include "dma.h"
>  #include "block_int.h"
>  #include "trace.h"
> +#include "range.h"
> +
> +/*#define DEBUG_IOMMU*/
>  
>  void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint, DMAContext *dma)
>  {
> @@ -255,3 +258,189 @@ void dma_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
>  {
>      bdrv_acct_start(bs, cookie, sg->size, type);
>  }
> +
> +#ifdef CONFIG_IOMMU
> +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> +                        DMADirection dir)
> +{
> +    target_phys_addr_t paddr, plen;
> +
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_memory_check iommu=%p addr=0x%llx len=0x%llx dir=%d\n",
> +            dma, (unsigned long long)addr, (unsigned long long)len, dir);
> +#endif
> +
> +    while (len) {
> +        if (dma->translate(dma, addr, &paddr, &plen, dir) != 0) {
> +            return false;
> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        len -= plen;
> +        addr += plen;
> +    }
> +
> +    return true;
> +}
> +
> +int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> +                    void *buf, dma_addr_t len, DMADirection dir)

By the way, users still ignore the return code from the PCI DMA
routines, so I suspect an API that returns errors is just not a very
good one.  What happens on read errors with real devices?  A master
abort?  What about write errors?  Maybe we need a callback that sets
error flags in the device instead.
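Something along these lines, say (all names below are invented just to
illustrate the idea, none of this is in the patch):

    typedef void DMAErrorFunc(void *dev_opaque, dma_addr_t addr,
                              DMADirection dir);

    /* Hypothetical device state with a status register. */
    typedef struct MyDevState {
        uint16_t status;            /* guest-visible status register */
    } MyDevState;

    #define MYDEV_STATUS_DMA_ERR 0x2000

    static void mydev_dma_error(void *dev_opaque, dma_addr_t addr,
                                DMADirection dir)
    {
        MyDevState *s = dev_opaque;

        /* Latch an error flag, much like a master abort does on PCI;
         * the guest driver then sees it in the status register. */
        s->status |= MYDEV_STATUS_DMA_ERR;
    }

The DMA core would invoke such a callback on a failed translation
instead of (or in addition to) returning -1, so the error can't just
be silently dropped by the caller.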
> +{
> +    target_phys_addr_t paddr, plen;
> +    int err;
> +
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_memory_rw iommu=%p addr=0x%llx len=0x%llx dir=%d\n",
> +            dma, (unsigned long long)addr, (unsigned long long)len, dir);
> +#endif
> +
> +    while (len) {
> +        err = dma->translate(dma, addr, &paddr, &plen, dir);
> +        if (err) {
> +            return -1;

Why not return err?

> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        cpu_physical_memory_rw(paddr, buf, plen,
> +                               dir == DMA_DIRECTION_FROM_DEVICE);
> +
> +        len -= plen;
> +        addr += plen;
> +        buf += plen;
> +    }
> +
> +    return 0;
> +}
> +
> +int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len)
> +{
> +    target_phys_addr_t paddr, plen;
> +    int err;
> +
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_memory_zero iommu=%p addr=0x%llx len=0x%llx\n",
> +            dma, (unsigned long long)addr, (unsigned long long)len);
> +#endif
> +
> +    while (len) {
> +        err = dma->translate(dma, addr, &paddr, &plen,
> +                             DMA_DIRECTION_FROM_DEVICE);
> +        if (err) {
> +            return -1;
> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        cpu_physical_memory_zero(paddr, plen);
> +
> +        len -= plen;
> +        addr += plen;
> +    }
> +
> +    return 0;
> +}
> +
> +typedef struct DMAMemoryMap DMAMemoryMap;
> +struct DMAMemoryMap {
> +    dma_addr_t           addr;
> +    size_t               len;
> +    void                 *buf;
> +    DMAInvalidateMapFunc *invalidate;
> +    void                 *invalidate_opaque;

Do we absolutely need the opaque value?  Can we let the user allocate
the map objects instead?  Then the user can use a type-safe
container_of() to get at the user data; see the sketch further down.

> +
> +    QLIST_ENTRY(DMAMemoryMap) list;
> +};
> +
> +void dma_context_init(DMAContext *dma, DMATranslateFunc fn, void *opaque)
> +{
> +    dma->translate = fn;
> +    dma->opaque = opaque;
> +    QLIST_INIT(&dma->memory_maps);
> +}
> +
> +void dma_invalidate_memory_range(DMAContext *dma,
> +                                 dma_addr_t addr, dma_addr_t len)
> +{
> +    DMAMemoryMap *map, *next;
> +
> +    QLIST_FOREACH_SAFE(map, &dma->memory_maps, list, next) {
> +        if (ranges_overlap(addr, len, map->addr, map->len)) {
> +            map->invalidate(map->invalidate_opaque);
> +            QLIST_REMOVE(map, list);
> +            g_free(map);
> +        }
> +    }
> +}
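To illustrate the container_of suggestion above: roughly like this
(hypothetical; it assumes the callback is changed to take the
DMAMemoryMap itself, and reuses the made-up MyDevState from before):

    typedef void DMAInvalidateMapFunc(DMAMemoryMap *map);

    /* The caller embeds the map in its own state, no opaque needed: */
    typedef struct MyDevDMAState {
        DMAMemoryMap map;           /* allocated by the user, not by dma.c */
        MyDevState   *dev;
    } MyDevDMAState;

    static void mydev_map_invalidated(DMAMemoryMap *map)
    {
        MyDevDMAState *ds = container_of(map, MyDevDMAState, map);

        mydev_cancel_dma(ds->dev);  /* made-up device hook */
    }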
> +
> +void *__dma_memory_map(DMAContext *dma, DMAInvalidateMapFunc *cb,
> +                       void *cb_opaque, dma_addr_t addr, dma_addr_t *len,
> +                       DMADirection dir)
> +{
> +    int err;
> +    target_phys_addr_t paddr, plen;
> +    void *buf;
> +
> +    plen = *len;
> +    err = dma->translate(dma, addr, &paddr, &plen, dir);
> +    if (err) {
> +        return NULL;
> +    }
> +
> +    /*
> +     * If this is true, the virtual region is contiguous,
> +     * but the translated physical region isn't. We just
> +     * clamp *len, much like cpu_physical_memory_map() does.
> +     */
> +    if (plen < *len) {
> +        *len = plen;
> +    }
> +
> +    buf = cpu_physical_memory_map(paddr, &plen,
> +                                  dir == DMA_DIRECTION_FROM_DEVICE);
> +    *len = plen;
> +
> +    /* We treat maps as remote TLBs to cope with stuff like AIO. */
> +    if (cb) {
> +        DMAMemoryMap *map;
> +
> +        map = g_malloc(sizeof(DMAMemoryMap));
> +        map->addr = addr;
> +        map->len = *len;
> +        map->buf = buf;
> +        map->invalidate = cb;
> +        map->invalidate_opaque = cb_opaque;
> +
> +        QLIST_INSERT_HEAD(&dma->memory_maps, map, list);
> +    }
> +
> +    return buf;
> +}
> +
> +void __dma_memory_unmap(DMAContext *dma, void *buffer, dma_addr_t len,
> +                        DMADirection dir, dma_addr_t access_len)
> +{
> +    DMAMemoryMap *map;
> +
> +    cpu_physical_memory_unmap(buffer, len,
> +                              dir == DMA_DIRECTION_FROM_DEVICE,
> +                              access_len);
> +
> +    QLIST_FOREACH(map, &dma->memory_maps, list) {
> +        if ((map->buf == buffer) && (map->len == len)) {
> +            QLIST_REMOVE(map, list);
> +            g_free(map);
> +            break;
> +        }
> +    }
> +}
> +#endif /* CONFIG_IOMMU */
> diff --git a/dma.h b/dma.h
> index a66e3d7..c27f8b3 100644
> --- a/dma.h
> +++ b/dma.h
> @@ -15,6 +15,7 @@
>  #include "hw/hw.h"
>  #include "block.h"
>  
> +typedef struct DMAContext DMAContext;
>  typedef struct ScatterGatherEntry ScatterGatherEntry;
>  
>  typedef enum {
> @@ -31,30 +32,83 @@ struct QEMUSGList {
>  };
>  
>  #if defined(TARGET_PHYS_ADDR_BITS)
> +
> +#ifdef CONFIG_IOMMU
> +/*
> + * When an IOMMU is present, bus addresses become distinct from
> + * CPU/memory physical addresses and may be a different size.  Because
> + * the IOVA size depends more on the bus than on the platform, we more
> + * or less have to treat these as 64-bit always to cover all (or at
> + * least most) cases.
> + */
> +typedef uint64_t dma_addr_t;
> +
> +#define DMA_ADDR_BITS 64
> +#define DMA_ADDR_FMT "%" PRIx64
> +#else
>  typedef target_phys_addr_t dma_addr_t;
>  
>  #define DMA_ADDR_BITS TARGET_PHYS_ADDR_BITS
>  #define DMA_ADDR_FMT TARGET_FMT_plx
> +#endif
> +
> +typedef int DMATranslateFunc(DMAContext *dma,
> +                             dma_addr_t addr,
> +                             target_phys_addr_t *paddr,
> +                             target_phys_addr_t *len,
> +                             DMADirection dir);
> +
> +struct DMAContext {
> +#ifdef CONFIG_IOMMU
> +    DMATranslateFunc *translate;
> +    void *opaque;
> +    QLIST_HEAD(memory_maps, DMAMemoryMap) memory_maps;
> +#endif
> +};
> +

Here I suspect opaque is not needed at all.

> +#ifdef CONFIG_IOMMU
> +static inline bool dma_has_iommu(DMAContext *dma)
> +{
> +    return (dma != NULL);
> +}

() not necessary here.  Neither is != NULL.

> +#else
> +static inline bool dma_has_iommu(DMAContext *dma)
> +{
> +    return false;
> +}
> +#endif
> 
>  typedef void DMAInvalidateMapFunc(void *);
> 
>  /* Checks that the given range of addresses is valid for DMA.  This is
>   * useful for certain cases, but usually you should just use
>   * dma_memory_{read,write}() and check for errors */
> -static inline bool dma_memory_valid(DMAContext *dma, dma_addr_t addr,
> -                                    dma_addr_t len, DMADirection dir)
> +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> +                        DMADirection dir);
> +static inline bool dma_memory_valid(DMAContext *dma,
> +                                    dma_addr_t addr, dma_addr_t len,
> +                                    DMADirection dir)
>  {
> -    /* Stub version, with no iommu we assume all bus addresses are valid */
> -    return true;
> +    if (!dma_has_iommu(dma)) {
> +        return true;
> +    } else {
> +        return __dma_memory_valid(dma, addr, len, dir);
> +    }
>  }
>  
> +int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> +                    void *buf, dma_addr_t len, DMADirection dir);
>  static inline int dma_memory_rw(DMAContext *dma, dma_addr_t addr,
>                                  void *buf, dma_addr_t len, DMADirection dir)
>  {
> -    /* Stub version when we have no iommu support */
> -    cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,
> -                           dir == DMA_DIRECTION_FROM_DEVICE);
> -    return 0;
> +    if (!dma_has_iommu(dma)) {
> +        /* Fast-path for no IOMMU */
> +        cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,

cast not needed, I think.

> +                               dir == DMA_DIRECTION_FROM_DEVICE);
> +        return 0;
> +    } else {
> +        return __dma_memory_rw(dma, addr, buf, len, dir);
> +    }
>  }
>  
>  static inline int dma_memory_read(DMAContext *dma, dma_addr_t addr,
> @@ -70,35 +124,55 @@ static inline int dma_memory_write(DMAContext *dma, dma_addr_t addr,
>                              DMA_DIRECTION_FROM_DEVICE);
>  }
>  
> +int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len);
>  static inline int dma_memory_zero(DMAContext *dma, dma_addr_t addr,
>                                    dma_addr_t len)
>  {
> -    /* Stub version when we have no iommu support */
> -    cpu_physical_memory_zero(addr, len);
> -    return 0;
> +    if (!dma_has_iommu(dma)) {
> +        /* Fast-path for no IOMMU */
> +        cpu_physical_memory_zero(addr, len);
> +        return 0;
> +    } else {
> +        return __dma_memory_zero(dma, addr, len);
> +    }
>  }
>  
> +void *__dma_memory_map(DMAContext *dma,
> +                       DMAInvalidateMapFunc *cb, void *opaque,
> +                       dma_addr_t addr, dma_addr_t *len,
> +                       DMADirection dir);
>  static inline void *dma_memory_map(DMAContext *dma,
> -                                   DMAInvalidateMapFunc *cb, void *opaque,
> +                                   DMAInvalidateMapFunc *cb, void *cb_opaque,
>                                     dma_addr_t addr, dma_addr_t *len,
>                                     DMADirection dir)
>  {
> -    target_phys_addr_t xlen = *len;
> -    void *p;
> -
> -    p = cpu_physical_memory_map(addr, &xlen,
> -                                dir == DMA_DIRECTION_FROM_DEVICE);
> -    *len = xlen;
> -    return p;
> +    if (!dma_has_iommu(dma)) {
> +        target_phys_addr_t xlen = *len;
> +        void *p;
> +
> +        p = cpu_physical_memory_map(addr, &xlen,
> +                                    dir == DMA_DIRECTION_FROM_DEVICE);
> +        *len = xlen;
> +        return p;
> +    } else {
> +        return __dma_memory_map(dma, cb, cb_opaque, addr, len, dir);
> +    }
>  }

Are callbacks ever useful?  I note that without an iommu, they are
never invoked.  So how can a device use them?
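The only use I can think of is a device that caches a mapping across
trips through the main loop.  E.g. something like this (made-up
device, error and short-map handling omitted, just to illustrate):

    typedef struct MyDevRing {
        DMAContext *dma;
        void       *ring;       /* host pointer from dma_memory_map() */
        dma_addr_t ring_addr;
        dma_addr_t ring_len;
    } MyDevRing;

    static void mydev_ring_invalidated(void *opaque)
    {
        MyDevRing *r = opaque;

        /* The guest changed the IOMMU mapping under us: forget the
         * cached pointer; it gets re-mapped on the next ring access. */
        r->ring = NULL;
    }

    static void *mydev_get_ring(MyDevRing *r)
    {
        if (!r->ring) {
            dma_addr_t len = r->ring_len;

            r->ring = dma_memory_map(r->dma, mydev_ring_invalidated, r,
                                     r->ring_addr, &len,
                                     DMA_DIRECTION_FROM_DEVICE);
        }
        return r->ring;
    }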
> 
> +void __dma_memory_unmap(DMAContext *dma,
> +                        void *buffer, dma_addr_t len,
> +                        DMADirection dir, dma_addr_t access_len);
>  static inline void dma_memory_unmap(DMAContext *dma,
>                                      void *buffer, dma_addr_t len,
>                                      DMADirection dir, dma_addr_t access_len)
>  {
> -    return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
> -                                     dir == DMA_DIRECTION_FROM_DEVICE,
> -                                     access_len);
> +    if (!dma_has_iommu(dma)) {
> +        return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
> +                                         dir == DMA_DIRECTION_FROM_DEVICE,
> +                                         access_len);
> +    } else {
> +        __dma_memory_unmap(dma, buffer, len, dir, access_len);
> +    }
>  }
>  
>  #define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \
> @@ -139,6 +213,14 @@ DEFINE_LDST_DMA(q, q, 64, be);
>  
>  #undef DEFINE_LDST_DMA
>  
> +#ifdef CONFIG_IOMMU
> +
> +void dma_context_init(DMAContext *dma, DMATranslateFunc fn, void *opaque);
> +void dma_invalidate_memory_range(DMAContext *dma,
> +                                 dma_addr_t addr, dma_addr_t len);
> +
> +#endif /* CONFIG_IOMMU */
> +
>  struct ScatterGatherEntry {
>      dma_addr_t base;
>      dma_addr_t len;
> -- 
> 1.7.9