On Thu, Mar 01, 2012 at 04:36:06PM +1100, David Gibson wrote:
> This patch adds the basic infrastructure necessary to emulate an IOMMU
> visible to the guest.  The DMAContext structure is extended with
> information and a callback describing the translation, and the various
> DMA functions used by devices will now perform IOMMU translation using
> this callback.
> 
> Cc: Michael S. Tsirkin <m...@redhat.com>
> Cc: Richard Henderson <r...@twiddle.net>
> 
> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munte...@linux360.ro>
> Signed-off-by: David Gibson <da...@gibson.dropbear.id.au>
> ---
>  configure     |   12 ++++
>  dma-helpers.c |  189 
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Why not just dma.c?

>  dma.h         |  126 +++++++++++++++++++++++++++++++-------
>  3 files changed, 305 insertions(+), 22 deletions(-)
> 
> diff --git a/configure b/configure
> index fb0e18e..3eff89c 100755
> --- a/configure
> +++ b/configure
> @@ -138,6 +138,7 @@ linux_aio=""
>  cap_ng=""
>  attr=""
>  libattr=""
> +iommu="yes"
>  xfs=""
>  
>  vhost_net="no"
> @@ -784,6 +785,10 @@ for opt do
>    ;;
>    --enable-vhost-net) vhost_net="yes"
>    ;;
> +  --enable-iommu) iommu="yes"
> +  ;;
> +  --disable-iommu) iommu="no"
> +  ;;
>    --disable-opengl) opengl="no"
>    ;;
>    --enable-opengl) opengl="yes"
> @@ -1085,6 +1090,8 @@ echo "  --enable-docs            enable documentation 
> build"
>  echo "  --disable-docs           disable documentation build"
>  echo "  --disable-vhost-net      disable vhost-net acceleration support"
>  echo "  --enable-vhost-net       enable vhost-net acceleration support"
> +echo "  --disable-iommu          disable IOMMU emulation support"
> +echo "  --enable-iommu           enable IOMMU emulation support"
>  echo "  --enable-trace-backend=B Set trace backend"
>  echo "                           Available backends:" 
> $("$source_path"/scripts/tracetool --list-backends)
>  echo "  --with-trace-file=NAME   Full PATH,NAME of file to store traces"
> @@ -2935,6 +2942,7 @@ echo "posix_madvise     $posix_madvise"
>  echo "uuid support      $uuid"
>  echo "libcap-ng support $cap_ng"
>  echo "vhost-net support $vhost_net"
> +echo "IOMMU support     $iommu"
>  echo "Trace backend     $trace_backend"
>  echo "Trace output file $trace_file-<pid>"
>  echo "spice support     $spice"
> @@ -3812,6 +3820,10 @@ if test "$target_softmmu" = "yes" -a \( \
>    echo "CONFIG_NEED_MMU=y" >> $config_target_mak
>  fi
>  
> +if test "$iommu" = "yes" ; then
> +  echo "CONFIG_IOMMU=y" >> $config_host_mak
> +fi
> +
>  if test "$gprof" = "yes" ; then
>    echo "TARGET_GPROF=yes" >> $config_target_mak
>    if test "$target_linux_user" = "yes" ; then
> diff --git a/dma-helpers.c b/dma-helpers.c
> index 9dcfb2c..8269d60 100644
> --- a/dma-helpers.c
> +++ b/dma-helpers.c
> @@ -10,6 +10,9 @@
>  #include "dma.h"
>  #include "block_int.h"
>  #include "trace.h"
> +#include "range.h"
> +
> +/*#define DEBUG_IOMMU*/
>  
>  void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint, DMAContext *dma)
>  {
> @@ -255,3 +258,189 @@ void dma_acct_start(BlockDriverState *bs, 
> BlockAcctCookie *cookie,
>  {
>      bdrv_acct_start(bs, cookie, sg->size, type);
>  }
> +
> +#ifdef CONFIG_IOMMU
> +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> +                        DMADirection dir)
> +{
> +    target_phys_addr_t paddr, plen;
> +
> +#ifdef DEBUG_DMA
> +    fprintf(stderr, "dma_memory_check iommu=%p addr=0x%llx len=0x%llx 
> dir=%d\n",
> +            (unsigned long long)addr, (unsigned long long)len, dir);
> +#endif
> +
> +    while (len) {
> +        if (dma->translate(dma, addr, &paddr, &plen, dir) != 0) {
> +            return false;
> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        len -= plen;
> +        addr += plen;
> +    }
> +
> +    return true;
> +}
> +
> +int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> +                    void *buf, dma_addr_t len, DMADirection dir)

By the way, users still ignore the return code
from the pci routines.
I suspect the API of returning errors is just not
such a good one. What happens on read errors with
real devices? Master abort? What about write errors?
Maybe we need a callback
that will set error flags in the device instead.

> +{
> +    target_phys_addr_t paddr, plen;
> +    int err;
> +
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_memory_rw iommu=%p addr=0x%llx len=0x%llx dir=%d\n",
> +            (unsigned long long)addr, (unsigned long long)len, dir);
> +#endif
> +
> +    while (len) {
> +        err = dma->translate(dma, addr, &paddr, &plen, dir);
> +        if (err) {
> +            return -1;

Why not return err?

> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        cpu_physical_memory_rw(paddr, buf, plen,
> +                               dir == DMA_DIRECTION_FROM_DEVICE);
> +
> +        len -= plen;
> +        addr += plen;
> +        buf += plen;
> +    }
> +
> +    return 0;
> +}
> +
> +int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len)
> +{
> +    target_phys_addr_t paddr, plen;
> +    int err;
> +
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_memory_zero iommu=%p addr=0x%llx len=0x%llx\n",
> +            (unsigned long long)addr, (unsigned long long)len);
> +#endif
> +
> +    while (len) {
> +        err = dma->translate(dma, addr, &paddr, &plen,
> +                             DMA_DIRECTION_FROM_DEVICE);
> +        if (err) {
> +            return -1;
> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        cpu_physical_memory_zero(paddr, plen);
> +
> +        len -= plen;
> +        addr += plen;
> +    }
> +
> +    return 0;
> +}
> +
> +typedef struct DMAMemoryMap DMAMemoryMap;
> +struct DMAMemoryMap {
> +    dma_addr_t              addr;
> +    size_t                  len;
> +    void                    *buf;
> +    DMAInvalidateMapFunc    *invalidate;
> +    void                    *invalidate_opaque;

Do we absolutely need the opaque value?
Can we let the user allocate the map objects?
Then user can use safe container_of to get user data.

> +
> +    QLIST_ENTRY(DMAMemoryMap) list;
> +};
> +
> +void dma_context_init(DMAContext *dma, DMATranslateFunc fn, void *opaque)
> +{
> +    dma->translate = fn;
> +    dma->opaque = opaque;
> +    QLIST_INIT(&dma->memory_maps);
> +}
> +
> +void dma_invalidate_memory_range(DMAContext *dma,
> +                                 dma_addr_t addr, dma_addr_t len)
> +{
> +    DMAMemoryMap *map;
> +
> +    QLIST_FOREACH(map, &dma->memory_maps, list) {
> +        if (ranges_overlap(addr, len, map->addr, map->len)) {
> +            map->invalidate(map->invalidate_opaque);
> +            QLIST_REMOVE(map, list);
> +            free(map);
> +        }
> +    }
> +}
> +
> +void *__dma_memory_map(DMAContext *dma, DMAInvalidateMapFunc *cb,
> +                       void *cb_opaque, dma_addr_t addr, dma_addr_t *len,
> +                       DMADirection dir)
> +{
> +    int err;
> +    target_phys_addr_t paddr, plen;
> +    void *buf;
> +
> +    plen = *len;
> +    err = dma->translate(dma, addr, &paddr, &plen, dir);
> +    if (err) {
> +        return NULL;
> +    }
> +
> +    /*
> +     * If this is true, the virtual region is contiguous,
> +     * but the translated physical region isn't. We just
> +     * clamp *len, much like cpu_physical_memory_map() does.
> +     */
> +    if (plen < *len) {
> +        *len = plen;
> +    }
> +
> +    buf = cpu_physical_memory_map(paddr, &plen,
> +                                  dir == DMA_DIRECTION_FROM_DEVICE);
> +    *len = plen;
> +
> +    /* We treat maps as remote TLBs to cope with stuff like AIO. */
> +    if (cb) {
> +        DMAMemoryMap *map;
> +
> +        map = g_malloc(sizeof(DMAMemoryMap));
> +        map->addr = addr;
> +        map->len = *len;
> +        map->buf = buf;
> +        map->invalidate = cb;
> +        map->invalidate_opaque = cb_opaque;
> +
> +        QLIST_INSERT_HEAD(&dma->memory_maps, map, list);
> +    }
> +
> +    return buf;
> +}
> +
> +void __dma_memory_unmap(DMAContext *dma, void *buffer, dma_addr_t len,
> +                        DMADirection dir, dma_addr_t access_len)
> +{
> +    DMAMemoryMap *map;
> +
> +    cpu_physical_memory_unmap(buffer, len,
> +                              dir == DMA_DIRECTION_FROM_DEVICE,
> +                              access_len);
> +
> +    QLIST_FOREACH(map, &dma->memory_maps, list) {
> +        if ((map->buf == buffer) && (map->len == len)) {
> +            QLIST_REMOVE(map, list);
> +            free(map);
> +        }
> +    }
> +}
> +#endif /* CONFIG_IOMMU */
> diff --git a/dma.h b/dma.h
> index a66e3d7..c27f8b3 100644
> --- a/dma.h
> +++ b/dma.h
> @@ -15,6 +15,7 @@
>  #include "hw/hw.h"
>  #include "block.h"
>  
> +typedef struct DMAContext DMAContext;
>  typedef struct ScatterGatherEntry ScatterGatherEntry;
>  
>  typedef enum {
> @@ -31,30 +32,83 @@ struct QEMUSGList {
>  };
>  
>  #if defined(TARGET_PHYS_ADDR_BITS)
> +
> +#ifdef CONFIG_IOMMU
> +/*
> + * When an IOMMU is present, bus addresses become distinct from
> + * CPU/memory physical addresses and may be a different size.  Because
> + * the IOVA size depends more on the bus than on the platform, we more
> + * or less have to treat these as 64-bit always to cover all (or at
> + * least most) cases.
> + */
> +typedef uint64_t dma_addr_t;
> +
> +#define DMA_ADDR_BITS 64
> +#define DMA_ADDR_FMT "%" PRIx64
> +#else
>  typedef target_phys_addr_t dma_addr_t;
>  
>  #define DMA_ADDR_BITS TARGET_PHYS_ADDR_BITS
>  #define DMA_ADDR_FMT TARGET_FMT_plx
> +#endif
> +
> +typedef int DMATranslateFunc(DMAContext *dma,
> +                             dma_addr_t addr,
> +                             target_phys_addr_t *paddr,
> +                             target_phys_addr_t *len,
> +                             DMADirection dir);
> +
> +typedef struct DMAContext {
> +#ifdef CONFIG_IOMMU
> +    DMATranslateFunc *translate;
> +    void *opaque;
> +    QLIST_HEAD(memory_maps, DMAMemoryMap) memory_maps;
> +#endif
> +} DMAContext;
> +

Here I suspect opaque is not needed at all.

> +#ifdef CONFIG_IOMMU
> +static inline bool dma_has_iommu(DMACon.text *dma)
> +{
> +    return (dma != NULL);
> +}

() not necessary here. Neither is != NULL.

> +#else
> +static inline bool dma_has_iommu(DMAContext *dma)
> +{
> +    return false;
> +}
> +#endif
>  
>  typedef void DMAInvalidateMapFunc(void *);
>  
>  /* Checks that the given range of addresses is valid for DMA.  This is
>   * useful for certain cases, but usually you should just use
>   * dma_memory_{read,write}() and check for errors */
> -static inline bool dma_memory_valid(DMAContext *dma, dma_addr_t addr,
> -                                    dma_addr_t len, DMADirection dir)
> +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> +                        DMADirection dir);
> +static inline bool dma_memory_valid(DMAContext *dma,
> +                                    dma_addr_t addr, dma_addr_t len,
> +                                    DMADirection dir)
>  {
> -    /* Stub version, with no iommu we assume all bus addresses are valid */
> -    return true;
> +    if (!dma_has_iommu(dma)) {
> +        return true;
> +    } else {
> +        return __dma_memory_valid(dma, addr, len, dir);
> +    }
>  }
>  
> +int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> +                    void *buf, dma_addr_t len, DMADirection dir);
>  static inline int dma_memory_rw(DMAContext *dma, dma_addr_t addr,
>                                  void *buf, dma_addr_t len, DMADirection dir)
>  {
> -    /* Stub version when we have no iommu support */
> -    cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,
> -                           dir == DMA_DIRECTION_FROM_DEVICE);
> -    return 0;
> +    if (!dma_has_iommu(dma)) {
> +        /* Fast-path for no IOMMU */
> +        cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,

cast not needed, I think.

> +                               dir == DMA_DIRECTION_FROM_DEVICE);
> +        return 0;
> +    } else {
> +        return __dma_memory_rw(dma, addr, buf, len, dir);
> +    }
>  }
>  
>  static inline int dma_memory_read(DMAContext *dma, dma_addr_t addr,
> @@ -70,35 +124,55 @@ static inline int dma_memory_write(DMAContext *dma, 
> dma_addr_t addr,
>                           DMA_DIRECTION_FROM_DEVICE);
>  }
>  
> +int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len);
>  static inline int dma_memory_zero(DMAContext *dma, dma_addr_t addr,
>                                    dma_addr_t len)
>  {
> -    /* Stub version when we have no iommu support */
> -    cpu_physical_memory_zero(addr, len);
> -    return 0;
> +    if (!dma_has_iommu(dma)) {
> +        /* Fast-path for no IOMMU */
> +        cpu_physical_memory_zero(addr, len);
> +        return 0;
> +    } else {
> +        return __dma_memory_zero(dma, addr, len);
> +    }
>  }
>  
> +void *__dma_memory_map(DMAContext *dma,
> +                       DMAInvalidateMapFunc *cb, void *opaque,
> +                       dma_addr_t addr, dma_addr_t *len,
> +                       DMADirection dir);
>  static inline void *dma_memory_map(DMAContext *dma,
> -                                   DMAInvalidateMapFunc *cb, void *opaque,
> +                                   DMAInvalidateMapFunc *cb, void *cb_opaque,
>                                     dma_addr_t addr, dma_addr_t *len,
>                                     DMADirection dir)
>  {
> -    target_phys_addr_t xlen = *len;
> -    void *p;
> -
> -    p = cpu_physical_memory_map(addr, &xlen,
> -                                dir == DMA_DIRECTION_FROM_DEVICE);
> -    *len = xlen;
> -    return p;
> +    if (!dma_has_iommu(dma)) {
> +        target_phys_addr_t xlen = *len;
> +        void *p;
> +
> +        p = cpu_physical_memory_map(addr, &xlen,
> +                                    dir == DMA_DIRECTION_FROM_DEVICE);
> +        *len = xlen;
> +        return p;
> +    } else {
> +        return __dma_memory_map(dma, cb, cb_opaque, addr, len, dir);
> +    }
>  }

Are callbacks ever useful? I note that without an iommu, they are never
invoked. So how can a device use them?

>  
> +void __dma_memory_unmap(DMAContext *dma,
> +                        void *buffer, dma_addr_t len,
> +                        DMADirection dir, dma_addr_t access_len);
>  static inline void dma_memory_unmap(DMAContext *dma,
>                                      void *buffer, dma_addr_t len,
>                                      DMADirection dir, dma_addr_t access_len)
>  {
> -    return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
> -                                     dir == DMA_DIRECTION_FROM_DEVICE,
> -                                     access_len);
> +    if (!dma_has_iommu(dma)) {
> +        return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
> +                                         dir == DMA_DIRECTION_FROM_DEVICE,
> +                                         access_len);
> +    } else {
> +        __dma_memory_unmap(dma, buffer, len, dir, access_len);
> +    }
>  }
>  
>  #define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \
> @@ -139,6 +213,14 @@ DEFINE_LDST_DMA(q, q, 64, be);
>  
>  #undef DEFINE_LDST_DMA
>  
> +#ifdef CONFIG_IOMMU
> +
> +void dma_context_init(DMAContext *dma, DMATranslateFunc, void *opaque);
> +void dma_invalidate_memory_range(DMAContext *dma,
> +                                 dma_addr_t addr, dma_addr_t len);
> +
> +#endif /* CONFIG_IOMMU */
> +
>  struct ScatterGatherEntry {
>      dma_addr_t base;
>      dma_addr_t len;
> -- 
> 1.7.9

Reply via email to