On 29.09.2011, at 08:45, David Gibson wrote: > The pseries machine of qemu implements the TCE mechanism used as a > virtual IOMMU for the PAPR defined virtual IO devices. Because the > PAPR spec only defines a small DMA address space, the guest VIO > drivers need to update TCE mappings very frequently - the virtual > network device is particularly bad. This means many slow exits to > qemu to emulate the H_PUT_TCE hypercall. > > Sufficiently recent kernels allow this to be mitigated by implementing > H_PUT_TCE in the host kernel. To make use of this, however, qemu > needs to initialize the necessary TCE tables, and map them into itself > so that the VIO device implementations can retrieve the mappings when > they access guest memory (which is treated as a virtual DMA > operation). > > This patch adds the necessary calls to use the KVM TCE acceleration. > If the kernel does not support acceleration, or there is some other > error creating the accelerated TCE table, then it will still fall back > to full userspace TCE implementation. 
> > Signed-off-by: David Gibson <da...@gibson.dropbear.id.au> > --- > hw/spapr_vio.c | 8 ++++++- > hw/spapr_vio.h | 1 + > target-ppc/kvm.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ > target-ppc/kvm_ppc.h | 14 +++++++++++++ > 4 files changed, 76 insertions(+), 1 deletions(-) > > diff --git a/hw/spapr_vio.c b/hw/spapr_vio.c > index 35818e1..1da3032 100644 > --- a/hw/spapr_vio.c > +++ b/hw/spapr_vio.c > @@ -165,7 +165,13 @@ static void rtce_init(VIOsPAPRDevice *dev) > * sizeof(VIOsPAPR_RTCE); > > if (size) { > - dev->rtce_table = g_malloc0(size); > + dev->rtce_table = kvmppc_create_spapr_tce(dev->reg, > + dev->rtce_window_size, > + &dev->kvmtce_fd); > + > + if (!dev->rtce_table) { > + dev->rtce_table = g_malloc0(size); > + } > } > } > > diff --git a/hw/spapr_vio.h b/hw/spapr_vio.h > index 4fe5f74..a325a5f 100644 > --- a/hw/spapr_vio.h > +++ b/hw/spapr_vio.h > @@ -57,6 +57,7 @@ typedef struct VIOsPAPRDevice { > target_ulong signal_state; > uint32_t rtce_window_size; > VIOsPAPR_RTCE *rtce_table; > + int kvmtce_fd; > VIOsPAPR_CRQ crq; > } VIOsPAPRDevice; > > diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c > index 37ee902..866cf7f 100644 > --- a/target-ppc/kvm.c > +++ b/target-ppc/kvm.c > @@ -28,6 +28,7 @@ > #include "kvm_ppc.h" > #include "cpu.h" > #include "device_tree.h" > +#include "hw/sysbus.h" > #include "hw/spapr.h" > > #include "hw/sysbus.h" > @@ -58,6 +59,7 @@ static int cap_ppc_smt = 0; > #ifdef KVM_CAP_PPC_RMA > static int cap_ppc_rma = 0; > #endif > +static int cap_spapr_tce = false; > > /* XXX We have a race condition where we actually have a level triggered > * interrupt, but the infrastructure can't expose that yet, so the guest > @@ -87,6 +89,9 @@ int kvm_arch_init(KVMState *s) > #ifdef KVM_CAP_PPC_RMA > cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA); > #endif > +#ifdef KVM_CAP_SPAPR_TCE > + cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE); > +#endif > > if (!cap_interrupt_level) { > fprintf(stderr, "KVM: Couldn't find level 
irq capability. Expect the " > @@ -792,6 +797,55 @@ off_t kvmppc_alloc_rma(const char *name) > #endif > } > > +void *kvmppc_create_spapr_tce(target_ulong liobn, uint32_t window_size, int > *pfd) > +{ struct kvm_create_spapr_tce args = { > + .liobn = liobn, > + .window_size = window_size, > + }; > + long len; > + int fd; > + void *table; > + > + if (!cap_spapr_tce) { > + return NULL; > + } > + > + fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE, &args); > + if (fd < 0) { > + return NULL; > + } > + > + len = (window_size / SPAPR_VIO_TCE_PAGE_SIZE) * sizeof(VIOsPAPR_RTCE); > + /* FIXME: round this up to page size */ > + > + table = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); > + if (table == MAP_FAILED) { > + close(fd); > + return NULL; > + } > + > + *pfd = fd; > + return table; > +} > + > +int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t window_size)
Hrm. Is this ever called anywhere?

Alex