[Qemu-devel] [PATCH v3 0/3] IOMMU: Add Support to VFIO devices with vIOMMU present

2016-05-21 Thread Aviv B.D
From: "Aviv Ben-David" 

* Advertize Cache Mode capability in iommu cap register.
* Register every VFIO device with IOMMU state.
* On page cache invalidation in vIOMMU, check if the domain belong to
  VFIO device and mirror the guest requests to host.

Changes from v1 to v2:
* remove assumption that the cache does not clear
* fix lockup on high load.

Changes from v2 to v3:
* remove debug leftovers
* split into separate commits
* change is_write to flags in vtd_do_iommu_translate, add IOMMU_NO_FAIL 
  to suppress error propagating to guest.


Aviv Ben-David (3):
  IOMMU: add VTD_CAP_CM to vIOMMU capability exposed to guest
  IOMMU: change iommu_op->translate's is_write to flags, add support to
NO_FAIL flag mode
  IOMMU: Integrate between VFIO and vIOMMU to support device assignment

 exec.c |   2 +-
 hw/i386/intel_iommu.c  | 137 -
 hw/i386/intel_iommu_internal.h |   3 +
 hw/vfio/common.c   |  11 +++-
 include/exec/memory.h  |   4 +-
 include/hw/i386/intel_iommu.h  |   4 ++
 include/hw/vfio/vfio-common.h  |   1 +
 memory.c   |   2 +-
 8 files changed, 129 insertions(+), 35 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH v3 3/3] IOMMU: Integrate between VFIO and vIOMMU to support device assignment

2016-05-21 Thread Aviv B.D
From: "Aviv Ben-David" 

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 69 --
 hw/i386/intel_iommu_internal.h |  2 ++
 hw/vfio/common.c   | 11 +--
 include/hw/i386/intel_iommu.h  |  4 +++
 include/hw/vfio/vfio-common.h  |  1 +
 5 files changed, 81 insertions(+), 6 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 410f810..128ec7c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | 
VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif
 
+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
 uint64_t wmask, uint64_t w1cmask)
 {
@@ -126,6 +129,22 @@ static uint32_t vtd_set_clear_mask_long(IntelIOMMUState 
*s, hwaddr addr,
 return new_val;
 }
 
+static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn, 
uint16_t * domain_id)
+{
+VTDContextEntry ce;
+int ret_fr;
+
+assert(domain_id);
+
+ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+if (ret_fr){
+return -1;
+}
+
+*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
+return 0;
+}
+
 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
 uint64_t clear, uint64_t mask)
 {
@@ -724,9 +743,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, 
uint8_t bus_num,
 }
 
 if (!vtd_context_entry_present(ce)) {
-VTD_DPRINTF(GENERAL,
-"error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
-"is not present", devfn, bus_num);
 return -VTD_FR_CONTEXT_ENTRY_P;
 } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
(ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
@@ -1033,18 +1049,58 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState 
*s, uint16_t domain_id)
 &domain_id);
 }
 
+static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, uint16_t 
domain_id,
+  hwaddr addr, uint8_t am)
+{
+VFIOGuestIOMMU * giommu;
+
+QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
+VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, 
iommu);
+uint16_t vfio_domain_id; 
+int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, 
&vfio_domain_id);
+int i=0;
+if (!ret && domain_id == vfio_domain_id){
+IOMMUTLBEntry entry; 
+
+/* do vfio unmap */
+VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
+entry.target_as = NULL;
+entry.iova = addr & VTD_PAGE_MASK_4K;
+entry.translated_addr = 0;
+entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
+entry.perm = IOMMU_NONE;
+memory_region_notify_iommu(giommu->iommu, entry);
+   
+/* do vfio map */
+VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
+/* call to vtd_iommu_translate */
+for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
+IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, 
addr, IOMMU_NO_FAIL); 
+if (entry.perm != IOMMU_NONE){
+memory_region_notify_iommu(giommu->iommu, entry);
+}
+}
+}
+}
+}
+
 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
   hwaddr addr, uint8_t am)
 {
 VTDIOTLBPageInvInfo info;
 
 assert(am <= VTD_MAMV);
+
 info.domain_id = domain_id;
 info.addr = addr;
 info.mask = ~((1 << am) - 1);
+
 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+
+vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am);
 }
 
+
 /* Flush IOTLB
  * Returns the IOTLB Actual Invalidation Granularity.
  * @val: the content of the IOTLB_REG
@@ -1912,6 +1968,13 @@ static Property vtd_properties[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
+void vtd_register_giommu(VFIOGuestIOMMU * giommu)
+{
+VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, 
iommu);
+IntelIOMMUState *s = vtd_as->iommu_state;
+
+QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
+}
 
 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
 {
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index ae40f73..102e9a5 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -339,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
 #define VTD_PAGE_SHIFT_1G   30
 

[Qemu-devel] [PATCH v3 1/3] IOMMU: add VTD_CAP_CM to vIOMMU capability exposed to guest

2016-05-21 Thread Aviv B.D
From: "Aviv Ben-David" 

This flag tells the guest to invalidate tlb cache also after unmap operations.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 3 ++-
 hw/i386/intel_iommu_internal.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 347718f..1af8da8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1949,7 +1949,8 @@ static void vtd_init(IntelIOMMUState *s)
 s->iq_last_desc_type = VTD_INV_DESC_NONE;
 s->next_frcd_reg = 0;
 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
- VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
+ VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
+ VTD_CAP_CM;
 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
 vtd_reset_context_cache(s);
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index e5f514c..ae40f73 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -190,6 +190,7 @@
 #define VTD_CAP_MAMV(VTD_MAMV << 48)
 #define VTD_CAP_PSI (1ULL << 39)
 #define VTD_CAP_SLLPS   ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_CM  (1ULL << 7)
 
 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT 8
-- 
1.9.1




[Qemu-devel] [PATCH v3 2/3] IOMMU: change iommu_op->translate's is_write to flags, add support to NO_FAIL flag mode

2016-05-21 Thread Aviv B.D
From: "Aviv Ben-David" 

Supports translation trials without reporting error to guest on translation 
failures.

Signed-off-by: Aviv Ben-David 
---
 exec.c|  2 +-
 hw/i386/intel_iommu.c | 65 ---
 include/exec/memory.h |  4 ++--
 memory.c  |  2 +-
 4 files changed, 45 insertions(+), 28 deletions(-)

diff --git a/exec.c b/exec.c
index 2e363f0..028c8e0 100644
--- a/exec.c
+++ b/exec.c
@@ -431,7 +431,7 @@ MemoryRegion *address_space_translate(AddressSpace *as, 
hwaddr addr,
 break;
 }
 
-iotlb = mr->iommu_ops->translate(mr, addr, is_write);
+iotlb = mr->iommu_ops->translate(mr, addr, is_write ? IOMMU_WO : 
IOMMU_RO);
 addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 | (addr & iotlb.addr_mask));
 *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1af8da8..410f810 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -356,7 +356,7 @@ static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, 
uint16_t index)
 /* Must not update F field now, should be done later */
 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
 uint16_t source_id, hwaddr addr,
-VTDFaultReason fault, bool is_write)
+VTDFaultReason fault, IOMMUAccessFlags flags)
 {
 uint64_t hi = 0, lo;
 hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
@@ -365,7 +365,7 @@ static void vtd_record_frcd(IntelIOMMUState *s, uint16_t 
index,
 
 lo = VTD_FRCD_FI(addr);
 hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
-if (!is_write) {
+if (!(flags == IOMMU_WO || flags == IOMMU_RW)) {
 hi |= VTD_FRCD_T;
 }
 vtd_set_quad_raw(s, frcd_reg_addr, lo);
@@ -396,7 +396,7 @@ static bool vtd_try_collapse_fault(IntelIOMMUState *s, 
uint16_t source_id)
 /* Log and report an DMAR (address translation) fault to software */
 static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
   hwaddr addr, VTDFaultReason fault,
-  bool is_write)
+  IOMMUAccessFlags flags)
 {
 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
 
@@ -407,7 +407,7 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, 
uint16_t source_id,
 return;
 }
 VTD_DPRINTF(FLOG, "sid 0x%"PRIx16 ", fault %d, addr 0x%"PRIx64
-", is_write %d", source_id, fault, addr, is_write);
+", flags %d", source_id, fault, addr, flags);
 if (fsts_reg & VTD_FSTS_PFO) {
 VTD_DPRINTF(FLOG, "new fault is not recorded due to "
 "Primary Fault Overflow");
@@ -425,7 +425,7 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, 
uint16_t source_id,
 return;
 }
 
-vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
+vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, flags);
 
 if (fsts_reg & VTD_FSTS_PPF) {
 VTD_DPRINTF(FLOG, "there are pending faults already, "
@@ -621,7 +621,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t 
level)
 /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
-static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
+static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, 
IOMMUAccessFlags flags,
 uint64_t *slptep, uint32_t *slpte_level,
 bool *reads, bool *writes)
 {
@@ -641,7 +641,20 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 }
 
 /* FIXME: what is the Atomics request here? */
-access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
+switch(flags){
+case IOMMU_WO:
+access_right_check = VTD_SL_W;
+break;
+case IOMMU_RO:
+access_right_check = VTD_SL_R;
+break;
+case IOMMU_RW: /* passthrow */
+case IOMMU_NO_FAIL:
+access_right_check = VTD_SL_R | VTD_SL_W;
+break;
+default:
+assert(0);
+}
 
 while (true) {
 offset = vtd_gpa_level_offset(gpa, level);
@@ -663,8 +676,8 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 if (!(slpte & access_right_check)) {
 VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
 "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
-(is_write ? "write" : "read"), gpa, slpte);
-return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
+(flags == IOMMU_WO ? "write" : "read"), gpa, slpte);
+return (flags == IOMMU_RW || flags == IOMMU_WO) 

Re: [Qemu-devel] [PATCH v3 3/3] IOMMU: Integrate between VFIO and vIOMMU to support device assignment

2016-05-28 Thread Aviv B.D.
Hi,
Your idea to search the relevant VTDAddressSpace and call its notifier
will
probably work. Next week I'll try to implement it (for now with the costly
scan
of each context).
I'm still not sure if populating the MemoryRegion will suffice for a
hot-plugged vfio
device, but I'll try to look into it.

As far as I understand the memory_region_iommu_replay function, it still
scans
the whole 64bit address space, and therefore may hang the VM for a long
time.

Aviv.

On Thu, May 26, 2016 at 11:58 PM Alex Williamson 
wrote:

> On Mon, 23 May 2016 11:53:42 -0600
> Alex Williamson  wrote:
>
> > On Sat, 21 May 2016 19:19:50 +0300
> > "Aviv B.D"  wrote:
> >
> > > From: "Aviv Ben-David" 
> > >
> >
> > Some commentary about the changes necessary to achieve $SUBJECT would
> > be nice here.
> >
> > > Signed-off-by: Aviv Ben-David 
> > > ---
> > >  hw/i386/intel_iommu.c  | 69
> --
> > >  hw/i386/intel_iommu_internal.h |  2 ++
> > >  hw/vfio/common.c   | 11 +--
> > >  include/hw/i386/intel_iommu.h  |  4 +++
> > >  include/hw/vfio/vfio-common.h  |  1 +
> > >  5 files changed, 81 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > index 410f810..128ec7c 100644
> > > --- a/hw/i386/intel_iommu.c
> > > +++ b/hw/i386/intel_iommu.c
> > > @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> VTD_DBGBIT(CSR);
> > >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> > >  #endif
> > >
> > > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t
> bus_num,
> > > +uint8_t devfn, VTDContextEntry
> *ce);
> > > +
> > >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
> val,
> > >  uint64_t wmask, uint64_t w1cmask)
> > >  {
> > > @@ -126,6 +129,22 @@ static uint32_t
> vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
> > >  return new_val;
> > >  }
> > >
> > > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t devfn, uint16_t * domain_id)
> > > +{
> > > +VTDContextEntry ce;
> > > +int ret_fr;
> > > +
> > > +assert(domain_id);
> > > +
> > > +ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > > +if (ret_fr){
> > > +return -1;
> > > +}
> > > +
> > > +*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> > > +return 0;
> > > +}
> > > +
> > >  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr
> addr,
> > >  uint64_t clear, uint64_t mask)
> > >  {
> > > @@ -724,9 +743,6 @@ static int
> vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> > >  }
> > >
> > >  if (!vtd_context_entry_present(ce)) {
> > > -VTD_DPRINTF(GENERAL,
> > > -"error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > > -"is not present", devfn, bus_num);
> > >  return -VTD_FR_CONTEXT_ENTRY_P;
> > >  } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> > > (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > > @@ -1033,18 +1049,58 @@ static void
> vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
> > >  &domain_id);
> > >  }
> > >
> > > +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s,
> uint16_t domain_id,
> > > +  hwaddr addr, uint8_t am)
> > > +{
> > > +VFIOGuestIOMMU * giommu;
> > > +
> >
> > VT-d parsing VFIO private data structures, nope this is not a good idea.
> >
> > > +QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > > +VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> >
> > VT-d needs to keep track of its own address spaces and call the iommu
> > notifier, it should have no visibility whatsoever that there are vfio
> > devices attached.
> >
> > > +uint16_t vfio_domain_id;
> > > +int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn, &vfio_domain_id);
> > &

Re: [Qemu-devel] [PATCH v3 3/3] IOMMU: Integrate between VFIO and vIOMMU to support device assignment

2016-05-28 Thread Aviv B.D.
On Sat, May 28, 2016 at 7:02 PM Alex Williamson 
wrote:

> On Sat, 28 May 2016 10:52:58 +
> "Aviv B.D."  wrote:
>
> > Hi,
> > Your idea to search the relevent VTDAddressSpace and call it's notifier
> > will
> > probably work. Next week I'll try to implement it (for now with the
> costly
> > scan
> > of each context).
>
> I think an optimization we can make is to use pci_for_each_bus() and
> pci_for_each_device() to scan only context entries where devices are
> present.  Then for each context entry, retrieve the DID, if it matches
> the invalidation domain_id, retrieve the VTDAddressSpace and perform a
> memory_region_notify_iommu() using VTDAddressSpace.iommu.  Still
> horribly inefficient, but an improvement over walking all context
> entries and avoids gratuitous callbacks between unrelated drivers in
> QEMU.
>

Thanks for the references on how I can do it. :)

>
> Overall, I have very little faith that this will be the only change
> required to make this work though.  For instance, if a device is added
> or removed from a domain, where is that accounted for?  Ideally this
> should trigger the region_add/region_del listener callbacks, but I
> don't see how that works with how VT-d creates a fixed VTDAddressSpace
> per device, and in fact how our QEMU memory model doesn't allow the
> address space of a device to be dynamically aliased against other
> address spaces or really changed at all.
>
> > I still not sure if populating the MemoryRegion will suffice for hot plug
> > vfio
> > device but i'll try to look into it.
> >
> > As far as I understand the memory_region_iommu_replay function, it still
> > scans
> > the whole 64bit address space, and therefore may hang the VM for a long
> > time.
>
> Then we need to fix that problem, one option might be to make a replay
> callback on MemoryRegionIOMMUOps that walks the page tables for a given
> context entry rather than blindly traversing a 64bit address space.  We
> can't simply ignore the issue by #ifdef'ing out the code.  I suspect
> there's a lot more involved to make VT-d interact properly with a
> physical device than what's been proposed so far.  At every
> invalidation, we need to figure out what's changed and update the host
> mappings.  We also need better, more dynamic address space management
> to make the virtual hardware reflect physical hardware when we enable
> things like passthrough mode or have multiple devices sharing an iommu
> domain.  I think we're just barely scratching the surface here.  Thanks,
>
> Alex
>


I agree with you regarding hotplug, therefore I only ifdef'ed this code out
and didn't
delete it. With the call to memory_region_iommu_replay QEMU hangs on startup
with a very long loop that prevents any device assignment with vIOMMU
enabled.

I'm hoping not to enlarge the scope of this patch to include hotplug device
assignment
with iommu enabled.

Thanks,
Aviv.


Re: [Qemu-devel] [PATCH v3 1/3] IOMMU: add VTD_CAP_CM to vIOMMU capability exposed to guest

2016-05-28 Thread Aviv B.D.
What is the best way to add this configuration option?

Aviv.

On Tue, May 24, 2016 at 12:25 PM Jan Kiszka  wrote:

> On 2016-05-24 10:14, Jason Wang wrote:
> > On 2016年05月22日 00:19, Aviv B.D wrote:
> >> From: "Aviv Ben-David" 
> >>
> >> This flag tells the guest to invalidate tlb cache also after unmap
> >> operations.
> >>
> >> Signed-off-by: Aviv Ben-David 
> >> ---
> >
> > Is this a guest visible behavior?  If yes, shouldn't we cache
> > translation fault conditions in IOTLB?
>
> It is guest visible, and this first of all means, besides requiring to
> be optional, that it definitely needs to be off for compat systems. Or
> it stays off by default.
>
> Jan
>
> --
> Siemens AG, Corporate Technology, CT RDA ITP SES-DE
> Corporate Competence Center Embedded Linux
>


Re: [Qemu-devel] [PATCH v3 3/3] IOMMU: Integrate between VFIO and vIOMMU to support device assignment

2016-05-28 Thread Aviv B.D.
Hi,
As far as I tested, the disabled code (the call to memory_region_iommu_replay)
hangs
QEMU on startup if IOMMU is enabled (scanning the 64-bit address space takes
more
than an hour on modern hardware), at least on x86 hardware. So the code is
not 100%
correct for any context. Maybe it just should be disabled for x86
architecture?

By specification any such behavior of applying a domain to device should
include
cache invalidation if CM flag is present so I'm not thinking that my patch
break
this scenario.

Thanks,
Aviv.

On Sat, May 28, 2016 at 8:39 PM Alex Williamson 
wrote:

> On Sat, 28 May 2016 16:10:55 +
> "Aviv B.D."  wrote:
>
> > On Sat, May 28, 2016 at 7:02 PM Alex Williamson <
> alex.william...@redhat.com>
> > wrote:
> >
> > > On Sat, 28 May 2016 10:52:58 +
> > > "Aviv B.D."  wrote:
> > >
> > > > Hi,
> > > > Your idea to search the relevent VTDAddressSpace and call it's
> notifier
> > > > will
> > > > probably work. Next week I'll try to implement it (for now with the
> > > costly
> > > > scan
> > > > of each context).
> > >
> > > I think an optimization we can make is to use pci_for_each_bus() and
> > > pci_for_each_device() to scan only context entries where devices are
> > > present.  Then for each context entry, retrieve the DID, if it matches
> > > the invalidation domain_id, retrieve the VTDAddressSpace and perform a
> > > memory_region_notify_iommu() using VTDAddressSpace.iommu.  Still
> > > horribly inefficient, but an improvement over walking all context
> > > entries and avoids gratuitous callbacks between unrelated drivers in
> > > QEMU.
> > >
> >
> > Thanks for the references on how I can do it. :)
> >
> > >
> > > Overall, I have very little faith that this will be the only change
> > > required to make this work though.  For instance, if a device is added
> > > or removed from a domain, where is that accounted for?  Ideally this
> > > should trigger the region_add/region_del listener callbacks, but I
> > > don't see how that works with how VT-d creates a fixed VTDAddressSpace
> > > per device, and in fact how our QEMU memory model doesn't allow the
> > > address space of a device to be dynamically aliased against other
> > > address spaces or really changed at all.
> > >
> > > > I still not sure if populating the MemoryRegion will suffice for hot
> plug
> > > > vfio
> > > > device but i'll try to look into it.
> > > >
> > > > As far as I understand the memory_region_iommu_replay function, it
> still
> > > > scans
> > > > the whole 64bit address space, and therefore may hang the VM for a
> long
> > > > time.
> > >
> > > Then we need to fix that problem, one option might be to make a replay
> > > callback on MemoryRegionIOMMUOps that walks the page tables for a given
> > > context entry rather than blindly traversing a 64bit address space.  We
> > > can't simply ignore the issue by #ifdef'ing out the code.  I suspect
> > > there's a lot more involved to make VT-d interact properly with a
> > > physical device than what's been proposed so far.  At every
> > > invalidation, we need to figure out what's changed and update the host
> > > mappings.  We also need better, more dynamic address space management
> > > to make the virtual hardware reflect physical hardware when we enable
> > > things like passthrough mode or have multiple devices sharing an iommu
> > > domain.  I think we're just barely scratching the surface here.
> Thanks,
> > >
> > > Alex
> > >
> >
> >
> > I agree with you regarding hotplug, therefore I only ifdef this code out
> > and didn't
> > delete it. With the call to memory_region_iommu_replay QEMU hangs on
> startup
> > with a very long loop that prevent any device assignment  with vIOMMU
> > enabled.
> >
> > I'm hoping not to enlarge the scope of this patch to include hotplug
> device
> > assignment
> > with iommu enabled.
>
> It's not just hotplug, any case where an existing domain can be applied
> to a device.  The series is incomplete without such support and I won't
> accept any changes into vfio that disables code that's correct in other
> contexts.  Thanks,
>
> Alex
>


Re: [Qemu-devel] [PATCH v3 3/3] IOMMU: Integrate between VFIO and vIOMMU to support device assignment

2016-06-02 Thread Aviv B.D.
Hi,

In the case of hot-plugging a vfio device, there should not be any active
mapping to this device prior to the device addition. Also, before it is added
to a guest, the guest should not attach the device to any domain as the
device is not present.
With CM enabled the guest must invalidate the domain or individual mappings
that belong to this new device before any use of those maps.

I'm still not sure that this functionality is necessary on x86, and
currently there
is a scenario (for x86) that uses this functionality.

Thanks,
Aviv.

On Sat, May 28, 2016 at 10:48 PM Alex Williamson 
wrote:

> On Sat, 28 May 2016 18:14:18 +
> "Aviv B.D."  wrote:
>
> > Hi,
> > As far as I tested the disabled code (call to memory_region_iommu_replay)
> > hangup
> > QEMU on startup if IOMMU is enabled (scaning 64 bit address space takes
> > more
> > than an hour on modern hardware) , at least on x86 hardware. So the code
> is
> > not 100%
> > correct for any context. Maybe it just should be disabled for x86
> > architecture?
> >
> > By specification any such behavior of applying a domain to device should
> > include
> > cache invalidation if CM flag is present so I'm not thinking that my
> patch
> > break
> > this scenario.
>
> The functionality is completely necessary, imagine moving a device from
> an IOMMU API domain in the guest back to the passthrough domain, if
> there is no replay of the IOMMU context, the device cannot perform any
> DMA at all.  The current replay mechanism is obviously not designed for
> iterating over every page of a 64bit address space, which is why I
> suggest a replay callback on MemoryRegionIOMMUOps so that VT-d can
> optimize the replay by walking the VT-d page tables and perhaps
> implementation of hardware passthrough mode and the ability to
> dynamically switch a device to address_space_memory.  The current
> replay code is correct and functional in a context with a window based
> IOMMU where the IOMMU address space is much smaller.  We cannot have
> correct operation without a mechanism to rebuild the host IOMMU context
> when a device is switched to a new domain.  Please address it.  Thanks,
>
> Alex
>


Re: [Qemu-devel] [PATCH v4 RESEND 3/3] IOMMU: enable intel_iommu map and unmap notifiers

2016-10-24 Thread Aviv B.D.
On Fri, Oct 21, 2016 at 6:57 AM, Peter Xu  wrote:

> On Thu, Oct 20, 2016 at 10:11:15PM +0300, Aviv B.D. wrote:
>
> [...]
>
> > > > +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
> > > > +   uint16_t domain_id,
> hwaddr
> > > addr,
> > > > +   uint8_t am)
> > > > +{
> > >
> > > The logic of this function looks strange to me.
> > >
> > > > +IntelIOMMUNotifierNode *node;
> > > > +
> > > > +QLIST_FOREACH(node, &(s->notifiers_list), next) {
> > > > +VTDAddressSpace *vtd_as = node->vtd_as;
> > > > +uint16_t vfio_domain_id;
> > > > +int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> > > vtd_as->devfn,
> > > > +  &vfio_domain_id);
> > > > +if (!ret && domain_id == vfio_domain_id) {
> > > > +IOMMUTLBEntry entry;
> > > > +
> > > > +/* notify unmap */
> > > > +if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
> > >
> > > First of all, if we are talking about VFIO, notifier_flag should
> > > always be MAP|UNMAP. So in that case, for newly mapped entries, looks
> > > like we will first send an UNMAP, then a MAP?
> > >
> >
> > You are correct, there is no valid reason to have notifier_flag other
> than
> > MAP|UNMAP
> > at least for VFIO.
>
> Not sure whether you know about upstream vhost work, vhost is going to
> support Intel IOMMU, in that case, it will only register to UNMAP
> notifier, not MAP.
>
> > I'm not sure if in the feature there won't be good reason to do
> otherwise,
> > so my
> > code support this scenario...
>
> My point is, I think you should not send this notify unconditionally.
> IMO the logic should be simpler here, like:
>
> foreach (page in invalidation range) {
> IOTLBEntry entry = m->iommu_ops.translate();
> if (entry.perm != IOMMU_NONE && notifier_flag & NOTIIFER_MAP) {
> /* this is map, user requested MAP flag */
> memory_region_iommu_notify(entry);
> } else if (entry.perm == IOMMU_NONE && notifier_flag &
> NOTIFIER_UNMAP) {
> /* this is unmap, user requested UNMAP */
> entry = ... /* setup the entry */
> memory_region_iommu_notify(entry);
> }
> }
>

This was my first algorithm, but VFIO does not support remapping of a mapped
page.
Before each MAP operation in VFIO one must do an unmap, and therefore I'm
sending
the unmap notifications blindly beforehand.
I can rearrange my code closer to your suggestion.


>
> This is to follow your logic. I don't know whether this is efficient
> enough, maybe good for the first version. The problem is, when you
> call translate(), you will need to go over the page every time from
> root dir. A faster way may be: provide a function to walk specific
> address range. If you are going to implement the replay logic that
> Alex/David has mentioned, maybe that will help too (walk over the
> whole 64bit range).
>
> Interesting idea, but I prefer to add it in separate patch set after this
one committed, if it's OK.


> >
> >
> > > > +VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask
> %d",
> > > > +addr, am);
> > > > +entry.target_as = &address_space_memory;
> > > > +entry.iova = addr & VTD_PAGE_MASK_4K;
> > > > +entry.translated_addr = 0;
> > > > +entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K
> +
> > > am);
> > > > +entry.perm = IOMMU_NONE;
> > > > +memory_region_notify_iommu(&node->vtd_as->iommu,
> > > entry);
> > > > +}
> > > > +
> > > > +/* notify map */
> > > > +if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
> > > > +hwaddr original_addr = addr;
> > > > +VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask
> %d",
> > > addr, am);
> > > > +while (addr < original_addr + (1 << am) *
> > > VTD_PAGE_SIZE) {
> > > > +/* call to vtd_iommu_translate */
> > > > +  

Re: [Qemu-devel] [PATCH v4 RESEND 3/3] IOMMU: enable intel_iommu map and unmap notifiers

2016-10-25 Thread Aviv B.D.
On Mon, Oct 24, 2016 at 11:02 AM, Peter Xu  wrote:

> On Mon, Oct 24, 2016 at 10:53:01AM +0300, Aviv B.D. wrote:
>
> [...]
>
> > This was my first algorithm, but VFIO do not support remapping of mapped
> > page.
> > Before each MAP operation in VFIO one must do unmap, and therefore I'm
> > sending
> > the unmap notifications blindly before.
> > I can rearrange my code closer to your suggestion.
>
> If so, I would suggest we solve the real problem first: we should not
> notify VFIO twice on map(), but only once. IMO either Alex's or
> David's suggestion (in the other mail) is a good start.
>
> OK. I will publish a new patch set that notifies only once per page.
I prefer David's suggestion - adding the range information to the notifier
struct
and check it from inside the VFIO notification function.


> >
> >
> > >
> > > This is to follow your logic. I don't know whether this is efficient
> > > enough, maybe good for the first version. The problem is, when you
> > > call translate(), you will need to go over the page every time from
> > > root dir. A faster way may be: provide a function to walk specific
> > > address range. If you are going to implement the replay logic that
> > > Alex/David has mentioned, maybe that will help too (walk over the
> > > whole 64bit range).
> > >
> > > Interesting idea, but I prefer to add it in separate patch set after
> this
> > one committed, if it's OK.
>
> Sure.
>
> -- peterx
>

Thanks,
Aviv.


Re: [Qemu-devel] [PATCH v5 0/3] IOMMU: intel_iommu support map and unmap notifications

2016-10-30 Thread Aviv B.D.
I will fix the warning in apb.c...

Can you post your qemu build configuration?

Thanks,
Aviv.

On Wed, Oct 26, 2016 at 6:37 PM, Chao Gao  wrote:

> On Fri, Oct 21, 2016 at 12:07:18AM +0300, Aviv B.D wrote:
> >From: "Aviv Ben-David" 
> >
> >* Advertize Cache Mode capability in iommu cap register.
> >  This capability is controlled by "cache-mode" property of intel-iommu
> device.
> >  To enable this option call QEMU with "-device
> intel-iommu,cache-mode=true".
> >
> >* On page cache invalidation in intel vIOMMU, check if the domain belong
> to
> >  registered notifier, and notify accordingly.
> >
> >Currently this patch still doesn't enabling VFIO devices support with
> vIOMMU
> >present. Current problems:
> >* vfio_iommu_map_notify is not aware about memory range belong to specific
> >  VFIOGuestIOMMU.
> >* memory_region_iommu_replay hangs QEMU on start up while it iterates over
> >  64bit address space. Commenting out the call to this function enables
> >  workable VFIO device while vIOMMU present.
> >
> After applying the patch series based on commit ede0cbeb, I run the
> following cmd:
> modprobe vfio-pci
> echo 8086 1528 > /sys/bus/pci/drivers/vfio-pci/new_id
> echo ":03:00.1" > /sys/bus/pci/devices/\:03\:00.1/driver/unbind
> echo 8086 1528 > /sys/bus/pci/drivers/vfio-pci/new_id
> to prepare for assigning nic device.
> After that, I try to boot a guest by
> qemu-system-x86_64 -boot c -m 4096 -monitor pty -serial stdio --enable-kvm
> \
> -M kernel-irqchip=split -bios bios.bin -smp cpus=288 -hda vdisk.img \
> -machine q35 -device intel-iommu,intremap=on,eim=on,cache-mode=true \
> -net none -device vfio-pci,host=03:00.1
> and however encounter this error:
> qemu-system-x86_64: -device vfio-pci,host=03:00.1: iommu map to non memory
> area e258e000
> qemu-system-x86_64: -device vfio-pci,host=03:00.1: iommu map to non memory
> area e258f000
> qemu-system-x86_64: -device vfio-pci,host=03:00.1: iommu map to non memory
> area e259
>
> Do i make some mistakes in this test? How to correct it?
>
> by the way, there is a build error:
> qemu/hw/pci-host/apb.c:326:5: error: initialization from incompatible
> pointer type [-Werror]
>  .translate = pbm_translate_iommu,
>  ^
> qemu/hw/pci-host/apb.c:326:5: error: (near initialization for
> \u2018pbm_iommu_ops.translate\u2019) [-Werror]
> cc1: all warnings being treated as errors
> make: *** [hw/pci-host/apb.o] Error 1
> >
> >--
> >1.9.1
> >
> >
>


[Qemu-devel] [PATCH RFC v2] IOMMU: Add Support to VFIO devices with vIOMMU present

2016-04-09 Thread Aviv B.D.
From: "Aviv Ben-David" 
Date: Tue, 23 Feb 2016 00:24:54 +0200
Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present

* Fix bug that prevent qemu from starting up with vIOMMU and VFIO
  device are present.
* Advertize Cache Mode capability in iommu cap register.
* Register every VFIO device with IOMMU state.
* On page cache invalidation in vIOMMU, check if the domain belong to
  VFIO device and mirror the guest requests to host.

Changes from previous versions:
* remove assumption that the cache do not clears
* fix lock up on high load.
* refactor vtd_get_did_dev to return success return code, and actual
domain_id via argument.

Tested only on network cards (also with multiple cards at once).

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 113 +++--
 hw/i386/intel_iommu_internal.h |   3 ++
 hw/vfio/common.c   |  12 +++--
 include/exec/memory.h  |   8 ++-
 include/hw/i386/intel_iommu.h  |   4 ++
 include/hw/vfio/vfio-common.h  |   1 +
 6 files changed, 121 insertions(+), 20 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 347718f..a568181 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif

+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
 uint64_t wmask, uint64_t w1cmask)
 {
@@ -126,6 +129,22 @@ static uint32_t
vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
 return new_val;
 }

+static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
uint8_t devfn, uint16_t * domain_id)
+{
+VTDContextEntry ce;
+int ret_fr;
+
+assert(domain_id);
+
+ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+if (ret_fr){
+return -1;
+}
+
+*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
+return 0;
+}
+
 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
 uint64_t clear, uint64_t mask)
 {
@@ -621,7 +640,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte,
uint32_t level)
 /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
-static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
+static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
IOMMUAccessPermissions is_write,
 uint64_t *slptep, uint32_t *slpte_level,
 bool *reads, bool *writes)
 {
@@ -641,7 +660,19 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
uint64_t gpa, bool is_write,
 }

 /* FIXME: what is the Atomics request here? */
-access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
+switch(is_write){
+case IOMMU_WRITE:
+access_right_check = VTD_SL_W;
+break;
+case IOMMU_READ:
+access_right_check = VTD_SL_R;
+break;
+case IOMMU_ANY:
+access_right_check = VTD_SL_R | VTD_SL_W;
+break;
+default:
+assert(0);
+}

 while (true) {
 offset = vtd_gpa_level_offset(gpa, level);
@@ -711,9 +742,9 @@ static int
vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
 }

 if (!vtd_context_entry_present(ce)) {
-VTD_DPRINTF(GENERAL,
+/*VTD_DPRINTF(GENERAL,
 "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
-"is not present", devfn, bus_num);
+"is not present", devfn, bus_num);*/
 return -VTD_FR_CONTEXT_ENTRY_P;
 } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
(ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
@@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
  * @entry: IOMMUTLBEntry that contain the addr to be translated and result
  */
 static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
-   uint8_t devfn, hwaddr addr, bool is_write,
+   uint8_t devfn, hwaddr addr,
IOMMUAccessPermissions is_write,
IOMMUTLBEntry *entry)
 {
 IntelIOMMUState *s = vtd_as->iommu_state;
@@ -848,12 +879,14 @@ static void
vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
 if (ret_fr) {
 ret_fr = -ret_fr;
-if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
-VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
+if (is_write != IOMMU_ANY){
+if (is_fpd_set && vtd_is_qualified_fault(ret_fr)

Re: [Qemu-devel] [PATCH RFC v2] IOMMU: Add Support to VFIO devices with vIOMMU present

2016-04-16 Thread Aviv B.D.
see comments below.

Thanks for your inputs,
Aviv

On Mon, Apr 11, 2016 at 6:40 AM, Peter Xu  wrote:
> Hi, Aviv,
>
> On Sat, Apr 09, 2016 at 09:03:38PM +0300, Aviv B.D. wrote:
>
> [...]
>
>> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
>> uint8_t devfn, uint16_t * domain_id)
>
> It seems that there are two lines above, however what I feel is that
> this is a long line splitted by the email client or something
> else... are you sending the patch using git-send-email? Not sure
> whether this would be a problem.
>
> I see the same thing in previous patches too.

You are correct, next time I'll use git-send-email.

>
> [...]
>
>> @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
>>   * @entry: IOMMUTLBEntry that contain the addr to be translated and result
>>   */
>>  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>> -   uint8_t devfn, hwaddr addr, bool 
>> is_write,
>> +   uint8_t devfn, hwaddr addr,
>> IOMMUAccessPermissions is_write,
>
> First of all, if we are to change this "is_write" into a permission,
> I would prefer change it's name too from "is_write" to "perm" or
> else, so that people would know this is not a boolean any more.
>
> Secondly, I assume you have not handled all the is_write uses below,
> right? So the code seems not consistent. Many places are still using
> it as boolean (I suppose).
>
You are correct, I'll fix those locations to use the correct enum.
The translate function returns entry.perm == IOMMU_NONE if no mapping
exists, and this is a valid case. Actually, the kernel may signal
that the hardware should invalidate a chunk of 4 consecutive pages
with one page in the middle that is in an invalid state. I want to skip
those pages (in the mapping stage of the function).

>
> - Here, we do the remap for every 4K, guess even with huge
>   pages. Better way to do? A stupid one from me: take special care
>   for am == 9, 18 cases?

You are correct, I want the code firstly to work correctly (even if a
bit slower than optimal). I'll try to include an optimizations for
this in the next version (but currently I can't promise that I'll have
the time).

>
> - Is it possible that multiple devices use same domain ID? Not
>   sure. If so, we will always map/unmap the same address twice?
>

Yes. Domains can be shared by more than one device, but if QEMU assigns
those devices to the same domain on the host, the map and unmap will happen
only once. As far as I understand the VFIO code in QEMU, it always tries
to assign all of those devices to the same domain on the host. Therefore
this problem doesn't exist.

> [...]
>
>>  static void vfio_listener_region_add(MemoryListener *listener,
>>   MemoryRegionSection *section)
>>  {
>> @@ -344,6 +347,7 @@ static void
>> vfio_listener_region_add(MemoryListener *listener,
>>  iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>>  llend = int128_make64(section->offset_within_address_space);
>>  llend = int128_add(llend, section->size);
>> +llend = int128_add(llend, int128_exts64(-1));
>
> It seems that Bandan has fixed this. Please try pull latest master
> branch and check commit 55efcc537d330. If so, maybe we need to
> rebase?

Will do.
IOMMU_ANY is a flag to suppress error reports to the guest kernel in case
of a non-existing mapping, so in this case I'm not sure that this enum
correctly represents this intent.

>
> Thanks.
>
> -- peterx



Re: [Qemu-devel] [PATCH RFC v2] IOMMU: Add Support to VFIO devices with vIOMMU present

2016-04-16 Thread Aviv B.D.
See my comments below,
Thanks,
Aviv.

On Mon, Apr 11, 2016 at 11:25 PM, Alex Williamson
 wrote:
>
> Some more detailed comments now that I have some faith that the host
> IOMMU domain is working correctly...
>
> On Sat, 9 Apr 2016 21:03:38 +0300
> "Aviv B.D."  wrote:
>
>> From: "Aviv Ben-David" 
>> Date: Tue, 23 Feb 2016 00:24:54 +0200
>> Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present
>>
>> * Fix bug that prevent qemu from starting up with vIOMMU and VFIO
>>   device are present.
>> * Advertize Cache Mode capability in iommu cap register.
>> * Register every VFIO device with IOMMU state.
>> * On page cache invalidation in vIOMMU, check if the domain belong to
>>   VFIO device and mirror the guest requests to host.
>>
>> Changes from previous versions:
>> * remove assumption that the cache do not clears
>> * fix lock up on high load.
>> * refactor vtd_get_did_dev to return success return code, and actual
>> domain_id via argument.
>>
>> Tested only on network cards (also with multiple cards at once).
>>
>> Signed-off-by: Aviv Ben-David 
>> ---
>>  hw/i386/intel_iommu.c  | 113 
>> +++--
>>  hw/i386/intel_iommu_internal.h |   3 ++
>>  hw/vfio/common.c   |  12 +++--
>>  include/exec/memory.h  |   8 ++-
>>  include/hw/i386/intel_iommu.h  |   4 ++
>>  include/hw/vfio/vfio-common.h  |   1 +
>>  6 files changed, 121 insertions(+), 20 deletions(-)
>>
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index 347718f..a568181 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
>> VTD_DBGBIT(CSR);
>>  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>>  #endif
>>
>> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>> +uint8_t devfn, VTDContextEntry *ce);
>> +
>>  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>>  uint64_t wmask, uint64_t w1cmask)
>>  {
>> @@ -126,6 +129,22 @@ static uint32_t
>> vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
>>  return new_val;
>>  }
>>
>> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
>> uint8_t devfn, uint16_t * domain_id)
>> +{
>> +VTDContextEntry ce;
>> +int ret_fr;
>> +
>> +assert(domain_id);
>> +
>> +ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
>> +if (ret_fr){
>> +return -1;
>> +}
>> +
>> +*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
>> +return 0;
>> +}
>> +
>>  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>>  uint64_t clear, uint64_t mask)
>>  {
>> @@ -621,7 +640,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte,
>> uint32_t level)
>>  /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
>>   * of the translation, can be used for deciding the size of large page.
>>   */
>> -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool 
>> is_write,
>> +static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
>> IOMMUAccessPermissions is_write,
>
> "is_write" is binary, yes/no, IOMMUAccessPermissions clearly has more
> states.  This should change to "flags" or something and should use
> existing IOMMUAccessFlags rather than defining something new.  This
> should be done in a separate patch that doesn't introduce new
> functionality otherwise.

OK, I will do it.

>
>>  uint64_t *slptep, uint32_t *slpte_level,
>>  bool *reads, bool *writes)
>>  {
>> @@ -641,7 +660,19 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
>> uint64_t gpa, bool is_write,
>>  }
>>
>>  /* FIXME: what is the Atomics request here? */
>> -access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
>> +switch(is_write){
>> +case IOMMU_WRITE:
>> +access_right_check = VTD_SL_W;
>> +break;
>> +case IOMMU_READ:
>> +access_right_check = VTD_SL_R;
>> +break;
>> +case IOMMU_ANY:
>> +access_right_check = VTD_SL_R | VTD_SL_W;
>> +break;
>> +default:
>> +assert(0);
&g

[Qemu-devel] [PATCH][RFC] IOMMU: Add Support to VFIO devices with vIOMMU present

2016-03-12 Thread Aviv B.D.
From: "Aviv B.D." 

 * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
device are present.
 * Advertise Cache Mode capability in iommu cap register.
 * Register every VFIO device with IOMMU state.
 * On page cache invalidation in vIOMMU, check if the domain belong to
   VFIO device and mirror the guest requests to host.

Not working (Yet!):
 * Tested only with network interface card (ixgbevf) and
intel_iommu=strict in guest's kernel command line.
 * Lock up under high load.
 * Errors on guest poweroff.
 * High relative latency compare to VFIO without IOMMU.

Signed-off-by: Aviv B.D. 
---
 hw/i386/intel_iommu.c  | 76
++
 hw/i386/intel_iommu_internal.h |  1 +
 hw/vfio/common.c   | 12 +--
 include/hw/i386/intel_iommu.h  |  4 +++
 include/hw/vfio/vfio-common.h  |  1 +
 5 files changed, 85 insertions(+), 9 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 347718f..046688f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -43,6 +44,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif

+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
 uint64_t wmask, uint64_t w1cmask)
 {
@@ -126,6 +130,19 @@ static uint32_t
vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
 return new_val;
 }

+static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
uint8_t devfn)
+{
+VTDContextEntry ce;
+int ret_fr;
+
+ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+if (ret_fr){
+return -1;
+}
+
+return VTD_CONTEXT_ENTRY_DID(ce.hi);
+}
+
 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
 uint64_t clear, uint64_t mask)
 {
@@ -711,9 +728,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s,
uint8_t bus_num,
 }

 if (!vtd_context_entry_present(ce)) {
-VTD_DPRINTF(GENERAL,
+/*VTD_DPRINTF(GENERAL,
 "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
-"is not present", devfn, bus_num);
+"is not present", devfn, bus_num);*/
 return -VTD_FR_CONTEXT_ENTRY_P;
 } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
(ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
@@ -1020,14 +1037,53 @@ static void
vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
   hwaddr addr, uint8_t am)
 {
 VTDIOTLBPageInvInfo info;
+VFIOGuestIOMMU * giommu;
+bool flag = false;

 assert(am <= VTD_MAMV);
 info.domain_id = domain_id;
 info.addr = addr;
 info.mask = ~((1 << am) - 1);
+
+QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
+VTDAddressSpace *vtd_as = container_of(giommu->iommu,
VTDAddressSpace, iommu);
+uint16_t vfio_source_id =
vtd_make_source_id(pci_bus_num(vtd_as->bus), vtd_as->devfn);
+uint16_t vfio_domain_id = vtd_get_did_dev(s,
pci_bus_num(vtd_as->bus), vtd_as->devfn);
+if (vfio_domain_id != (uint16_t)-1 &&
+domain_id == vfio_domain_id){
+VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
vfio_source_id, addr);
+if (iotlb_entry != NULL){
+IOMMUTLBEntry entry;
+VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
addr, am);
+entry.iova = addr & VTD_PAGE_MASK_4K;
+entry.translated_addr =
vtd_get_slpte_addr(iotlb_entry->slpte) & VTD_PAGE_MASK_4K;
+entry.addr_mask = ~VTD_PAGE_MASK_4K;
+entry.perm = IOMMU_NONE;
+memory_region_notify_iommu(giommu->iommu, entry);
+flag = true;
+
+}
+}
+
+}
+
 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
-}

+QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
+VTDAddressSpace *vtd_as = container_of(giommu->iommu,
VTDAddressSpace, iommu);
+uint16_t vfio_domain_id = vtd_get_did_dev(s,
pci_bus_num(vtd_as->bus), vtd_as->devfn);
+if (vfio_domain_id != (uint16_t)-1 &&
+domain_id == vfio_domain_id && !flag){
+/* do vfio map */
+VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr,
am);
+/* call to vtd_iommu_translate */
+IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu,
addr, 0);
+entry.perm = IOMMU_RW;
+memory_region_

Re: [Qemu-devel] [PATCH][RFC] IOMMU: Add Support to VFIO devices with vIOMMU present

2016-03-18 Thread Aviv B.D.
On Tue, Mar 15, 2016 at 12:53 PM, Michael S. Tsirkin  wrote:

> On Sat, Mar 12, 2016 at 06:13:17PM +0200, Aviv B.D. wrote:
> > From: "Aviv B.D." 
> >
> >  * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
> > device are present.
> >  * Advertise Cache Mode capability in iommu cap register.
> >  * Register every VFIO device with IOMMU state.
> >  * On page cache invalidation in vIOMMU, check if the domain belong to
> >VFIO device and mirror the guest requests to host.
> >
> > Not working (Yet!):
> >  * Tested only with network interface card (ixgbevf) and
> > intel_iommu=strict in guest's kernel command line.
> >  * Lock up under high load.
> >  * Errors on guest poweroff.
> >  * High relative latency compare to VFIO without IOMMU.
> >
> > Signed-off-by: Aviv B.D. 
>
> Thanks, this is very interesting.
> So this needs some cleanup, and some issues that will have to be addressed.
> See below.
> Thanks!
>
> Thanks!

>
> > ---
> >  hw/i386/intel_iommu.c  | 76
> ++
> >  hw/i386/intel_iommu_internal.h |  1 +
> >  hw/vfio/common.c   | 12 +--
> >  include/hw/i386/intel_iommu.h  |  4 +++
> >  include/hw/vfio/vfio-common.h  |  1 +
> >  5 files changed, 85 insertions(+), 9 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 347718f..046688f 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -43,6 +44,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> VTD_DBGBIT
> > (CSR);
> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> >  #endif
> >
> > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> > +uint8_t devfn, VTDContextEntry *ce);
> > +
> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
> val,
> >  uint64_t wmask, uint64_t w1cmask)
> >  {
> > @@ -126,6 +130,19 @@ static uint32_t
> vtd_set_clear_mask_long(IntelIOMMUState
> > *s, hwaddr addr,
> >  return new_val;
> >  }
> >
> > +static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t
> > devfn)
> > +{
> > +VTDContextEntry ce;
> > +int ret_fr;
> > +
> > +ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > +if (ret_fr){
> > +return -1;
> > +}
> > +
> > +return VTD_CONTEXT_ENTRY_DID(ce.hi);
> > +}
> > +
> >  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
> >  uint64_t clear, uint64_t mask)
> >  {
> > @@ -711,9 +728,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState
> *s,
> > uint8_t bus_num,
> >  }
> >
> >  if (!vtd_context_entry_present(ce)) {
> > -VTD_DPRINTF(GENERAL,
> > +/*VTD_DPRINTF(GENERAL,
> >  "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > -"is not present", devfn, bus_num);
> > +"is not present", devfn, bus_num);*/
> >  return -VTD_FR_CONTEXT_ENTRY_P;
> >  } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> > (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > @@ -1020,14 +1037,53 @@ static void
> vtd_iotlb_page_invalidate(IntelIOMMUState
> > *s, uint16_t domain_id,
> >hwaddr addr, uint8_t am)
> >  {
> >  VTDIOTLBPageInvInfo info;
> > +VFIOGuestIOMMU * giommu;
> > +bool flag = false;
> >
> >  assert(am <= VTD_MAMV);
> >  info.domain_id = domain_id;
> >  info.addr = addr;
> >  info.mask = ~((1 << am) - 1);
> > +
> > +QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > +VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace,
> > iommu);
> > +uint16_t vfio_source_id =
> vtd_make_source_id(pci_bus_num(vtd_as->bus),
> > vtd_as->devfn);
> > +uint16_t vfio_domain_id = vtd_get_did_dev(s,
> pci_bus_num(vtd_as->bus),
> > vtd_as->devfn);
> > +if (vfio_domain_id != (uint16_t)-1 &&
> > +domain_id == vfio_domain_id){
> > +VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
> vfio_source_id,
> > addr);
> > +if (iotlb_entr

Re: [Qemu-devel] [PATCH][RFC] IOMMU: Add Support to VFIO devices with vIOMMU present

2016-03-19 Thread Aviv B.D.
On Tue, Mar 15, 2016 at 10:52 AM, Peter Xu  wrote:

> On Mon, Mar 14, 2016 at 08:52:33PM +0200, Marcel Apfelbaum wrote:
> > On 03/12/2016 06:13 PM, Aviv B.D. wrote:
> > Adding (possibly) interested developers to the thread.
>
> Thanks CC.
>
> Hi, Aviv, several questions inline.
>
> [...]
>
> > >@@ -1020,14 +1037,53 @@ static void
> vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
> > >hwaddr addr, uint8_t am)
> > >  {
> > >  VTDIOTLBPageInvInfo info;
> > >+VFIOGuestIOMMU * giommu;
> > >+bool flag = false;
> > >  assert(am <= VTD_MAMV);
> > >  info.domain_id = domain_id;
> > >  info.addr = addr;
> > >  info.mask = ~((1 << am) - 1);
> > >+
> > >+QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > >+VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> > >+uint16_t vfio_source_id =
> vtd_make_source_id(pci_bus_num(vtd_as->bus), vtd_as->devfn);
> > >+uint16_t vfio_domain_id = vtd_get_did_dev(s,
> pci_bus_num(vtd_as->bus), vtd_as->devfn);
> > >+if (vfio_domain_id != (uint16_t)-1 &&
>
> Could you (or anyone) help explain what does vfio_domain_id != -1
> mean?


vtd_get_did_dev returns -1 if the device is not mapped to any domain
(generally, the CE is not present).
Probably a better interface is to return whether the device has a domain or
not and to return the domain_id via the pointer argument.


>
> > >+domain_id == vfio_domain_id){
> > >+VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
> vfio_source_id, addr);
> > >+if (iotlb_entry != NULL){
>
> Here, shall we notify VFIO even if the address is not cached in
> IOTLB? Anyway, we need to do the unmap() of the address, am I
> correct?
>
With this code I do an unmap operation if the address was cached in the
IOTLB; if not, I'm assuming that the current invalidation invalidates a
(previously) non-present address and I should do a map operation (during
the map operation I'm calling s->iommu_ops.translate, which caches the
address).

>
> > >+IOMMUTLBEntry entry;
> > >+VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask
> %d", addr, am);
> > >+entry.iova = addr & VTD_PAGE_MASK_4K;
> > >+entry.translated_addr =
> vtd_get_slpte_addr(iotlb_entry->slpte) & VTD_PAGE_MASK_4K;
> > >+entry.addr_mask = ~VTD_PAGE_MASK_4K;
> > >+entry.perm = IOMMU_NONE;
> > >+memory_region_notify_iommu(giommu->iommu, entry);
> > >+flag = true;
> > >+
> > >+}
> > >+}
> > >+
> > >+}
> > >+
> > >  g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page,
> &info);
> > >-}
> > >+QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > >+VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> > >+uint16_t vfio_domain_id = vtd_get_did_dev(s,
> pci_bus_num(vtd_as->bus), vtd_as->devfn);
> > >+if (vfio_domain_id != (uint16_t)-1 &&
> > >+domain_id == vfio_domain_id && !flag){
> > >+/* do vfio map */
> > >+VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d",
> addr, am);
> > >+/* call to vtd_iommu_translate */
> > >+IOMMUTLBEntry entry =
> s->iommu_ops.translate(giommu->iommu, addr, 0);
> > >+entry.perm = IOMMU_RW;
> > >+memory_region_notify_iommu(giommu->iommu, entry);
> > >+//g_vfio_iommu->n.notify(&g_vfio_iommu->n, &entry);
> > >+}
> > >+}
> > >+}
>
> I see that we did handled all the page invalidations. Would it
> possible that guest kernel send domain/global invalidations? Should
> we handle them too?
>

You are correct, currently this code is pretty much at POC level, and I
support only the page invalidation because this is what Linux is using. The
final version should also support those invalidation ops.

>
> [...]
>
> > >  static void vfio_listener_region_add(MemoryListener *listener,
> > >   MemoryRegionSection *section)
> > >  {
> > >@@ -344,6 +347,7 @@ static void vfio_listener_region_add(MemoryListener
> *listener,
> > >  iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> > >  llend = int128_make64(section->offset_within_address_space);
> > >  llend = int128_add(llend, section->size);
> > >+llend = int128_add(llend, int128_exts64(-1));
>
> Here, -1 should fix the assertion core dump. However, shall we also
> handle all the rest places that used "llend" (possibly with variable
> "end") too? For example, at the end of current function, when we map
> dma regions:
>
> ret = vfio_dma_map(container, iova, end - iova, vaddr,
> section->readonly);
>
> To:
>
> ret = vfio_dma_map(container, iova, end + 1 - iova, vaddr,
> section->readonly);
>
> I will add this to the next version of the patch, thanks!

Thanks.
> Peter
>


Re: [Qemu-devel] [PATCH][RFC] IOMMU: Add Support to VFIO devices with vIOMMU present

2016-03-19 Thread Aviv B.D.
On Fri, Mar 18, 2016 at 5:06 AM, Peter Xu  wrote:

> On Thu, Mar 17, 2016 at 01:17:30PM +0200, Aviv B.D. wrote:
> [...]
> > vtd_get_did_dev returns -1 if the device is not mapped to any domain
> > (generally, the CE is not present).
> > probably a better interface is to return whether the device has a domain
> or
> > not and returns the domain_id via the pointer argument.
>
> Possibly, as long as guest kernel might be using (uint16_t)-1 as
> domain ID. ;)
>
> >
> >
> > >
> > > > >+domain_id == vfio_domain_id){
> > > > >+VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
> > > vfio_source_id, addr);
> > > > >+if (iotlb_entry != NULL){
> > >
> > > Here, shall we notify VFIO even if the address is not cached in
> > > IOTLB? Anyway, we need to do the unmap() of the address, am I
> > > correct?
> > >
> > With this code I do a unmap operation if the address was cached in the
> > IOTLB, if not I'm assuming that the current invalidation invalidate an
> > (previously) non present address and I should do a map operation (during
> > the map operation I'm calling s->iommu_ops.translate that caches the
> > address).
>
> I am not 100% sure of this, but... is this related to IOTLB at all?
> What I see is that, IOTLB is only a cache layer of IOMMU, and it is
> possible that we mapped some areas which are not in the IOTLB at
> all.
>
> Or, let's make an assumption here: what if I turn IOTLB off (or say,
> set hash size to zero)? IOMMU should still work, though slower,
> right?  However, due to above checking, we'll never do ummap() in
> this case (while IMHO we should).
>
Thanks.
>
> -- peterx
>


Hi,
As far as I understand the code, currently there is no way to turn off the
IOTLB.
Furthermore, the IOTLB is not implemented as an LRU, and actually caches
(indefinitely)
any accessed address, without any size constraints. I use those assumptions
to know
whether the current invalidation is for an unmap operation or a map operation.

But I need to check whether it is possible (for the guest kernel) to squeeze
together unmap and
map operations and issue only one invalidation for them (probably the
answer is yes,
and it may explain one of my bugs).

Aviv.


Re: [Qemu-devel] [PATCH][RFC] IOMMU: Add Support to VFIO devices with vIOMMU present

2016-03-22 Thread Aviv B.D.
On Mon, Mar 21, 2016 at 4:30 AM, Peter Xu  wrote:

> On Sat, Mar 19, 2016 at 11:40:04AM +0200, Aviv B.D. wrote:
> [...]
> > As far as I understand the code, currently there is no way to turn off
> the
> > IOTLB.
> > Furthermore. the IOTLB is not implemented as LRU, and actually caches
> > (indefinitely)
> > any accessed address, without any size constrains. I use those
> assumptions
> > to know
> > whether the current invalidation is for unmap operation or map operation.
>
> Please have a look at VTD_IOTLB_MAX_SIZE. It seems to be the size of
> the hash.
>
> Btw, I guess it's a much bigger problem if IOTLB has unlimited cache
> size...
>
> Thanks.

-- peterx
>

You are correct, VTD_IOTLB_MAX_SIZE limits the cache size (and resets the
whole cache
if this threshold is exceeded...). I will think of another mechanism to
identify the correct
action for each invalidation.

Thanks,
Aviv.


Re: [Qemu-devel] [PATCH][RFC] IOMMU: Add Support to VFIO devices with vIOMMU present

2016-03-26 Thread Aviv B.D.
On Wed, Mar 23, 2016 at 5:33 PM, Michael S. Tsirkin  wrote:

> On Sat, Mar 12, 2016 at 06:13:17PM +0200, Aviv B.D. wrote:
> > From: "Aviv B.D." 
> >
> >  * Fix bug that prevent qemu from starting up when vIOMMU and VFIO
> > device are present.
> >  * Advertise Cache Mode capability in iommu cap register.
> >  * Register every VFIO device with IOMMU state.
> >  * On page cache invalidation in vIOMMU, check if the domain belong to
> >VFIO device and mirror the guest requests to host.
> >
> > Not working (Yet!):
> >  * Tested only with network interface card (ixgbevf) and
> > intel_iommu=strict in guest's kernel command line.
> >  * Lock up under high load.
> >  * Errors on guest poweroff.
> >  * High relative latency compare to VFIO without IOMMU.
> >
> > Signed-off-by: Aviv B.D. 
> > ---
> >  hw/i386/intel_iommu.c  | 76
> ++
> >  hw/i386/intel_iommu_internal.h |  1 +
> >  hw/vfio/common.c   | 12 +--
> >  include/hw/i386/intel_iommu.h  |  4 +++
> >  include/hw/vfio/vfio-common.h  |  1 +
> >  5 files changed, 85 insertions(+), 9 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 347718f..046688f 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -43,6 +44,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> VTD_DBGBIT
> > (CSR);
> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> >  #endif
> >
> > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> > +uint8_t devfn, VTDContextEntry *ce);
> > +
> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
> val,
> >  uint64_t wmask, uint64_t w1cmask)
> >  {
> > @@ -126,6 +130,19 @@ static uint32_t
> vtd_set_clear_mask_long(IntelIOMMUState
> > *s, hwaddr addr,
> >  return new_val;
> >  }
> >
> > +static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t
> > devfn)
> > +{
> > +VTDContextEntry ce;
> > +int ret_fr;
> > +
> > +ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > +if (ret_fr){
> > +return -1;
> > +}
> > +
> > +return VTD_CONTEXT_ENTRY_DID(ce.hi);
> > +}
> > +
> >  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
> >  uint64_t clear, uint64_t mask)
> >  {
> > @@ -711,9 +728,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState
> *s,
> > uint8_t bus_num,
> >  }
> >
> >  if (!vtd_context_entry_present(ce)) {
> > -VTD_DPRINTF(GENERAL,
> > +/*VTD_DPRINTF(GENERAL,
> >  "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > -"is not present", devfn, bus_num);
> > +"is not present", devfn, bus_num);*/
> >  return -VTD_FR_CONTEXT_ENTRY_P;
> >  } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> > (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > @@ -1020,14 +1037,53 @@ static void
> vtd_iotlb_page_invalidate(IntelIOMMUState
> > *s, uint16_t domain_id,
> >hwaddr addr, uint8_t am)
> >  {
> >  VTDIOTLBPageInvInfo info;
> > +VFIOGuestIOMMU * giommu;
> > +bool flag = false;
> >
> >  assert(am <= VTD_MAMV);
> >  info.domain_id = domain_id;
> >  info.addr = addr;
> >  info.mask = ~((1 << am) - 1);
> > +
> > +QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > +VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace,
> > iommu);
> > +uint16_t vfio_source_id =
> vtd_make_source_id(pci_bus_num(vtd_as->bus),
> > vtd_as->devfn);
> > +uint16_t vfio_domain_id = vtd_get_did_dev(s,
> pci_bus_num(vtd_as->bus),
> > vtd_as->devfn);
> > +if (vfio_domain_id != (uint16_t)-1 &&
> > +domain_id == vfio_domain_id){
> > +VTDIOTLBEntry *iotlb_entry = vtd_lookup_iotlb(s,
> vfio_source_id,
> > addr);
> > +if (iotlb_entry != NULL){
> > +IOMMUTLBEntry entry;
> > +VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask
> %d", 

Re: [Qemu-devel] [PATCH v6 0/3] IOMMU: intel_iommu support map and unmap notifications

2016-11-21 Thread Aviv B.D.
On Wed, Nov 16, 2016 at 9:57 PM, Michael S. Tsirkin  wrote:

> On Wed, Nov 16, 2016 at 09:50:46PM +0200, Aviv B.D. wrote:
> >
> >
> > On Wed, Nov 16, 2016 at 5:34 PM, Alex Williamson <
> alex.william...@redhat.com>
> > wrote:
> >
> > On Wed, 16 Nov 2016 15:54:56 +0200
> > "Michael S. Tsirkin"  wrote:
> >
> > > On Thu, Nov 10, 2016 at 12:44:47PM -0700, Alex Williamson wrote:
> > > > On Thu, 10 Nov 2016 21:20:36 +0200
> > > > "Michael S. Tsirkin"  wrote:
> > > >
> > > > > On Thu, Nov 10, 2016 at 09:04:13AM -0700, Alex Williamson
> wrote:
> > > > > > On Thu, 10 Nov 2016 17:54:35 +0200
> > > > > > "Michael S. Tsirkin"  wrote:
> > > > > >
> > > > > > > On Thu, Nov 10, 2016 at 08:30:21AM -0700, Alex Williamson
> wrote:
> > > > > > > > On Thu, 10 Nov 2016 17:14:24 +0200
> > > > > > > > "Michael S. Tsirkin"  wrote:
> > > > > > > >
> > > > > > > > > On Tue, Nov 08, 2016 at 01:04:21PM +0200, Aviv B.D
> wrote:
> > > > > > > > > > From: "Aviv Ben-David" 
> > > > > > > > > >
> > > > > > > > > > * Advertize Cache Mode capability in iommu cap
> register.
> > > > > > > > > >   This capability is controlled by "cache-mode"
> property of
> > intel-iommu device.
> > > > > > > > > >   To enable this option call QEMU with "-device
> > intel-iommu,cache-mode=true".
> > > > > > > > > >
> > > > > > > > > > * On page cache invalidation in intel vIOMMU, check
> if the
> > domain belong to
> > > > > > > > > >   registered notifier, and notify accordingly.
> > > > > > > > >
> > > > > > > > > This looks sane I think. Alex, care to comment?
> > > > > > > > > Merging will have to wait until after the release.
> > > > > > > > > Pls remember to re-test and re-ping then.
> > > > > > > >
> > > > > > > > I don't think it's suitable for upstream until there's a
> > reasonable
> > > > > > > > replay mechanism
> > > > > > >
> > > > > > > Could you pls clarify what do you mean by replay?
> > > > > > > Is this when you attach a device by hotplug to
> > > > > > > a running system?
> > > > > > >
> > > > > > > If yes this can maybe be addressed by disabling hotplug
> > temporarily.
> > > > > >
> > > > > > No, hotplug is not required, moving a device between existing
> > domains
> > > > > > requires replay, ie. actually using it for nested device
> > assignment.
> > > > >
> > > > > Good point, that one is a correctness thing. Aviv,
> > > > > could you add this in TODO list in a cover letter pls?
> > > > >
> > > > > > > > and we straighten out whether it's expected to get
> > > > > > > > multiple notifies and the notif-ee is responsible for
> filtering
> > > > > > > > them or if the notif-er should do filtering.
> > > > > > >
> > > > > > > OK this is a documentation thing.
> > > > > >
> > > > > > Well no, it needs to be decided and if necessary implemented.
> > > > >
> > > > > Let's assume it's the notif-ee for now. Less is more and all
> that.
> > > >
> > > > I think this is opposite of the approach dwg suggested.
> > > >
> > > > > > > >  Without those, this is
> > > > > > > > effectively just an RFC.
> > > > > > >
> > > > > > > It's infrastructure without users so it doesn't break
> things,
> > > > > > > I'm more interested in seeing whether it's broken in
> > > > > > > some way than whether it's complete.

Re: [Qemu-devel] [PATCH v6 1/3] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2016-11-21 Thread Aviv B.D.
On Fri, Nov 11, 2016 at 6:15 AM, Jason Wang  wrote:

>
>
> On 2016年11月11日 11:39, Michael S. Tsirkin wrote:
>
>> On Fri, Nov 11, 2016 at 10:32:42AM +0800, Jason Wang wrote:
>>
>>>
>>> On 2016年11月10日 06:00, Michael S. Tsirkin wrote:
>>>
>>>> On Wed, Nov 09, 2016 at 03:28:02PM +0800, Jason Wang wrote:
>>>>
>>>>>
>>>>>> On 2016年11月08日 19:04, Aviv B.D wrote:
>>>>>>
>>>>>>> From: "Aviv Ben-David"
>>>>>>>>
>>>>>>>> This capability asks the guest to invalidate cache before each map
>>>>>>>> operation.
>>>>>>>> We can use this invalidation to trap map operations in the
>>>>>>>> hypervisor.
>>>>>>>>
>>>>>>> Hi:
>>>>>>
>>>>>> Like I've asked twice in the past, I want to know why don't you cache
>>>>>> translation faults as what spec required (especially this is a guest
>>>>>> visible
>>>>>> behavior)?
>>>>>>
>>>>>> Btw, please cc me on posting future versions.
>>>>>>
>>>>>> Thanks
>>>>>>
>>>>> Caching isn't guest visible.
>>>>
>>> Seems not, if one fault mapping were cached by IOTLB. Guest can notice
>>> this
>>> behavior.
>>>
>> Sorry, I don't get what you are saying.
>>
>> Spec just says you*can*  cache,
>>>> not that you must.
>>>>
>>>> Yes, but what did in this patch is "don't". What I suggest is just a
>>> "can",
>>> since anyway the IOTLB entries were limited and could be replaced by
>>> other.
>>>
>>> Thanks
>>>
>> Have trouble understanding this. Can you given an example of
>> a guest visible difference?
>>
>
> I guess this may do the detection:
>
> 1) map iova A to be non-present.
> 2) invalidate iova A
> 3) map iova A to addr B
> 4) access iova A
>
> A correct implemented CM may meet fault in step 4, but with this patch, we
> never.
>
>
By the specification (from Intel), section 6.1 (Caching Mode) says that this
bit may be present only on virtual machines. So from this point alone the
guest can detect that it is running in a VM of some sort. Additionally, the
wording of the specification allows the host to issue a fault in your
scenario, but does not require it.

Thanks,
Aviv.


Re: [Qemu-devel] [PATCH v6 1/3] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2016-11-22 Thread Aviv B.D.
On Tue, Nov 22, 2016 at 6:11 AM, Jason Wang  wrote:

>
>
> On 2016年11月21日 21:35, Michael S. Tsirkin wrote:
>
>> On Fri, Nov 11, 2016 at 12:15:48PM +0800, Jason Wang wrote:
>>
>>> >
>>> >
>>> >On 2016年11月11日 11:39, Michael S. Tsirkin wrote:
>>>
>>>> > >On Fri, Nov 11, 2016 at 10:32:42AM +0800, Jason Wang wrote:
>>>>
>>>>> > > >
>>>>> > > >On 2016年11月10日 06:00, Michael S. Tsirkin wrote:
>>>>>
>>>>>> > > > >On Wed, Nov 09, 2016 at 03:28:02PM +0800, Jason Wang wrote:
>>>>>>
>>>>>>> > > > > > >
>>>>>>>> > > > > > >On 2016年11月08日 19:04, Aviv B.D wrote:
>>>>>>>>
>>>>>>>>> > > > > > > > >From: "Aviv Ben-David"
>>>>>>>>>> > > > > > > > >
>>>>>>>>>> > > > > > > > >This capability asks the guest to invalidate cache
>>>>>>>>>> before each map operation.
>>>>>>>>>> > > > > > > > >We can use this invalidation to trap map
>>>>>>>>>> operations in the hypervisor.
>>>>>>>>>>
>>>>>>>>> > > > > > >Hi:
>>>>>>>> > > > > > >
>>>>>>>> > > > > > >Like I've asked twice in the past, I want to know why
>>>>>>>> don't you cache
>>>>>>>> > > > > > >translation faults as what spec required (especially
>>>>>>>> this is a guest visible
>>>>>>>> > > > > > >behavior)?
>>>>>>>> > > > > > >
>>>>>>>> > > > > > >Btw, please cc me on posting future versions.
>>>>>>>> > > > > > >
>>>>>>>> > > > > > >Thanks
>>>>>>>>
>>>>>>> > > > >Caching isn't guest visible.
>>>>>>
>>>>> > > >Seems not, if one fault mapping were cached by IOTLB. Guest can
>>>>> notice this
>>>>> > > >behavior.
>>>>>
>>>> > >Sorry, I don't get what you are saying.
>>>> > >
>>>>
>>>>> > > > >Spec just says you*can*  cache,
>>>>>> > > > >not that you must.
>>>>>> > > > >
>>>>>>
>>>>> > > >Yes, but what did in this patch is "don't". What I suggest is
>>>>> just a "can",
>>>>> > > >since anyway the IOTLB entries were limited and could be replaced
>>>>> by other.
>>>>> > > >
>>>>> > > >Thanks
>>>>>
>>>> > >Have trouble understanding this. Can you given an example of
>>>> > >a guest visible difference?
>>>>
>>> >
>>> >I guess this may do the detection:
>>> >
>>> >1) map iova A to be non-present.
>>> >2) invalidate iova A
>>> >3) map iova A to addr B
>>> >4) access iova A
>>> >
>>> >A correct implemented CM may meet fault in step 4, but with this patch,
>>> we
>>> >never.
>>>
>> I think that IOTLB is free to invalidate entries at any point,
>> so the fault is not guaranteed on bare metal.
>>
>
> Yes, that's the interesting point. The fault is not guaranteed but
> conditional. And we have similar issue for IEC.
>
> So in conclusion (since I can't find a hardware IOMMU that have CM set):
>
The spec says that there is no hardware IOMMU with the CM bit set. This bit
exists only to enable efficient emulation of an IOMMU.


> 1) If we don't cache fault conditions, we are still in question that
> whether it was spec compatible.
> 2) If we do cache fault conditions, we are 100% sure it was spec
> compatible.
>
Consider 2) is not complicated, we'd better do it I believe?
>
> "For implementations reporting Caching Mode (CM) as 1 in the Capability
Register, above conditions may cause caching of the entry that resulted in
the fault, and require explicit invalidation by software to invalidate such
cached entries." (
http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf,,
section 6.2.2, page 82). The key word is "may", therefore I do not think
that actually faulting the guest is necessary in this case.

However, if you insist on this part I will add the relevant code...

Thanks
>

Aviv.


[Qemu-devel] [PATCH v7 0/5] IOMMU: intel_iommu support map and unmap notifications

2016-11-28 Thread Aviv B.D
From: "Aviv Ben-David" 

* Advertise Cache Mode capability in iommu cap register. 
  This capability is controlled by "cache-mode" property of intel-iommu device.
  To enable this option call QEMU with "-device intel-iommu,cache-mode=true".

* On page cache invalidation in the intel vIOMMU, check if the domain belongs to
  a registered notifier, and notify accordingly.

Currently this patch still doesn't enabling VFIO devices support with vIOMMU 
present. Current problems:
* vfio_iommu_map_notify is not aware about memory range belong to specific 
  VFIOGuestIOMMU.
* intel_iommu's replay op is not implemented yet (May come in different patch 
  set).
  The replay function is required for hotplug vfio device and to move devices 
  between existing domains.

Changes from v1 to v2:
* remove assumption that the cache do not clears
* fix lockup on high load.

Changes from v2 to v3:
* remove debug leftovers
* split into separate commits
* change is_write to flags in vtd_do_iommu_translate, add IOMMU_NO_FAIL 
  to suppress error propagating to guest.

Changes from v3 to v4:
* Add property to intel_iommu device to control the CM capability, 
  default to False.
* Use s->iommu_ops.notify_flag_changed to register notifiers.

Changes from v4 to v4 RESEND:
* Fix coding style issues pointed out by the checkpatch.pl script.

Changes from v4 to v5:
* Reduce the number of changes in patch 2 and make flags real bitfield.
* Revert deleted debug prints.
* Fix memory leak in patch 3.

Changes from v5 to v6:
* fix prototype of iommu_translate function for more IOMMU types.
* VFIO will be notified only on the difference, without unmap 
  before change to maps.

Changes from v6 to v7:
* Add replay op to iommu_ops, with current behavior as default implementation
  (Patch 4).
* Add stub to disable VFIO with intel_iommu support (Patch 5).
* Cosmetic changes to other patches.

Aviv Ben-David (5):
  IOMMU: add option to enable VTD_CAP_CM to vIOMMU capability exposed to
guest
  IOMMU: change iommu_op->translate's is_write to flags, add support to
NO_FAIL flag mode
  IOMMU: enable intel_iommu map and unmap notifiers
  IOMMU: add specific replay function with default implemenation
  IOMMU: add specific null implementation of iommu_replay to intel_iommu

 exec.c |   3 +-
 hw/alpha/typhoon.c |   2 +-
 hw/i386/amd_iommu.c|   4 +-
 hw/i386/intel_iommu.c  | 165 ++---
 hw/i386/intel_iommu_internal.h |   3 +
 hw/pci-host/apb.c  |   2 +-
 hw/ppc/spapr_iommu.c   |   2 +-
 hw/s390x/s390-pci-bus.c|   2 +-
 include/exec/memory.h  |  10 ++-
 include/hw/i386/intel_iommu.h  |  11 +++
 memory.c   |  11 ++-
 11 files changed, 177 insertions(+), 38 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH v7 5/5] IOMMU: add specific null implementation of iommu_replay to intel_iommu

2016-11-28 Thread Aviv B.D
From: "Aviv Ben-David" 

Currently the implementation prevents VFIO from working together with
intel_iommu.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index d872969..0787714 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2453,6 +2453,12 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus, int devfn)
 return vtd_dev_as;
 }
 
+static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n,
+ bool is_write){
+error_report("VFIO use with intel_iommu is currently not supported.");
+exit(1);
+}
+
 /* Do the initialization. It will also be called when reset, so pay
  * attention when adding new initialization stuff.
  */
@@ -2467,6 +2473,7 @@ static void vtd_init(IntelIOMMUState *s)
 
 s->iommu_ops.translate = vtd_iommu_translate;
 s->iommu_ops.notify_flag_changed = vtd_iommu_notify_flag_changed;
+s->iommu_ops.replay = vtd_iommu_replay;
 s->root = 0;
 s->root_extended = false;
 s->dmar_enabled = false;
-- 
1.9.1




[Qemu-devel] [PATCH v7 1/5] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capability exposed to guest

2016-11-28 Thread Aviv B.D
From: "Aviv Ben-David" 

This capability asks the guest to invalidate cache before each map operation.
We can use this invalidation to trap map operations in the hypervisor.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 5 +
 hw/i386/intel_iommu_internal.h | 1 +
 include/hw/i386/intel_iommu.h  | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1b706ad..2cf07cd 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2017,6 +2017,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
 ON_OFF_AUTO_AUTO),
 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
+DEFINE_PROP_BOOL("cache-mode", IntelIOMMUState, cache_mode_enabled, FALSE),
 DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -2391,6 +2392,10 @@ static void vtd_init(IntelIOMMUState *s)
 assert(s->intr_eim != ON_OFF_AUTO_AUTO);
 }
 
+if (s->cache_mode_enabled) {
+s->cap |= VTD_CAP_CM;
+}
+
 vtd_reset_context_cache(s);
 vtd_reset_iotlb(s);
 
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 11abfa2..00cbe0d 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -201,6 +201,7 @@
 #define VTD_CAP_MAMV(VTD_MAMV << 48)
 #define VTD_CAP_PSI (1ULL << 39)
 #define VTD_CAP_SLLPS   ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_CM  (1ULL << 7)
 
 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT 8
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 405c9d1..749eef9 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -257,6 +257,8 @@ struct IntelIOMMUState {
 uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
 uint32_t version;
 
+bool cache_mode_enabled;/* RO - is cap CM enabled? */
+
 dma_addr_t root;/* Current root table pointer */
 bool root_extended; /* Type of root table (extended or not) */
 bool dmar_enabled;  /* Set if DMA remapping is enabled */
-- 
1.9.1




[Qemu-devel] [PATCH v7 4/5] IOMMU: add specific replay function with default implemenation

2016-11-28 Thread Aviv B.D
From: "Aviv Ben-David" 

The default implementation scans the address space and tries to find a
translation for each page, if one exists.

This callback enables an efficient implementation for intel_iommu and other
subsystems with a large address space.

Signed-off-by: Aviv Ben-David 
---
 include/exec/memory.h | 4 
 memory.c  | 8 
 2 files changed, 12 insertions(+)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 2d7ee54..a8b3701 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -180,6 +180,10 @@ struct MemoryRegionIOMMUOps {
 void (*notify_flag_changed)(MemoryRegion *iommu,
 IOMMUNotifierFlag old_flags,
 IOMMUNotifierFlag new_flags);
+/* Called when region has been moved between iommu domains */
+void (*replay)(MemoryRegion *mr,
+ IOMMUNotifier *n,
+ bool is_write);
 };
 
 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
diff --git a/memory.c b/memory.c
index 9b88638..562d540 100644
--- a/memory.c
+++ b/memory.c
@@ -1624,6 +1624,14 @@ void memory_region_iommu_replay(MemoryRegion *mr, 
IOMMUNotifier *n,
 hwaddr addr, granularity;
 IOMMUTLBEntry iotlb;
 
+/* If there is specific implementation, use it */
+assert(memory_region_is_iommu(mr));
+if (mr->iommu_ops && mr->iommu_ops->replay) {
+return mr->iommu_ops->replay(mr, n, is_write);
+}
+
+/* No specific implementation for this iommu, fall back to default
+ * implementation */
 granularity = memory_region_iommu_get_min_page_size(mr);
 
 for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
-- 
1.9.1




[Qemu-devel] [PATCH v7 2/5] IOMMU: change iommu_op->translate's is_write to flags, add support to NO_FAIL flag mode

2016-11-28 Thread Aviv B.D
From: "Aviv Ben-David" 

Supports translation trials without reporting error to guest on
translation failure.

Signed-off-by: Aviv Ben-David 
---
 exec.c  |  3 ++-
 hw/alpha/typhoon.c  |  2 +-
 hw/i386/amd_iommu.c |  4 ++--
 hw/i386/intel_iommu.c   | 59 +++--
 hw/pci-host/apb.c   |  2 +-
 hw/ppc/spapr_iommu.c|  2 +-
 hw/s390x/s390-pci-bus.c |  2 +-
 include/exec/memory.h   |  6 +++--
 memory.c|  3 ++-
 9 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/exec.c b/exec.c
index 3d867f1..126fb31 100644
--- a/exec.c
+++ b/exec.c
@@ -466,7 +466,8 @@ MemoryRegion *address_space_translate(AddressSpace *as, 
hwaddr addr,
 break;
 }
 
-iotlb = mr->iommu_ops->translate(mr, addr, is_write);
+iotlb = mr->iommu_ops->translate(mr, addr,
+ is_write ? IOMMU_WO : IOMMU_RO);
 addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 | (addr & iotlb.addr_mask));
 *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c
index 883db13..a2840ac 100644
--- a/hw/alpha/typhoon.c
+++ b/hw/alpha/typhoon.c
@@ -664,7 +664,7 @@ static bool window_translate(TyphoonWindow *win, hwaddr 
addr,
 /* TODO: A translation failure here ought to set PCI error codes on the
Pchip and generate a machine check interrupt.  */
 static IOMMUTLBEntry typhoon_translate_iommu(MemoryRegion *iommu, hwaddr addr,
- bool is_write)
+ IOMMUAccessFlags flags)
 {
 TyphoonPchip *pchip = container_of(iommu, TyphoonPchip, iommu);
 IOMMUTLBEntry ret;
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 47b79d9..1f0d76b 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -988,7 +988,7 @@ static inline bool amdvi_is_interrupt_addr(hwaddr addr)
 }
 
 static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr,
- bool is_write)
+ IOMMUAccessFlags flags)
 {
 AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
 AMDVIState *s = as->iommu_state;
@@ -1017,7 +1017,7 @@ static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, 
hwaddr addr,
 return ret;
 }
 
-amdvi_do_translate(as, addr, is_write, &ret);
+amdvi_do_translate(as, addr, flags & IOMMU_WO, &ret);
 trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
 PCI_FUNC(as->devfn), addr, ret.translated_addr);
 return ret;
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2cf07cd..05973b9 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -631,7 +631,8 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t 
level)
 /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
-static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
+static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
+IOMMUAccessFlags flags,
 uint64_t *slptep, uint32_t *slpte_level,
 bool *reads, bool *writes)
 {
@@ -640,7 +641,7 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 uint32_t offset;
 uint64_t slpte;
 uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
-uint64_t access_right_check;
+uint64_t access_right_check = 0;
 
 /* Check if @gpa is above 2^X-1, where X is the minimum of MGAW in CAP_REG
  * and AW in context-entry.
@@ -651,7 +652,15 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 }
 
 /* FIXME: what is the Atomics request here? */
-access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
+if (flags & IOMMU_WO) {
+access_right_check |= VTD_SL_W;
+}
+if (flags & IOMMU_RO) {
+access_right_check |= VTD_SL_R;
+}
+if (flags & IOMMU_NO_FAIL) {
+access_right_check |= VTD_SL_R | VTD_SL_W;
+}
 
 while (true) {
 offset = vtd_gpa_level_offset(gpa, level);
@@ -673,8 +682,8 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 if (!(slpte & access_right_check)) {
 VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
 "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
-(is_write ? "write" : "read"), gpa, slpte);
-return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
+(flags & IOMMU_WO ? "write" : "read"), gpa, slpte);
+return (flags & IOMMU_WO) ? -VTD_FR_WRITE : -VTD_FR_READ;
 }
 if (vtd_slpte_nonzero_rsvd(slpte, level)) {
 VTD_DPRINTF(GENERAL, "error: non-zero reserved fi

[Qemu-devel] [PATCH v7 3/5] IOMMU: enable intel_iommu map and unmap notifiers

2016-11-28 Thread Aviv B.D
From: "Aviv Ben-David" 

Adds a list of registered vtd_as's to the intel iommu state to avoid
iterating over each PCI device when searching for the corresponding domain.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 94 ++
 hw/i386/intel_iommu_internal.h |  2 +
 include/hw/i386/intel_iommu.h  |  9 
 3 files changed, 98 insertions(+), 7 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 05973b9..d872969 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -679,7 +679,7 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa,
 }
 *reads = (*reads) && (slpte & VTD_SL_R);
 *writes = (*writes) && (slpte & VTD_SL_W);
-if (!(slpte & access_right_check)) {
+if (!(slpte & access_right_check) && !(flags & IOMMU_NO_FAIL)) {
 VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
 "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
 (flags & IOMMU_WO ? "write" : "read"), gpa, slpte);
@@ -978,6 +978,23 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState 
*s, uint8_t bus_num)
 return vtd_bus;
 }
 
+static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn,
+   uint16_t *domain_id)
+{
+VTDContextEntry ce;
+int ret_fr;
+
+assert(domain_id);
+
+ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+if (ret_fr) {
+return -1;
+}
+
+*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
+return 0;
+}
+
 /* Do a context-cache device-selective invalidation.
  * @func_mask: FM field after shifting
  */
@@ -1064,6 +1081,45 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState 
*s, uint16_t domain_id)
 &domain_id);
 }
 
+static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
+   uint16_t domain_id, hwaddr addr,
+   uint8_t am)
+{
+IntelIOMMUNotifierNode *node;
+
+QLIST_FOREACH(node, &(s->notifiers_list), next) {
+VTDAddressSpace *vtd_as = node->vtd_as;
+uint16_t vfio_domain_id;
+int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn,
+  &vfio_domain_id);
+
+if (!ret && domain_id == vfio_domain_id) {
+hwaddr original_addr = addr;
+
+while (addr < original_addr + (1 << am) * VTD_PAGE_SIZE) {
+IOMMUTLBEntry entry = s->iommu_ops.translate(
+ &node->vtd_as->iommu,
+ addr,
+ IOMMU_NO_FAIL);
+
+if (entry.perm == IOMMU_NONE &&
+node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
+entry.target_as = &address_space_memory;
+entry.iova = addr & VTD_PAGE_MASK_4K;
+entry.translated_addr = 0;
+entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT);
+memory_region_notify_iommu(&node->vtd_as->iommu, entry);
+addr += VTD_PAGE_SIZE;
+} else if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
+memory_region_notify_iommu(&node->vtd_as->iommu, 
entry);
+addr += entry.addr_mask + 1;
+}
+}
+}
+}
+}
+
 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
   hwaddr addr, uint8_t am)
 {
@@ -1074,6 +1130,8 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, 
uint16_t domain_id,
 info.addr = addr;
 info.mask = ~((1 << am) - 1);
 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+
+vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
 }
 
 /* Flush IOTLB
@@ -1999,15 +2057,37 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion 
*iommu,
   IOMMUNotifierFlag new)
 {
 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
+IntelIOMMUState *s = vtd_as->iommu_state;
+IntelIOMMUNotifierNode *node = NULL;
+IntelIOMMUNotifierNode *next_node = NULL;
 
-if (new & IOMMU_NOTIFIER_MAP) {
-error_report("Device at bus %s addr %02x.%d requires iommu "
- "notifier which is currently not supported by "
- "intel-iommu emulation",
- vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn),
- PCI_FUNC(vtd_as->devfn));
+if (!s->cache_mode_enabled && new & IOMMU_NOTIFIER_MAP) {
+error_report("We need to set cache_mode=1 for intel-iommu to enable "
+ "device assignment with IOMMU protection.");
 exit(1);
 }
+
+/* Add new ndoe if no map

Re: [Qemu-devel] [PATCH v7 5/5] IOMMU: add specific null implementation of iommu_replay to intel_iommu

2016-11-28 Thread Aviv B.D.
On Mon, Nov 28, 2016 at 6:36 PM Alex Williamson 
wrote:

> On Mon, 28 Nov 2016 17:51:55 +0200
> "Aviv B.D"  wrote:
>
> > From: "Aviv Ben-David" 
> >
> > Currently the implementation preventing VFIO to work together with
> > intel_iommu.
> >
> > Signed-off-by: Aviv Ben-David 
> > ---
> >  hw/i386/intel_iommu.c | 7 +++
> >  1 file changed, 7 insertions(+)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index d872969..0787714 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -2453,6 +2453,12 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState
> *s, PCIBus *bus, int devfn)
> >  return vtd_dev_as;
> >  }
> >
> > +static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n,
> > + bool is_write){
> > +error_report("VFIO use with intel_iommu is currently not
> supported.");
>
> It doesn't make sense to call out vfio here, this is a deficiency of
> intel-iommu, not vfio.  vfio is just trying to use QEMU's iommu api.
> vfio is currently the only caller, but that could change and just adds
> maintenance effort to scrub this error message at a later time.  We
> could make the whole path support an error return to allow vfio to
> report the error, we could add a name field in the IOMMUNotifier, or we
> could just make the error more generic.  Thanks,
>
> Alex
>
> Sure, I'll change the message to something generic.

Aviv.


> > +exit(1);
> > +}
> > +
> >  /* Do the initialization. It will also be called when reset, so pay
> >   * attention when adding new initialization stuff.
> >   */
> > @@ -2467,6 +2473,7 @@ static void vtd_init(IntelIOMMUState *s)
> >
> >  s->iommu_ops.translate = vtd_iommu_translate;
> >  s->iommu_ops.notify_flag_changed = vtd_iommu_notify_flag_changed;
> > +s->iommu_ops.replay = vtd_iommu_replay;
> >  s->root = 0;
> >  s->root_extended = false;
> >  s->dmar_enabled = false;
>
>


Re: [Qemu-devel] [PATCH v7 3/5] IOMMU: enable intel_iommu map and unmap notifiers

2016-11-29 Thread Aviv B.D.
On Tue, Nov 29, 2016 at 5:23 AM 蓝天宇  wrote:

> 2016-11-28 23:51 GMT+08:00 Aviv B.D :
> > From: "Aviv Ben-David" 
> >
> > Adds a list of registered vtd_as's to intel iommu state to save
> > iteration over each PCI device in a search of the corrosponding domain.
> >
> > Signed-off-by: Aviv Ben-David 
> > ---
> >  hw/i386/intel_iommu.c  | 94
> ++
> >  hw/i386/intel_iommu_internal.h |  2 +
> >  include/hw/i386/intel_iommu.h  |  9 
> >  3 files changed, 98 insertions(+), 7 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 05973b9..d872969 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -679,7 +679,7 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
> uint64_t gpa,
> >  }
> >  *reads = (*reads) && (slpte & VTD_SL_R);
> >  *writes = (*writes) && (slpte & VTD_SL_W);
> > -if (!(slpte & access_right_check)) {
> > +if (!(slpte & access_right_check) && !(flags & IOMMU_NO_FAIL)) {
> >  VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
> >  "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
> >  (flags & IOMMU_WO ? "write" : "read"), gpa,
> slpte);
> > @@ -978,6 +978,23 @@ static VTDBus
> *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
> >  return vtd_bus;
> >  }
> >
> > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t
> devfn,
> > +   uint16_t *domain_id)
> > +{
> > +VTDContextEntry ce;
> > +int ret_fr;
> > +
> > +assert(domain_id);
> > +
> > +ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > +if (ret_fr) {
> > +return -1;
> > +}
> > +
> > +*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> > +return 0;
> > +}
> > +
> >  /* Do a context-cache device-selective invalidation.
> >   * @func_mask: FM field after shifting
> >   */
> > @@ -1064,6 +1081,45 @@ static void
> vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
> >  &domain_id);
> >  }
> >
> > +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
> > +   uint16_t domain_id, hwaddr
> addr,
> > +   uint8_t am)
> > +{
> > +IntelIOMMUNotifierNode *node;
> > +
> > +QLIST_FOREACH(node, &(s->notifiers_list), next) {
> > +VTDAddressSpace *vtd_as = node->vtd_as;
> > +uint16_t vfio_domain_id;
> > +int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn,
> > +  &vfio_domain_id);
> > +
> > +if (!ret && domain_id == vfio_domain_id) {
> > +hwaddr original_addr = addr;
> > +
> > +while (addr < original_addr + (1 << am) * VTD_PAGE_SIZE) {
> > +IOMMUTLBEntry entry = s->iommu_ops.translate(
> > +
>  &node->vtd_as->iommu,
> > + addr,
> > + IOMMU_NO_FAIL);
> > +
> > +if (entry.perm == IOMMU_NONE &&
> > +node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
> > +entry.target_as = &address_space_memory;
> > +entry.iova = addr & VTD_PAGE_MASK_4K;
> > +entry.translated_addr = 0;
> > +entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT);
> > +memory_region_notify_iommu(&node->vtd_as->iommu,
> entry);
> > +addr += VTD_PAGE_SIZE;
> > +} else if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
> > +
> memory_region_notify_iommu(&node->vtd_as->iommu, entry);
> > +addr += entry.addr_mask + 1;
> > +}
> > +}
> > +}
> > +}
> > +}
> > +
> >  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t
> domain_id,
> >hwaddr addr, uint8_t am)
> >  {
> > @@ -1074,6 +1130,8 @@ static void
> v

[Qemu-devel] [PATCH v4 0/3] IOMMU: intel_iommu support map and unmap notifications

2016-10-15 Thread Aviv B.D
From: "Aviv Ben-David" 

* Advertise Cache Mode capability in iommu cap register. 
  This capability is controlled by "cache-mode" property of intel-iommu device.
  To enable this option call QEMU with "-device intel-iommu,cache-mode=true".

* On page cache invalidation in the intel vIOMMU, check if the domain belongs to
  a registered notifier, and notify accordingly.

Currently this patch set still doesn't enable VFIO device support with a vIOMMU 
present. Current problems:
* vfio_iommu_map_notify is not aware of which memory ranges belong to a specific 
  VFIOGuestIOMMU.
* memory_region_iommu_replay hangs QEMU on startup while it iterates over the 
  64-bit address space. Commenting out the call to this function enables a 
  workable VFIO device while a vIOMMU is present.

Changes from v1 to v2:
* remove assumption that the cache do not clears
* fix lockup on high load.

Changes from v2 to v3:
* remove debug leftovers
* split into separate commits
* change is_write to flags in vtd_do_iommu_translate, add IOMMU_NO_FAIL 
  to suppress error propagating to guest.

Changes from v3 to v4:
* Add property to intel_iommu device to control the CM capability, 
  default to False.
* Use s->iommu_ops.notify_flag_changed to register notifiers.

Aviv Ben-David (3):
  IOMMU: add option to enable VTD_CAP_CM to vIOMMU capability exposed to
guest
  IOMMU: change iommu_op->translate's is_write to flags, add support to
NO_FAIL flag mode
  IOMMU: enable intel_iommu map and unmap notifiers

 exec.c |   2 +-
 hw/i386/amd_iommu.c|   4 +-
 hw/i386/intel_iommu.c  | 170 +
 hw/i386/intel_iommu_internal.h |   3 +
 include/exec/memory.h  |   4 +-
 include/hw/i386/intel_iommu.h  |  10 +++
 memory.c   |   2 +-
 7 files changed, 159 insertions(+), 36 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH v4 3/3] IOMMU: enable intel_iommu map and unmap notifiers

2016-10-15 Thread Aviv B.D
From: "Aviv Ben-David" 

* register notifiers when vtd_iommu_notify_flag_changed is called.
* Notify on IOTLB page invalidation about unmap and map operations.

Adds a list of registered vtd_as's to the intel iommu state to avoid
iterating over each PCI device when searching for the corresponding domain.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 102 ++---
 hw/i386/intel_iommu_internal.h |   2 +
 include/hw/i386/intel_iommu.h  |   8 
 3 files changed, 105 insertions(+), 7 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 9ffa45f..0560e3c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -51,6 +51,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | 
VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif
 
+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
 uint64_t wmask, uint64_t w1cmask)
 {
@@ -142,6 +145,23 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState 
*s, hwaddr addr,
 return new_val;
 }
 
+static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn, 
+   uint16_t * domain_id)
+{
+VTDContextEntry ce;
+int ret_fr;
+
+assert(domain_id);
+
+ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+if (ret_fr){
+return -1;
+}
+
+*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
+return 0;
+}
+
 /* GHashTable functions */
 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
 {
@@ -682,9 +702,6 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, IOMMUAccessFlags
 *reads = (*reads) && (slpte & VTD_SL_R);
 *writes = (*writes) && (slpte & VTD_SL_W);
 if (!(slpte & access_right_check)) {
-VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
-"gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
-(flags == IOMMU_WO ? "write" : "read"), gpa, slpte);
 return (flags == IOMMU_RW || flags == IOMMU_WO) ? -VTD_FR_WRITE : 
-VTD_FR_READ;
 }
 if (vtd_slpte_nonzero_rsvd(slpte, level)) {
@@ -732,9 +749,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, 
uint8_t bus_num,
 }
 
 if (!vtd_context_entry_present(ce)) {
-VTD_DPRINTF(GENERAL,
-"error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
-"is not present", devfn, bus_num);
 return -VTD_FR_CONTEXT_ENTRY_P;
 } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
(ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
@@ -1062,6 +1076,55 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState 
*s, uint16_t domain_id)
 &domain_id);
 }
 
+static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, 
+   uint16_t domain_id, hwaddr addr, 
+   uint8_t am)
+{
+IntelIOMMUNotifierNode * node;
+
+QLIST_FOREACH(node, &(s->notifiers_list), next){
+VTDAddressSpace *vtd_as = node->vtd_as; 
+uint16_t vfio_domain_id; 
+int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, 
+  &vfio_domain_id);
+//int i=0;
+if (!ret && domain_id == vfio_domain_id){
+IOMMUTLBEntry entry; 
+
+/* notify unmap */
+if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP){
+VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, 
am);
+entry.target_as = &address_space_memory;
+entry.iova = addr & VTD_PAGE_MASK_4K;
+entry.translated_addr = 0;
+entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
+entry.perm = IOMMU_NONE;
+memory_region_notify_iommu(&node->vtd_as->iommu, entry);
+}
+   
+/* notify map */
+if (node->notifier_flag & IOMMU_NOTIFIER_MAP){
+hwaddr original_addr = addr;
+VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, 
am);
+while (addr < original_addr + (1 << am) * VTD_PAGE_SIZE) {
+/* call to vtd_iommu_translate */
+IOMMUTLBEntry entry = s->iommu_ops.translate(&node->vtd_as->iommu,
+addr,
+IOMMU_NO_FAIL);
+if (entry.perm != IOMMU_NONE){
+addr += entry.addr_mask + 1; 
+memory_region_notify_iommu(&node->vtd_as->iommu, 
entry);
+} else {
+addr += VTD_PAGE_SIZE;
+}
+}
+}

[Qemu-devel] [PATCH v4 1/3] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capability exposed to guest

2016-10-15 Thread Aviv B.D
From: "Aviv Ben-David" 

This capability asks the guest to invalidate cache before each map operation.
We can use this invalidation to trap map operations in the hypervisor.

This capability is only enabled when the cache-mode property of the device is true.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 5 +
 hw/i386/intel_iommu_internal.h | 1 +
 include/hw/i386/intel_iommu.h  | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2efd69b..6e5ad7b 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2012,6 +2012,7 @@ static const MemoryRegionOps vtd_mem_ops = {
 
 static Property vtd_properties[] = {
 DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
+DEFINE_PROP_BOOL("cache-mode", IntelIOMMUState, cache_mode_enabled, FALSE),
 DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -2385,6 +2386,10 @@ static void vtd_init(IntelIOMMUState *s)
 s->ecap |= VTD_ECAP_IR | VTD_ECAP_EIM | VTD_ECAP_MHMV;
 }
 
+if (s->cache_mode_enabled){
+s->cap |= VTD_CAP_CM;
+}
+
 vtd_reset_context_cache(s);
 vtd_reset_iotlb(s);
 
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 0829a50..35d9f3a 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -201,6 +201,7 @@
 #define VTD_CAP_MAMV(VTD_MAMV << 48)
 #define VTD_CAP_PSI (1ULL << 39)
 #define VTD_CAP_SLLPS   ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_CM  (1ULL << 7)
 
 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT 8
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index a42dbd7..7a94f16 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -258,6 +258,8 @@ struct IntelIOMMUState {
 uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
 uint32_t version;
 
+bool cache_mode_enabled;/* RO - is cap CM enabled? */
+
 dma_addr_t root;/* Current root table pointer */
 bool root_extended; /* Type of root table (extended or not) */
 bool dmar_enabled;  /* Set if DMA remapping is enabled */
-- 
1.9.1




[Qemu-devel] [PATCH v4 2/3] IOMMU: change iommu_op->translate's is_write to flags, add support to NO_FAIL flag mode

2016-10-15 Thread Aviv B.D
From: "Aviv Ben-David" 

Supports translation trials without reporting error to guest on
translation failure.

Signed-off-by: Aviv Ben-David 
---
 exec.c|  2 +-
 hw/i386/amd_iommu.c   |  4 ++--
 hw/i386/intel_iommu.c | 65 ---
 include/exec/memory.h |  4 ++--
 memory.c  |  2 +-
 5 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/exec.c b/exec.c
index 374c364..1b16fe4 100644
--- a/exec.c
+++ b/exec.c
@@ -432,7 +432,7 @@ MemoryRegion *address_space_translate(AddressSpace *as, 
hwaddr addr,
 break;
 }
 
-iotlb = mr->iommu_ops->translate(mr, addr, is_write);
+iotlb = mr->iommu_ops->translate(mr, addr, is_write ? IOMMU_WO : 
IOMMU_RO);
 addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 | (addr & iotlb.addr_mask));
 *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 47b79d9..1f0d76b 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -988,7 +988,7 @@ static inline bool amdvi_is_interrupt_addr(hwaddr addr)
 }
 
 static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr,
- bool is_write)
+ IOMMUAccessFlags flags)
 {
 AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
 AMDVIState *s = as->iommu_state;
@@ -1017,7 +1017,7 @@ static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, 
hwaddr addr,
 return ret;
 }
 
-amdvi_do_translate(as, addr, is_write, &ret);
+amdvi_do_translate(as, addr, flags & IOMMU_WO, &ret);
 trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
 PCI_FUNC(as->devfn), addr, ret.translated_addr);
 return ret;
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 6e5ad7b..9ffa45f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -364,7 +364,7 @@ static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, 
uint16_t index)
 /* Must not update F field now, should be done later */
 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
 uint16_t source_id, hwaddr addr,
-VTDFaultReason fault, bool is_write)
+VTDFaultReason fault, IOMMUAccessFlags flags)
 {
 uint64_t hi = 0, lo;
 hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
@@ -373,7 +373,7 @@ static void vtd_record_frcd(IntelIOMMUState *s, uint16_t 
index,
 
 lo = VTD_FRCD_FI(addr);
 hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
-if (!is_write) {
+if (!(flags == IOMMU_WO || flags == IOMMU_RW)) {
 hi |= VTD_FRCD_T;
 }
 vtd_set_quad_raw(s, frcd_reg_addr, lo);
@@ -404,7 +404,7 @@ static bool vtd_try_collapse_fault(IntelIOMMUState *s, 
uint16_t source_id)
 /* Log and report an DMAR (address translation) fault to software */
 static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
   hwaddr addr, VTDFaultReason fault,
-  bool is_write)
+  IOMMUAccessFlags flags)
 {
 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
 
@@ -415,7 +415,7 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, 
uint16_t source_id,
 return;
 }
 VTD_DPRINTF(FLOG, "sid 0x%"PRIx16 ", fault %d, addr 0x%"PRIx64
-", is_write %d", source_id, fault, addr, is_write);
+", flags %d", source_id, fault, addr, flags);
 if (fsts_reg & VTD_FSTS_PFO) {
 VTD_DPRINTF(FLOG, "new fault is not recorded due to "
 "Primary Fault Overflow");
@@ -433,7 +433,7 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, 
uint16_t source_id,
 return;
 }
 
-vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
+vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, flags);
 
 if (fsts_reg & VTD_FSTS_PPF) {
 VTD_DPRINTF(FLOG, "there are pending faults already, "
@@ -629,7 +629,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t 
level)
 /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
-static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
+static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, 
IOMMUAccessFlags flags,
 uint64_t *slptep, uint32_t *slpte_level,
 bool *reads, bool *writes)
 {
@@ -649,7 +649,20 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 }
 
 /* FIXME: what is the Atomics request here? */
-access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
+switch(flags){
+case IOMMU_WO:
+access_right_ch

[Qemu-devel] [PATCH v4 RESEND 3/3] IOMMU: enable intel_iommu map and unmap notifiers

2016-10-17 Thread Aviv B.D
From: "Aviv Ben-David" 

Adds a list of registered vtd_as's to intel iommu state to save
iteration over each PCI device in a search of the corresponding domain.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 102 ++---
 hw/i386/intel_iommu_internal.h |   2 +
 include/hw/i386/intel_iommu.h  |   9 
 3 files changed, 106 insertions(+), 7 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index dcf45f0..34fc1e8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -51,6 +51,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | 
VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif
 
+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
 uint64_t wmask, uint64_t w1cmask)
 {
@@ -142,6 +145,23 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState 
*s, hwaddr addr,
 return new_val;
 }
 
+static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn,
+   uint16_t *domain_id)
+{
+VTDContextEntry ce;
+int ret_fr;
+
+assert(domain_id);
+
+ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+if (ret_fr) {
+return -1;
+}
+
+*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
+return 0;
+}
+
 /* GHashTable functions */
 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
 {
@@ -683,9 +703,6 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa,
 *reads = (*reads) && (slpte & VTD_SL_R);
 *writes = (*writes) && (slpte & VTD_SL_W);
 if (!(slpte & access_right_check)) {
-VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
-"gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
-(flags == IOMMU_WO ? "write" : "read"), gpa, slpte);
 return (flags == IOMMU_RW || flags == IOMMU_WO) ?
-VTD_FR_WRITE : -VTD_FR_READ;
 }
@@ -734,9 +751,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, 
uint8_t bus_num,
 }
 
 if (!vtd_context_entry_present(ce)) {
-VTD_DPRINTF(GENERAL,
-"error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
-"is not present", devfn, bus_num);
 return -VTD_FR_CONTEXT_ENTRY_P;
 } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
(ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
@@ -1065,6 +1079,55 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState 
*s, uint16_t domain_id)
 &domain_id);
 }
 
+static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
+   uint16_t domain_id, hwaddr addr,
+   uint8_t am)
+{
+IntelIOMMUNotifierNode *node;
+
+QLIST_FOREACH(node, &(s->notifiers_list), next) {
+VTDAddressSpace *vtd_as = node->vtd_as;
+uint16_t vfio_domain_id;
+int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn,
+  &vfio_domain_id);
+if (!ret && domain_id == vfio_domain_id) {
+IOMMUTLBEntry entry;
+
+/* notify unmap */
+if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
+VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
+addr, am);
+entry.target_as = &address_space_memory;
+entry.iova = addr & VTD_PAGE_MASK_4K;
+entry.translated_addr = 0;
+entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
+entry.perm = IOMMU_NONE;
+memory_region_notify_iommu(&node->vtd_as->iommu, entry);
+}
+
+/* notify map */
+if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
+hwaddr original_addr = addr;
+VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, 
am);
+while (addr < original_addr + (1 << am) * VTD_PAGE_SIZE) {
+/* call to vtd_iommu_translate */
+IOMMUTLBEntry entry = s->iommu_ops.translate(
+ &node->vtd_as->iommu,
+ addr,
+ IOMMU_NO_FAIL);
+if (entry.perm != IOMMU_NONE) {
+addr += entry.addr_mask + 1;
+memory_region_notify_iommu(&node->vtd_as->iommu, 
entry);
+} else {
+addr += VTD_PAGE_SIZE;
+}
+}
+}
+}
+}
+}
+
+
 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint

[Qemu-devel] [PATCH v4 RESEND 1/3] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capability exposed to guest

2016-10-17 Thread Aviv B.D
From: "Aviv Ben-David" 

This capability asks the guest to invalidate cache before each map operation.
We can use this invalidation to trap map operations in the hypervisor.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 5 +
 hw/i386/intel_iommu_internal.h | 1 +
 include/hw/i386/intel_iommu.h  | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2efd69b..69730cb 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2012,6 +2012,7 @@ static const MemoryRegionOps vtd_mem_ops = {
 
 static Property vtd_properties[] = {
 DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
+DEFINE_PROP_BOOL("cache-mode", IntelIOMMUState, cache_mode_enabled, FALSE),
 DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -2385,6 +2386,10 @@ static void vtd_init(IntelIOMMUState *s)
 s->ecap |= VTD_ECAP_IR | VTD_ECAP_EIM | VTD_ECAP_MHMV;
 }
 
+if (s->cache_mode_enabled) {
+s->cap |= VTD_CAP_CM;
+}
+
 vtd_reset_context_cache(s);
 vtd_reset_iotlb(s);
 
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 0829a50..35d9f3a 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -201,6 +201,7 @@
 #define VTD_CAP_MAMV(VTD_MAMV << 48)
 #define VTD_CAP_PSI (1ULL << 39)
 #define VTD_CAP_SLLPS   ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_CM  (1ULL << 7)
 
 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT 8
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index a42dbd7..7a94f16 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -258,6 +258,8 @@ struct IntelIOMMUState {
 uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
 uint32_t version;
 
+bool cache_mode_enabled;/* RO - is cap CM enabled? */
+
 dma_addr_t root;/* Current root table pointer */
 bool root_extended; /* Type of root table (extended or not) */
 bool dmar_enabled;  /* Set if DMA remapping is enabled */
-- 
1.9.1




[Qemu-devel] [PATCH v4 RESEND 2/3] IOMMU: change iommu_op->translate's is_write to flags, add support to NO_FAIL flag mode

2016-10-17 Thread Aviv B.D
From: "Aviv Ben-David" 

Supports translation trials without reporting error to guest on
translation failure.

Signed-off-by: Aviv Ben-David 
---
 exec.c|  3 ++-
 hw/i386/amd_iommu.c   |  4 +--
 hw/i386/intel_iommu.c | 70 +--
 include/exec/memory.h |  6 +++--
 memory.c  |  3 ++-
 5 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/exec.c b/exec.c
index 374c364..266fa01 100644
--- a/exec.c
+++ b/exec.c
@@ -432,7 +432,8 @@ MemoryRegion *address_space_translate(AddressSpace *as, 
hwaddr addr,
 break;
 }
 
-iotlb = mr->iommu_ops->translate(mr, addr, is_write);
+iotlb = mr->iommu_ops->translate(mr, addr,
+ is_write ? IOMMU_WO : IOMMU_RO);
 addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 | (addr & iotlb.addr_mask));
 *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 47b79d9..1f0d76b 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -988,7 +988,7 @@ static inline bool amdvi_is_interrupt_addr(hwaddr addr)
 }
 
 static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr,
- bool is_write)
+ IOMMUAccessFlags flags)
 {
 AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
 AMDVIState *s = as->iommu_state;
@@ -1017,7 +1017,7 @@ static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, 
hwaddr addr,
 return ret;
 }
 
-amdvi_do_translate(as, addr, is_write, &ret);
+amdvi_do_translate(as, addr, flags & IOMMU_WO, &ret);
 trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
 PCI_FUNC(as->devfn), addr, ret.translated_addr);
 return ret;
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 69730cb..dcf45f0 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -364,7 +364,7 @@ static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, 
uint16_t index)
 /* Must not update F field now, should be done later */
 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
 uint16_t source_id, hwaddr addr,
-VTDFaultReason fault, bool is_write)
+VTDFaultReason fault, IOMMUAccessFlags flags)
 {
 uint64_t hi = 0, lo;
 hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
@@ -373,7 +373,7 @@ static void vtd_record_frcd(IntelIOMMUState *s, uint16_t 
index,
 
 lo = VTD_FRCD_FI(addr);
 hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
-if (!is_write) {
+if (!(flags == IOMMU_WO || flags == IOMMU_RW)) {
 hi |= VTD_FRCD_T;
 }
 vtd_set_quad_raw(s, frcd_reg_addr, lo);
@@ -404,7 +404,7 @@ static bool vtd_try_collapse_fault(IntelIOMMUState *s, 
uint16_t source_id)
 /* Log and report an DMAR (address translation) fault to software */
 static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
   hwaddr addr, VTDFaultReason fault,
-  bool is_write)
+  IOMMUAccessFlags flags)
 {
 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
 
@@ -415,7 +415,7 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, 
uint16_t source_id,
 return;
 }
 VTD_DPRINTF(FLOG, "sid 0x%"PRIx16 ", fault %d, addr 0x%"PRIx64
-", is_write %d", source_id, fault, addr, is_write);
+", flags %d", source_id, fault, addr, flags);
 if (fsts_reg & VTD_FSTS_PFO) {
 VTD_DPRINTF(FLOG, "new fault is not recorded due to "
 "Primary Fault Overflow");
@@ -433,7 +433,7 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, 
uint16_t source_id,
 return;
 }
 
-vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
+vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, flags);
 
 if (fsts_reg & VTD_FSTS_PPF) {
 VTD_DPRINTF(FLOG, "there are pending faults already, "
@@ -629,7 +629,8 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t 
level)
 /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
-static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
+static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
+IOMMUAccessFlags flags,
 uint64_t *slptep, uint32_t *slpte_level,
 bool *reads, bool *writes)
 {
@@ -649,7 +650,20 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 }
 
 /* FIXME: what is the Atomics request here? */
-access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
+

[Qemu-devel] [PATCH v4 RESEND 0/3] IOMMU: intel_iommu support map and unmap notifications

2016-10-17 Thread Aviv B.D
From: "Aviv Ben-David" 

* Advertise Cache Mode capability in iommu cap register. 
  This capability is controlled by "cache-mode" property of intel-iommu device.
  To enable this option call QEMU with "-device intel-iommu,cache-mode=true".

* On page cache invalidation in intel vIOMMU, check if the domain belongs to
  a registered notifier, and notify accordingly.

Currently this patch still doesn't enable VFIO device support with vIOMMU 
present. Current problems:
* vfio_iommu_map_notify is not aware of which memory range belongs to a 
  specific VFIOGuestIOMMU.
* memory_region_iommu_replay hangs QEMU on startup while it iterates over 
  the 64bit address space. Commenting out the call to this function enables 
  a workable VFIO device while vIOMMU is present.

Changes from v1 to v2:
* remove assumption that the cache do not clears
* fix lockup on high load.

Changes from v2 to v3:
* remove debug leftovers
* split into separate commits
* change is_write to flags in vtd_do_iommu_translate, add IOMMU_NO_FAIL 
  to suppress error propagating to guest.

Changes from v3 to v4:
* Add property to intel_iommu device to control the CM capability, 
  default to False.
* Use s->iommu_ops.notify_flag_changed to register notifiers.

Changes from v4 to v4 RESEND:
* Fix coding style issues pointed out by the checkpatch.pl script.

Aviv Ben-David (3):
  IOMMU: add option to enable VTD_CAP_CM to vIOMMU capability exposed to
guest
  IOMMU: change iommu_op->translate's is_write to flags, add support to
NO_FAIL flag mode
  IOMMU: enable intel_iommu map and unmap notifiers

 exec.c |   3 +-
 hw/i386/amd_iommu.c|   4 +-
 hw/i386/intel_iommu.c  | 175 +
 hw/i386/intel_iommu_internal.h |   3 +
 include/exec/memory.h  |   6 +-
 include/hw/i386/intel_iommu.h  |  11 +++
 memory.c   |   3 +-
 7 files changed, 168 insertions(+), 37 deletions(-)

-- 
1.9.1




Re: [Qemu-devel] [PATCH v4 RESEND 2/3] IOMMU: change iommu_op->translate's is_write to flags, add support to NO_FAIL flag mode

2016-10-20 Thread Aviv B.D.
Hi,
You are right, and I will revert those functions you mentioned.

Thanks,
Aviv.



On Wed, Oct 19, 2016 at 11:35 AM, Peter Xu  wrote:

> On Mon, Oct 17, 2016 at 06:44:23PM +0300, Aviv B.D wrote:
>
> [...]
>
> > @@ -364,7 +364,7 @@ static void vtd_set_frcd_and_update_ppf(IntelIOMMUState
> *s, uint16_t index)
> >  /* Must not update F field now, should be done later */
> >  static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
> >  uint16_t source_id, hwaddr addr,
> > -VTDFaultReason fault, bool is_write)
> > +VTDFaultReason fault, IOMMUAccessFlags
> flags)
>
> I think we don't need to change this is_write into flags. We can just
> make sure we translate the flags into is_write when calling
> vtd_record_frcd. After all, NO_FAIL makes no sense in this function.
>
> >  {
> >  uint64_t hi = 0, lo;
> >  hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) <<
> 4);
> > @@ -373,7 +373,7 @@ static void vtd_record_frcd(IntelIOMMUState *s,
> uint16_t index,
> >
> >  lo = VTD_FRCD_FI(addr);
> >  hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
> > -if (!is_write) {
> > +if (!(flags == IOMMU_WO || flags == IOMMU_RW)) {
> >  hi |= VTD_FRCD_T;
> >  }
> >  vtd_set_quad_raw(s, frcd_reg_addr, lo);
> > @@ -404,7 +404,7 @@ static bool vtd_try_collapse_fault(IntelIOMMUState
> *s, uint16_t source_id)
> >  /* Log and report an DMAR (address translation) fault to software */
> >  static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t
> source_id,
> >hwaddr addr, VTDFaultReason fault,
> > -  bool is_write)
> > +  IOMMUAccessFlags flags)
>
> Here as well. IMHO we can just keep the old is_write, and tune the
> callers.
>
> >  {
> >  uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
> >
> > @@ -415,7 +415,7 @@ static void vtd_report_dmar_fault(IntelIOMMUState
> *s, uint16_t source_id,
> >  return;
> >  }
> >  VTD_DPRINTF(FLOG, "sid 0x%"PRIx16 ", fault %d, addr 0x%"PRIx64
> > -", is_write %d", source_id, fault, addr, is_write);
> > +", flags %d", source_id, fault, addr, flags);
> >  if (fsts_reg & VTD_FSTS_PFO) {
> >  VTD_DPRINTF(FLOG, "new fault is not recorded due to "
> >  "Primary Fault Overflow");
> > @@ -433,7 +433,7 @@ static void vtd_report_dmar_fault(IntelIOMMUState
> *s, uint16_t source_id,
> >  return;
> >  }
> >
> > -vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault,
> is_write);
> > +vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, flags);
>
> ... so if you agree on my previous comments, here we can use:
>
>vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault,
>flags & IOMMU_WO);
>
> and keep the vtd_record_frcd() untouched.
>
> [...]
>
> > @@ -789,11 +804,12 @@ static inline bool vtd_is_interrupt_addr(hwaddr
> addr)
> >   *
> >   * @bus_num: The bus number
> >   * @devfn: The devfn, which is the  combined of device and function
> number
> > - * @is_write: The access is a write operation
> > + * @flags: The access is a write operation
>
> Need to fix comment to suite the new flag.
>
> >   * @entry: IOMMUTLBEntry that contain the addr to be translated and
> result
> >   */
> >  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus
> *bus,
> > -   uint8_t devfn, hwaddr addr, bool
> is_write,
> > +   uint8_t devfn, hwaddr addr,
> > +   IOMMUAccessFlags flags,
> > IOMMUTLBEntry *entry)
> >  {
> >  IntelIOMMUState *s = vtd_as->iommu_state;
> > @@ -811,7 +827,7 @@ static void vtd_do_iommu_translate(VTDAddressSpace
> *vtd_as, PCIBus *bus,
> >
> >  /* Check if the request is in interrupt address range */
> >  if (vtd_is_interrupt_addr(addr)) {
> > -if (is_write) {
> > +if (flags == IOMMU_WO || flags == IOMMU_RW) {
>
> I suggest we directly use (flags & IOMMU_WO) in all similar places.
>
> >  /* FIXME: since we don't know the length of the access
> here, we
> >   * treat Non-DWORD length write reques

Re: [Qemu-devel] [PATCH v4 RESEND 3/3] IOMMU: enable intel_iommu map and unmap notifiers

2016-10-20 Thread Aviv B.D.
I will fix all those issues.
Thanks,
Aviv.

On Thu, Oct 20, 2016 at 10:11 PM, Aviv B.D.  wrote:

>
>
> On Wed, Oct 19, 2016 at 12:33 PM, Peter Xu  wrote:
>
>> On Mon, Oct 17, 2016 at 06:44:24PM +0300, Aviv B.D wrote:
>> > From: "Aviv Ben-David" 
>> >
>> > Adds a list of registered vtd_as's to intel iommu state to save
>> > iteration over each PCI device in a search of the corrosponding domain.
>> >
>> > Signed-off-by: Aviv Ben-David 
>> > ---
>> >  hw/i386/intel_iommu.c  | 102 ++
>> ---
>> >  hw/i386/intel_iommu_internal.h |   2 +
>> >  include/hw/i386/intel_iommu.h  |   9 
>> >  3 files changed, 106 insertions(+), 7 deletions(-)
>> >
>> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> > index dcf45f0..34fc1e8 100644
>> > --- a/hw/i386/intel_iommu.c
>> > +++ b/hw/i386/intel_iommu.c
>> > @@ -51,6 +51,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
>> VTD_DBGBIT(CSR);
>> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>> >  #endif
>> >
>> > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t
>> bus_num,
>> > +uint8_t devfn, VTDContextEntry
>> *ce);
>> > +
>> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
>> val,
>> >  uint64_t wmask, uint64_t w1cmask)
>> >  {
>> > @@ -142,6 +145,23 @@ static uint64_t 
>> > vtd_set_clear_mask_quad(IntelIOMMUState
>> *s, hwaddr addr,
>> >  return new_val;
>> >  }
>> >
>> > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
>> uint8_t devfn,
>> > +   uint16_t *domain_id)
>> > +{
>> > +VTDContextEntry ce;
>> > +int ret_fr;
>> > +
>> > +assert(domain_id);
>> > +
>> > +ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
>> > +if (ret_fr) {
>> > +return -1;
>> > +}
>> > +
>> > +*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
>> ^ one more space
>>
>> > +return 0;
>> > +}
>> > +
>> >  /* GHashTable functions */
>> >  static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
>> >  {
>> > @@ -683,9 +703,6 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
>> uint64_t gpa,
>> >  *reads = (*reads) && (slpte & VTD_SL_R);
>> >  *writes = (*writes) && (slpte & VTD_SL_W);
>> >  if (!(slpte & access_right_check)) {
>> > -VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
>> > -"gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
>> > -(flags == IOMMU_WO ? "write" : "read"), gpa,
>> slpte);
>>
>> Could I ask why we are removing these lines? It can be useful if we
>> have permission issues.
>>
>
> I will return Those lines if flags & NO_FAIL == 0
>
>>
>> >  return (flags == IOMMU_RW || flags == IOMMU_WO) ?
>> > -VTD_FR_WRITE : -VTD_FR_READ;
>> >  }
>> > @@ -734,9 +751,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState
>> *s, uint8_t bus_num,
>> >  }
>> >
>> >  if (!vtd_context_entry_present(ce)) {
>> > -VTD_DPRINTF(GENERAL,
>> > -"error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
>> > -"is not present", devfn, bus_num);
>>
>> Here as well. Any reason to remove it?
>>
>>
> Here as well...
>
>
>> >  return -VTD_FR_CONTEXT_ENTRY_P;
>> >  } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>> > (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
>> > @@ -1065,6 +1079,55 @@ static void 
>> > vtd_iotlb_domain_invalidate(IntelIOMMUState
>> *s, uint16_t domain_id)
>> >  &domain_id);
>> >  }
>> >
>> > +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
>> > +   uint16_t domain_id, hwaddr
>> addr,
>> > +   uint8_t am)
>> > +{
>>
>> The logic of this function

Re: [Qemu-devel] [PATCH v4 RESEND 3/3] IOMMU: enable intel_iommu map and unmap notifiers

2016-10-20 Thread Aviv B.D.
On Wed, Oct 19, 2016 at 12:33 PM, Peter Xu  wrote:

> On Mon, Oct 17, 2016 at 06:44:24PM +0300, Aviv B.D wrote:
> > From: "Aviv Ben-David" 
> >
> > Adds a list of registered vtd_as's to intel iommu state to save
> > iteration over each PCI device in a search of the corrosponding domain.
> >
> > Signed-off-by: Aviv Ben-David 
> > ---
> >  hw/i386/intel_iommu.c  | 102 ++
> ---
> >  hw/i386/intel_iommu_internal.h |   2 +
> >  include/hw/i386/intel_iommu.h  |   9 
> >  3 files changed, 106 insertions(+), 7 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index dcf45f0..34fc1e8 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -51,6 +51,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> VTD_DBGBIT(CSR);
> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> >  #endif
> >
> > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t
> bus_num,
> > +uint8_t devfn, VTDContextEntry *ce);
> > +
> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
> val,
> >  uint64_t wmask, uint64_t w1cmask)
> >  {
> > @@ -142,6 +145,23 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState
> *s, hwaddr addr,
> >  return new_val;
> >  }
> >
> > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t devfn,
> > +   uint16_t *domain_id)
> > +{
> > +VTDContextEntry ce;
> > +int ret_fr;
> > +
> > +assert(domain_id);
> > +
> > +ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > +if (ret_fr) {
> > +return -1;
> > +}
> > +
> > +*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> ^ one more space
>
> > +return 0;
> > +}
> > +
> >  /* GHashTable functions */
> >  static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
> >  {
> > @@ -683,9 +703,6 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
> uint64_t gpa,
> >  *reads = (*reads) && (slpte & VTD_SL_R);
> >  *writes = (*writes) && (slpte & VTD_SL_W);
> >  if (!(slpte & access_right_check)) {
> > -VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
> > -"gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
> > -(flags == IOMMU_WO ? "write" : "read"), gpa,
> slpte);
>
> Could I ask why we are removing these lines? It can be useful if we
> have permission issues.
>

I will return Those lines if flags & NO_FAIL == 0

>
> >  return (flags == IOMMU_RW || flags == IOMMU_WO) ?
> > -VTD_FR_WRITE : -VTD_FR_READ;
> >  }
> > @@ -734,9 +751,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState
> *s, uint8_t bus_num,
> >  }
> >
> >  if (!vtd_context_entry_present(ce)) {
> > -VTD_DPRINTF(GENERAL,
> > -"error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > -"is not present", devfn, bus_num);
>
> Here as well. Any reason to remove it?
>
>
Here as well...


> >  return -VTD_FR_CONTEXT_ENTRY_P;
> >  } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> > (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > @@ -1065,6 +1079,55 @@ static void 
> > vtd_iotlb_domain_invalidate(IntelIOMMUState
> *s, uint16_t domain_id)
> >  &domain_id);
> >  }
> >
> > +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
> > +   uint16_t domain_id, hwaddr
> addr,
> > +   uint8_t am)
> > +{
>
> The logic of this function looks strange to me.
>
> > +IntelIOMMUNotifierNode *node;
> > +
> > +QLIST_FOREACH(node, &(s->notifiers_list), next) {
> > +VTDAddressSpace *vtd_as = node->vtd_as;
> > +uint16_t vfio_domain_id;
> > +int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn,
> > +  &vfio_domain_id);
> > +if (!ret && domain_id == vfio_domain_id) {
> > +IOMMUTLBEntry entry;
> > +

Re: [Qemu-devel] [PATCH v4 RESEND 0/3] IOMMU: intel_iommu support map and unmap notifications

2016-10-20 Thread Aviv B.D.
On Mon, Oct 17, 2016 at 7:07 PM, Alex Williamson  wrote:

> On Mon, 17 Oct 2016 18:44:21 +0300
> "Aviv B.D"  wrote:
>
> > From: "Aviv Ben-David" 
> >
> > * Advertize Cache Mode capability in iommu cap register.
> >   This capability is controlled by "cache-mode" property of intel-iommu
> device.
> >   To enable this option call QEMU with "-device
> intel-iommu,cache-mode=true".
> >
> > * On page cache invalidation in intel vIOMMU, check if the domain belong
> to
> >   registered notifier, and notify accordingly.
> >
> > Currently this patch still doesn't enabling VFIO devices support with
> vIOMMU
> > present. Current problems:
> > * vfio_iommu_map_notify is not aware about memory range belong to
> specific
> >   VFIOGuestIOMMU.
>
> Could you elaborate on why this is an issue?
>

In my setup the VFIO registered two memory areas with one page of
unregistered memory
between them.

When I'm calling memory_region_notify_iommu it calls the notifier function
of VFIO twice
when the second time is failing with warning to console as the new mapping
is already present.

The notifier function of VFIO should ignore IOMMUTLBEntry that is not in
the correct
range.


>
> > * memory_region_iommu_replay hangs QEMU on start up while it itterate
> over
> >   64bit address space. Commenting out the call to this function enables
> >   workable VFIO device while vIOMMU present.
>
> This has been discussed previously, it would be incorrect for vfio not
> to call the replay function.  The solution is to add an iommu driver
> callback to efficiently walk the mappings within a MemoryRegion.
> Thanks,
>

I'm aware about those discussion, I put this info here as helpful reminder
if someone wants to
test my code.


> Alex
>
> > Changes from v1 to v2:
> > * remove assumption that the cache do not clears
> > * fix lockup on high load.
> >
> > Changes from v2 to v3:
> > * remove debug leftovers
> > * split to sepearate commits
> > * change is_write to flags in vtd_do_iommu_translate, add IOMMU_NO_FAIL
> >   to suppress error propagating to guest.
> >
> > Changes from v3 to v4:
> > * Add property to intel_iommu device to control the CM capability,
> >   default to False.
> > * Use s->iommu_ops.notify_flag_changed to register notifiers.
> >
> > Changes from v4 to v4 RESEND:
> > * Fix codding style pointed by checkpatch.pl script.
> >
> > Aviv Ben-David (3):
> >   IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to
> > guest
> >   IOMMU: change iommu_op->translate's is_write to flags, add support to
> > NO_FAIL flag mode
> >   IOMMU: enable intel_iommu map and unmap notifiers
> >
> >  exec.c |   3 +-
> >  hw/i386/amd_iommu.c|   4 +-
> >  hw/i386/intel_iommu.c  | 175 ++
> +++
> >  hw/i386/intel_iommu_internal.h |   3 +
> >  include/exec/memory.h  |   6 +-
> >  include/hw/i386/intel_iommu.h  |  11 +++
> >  memory.c   |   3 +-
> >  7 files changed, 168 insertions(+), 37 deletions(-)
> >
>
>
Aviv.


[Qemu-devel] [PATCH v5 1/3] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2016-10-20 Thread Aviv B.D
From: "Aviv Ben-David" 

This capability asks the guest to invalidate cache before each map operation.
We can use this invalidation to trap map operations in the hypervisor.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 5 +
 hw/i386/intel_iommu_internal.h | 1 +
 include/hw/i386/intel_iommu.h  | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1655a65..834887f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2017,6 +2017,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
 ON_OFF_AUTO_AUTO),
 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
+DEFINE_PROP_BOOL("cache-mode", IntelIOMMUState, cache_mode_enabled, FALSE),
 DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -2391,6 +2392,10 @@ static void vtd_init(IntelIOMMUState *s)
 assert(s->intr_eim != ON_OFF_AUTO_AUTO);
 }
 
+if (s->cache_mode_enabled) {
+s->cap |= VTD_CAP_CM;
+}
+
 vtd_reset_context_cache(s);
 vtd_reset_iotlb(s);
 
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 0829a50..35d9f3a 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -201,6 +201,7 @@
 #define VTD_CAP_MAMV(VTD_MAMV << 48)
 #define VTD_CAP_PSI (1ULL << 39)
 #define VTD_CAP_SLLPS   ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_CM  (1ULL << 7)
 
 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT 8
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 1989c1e..42d293f 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -258,6 +258,8 @@ struct IntelIOMMUState {
 uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
 uint32_t version;
 
+bool cache_mode_enabled;/* RO - is cap CM enabled? */
+
 dma_addr_t root;/* Current root table pointer */
 bool root_extended; /* Type of root table (extended or not) */
 bool dmar_enabled;  /* Set if DMA remapping is enabled */
-- 
1.9.1




[Qemu-devel] [PATCH v5 0/3] IOMMU: intel_iommu support map and unmap notifications

2016-10-20 Thread Aviv B.D
From: "Aviv Ben-David" 

* Advertize Cache Mode capability in iommu cap register. 
  This capability is controlled by "cache-mode" property of intel-iommu device.
  To enable this option call QEMU with "-device intel-iommu,cache-mode=true".

* On page cache invalidation in intel vIOMMU, check if the domain belong to
  registered notifier, and notify accordingly.

Currently this patch still doesn't enabling VFIO devices support with vIOMMU 
present. Current problems:
* vfio_iommu_map_notify is not aware about memory range belong to specific 
  VFIOGuestIOMMU.
* memory_region_iommu_replay hangs QEMU on start up while it itterate over 
  64bit address space. Commenting out the call to this function enables 
  workable VFIO device while vIOMMU present.

Changes from v1 to v2:
* remove assumption that the cache do not clears
* fix lockup on high load.

Changes from v2 to v3:
* remove debug leftovers
* split to sepearate commits
* change is_write to flags in vtd_do_iommu_translate, add IOMMU_NO_FAIL 
  to suppress error propagating to guest.

Changes from v3 to v4:
* Add property to intel_iommu device to control the CM capability, 
  default to False.
* Use s->iommu_ops.notify_flag_changed to register notifiers.

Changes from v4 to v4 RESEND:
* Fix codding style pointed by checkpatch.pl script.

Changes from v4 to v5:
* Reduce the number of changes in patch 2 and make flags real bitfield.
* Revert deleted debug prints.
* Fix memory leak in patch 3.

Aviv Ben-David (3):
  IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to
guest
  IOMMU: change iommu_op->translate's is_write to flags, add support to
NO_FAIL flag mode
  IOMMU: enable intel_iommu map and unmap notifiers

 exec.c |   3 +-
 hw/i386/amd_iommu.c|   4 +-
 hw/i386/intel_iommu.c  | 171 ++---
 hw/i386/intel_iommu_internal.h |   3 +
 hw/ppc/spapr_iommu.c   |   2 +-
 include/exec/memory.h  |   6 +-
 include/hw/i386/intel_iommu.h  |  11 +++
 memory.c   |   3 +-
 8 files changed, 168 insertions(+), 35 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH v5 3/3] IOMMU: enable intel_iommu map and unmap notifiers

2016-10-20 Thread Aviv B.D
From: "Aviv Ben-David" 

Adds a list of registered vtd_as's to intel iommu state to save
iteration over each PCI device in a search of the corrosponding domain.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 109 ++---
 hw/i386/intel_iommu_internal.h |   2 +
 include/hw/i386/intel_iommu.h  |   9 
 3 files changed, 112 insertions(+), 8 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 0821079..46e9b8e 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -54,6 +54,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | 
VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif
 
+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
 uint64_t wmask, uint64_t w1cmask)
 {
@@ -145,6 +148,23 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState 
*s, hwaddr addr,
 return new_val;
 }
 
+static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn,
+   uint16_t *domain_id)
+{
+VTDContextEntry ce;
+int ret_fr;
+
+assert(domain_id);
+
+ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+if (ret_fr) {
+return -1;
+}
+
+*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
+return 0;
+}
+
 /* GHashTable functions */
 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
 {
@@ -679,10 +699,10 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa,
 }
 *reads = (*reads) && (slpte & VTD_SL_R);
 *writes = (*writes) && (slpte & VTD_SL_W);
-if (!(slpte & access_right_check)) {
+if (!(slpte & access_right_check) && !(flags & IOMMU_NO_FAIL)) {
 VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
 "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
-(flags == IOMMU_WO ? "write" : "read"), gpa, slpte);
+(flags & IOMMU_WO ? "write" : "read"), gpa, slpte);
 return (flags & IOMMU_WO) ? -VTD_FR_WRITE : -VTD_FR_READ;
 }
 if (vtd_slpte_nonzero_rsvd(slpte, level)) {
@@ -1064,6 +1084,55 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState 
*s, uint16_t domain_id)
 &domain_id);
 }
 
+static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
+   uint16_t domain_id, hwaddr addr,
+   uint8_t am)
+{
+IntelIOMMUNotifierNode *node;
+
+QLIST_FOREACH(node, &(s->notifiers_list), next) {
+VTDAddressSpace *vtd_as = node->vtd_as;
+uint16_t vfio_domain_id;
+int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn,
+  &vfio_domain_id);
+if (!ret && domain_id == vfio_domain_id) {
+IOMMUTLBEntry entry;
+
+/* notify unmap */
+if (node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
+VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
+addr, am);
+entry.target_as = &address_space_memory;
+entry.iova = addr & VTD_PAGE_MASK_4K;
+entry.translated_addr = 0;
+entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
+entry.perm = IOMMU_NONE;
+memory_region_notify_iommu(&node->vtd_as->iommu, entry);
+}
+
+/* notify map */
+if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
+hwaddr original_addr = addr;
+VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, 
am);
+while (addr < original_addr + (1 << am) * VTD_PAGE_SIZE) {
+/* call to vtd_iommu_translate */
+IOMMUTLBEntry entry = s->iommu_ops.translate(
+ &node->vtd_as->iommu,
+ addr,
+ IOMMU_NO_FAIL);
+if (entry.perm != IOMMU_NONE) {
+addr += entry.addr_mask + 1;
+memory_region_notify_iommu(&node->vtd_as->iommu, 
entry);
+} else {
+addr += VTD_PAGE_SIZE;
+}
+}
+}
+}
+}
+}
+
+
 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
   hwaddr addr, uint8_t am)
 {
@@ -1074,6 +1143,8 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, 
uint16_t domain_id,
 info.addr = addr;
 info.mask = ~((1 << am) - 1);
 g_hash_table_foreach_remove(s->iotl

[Qemu-devel] [PATCH v5 2/3] IOMMU: change iommu_op->translate's is_write to flags, add support to NO_FAIL flag mode

2016-10-20 Thread Aviv B.D
From: "Aviv Ben-David" 

Supports translation trials without reporting error to guest on
translation failure.

Signed-off-by: Aviv Ben-David 
---
 exec.c|  3 ++-
 hw/i386/amd_iommu.c   |  4 ++--
 hw/i386/intel_iommu.c | 59 +--
 hw/ppc/spapr_iommu.c  |  2 +-
 include/exec/memory.h |  6 --
 memory.c  |  3 ++-
 6 files changed, 49 insertions(+), 28 deletions(-)

diff --git a/exec.c b/exec.c
index e63c5a1..9e2e5ca 100644
--- a/exec.c
+++ b/exec.c
@@ -432,7 +432,8 @@ MemoryRegion *address_space_translate(AddressSpace *as, 
hwaddr addr,
 break;
 }
 
-iotlb = mr->iommu_ops->translate(mr, addr, is_write);
+iotlb = mr->iommu_ops->translate(mr, addr,
+ is_write ? IOMMU_WO : IOMMU_RO);
 addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 | (addr & iotlb.addr_mask));
 *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 47b79d9..1f0d76b 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -988,7 +988,7 @@ static inline bool amdvi_is_interrupt_addr(hwaddr addr)
 }
 
 static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr,
- bool is_write)
+ IOMMUAccessFlags flags)
 {
 AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
 AMDVIState *s = as->iommu_state;
@@ -1017,7 +1017,7 @@ static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, 
hwaddr addr,
 return ret;
 }
 
-amdvi_do_translate(as, addr, is_write, &ret);
+amdvi_do_translate(as, addr, flags & IOMMU_WO, &ret);
 trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
 PCI_FUNC(as->devfn), addr, ret.translated_addr);
 return ret;
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 834887f..0821079 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -631,7 +631,8 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t 
level)
 /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
-static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
+static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
+IOMMUAccessFlags flags,
 uint64_t *slptep, uint32_t *slpte_level,
 bool *reads, bool *writes)
 {
@@ -640,7 +641,7 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 uint32_t offset;
 uint64_t slpte;
 uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
-uint64_t access_right_check;
+uint64_t access_right_check = 0;
 
 /* Check if @gpa is above 2^X-1, where X is the minimum of MGAW in CAP_REG
  * and AW in context-entry.
@@ -651,7 +652,15 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 }
 
 /* FIXME: what is the Atomics request here? */
-access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
+if (flags & IOMMU_WO) {
+access_right_check |= VTD_SL_W;
+}
+if (flags & IOMMU_RO) {
+access_right_check |= VTD_SL_R;
+}
+if (flags & IOMMU_NO_FAIL) {
+access_right_check |= VTD_SL_R | VTD_SL_W;
+}
 
 while (true) {
 offset = vtd_gpa_level_offset(gpa, level);
@@ -673,8 +682,8 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 if (!(slpte & access_right_check)) {
 VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
 "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
-(is_write ? "write" : "read"), gpa, slpte);
-return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
+(flags == IOMMU_WO ? "write" : "read"), gpa, slpte);
+return (flags & IOMMU_WO) ? -VTD_FR_WRITE : -VTD_FR_READ;
 }
 if (vtd_slpte_nonzero_rsvd(slpte, level)) {
 VTD_DPRINTF(GENERAL, "error: non-zero reserved field in second "
@@ -791,11 +800,13 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
  *
  * @bus_num: The bus number
  * @devfn: The devfn, which is the  combined of device and function number
- * @is_write: The access is a write operation
+ * @flags: The access permission of the operation, use IOMMU_NO_FAIL to
+ * suppress translation errors (e.g. no mapping present)
  * @entry: IOMMUTLBEntry that contain the addr to be translated and result
  */
 static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
-   uint8_t devfn, hwaddr addr, bool is_write,
+   uint8_t devfn, hwaddr addr,
+   IOMMUAccessFlags flags,

[Qemu-devel] [PATCH v6 3/3] IOMMU: enable intel_iommu map and unmap notifiers

2016-11-08 Thread Aviv B.D
From: "Aviv Ben-David" 

Adds a list of registered vtd_as's to intel iommu state to save
iteration over each PCI device in a search of the corrosponding domain.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 96 +++---
 hw/i386/intel_iommu_internal.h |  2 +
 include/hw/i386/intel_iommu.h  |  9 
 3 files changed, 100 insertions(+), 7 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 6ba915c..095b56d 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -54,6 +54,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | 
VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif
 
+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
 uint64_t wmask, uint64_t w1cmask)
 {
@@ -145,6 +148,23 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState 
*s, hwaddr addr,
 return new_val;
 }
 
+static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn,
+   uint16_t *domain_id)
+{
+VTDContextEntry ce;
+int ret_fr;
+
+assert(domain_id);
+
+ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+if (ret_fr) {
+return -1;
+}
+
+*domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
+return 0;
+}
+
 /* GHashTable functions */
 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
 {
@@ -679,7 +699,7 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa,
 }
 *reads = (*reads) && (slpte & VTD_SL_R);
 *writes = (*writes) && (slpte & VTD_SL_W);
-if (!(slpte & access_right_check)) {
+if (!(slpte & access_right_check) && !(flags & IOMMU_NO_FAIL)) {
 VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
 "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
 (flags & IOMMU_WO ? "write" : "read"), gpa, slpte);
@@ -1064,6 +1084,44 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState 
*s, uint16_t domain_id)
 &domain_id);
 }
 
+static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
+   uint16_t domain_id, hwaddr addr,
+   uint8_t am)
+{
+IntelIOMMUNotifierNode *node;
+
+QLIST_FOREACH(node, &(s->notifiers_list), next) {
+VTDAddressSpace *vtd_as = node->vtd_as;
+uint16_t vfio_domain_id;
+int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn,
+  &vfio_domain_id);
+if (!ret && domain_id == vfio_domain_id) {
+hwaddr original_addr = addr;
+
+while (addr < original_addr + (1 << am) * VTD_PAGE_SIZE) {
+IOMMUTLBEntry entry = s->iommu_ops.translate(
+ &node->vtd_as->iommu,
+ addr,
+ IOMMU_NO_FAIL);
+
+if (entry.perm == IOMMU_NONE &&
+node->notifier_flag & IOMMU_NOTIFIER_UNMAP) {
+entry.target_as = &address_space_memory;
+entry.iova = addr & VTD_PAGE_MASK_4K;
+entry.translated_addr = 0;
+entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT);
+memory_region_notify_iommu(&node->vtd_as->iommu, entry);
+addr += VTD_PAGE_SIZE;
+} else if (node->notifier_flag & IOMMU_NOTIFIER_MAP) {
+memory_region_notify_iommu(&node->vtd_as->iommu, 
entry);
+addr += entry.addr_mask + 1;
+}
+}
+}
+}
+}
+
+
 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
   hwaddr addr, uint8_t am)
 {
@@ -1074,6 +1132,8 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, 
uint16_t domain_id,
 info.addr = addr;
 info.mask = ~((1 << am) - 1);
 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+
+vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
 }
 
 /* Flush IOTLB
@@ -1999,15 +2059,37 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion 
*iommu,
   IOMMUNotifierFlag new)
 {
 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
+IntelIOMMUState *s = vtd_as->iommu_state;
+IntelIOMMUNotifierNode *node = NULL;
+IntelIOMMUNotifierNode *next_node = NULL;
 
-if (new & IOMMU_NOTIFIER_MAP) {
-error_report("Device at bus %s addr %02x.%d requires iommu "
- "notifier which is currently not supported

[Qemu-devel] [PATCH v6 2/3] IOMMU: change iommu_op->translate's is_write to flags, add support to NO_FAIL flag mode

2016-11-08 Thread Aviv B.D
From: "Aviv Ben-David" 

Supports translation trials without reporting error to guest on
translation failure.

Signed-off-by: Aviv Ben-David 
---
 exec.c  |  3 ++-
 hw/alpha/typhoon.c  |  2 +-
 hw/i386/amd_iommu.c |  4 ++--
 hw/i386/intel_iommu.c   | 59 +++--
 hw/pci-host/apb.c   |  2 +-
 hw/ppc/spapr_iommu.c|  2 +-
 hw/s390x/s390-pci-bus.c |  2 +-
 include/exec/memory.h   |  6 +++--
 memory.c|  3 ++-
 9 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/exec.c b/exec.c
index 963b91a..5687c2a 100644
--- a/exec.c
+++ b/exec.c
@@ -466,7 +466,8 @@ MemoryRegion *address_space_translate(AddressSpace *as, 
hwaddr addr,
 break;
 }
 
-iotlb = mr->iommu_ops->translate(mr, addr, is_write);
+iotlb = mr->iommu_ops->translate(mr, addr,
+ is_write ? IOMMU_WO : IOMMU_RO);
 addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 | (addr & iotlb.addr_mask));
 *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c
index 883db13..a2840ac 100644
--- a/hw/alpha/typhoon.c
+++ b/hw/alpha/typhoon.c
@@ -664,7 +664,7 @@ static bool window_translate(TyphoonWindow *win, hwaddr 
addr,
 /* TODO: A translation failure here ought to set PCI error codes on the
Pchip and generate a machine check interrupt.  */
 static IOMMUTLBEntry typhoon_translate_iommu(MemoryRegion *iommu, hwaddr addr,
- bool is_write)
+ IOMMUAccessFlags flags)
 {
 TyphoonPchip *pchip = container_of(iommu, TyphoonPchip, iommu);
 IOMMUTLBEntry ret;
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 47b79d9..1f0d76b 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -988,7 +988,7 @@ static inline bool amdvi_is_interrupt_addr(hwaddr addr)
 }
 
 static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr,
- bool is_write)
+ IOMMUAccessFlags flags)
 {
 AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
 AMDVIState *s = as->iommu_state;
@@ -1017,7 +1017,7 @@ static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, 
hwaddr addr,
 return ret;
 }
 
-amdvi_do_translate(as, addr, is_write, &ret);
+amdvi_do_translate(as, addr, flags & IOMMU_WO, &ret);
 trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
 PCI_FUNC(as->devfn), addr, ret.translated_addr);
 return ret;
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 834887f..6ba915c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -631,7 +631,8 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t 
level)
 /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
-static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
+static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
+IOMMUAccessFlags flags,
 uint64_t *slptep, uint32_t *slpte_level,
 bool *reads, bool *writes)
 {
@@ -640,7 +641,7 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 uint32_t offset;
 uint64_t slpte;
 uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
-uint64_t access_right_check;
+uint64_t access_right_check = 0;
 
 /* Check if @gpa is above 2^X-1, where X is the minimum of MGAW in CAP_REG
  * and AW in context-entry.
@@ -651,7 +652,15 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 }
 
 /* FIXME: what is the Atomics request here? */
-access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
+if (flags & IOMMU_WO) {
+access_right_check |= VTD_SL_W;
+}
+if (flags & IOMMU_RO) {
+access_right_check |= VTD_SL_R;
+}
+if (flags & IOMMU_NO_FAIL) {
+access_right_check |= VTD_SL_R | VTD_SL_W;
+}
 
 while (true) {
 offset = vtd_gpa_level_offset(gpa, level);
@@ -673,8 +682,8 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 if (!(slpte & access_right_check)) {
 VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
 "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
-(is_write ? "write" : "read"), gpa, slpte);
-return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
+(flags & IOMMU_WO ? "write" : "read"), gpa, slpte);
+return (flags & IOMMU_WO) ? -VTD_FR_WRITE : -VTD_FR_READ;
 }
 if (vtd_slpte_nonzero_rsvd(slpte, level)) {
 VTD_DPRINTF(GENERAL, "error: non-zero reserved fi

[Qemu-devel] [PATCH v6 0/3] IOMMU: intel_iommu support map and unmap notifications

2016-11-08 Thread Aviv B.D
From: "Aviv Ben-David" 

* Advertize Cache Mode capability in iommu cap register. 
  This capability is controlled by "cache-mode" property of intel-iommu device.
  To enable this option call QEMU with "-device intel-iommu,cache-mode=true".

* On page cache invalidation in intel vIOMMU, check if the domain belong to
  registered notifier, and notify accordingly.

Currently this patch still doesn't enabling VFIO devices support with vIOMMU 
present. Current problems:
* vfio_iommu_map_notify is not aware about memory range belong to specific 
  VFIOGuestIOMMU.
* memory_region_iommu_replay hangs QEMU on start up while it itterate over 
  64bit address space. Commenting out the call to this function enables 
  workable VFIO device while vIOMMU present.
* vfio_iommu_map_notify should check if address space range is suitable for 
  current notifier.

Changes from v1 to v2:
* remove assumption that the cache do not clears
* fix lockup on high load.

Changes from v2 to v3:
* remove debug leftovers
* split to sepearate commits
* change is_write to flags in vtd_do_iommu_translate, add IOMMU_NO_FAIL 
  to suppress error propagating to guest.

Changes from v3 to v4:
* Add property to intel_iommu device to control the CM capability, 
  default to False.
* Use s->iommu_ops.notify_flag_changed to register notifiers.

Changes from v4 to v4 RESEND:
* Fix codding style pointed by checkpatch.pl script.

Changes from v4 to v5:
* Reduce the number of changes in patch 2 and make flags real bitfield.
* Revert deleted debug prints.
* Fix memory leak in patch 3.

Changes from v5 to v6:
* fix prototype of iommu_translate function for more IOMMU types.
* VFIO will be notified only on the difference, without unmap 
  before change to maps.

Aviv Ben-David (3):
  IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to
guest
  IOMMU: change iommu_op->translate's is_write to flags, add support to
NO_FAIL flag mode
  IOMMU: enable intel_iommu map and unmap notifiers

 exec.c |   3 +-
 hw/alpha/typhoon.c |   2 +-
 hw/i386/amd_iommu.c|   4 +-
 hw/i386/intel_iommu.c  | 160 +
 hw/i386/intel_iommu_internal.h |   3 +
 hw/pci-host/apb.c  |   2 +-
 hw/ppc/spapr_iommu.c   |   2 +-
 hw/s390x/s390-pci-bus.c|   2 +-
 include/exec/memory.h  |   6 +-
 include/hw/i386/intel_iommu.h  |  11 +++
 memory.c   |   3 +-
 11 files changed, 160 insertions(+), 38 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH v6 1/3] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2016-11-08 Thread Aviv B.D
From: "Aviv Ben-David" 

This capability asks the guest to invalidate cache before each map operation.
We can use this invalidation to trap map operations in the hypervisor.

Signed-off-by: Aviv Ben-David 
---
 hw/i386/intel_iommu.c  | 5 +
 hw/i386/intel_iommu_internal.h | 1 +
 include/hw/i386/intel_iommu.h  | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1655a65..834887f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2017,6 +2017,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
 ON_OFF_AUTO_AUTO),
 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
+DEFINE_PROP_BOOL("cache-mode", IntelIOMMUState, cache_mode_enabled, FALSE),
 DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -2391,6 +2392,10 @@ static void vtd_init(IntelIOMMUState *s)
 assert(s->intr_eim != ON_OFF_AUTO_AUTO);
 }
 
+if (s->cache_mode_enabled) {
+s->cap |= VTD_CAP_CM;
+}
+
 vtd_reset_context_cache(s);
 vtd_reset_iotlb(s);
 
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 0829a50..35d9f3a 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -201,6 +201,7 @@
 #define VTD_CAP_MAMV(VTD_MAMV << 48)
 #define VTD_CAP_PSI (1ULL << 39)
 #define VTD_CAP_SLLPS   ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_CM  (1ULL << 7)
 
 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT 8
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 1989c1e..42d293f 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -258,6 +258,8 @@ struct IntelIOMMUState {
 uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
 uint32_t version;
 
+bool cache_mode_enabled;/* RO - is cap CM enabled? */
+
 dma_addr_t root;/* Current root table pointer */
 bool root_extended; /* Type of root table (extended or not) */
 bool dmar_enabled;  /* Set if DMA remapping is enabled */
-- 
1.9.1




Re: [Qemu-devel] [PATCH v6 0/3] IOMMU: intel_iommu support map and unmap notifications

2016-11-16 Thread Aviv B.D.
On Thu, Nov 10, 2016 at 9:20 PM, Michael S. Tsirkin  wrote:

> On Thu, Nov 10, 2016 at 09:04:13AM -0700, Alex Williamson wrote:
> > On Thu, 10 Nov 2016 17:54:35 +0200
> > "Michael S. Tsirkin"  wrote:
> >
> > > On Thu, Nov 10, 2016 at 08:30:21AM -0700, Alex Williamson wrote:
> > > > On Thu, 10 Nov 2016 17:14:24 +0200
> > > > "Michael S. Tsirkin"  wrote:
> > > >
> > > > > On Tue, Nov 08, 2016 at 01:04:21PM +0200, Aviv B.D wrote:
> > > > > > From: "Aviv Ben-David" 
> > > > > >
> > > > > > * Advertize Cache Mode capability in iommu cap register.
> > > > > >   This capability is controlled by "cache-mode" property of
> intel-iommu device.
> > > > > >   To enable this option call QEMU with "-device
> intel-iommu,cache-mode=true".
> > > > > >
> > > > > > * On page cache invalidation in intel vIOMMU, check if the
> domain belong to
> > > > > >   registered notifier, and notify accordingly.
> > > > >
> > > > > This looks sane I think. Alex, care to comment?
> > > > > Merging will have to wait until after the release.
> > > > > Pls remember to re-test and re-ping then.
> > > >
> > > > I don't think it's suitable for upstream until there's a reasonable
> > > > replay mechanism
> > >
> > > Could you pls clarify what do you mean by replay?
> > > Is this when you attach a device by hotplug to
> > > a running system?
> > >
> > > If yes this can maybe be addressed by disabling hotplug temporarily.
> >
> > No, hotplug is not required, moving a device between existing domains
> > requires replay, ie. actually using it for nested device assignment.
>
> Good point, that one is a correctness thing. Aviv,
> could you add this in TODO list in a cover letter pls?
>

Sure, no problem.


>
> > > > and we straighten out whether it's expected to get
> > > > multiple notifies and the notif-ee is responsible for filtering
> > > > them or if the notif-er should do filtering.
> > >
> > > OK this is a documentation thing.
> >
> > Well no, it needs to be decided and if necessary implemented.
>
> Let's assume it's the notif-ee for now. Less is more and all that.
>
> > > >  Without those, this is
> > > > effectively just an RFC.
> > >
> > > It's infrastructure without users so it doesn't break things,
> > > I'm more interested in seeing whether it's broken in
> > > some way than whether it's complete.
> >
> > If it allows use with vfio but doesn't fully implement the complete set
> > of interfaces, it does break things.  We currently prevent viommu usage
> > with vfio because it is incomplete.
>
> Right - that bit is still in as far as I can see.
>
> > > The patchset spent out of tree too long and I'd like to see
> > > us make progress towards device assignment working with
> > > vIOMMU sooner rather than later, so if it's broken I won't
> > > merge it but if it's incomplete I will.
> >
> > So long as it's incomplete and still prevents vfio usage, I'm ok with
> > merging it, but I don't want to enable vfio usage until it's complete.
> > Thanks,
> >
> > Alex
> >
> > > > > > Currently this patch still doesn't enabling VFIO devices support
> with vIOMMU
> > > > > > present. Current problems:
> > > > > > * vfio_iommu_map_notify is not aware about memory range belong
> to specific
> > > > > >   VFIOGuestIOMMU.
> > > > > > * memory_region_iommu_replay hangs QEMU on start up while it
> itterate over
> > > > > >   64bit address space. Commenting out the call to this function
> enables
> > > > > >   workable VFIO device while vIOMMU present.
> > > > > > * vfio_iommu_map_notify should check if address space range is
> suitable for
> > > > > >   current notifier.
> > > > > >
> > > > > > Changes from v1 to v2:
> > > > > > * remove assumption that the cache do not clears
> > > > > > * fix lockup on high load.
> > > > > >
> > > > > > Changes from v2 to v3:
> > > > > > * remove debug leftovers
> > > > > > * split to separate comm

Re: [Qemu-devel] [PATCH v6 0/3] IOMMU: intel_iommu support map and unmap notifications

2016-11-16 Thread Aviv B.D.
On Wed, Nov 16, 2016 at 5:34 PM, Alex Williamson  wrote:

> On Wed, 16 Nov 2016 15:54:56 +0200
> "Michael S. Tsirkin"  wrote:
>
> > On Thu, Nov 10, 2016 at 12:44:47PM -0700, Alex Williamson wrote:
> > > On Thu, 10 Nov 2016 21:20:36 +0200
> > > "Michael S. Tsirkin"  wrote:
> > >
> > > > On Thu, Nov 10, 2016 at 09:04:13AM -0700, Alex Williamson wrote:
> > > > > On Thu, 10 Nov 2016 17:54:35 +0200
> > > > > "Michael S. Tsirkin"  wrote:
> > > > >
> > > > > > On Thu, Nov 10, 2016 at 08:30:21AM -0700, Alex Williamson wrote:
> > > > > > > On Thu, 10 Nov 2016 17:14:24 +0200
> > > > > > > "Michael S. Tsirkin"  wrote:
> > > > > > >
> > > > > > > > On Tue, Nov 08, 2016 at 01:04:21PM +0200, Aviv B.D wrote:
> > > > > > > > > From: "Aviv Ben-David" 
> > > > > > > > >
> > > > > > > > > * Advertize Cache Mode capability in iommu cap register.
> > > > > > > > >   This capability is controlled by "cache-mode" property
> of intel-iommu device.
> > > > > > > > >   To enable this option call QEMU with "-device
> intel-iommu,cache-mode=true".
> > > > > > > > >
> > > > > > > > > * On page cache invalidation in intel vIOMMU, check if the
> domain belong to
> > > > > > > > >   registered notifier, and notify accordingly.
> > > > > > > >
> > > > > > > > This looks sane I think. Alex, care to comment?
> > > > > > > > Merging will have to wait until after the release.
> > > > > > > > Pls remember to re-test and re-ping then.
> > > > > > >
> > > > > > > I don't think it's suitable for upstream until there's a
> reasonable
> > > > > > > replay mechanism
> > > > > >
> > > > > > Could you pls clarify what do you mean by replay?
> > > > > > Is this when you attach a device by hotplug to
> > > > > > a running system?
> > > > > >
> > > > > > If yes this can maybe be addressed by disabling hotplug
> temporarily.
> > > > >
> > > > > No, hotplug is not required, moving a device between existing
> domains
> > > > > requires replay, ie. actually using it for nested device
> assignment.
> > > >
> > > > Good point, that one is a correctness thing. Aviv,
> > > > could you add this in TODO list in a cover letter pls?
> > > >
> > > > > > > and we straighten out whether it's expected to get
> > > > > > > multiple notifies and the notif-ee is responsible for filtering
> > > > > > > them or if the notif-er should do filtering.
> > > > > >
> > > > > > OK this is a documentation thing.
> > > > >
> > > > > Well no, it needs to be decided and if necessary implemented.
> > > >
> > > > Let's assume it's the notif-ee for now. Less is more and all that.
> > >
> > > I think this is opposite of the approach dwg suggested.
> > >
> > > > > > >  Without those, this is
> > > > > > > effectively just an RFC.
> > > > > >
> > > > > > It's infrastructure without users so it doesn't break things,
> > > > > > I'm more interested in seeing whether it's broken in
> > > > > > some way than whether it's complete.
> > > > >
> > > > > If it allows use with vfio but doesn't fully implement the
> complete set
> > > > > of interfaces, it does break things.  We currently prevent viommu
> usage
> > > > > with vfio because it is incomplete.
> > > >
> > > > Right - that bit is still in as far as I can see.
> > >
> > > Nope, 3/3 changes vtd_iommu_notify_flag_changed() to allow use with
> > > vfio even though it's still incomplete.  We would at least need
> > > something like a replay callback for VT-d that triggers an abort if you
> > > still want to accept it incomplete.  Thanks,
> > >
> > > Alex
> >
> > IIUC practically things seems to work, right?
>
> AFAIK, no.
>
> > So how about disabling by default with a flag for people that want to
> > experiment with it?
> > E.g. x-vfio-allow-broken-translations ?
>
> We've already been through one round of "intel-iommu is incomplete for
> use with device assignment, how can we prevent it from being used",
> which led to the notify_flag_changed callback on MemoryRegionIOMMUOps.
> This series now claims to fix that yet still doesn't provide a
> mechanism to do memory_region_iommu_replay() given that VT-d has a much
> larger address width.  Why is the onus on vfio to resolve this or
> provide some sort of workaround?  vfio is using the QEMU iommu
> interface correctly, intel-iommu is still incomplete. The least it
> could do is add an optional replay callback to MemoryRegionIOMMUOps
> that supersedes the existing memory_region_iommu_replay() code and
> triggers an abort when it gets called.  I don't know what an
> x-vfio-allow-broken-translations option would do, how I'd implement it,
> or why I'd bother to implement it.  Thanks,
>
> Alex
>

I'll implement your suggestion regarding the replay framework.
Probably in another patch set, if it is OK?

Thanks,
Aviv.


Re: [Qemu-devel] [PATCH v7 17/17] intel_iommu: enable vfio devices

2017-03-19 Thread Aviv B.D.
Hi Peter,
Thanks, I think that I should receive credit for this patch.

Please attribute it under my technion mail: bda...@cs.technion.ac.il.

The signed-off line should be:

Signed-off-by: Aviv Ben-David 

Thanks,
Aviv.

On Thu, Mar 16, 2017 at 6:05 AM, Peter Xu  wrote:

> On Tue, Feb 07, 2017 at 04:28:19PM +0800, Peter Xu wrote:
> > This patch is based on Aviv Ben-David ()'s patch
> > upstream:
> >
> >   "IOMMU: enable intel_iommu map and unmap notifiers"
> >   https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg01453.html
> >
> > However I removed/fixed some content, and added my own codes.
> >
> > Instead of translate() every page for iotlb invalidations (which is
> > slower), we walk the pages when needed and notify in a hook function.
> >
> > This patch enables vfio devices for VT-d emulation.
> >
> > And, since we already have vhost DMAR support via device-iotlb, a
> > natural benefit that this patch brings is that vt-d enabled vhost can
> > live even without ATS capability now. Though more tests are needed.
> >
>
> Hi, Michael,
>
> If there is any possiblility that this version be merged in the future
> at any point, would you please help add Aviv's sign-off into this
> patch as well right here (I think it should be before Jason's r-b):
>
> Signed-off-by: Aviv Ben-David 
>
> Since I think we should definitely give Aviv more credit since he's
> done a great work before (and devoted lots of time).
>
> (Aviv, please reply if you have other opinions, or I'll just make
>  myself bold)
>
> Thanks,
>
> > Reviewed-by: Jason Wang 
> > Signed-off-by: Peter Xu 
>
> [...]
>
> -- peterx
>