Re: [PATCH v4 0/2] iommu: Add iommu_error class event to iommu trace

2013-09-25 Thread Joerg Roedel
On Tue, Sep 24, 2013 at 03:21:18PM -0600, Shuah Khan wrote:
 Shuah Khan (2):
   iommu: Add iommu_error class event to iommu trace
   iommu: Change iommu driver to call io_page_fault trace event

Applied to the tracing branch, thanks Shuah.


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 4/7] iommu/arm-smmu: Check for num_context_irqs 0 to avoid divide by zero exception

2013-09-25 Thread Andreas Herrmann
On Tue, Sep 24, 2013 at 11:40:48AM -0400, Will Deacon wrote:
 On Tue, Sep 24, 2013 at 04:06:58PM +0100, Andreas Herrmann wrote:
  With the right (or wrong;-) definition of v1 SMMU node in DTB it is
  possible to trigger a division by zero in arm_smmu_init_domain_context
  (if number of context irqs is 0):
  
 if (smmu-version == 1) {
 root_cfg-irptndx = atomic_inc_return(smmu-irptndx);
   → root_cfg-irptndx %= smmu-num_context_irqs;
 } else {
  
  Avoid this by checking for num_context_irqs  0 before trying to
  assign interrupts to contexts.
  
  Signed-off-by: Andreas Herrmann andreas.herrm...@calxeda.com
  ---
   drivers/iommu/arm-smmu.c |   31 +--
   1 file changed, 17 insertions(+), 14 deletions(-)
  
  diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
  index f5a856e..0dfd255 100644
  --- a/drivers/iommu/arm-smmu.c
  +++ b/drivers/iommu/arm-smmu.c
  @@ -828,21 +828,24 @@ static int arm_smmu_init_domain_context(struct 
  iommu_domain *domain,
  return ret;
   
  root_cfg-cbndx = ret;
  -   if (smmu-version == 1) {
  -   root_cfg-irptndx = atomic_inc_return(smmu-irptndx);
  -   root_cfg-irptndx %= smmu-num_context_irqs;
  -   } else {
  -   root_cfg-irptndx = root_cfg-cbndx;
  -   }
   
  -   irq = smmu-irqs[smmu-num_global_irqs + root_cfg-irptndx];
  -   ret = request_irq(irq, arm_smmu_context_fault, IRQF_SHARED,
  - arm-smmu-context-fault, domain);
  -   if (IS_ERR_VALUE(ret)) {
  -   dev_err(smmu-dev, failed to request context IRQ %d (%u)\n,
  -   root_cfg-irptndx, irq);
  -   root_cfg-irptndx = -1;
  -   goto out_free_context;
  +   if (smmu-num_context_irqs) {
 
 Can we move this check to probe time, to avoid re-evaluating it every time
 we initialise a new domain?

Yes, I'll move this check and issue an error message when there is not
at least one context interrupt available.

Andreas
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

[PATCH 3/3] Fix compile error in drivers/gpu/drm/msm/msm_drv.c with IOMMU disabled

2013-09-25 Thread Joerg Roedel
The function msm_iommu_get_ctx() is needed by the MSM-GPU
driver with and without IOMMU compiled in. Make the
function available when no IOMMU driver is there.

Signed-off-by: Joerg Roedel j...@8bytes.org
---
 drivers/iommu/msm_iommu.h |7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/iommu/msm_iommu.h b/drivers/iommu/msm_iommu.h
index 5c7c955..da53558 100644
--- a/drivers/iommu/msm_iommu.h
+++ b/drivers/iommu/msm_iommu.h
@@ -108,7 +108,14 @@ struct msm_iommu_ctx_drvdata {
  * Useful for testing and drivers that do not yet fully have IOMMU stuff in
  * their platform devices.
  */
+#ifdef CONFIG_MSM_IOMMU
 struct device *msm_iommu_get_ctx(const char *ctx_name);
+#else
+static inline struct device *msm_iommu_get_ctx(const char *ctx_name)
+{
+   return NULL;
+}
+#endif
 
 /*
  * Interrupt handler for the IOMMU context fault interrupt. Hooking the
-- 
1.7.9.5


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 1/3] MSM: Remove iommu include from drivers/gpu/drm/msm/mdp4/mdp4_kms.c

2013-09-25 Thread Joerg Roedel
The include file has been removed and the file does not
need it anyway, so remove it. Fixes a compile error.

Signed-off-by: Joerg Roedel j...@8bytes.org
---
 drivers/gpu/drm/msm/mdp4/mdp4_kms.c |2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/msm/mdp4/mdp4_kms.c 
b/drivers/gpu/drm/msm/mdp4/mdp4_kms.c
index 5db5bba..bc7fd11 100644
--- a/drivers/gpu/drm/msm/mdp4/mdp4_kms.c
+++ b/drivers/gpu/drm/msm/mdp4/mdp4_kms.c
@@ -19,8 +19,6 @@
 #include msm_drv.h
 #include mdp4_kms.h
 
-#include mach/iommu.h
-
 static struct mdp4_platform_config *mdp4_get_config(struct platform_device 
*dev);
 
 static int mdp4_hw_init(struct msm_kms *kms)
-- 
1.7.9.5


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 6/7] iommu/arm-smmu: Add module parameter arm-smmu=off|force_isolation

2013-09-25 Thread Andreas Herrmann
On Tue, Sep 24, 2013 at 11:42:18AM -0400, Will Deacon wrote:
 On Tue, Sep 24, 2013 at 04:07:00PM +0100, Andreas Herrmann wrote:
arm-smmu= arm-smmu driver option
  
  off Disable arm-smmu driver (ie. ignore available SMMUs)
  
  force_isolation
  Try to attach each master device on every SMMU to a
  separate iommu_domain.
  
  Default is that driver detects SMMUs but no translation is configured
  (transactions just bypass translation process).
  
  Signed-off-by: Andreas Herrmann andreas.herrm...@calxeda.com
  ---
   drivers/iommu/arm-smmu.c |   26 ++
   1 file changed, 26 insertions(+)
  
  diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
  index 3eb2259..251564e 100644
  --- a/drivers/iommu/arm-smmu.c
  +++ b/drivers/iommu/arm-smmu.c
  @@ -399,6 +399,9 @@ struct arm_smmu_domain {
   static DEFINE_SPINLOCK(arm_smmu_devices_lock);
   static LIST_HEAD(arm_smmu_devices);
   
  +static bool arm_smmu_disabled;
  +static bool arm_smmu_force_isolation;
  +
   static struct arm_smmu_master *find_smmu_master(struct arm_smmu_device 
  *smmu,
  struct device_node *dev_node)
   {
  @@ -1837,6 +1840,9 @@ static int arm_smmu_device_dt_probe(struct 
  platform_device *pdev)
  struct of_phandle_args masterspec;
  int num_irqs, i, err;
   
  +   if (arm_smmu_disabled)
  +   return -ENODEV;
  +
  smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL);
  if (!smmu) {
  dev_err(dev, failed to allocate arm_smmu_device\n);
  @@ -2022,6 +2028,23 @@ static struct platform_driver arm_smmu_driver = {
  .remove = arm_smmu_device_remove,
   };
   
  +static int __init arm_smmu_parse_options(char *str)
  +{
  +   if (*str) {
  +   str++;
  +   if (!strncmp(str, off, 3))
  +   arm_smmu_disabled = true;
  +   else if(!strncmp(str, force_isolation, 15))
  +   arm_smmu_force_isolation = true;
  +   else {
  +   pr_warn(arm_smmu: invalid parameter (\%s\)\n, str);
  +   return 0;
  +   }
  +   }
  +   return 1;
  +}
  +__setup(arm-smmu, arm_smmu_parse_options);
 
 If this is going to be a common function for IOMMUs, let's instead move the
 command-line parsing out into the generic IOMMU layer, then have an optional
 callback into the low-level IOMMU driver for enabling/disabling it.

Makes sense and I am currently looking into it.


Andreas
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread Joerg Roedel
On Sat, Sep 21, 2013 at 09:59:00PM -0500, David Woodhouse wrote:
 I *hate* the bizarre calling convention for iommu_unmap(). Is it
 actually clearly documented anywhere? Why on earth is it not just
 returning void, and expected to unmap what it was *asked* to unmap?

Yeah, I agree that this should be documented since it is quite
non-standard/non-obvious behaviour of a function. The reason the
interface was implemented this way is that the caller should not need to
know (or keep track of) the page-size which was used to map a given iova.

So the interface is basically, that you give an iova and a size and the
iommu driver unmaps _at_least_ a region of that size. But if you ask for
a 4k region which is mapped by a 2M page then the whole 2M are unmapped.
The return value tells you how much was actually unmapped.

Not the best interface, I know. We should come up with a better way to
handle this.


Joerg


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 6/7] iommu/arm-smmu: Add module parameter arm-smmu=off|force_isolation

2013-09-25 Thread Joerg Roedel
On Tue, Sep 24, 2013 at 04:42:18PM +0100, Will Deacon wrote:
 On Tue, Sep 24, 2013 at 04:07:00PM +0100, Andreas Herrmann wrote:
  +__setup(arm-smmu, arm_smmu_parse_options);
 
 If this is going to be a common function for IOMMUs, let's instead move the
 command-line parsing out into the generic IOMMU layer, then have an optional
 callback into the low-level IOMMU driver for enabling/disabling it.

Hmm, actually the force-isolation parameters that different IOMMU
drivers implement are for their version of the DMA-API which is not yet
available in the generic IOMMU layer. Unless this changes I think it is
a good idea to keep these parameters close to the actual DMA-API
implementation used and not move them to the IOMMU layer.


Joerg


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread Alex Williamson
On Wed, 2013-09-25 at 17:54 +0200, Joerg Roedel wrote:
 On Sat, Sep 21, 2013 at 09:59:00PM -0500, David Woodhouse wrote:
  I *hate* the bizarre calling convention for iommu_unmap(). Is it
  actually clearly documented anywhere? Why on earth is it not just
  returning void, and expected to unmap what it was *asked* to unmap?
 
 Yeah, I agree that this should be documented since it is quite
 non-standard/non-obvious behaviour of a function. The reason the
 interface was implemented this way is that the caller should not need to
 know (or keep track of) the page-size which was used to map a given iova.
 
 So the interface is basically, that you give an iova and a size and the
 iommu driver unmaps _at_least_ a region of that size. But if you ask for
 a 4k region which is mapped by a 2M page then the whole 2M are unmapped.
 The return value tells you how much was actually unmapped.
 
 Not the best interface, I know. We should come up with a better way to
 handle this.

Actually, unmap can return zero too, which happens if you try to unmap
an offset into a super page mapping.  The current interface has issues,
but both legacy kvm assignment and vfio rely on it so we don't need to
track individual mappings.  Thanks,

Alex

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 5/7] iommu: supress loff_t compilation error on powerpc

2013-09-25 Thread Alex Williamson
On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
 Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
 ---
  drivers/vfio/pci/vfio_pci_rdwr.c |3 ++-
  1 files changed, 2 insertions(+), 1 deletions(-)
 
 diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c 
 b/drivers/vfio/pci/vfio_pci_rdwr.c
 index 210db24..8a8156a 100644
 --- a/drivers/vfio/pci/vfio_pci_rdwr.c
 +++ b/drivers/vfio/pci/vfio_pci_rdwr.c
 @@ -181,7 +181,8 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, 
 char __user *buf,
  size_t count, loff_t *ppos, bool iswrite)
  {
   int ret;
 - loff_t off, pos = *ppos  VFIO_PCI_OFFSET_MASK;
 + loff_t off;
 + u64 pos = (u64 )(*ppos  VFIO_PCI_OFFSET_MASK);
   void __iomem *iomem = NULL;
   unsigned int rsrc;
   bool is_ioport;

What's the compile error that this fixes?

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device

2013-09-25 Thread Alex Williamson
On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
 This API returns the iommu domain to which the device is attached.
 The iommu_domain is required for making API calls related to iommu.
 Follow-up patches use this API to know the iommu mapping.
 
 Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
 ---
  drivers/iommu/iommu.c |   10 ++
  include/linux/iommu.h |7 +++
  2 files changed, 17 insertions(+), 0 deletions(-)
 
 diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
 index fbe9ca7..6ac5f50 100644
 --- a/drivers/iommu/iommu.c
 +++ b/drivers/iommu/iommu.c
 @@ -696,6 +696,16 @@ void iommu_detach_device(struct iommu_domain *domain, 
 struct device *dev)
  }
  EXPORT_SYMBOL_GPL(iommu_detach_device);
  
 +struct iommu_domain *iommu_get_dev_domain(struct device *dev)
 +{
 + struct iommu_ops *ops = dev-bus-iommu_ops;
 +
 + if (unlikely(ops == NULL || ops-get_dev_iommu_domain == NULL))
 + return NULL;
 +
 + return ops-get_dev_iommu_domain(dev);
 +}
 +EXPORT_SYMBOL_GPL(iommu_get_dev_domain);

What prevents this from racing iommu_domain_free()?  There's no
references acquired, so there's no reason for the caller to assume the
pointer is valid.

  /*
   * IOMMU groups are really the natrual working unit of the IOMMU, but
   * the IOMMU API works on domains and devices.  Bridge that gap by
 diff --git a/include/linux/iommu.h b/include/linux/iommu.h
 index 7ea319e..fa046bd 100644
 --- a/include/linux/iommu.h
 +++ b/include/linux/iommu.h
 @@ -127,6 +127,7 @@ struct iommu_ops {
   int (*domain_set_windows)(struct iommu_domain *domain, u32 w_count);
   /* Get the numer of window per domain */
   u32 (*domain_get_windows)(struct iommu_domain *domain);
 + struct iommu_domain *(*get_dev_iommu_domain)(struct device *dev);
  
   unsigned long pgsize_bitmap;
  };
 @@ -190,6 +191,7 @@ extern int iommu_domain_window_enable(struct iommu_domain 
 *domain, u32 wnd_nr,
 phys_addr_t offset, u64 size,
 int prot);
  extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 
 wnd_nr);
 +extern struct iommu_domain *iommu_get_dev_domain(struct device *dev);
  /**
   * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework
   * @domain: the iommu domain where the fault has happened
 @@ -284,6 +286,11 @@ static inline void iommu_domain_window_disable(struct 
 iommu_domain *domain,
  {
  }
  
 +static inline struct iommu_domain *iommu_get_dev_domain(struct device *dev)
 +{
 + return NULL;
 +}
 +
  static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, 
 dma_addr_t iova)
  {
   return 0;



___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 7/7] iommu/arm-smmu: Clear global and context bank fault status and syndrome registers

2013-09-25 Thread Will Deacon
On Tue, Sep 24, 2013 at 07:32:47PM +0100, Andreas Herrmann wrote:
 On Tue, Sep 24, 2013 at 11:42:52AM -0400, Will Deacon wrote:
  On Tue, Sep 24, 2013 at 04:07:01PM +0100, Andreas Herrmann wrote:
   Signed-off-by: Andreas Herrmann andreas.herrm...@calxeda.com
   ---
drivers/iommu/arm-smmu.c |9 +
1 file changed, 9 insertions(+)
   
   diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
   index 251564e..a499146 100644
   --- a/drivers/iommu/arm-smmu.c
   +++ b/drivers/iommu/arm-smmu.c
   @@ -645,6 +645,10 @@ static void arm_smmu_init_context_bank(struct 
   arm_smmu_domain *smmu_domain)
 stage1 = root_cfg-cbar != CBAR_TYPE_S2_TRANS;
 cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, root_cfg-cbndx);

   + /* clear fsr */
   + writel_relaxed(0x, cb_base + ARM_SMMU_CB_FSR);
   + writel_relaxed(0, cb_base + ARM_SMMU_CB_FSYNR0);
   +
 /* CBAR */
 reg = root_cfg-cbar;
 if (smmu-version == 1)
   @@ -1570,6 +1574,11 @@ static void arm_smmu_device_reset(struct 
   arm_smmu_device *smmu)
 int i = 0;
 u32 scr0 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sCR0);

   + /* clear global FSRs */
   + writel(0x, gr0_base + ARM_SMMU_GR0_sGFSR);
   + writel(0, gr0_base + ARM_SMMU_GR0_sGFSYNR0);
   + writel(0, gr0_base + ARM_SMMU_GR0_sGFSYNR1);
  
  Why do you need this?
 
 According to the spec the status and syndrome registers have
 unknown/unpredictable reset values. So better set known values before
 we start to use these registers (ie. handle faults where we read
 them). No?

Sure, but the only time these are made visible should be if we take a fault,
in which case the registers should contain something meaningful. I guess I
could see a problem if there is a shared interrupt line for faults though,
so a cut-down version of your patch (just clearing ARM_SMMU_GR0_sGFSR and
the ARM_SMMU_CB_FSR registers) should be enough.

Cheers,

Will
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread David Woodhouse
On Wed, 2013-09-25 at 17:54 +0200, Joerg Roedel wrote:
 On Sat, Sep 21, 2013 at 09:59:00PM -0500, David Woodhouse wrote:
  I *hate* the bizarre calling convention for iommu_unmap(). Is it
  actually clearly documented anywhere? Why on earth is it not just
  returning void, and expected to unmap what it was *asked* to unmap?
 
 Yeah, I agree that this should be documented since it is quite
 non-standard/non-obvious behaviour of a function. The reason the
 interface was implemented this way is that the caller should not need to
 know (or keep track of) the page-size which was used to map a given iova.

Why would it ever care? If it *happens* to map something that can use
large pages, yay!. If it subsequently breaks apart those large pages by
unmapping 4KiB in the middle, let the IOMMU driver break that apart.

And why are we not allowed to unmap more than a single page at a time?

Doing unmaps in small pieces (with a full flush between every one) is
*horribly* slow.

 So the interface is basically, that you give an iova and a size and the
 iommu driver unmaps _at_least_ a region of that size. But if you ask for
 a 4k region which is mapped by a 2M page then the whole 2M are unmapped.
 The return value tells you how much was actually unmapped.
 
 Not the best interface, I know. We should come up with a better way to
 handle this.

Seriously, just the address and the size. Let the IOMMU code deal with
whether it can *actually* use large pages on the particular set of
IOMMUs that are in use for the devices you've added to the domain so
far.

-- 
dwmw2


smime.p7s
Description: S/MIME cryptographic signature
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH 2/7] iommu/arm-smmu: Calculate SMMU_CB_BASE from smmu register values

2013-09-25 Thread Will Deacon
On Tue, Sep 24, 2013 at 07:07:20PM +0100, Andreas Herrmann wrote:
 On Tue, Sep 24, 2013 at 11:34:57AM -0400, Will Deacon wrote:
  On Tue, Sep 24, 2013 at 04:06:56PM +0100, Andreas Herrmann wrote:
   Currently it is derived from smmu resource size. If the resource size
   is wrongly specified (e.g. too large) this leads to a miscalculation
   and can cause undefined behaviour when context bank registers are
   modified.

[...]

  Hmm, this is a tricky one. We know that we have an inconsistency (i.e. the
  DT and the hardware don't agree on the size of the device) but we warn and
  attempt to continue with the value from the DT. I don't think that trusting
  the hardware is the right thing to do in this case, since it's not possible
  to change so we should let the DT act as an override.
 
  In other words: if the device tree is wrong, go fix it.
 
 Yes, I've found this issue with a wrong DT. With the original code
 there was some weirdness when setting certain context bank
 registers. (Identifying the root cause was not straight forward.)
 
 I think it's somehow odd not to trust the hardware values in the first
 place and to add (right from the beginning) a quirk for potential
 implementation bugs. Are there already implementations that use wrong
 register values that are required to determine the partitioning of the
 SMMU address space?

I don't know of any, but you can bet that people will want to run old
kernels on future hardware, so we should try and get this right from day
one.

 If there is a mismatch it's hard to say which value is the correct
 one. I think there are three options:
 (1) just print a warning about the mismatch
 (2) print a warning + override based on DT
 (3) print a warning + override based on DT + have an option to switch
 off the override
 
 So, what's your choice?

I had gone for (2), on the assumption that fixing a broken DT shouldn't be
too hard as well as allowing people to work around broken hardware. Yes, it
means we treat the DT as golden, but that's already the case in the absence
of fully probeable hardware.

Will
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread Joerg Roedel
On Wed, Sep 25, 2013 at 05:05:13PM +0100, David Woodhouse wrote:
 On Wed, 2013-09-25 at 17:54 +0200, Joerg Roedel wrote:
  Yeah, I agree that this should be documented since it is quite
  non-standard/non-obvious behaviour of a function. The reason the
  interface was implemented this way is that the caller should not need to
  know (or keep track of) the page-size which was used to map a given iova.
 
 Why would it ever care? If it *happens* to map something that can use
 large pages, yay!. If it subsequently breaks apart those large pages by
 unmapping 4KiB in the middle, let the IOMMU driver break that apart.

All that was implemented back in the old days where KVM was the only
user of that code. And back then it didn't make sense for the IOMMU
driver to split huge-pages because in the next step KVM would unmap all
the small pages anyway. So I took the other route and just told KVM how
much was unmapped so that the unmap code could skip over it.

 And why are we not allowed to unmap more than a single page at a time?

The user of iommu_unmap is actually allowed to do that. But the
page-size splitting it done in the iommu_unmap() function already to
have that code common between different IOMMU drivers. In the end the
IOMMU driver only exports the page-size bitmap and is fine.

 Doing unmaps in small pieces (with a full flush between every one) is
 *horribly* slow.

I proposed an iommu_commit() function to move all the necessary flushes
there after a couple of page-table changes. And this also allows to keep
the generic page-size splitting code generic.

 Seriously, just the address and the size. Let the IOMMU code deal with
 whether it can *actually* use large pages on the particular set of
 IOMMUs that are in use for the devices you've added to the domain so
 far.

Actually we can get rid of the unmapped_size thing by putting the same
constraints the DMA-API has in place. The user of the API has to unmap a
region with the same iova,size parameters used for mapping it. Then we
can guarantee that only the requested range will be unmapped.


Joerg


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread Alex Williamson
On Wed, 2013-09-25 at 17:05 +0100, David Woodhouse wrote:
 On Wed, 2013-09-25 at 17:54 +0200, Joerg Roedel wrote:
  On Sat, Sep 21, 2013 at 09:59:00PM -0500, David Woodhouse wrote:
   I *hate* the bizarre calling convention for iommu_unmap(). Is it
   actually clearly documented anywhere? Why on earth is it not just
   returning void, and expected to unmap what it was *asked* to unmap?
  
  Yeah, I agree that this should be documented since it is quite
  non-standard/non-obvious behaviour of a function. The reason the
  interface was implemented this way is that the caller should not need to
  know (or keep track of) the page-size which was used to map a given iova.
 
 Why would it ever care? If it *happens* to map something that can use
 large pages, yay!. If it subsequently breaks apart those large pages by
 unmapping 4KiB in the middle, let the IOMMU driver break that apart.

Can this be done atomically?  I thought part of the reason for this
interface was that iommu drivers typically couldn't replace a huge page
with multiple smaller pages in the presence of DMA.

 And why are we not allowed to unmap more than a single page at a time?
 
 Doing unmaps in small pieces (with a full flush between every one) is
 *horribly* slow.

While we call iommu_unmap with PAGE_SIZE, we certainly expect that
larger chunks will be used if they were mapped that way.

  So the interface is basically, that you give an iova and a size and the
  iommu driver unmaps _at_least_ a region of that size. But if you ask for
  a 4k region which is mapped by a 2M page then the whole 2M are unmapped.
  The return value tells you how much was actually unmapped.
  
  Not the best interface, I know. We should come up with a better way to
  handle this.
 
 Seriously, just the address and the size. Let the IOMMU code deal with
 whether it can *actually* use large pages on the particular set of
 IOMMUs that are in use for the devices you've added to the domain so
 far.

You're contradicting yourself here.  Just let the iommu deal with it,
but now you're raising concerns that Intel may be mixing super page
IOMMU hardware with non-super page IOMMU hardware on the same box, so I
do need to be concerned, yet there's no interface to discover super page
support.  What happens if my IOMMU domain makes use of super pages and
then I add a new device behind a new IOMMU without hardware super page
support?  We have the same problem with snoop control already.  There's
no atomic re-mapping interface, so KVM just pretends it can unmap and
remap without anyone noticing.  Thanks,

Alex

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread David Woodhouse
On Wed, 2013-09-25 at 11:36 -0600, Alex Williamson wrote:
 On Wed, 2013-09-25 at 17:05 +0100, David Woodhouse wrote:
  Why would it ever care? If it *happens* to map something that can use
  large pages, yay!. If it subsequently breaks apart those large pages by
  unmapping 4KiB in the middle, let the IOMMU driver break that apart.
 
 Can this be done atomically?  I thought part of the reason for this
 interface was that iommu drivers typically couldn't replace a huge page
 with multiple smaller pages in the presence of DMA.

For the Intel IOMMU it can. You can atomically change from a large page
entry, to a pointer to a full set of smaller page tables. Do the IOTLB
flush, and at no time is there an interruption in service.

Not sure if this is true for *all* IOMMU hardware; I'd be perfectly
happy to accept a variant of Jörg's proposal that we should only ever
unmap exactly the same range that we mapped. Except we should allow the
unmapping of adjacent regions together; just not a partial unmap of
something that was mapped in one go.


  Seriously, just the address and the size. Let the IOMMU code deal with
  whether it can *actually* use large pages on the particular set of
  IOMMUs that are in use for the devices you've added to the domain so
  far.
 
 You're contradicting yourself here.  Just let the iommu deal with it,
 but now you're raising concerns that Intel may be mixing super page
 IOMMU hardware with non-super page IOMMU hardware on the same box, so I
 do need to be concerned, yet there's no interface to discover super page
 support. 

No contradiction. Do Not Concern Yourself. Just tell the IOMMU what you
want mapped, and where. Let *it* worry about whether it can use
superpages or not. Like it already *does*.

  What happens if my IOMMU domain makes use of super pages and
 then I add a new device behind a new IOMMU without hardware super page
 support? 

Currently, you end up with the domain happily including superpages, and
the less capable IOMMU that you added later won't cope. What we probably
*ought* to do is walk the page tables and convert any pre-existing
superpages to small pages, at the time we add the non-SP-capable IOMMU.

FWIW we currently screw up the handling of cache-coherent vs.
non-coherent page tables too. That one wants a wbinvd somewhere when we
add a non-coherent IOMMU to the domain.

-- 
dwmw2


smime.p7s
Description: S/MIME cryptographic signature
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH 7/7] vfio pci: Add vfio iommu implementation for FSL_PAMU

2013-09-25 Thread Alex Williamson
On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
 This patch adds vfio iommu support for Freescale IOMMU
 (PAMU - Peripheral Access Management Unit).
 
 The Freescale PAMU is an aperture-based IOMMU with the following
 characteristics.  Each device has an entry in a table in memory
 describing the iova-phys mapping. The mapping has:
   -an overall aperture that is power of 2 sized, and has a start iova that
is naturally aligned
   -has 1 or more windows within the aperture
   -number of windows must be power of 2, max is 256
   -size of each window is determined by aperture size / # of windows
   -iova of each window is determined by aperture start iova / # of windows
   -the mapped region in each window can be different than
the window size...mapping must power of 2
   -physical address of the mapping must be naturally aligned
with the mapping size
 
 Some of the code is derived from TYPE1 iommu (driver/vfio/vfio_iommu_type1.c).
 
 Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
 ---
  drivers/vfio/Kconfig   |6 +
  drivers/vfio/Makefile  |1 +
  drivers/vfio/vfio_iommu_fsl_pamu.c |  952 
 
  include/uapi/linux/vfio.h  |  100 
  4 files changed, 1059 insertions(+), 0 deletions(-)
  create mode 100644 drivers/vfio/vfio_iommu_fsl_pamu.c
 
 diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
 index 26b3d9d..7d1da26 100644
 --- a/drivers/vfio/Kconfig
 +++ b/drivers/vfio/Kconfig
 @@ -8,11 +8,17 @@ config VFIO_IOMMU_SPAPR_TCE
   depends on VFIO  SPAPR_TCE_IOMMU
   default n
  
 +config VFIO_IOMMU_FSL_PAMU
 + tristate
 + depends on VFIO
 + default n
 +
  menuconfig VFIO
   tristate VFIO Non-Privileged userspace driver framework
   depends on IOMMU_API
   select VFIO_IOMMU_TYPE1 if X86
   select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
 + select VFIO_IOMMU_FSL_PAMU if FSL_PAMU
   help
 VFIO provides a framework for secure userspace device drivers.
 See Documentation/vfio.txt for more details.
 diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
 index c5792ec..7461350 100644
 --- a/drivers/vfio/Makefile
 +++ b/drivers/vfio/Makefile
 @@ -1,4 +1,5 @@
  obj-$(CONFIG_VFIO) += vfio.o
  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_common.o vfio_iommu_type1.o
  obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_common.o 
 vfio_iommu_spapr_tce.o
 +obj-$(CONFIG_VFIO_IOMMU_FSL_PAMU) += vfio_iommu_common.o 
 vfio_iommu_fsl_pamu.o
  obj-$(CONFIG_VFIO_PCI) += pci/
 diff --git a/drivers/vfio/vfio_iommu_fsl_pamu.c 
 b/drivers/vfio/vfio_iommu_fsl_pamu.c
 new file mode 100644
 index 000..b29365f
 --- /dev/null
 +++ b/drivers/vfio/vfio_iommu_fsl_pamu.c
 @@ -0,0 +1,952 @@
 +/*
 + * VFIO: IOMMU DMA mapping support for FSL PAMU IOMMU
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License, version 2, as
 + * published by the Free Software Foundation.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with this program; if not, write to the Free Software
 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 + *
 + * Copyright (C) 2013 Freescale Semiconductor, Inc.
 + *
 + * Author: Bharat Bhushan bharat.bhus...@freescale.com
 + *
 + * This file is derived from driver/vfio/vfio_iommu_type1.c
 + *
 + * The Freescale PAMU is an aperture-based IOMMU with the following
 + * characteristics.  Each device has an entry in a table in memory
 + * describing the iova-phys mapping. The mapping has:
 + *  -an overall aperture that is power of 2 sized, and has a start iova that
 + *   is naturally aligned
 + *  -has 1 or more windows within the aperture
 + * -number of windows must be power of 2, max is 256
 + * -size of each window is determined by aperture size / # of windows
 + * -iova of each window is determined by aperture start iova / # of 
 windows
 + * -the mapped region in each window can be different than
 + *  the window size...mapping must power of 2
 + * -physical address of the mapping must be naturally aligned
 + *  with the mapping size
 + */
 +
 +#include linux/compat.h
 +#include linux/device.h
 +#include linux/fs.h
 +#include linux/iommu.h
 +#include linux/module.h
 +#include linux/mm.h
 +#include linux/pci.h   /* pci_bus_type */
 +#include linux/sched.h
 +#include linux/slab.h
 +#include linux/uaccess.h
 +#include linux/vfio.h
 +#include linux/workqueue.h
 +#include linux/hugetlb.h
 +#include linux/msi.h
 +#include asm/fsl_pamu_stash.h
 +
 +#include vfio_iommu_common.h
 +
 +#define DRIVER_VERSION  

Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread Alex Williamson
On Wed, 2013-09-25 at 19:52 +0100, David Woodhouse wrote:
 On Wed, 2013-09-25 at 11:36 -0600, Alex Williamson wrote:
  On Wed, 2013-09-25 at 17:05 +0100, David Woodhouse wrote:
   Why would it ever care? If it *happens* to map something that can use
   large pages, yay!. If it subsequently breaks apart those large pages by
   unmapping 4KiB in the middle, let the IOMMU driver break that apart.
  
  Can this be done atomically?  I thought part of the reason for this
  interface was that iommu drivers typically couldn't replace a huge page
  with multiple smaller pages in the presence of DMA.
 
 For the Intel IOMMU it can. You can atomically change from a large page
 entry, to a pointer to a full set of smaller page tables. Do the IOTLB
 flush, and at no time is there an interruption in service.

Cool

 Not sure if this is true for *all* IOMMU hardware; I'd be perfectly
 happy to accept a variant of Jörg's proposal that we should only ever
 unmap exactly the same range that we mapped. Except we should allow the
 unmapping of adjacent regions together; just not a partial unmap of
 something that was mapped in one go.

Well, except if we've just trusted the IOMMU driver to add a device
behind a non-SP capable IOMMU to our domain and convert the page tables,
that partial unmap is no longer partial and now we get different
behavior than before so we can't depend on that adjacent unmapping.

   Seriously, just the address and the size. Let the IOMMU code deal with
   whether it can *actually* use large pages on the particular set of
   IOMMUs that are in use for the devices you've added to the domain so
   far.
  
  You're contradicting yourself here.  Just let the iommu deal with it,
  but now you're raising concerns that Intel may be mixing super page
  IOMMU hardware with non-super page IOMMU hardware on the same box, so I
  do need to be concerned, yet there's no interface to discover super page
  support. 
 
 No contradiction. Do Not Concern Yourself. Just tell the IOMMU what you
 want mapped, and where. Let *it* worry about whether it can use
 superpages or not. Like it already *does*.

I'd love to be able to do that, but...

   What happens if my IOMMU domain makes use of super pages and
  then I add a new device behind a new IOMMU without hardware super page
  support? 
 
 Currently, you end up with the domain happily including superpages, and
 the less capable IOMMU that you added later won't cope.

This is the trouble with trusting the iommu driver. ;)

  What we probably
 *ought* to do is walk the page tables and convert any pre-existing
 superpages to small pages, at the time we add the non-SP-capable IOMMU.

And then we need to figure out how to handle that in the proposed
interface changes above since it changes the unmap behavior to the naive
user.  There's also the question of whether the IOMMU driver should
re-evaluate super pages when the less capable IOMMU is removed from the
domain.

 FWIW we currently screw up the handling of cache-coherent vs.
 non-coherent page tables too. That one wants a wbinvd somewhere when we
 add a non-coherent IOMMU to the domain.

You're not selling the "trust the IOMMU driver" story very well here.
Can we assume that the IOMMU_CACHE flag (SNP) is ignored appropriately
by non-coherent IOMMUs?  Is there any downside to ignoring it and always
setting SNP in the IOMMU page tables?  AMD IOMMU ignores it, but it's
also always cache coherent.  Thanks,

Alex

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread David Woodhouse
On Wed, 2013-09-25 at 13:44 -0600, Alex Williamson wrote:
 On Wed, 2013-09-25 at 19:52 +0100, David Woodhouse wrote:
  On Wed, 2013-09-25 at 11:36 -0600, Alex Williamson wrote:
   On Wed, 2013-09-25 at 17:05 +0100, David Woodhouse wrote:
Why would it ever care? If it *happens* to map something that can use
large pages, yay!. If it subsequently breaks apart those large pages by
unmapping 4KiB in the middle, let the IOMMU driver break that apart.
   
   Can this be done atomically?  I thought part of the reason for this
   interface was that iommu drivers typically couldn't replace a huge page
   with multiple smaller pages in the presence of DMA.
  
  For the Intel IOMMU it can. You can atomically change from a large page
  entry, to a pointer to a full set of smaller page tables. Do the IOTLB
  flush, and at no time is there an interruption in service.
 
 Cool
 
  Not sure if this is true for *all* IOMMU hardware; I'd be perfectly
  happy to accept a variant of Jörg's proposal that we should only ever
  unmap exactly the same range that we mapped. Except we should allow the
  unmapping of adjacent regions together; just not a partial unmap of
  something that was mapped in one go.
 
 Well, except if we've just trusted the IOMMU driver to add a device
 behind a non-SP capable IOMMU to our domain and convert the page tables,
 that partial unmap is no longer partial and now we get different
 behavior than before so we can't depend on that adjacent unmapping.

Que?

Jörg's proposal was that if you add a mapping at a given address+size,
you should always remove *exactly* that address+size. Which will always
work exactly the same, regardless of superpages.

My slight change to that was that if you also added an *adjacent*
mapping at address2+size2, you should be able to unmap both at the same
time. Which will *also* always work the same regardless of superpages.

Even if your two mappings were also *physically* contiguous, and *could*
have used superpages, they probably won't anyway because you mapped them
in two parts.

What happens if my IOMMU domain makes use of super pages and
   then I add a new device behind a new IOMMU without hardware super page
   support? 
  
  Currently, you end up with the domain happily including superpages, and
  the less capable IOMMU that you added later won't cope.
 
 This is the trouble with trusting the iommu driver. ;)

Sorry, I should have made it clearer that this is a *bug*. It's not by
design. The IOMMU driver ought to get this right, and will do.

   What we probably
  *ought* to do is walk the page tables and convert any pre-existing
  superpages to small pages, at the time we add the non-SP-capable IOMMU.
 
 And then we need to figure out how to handle that in the proposed
 interface changes above since it changes the unmap behavior to the naive
 user. 

Isn't that what you'd *expect*? Surely you don't *expect* the breakage
you currently get?

  There's also the question of whether the IOMMU driver should
 re-evaluate super pages when the less capable IOMMU is removed from the
 domain.

I wouldn't bother to go looking for opportunities to use super pages if
we remove the last non-SP-capable IOMMU from the domain.

  FWIW we currently screw up the handling of cache-coherent vs.
  non-coherent page tables too. That one wants a wbinvd somewhere when we
  add a non-coherent IOMMU to the domain.
 
 You're not selling the trust the IOMMU driver story very well here.
 Can we assume that the IOMMU_CACHE flag (SNP) is ignored appropriately
 by non-coherent IOMMUs?  Is there any downside to ignoring it and always
 setting SNP in the IOMMU page tables?  AMD IOMMU ignores it, but it's
 also always cache coherent.  Thanks,

SNP is a separate issue. I'm speaking of cache coherency of the hardware
page table walk — the feature bit that all the horrid clflush calls are
predicated on.

Again, this is just a bug. We *should* be getting this right, but don't
yet.

-- 
dwmw2


smime.p7s
Description: S/MIME cryptographic signature
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread Alex Williamson
On Wed, 2013-09-25 at 21:11 +0100, David Woodhouse wrote:
 On Wed, 2013-09-25 at 13:44 -0600, Alex Williamson wrote:
  On Wed, 2013-09-25 at 19:52 +0100, David Woodhouse wrote:
   On Wed, 2013-09-25 at 11:36 -0600, Alex Williamson wrote:
On Wed, 2013-09-25 at 17:05 +0100, David Woodhouse wrote:
 Why would it ever care? If it *happens* to map something that can use
 large pages, yay!. If it subsequently breaks apart those large pages 
 by
 unmapping 4KiB in the middle, let the IOMMU driver break that apart.

Can this be done atomically?  I thought part of the reason for this
interface was that iommu drivers typically couldn't replace a huge page
with multiple smaller pages in the presence of DMA.
   
   For the Intel IOMMU it can. You can atomically change from a large page
   entry, to a pointer to a full set of smaller page tables. Do the IOTLB
   flush, and at no time is there an interruption in service.
  
  Cool
  
   Not sure if this is true for *all* IOMMU hardware; I'd be perfectly
   happy to accept a variant of Jörg's proposal that we should only ever
   unmap exactly the same range that we mapped. Except we should allow the
   unmapping of adjacent regions together; just not a partial unmap of
   something that was mapped in one go.
  
  Well, except if we've just trusted the IOMMU driver to add a device
  behind a non-SP capable IOMMU to our domain and convert the page tables,
  that partial unmap is no longer partial and now we get different
  behavior than before so we can't depend on that adjacent unmapping.
 
 Que?
 
 Jörg's proposal was that if you add a mapping at a given address+size,
 you should always remove *exactly* that address+size. Which will always
 work exactly the same, regardless of superpages.
 
 My slight change to that was that if you also added an *adjacent*
 mapping at address2+size2, you should be able to unmap both at the same
 time. Which will *also* always work the same regardless of superpages.
 
 Even if your two mappings were also *physically* contiguous, and *could*
 have used superpages, they probably won't anyway because you mapped them
 in two parts.

Ok, sounds reasonable.  I somehow read it to still include some aspect
of the "fill in the size" API we have now.

 What happens if my IOMMU domain makes use of super pages and
then I add a new device behind a new IOMMU without hardware super page
support? 
   
   Currently, you end up with the domain happily including superpages, and
   the less capable IOMMU that you added later won't cope.
  
  This is the trouble with trusting the iommu driver. ;)
 
 Sorry, I should have made it clearer that this is a *bug*. It's not by
 design. The IOMMU driver ought to get this right, and will do.
 
What we probably
   *ought* to do is walk the page tables and convert any pre-existing
   superpages to small pages, at the time we add the non-SP-capable IOMMU.
  
  And then we need to figure out how to handle that in the proposed
  interface changes above since it changes the unmap behavior to the naive
  user. 
 
 Isn't that what you'd *expect*? Surely you don't *expect* the breakage
 you currently get?

For vfio I currently make the same statement that you're pushing for the
IOMMU API; I only guarantee unmapping at the same granularity as the
original mapping.  So that seems fine to me.

   There's also the question of whether the IOMMU driver should
  re-evaluate super pages when the less capable IOMMU is removed from the
  domain.
 
 I wouldn't bother to go looking for opportunities to use super pages if
 we remove the last non-SP-capable IOMMU from the domain.

I predict bugs getting filed if a guest sees a performance hit after
adding a device that is not restored when the device is removed.  If
only we could assume a similar feature set among IOMMUs in a system.

   FWIW we currently screw up the handling of cache-coherent vs.
   non-coherent page tables too. That one wants a wbinvd somewhere when we
   add a non-coherent IOMMU to the domain.
  
  You're not selling the trust the IOMMU driver story very well here.
  Can we assume that the IOMMU_CACHE flag (SNP) is ignored appropriately
  by non-coherent IOMMUs?  Is there any downside to ignoring it and always
  setting SNP in the IOMMU page tables?  AMD IOMMU ignores it, but it's
  also always cache coherent.  Thanks,
 
 SNP is a separate issue. I'm speaking of cache coherency of the hardware
 page table walk — the feature bit that all the horrid clflush calls are
 predicated on.
 
 Again, this is just a bug. We *should* be getting this right, but don't
 yet.

And for DMA_PTE_SNP?  intel_iommu_map() won't let us set this bit if the
domain contains a hardware unit that doesn't support ecap.SC, but it
also doesn't update existing mappings.  Barring hardware bugs, it seems
much easier to unconditionally set DMA_PTE_SNP but still advertise
IOMMU_CAP_CACHE_COHERENCY based on the composition of the domain.
Otherwise we have to reject adding devices to a domain that change the
coherency once DMA mappings are in play or never set IOMMU_CACHE and
advertise to KVM that the domain is always non-coherent.  Thanks,

Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread David Woodhouse
On Wed, 2013-09-25 at 15:46 -0600, Alex Williamson wrote:
 On Wed, 2013-09-25 at 21:11 +0100, David Woodhouse wrote:
  I wouldn't bother to go looking for opportunities to use super pages if
  we remove the last non-SP-capable IOMMU from the domain.
 
 I predict bugs getting filed if a guest sees a performance hit after
 adding a device that is not restored when the device is removed.  If
 only we could assume a similar feature set among IOMMUs in a system.

Ok, fine. As long as you don't have *too* much mapped, that shouldn't
suck too much.

FWIW we currently screw up the handling of cache-coherent vs.
non-coherent page tables too. That one wants a wbinvd somewhere when we
add a non-coherent IOMMU to the domain.
   
   You're not selling the trust the IOMMU driver story very well here.
   Can we assume that the IOMMU_CACHE flag (SNP) is ignored appropriately
   by non-coherent IOMMUs?  Is there any downside to ignoring it and always
   setting SNP in the IOMMU page tables?  AMD IOMMU ignores it, but it's
   also always cache coherent.  Thanks,
  
  SNP is a separate issue. I'm speaking of cache coherency of the hardware
  page table walk — the feature bit that all the horrid clflush calls are
  predicated on.
  
  Again, this is just a bug. We *should* be getting this right, but don't
  yet.
 
 And for DMA_PTE_SNP?  intel_iommu_map() won't let us set this bit if the
 domain contains a hardware unit that doesn't support ecap.SC, but it
 also doesn't update existing mappings.  Barring hardware bugs, it seems
 much easier to unconditionally set DMA_PTE_SNP but still advertise
 IOMMU_CAP_CACHE_COHERENCY based on the composition of the domain.
 Otherwise we have to reject adding devices to a domain that change the
 coherency once DMA mappings are in play or never set IOMMU_CACHE and
 advertise to KVM that the domain is always non-coherent.  Thanks,

It's not that the domain is non-coherent, is it? The SNP bit allows you
to *force* a PCIe DMA request to snoop the CPU cache, even if it didn't
*request* that.

If your device is actually *trying* to do cache-coherent (i.e. snooping)
DMA, surely that works regardless of the DMA_PTE_SNP setting?

Or am I missing something?


-- 
dwmw2


smime.p7s
Description: S/MIME cryptographic signature
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH] On unmap, flush IOMMU TLB and return correct size

2013-09-25 Thread Alex Williamson
On Wed, 2013-09-25 at 23:15 +0100, David Woodhouse wrote:
 On Wed, 2013-09-25 at 15:46 -0600, Alex Williamson wrote:
  On Wed, 2013-09-25 at 21:11 +0100, David Woodhouse wrote:
   I wouldn't bother to go looking for opportunities to use super pages if
   we remove the last non-SP-capable IOMMU from the domain.
  
  I predict bugs getting filed if a guest sees a performance hit after
  adding a device that is not restored when the device is removed.  If
  only we could assume a similar feature set among IOMMUs in a system.
 
 Ok, fine. As long as you don't have *too* much mapped, that shouldn't
 suck too much.

Of course if we knew this was going to happen in advance of attaching
the new device, I wonder if we might opt to manage it with a separate
IOMMU domain.

 FWIW we currently screw up the handling of cache-coherent vs.
 non-coherent page tables too. That one wants a wbinvd somewhere when 
 we
 add a non-coherent IOMMU to the domain.

You're not selling the trust the IOMMU driver story very well here.
Can we assume that the IOMMU_CACHE flag (SNP) is ignored appropriately
by non-coherent IOMMUs?  Is there any downside to ignoring it and always
setting SNP in the IOMMU page tables?  AMD IOMMU ignores it, but it's
also always cache coherent.  Thanks,
   
   SNP is a separate issue. I'm speaking of cache coherency of the hardware
   page table walk — the feature bit that all the horrid clflush calls are
   predicated on.
   
   Again, this is just a bug. We *should* be getting this right, but don't
   yet.
  
  And for DMA_PTE_SNP?  intel_iommu_map() won't let us set this bit if the
  domain contains a hardware unit that doesn't support ecap.SC, but it
  also doesn't update existing mappings.  Barring hardware bugs, it seems
  much easier to unconditionally set DMA_PTE_SNP but still advertise
  IOMMU_CAP_CACHE_COHERENCY based on the composition of the domain.
  Otherwise we have to reject adding devices to a domain that change the
  coherency once DMA mappings are in play or never set IOMMU_CACHE and
  advertise to KVM that the domain is always non-coherent.  Thanks,
 
 It's not that the domain is non-coherent, is it? The SNP bit allows you
 to *force* a PCIe DMA request to snoop the CPU cache, even if it didn't
 *request* that.
 
 If your device is actually *trying* to do cache-coherent (i.e. snooping)
 DMA, surely that works regardless of the DMA_PTE_SNP setting?
 
 Or am I missing something?

It's the opposite, devices are trying to do No-Snoop transactions and
KVM would really rather prefer wbinvd was a no-op since emulated devices
don't need it.  We therefore need to toggle KVM's behavior when we have
an assigned device attached to an IOMMU domain where the hardware isn't
stripping the No-Snoop attribute.  Some graphics cards seem to make use
of this and will get error codes loading their driver if it doesn't
work.  Thanks,

Alex

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH 1/1] x86/iommu: correct ICS register offset

2013-09-25 Thread Li, Zhen-Hua

Joerg,
Thank you for reviewing this patch.

ZhenHua

On 09/24/2013 07:05 PM, Joerg Roedel wrote:

On Tue, Sep 17, 2013 at 04:38:29PM +0800, ZhenHua wrote:

Hi Guys,
Though DMAR_ICS_REG is not used yet, I think this patch is
necessary. So please take a look at it.

You are right, my Spec says the same. It doesn't matter much since the
register seems to be unused in the VT-d driver. I applied the patch to
iommu/fixes anyway.


Joerg


.



___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


RE: [PATCH 5/7] iommu: supress loff_t compilation error on powerpc

2013-09-25 Thread Bhushan Bharat-R65777


 -Original Message-
 From: Alex Williamson [mailto:alex.william...@redhat.com]
 Sent: Wednesday, September 25, 2013 10:10 PM
 To: Bhushan Bharat-R65777
 Cc: j...@8bytes.org; b...@kernel.crashing.org; ga...@kernel.crashing.org; 
 linux-
 ker...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; linux-
 p...@vger.kernel.org; ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
 foundation.org; Bhushan Bharat-R65777
 Subject: Re: [PATCH 5/7] iommu: supress loff_t compilation error on powerpc
 
 On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
  Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
  ---
   drivers/vfio/pci/vfio_pci_rdwr.c |3 ++-
   1 files changed, 2 insertions(+), 1 deletions(-)
 
  diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c
  b/drivers/vfio/pci/vfio_pci_rdwr.c
  index 210db24..8a8156a 100644
  --- a/drivers/vfio/pci/vfio_pci_rdwr.c
  +++ b/drivers/vfio/pci/vfio_pci_rdwr.c
  @@ -181,7 +181,8 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, 
  char
 __user *buf,
 size_t count, loff_t *ppos, bool iswrite)  {
  int ret;
   -   loff_t off, pos = *ppos & VFIO_PCI_OFFSET_MASK;
   +   loff_t off;
   +   u64 pos = (u64)(*ppos & VFIO_PCI_OFFSET_MASK);
  void __iomem *iomem = NULL;
  unsigned int rsrc;
  bool is_ioport;
 
 What's the compile error that this fixes?

I was getting below error; and after some googling I came to know that this is 
how it is fixed by other guys.

/home/r65777/linux-vfio/drivers/vfio/pci/vfio_pci_rdwr.c:193: undefined 
reference to `__cmpdi2'
/home/r65777/linux-vfio/drivers/vfio/pci/vfio_pci_rdwr.c:193: undefined 
reference to `__cmpdi2'

Thanks
-Bharat
 

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


RE: [PATCH 6/7] vfio: moving some functions in common file

2013-09-25 Thread Bhushan Bharat-R65777


 -Original Message-
 From: linux-pci-ow...@vger.kernel.org [mailto:linux-pci-ow...@vger.kernel.org]
 On Behalf Of Alex Williamson
 Sent: Wednesday, September 25, 2013 10:33 PM
 To: Bhushan Bharat-R65777
 Cc: j...@8bytes.org; b...@kernel.crashing.org; ga...@kernel.crashing.org; 
 linux-
 ker...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; linux-
 p...@vger.kernel.org; ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
 foundation.org; Bhushan Bharat-R65777
 Subject: Re: [PATCH 6/7] vfio: moving some functions in common file
 
 On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
  Some function defined in vfio_iommu_type1.c were common and we want to
  use these for FSL IOMMU (PAMU) and iommu-none driver.
  So some of them are moved to vfio_iommu_common.c
 
  I think we can do more of that but we will take this step by step.
 
  Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
  ---
   drivers/vfio/Makefile|4 +-
   drivers/vfio/vfio_iommu_common.c |  235
 ++
   drivers/vfio/vfio_iommu_common.h |   30 +
   drivers/vfio/vfio_iommu_type1.c  |  206
  +-
   4 files changed, 268 insertions(+), 207 deletions(-)  create mode
  100644 drivers/vfio/vfio_iommu_common.c  create mode 100644
  drivers/vfio/vfio_iommu_common.h
 
  diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index
  72bfabc..c5792ec 100644
  --- a/drivers/vfio/Makefile
  +++ b/drivers/vfio/Makefile
  @@ -1,4 +1,4 @@
   obj-$(CONFIG_VFIO) += vfio.o
  -obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
  -obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
  +obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_common.o
  +vfio_iommu_type1.o
  +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_common.o
  +vfio_iommu_spapr_tce.o
   obj-$(CONFIG_VFIO_PCI) += pci/
  diff --git a/drivers/vfio/vfio_iommu_common.c
  b/drivers/vfio/vfio_iommu_common.c
  new file mode 100644
  index 000..8bdc0ea
  --- /dev/null
  +++ b/drivers/vfio/vfio_iommu_common.c
  @@ -0,0 +1,235 @@
  +/*
  + * VFIO: Common code for vfio IOMMU support
  + *
  + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
  + * Author: Alex Williamson alex.william...@redhat.com
  + * Author: Bharat Bhushan bharat.bhus...@freescale.com
  + *
  + * This program is free software; you can redistribute it and/or
  +modify
  + * it under the terms of the GNU General Public License version 2 as
  + * published by the Free Software Foundation.
  + *
  + * Derived from original vfio:
  + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  + * Author: Tom Lyon, p...@cisco.com
  + */
  +
  +#include linux/compat.h
  +#include linux/device.h
  +#include linux/fs.h
  +#include linux/iommu.h
  +#include linux/module.h
  +#include linux/mm.h
  +#include linux/pci.h /* pci_bus_type */
  +#include linux/rbtree.h
  +#include linux/sched.h
  +#include linux/slab.h
  +#include linux/uaccess.h
  +#include linux/vfio.h
  +#include linux/workqueue.h
 
 Please cleanup includes on both the source and target files.  You obviously
 don't need linux/pci.h here for one.

Will do.

 
  +
  +static bool disable_hugepages;
  +module_param_named(disable_hugepages,
  +  disable_hugepages, bool, S_IRUGO | S_IWUSR);
  +MODULE_PARM_DESC(disable_hugepages,
   +"Disable VFIO IOMMU support for IOMMU hugepages.");
  +
  +struct vwork {
  +   struct mm_struct*mm;
  +   longnpage;
  +   struct work_struct  work;
  +};
  +
  +/* delayed decrement/increment for locked_vm */ void
  +vfio_lock_acct_bg(struct work_struct *work) {
  +   struct vwork *vwork = container_of(work, struct vwork, work);
  +   struct mm_struct *mm;
  +
  +   mm = vwork-mm;
  +   down_write(mm-mmap_sem);
  +   mm-locked_vm += vwork-npage;
  +   up_write(mm-mmap_sem);
  +   mmput(mm);
  +   kfree(vwork);
  +}
  +
  +void vfio_lock_acct(long npage)
  +{
  +   struct vwork *vwork;
  +   struct mm_struct *mm;
  +
  +   if (!current-mm || !npage)
  +   return; /* process exited or nothing to do */
  +
  +   if (down_write_trylock(current-mm-mmap_sem)) {
  +   current-mm-locked_vm += npage;
  +   up_write(current-mm-mmap_sem);
  +   return;
  +   }
  +
  +   /*
  +* Couldn't get mmap_sem lock, so must setup to update
  +* mm-locked_vm later. If locked_vm were atomic, we
  +* wouldn't need this silliness
  +*/
  +   vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
  +   if (!vwork)
  +   return;
  +   mm = get_task_mm(current);
  +   if (!mm) {
  +   kfree(vwork);
  +   return;
  +   }
  +   INIT_WORK(vwork-work, vfio_lock_acct_bg);
  +   vwork-mm = mm;
  +   vwork-npage = npage;
  +   schedule_work(vwork-work);
  +}
  +
  +/*
  + * Some mappings aren't backed by a struct page, for example an
  +mmap'd
  + * MMIO range for our own or another device.  These use a different
  + * pfn