Re: [PATCH v10 04/10] iommu/vt-d: functions to copy data from old mem

2015-05-07 Thread Baoquan He
On 04/10/15 at 04:42pm, Li, Zhen-Hua wrote:
 Add some functions to copy the data from the old kernel.
 These functions are used to copy context tables and page tables.
 
 To avoid calling iounmap between spin_lock_irqsave and spin_unlock_irqrestore,
 use a linked list here to store the pointers, and then use iounmap to free
 them in another place.
 
 Li, Zhen-hua:
 The functions and logic.
 
 Takao Indoh:
 Check if pfn is ram:
 if (page_is_ram(pfn))
 
 Signed-off-by: Li, Zhen-Hua zhen-h...@hp.com
 Signed-off-by: Takao Indoh indou.ta...@jp.fujitsu.com
 ---
  drivers/iommu/intel-iommu.c | 102 ++++++++++++++++++++++++++++++++++++++++++++
  include/linux/intel-iommu.h |   6 +++
  2 files changed, 108 insertions(+)
 
 diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
 index ff5ac04..5ba403a 100644
 --- a/drivers/iommu/intel-iommu.c
 +++ b/drivers/iommu/intel-iommu.c
 @@ -373,6 +373,17 @@ static struct context_entry *device_to_existing_context_entry(
   struct intel_iommu *iommu,
   u8 bus, u8 devfn);
  
 +/*
 + * A structure used to store the addresses allocated by ioremap();
 + * then we need to call iounmap() to free them outside of
 + * spin_lock_irqsave/spin_unlock_irqrestore.
 + */
 +struct iommu_remapped_entry {
 +	struct list_head list;
 +	void __iomem *mem;
 +};
 +static LIST_HEAD(__iommu_remapped_mem);
 +static DEFINE_MUTEX(__iommu_mem_list_lock);
 +
  
  /*
   * This domain is a statically identity mapping domain.
 @@ -4817,3 +4828,94 @@ static struct context_entry *device_to_existing_context_entry(
   return ret;
  }
  
 +/*
 + * Copy memory from a physically-addressed area into a virtually-addressed area
 + */

I don't find where __iommu_load_from_oldmem is called. Obsolete code of
this patch?

 +int __iommu_load_from_oldmem(void *to, unsigned long from, unsigned long size)
 +{
 +	unsigned long pfn;		/* Page Frame Number */
 +	size_t csize = (size_t)size;	/* Number of bytes to copy */
 +	unsigned long offset;		/* Lower 12 bits of from */
 +	void __iomem *virt_mem;
 +	struct iommu_remapped_entry *mapped;
 +
 +	pfn = from >> VTD_PAGE_SHIFT;
 +	offset = from & (~VTD_PAGE_MASK);
 +
 +	if (page_is_ram(pfn)) {
 +		memcpy(to, pfn_to_kaddr(pfn) + offset, csize);
 +	} else {
 +
 +		mapped = kzalloc(sizeof(struct iommu_remapped_entry),
 +				GFP_KERNEL);
 +		if (!mapped)
 +			return -ENOMEM;
 +
 +		virt_mem = ioremap_cache((unsigned long)from, size);
 +		if (!virt_mem) {
 +			kfree(mapped);
 +			return -ENOMEM;
 +		}
 +		memcpy(to, virt_mem, size);
 +
 +		mutex_lock(&__iommu_mem_list_lock);
 +		mapped->mem = virt_mem;
 +		list_add_tail(&mapped->list, &__iommu_remapped_mem);
 +		mutex_unlock(&__iommu_mem_list_lock);
 +	}
 +	return size;
 +}
 +
 +/*
 + * Copy memory from a virtually-addressed area into a physically-addressed area
 + */
 +int __iommu_save_to_oldmem(unsigned long to, void *from, unsigned long size)
 +{
 +	unsigned long pfn;		/* Page Frame Number */
 +	size_t csize = (size_t)size;	/* Number of bytes to copy */
 +	unsigned long offset;		/* Lower 12 bits of to */
 +	void __iomem *virt_mem;
 +	struct iommu_remapped_entry *mapped;
 +
 +	pfn = to >> VTD_PAGE_SHIFT;
 +	offset = to & (~VTD_PAGE_MASK);
 +
 +	if (page_is_ram(pfn)) {
 +		memcpy(pfn_to_kaddr(pfn) + offset, from, csize);
 +	} else {
 +		mapped = kzalloc(sizeof(struct iommu_remapped_entry),
 +				GFP_KERNEL);
 +		if (!mapped)
 +			return -ENOMEM;
 +
 +		virt_mem = ioremap_cache((unsigned long)to, size);
 +		if (!virt_mem) {
 +			kfree(mapped);
 +			return -ENOMEM;
 +		}
 +		memcpy(virt_mem, from, size);
 +		mutex_lock(&__iommu_mem_list_lock);
 +		mapped->mem = virt_mem;
 +		list_add_tail(&mapped->list, &__iommu_remapped_mem);
 +		mutex_unlock(&__iommu_mem_list_lock);
 +	}
 +	return size;
 +}
 +
 +/*
 + * Free the memory mapped by ioremap();
 + */
 +int __iommu_free_mapped_mem(void)
 +{
 +	struct iommu_remapped_entry *mem_entry, *tmp;
 +
 +	mutex_lock(&__iommu_mem_list_lock);
 +	list_for_each_entry_safe(mem_entry, tmp, &__iommu_remapped_mem, list) {
 +		iounmap(mem_entry->mem);
 +		list_del(&mem_entry->list);
 +		kfree(mem_entry);
 +	}
 +	mutex_unlock(&__iommu_mem_list_lock);
 +	return 0;
 +}
 +
 diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
 index a65208a..4bca7b5 100644
 --- a/include/linux/intel-iommu.h
 +++ b/include/linux/intel-iommu.h
 @@ -368,4 +368,10 @@ extern int dmar_ir_support(void);
  

Re: [PATCH v10 04/10] iommu/vt-d: functions to copy data from old mem

2015-05-07 Thread Li, ZhenHua

It is called in
static int copy_root_entry_table(struct intel_iommu *iommu);
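For context, a rough sketch of that call site follows. copy_root_entry_table()
itself belongs to a later patch of this series and is not quoted in this
thread, so everything below is an illustrative guess at the calling pattern
(the register read, bounce buffer, and error paths are assumptions), not the
actual patch:

/* Illustrative sketch only -- not the code from the series. */
static int copy_root_entry_table(struct intel_iommu *iommu)
{
	struct root_entry *old_root;	/* bounce buffer in the new kernel */
	unsigned long old_root_phys;
	int ret;

	old_root = (struct root_entry *)__get_free_page(GFP_KERNEL);
	if (!old_root)
		return -ENOMEM;

	/* Physical address of the old kernel's root table, still programmed
	 * in the hardware register across the kexec. */
	old_root_phys = dmar_readq(iommu->reg + DMAR_RTADDR_REG) & VTD_PAGE_MASK;

	/* Pull one page of old-kernel memory into the bounce buffer; this
	 * is where __iommu_load_from_oldmem() gets called. */
	ret = __iommu_load_from_oldmem(old_root, old_root_phys, VTD_PAGE_SIZE);
	if (ret < 0)
		goto out;

	/* ... walk the 256 root entries and copy the context tables ... */
	ret = 0;
out:
	free_page((unsigned long)old_root);
	/* Any ioremap()ed chunks accumulated by the helper are released
	 * later, outside of any spinlock, via __iommu_free_mapped_mem(). */
	return ret;
}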


On 05/07/2015 03:49 PM, Baoquan He wrote:

On 04/10/15 at 04:42pm, Li, Zhen-Hua wrote:

[...]

I don't find where __iommu_load_from_oldmem is called. Obsolete code of
this patch?

Re: Master-aware devices and sideband ID data

2015-05-07 Thread Stuart Yoder
On Tue, Mar 24, 2015 at 10:50 AM, Mark Rutland mark.rutl...@arm.com wrote:
 Hi all,

 For some devices, identification of particular masters is critical to
 their operation (e.g. IOMMUs, MSI controllers). The identity of masters
 is determined from sideband signals on the bus, and it may or may not be
 possible to probe and/or configure this information. Worse still, this
 information may be rewritten by intermediate buses, so the
 information for a given master may differ at different points in the bus
 hierarchy.

 We currently describe this master information for devices attached to
 IOMMUs, with a master ID being encoded in the iommu-cells. However this
 only covers the ID between the master and its IOMMU(s), and we don't
 currently have a mechanism to describe the master IDs as they are seen
 by devices beyond the IOMMU(s),

Is there a specific case you've run across where conveying this additional
master info would be needed,  or are you just thinking through how to fully
describe hardware?

Are there really cases out there where there is a hardwired hardware
relationship between RID and SID?

 or in the absence of any IOMMU(s).

 The following are example of master IDs:
 * PCIe Requester IDs (RIDs) (bus:device.function AKA BDF)
 * SMMU Stream IDs (SIDs)
 * GICv3 ITS Device IDs (DIDs)

 In the case of a system with PCIe, SMMU and GICv3, the master IDs are
 rewritten in a chain of RID => SID => DID, as in the following diagram:

 +-+
 | PCIe master |
 +-+
 ||
 || Requester ID (RID)
 || Probeable from RC.
 \/
 +---+
 | PCIe Root Complex |
 +---+
 ||
 || SMMU Stream ID (SID)
 || Derived from RID, static non-probeable mapping.

The FSL LS2085A SoC has an actual RID->SID mapping table in the PCI
controller, but it is not static in the sense of fixed in hardware or
firmware.  It's
a programmable mapping table, and we envision Linux programming this table.

 \/
 +--+
 | SMMU (IOMMU) |
 +--+
 ||
 || ITS Device ID (DID)
 || Derived from SID, static non-probeable mapping.
 \/

Is this even architecturally possible on ARM?  Can the SMMU transform stream
IDs into some different number?

 ++
 | GICv3 ITS (MSI controller) |
 ++

 In simpler cases, you might have a simpler set of master ID
 translations, e.g. just a DID:

 +-+
 | Platform device |
 +-+
 ||
 || ITS Device ID (DID)
 || Hard-wired on the bus.
 \/
 ++
 | GICv3 ITS (MSI controller) |
 ++

 Ignoring current bindings for the moment, I can see how we can describe
 this with a generic master id-binding, with master-id-map along the
 lines of interrupt-map, with a tuple format like:
 <child-id-base child-id-length parent parent-id-base>

 For the PCIe example, this would look something like the following (with
 properties omitted for brevity):

 PCI: pci@af00 {
 	...

 	/* Requester ID of PCIe endpoints, implicit at runtime */
 	master-id-cells = <1>;

 	/* RIDS idmapped to SIDS @ SMMU */
 	master-id-map = <0 0x1 &SMMU 0>;
 }

 SMMU: iommu@bf00 {
 	...

 	/* SID, derived from RID */
 	master-id-cells = <1>;

 	/*
 	 * For some reason the high bit of the ID was negated.
 	 */
 	master-id-map = <0x0 0x8000 &ITS 0x0 0x8000>,
 			<0x8000 0x8000 &ITS 0x0 0x0>;

 };

 ITS: its@cf00 {
 ...

 /* DID, derived from SID */
 	master-id-cells = <2>;

 /*
  * Master traffic not propagated beyond this point, so no
  * master-id-ranges
  */
 };

 For the simpler case, this would look something like:

 DEV: device@af00 {
 	master-id-cells = <1>;
 	master-ids = <0xf>, <0xb>;
 	master-id-map = <0 0xf &ITS 0 0>;
 };
 };

 ITS: its@cf00 {
 ...

 /* DID */
 	master-id-cells = <2>;
 };
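By analogy with how interrupt-map is parsed, an OS could translate a child ID
through such a master-id-map with a simple range walk. The sketch below is a
hedged illustration only: the helper name is made up, and it assumes one cell
per ID on both the child and parent sides (unlike the two-cell ITS examples
above):

#include <linux/of.h>

/* Hypothetical helper, not an existing kernel API. */
static int master_id_to_parent_id(struct device_node *np, u32 child_id,
				  u32 *parent_id)
{
	const __be32 *map;
	int len, i;

	map = of_get_property(np, "master-id-map", &len);
	if (!map)
		return -ENOENT;
	len /= sizeof(u32);

	/* Each entry: <child-base child-length parent-phandle parent-base>,
	 * assuming master-id-cells = <1> on both sides for simplicity. */
	for (i = 0; i + 4 <= len; i += 4) {
		u32 base = be32_to_cpu(map[i]);
		u32 length = be32_to_cpu(map[i + 1]);

		if (child_id >= base && child_id - base < length) {
			*parent_id = be32_to_cpu(map[i + 3]) + (child_id - base);
			return 0;
		}
	}
	return -EINVAL;
}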


 However, this conflicts/overlaps with existing bindings (at least iommus
 and msi-parent), and I'm not sure how to reconcile them. Am I missing a
 neat way of describing this that works with the existing bindings?

 It's also not clear to me if it's sufficient to have a generic
 master-ids property (with the relevant type being implicit from
 context), or if each type of ID (RID, SID, DID, etc.) needs its own.

 Which other devices out there which require side-band master
 information, and what do they require?

For us, at least, the  master-id-map cannot be a static property.
Our PCI controller's RID->SID mapping table is programmable
and we won't know how to program it until Linux is running.  A PCI
SRIOV card may have virtual functions that appear as hot-plugged
devices and there is no way that boot firmware would have the knowledge
to have set that table up ahead of time.

What we do need is some 

Re: [PATCH 0/6] IOMMU/DMA map_resource support for peer-to-peer

2015-05-07 Thread Jerome Glisse
On Thu, May 07, 2015 at 12:16:30PM -0500, Bjorn Helgaas wrote:
 On Thu, May 7, 2015 at 11:23 AM, William Davis wda...@nvidia.com wrote:
 [...]

  I agree that this isn't a great user experience, but just want to clarify
  that this problem is orthogonal to this patch series, correct?
 
  Prior to this series, the MPS mismatch is still possible with p2p traffic,
  but when an IOMMU is enabled p2p traffic will result in DMAR faults. The
  aim of the series is to allow drivers to fix the latter, not the former.
 
 Prior to this series, there wasn't any infrastructure for drivers to
 do p2p, so it was mostly reasonable to assume that there *was* no p2p
 traffic.
 
 I think we currently default to doing nothing to MPS.  Prior to this
 series, it might have been reasonable to optimize based on a no-p2p
 assumption, e.g., default to pcie_bus_safe or pcie_bus_perf.  After
 this series, I'm not sure what we could do, because p2p will be much
 more likely.
 
 It's just an issue; I don't know what the resolution is.

Can't we just have each device update its MPS at runtime? So if device A
decides to map something from device B, then device A updates the MPS for A
and B to the lowest common supported value.

Of course you need to keep track of that per device, so that if a device C
comes around and wants to exchange with device B, and both C and B support a
higher payload than A, then if C reprograms B it will trigger issues for A.

I know we update other PCIe configuration parameters at runtime for GPUs;
I don't know if it is widely tested for other devices.
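Something along these lines, as a minimal sketch of the idea
(pcie_get_mps()/pcie_set_mps() are existing kernel helpers; the function
below and when exactly it would be called are assumptions):

#include <linux/pci.h>

/* Hypothetical: called before device 'a' maps a resource of device 'b'. */
static int p2p_clamp_mps(struct pci_dev *a, struct pci_dev *b)
{
	int mps = min(pcie_get_mps(a), pcie_get_mps(b));
	int rc;

	rc = pcie_set_mps(a, mps);
	if (rc)
		return rc;
	/* As noted above, this can still break a third device C that
	 * already negotiated a larger MPS with B. */
	return pcie_set_mps(b, mps);
}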

Cheers,
Jérôme


Re: [PATCH v10 0/10] iommu/vt-d: Fix intel vt-d faults in kdump kernel

2015-05-07 Thread Li, ZhenHua

Hi Joerg,
This problem is caused by the latest updates in the iommu module, and we are
trying to fix it.

When it is fixed, we will send out a new version of the patchset.

Thanks
Zhenhua

On 05/08/2015 01:32 AM, Joerg Roedel wrote:

Hi Baoquan, ZhenHua,

On Mon, May 04, 2015 at 11:17:49AM +0800, Baoquan He wrote:

On 05/04/15 at 11:06am, Li, ZhenHua wrote:

Hi baoquan,
Could you paste the kernel log of the first kernel?


Please let me know when you have worked this issue out.


Thanks,

Joerg





Re: [PATCH 0/6] IOMMU/DMA map_resource support for peer-to-peer

2015-05-07 Thread Bjorn Helgaas
On Wed, May 6, 2015 at 8:48 PM, Yijing Wang wangyij...@huawei.com wrote:
 On 2015/5/7 6:18, Bjorn Helgaas wrote:
 [+cc Yijing, Dave J, Dave M, Alex]

 On Fri, May 01, 2015 at 01:32:12PM -0500, wda...@nvidia.com wrote:
 From: Will Davis wda...@nvidia.com

 Hi,

  This patch series adds DMA APIs to map and unmap a struct resource to and
  from a PCI device's IOVA domain, and implements the AMD, Intel, and nommu
  versions of these interfaces.

  This solves a long-standing problem with the existing DMA-remapping
  interfaces, which require that a struct page be given for the region to be
  mapped into a device's IOVA domain. This requirement cannot support peer
  device BAR ranges, for which no struct pages exist.
 ...

 I think we currently assume there's no peer-to-peer traffic.

 I don't know whether changing that will break anything, but I'm concerned
 about these:

   - PCIe MPS configuration (see pcie_bus_configure_settings()).

  I think it should be ok for PCIe MPS configuration; PCIE_BUS_PEER2PEER forces
  every device's MPS to 128B, and its concern is the TLP payload size. In this
  series, it seems to only map an iova for the device BAR region.

MPS configuration makes assumptions about whether there will be any
peer-to-peer traffic.  If there will be none, MPS can be configured
more aggressively.

I don't think Linux has any way to detect whether a driver is doing
peer-to-peer, and there's no way to prevent a driver from doing it.
We're stuck with requiring the user to specify boot options
(pci=pcie_bus_safe, pci=pcie_bus_perf, pci=pcie_bus_peer2peer,
etc.) that tell the PCI core what the user expects to happen.

This is a terrible user experience.  The user has no way to tell what
drivers are going to do.  If he specifies the wrong thing, e.g.,
assume no peer-to-peer traffic, and then loads a driver that does
peer-to-peer, the kernel will configure MPS aggressively and when the
device does a peer-to-peer transfer, it may cause a Malformed TLP
error.

Bjorn


Re: [PATCH v9 0/10] iommu/vt-d: Fix intel vt-d faults in kdump kernel

2015-05-07 Thread Dave Young
On 04/07/15 at 10:12am, Don Dutile wrote:
 On 04/06/2015 11:46 PM, Dave Young wrote:
 On 04/05/15 at 09:54am, Baoquan He wrote:
 On 04/03/15 at 05:21pm, Dave Young wrote:
 On 04/03/15 at 05:01pm, Li, ZhenHua wrote:
 Hi Dave,
 
 There may be some possibilities that the old iommu data is corrupted by
 some other modules. Currently we do not have a better solution for the
 dmar faults.
 
 But I think when this happens, we need to fix the module that corrupted
 the old iommu data. I once met a similar problem in normal kernel, the
 queue used by the qi_* functions was written again by another module.
 The fix was in that module, not in iommu module.
 
 It is too late, there will be no chance to save vmcore then.
 
 Also if it is possible to continue corrupt other area of oldmem because
 of using old iommu tables then it will cause more problems.
 
  So I think the tables at least need some verification before being used.
 
 
  Yes, that's good thinking, and verification is also an
  interesting idea. kexec/kdump do a sha256 calculation on the loaded kernel
  and then verify it again when panic happens in purgatory. This checks
  whether any code stomps into the region reserved for kexec/kernel and
  corrupts the loaded kernel.
 
  If it is decided to do this, it should be an enhancement to the current
  patchset, not an approach change. Since this patchset is getting very
  close to the point the maintainers expected, maybe it can be merged first
  and the enhancement thought about afterwards. After all, without this
  patchset vt-d often raised error messages and hung.
 
  It does not convince me; we should do it right from the beginning instead
  of introducing something wrong.
 
  I wonder why the old dma can not be remapped to a specific page in the kdump
  kernel so that it will not corrupt more memory. But I may have missed
  something; I will look for the old threads and catch up.
 
 Thanks
 Dave
 
  The (only) issue is not corruption, but once the iommu is re-configured, the
  old, not-stopped-yet dma engines will use iova's that will generate dmar
  faults, which will be enabled when the iommu is re-configured (even to a
  single/simple paging scheme) in the kexec kernel.
 

Don, so if iommu is not reconfigured then these faults will not happen?

Baoquan and I had some confusion today about iommu=off/intel_iommu=off:

intel_iommu_init()
{
...

   dmar_table_init();

   disable active iommu translations;

	if (no_iommu || dmar_disabled)
		goto out_free_dmar;

...
}

Any reason not to move the no_iommu check to the beginning of intel_iommu_init()?
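In other words, something like the following (a hypothetical reordering, not
the actual code):

int __init intel_iommu_init(void)
{
	/* Moved up: bail out before dmar_table_init() and before touching
	 * any active translations at all. */
	if (no_iommu || dmar_disabled)
		return -ENODEV;

	dmar_table_init();
	/* disable active iommu translations; */
	/* ... */
	return 0;
}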

Thanks
Dave


Re: [PATCH v9 0/10] iommu/vt-d: Fix intel vt-d faults in kdump kernel

2015-05-07 Thread Don Dutile

On 05/07/2015 10:00 AM, Dave Young wrote:

On 04/07/15 at 10:12am, Don Dutile wrote:

[...]

The (only) issue is not corruption, but once the iommu is re-configured, the
old, not-stopped-yet dma engines will use iova's that will generate dmar
faults, which will be enabled when the iommu is re-configured (even to a
single/simple paging scheme) in the kexec kernel.


Don, so if iommu is not reconfigured then these faults will not happen?


Well, if the iommu is not reconfigured, then if the crash isn't caused by
an IOMMU fault (some systems have firmware-first catch the IOMMU fault and
convert it into NMI_IOCK), then the DMAs will continue into the old kernel
memory space.


Baoquan and I had some confusion today about iommu=off/intel_iommu=off:

intel_iommu_init()
{
...

dmar_table_init();

disable active iommu translations;

if (no_iommu || dmar_disabled)
	goto out_free_dmar;

...
}

Any reason not to move the no_iommu check to the beginning of intel_iommu_init()?


What does that do/help?


Thanks
Dave





Re: [PATCH v9 0/10] iommu/vt-d: Fix intel vt-d faults in kdump kernel

2015-05-07 Thread Dave Young
On 05/06/15 at 10:16am, Joerg Roedel wrote:
 On Wed, May 06, 2015 at 09:46:49AM +0800, Dave Young wrote:
  For the original problem, the key issue is that dmar faults cause the kdump
  kernel to hang so that vmcore can not be saved. I do not know the reason why
  it hangs. I think it is acceptable if the kdump kernel boots ok with some
  dma errors.
 
 It hangs because some devices can't handle the DMAR faults and the kdump
 kernel can't initialize them and will hang itself. For that it doesn't
 matter whether the fault was caused by a read or write request.

Ok, thanks for the explanation. That explains why sometimes the kdump kernel
boots ok with faults, and sometimes it hangs instead.

Dave


Re: [PATCH v9 0/10] iommu/vt-d: Fix intel vt-d faults in kdump kernel

2015-05-07 Thread Dave Young
On 05/04/15 at 01:05pm, Joerg Roedel wrote:
 On Fri, Apr 03, 2015 at 04:40:31PM +0800, Dave Young wrote:
   Have not read all the patches, but I have a question, not sure this
   has been answered before. Old memory is not reliable; what if the old
   memory gets corrupted before the panic? Is it safe to continue using it in
   the 2nd kernel? I worry that it will cause problems.
 
 Yes, the old memory could be corrupted, and there are more failure cases
 left which we have no way of handling yet (if iommu data structures are
 in kdump backup areas).
 
  The question is what to do if we find some of the old data structures
  corrupted, and how far the tests should go. Should we also check the
  page-tables, for example? I think if some of the data structures for a
  device are corrupted, it probably already failed in the old kernel and
  things won't get worse in the new one.

Joerg, I can not find the last reply from you, so just replying here about
my worries.

I said that the patchset will cause more problems; let me explain more
here:

Suppose the page table was corrupted, i.e. the original mapping iova1 -> page 1
was accidentally changed to iova1 -> page 2 while the crash was happening;
thus future dma will read/write page 2 instead of page 1, right?

so the behavior changes like this:
originally, dmar faults happen, but the kdump kernel may boot ok with these
faults, and vmcore can be saved.
with the patchset, dmar faults do not happen, dma translation will be
handled, but a dma write could corrupt irrelevant memory.

This might be a corner case, but who knows; because the kernel panicked we
can not assume the old page table is right.

But it seems you all think it is safe; let us understand each other
first, then go to a solution.

Today we talked with Zhenhua about the problem, and I think both of us are
clear about the problems. He just thinks it can be left as future work.

Thanks
Dave


Re: [PATCH 2/6] DMA-API: Introduce dma_(un)map_resource

2015-05-07 Thread Bjorn Helgaas
On Fri, May 01, 2015 at 01:32:14PM -0500, wda...@nvidia.com wrote:
 From: Will Davis wda...@nvidia.com
 
 Add functions to DMA-map and -unmap a resource for a given device. This
 will allow devices to DMA-map a peer device's resource (for example,
 another device's BAR region on PCI) to enable peer-to-peer transactions.
 
 Signed-off-by: Will Davis wda...@nvidia.com
 Reviewed-by: Terence Ripperda trippe...@nvidia.com
 Reviewed-by: John Hubbard jhubb...@nvidia.com
 ---
   include/asm-generic/dma-mapping-broken.h |  9 +++++++++
   include/asm-generic/dma-mapping-common.h | 34 ++++++++++++++++++++++++++++++++++
   include/linux/dma-mapping.h              |  7 +++++++

You should document these new interfaces in Documentation/DMA-API-*

Bjorn
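For readers following along, a hedged usage sketch of the proposed interface
(the signature is inferred from the pci-dma-compat hunk quoted later in this
digest; the devices, BAR choice, and failure convention are illustrative
assumptions, not part of the series):

#include <linux/dma-mapping.h>
#include <linux/pci.h>

static dma_addr_t map_peer_bar0(struct device *dev, struct pci_dev *peer)
{
	struct resource *bar = &peer->resource[0];	/* peer's BAR 0 */
	dma_addr_t iova;

	iova = dma_map_resource(dev, bar, 0, resource_size(bar),
				DMA_BIDIRECTIONAL);
	if (dma_mapping_error(dev, iova))
		return 0;	/* this sketch treats 0 as failure */

	/* ... program the DMA engine with 'iova'; tear down later with
	 * dma_unmap_resource(dev, iova, resource_size(bar),
	 * DMA_BIDIRECTIONAL); */
	return iova;
}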


RE: [PATCH 2/6] DMA-API: Introduce dma_(un)map_resource

2015-05-07 Thread William Davis


 -Original Message-
 From: Bjorn Helgaas [mailto:bhelg...@google.com]
 Sent: Thursday, May 7, 2015 10:10 AM
 To: William Davis
 Cc: j...@8bytes.org; iommu@lists.linux-foundation.org; linux-
 p...@vger.kernel.org; Terence Ripperda; John Hubbard; jgli...@redhat.com
 Subject: Re: [PATCH 2/6] DMA-API: Introduce dma_(un)map_resource
 
 On Fri, May 01, 2015 at 01:32:14PM -0500, wda...@nvidia.com wrote:
  From: Will Davis wda...@nvidia.com
 
  Add functions to DMA-map and -unmap a resource for a given device.
  This will allow devices to DMA-map a peer device's resource (for
  example, another device's BAR region on PCI) to enable peer-to-peer
 transactions.
 
  Signed-off-by: Will Davis wda...@nvidia.com
  Reviewed-by: Terence Ripperda trippe...@nvidia.com
  Reviewed-by: John Hubbard jhubb...@nvidia.com
  ---
    include/asm-generic/dma-mapping-broken.h |  9 +++++++++
    include/asm-generic/dma-mapping-common.h | 34 ++++++++++++++++++++++++++++++++++
    include/linux/dma-mapping.h              |  7 +++++++
 
 You should document these new interfaces in Documentation/DMA-API-*
 

Will do.

Thanks,
Will

--
nvpublic


RE: [PATCH 6/6] x86: add pci-nommu implementation of map_resource

2015-05-07 Thread William Davis


 -Original Message-
 From: Bjorn Helgaas [mailto:bhelg...@google.com]
 Sent: Thursday, May 7, 2015 10:08 AM
 To: William Davis
 Cc: j...@8bytes.org; iommu@lists.linux-foundation.org; linux-
 p...@vger.kernel.org; Terence Ripperda; John Hubbard; jgli...@redhat.com
 Subject: Re: [PATCH 6/6] x86: add pci-nommu implementation of map_resource
 
 On Fri, May 01, 2015 at 01:32:18PM -0500, wda...@nvidia.com wrote:
  From: Will Davis wda...@nvidia.com
 
  diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
  index da15918..6e9e66d 100644
  --- a/arch/x86/kernel/pci-nommu.c
  +++ b/arch/x86/kernel/pci-nommu.c
   @@ -38,6 +38,22 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
  return bus;
   }
 
   +static dma_addr_t nommu_map_resource(struct device *dev, struct resource *res,
   +				     unsigned long offset, size_t size,
   +				     enum dma_data_direction dir,
   +				     struct dma_attrs *attrs)
   +{
   +	dma_addr_t bus = res->start + offset;
 
  res->start is the CPU physical address, not the bus address.  There is a
  pci_bus_address() interface to get the bus address.
 
 On many, but not all, x86 platforms the CPU physical address is identical
 to the PCI bus address.
 

Thanks for pointing that out. Since we already have the resource here (and not 
the BAR index), I'll use pcibios_resource_to_bus(), as pci_bus_address() does.
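
A sketch of what that fix might look like, reworking the hunk quoted above
(pcibios_resource_to_bus() is the real helper; the rest is illustrative and
assumes the struct device belongs to a PCI device):

static dma_addr_t nommu_map_resource(struct device *dev, struct resource *res,
				     unsigned long offset, size_t size,
				     enum dma_data_direction dir,
				     struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_bus_region region;

	/* Convert the CPU physical address in 'res' to a bus address. */
	pcibios_resource_to_bus(pdev->bus, &region, res);
	return region.start + offset;	/* bus address, not res->start */
}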

Thanks,
Will

--
nvpublic


Re: [PATCH 3/6] dma-mapping: pci: add pci_(un)map_resource

2015-05-07 Thread Bjorn Helgaas
[+cc Dave for sparc64, Yinghai]

On Fri, May 01, 2015 at 01:32:15PM -0500, wda...@nvidia.com wrote:
 From: Will Davis wda...@nvidia.com
 
 Simply route these through to the new dma_(un)map_resource APIs.
 
 Signed-off-by: Will Davis wda...@nvidia.com
 Reviewed-by: Terence Ripperda trippe...@nvidia.com
 Reviewed-by: John Hubbard jhubb...@nvidia.com
 ---
  include/asm-generic/pci-dma-compat.h | 14 ++++++++++++++
  1 file changed, 14 insertions(+)
 
  diff --git a/include/asm-generic/pci-dma-compat.h b/include/asm-generic/pci-dma-compat.h
 index c110843..ac4a4ad 100644
 --- a/include/asm-generic/pci-dma-compat.h
 +++ b/include/asm-generic/pci-dma-compat.h
  @@ -61,6 +61,20 @@ pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
  	dma_unmap_page(hwdev == NULL ? NULL : &hwdev->dev, dma_address, size, (enum dma_data_direction)direction);
  }
  
  +static inline dma_addr_t
  +pci_map_resource(struct pci_dev *hwdev, struct resource *resource,
  +		 unsigned long offset, size_t size, int direction)
  +{
  +	return dma_map_resource(hwdev == NULL ? NULL : &hwdev->dev, resource, offset, size, (enum dma_data_direction)direction);
  +}

On sparc64, PCI bus addresses, e.g., raw BAR values, can be 64 bits wide,
but dma_addr_t is only 32 bits [1].  So dma_addr_t is a bit of a problem
here.  It's likely that we will add a pci_bus_addr_t, but that hasn't
happened yet [2].

We do have existing problems already, e.g., pci_bus_address() returns a
dma_addr_t, so it has the same problem.  So I guess this is just a heads-up
that this needs to be fixed eventually.

Bjorn

[1] 
http://lkml.kernel.org/r/20150327.145016.86183910134380870.da...@davemloft.net
[2] http://lkml.kernel.org/r/1427857069-6789-2-git-send-email-ying...@kernel.org


RE: [PATCH 0/6] IOMMU/DMA map_resource support for peer-to-peer

2015-05-07 Thread William Davis


 [...]

 This is a terrible user experience.  The user has no way to tell what
 drivers are going to do.  If he specifies the wrong thing, e.g., assume no
 peer-to-peer traffic, and then loads a driver that does peer-to-peer, the
 kernel will configure MPS aggressively and when the device does a peer-to-
 peer transfer, it may cause a Malformed TLP error.
 

I agree that this isn't a great user experience, but just want to clarify that 
this problem is orthogonal to this patch series, correct?

Prior to this series, the MPS mismatch is still possible with p2p traffic, but 
when an IOMMU is enabled p2p traffic will result in DMAR faults. The aim of the 
series is to allow drivers to fix the latter, not the former.

Thanks,
Will

--
nvpublic


Re: [PATCH 0/6] IOMMU/DMA map_resource support for peer-to-peer

2015-05-07 Thread Bjorn Helgaas
On Thu, May 7, 2015 at 11:23 AM, William Davis wda...@nvidia.com wrote:


 [...]


 I agree that this isn't a great user experience, but just want to clarify 
 that this problem is orthogonal to this patch series, correct?

 Prior to this series, the MPS mismatch is still possible with p2p traffic, 
 but when an IOMMU is enabled p2p traffic will result in DMAR faults. The aim 
 of the series is to allow drivers to fix the latter, not the former.

Prior to this series, there wasn't any infrastructure for drivers to
do p2p, so it was mostly reasonable to assume that there *was* no p2p
traffic.

I think we currently default to doing nothing to MPS.  Prior to this
series, it might have been reasonable to optimize based on a no-p2p
assumption, e.g., default to pcie_bus_safe or pcie_bus_perf.  After
this series, I'm not sure what we could do, because p2p will be much
more likely.

It's just an issue; I don't know what the resolution is.

Bjorn


Re: [PATCH v10 0/10] iommu/vt-d: Fix intel vt-d faults in kdump kernel

2015-05-07 Thread Joerg Roedel
Hi Baoquan, ZhenHua,

On Mon, May 04, 2015 at 11:17:49AM +0800, Baoquan He wrote:
 On 05/04/15 at 11:06am, Li, ZhenHua wrote:
  Hi baoquan,
   Could you paste the kernel log of the first kernel?

Please let me know when you have worked this issue out.


Thanks,

Joerg
