[PATCH] iommu: iova: Consolidate code for adding new node to iovad domain rbtree

2017-02-23 Thread Marek Szyprowski
This patch consolidates the nearly identical code used in the
iova_insert_rbtree() and __alloc_and_insert_iova_range() functions.
There is no functional change.

Signed-off-by: Marek Szyprowski 
---
 drivers/iommu/iova.c | 85 +++-
 1 file changed, 31 insertions(+), 54 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index b7268a14184f..32b9c2fb37b6 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -100,6 +100,32 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
}
 }
 
+/* Insert the iova into domain rbtree by holding writer lock */
+static void
+iova_insert_rbtree(struct rb_root *root, struct iova *iova,
+  struct rb_node *start)
+{
+   struct rb_node **new, *parent = NULL;
+
+   new = (start) ? &start : &(root->rb_node);
+   /* Figure out where to put new node */
+   while (*new) {
+   struct iova *this = rb_entry(*new, struct iova, node);
+
+   parent = *new;
+
+   if (iova->pfn_lo < this->pfn_lo)
+   new = &((*new)->rb_left);
+   else if (iova->pfn_lo > this->pfn_lo)
+   new = &((*new)->rb_right);
+   else
+   BUG(); /* this should not happen */
+   }
+   /* Add new node and rebalance tree. */
+   rb_link_node(&iova->node, parent, new);
+   rb_insert_color(&iova->node, root);
+}
+
 /*
  * Computes the padding size required, to make the start address
  * naturally aligned on the power-of-two order of its size
@@ -157,35 +183,8 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
new->pfn_lo = limit_pfn - (size + pad_size) + 1;
new->pfn_hi = new->pfn_lo + size - 1;
 
-   /* Insert the new_iova into domain rbtree by holding writer lock */
-   /* Add new node and rebalance tree. */
-   {
-   struct rb_node **entry, *parent = NULL;
-
-   /* If we have 'prev', it's a valid place to start the
-  insertion. Otherwise, start from the root. */
-   if (prev)
-   entry = &prev;
-   else
-   entry = &iovad->rbroot.rb_node;
-
-   /* Figure out where to put new node */
-   while (*entry) {
-   struct iova *this = rb_entry(*entry, struct iova, node);
-   parent = *entry;
-
-   if (new->pfn_lo < this->pfn_lo)
-   entry = &((*entry)->rb_left);
-   else if (new->pfn_lo > this->pfn_lo)
-   entry = &((*entry)->rb_right);
-   else
-   BUG(); /* this should not happen */
-   }
-
-   /* Add new node and rebalance tree. */
-   rb_link_node(&new->node, parent, entry);
-   rb_insert_color(&new->node, &iovad->rbroot);
-   }
+   /* If we have 'prev', it's a valid place to start the insertion. */
+   iova_insert_rbtree(&iovad->rbroot, new, prev);
__cached_rbnode_insert_update(iovad, saved_pfn, new);
 
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
@@ -194,28 +193,6 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
return 0;
 }
 
-static void
-iova_insert_rbtree(struct rb_root *root, struct iova *iova)
-{
-   struct rb_node **new = &(root->rb_node), *parent = NULL;
-   /* Figure out where to put new node */
-   while (*new) {
-   struct iova *this = rb_entry(*new, struct iova, node);
-
-   parent = *new;
-
-   if (iova->pfn_lo < this->pfn_lo)
-   new = &((*new)->rb_left);
-   else if (iova->pfn_lo > this->pfn_lo)
-   new = &((*new)->rb_right);
-   else
-   BUG(); /* this should not happen */
-   }
-   /* Add new node and rebalance tree. */
-   rb_link_node(&iova->node, parent, new);
-   rb_insert_color(&iova->node, root);
-}
-
 static struct kmem_cache *iova_cache;
 static unsigned int iova_cache_users;
 static DEFINE_MUTEX(iova_cache_mutex);
@@ -505,7 +482,7 @@ void put_iova_domain(struct iova_domain *iovad)
 
iova = alloc_and_init_iova(pfn_lo, pfn_hi);
if (iova)
-   iova_insert_rbtree(&iovad->rbroot, iova);
+   iova_insert_rbtree(&iovad->rbroot, iova, NULL);
 
return iova;
 }
@@ -612,11 +589,11 @@ struct iova *
rb_erase(&iova->node, &iovad->rbroot);
 
if (prev) {
-   iova_insert_rbtree(&iovad->rbroot, prev);
+   iova_insert_rbtree(&iovad->rbroot, prev, NULL);
iova->pfn_lo = pfn_lo;
}
if (next) {
-   iova_insert_rbtree(&iovad->rbroot, next);
+   iova_insert_rbtree(&iovad->rbroot, next, NULL);
iova->pfn_hi = pfn_hi;
}

Re: Partial BAR Address Allocation

2017-02-23 Thread Robin Murphy
On 22/02/17 23:39, Bjorn Helgaas wrote:
> [+cc Joerg, iommu list]
> 
> On Wed, Feb 22, 2017 at 03:44:53PM -0500, Sinan Kaya wrote:
>> On 2/22/2017 1:44 PM, Bjorn Helgaas wrote:
>>> There is no way for a driver to say "I only need this memory BAR and
>>> not the other ones."  The reason is because the PCI_COMMAND_MEMORY bit
>>> enables *all* the memory BARs; there's no way to enable memory BARs
>>> selectively.  If we enable memory BARs and one of them is unassigned,
>>> that unassigned BAR is enabled, and the device will respond at
>>> whatever address the register happens to contain, and that may cause
>>> conflicts.
>>>
>>> I'm not sure this answers your question.  Do you want to get rid of
>>> 32-bit BAR addresses because your host bridge doesn't have a window to
>>> 32-bit PCI addresses?  It's typical for a bridge to support a window
>>> to the 32-bit PCI space as well as one to the 64-bit PCI space.  Often
>>> it performs address translation for the 32-bit window so it doesn't
>>> have to be in the 32-bit area on the CPU side, e.g., you could have
>>> something like this where we have three host bridges and the 2-4GB
>>> space on each PCI root bus is addressable:
>>>
>>>   pci_bus 0000:00: root bus resource [mem 0x1080000000-0x10ffffffff] (bus address [0x80000000-0xffffffff])
>>>   pci_bus 0001:00: root bus resource [mem 0x1180000000-0x11ffffffff] (bus address [0x80000000-0xffffffff])
>>>   pci_bus 0002:00: root bus resource [mem 0x1280000000-0x12ffffffff] (bus address [0x80000000-0xffffffff])
>>
>> The problem is that according to PCI specification BAR addresses and
>> DMA addresses cannot overlap.
>>
>> From PCI-to-PCI Bridge Arch. spec.: "A bridge forwards PCI memory
>> transactions from its primary interface to its secondary interface
>> (downstream) if a memory address is in the range defined by the
>> Memory Base and Memory Limit registers (when the base is less than
>> or equal to the limit) as illustrated in Figure 4-3. Conversely, a
>> memory transaction on the secondary interface that is within this
>> address range will not be forwarded upstream to the primary
>> interface."
>>
>> To be specific, if your DMA address happens to be in
>> [0x80000000-0xffffffff] and the root port's aperture includes this
>> range, the DMA will never make it to system memory.
>>
>> Lorenzo and Robin took some steps to carve out PCI addresses out of
>> DMA addresses in IOMMU drivers by using iova_reserve_pci_windows()
>> function.
>>
>> However, I see that we are still exposed when the operating system
>> doesn't have any IOMMU driver and is using the SWIOTLB for instance. 
> 
> Hmmm.  I guess SWIOTLB assumes there's no address translation in the
> DMA direction, right?

Not entirely - it does rely on arch-provided dma_to_phys() and
phys_to_dma() helpers which are free to accommodate such translations in
a device-specific manner. On arm64 we use these to account for
dev->dma_pfn_offset describing a straightforward linear offset, but
unless one constant offset would apply to all possible outbound windows
I'm not sure that's much help here.
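
For illustration, the linear-offset case amounts to something like this
(a simplified sketch of those helpers with made-up names, not the exact
arm64 code):

	static inline dma_addr_t example_phys_to_dma(struct device *dev,
						     phys_addr_t paddr)
	{
		/* CPU physical -> bus address: subtract the linear offset */
		return (dma_addr_t)paddr -
		       ((dma_addr_t)dev->dma_pfn_offset << PAGE_SHIFT);
	}

	static inline phys_addr_t example_dma_to_phys(struct device *dev,
						      dma_addr_t dev_addr)
	{
		/* bus address -> CPU physical: add the offset back */
		return (phys_addr_t)dev_addr +
		       ((phys_addr_t)dev->dma_pfn_offset << PAGE_SHIFT);
	}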

>  If there's no address translation in the PIO
> direction, PCI bus BAR addresses are identical to the CPU-side
> addresses.  In that case, there's no conflict because we already have
> to assign BARs so they never look like a system memory address.
> 
> But if there *is* address translation in the PIO direction, we can
> have conflicts because the bridge can translate CPU-side PIO accesses
> to arbitrary PCI bus addresses.
> 
>> The FW solution I'm looking at requires carving out some part of the
>> DDR from before OS boot so that OS doesn't reclaim that area for
>> DMA.
> 
> If you want to reach system RAM, I guess you need to make sure you
> only DMA to bus addresses outside the host bridge windows, as you said
> above.  DMA inside the windows would be handled as peer-to-peer DMA.
> 
>> I'm not very happy with this solution. I'm also surprised that there
>> is no generic solution in the kernel that takes care of this for all
>> root ports regardless of IOMMU driver presence.
> 
> The PCI core isn't really involved in allocating DMA addresses,
> although there definitely is the connection with PCI-to-PCI bridge
> windows that you mentioned.  I added IOMMU guys, who would know a lot
> more than I do.

To me, having the bus addresses of windows shadow assigned physical
addresses sounds mostly like a broken system configuration. Can the
firmware not reprogram them elsewhere, or is the entire bottom 4GB of
the physical memory map occupied by system RAM?

Robin.

> 
> Bjorn


Re: [PATCH] iommu: iova: Consolidate code for adding new node to iovad domain rbtree

2017-02-23 Thread Robin Murphy
On 23/02/17 08:17, Marek Szyprowski wrote:
> This patch consolidates the nearly identical code used in the
> iova_insert_rbtree() and __alloc_and_insert_iova_range() functions.
> There is no functional change.
> 
> Signed-off-by: Marek Szyprowski 
> ---
>  drivers/iommu/iova.c | 85 +++-
>  1 file changed, 31 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index b7268a14184f..32b9c2fb37b6 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -100,6 +100,32 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
>   }
>  }
>  
> +/* Insert the iova into domain rbtree by holding writer lock */
> +static void
> +iova_insert_rbtree(struct rb_root *root, struct iova *iova,
> +struct rb_node *start)
> +{
> + struct rb_node **new, *parent = NULL;
> +
> + new = (start) ? &start : &(root->rb_node);
> + /* Figure out where to put new node */
> + while (*new) {
> + struct iova *this = rb_entry(*new, struct iova, node);
> +
> + parent = *new;
> +
> + if (iova->pfn_lo < this->pfn_lo)
> + new = &((*new)->rb_left);
> + else if (iova->pfn_lo > this->pfn_lo)
> + new = &((*new)->rb_right);
> + else
> + BUG(); /* this should not happen */

Ooh, if we're touching this, can we downgrade it to a WARN()? Granted,
allocating an IOVA region of size 0 is not a reasonable thing to do
intentionally, but the fact that it's guaranteed to take down the kernel
is perhaps a bit much (I hit it so many times back when debugging the
iommu_dma_map_sg() stuff).
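
Something along these lines, say (an untested sketch, just to
illustrate the idea):

	if (iova->pfn_lo < this->pfn_lo)
		new = &((*new)->rb_left);
	else if (iova->pfn_lo > this->pfn_lo)
		new = &((*new)->rb_right);
	else {
		WARN_ON(1);	/* duplicate pfn_lo - refuse to insert */
		return;
	}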

Nice tidyup otherwise, though.

Robin.

> + }
> + /* Add new node and rebalance tree. */
> + rb_link_node(&iova->node, parent, new);
> + rb_insert_color(&iova->node, root);
> +}
> +
>  /*
>   * Computes the padding size required, to make the start address
>   * naturally aligned on the power-of-two order of its size
> @@ -157,35 +183,8 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
>   new->pfn_lo = limit_pfn - (size + pad_size) + 1;
>   new->pfn_hi = new->pfn_lo + size - 1;
>  
> - /* Insert the new_iova into domain rbtree by holding writer lock */
> - /* Add new node and rebalance tree. */
> - {
> - struct rb_node **entry, *parent = NULL;
> -
> - /* If we have 'prev', it's a valid place to start the
> -insertion. Otherwise, start from the root. */
> - if (prev)
> - entry = &prev;
> - else
> - entry = &iovad->rbroot.rb_node;
> -
> - /* Figure out where to put new node */
> - while (*entry) {
> - struct iova *this = rb_entry(*entry, struct iova, node);
> - parent = *entry;
> -
> - if (new->pfn_lo < this->pfn_lo)
> - entry = &((*entry)->rb_left);
> - else if (new->pfn_lo > this->pfn_lo)
> - entry = &((*entry)->rb_right);
> - else
> - BUG(); /* this should not happen */
> - }
> -
> - /* Add new node and rebalance tree. */
> - rb_link_node(&new->node, parent, entry);
> - rb_insert_color(&new->node, &iovad->rbroot);
> - }
> + /* If we have 'prev', it's a valid place to start the insertion. */
> + iova_insert_rbtree(&iovad->rbroot, new, prev);
>   __cached_rbnode_insert_update(iovad, saved_pfn, new);
>  
>   spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
> @@ -194,28 +193,6 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
>   return 0;
>  }
>  
> -static void
> -iova_insert_rbtree(struct rb_root *root, struct iova *iova)
> -{
> - struct rb_node **new = &(root->rb_node), *parent = NULL;
> - /* Figure out where to put new node */
> - while (*new) {
> - struct iova *this = rb_entry(*new, struct iova, node);
> -
> - parent = *new;
> -
> - if (iova->pfn_lo < this->pfn_lo)
> - new = &((*new)->rb_left);
> - else if (iova->pfn_lo > this->pfn_lo)
> - new = &((*new)->rb_right);
> - else
> - BUG(); /* this should not happen */
> - }
> - /* Add new node and rebalance tree. */
> - rb_link_node(&iova->node, parent, new);
> - rb_insert_color(&iova->node, root);
> -}
> -
>  static struct kmem_cache *iova_cache;
>  static unsigned int iova_cache_users;
>  static DEFINE_MUTEX(iova_cache_mutex);
> @@ -505,7 +482,7 @@ void put_iova_domain(struct iova_domain *iovad)
>  
>   iova = alloc_and_init_iova(pfn_lo, pfn_hi);
>   if (iova)
> - iova_insert_rbtree(&iovad->rbroot, iova);
> + iova_insert_rbtree(&iovad->rbroot, iova, NULL);

Re: Partial BAR Address Allocation

2017-02-23 Thread Sinan Kaya
Hi Robin,

On 2/23/2017 6:40 AM, Robin Murphy wrote:
> On 22/02/17 23:39, Bjorn Helgaas wrote:
>> [+cc Joerg, iommu list]
>>
>> On Wed, Feb 22, 2017 at 03:44:53PM -0500, Sinan Kaya wrote:
>>> On 2/22/2017 1:44 PM, Bjorn Helgaas wrote:
 There is no way for a driver to say "I only need this memory BAR and
 not the other ones."  The reason is because the PCI_COMMAND_MEMORY bit
 enables *all* the memory BARs; there's no way to enable memory BARs
 selectively.  If we enable memory BARs and one of them is unassigned,
 that unassigned BAR is enabled, and the device will respond at
 whatever address the register happens to contain, and that may cause
 conflicts.

 I'm not sure this answers your question.  Do you want to get rid of
 32-bit BAR addresses because your host bridge doesn't have a window to
 32-bit PCI addresses?  It's typical for a bridge to support a window
 to the 32-bit PCI space as well as one to the 64-bit PCI space.  Often
 it performs address translation for the 32-bit window so it doesn't
 have to be in the 32-bit area on the CPU side, e.g., you could have
 something like this where we have three host bridges and the 2-4GB
 space on each PCI root bus is addressable:

   pci_bus 0000:00: root bus resource [mem 0x1080000000-0x10ffffffff] (bus address [0x80000000-0xffffffff])
   pci_bus 0001:00: root bus resource [mem 0x1180000000-0x11ffffffff] (bus address [0x80000000-0xffffffff])
   pci_bus 0002:00: root bus resource [mem 0x1280000000-0x12ffffffff] (bus address [0x80000000-0xffffffff])
>>>
>>> The problem is that according to PCI specification BAR addresses and
>>> DMA addresses cannot overlap.
>>>
>>> From PCI-to-PCI Bridge Arch. spec.: "A bridge forwards PCI memory
>>> transactions from its primary interface to its secondary interface
>>> (downstream) if a memory address is in the range defined by the
>>> Memory Base and Memory Limit registers (when the base is less than
>>> or equal to the limit) as illustrated in Figure 4-3. Conversely, a
>>> memory transaction on the secondary interface that is within this
>>> address range will not be forwarded upstream to the primary
>>> interface."
>>>
>>> To be specific, if your DMA address happens to be in
>>> [0x80000000-0xffffffff] and the root port's aperture includes this
>>> range, the DMA will never make it to system memory.
>>>
>>> Lorenzo and Robin took some steps to carve out PCI addresses out of
>>> DMA addresses in IOMMU drivers by using iova_reserve_pci_windows()
>>> function.
>>>
>>> However, I see that we are still exposed when the operating system
>>> doesn't have any IOMMU driver and is using the SWIOTLB for instance. 
>>
>> Hmmm.  I guess SWIOTLB assumes there's no address translation in the
>> DMA direction, right?
> 
> Not entirely - it does rely on arch-provided dma_to_phys() and
> phys_to_dma() helpers which are free to accommodate such translations in
> a device-specific manner. On arm64 we use these to account for
> dev->dma_pfn_offset describing a straightforward linear offset, but
> unless one constant offset would apply to all possible outbound windows
> I'm not sure that's much help here.

Yeah, that won't help. This is a PCI-only problem. An arch-layer solution
would move the entire DMA range for all peripherals in the SoC by a
specific offset, which would be most useful if the entire DDR started at
some non-zero offset.

Even then, PCI usually has several ranges: one range like this to provide
some space below 4GB, and another, untranslated range for true 64-bit cards.

   pci_bus 0002:00: root bus resource [mem 0x1280000000-0x12ffffffff] (bus address [0x80000000-0xffffffff])

We have to emulate some range in the first 4GB to make PCI cards happy.

> 
>>  If there's no address translation in the PIO
>> direction, PCI bus BAR addresses are identical to the CPU-side
>> addresses.  In that case, there's no conflict because we already have
>> to assign BARs so they never look like a system memory address.
>>
>> But if there *is* address translation in the PIO direction, we can
>> have conflicts because the bridge can translate CPU-side PIO accesses
>> to arbitrary PCI bus addresses.
>>
>>> The FW solution I'm looking at requires carving out some part of the
>>> DDR from before OS boot so that OS doesn't reclaim that area for
>>> DMA.
>>
>> If you want to reach system RAM, I guess you need to make sure you
>> only DMA to bus addresses outside the host bridge windows, as you said
>> above.  DMA inside the windows would be handled as peer-to-peer DMA.
>>
>>> I'm not very happy with this solution. I'm also surprised that there
>>> is no generic solution in the kernel that takes care of this for all
>>> root ports regardless of IOMMU driver presence.
>>
>> The PCI core isn't really involved in allocating DMA addresses,
>> although there definitely is the connection with PCI-to-PCI bridge
>> windows that you mentioned.  I added IOMMU guys, who would know a lot
>> more than I do.

Re: [RFC PATCH v4 13/28] efi: Update efi_mem_type() to return defined EFI mem types

2017-02-23 Thread Tom Lendacky

On 2/21/2017 6:05 AM, Matt Fleming wrote:

On Thu, 16 Feb, at 09:44:57AM, Tom Lendacky wrote:

Update efi_mem_type() to return EFI_RESERVED_TYPE instead of a
hardcoded 0.

Signed-off-by: Tom Lendacky 
---
 arch/x86/platform/efi/efi.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index a15cf81..6407103 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -1037,7 +1037,7 @@ u32 efi_mem_type(unsigned long phys_addr)
efi_memory_desc_t *md;

if (!efi_enabled(EFI_MEMMAP))
-   return 0;
+   return EFI_RESERVED_TYPE;

for_each_efi_memory_desc(md) {
if ((md->phys_addr <= phys_addr) &&
@@ -1045,7 +1045,7 @@ u32 efi_mem_type(unsigned long phys_addr)
		    (phys_addr < (md->phys_addr +
				  (md->num_pages << EFI_PAGE_SHIFT))))
return md->type;
}
-   return 0;
+   return EFI_RESERVED_TYPE;
 }


I see what you're getting at here, but arguably the return value in
these cases never should have been zero to begin with (your change
just makes that more obvious).

Returning EFI_RESERVED_TYPE implies an EFI memmap entry exists for
this address, which is misleading because it doesn't in the hunks
you've modified above.

Instead, could you look at returning a negative error value in the
usual way we do in the Linux kernel, and update the function prototype
to match? I don't think any callers actually require the return type
to be u32.


I can do that, I'll change the return type to an int. For the
!efi_enabled() case I can return -ENOTSUPP, and when an entry isn't
found I can return -EINVAL.  Sound good?

The ia64 arch is the only other arch that defines the function. It
has just a single return 0 that I'll change to -EINVAL.
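
Something like this, for illustration (a sketch of that direction, not
the final patch):

	int efi_mem_type(unsigned long phys_addr)
	{
		efi_memory_desc_t *md;

		if (!efi_enabled(EFI_MEMMAP))
			return -ENOTSUPP;

		for_each_efi_memory_desc(md) {
			if ((md->phys_addr <= phys_addr) &&
			    (phys_addr < (md->phys_addr +
					  (md->num_pages << EFI_PAGE_SHIFT))))
				return md->type;
		}
		return -EINVAL;	/* no memmap entry covers this address */
	}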

Thanks,
Tom






Re: [PATCH v8 9/9] perf/amd/iommu: Enable support for multiple IOMMUs

2017-02-23 Thread Suravee Suthikulpanit

Peter,

On 2/14/17 19:31, Peter Zijlstra wrote:

On Tue, Feb 07, 2017 at 08:57:52AM +0700, Suravee Suthikulpanit wrote:

But instead it looks like you get the counter from:

 #define _GET_CNTR(ev)   ((u8)(ev->hw.extra_reg.reg))

Which is absolutely insane.



So, the IOMMU counters are grouped into banks, and there can be
many banks. I use extra_reg.reg to hold the bank and counter
indices, which are used to program the counter configuration
register. This is handled in get_next_avail_iommu_bnk_cntr() and
clear_avail_iommu_bnk_cntr().


But this is crazy. That's not what extra_regs are for.


Ok, I understand that we should not be using the extra_regs
since it is intended for other purposes. Please see more detail below.


Also, who cares about the banks, why is this exposed?


The bank and counter values are not exposed to user-space.
The amd_iommu PMU only exposes csource, devid, domid, pasid, devid_mask,
domid_mask, and pasid_mask as event attributes.


That is, I would very much expect a linear range of counters. You can
always decompose this counter number if you really need to somewhere
down near the hardware accessors.



Actually, the counters are treated as a linear range. For example, if
the IOMMU hardware has 2 banks with 4 counters/bank, we have a total of
8 counters. The driver assigns an index to each event when it is added.
The bank/counter pair is then derived from the assigned index and stored
in the perf_event as bank and counter values.
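
For illustration, the derivation amounts to no more than this (a
sketch, made-up names, using the 4-counters-per-bank figure from
above):

	#define EX_NUM_COUNTERS_PER_BANK	4

	static void ex_decompose_cntr_idx(unsigned int idx, u8 *bank, u8 *cntr)
	{
		*bank = idx / EX_NUM_COUNTERS_PER_BANK;
		*cntr = idx % EX_NUM_COUNTERS_PER_BANK;
	}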

However, I have looked into reworking to not use the extra_regs, and I see
that the union in struct hw_perf_event currently contains various PMU-specific
structures (hardware, software, tracepoint, intel_cqm, itrace, amd_power,
and breakpoint).

For amd_iommu PMU, we need additional registers for holding amd_iommu-specific
parameters. So, it seems that we can just introduce amd_iommu-specific struct
instead of re-using the existing structure for hardware events.

I'm planning to add the following structure in the same union:

union {
..
struct { /* amd_iommu */
u8  iommu_csource;
u8  iommu_bank;
u8  iommu_cntr;
u16 iommu_devid;
u16 iommu_devid_msk;
u16 iommu_domid;
u16 iommu_domid_msk;
u32 iommu_pasid;
u32 iommu_pasid_msk;
};
};

Please let me know what you think, or if I am still missing your points.

Thanks,
Suravee


Re: [PATCH v8 9/9] perf/amd/iommu: Enable support for multiple IOMMUs

2017-02-23 Thread Peter Zijlstra
On Fri, Feb 24, 2017 at 12:43:19AM +0700, Suravee Suthikulpanit wrote:

> >Also, who cares about the banks, why is this exposed?
> 
> The bank and counter values are not exposed to user-space.
> The amd_iommu PMU only exposes csource, devid, domid, pasid, devid_mask,
> domid_mask, and pasid_mask as event attributes.

Ah good, for a little while I was worried the BANK stuff came from
userspace; I misread extra_reg.config and extra_reg.reg, the former
being perf_event_attr::config1 and the latter holding the bank thing.

> >That is, I would very much expect a linear range of counters. You can
> >always decompose this counter number if you really need to somewhere
> >down near the hardware accessors.
> >
> 
> Actually, the counters are treated as a linear range. For example, if
> the IOMMU hardware has 2 banks with 4 counters/bank, we have a total of
> 8 counters. The driver assigns an index to each event when it is added.
> The bank/counter pair is then derived from the assigned index and stored
> in the perf_event as bank and counter values.
> 
> However, I have looked into reworking to not use the extra_regs, and I see
> that the union in struct hw_perf_event currently contains various PMU-specific
> structures (hardware, software, tracepoint, intel_cqm, itrace, amd_power,
> and breakpoint).
> 
> For amd_iommu PMU, we need additional registers for holding amd_iommu-specific
> parameters. So, it seems that we can just introduce amd_iommu-specific struct
> instead of re-using the existing structure for hardware events.
> 
> I'm planning to add the following structure in the same union:
> 
> union {
> ..
> struct { /* amd_iommu */
> u8  iommu_csource;
> u8  iommu_bank;
> u8  iommu_cntr;
> u16 iommu_devid;
> u16 iommu_devid_msk;
> u16 iommu_domid;
> u16 iommu_domid_msk;
> u32 iommu_pasid;
> u32 iommu_pasid_msk;
> };
> };
> 
> Please let me know what you think, or if I am still missing your points.

Yes, adding a struct to that union is fine and clarifies things. And
just because I'm weird like that, there's a u8 hole after iommu_cntr.
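
To spell that out, the layout as posted looks like this (annotated for
illustration):

	struct { /* amd_iommu */
		u8	iommu_csource;
		u8	iommu_bank;
		u8	iommu_cntr;
		/* 1-byte hole: the u16 below needs 2-byte alignment,
		 * so three u8s leave one byte of padding here. */
		u16	iommu_devid;
		u16	iommu_devid_msk;
		u16	iommu_domid;
		u16	iommu_domid_msk;
		u32	iommu_pasid;
		u32	iommu_pasid_msk;
	};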


Re: [PATCH v8 9/9] perf/amd/iommu: Enable support for multiple IOMMUs

2017-02-23 Thread Suravee Suthikulpanit



On 2/24/17 01:11, Peter Zijlstra wrote:

However, I have looked into reworking to not use the extra_regs, and I see
that the union in struct hw_perf_event currently contains various PMU-specific
structures (hardware, software, tracepoint, intel_cqm, itrace, amd_power,
and breakpoint).

For amd_iommu PMU, we need additional registers for holding amd_iommu-specific
parameters. So, it seems that we can just introduce amd_iommu-specific struct
instead of re-using the existing structure for hardware events.

I'm planning to add the following structure in the same union:

union {
..
struct { /* amd_iommu */
u8  iommu_csource;
u8  iommu_bank;
u8  iommu_cntr;
u16 iommu_devid;
u16 iommu_devid_msk;
u16 iommu_domid;
u16 iommu_domid_msk;
u32 iommu_pasid;
u32 iommu_pasid_msk;
};
};

Please let me know what you think, or if I am still missing your points.

Yes, adding a struct to that union is fine and clarifies things. And
just because I'm weird like that, there's a u8 hole after iommu_cntr.


Ok, I'll update this in V10, which I'll be sending out this week.

Thanks,
Suravee


intel-iommu sysfs oops.

2017-02-23 Thread Dave Jones
cat /sys/devices/virtual/iommu/dmar0/intel-iommu/version

Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
CPU: 2 PID: 1488 Comm: cat Not tainted 4.10.0-think+ #5 
task: 8804ee440040 task.stack: c9d48000
RIP: 0010:intel_iommu_show_version+0x13/0x40
RSP: 0018:c9d4bcf0 EFLAGS: 00010286
RAX: 8804ede80008 RBX: 81ec2c80 RCX: 
RDX:  RSI: 81ec2c80 RDI: 88017fc2d5d0
RBP: c9d4bcf0 R08:  R09: 8804ede80008
R10: 068b13bb R11: 0006 R12: 1000
R13: 81a96f10 R14: 8804fdd131b8 R15: 8804ede80008
FS:  7feb8ad5e700() GS:880507c0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 0004ee49a000 CR4: 001406e0
Call Trace:
 dev_attr_show+0x20/0x50
 ? sysfs_file_ops+0x46/0x70
 sysfs_kf_seq_show+0xb7/0x110
 kernfs_seq_show+0x26/0x30
 seq_read+0x129/0x4a0
 kernfs_fop_read+0x13b/0x1a0
 __vfs_read+0x37/0x140
 ? __context_tracking_exit.part.5+0x82/0x150
 vfs_read+0xab/0x180
 SyS_read+0x58/0xc0
 do_syscall_64+0x61/0x170
 entry_SYSCALL64_slow_path+0x25/0x25
RIP: 0033:0x7feb8a8875a0
RSP: 002b:7fff8dece608 EFLAGS: 0246
ORIG_RAX: 
RAX: ffda RBX: 0002 RCX: 7feb8a8875a0
RDX: 0002 RSI: 7feb8aba2000 RDI: 0003
RBP: 7feb8aba2000 R08:  R09: 
R10: 037b R11: 0246 R12: 7feb8aba2000
R13: 0003 R14: 0002 R15: 1000


$ dmesg | grep -i dmar
[0.00] ACPI: DMAR 0xD479B0B8 B8 (v01 LENOVO TC-FB
1B30 INTL 0001)
[0.190199] DMAR: Host address width 39
[0.190208] DMAR: DRHD base: 0x00fed9 flags: 0x0
[0.190235] DMAR: dmar0: reg_base_addr fed9 ver 1:0 cap c020660462 
ecap f0101a
[0.190250] DMAR: DRHD base: 0x00fed91000 flags: 0x1
[0.190272] DMAR: dmar1: reg_base_addr fed91000 ver 1:0 cap d2008020660462 
ecap f010da
[0.190286] DMAR: RMRR base: 0x00d5e6c000 end: 0x00d5e92fff
[0.190302] DMAR: RMRR base: 0x00d700 end: 0x00df1f
[0.190318] DMAR-IR: IOAPIC id 8 under DRHD base  0xfed91000 IOMMU 1
[0.190330] DMAR-IR: HPET id 0 under DRHD base 0xfed91000
[0.190341] DMAR-IR: Queued invalidation will be enabled to support x2apic 
and Intr-remapping.
[0.190634] DMAR-IR: Enabled IRQ remapping in x2apic mode
[1.050338] DMAR: No ATSR found
[1.050717] DMAR: dmar0: Using Queued invalidation
[1.050739] DMAR: dmar1: Using Queued invalidation
[1.050799] DMAR: Setting RMRR:
[1.050938] DMAR: Setting identity map for device :00:02.0 [0xd700 - 
0xdf1f]
[1.051763] DMAR: Setting identity map for device :00:14.0 [0xd5e6c000 - 
0xd5e92fff]
[1.051911] DMAR: Setting identity map for device :00:1a.0 [0xd5e6c000 - 
0xd5e92fff]
[1.052054] DMAR: Setting identity map for device :00:1d.0 [0xd5e6c000 - 
0xd5e92fff]
[1.052083] DMAR: Prepare 0-16MiB unity mapping for LPC
[1.052200] DMAR: Setting identity map for device :00:1f.0 [0x0 - 
0xff]
[1.052331] DMAR: Intel(R) Virtualization Technology for Directed I/O
[1.220475] [drm] DMAR active, disabling use of stolen memory
[1.653051] calling  dmar_free_unused_resources+0x0/0xcf @ 1
[1.653054] initcall dmar_free_unused_resources+0x0/0xcf returned 0 after 1 
usecs

$ dmesg | grep -i iommu
[0.190318] DMAR-IR: IOAPIC id 8 under DRHD base  0xfed91000 IOMMU 1
[0.568456] calling  iommu_init+0x0/0x2b @ 1
[0.568470] initcall iommu_init+0x0/0x2b returned 0 after 0 usecs
[0.570137] calling  iommu_dev_init+0x0/0x19 @ 1
[0.570158] initcall iommu_dev_init+0x0/0x19 returned 0 after 0 usecs
[1.050256] calling  pci_iommu_init+0x0/0x3c @ 1
[1.052700] iommu: Adding device :00:00.0 to group 0
[1.052767] iommu: Adding device :00:02.0 to group 1
[1.052833] iommu: Adding device :00:03.0 to group 2
[1.052895] iommu: Adding device :00:14.0 to group 3
[1.052965] iommu: Adding device :00:16.0 to group 4
[1.053026] iommu: Adding device :00:19.0 to group 5
[1.053093] iommu: Adding device :00:1a.0 to group 6
[1.053157] iommu: Adding device :00:1b.0 to group 7
[1.053220] iommu: Adding device :00:1c.0 to group 8
[1.053286] iommu: Adding device :00:1c.3 to group 9
[1.053348] iommu: Adding device :00:1d.0 to group 10
[1.053443] iommu: Adding device :00:1f.0 to group 11
[1.053487] iommu: Adding device :00:1f.2 to group 11
[1.053529] iommu: Adding device :00:1f.3 to group 11
[1.053599] iommu: Adding device :02:00.0 to group 12
[1.065133] initcall pci_iommu_init+0x0/0x3c returned 0 after 14515 usecs



Re: [RFC PATCH v4 14/28] Add support to access boot related data in the clear

2017-02-23 Thread Tom Lendacky

On 2/21/2017 9:06 AM, Borislav Petkov wrote:

On Thu, Feb 16, 2017 at 09:45:09AM -0600, Tom Lendacky wrote:

Boot data (such as EFI related data) is not encrypted when the system is
booted and needs to be mapped decrypted.  Add support to apply the proper
attributes to the EFI page tables and to the early_memremap and memremap
APIs to identify the type of data being accessed so that the proper
encryption attribute can be applied.


So this doesn't even begin to explain *why* we need this. The emphasis
being on *why*.

Lemme guess? kexec? And because of efi_reuse_config?


Hmm... maybe I'm missing something here.  This doesn't have anything to
do with kexec or efi_reuse_config.  This has to do with the fact that
when a system boots, the setup data and the EFI data are not encrypted.
Since they are not encrypted, we need to be sure that any early_memremap()
and memremap() calls remove the encryption mask from the resulting page
table entry that is created, so the data can be accessed properly.
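
In other words, for those ranges the mapping has to be built with the
encryption mask cleared, along these lines (a sketch only; the helper
name is made up, sme_me_mask is from this series):

	/* Build a decrypted protection value for boot/EFI data. */
	static pgprot_t boot_data_prot(pgprot_t prot)
	{
		return __pgprot(pgprot_val(prot) & ~sme_me_mask);
	}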



If so, then that whole ad-hoc caching in parse_setup_data() needs to go.
Especially if efi_reuse_config() already sees those addresses so while
we're there, we could save them somewhere or whatnot. But not doing the
whole thing again in parse_setup_data().


Signed-off-by: Tom Lendacky 
---
 arch/x86/include/asm/io.h  |3 +
 arch/x86/include/asm/setup.h   |8 +++
 arch/x86/kernel/setup.c|   33 
 arch/x86/mm/ioremap.c  |  111 
 arch/x86/platform/efi/efi_64.c |   16 --
 kernel/memremap.c  |   11 
 mm/early_ioremap.c |   18 +-
 7 files changed, 192 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 7afb0e2..833f7cc 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -381,4 +381,7 @@ extern int __must_check arch_phys_wc_add(unsigned long base,
 #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
 #endif

+extern bool arch_memremap_do_ram_remap(resource_size_t offset, size_t size);
+#define arch_memremap_do_ram_remap arch_memremap_do_ram_remap
+
 #endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ac1d5da..8d9 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -63,6 +63,14 @@ static inline void x86_ce4100_early_setup(void) { }
 #include 
 #include 

+struct setup_data_attrs {
+   u64 paddr;
+   unsigned long size;
+};
+
+extern struct setup_data_attrs setup_data_list[];
+extern unsigned int setup_data_list_count;
+
 /*
  * This is set up by the setup-routine at boot-time
  */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bd5b9a7..d2234bf 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -148,6 +148,9 @@ int default_check_phys_apicid_present(int phys_apicid)

 struct boot_params boot_params;

+struct setup_data_attrs setup_data_list[32];
+unsigned int setup_data_list_count;
+
 /*
  * Machine setup..
  */
@@ -419,6 +422,32 @@ static void __init reserve_initrd(void)
 }
 #endif /* CONFIG_BLK_DEV_INITRD */

+static void __init update_setup_data_list(u64 pa_data, unsigned long size)
+{
+   unsigned int i;
+
+   for (i = 0; i < setup_data_list_count; i++) {
+   if (setup_data_list[i].paddr != pa_data)
+   continue;
+
+   setup_data_list[i].size = size;
+   break;
+   }
+}
+
+static void __init add_to_setup_data_list(u64 pa_data, unsigned long size)
+{
+   if (!sme_active())
+   return;
+
+   if (!WARN(setup_data_list_count == ARRAY_SIZE(setup_data_list),
+ "exceeded maximum setup data list slots")) {
+   setup_data_list[setup_data_list_count].paddr = pa_data;
+   setup_data_list[setup_data_list_count].size = size;
+   setup_data_list_count++;
+   }
+}
+
 static void __init parse_setup_data(void)
 {
struct setup_data *data;
@@ -428,12 +457,16 @@ static void __init parse_setup_data(void)
while (pa_data) {
u32 data_len, data_type;

+   add_to_setup_data_list(pa_data, sizeof(*data));
+
data = early_memremap(pa_data, sizeof(*data));
data_len = data->len + sizeof(struct setup_data);
data_type = data->type;
pa_next = data->next;
early_memunmap(data, sizeof(*data));

+   update_setup_data_list(pa_data, data_len);
+
switch (data_type) {
case SETUP_E820_EXT:
e820__memory_setup_extended(pa_data, data_len);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2385e70..b0ff6bc 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 

 #include 
 #include 
@@ -21,6 +22,7 @@
 #include 
 #include 
 #include 
+#i

Re: [RFC PATCH v4 07/28] x86: Provide general kernel support for memory encryption

2017-02-23 Thread Tom Lendacky

On 2/22/2017 12:13 PM, Dave Hansen wrote:

On 02/16/2017 07:43 AM, Tom Lendacky wrote:

 static inline unsigned long pte_pfn(pte_t pte)
 {
-   return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
+   return (pte_val(pte) & ~sme_me_mask & PTE_PFN_MASK) >> PAGE_SHIFT;
 }

 static inline unsigned long pmd_pfn(pmd_t pmd)
 {
-   return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
+   return (pmd_val(pmd) & ~sme_me_mask & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
 }


Could you talk a bit about why you chose to do the "~sme_me_mask" bit in
here instead of making it a part of PTE_PFN_MASK / pmd_pfn_mask(pmd)?


I think that's a good catch.  Let me look at it, but I believe that it
should be possible to do and avoid what you're worried about below.
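
Roughly, folding it into the mask itself would look something like this
(a sketch of the direction, not the final form):

	/* If the PFN mask already excludes the encryption bit, the
	 * accessors need no explicit ~sme_me_mask (illustrative only). */
	#define PTE_PFN_MASK	(((pteval_t)__PHYSICAL_MASK & ~sme_me_mask) & \
				 (pteval_t)PAGE_MASK)

	static inline unsigned long pte_pfn(pte_t pte)
	{
		return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
	}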

Thanks,
Tom



It might not matter, but I'd be worried that this ends up breaking
direct users of PTE_PFN_MASK / pmd_pfn_mask(pmd) since they now no
longer mask the PFN out of a PTE.




Re: intel-iommu sysfs oops.

2017-02-23 Thread Joerg Roedel
Hi Dave,

On Thu, Feb 23, 2017 at 02:44:06PM -0500, Dave Jones wrote:
> cat /sys/devices/virtual/iommu/dmar0/intel-iommu/version
> 
> Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC

Thanks for the report, the problem reproduces easily here. The diff
below fixes the issue on Intel; AMD has the same problem and needs a
similar fix, which I'll do shortly.

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index f5e02f8..ec3f6c8 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4730,11 +4730,16 @@ static int intel_iommu_cpu_dead(unsigned int cpu)
return 0;
 }
 
+static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
+{
+   return container_of(dev, struct intel_iommu, iommu.dev);
+}
+
 static ssize_t intel_iommu_show_version(struct device *dev,
struct device_attribute *attr,
char *buf)
 {
-   struct intel_iommu *iommu = dev_get_drvdata(dev);
+   struct intel_iommu *iommu = dev_to_intel_iommu(dev);
u32 ver = readl(iommu->reg + DMAR_VER_REG);
return sprintf(buf, "%d:%d\n",
   DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
@@ -4745,7 +4750,7 @@ static ssize_t intel_iommu_show_address(struct device *dev,
struct device_attribute *attr,
char *buf)
 {
-   struct intel_iommu *iommu = dev_get_drvdata(dev);
+   struct intel_iommu *iommu = dev_to_intel_iommu(dev);
return sprintf(buf, "%llx\n", iommu->reg_phys);
 }
 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
@@ -4754,7 +4759,7 @@ static ssize_t intel_iommu_show_cap(struct device *dev,
struct device_attribute *attr,
char *buf)
 {
-   struct intel_iommu *iommu = dev_get_drvdata(dev);
+   struct intel_iommu *iommu = dev_to_intel_iommu(dev);
return sprintf(buf, "%llx\n", iommu->cap);
 }
 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
@@ -4763,7 +4768,7 @@ static ssize_t intel_iommu_show_ecap(struct device *dev,
struct device_attribute *attr,
char *buf)
 {
-   struct intel_iommu *iommu = dev_get_drvdata(dev);
+   struct intel_iommu *iommu = dev_to_intel_iommu(dev);
return sprintf(buf, "%llx\n", iommu->ecap);
 }
 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
@@ -4772,7 +4777,7 @@ static ssize_t intel_iommu_show_ndoms(struct device *dev,
  struct device_attribute *attr,
  char *buf)
 {
-   struct intel_iommu *iommu = dev_get_drvdata(dev);
+   struct intel_iommu *iommu = dev_to_intel_iommu(dev);
return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
 }
 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
@@ -4781,7 +4786,7 @@ static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
   struct device_attribute *attr,
   char *buf)
 {
-   struct intel_iommu *iommu = dev_get_drvdata(dev);
+   struct intel_iommu *iommu = dev_to_intel_iommu(dev);
return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
  cap_ndoms(iommu->cap)));
 }