[PATCH] iommu: iova: Consolidate code for adding new node to iovad domain rbtree
This patch consolidates almost the same code used in iova_insert_rbtree() and __alloc_and_insert_iova_range() functions. There is no functional change. Signed-off-by: Marek Szyprowski --- drivers/iommu/iova.c | 85 +++- 1 file changed, 31 insertions(+), 54 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index b7268a14184f..32b9c2fb37b6 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -100,6 +100,32 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad, } } +/* Insert the iova into domain rbtree by holding writer lock */ +static void +iova_insert_rbtree(struct rb_root *root, struct iova *iova, + struct rb_node *start) +{ + struct rb_node **new, *parent = NULL; + + new = (start) ? &start : &(root->rb_node); + /* Figure out where to put new node */ + while (*new) { + struct iova *this = rb_entry(*new, struct iova, node); + + parent = *new; + + if (iova->pfn_lo < this->pfn_lo) + new = &((*new)->rb_left); + else if (iova->pfn_lo > this->pfn_lo) + new = &((*new)->rb_right); + else + BUG(); /* this should not happen */ + } + /* Add new node and rebalance tree. */ + rb_link_node(&iova->node, parent, new); + rb_insert_color(&iova->node, root); +} + /* * Computes the padding size required, to make the start address * naturally aligned on the power-of-two order of its size @@ -157,35 +183,8 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, new->pfn_lo = limit_pfn - (size + pad_size) + 1; new->pfn_hi = new->pfn_lo + size - 1; - /* Insert the new_iova into domain rbtree by holding writer lock */ - /* Add new node and rebalance tree. */ - { - struct rb_node **entry, *parent = NULL; - - /* If we have 'prev', it's a valid place to start the - insertion. Otherwise, start from the root. 
*/ - if (prev) - entry = &prev; - else - entry = &iovad->rbroot.rb_node; - - /* Figure out where to put new node */ - while (*entry) { - struct iova *this = rb_entry(*entry, struct iova, node); - parent = *entry; - - if (new->pfn_lo < this->pfn_lo) - entry = &((*entry)->rb_left); - else if (new->pfn_lo > this->pfn_lo) - entry = &((*entry)->rb_right); - else - BUG(); /* this should not happen */ - } - - /* Add new node and rebalance tree. */ - rb_link_node(&new->node, parent, entry); - rb_insert_color(&new->node, &iovad->rbroot); - } + /* If we have 'prev', it's a valid place to start the insertion. */ + iova_insert_rbtree(&iovad->rbroot, new, prev); __cached_rbnode_insert_update(iovad, saved_pfn, new); spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); @@ -194,28 +193,6 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, return 0; } -static void -iova_insert_rbtree(struct rb_root *root, struct iova *iova) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - /* Figure out where to put new node */ - while (*new) { - struct iova *this = rb_entry(*new, struct iova, node); - - parent = *new; - - if (iova->pfn_lo < this->pfn_lo) - new = &((*new)->rb_left); - else if (iova->pfn_lo > this->pfn_lo) - new = &((*new)->rb_right); - else - BUG(); /* this should not happen */ - } - /* Add new node and rebalance tree. 
*/ - rb_link_node(&iova->node, parent, new); - rb_insert_color(&iova->node, root); -} - static struct kmem_cache *iova_cache; static unsigned int iova_cache_users; static DEFINE_MUTEX(iova_cache_mutex); @@ -505,7 +482,7 @@ void put_iova_domain(struct iova_domain *iovad) iova = alloc_and_init_iova(pfn_lo, pfn_hi); if (iova) - iova_insert_rbtree(&iovad->rbroot, iova); + iova_insert_rbtree(&iovad->rbroot, iova, NULL); return iova; } @@ -612,11 +589,11 @@ struct iova * rb_erase(&iova->node, &iovad->rbroot); if (prev) { - iova_insert_rbtree(&iovad->rbroot, prev); + iova_insert_rbtree(&iovad->rbroot, prev, NULL); iova->pfn_lo = pfn_lo; } if (next) { - iova_insert_rbtree(&iovad->rbroot, next); + iova_insert_rbtree(&iovad->rbroot, next, NULL); iova->pf
Re: Partial BAR Address Allocation
On 22/02/17 23:39, Bjorn Helgaas wrote: > [+cc Joerg, iommu list] > > On Wed, Feb 22, 2017 at 03:44:53PM -0500, Sinan Kaya wrote: >> On 2/22/2017 1:44 PM, Bjorn Helgaas wrote: >>> There is no way for a driver to say "I only need this memory BAR and >>> not the other ones." The reason is because the PCI_COMMAND_MEMORY bit >>> enables *all* the memory BARs; there's no way to enable memory BARs >>> selectively. If we enable memory BARs and one of them is unassigned, >>> that unassigned BAR is enabled, and the device will respond at >>> whatever address the register happens to contain, and that may cause >>> conflicts. >>> >>> I'm not sure this answers your question. Do you want to get rid of >>> 32-bit BAR addresses because your host bridge doesn't have a window to >>> 32-bit PCI addresses? It's typical for a bridge to support a window >>> to the 32-bit PCI space as well as one to the 64-bit PCI space. Often >>> it performs address translation for the 32-bit window so it doesn't >>> have to be in the 32-bit area on the CPU side, e.g., you could have >>> something like this where we have three host bridges and the 2-4GB >>> space on each PCI root bus is addressable: >>> >>> pci_bus :00: root bus resource [mem 0x108000-0x10] (bus >>> address [0x8000-0x]) >>> pci_bus 0001:00: root bus resource [mem 0x118000-0x11] (bus >>> address [0x8000-0x]) >>> pci_bus 0002:00: root bus resource [mem 0x128000-0x12] (bus >>> address [0x8000-0x]) >> >> The problem is that according to PCI specification BAR addresses and >> DMA addresses cannot overlap. >> >> From PCI-to-PCI Bridge Arch. spec.: "A bridge forwards PCI memory >> transactions from its primary interface to its secondary interface >> (downstream) if a memory address is in the range defined by the >> Memory Base and Memory Limit registers (when the base is less than >> or equal to the limit) as illustrated in Figure 4-3. 
Conversely, a >> memory transaction on the secondary interface that is within this >> address range will not be forwarded upstream to the primary >> interface." >> >> To be specific, if your DMA address happens to be in >> [0x8000-0x] and root port's aperture includes this >> range; the DMA will never make to the system memory. >> >> Lorenzo and Robin took some steps to carve out PCI addresses out of >> DMA addresses in IOMMU drivers by using iova_reserve_pci_windows() >> function. >> >> However, I see that we are still exposed when the operating system >> doesn't have any IOMMU driver and is using the SWIOTLB for instance. > > Hmmm. I guess SWIOTLB assumes there's no address translation in the > DMA direction, right? Not entirely - it does rely on arch-provided dma_to_phys() and phys_to_dma() helpers which are free to accommodate such translations in a device-specific manner. On arm64 we use these to account for dev->dma_pfn_offset describing a straightforward linear offset, but unless one constant offset would apply to all possible outbound windows I'm not sure that's much help here. > If there's no address translation in the PIO > direction, PCI bus BAR addresses are identical to the CPU-side > addresses. In that case, there's no conflict because we already have > to assign BARs so they never look like a system memory address. > > But if there *is* address translation in the PIO direction, we can > have conflicts because the bridge can translate CPU-side PIO accesses > to arbitrary PCI bus addresses. > >> The FW solution I'm looking at requires carving out some part of the >> DDR from before OS boot so that OS doesn't reclaim that area for >> DMA. > > If you want to reach system RAM, I guess you need to make sure you > only DMA to bus addresses outside the host bridge windows, as you said > above. DMA inside the windows would be handled as peer-to-peer DMA. > >> I'm not very happy with this solution. 
I'm also surprised that there >> is no generic solution in the kernel takes care of this for all root >> ports regardless of IOMMU driver presence. > > The PCI core isn't really involved in allocating DMA addresses, > although there definitely is the connection with PCI-to-PCI bridge > windows that you mentioned. I added IOMMU guys, who would know a lot > more than I do. To me, having the bus addresses of windows shadow assigned physical addresses sounds mostly like a broken system configuration. Can the firmware not reprogram them elsewhere, or is the entire bottom 4GB of the physical memory map occupied by system RAM? Robin. > > Bjorn > ___ > iommu mailing list > iommu@lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/iommu > ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu: iova: Consolidate code for adding new node to iovad domain rbtree
On 23/02/17 08:17, Marek Szyprowski wrote: > This patch consolidates almost the same code used in iova_insert_rbtree() > and __alloc_and_insert_iova_range() functions. There is no functional change. > > Signed-off-by: Marek Szyprowski > --- > drivers/iommu/iova.c | 85 > +++- > 1 file changed, 31 insertions(+), 54 deletions(-) > > diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c > index b7268a14184f..32b9c2fb37b6 100644 > --- a/drivers/iommu/iova.c > +++ b/drivers/iommu/iova.c > @@ -100,6 +100,32 @@ static unsigned long iova_rcache_get(struct iova_domain > *iovad, > } > } > > +/* Insert the iova into domain rbtree by holding writer lock */ > +static void > +iova_insert_rbtree(struct rb_root *root, struct iova *iova, > +struct rb_node *start) > +{ > + struct rb_node **new, *parent = NULL; > + > + new = (start) ? &start : &(root->rb_node); > + /* Figure out where to put new node */ > + while (*new) { > + struct iova *this = rb_entry(*new, struct iova, node); > + > + parent = *new; > + > + if (iova->pfn_lo < this->pfn_lo) > + new = &((*new)->rb_left); > + else if (iova->pfn_lo > this->pfn_lo) > + new = &((*new)->rb_right); > + else > + BUG(); /* this should not happen */ Ooh, if we're touching this, can we downgrade it to a WARN()? Granted, allocating an IOVA region of size 0 is not a reasonable thing to do intentionally, but the fact that it's guaranteed to take down the kernel is perhaps a bit much (I hit it s many times back when debugging the iommu_dma_map_sg() stuff). Nice tidyup otherwise, though. Robin. > + } > + /* Add new node and rebalance tree. 
*/ > + rb_link_node(&iova->node, parent, new); > + rb_insert_color(&iova->node, root); > +} > + > /* > * Computes the padding size required, to make the start address > * naturally aligned on the power-of-two order of its size > @@ -157,35 +183,8 @@ static int __alloc_and_insert_iova_range(struct > iova_domain *iovad, > new->pfn_lo = limit_pfn - (size + pad_size) + 1; > new->pfn_hi = new->pfn_lo + size - 1; > > - /* Insert the new_iova into domain rbtree by holding writer lock */ > - /* Add new node and rebalance tree. */ > - { > - struct rb_node **entry, *parent = NULL; > - > - /* If we have 'prev', it's a valid place to start the > -insertion. Otherwise, start from the root. */ > - if (prev) > - entry = &prev; > - else > - entry = &iovad->rbroot.rb_node; > - > - /* Figure out where to put new node */ > - while (*entry) { > - struct iova *this = rb_entry(*entry, struct iova, node); > - parent = *entry; > - > - if (new->pfn_lo < this->pfn_lo) > - entry = &((*entry)->rb_left); > - else if (new->pfn_lo > this->pfn_lo) > - entry = &((*entry)->rb_right); > - else > - BUG(); /* this should not happen */ > - } > - > - /* Add new node and rebalance tree. */ > - rb_link_node(&new->node, parent, entry); > - rb_insert_color(&new->node, &iovad->rbroot); > - } > + /* If we have 'prev', it's a valid place to start the insertion. 
*/ > + iova_insert_rbtree(&iovad->rbroot, new, prev); > __cached_rbnode_insert_update(iovad, saved_pfn, new); > > spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); > @@ -194,28 +193,6 @@ static int __alloc_and_insert_iova_range(struct > iova_domain *iovad, > return 0; > } > > -static void > -iova_insert_rbtree(struct rb_root *root, struct iova *iova) > -{ > - struct rb_node **new = &(root->rb_node), *parent = NULL; > - /* Figure out where to put new node */ > - while (*new) { > - struct iova *this = rb_entry(*new, struct iova, node); > - > - parent = *new; > - > - if (iova->pfn_lo < this->pfn_lo) > - new = &((*new)->rb_left); > - else if (iova->pfn_lo > this->pfn_lo) > - new = &((*new)->rb_right); > - else > - BUG(); /* this should not happen */ > - } > - /* Add new node and rebalance tree. */ > - rb_link_node(&iova->node, parent, new); > - rb_insert_color(&iova->node, root); > -} > - > static struct kmem_cache *iova_cache; > static unsigned int iova_cache_users; > static DEFINE_MUTEX(iova_cache_mutex); > @@ -505,7 +482,7 @@ void put_iova_domain(struct iova_domain *iovad) > > iova = alloc_and_init_iova(pfn_lo, pfn_hi); > if (iova) > - iova_insert_rbtree(&iovad->rbroot, iova); > + iova_insert_
Re: Partial BAR Address Allocation
Hi Robin, On 2/23/2017 6:40 AM, Robin Murphy wrote: > On 22/02/17 23:39, Bjorn Helgaas wrote: >> [+cc Joerg, iommu list] >> >> On Wed, Feb 22, 2017 at 03:44:53PM -0500, Sinan Kaya wrote: >>> On 2/22/2017 1:44 PM, Bjorn Helgaas wrote: There is no way for a driver to say "I only need this memory BAR and not the other ones." The reason is because the PCI_COMMAND_MEMORY bit enables *all* the memory BARs; there's no way to enable memory BARs selectively. If we enable memory BARs and one of them is unassigned, that unassigned BAR is enabled, and the device will respond at whatever address the register happens to contain, and that may cause conflicts. I'm not sure this answers your question. Do you want to get rid of 32-bit BAR addresses because your host bridge doesn't have a window to 32-bit PCI addresses? It's typical for a bridge to support a window to the 32-bit PCI space as well as one to the 64-bit PCI space. Often it performs address translation for the 32-bit window so it doesn't have to be in the 32-bit area on the CPU side, e.g., you could have something like this where we have three host bridges and the 2-4GB space on each PCI root bus is addressable: pci_bus :00: root bus resource [mem 0x108000-0x10] (bus address [0x8000-0x]) pci_bus 0001:00: root bus resource [mem 0x118000-0x11] (bus address [0x8000-0x]) pci_bus 0002:00: root bus resource [mem 0x128000-0x12] (bus address [0x8000-0x]) >>> >>> The problem is that according to PCI specification BAR addresses and >>> DMA addresses cannot overlap. >>> >>> From PCI-to-PCI Bridge Arch. spec.: "A bridge forwards PCI memory >>> transactions from its primary interface to its secondary interface >>> (downstream) if a memory address is in the range defined by the >>> Memory Base and Memory Limit registers (when the base is less than >>> or equal to the limit) as illustrated in Figure 4-3. 
Conversely, a >>> memory transaction on the secondary interface that is within this >>> address range will not be forwarded upstream to the primary >>> interface." >>> >>> To be specific, if your DMA address happens to be in >>> [0x8000-0x] and root port's aperture includes this >>> range; the DMA will never make to the system memory. >>> >>> Lorenzo and Robin took some steps to carve out PCI addresses out of >>> DMA addresses in IOMMU drivers by using iova_reserve_pci_windows() >>> function. >>> >>> However, I see that we are still exposed when the operating system >>> doesn't have any IOMMU driver and is using the SWIOTLB for instance. >> >> Hmmm. I guess SWIOTLB assumes there's no address translation in the >> DMA direction, right? > > Not entirely - it does rely on arch-provided dma_to_phys() and > phys_to_dma() helpers which are free to accommodate such translations in > a device-specific manner. On arm64 we use these to account for > dev->dma_pfn_offset describing a straightforward linear offset, but > unless one constant offset would apply to all possible outbound windows > I'm not sure that's much help here. yeah, that won't help. This is a PCI only problem. Arch layer solution will move the entire DMA ranges for all peripherals in the SOC to a specific offset. This would be most useful if the entire DDR would start at some non-zero offset. Even then, PCI usually has several ranges. One range like this to have some space below 4GB and another untranslated range for true 64bit cards. pci_bus 0002:00: root bus resource [mem 0x128000-0x12] (bus address [0x8000-0x]) We have to emulate some range in the first 4GB to make PCI cards happy. > >> If there's no address translation in the PIO >> direction, PCI bus BAR addresses are identical to the CPU-side >> addresses. In that case, there's no conflict because we already have >> to assign BARs so they never look like a system memory address. 
>> >> But if there *is* address translation in the PIO direction, we can >> have conflicts because the bridge can translate CPU-side PIO accesses >> to arbitrary PCI bus addresses. >> >>> The FW solution I'm looking at requires carving out some part of the >>> DDR from before OS boot so that OS doesn't reclaim that area for >>> DMA. >> >> If you want to reach system RAM, I guess you need to make sure you >> only DMA to bus addresses outside the host bridge windows, as you said >> above. DMA inside the windows would be handled as peer-to-peer DMA. >> >>> I'm not very happy with this solution. I'm also surprised that there >>> is no generic solution in the kernel takes care of this for all root >>> ports regardless of IOMMU driver presence. >> >> The PCI core isn't really involved in allocating DMA addresses, >> although there definitely is the connection with PCI-to-PCI bridge >> windows that you mentioned. I added IOMMU guys,
Re: [RFC PATCH v4 13/28] efi: Update efi_mem_type() to return defined EFI mem types
On 2/21/2017 6:05 AM, Matt Fleming wrote: On Thu, 16 Feb, at 09:44:57AM, Tom Lendacky wrote: Update the efi_mem_type() to return EFI_RESERVED_TYPE instead of a hardcoded 0. Signed-off-by: Tom Lendacky --- arch/x86/platform/efi/efi.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a15cf81..6407103 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -1037,7 +1037,7 @@ u32 efi_mem_type(unsigned long phys_addr) efi_memory_desc_t *md; if (!efi_enabled(EFI_MEMMAP)) - return 0; + return EFI_RESERVED_TYPE; for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && @@ -1045,7 +1045,7 @@ u32 efi_mem_type(unsigned long phys_addr) (md->num_pages << EFI_PAGE_SHIFT return md->type; } - return 0; + return EFI_RESERVED_TYPE; } I see what you're getting at here, but arguably the return value in these cases never should have been zero to begin with (your change just makes that more obvious). Returning EFI_RESERVED_TYPE implies an EFI memmap entry exists for this address, which is misleading because it doesn't in the hunks you've modified above. Instead, could you look at returning a negative error value in the usual way we do in the Linux kernel, and update the function prototype to match? I don't think any callers actually require the return type to be u32. I can do that, I'll change the return type to an int. For the !efi_enabled I can return -ENOTSUPP and for when an entry isn't found I can return -EINVAL. Sound good? The ia64 arch is the only other arch that defines the function. It has just a single return 0 that I'll change to -EINVAL. Thanks, Tom ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v8 9/9] perf/amd/iommu: Enable support for multiple IOMMUs
Peter, On 2/14/17 19:31, Peter Zijlstra wrote: On Tue, Feb 07, 2017 at 08:57:52AM +0700, Suravee Suthikulpanit wrote: But instead it looks like you get the counter form: #define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) Which is absolutely insane. So, the IOMMU counters are grouped into bank, and there could be many banks. I use the extra_reg.reg to hold the bank and counter indices. This will be used to program onto the counter configuration register. This is handled in get_next_avail_iommu_bnk_cntr() and clear_avail_iommu_bnk_cntr(). But this is crazy. That's not what extra_regs are for. Ok, I understand that we should not be using the extra_regs since it is intended for other purposes. Please see more detail below. Also, who cares about the banks, why is this exposed? The bank and counter values are not exposed to the user-space. The amd_iommu PMU only expose, csource, devid, domid, pasid, devid_mask, domid_mask, and pasid_mask as event attributes. That is, I would very much expect a linear range of counters. You can always decompose this counter number if you really need to somewhere down near the hardware accessors. Actually, the counters are treated as linear range of counters. For example, the IOMMU hardware has 2 banks with 4 counters/bank. So, we have total of 8 counters. The driver then assigns an index to each events when an event is added. Here, the bank/counter are derived from the assigned index, and stored in the perf_event as bank and counter values. However, I have looked into reworking to not use the extra_regs, and I see that the union in struct hw_perf_event currently contains various PMU-specific structures (hardware, software, tracepoint, intel_cqm, itrace, amd_power, and breakpoint). For amd_iommu PMU, we need additional registers for holding amd_iommu-specific parameters. So, it seems that we can just introduce amd_iommu-specific struct instead of re-using the existing structure for hardware events. 
I'm planning to add the following structure in the same union: union { .. struct { /* amd_iommu */ u8 iommu_csource; u8 iommu_bank; u8 iommu_cntr; u16 iommu_devid; u16 iommu_devid_msk; u16 iommu_domid; u16 iommu_domid_msk; u32 iommu_pasid; u32 iommu_pasid_msk; }; }; Please let me know what you think, or if I am still missing your points. Thanks, Suravee ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v8 9/9] perf/amd/iommu: Enable support for multiple IOMMUs
On Fri, Feb 24, 2017 at 12:43:19AM +0700, Suravee Suthikulpanit wrote: > >Also, who cares about the banks, why is this exposed? > > The bank and counter values are not exposed to the user-space. > The amd_iommu PMU only expose, csource, devid, domid, pasid, devid_mask, > domid_mask, and pasid_mask as event attributes. Ah good, for a little while I was worried the BANK stuff came from userspace; I misread extra_reg.config and extra_reg.reg, the former being perf_event_attr::config1 and the latter holding the bank thing. > >That is, I would very much expect a linear range of counters. You can > >always decompose this counter number if you really need to somewhere > >down near the hardware accessors. > > > > Actually, the counters are treated as linear range of counters. For example, > the IOMMU hardware has 2 banks with 4 counters/bank. So, we have total of 8 > counters. The driver then assigns an index to each events when an event is > added. > Here, the bank/counter are derived from the assigned index, and stored in > the perf_event as bank and counter values. > > However, I have looked into reworking to not use the extra_regs, and I see > that the union in struct hw_perf_event currently contains various PMU-specific > structures (hardware, software, tracepoint, intel_cqm, itrace, amd_power, > and breakpoint). > > For amd_iommu PMU, we need additional registers for holding amd_iommu-specific > parameters. So, it seems that we can just introduce amd_iommu-specific struct > instead of re-using the existing structure for hardware events. > > I'm planning to add the following structure in the same union: > > union { > .. > struct { /* amd_iommu */ > u8 iommu_csource; > u8 iommu_bank; > u8 iommu_cntr; > u16 iommu_devid; > u16 iommu_devid_msk; > u16 iommu_domid; > u16 iommu_domid_msk; > u32 iommu_pasid; > u32 iommu_pasid_msk; > }; > }; > > Please let me know what you think, of if I am still missing your points. 
Yes, adding a struct to that union is fine and clarifies things. And just because I'm weird like that, there's a u8 hole after iommu_cntr. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v8 9/9] perf/amd/iommu: Enable support for multiple IOMMUs
On 2/24/17 01:11, Peter Zijlstra wrote: However, I have looked into reworking to not use the extra_regs, and I see that the union in struct hw_perf_event currently contains various PMU-specific structures (hardware, software, tracepoint, intel_cqm, itrace, amd_power, and breakpoint). For amd_iommu PMU, we need additional registers for holding amd_iommu-specific parameters. So, it seems that we can just introduce amd_iommu-specific struct instead of re-using the existing structure for hardware events. I'm planning to add the following structure in the same union: union { .. struct { /* amd_iommu */ u8 iommu_csource; u8 iommu_bank; u8 iommu_cntr; u16 iommu_devid; u16 iommu_devid_msk; u16 iommu_domid; u16 iommu_domid_msk; u32 iommu_pasid; u32 iommu_pasid_msk; }; }; Please let me know what you think, or if I am still missing your points. Yes, adding a struct to that union is fine and clarifies things. And just because I'm weird like that, there's a u8 hole after iommu_cntr. Ok, I'll update this in V10 that I'll be sending out this week. Thanks, Suravee ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
intel-iommu sysfs oops.
cat /sys/devices/virtual/iommu/dmar0/intel-iommu/version Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC CPU: 2 PID: 1488 Comm: cat Not tainted 4.10.0-think+ #5 task: 8804ee440040 task.stack: c9d48000 RIP: 0010:intel_iommu_show_version+0x13/0x40 RSP: 0018:c9d4bcf0 EFLAGS: 00010286 RAX: 8804ede80008 RBX: 81ec2c80 RCX: RDX: RSI: 81ec2c80 RDI: 88017fc2d5d0 RBP: c9d4bcf0 R08: R09: 8804ede80008 R10: 068b13bb R11: 0006 R12: 1000 R13: 81a96f10 R14: 8804fdd131b8 R15: 8804ede80008 FS: 7feb8ad5e700() GS:880507c0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 0004ee49a000 CR4: 001406e0 Call Trace: dev_attr_show+0x20/0x50 ? sysfs_file_ops+0x46/0x70 sysfs_kf_seq_show+0xb7/0x110 kernfs_seq_show+0x26/0x30 seq_read+0x129/0x4a0 kernfs_fop_read+0x13b/0x1a0 __vfs_read+0x37/0x140 ? __context_tracking_exit.part.5+0x82/0x150 vfs_read+0xab/0x180 SyS_read+0x58/0xc0 do_syscall_64+0x61/0x170 entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7feb8a8875a0 RSP: 002b:7fff8dece608 EFLAGS: 0246 ORIG_RAX: RAX: ffda RBX: 0002 RCX: 7feb8a8875a0 RDX: 0002 RSI: 7feb8aba2000 RDI: 0003 RBP: 7feb8aba2000 R08: R09: R10: 037b R11: 0246 R12: 7feb8aba2000 R13: 0003 R14: 0002 R15: 1000 $ dmesg | grep -i dmar [0.00] ACPI: DMAR 0xD479B0B8 B8 (v01 LENOVO TC-FB 1B30 INTL 0001) [0.190199] DMAR: Host address width 39 [0.190208] DMAR: DRHD base: 0x00fed9 flags: 0x0 [0.190235] DMAR: dmar0: reg_base_addr fed9 ver 1:0 cap c020660462 ecap f0101a [0.190250] DMAR: DRHD base: 0x00fed91000 flags: 0x1 [0.190272] DMAR: dmar1: reg_base_addr fed91000 ver 1:0 cap d2008020660462 ecap f010da [0.190286] DMAR: RMRR base: 0x00d5e6c000 end: 0x00d5e92fff [0.190302] DMAR: RMRR base: 0x00d700 end: 0x00df1f [0.190318] DMAR-IR: IOAPIC id 8 under DRHD base 0xfed91000 IOMMU 1 [0.190330] DMAR-IR: HPET id 0 under DRHD base 0xfed91000 [0.190341] DMAR-IR: Queued invalidation will be enabled to support x2apic and Intr-remapping. 
[0.190634] DMAR-IR: Enabled IRQ remapping in x2apic mode [1.050338] DMAR: No ATSR found [1.050717] DMAR: dmar0: Using Queued invalidation [1.050739] DMAR: dmar1: Using Queued invalidation [1.050799] DMAR: Setting RMRR: [1.050938] DMAR: Setting identity map for device :00:02.0 [0xd700 - 0xdf1f] [1.051763] DMAR: Setting identity map for device :00:14.0 [0xd5e6c000 - 0xd5e92fff] [1.051911] DMAR: Setting identity map for device :00:1a.0 [0xd5e6c000 - 0xd5e92fff] [1.052054] DMAR: Setting identity map for device :00:1d.0 [0xd5e6c000 - 0xd5e92fff] [1.052083] DMAR: Prepare 0-16MiB unity mapping for LPC [1.052200] DMAR: Setting identity map for device :00:1f.0 [0x0 - 0xff] [1.052331] DMAR: Intel(R) Virtualization Technology for Directed I/O [1.220475] [drm] DMAR active, disabling use of stolen memory [1.653051] calling dmar_free_unused_resources+0x0/0xcf @ 1 [1.653054] initcall dmar_free_unused_resources+0x0/0xcf returned 0 after 1 usecs $ dmesg | grep -i iommu [0.190318] DMAR-IR: IOAPIC id 8 under DRHD base 0xfed91000 IOMMU 1 [0.568456] calling iommu_init+0x0/0x2b @ 1 [0.568470] initcall iommu_init+0x0/0x2b returned 0 after 0 usecs [0.570137] calling iommu_dev_init+0x0/0x19 @ 1 [0.570158] initcall iommu_dev_init+0x0/0x19 returned 0 after 0 usecs [1.050256] calling pci_iommu_init+0x0/0x3c @ 1 [1.052700] iommu: Adding device :00:00.0 to group 0 [1.052767] iommu: Adding device :00:02.0 to group 1 [1.052833] iommu: Adding device :00:03.0 to group 2 [1.052895] iommu: Adding device :00:14.0 to group 3 [1.052965] iommu: Adding device :00:16.0 to group 4 [1.053026] iommu: Adding device :00:19.0 to group 5 [1.053093] iommu: Adding device :00:1a.0 to group 6 [1.053157] iommu: Adding device :00:1b.0 to group 7 [1.053220] iommu: Adding device :00:1c.0 to group 8 [1.053286] iommu: Adding device :00:1c.3 to group 9 [1.053348] iommu: Adding device :00:1d.0 to group 10 [1.053443] iommu: Adding device :00:1f.0 to group 11 [1.053487] iommu: Adding device :00:1f.2 to group 11 [1.053529] 
iommu: Adding device :00:1f.3 to group 11 [1.053599] iommu: Adding device :02:00.0 to group 12 [1.065133] initcall pci_iommu_init+0x0/0x3c returned 0 after 14515 usecs ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/list
Re: [RFC PATCH v4 14/28] Add support to access boot related data in the clear
On 2/21/2017 9:06 AM, Borislav Petkov wrote: On Thu, Feb 16, 2017 at 09:45:09AM -0600, Tom Lendacky wrote: Boot data (such as EFI related data) is not encrypted when the system is booted and needs to be mapped decrypted. Add support to apply the proper attributes to the EFI page tables and to the early_memremap and memremap APIs to identify the type of data being accessed so that the proper encryption attribute can be applied. So this doesn't even begin to explain *why* we need this. The emphasis being on *why*. Lemme guess? kexec? And because of efi_reuse_config? Hmm... maybe I'm missing something here. This doesn't have anything to do with kexec or efi_reuse_config. This has to do with the fact that when a system boots the setup data and the EFI data are not encrypted. Since it's not encrypted we need to be sure that any early_memremap() and memremap() calls remove the encryption mask from the resulting pagetable entry that is created so the data can be accessed properly. If so, then that whole ad-hoc caching in parse_setup_data() needs to go. Especially if efi_reuse_config() already sees those addresses so while we're there, we could save them somewhere or whatnot. But not doing the whole thing again in parse_setup_data(). 
Signed-off-by: Tom Lendacky --- arch/x86/include/asm/io.h |3 + arch/x86/include/asm/setup.h |8 +++ arch/x86/kernel/setup.c| 33 arch/x86/mm/ioremap.c | 111 arch/x86/platform/efi/efi_64.c | 16 -- kernel/memremap.c | 11 mm/early_ioremap.c | 18 +- 7 files changed, 192 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 7afb0e2..833f7cc 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -381,4 +381,7 @@ extern int __must_check arch_phys_wc_add(unsigned long base, #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif +extern bool arch_memremap_do_ram_remap(resource_size_t offset, size_t size); +#define arch_memremap_do_ram_remap arch_memremap_do_ram_remap + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ac1d5da..8d9 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -63,6 +63,14 @@ static inline void x86_ce4100_early_setup(void) { } #include #include +struct setup_data_attrs { + u64 paddr; + unsigned long size; +}; + +extern struct setup_data_attrs setup_data_list[]; +extern unsigned int setup_data_list_count; + /* * This is set up by the setup-routine at boot-time */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bd5b9a7..d2234bf 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -148,6 +148,9 @@ int default_check_phys_apicid_present(int phys_apicid) struct boot_params boot_params; +struct setup_data_attrs setup_data_list[32]; +unsigned int setup_data_list_count; + /* * Machine setup.. 
*/ @@ -419,6 +422,32 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ +static void __init update_setup_data_list(u64 pa_data, unsigned long size) +{ + unsigned int i; + + for (i = 0; i < setup_data_list_count; i++) { + if (setup_data_list[i].paddr != pa_data) + continue; + + setup_data_list[i].size = size; + break; + } +} + +static void __init add_to_setup_data_list(u64 pa_data, unsigned long size) +{ + if (!sme_active()) + return; + + if (!WARN(setup_data_list_count == ARRAY_SIZE(setup_data_list), + "exceeded maximum setup data list slots")) { + setup_data_list[setup_data_list_count].paddr = pa_data; + setup_data_list[setup_data_list_count].size = size; + setup_data_list_count++; + } +} + static void __init parse_setup_data(void) { struct setup_data *data; @@ -428,12 +457,16 @@ static void __init parse_setup_data(void) while (pa_data) { u32 data_len, data_type; + add_to_setup_data_list(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); data_len = data->len + sizeof(struct setup_data); data_type = data->type; pa_next = data->next; early_memunmap(data, sizeof(*data)); + update_setup_data_list(pa_data, data_len); + switch (data_type) { case SETUP_E820_EXT: e820__memory_setup_extended(pa_data, data_len); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 2385e70..b0ff6bc 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -21,6 +22,7 @@ #include #include #include +#i
Re: [RFC PATCH v4 07/28] x86: Provide general kernel support for memory encryption
On 2/22/2017 12:13 PM, Dave Hansen wrote: On 02/16/2017 07:43 AM, Tom Lendacky wrote: static inline unsigned long pte_pfn(pte_t pte) { - return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pte_val(pte) & ~sme_me_mask & PTE_PFN_MASK) >> PAGE_SHIFT; } static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; + return (pmd_val(pmd) & ~sme_me_mask & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } Could you talk a bit about why you chose to do the "~sme_me_mask" bit in here instead of making it a part of PTE_PFN_MASK / pmd_pfn_mask(pmd)? I think that's a good catch. Let me look at it, but I believe that it should be possible to do and avoid what you're worried about below. Thanks, Tom It might not matter, but I'd be worried that this ends up breaking direct users of PTE_PFN_MASK / pmd_pfn_mask(pmd) since they now no longer mask the PFN out of a PTE. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: intel-iommu sysfs oops.
Hi Dave, On Thu, Feb 23, 2017 at 02:44:06PM -0500, Dave Jones wrote: > cat /sys/devices/virtual/iommu/dmar0/intel-iommu/version > > Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Thanks for the report, the problem reproduces easily here. The diff below fixes the issue on Intel, AMD has the same problem and needs a similar fix I'll do shortly. diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f5e02f8..ec3f6c8 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4730,11 +4730,16 @@ static int intel_iommu_cpu_dead(unsigned int cpu) return 0; } +static struct intel_iommu *dev_to_intel_iommu(struct device *dev) +{ + return container_of(dev, struct intel_iommu, iommu.dev); +} + static ssize_t intel_iommu_show_version(struct device *dev, struct device_attribute *attr, char *buf) { - struct intel_iommu *iommu = dev_get_drvdata(dev); + struct intel_iommu *iommu = dev_to_intel_iommu(dev); u32 ver = readl(iommu->reg + DMAR_VER_REG); return sprintf(buf, "%d:%d\n", DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); @@ -4745,7 +4750,7 @@ static ssize_t intel_iommu_show_address(struct device *dev, struct device_attribute *attr, char *buf) { - struct intel_iommu *iommu = dev_get_drvdata(dev); + struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%llx\n", iommu->reg_phys); } static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); @@ -4754,7 +4759,7 @@ static ssize_t intel_iommu_show_cap(struct device *dev, struct device_attribute *attr, char *buf) { - struct intel_iommu *iommu = dev_get_drvdata(dev); + struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%llx\n", iommu->cap); } static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); @@ -4763,7 +4768,7 @@ static ssize_t intel_iommu_show_ecap(struct device *dev, struct device_attribute *attr, char *buf) { - struct intel_iommu *iommu = dev_get_drvdata(dev); + struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, 
"%llx\n", iommu->ecap); } static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); @@ -4772,7 +4777,7 @@ static ssize_t intel_iommu_show_ndoms(struct device *dev, struct device_attribute *attr, char *buf) { - struct intel_iommu *iommu = dev_get_drvdata(dev); + struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); } static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); @@ -4781,7 +4786,7 @@ static ssize_t intel_iommu_show_ndoms_used(struct device *dev, struct device_attribute *attr, char *buf) { - struct intel_iommu *iommu = dev_get_drvdata(dev); + struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))); } ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu