[PATCH v3 10/51] PCI: Treat ROM resource as optional during realloc
So will try to allocate them together with must-have ones, if can not assign them, could go with must-have one only, and just skip ROM resources. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 37 - 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 292f2a5..3abf249 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -372,18 +372,10 @@ static void assign_requested_resources_sorted(struct list_head *head, idx = res - &dev_res->dev->resource[0]; if (resource_size(res) && pci_assign_resource(dev_res->dev, idx)) { - if (fail_head) { - /* -* if the failed res is for ROM BAR, and it will -* be enabled later, don't add it to the list -*/ - if (!((idx == PCI_ROM_RESOURCE) && - (!(res->flags & IORESOURCE_ROM_ENABLE - add_to_list(fail_head, - dev_res->dev, res, - 0 /* don't care */, - 0 /* don't care */); - } + if (fail_head) + add_to_list(fail_head, dev_res->dev, res, + 0 /* don't care */, + 0 /* don't care */); reset_resource(res); } } @@ -1143,6 +1135,19 @@ out: return good_align; } +static inline bool is_optional(int i) +{ + + if (i == PCI_ROM_RESOURCE) + return true; + +#ifdef CONFIG_PCI_IOV + if (i >= PCI_IOV_RESOURCES && i <= PCI_IOV_RESOURCE_END) + return true; +#endif + + return false; +} /** * pbus_size_mem() - size the memory window of a given bus * @@ -1199,10 +1204,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, r_size = resource_size(r); align = pci_resource_alignment(dev, r); -#ifdef CONFIG_PCI_IOV - /* put SRIOV requested res to the optional list */ - if (realloc_head && i >= PCI_IOV_RESOURCES && - i <= PCI_IOV_RESOURCE_END) { + /* put SRIOV/ROM res to realloc list */ + if (realloc_head && is_optional(i)) { add_to_align_test_list(&align_test_add_list, align, r_size); r->end = r->start - 1; @@ -1212,7 +1215,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, max_add_align = align; continue; } -#endif + if (align > (1ULL<<37)) 
{ /*128 Gb*/ dev_warn(&dev->dev, "disabling BAR %d: %pR (bad alignment %#llx)\n", i, r, (unsigned long long) align); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2] Do not reserve crashkernel high memory if crashkernel low memory reserving failed
On Tue, Jul 21, 2015 at 12:31 AM, Dave Young wrote: >> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c >> index 80f874b..36aeac3 100644 >> --- a/arch/x86/kernel/setup.c >> +++ b/arch/x86/kernel/setup.c >> @@ -513,7 +513,7 @@ static void __init >> memblock_x86_reserve_range_setup_data(void) >> # define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM >> #endif >> >> -static void __init reserve_crashkernel_low(void) >> +static int __init reserve_crashkernel_low(void) >> { >> #ifdef CONFIG_X86_64 >> const unsigned long long alignment = 16<<20;/* 16M */ >> @@ -542,7 +542,7 @@ static void __init reserve_crashkernel_low(void) >> } else { >> /* passed with crashkernel=0,low ? */ >> if (!low_size) >> - return; >> + return 0; >> } >> >> low_base = memblock_find_in_range(low_size, (1ULL<<32), >> @@ -552,7 +552,7 @@ static void __init reserve_crashkernel_low(void) >> if (!auto_set) >> pr_info("crashkernel low reservation failed - No >> suitable area found.\n"); >> >> - return; >> + return -EINVAL; >> } >> >> memblock_reserve(low_base, low_size); >> @@ -564,6 +564,7 @@ static void __init reserve_crashkernel_low(void) >> crashk_low_res.end = low_base + low_size - 1; >> insert_resource(&iomem_resource, &crashk_low_res); >> #endif >> + return 0; >> } >> >> static void __init reserve_crashkernel(void) >> @@ -613,6 +614,10 @@ static void __init reserve_crashkernel(void) >> return; >> } >> } >> + >> + if (crash_base >= (1ULL<<32) && reserve_crashkernel_low()) >> + return; >> + >> memblock_reserve(crash_base, crash_size); >> >> printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " >> @@ -624,9 +629,6 @@ static void __init reserve_crashkernel(void) >> crashk_res.start = crash_base; >> crashk_res.end = crash_base + crash_size - 1; >> insert_resource(&iomem_resource, &crashk_res); >> - >> - if (crash_base >= (1ULL<<32)) >> - reserve_crashkernel_low(); >> } >> #else >> static void __init reserve_crashkernel(void) No, you can not move the calling position for reserve_crashkernel_low(). 
old sequence: memblock_find_in_range for high memblock_reserve for high memblock_find_in_range for low memblock_reserve for low now you change to: memblock_find_in_range for high memblock_find_in_range for low memblock_reserve for low memblock_reserve for high During memblock_reserve, we may need to double the memblock reserve array. So there is a possibility that the new memblock reserve array overlaps with the range that was found (but not yet reserved) for crashdump high. So you should keep the old sequence, and if reserve_crashkernel_low fails, just call memblock_free to free the high range that was reserved before. Thanks Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2] Do not reserve crashkernel high memory if crashkernel low memory reserving failed
On Tue, Jul 21, 2015 at 5:59 PM, Baoquan He wrote: >> That commit should only be used to workaround some systems that >> have partial iommu support. > > Those big servers mostly has hardware iommu. But they still can > enable swiotlb suport. Then low memory is needed. Do you have the whole boot log? I don't understand why those systems can not use full iommu. Is it a BIOS problem or a HW/silicon limitation? Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2] Do not reserve crashkernel high memory if crashkernel low memory reserving failed
On Tue, Jul 21, 2015 at 9:47 PM, Minfei Huang wrote: > > Since low memory does not need for some machines, how about kexec does > not allocate low memory automatically, if cmdline does not specify the > option ",low". User shall know well, if they specify the cmdline with > option ",high". That was what I tried to do at that time. Some others thought that automatically setting a small value would be friendlier to users. Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2] Do not reserve crashkernel high memory if crashkernel low memory reserving failed
On Wed, Jul 22, 2015 at 3:11 AM, Joerg Roedel wrote: > On Tue, Jul 21, 2015 at 12:22:53PM -0700, Yinghai Lu wrote: >> On Tue, Jul 21, 2015 at 1:58 AM, Baoquan He wrote: >> >> > Maybe system which don't need low memory is rare, only for testing? >> >> No, it is not rare. >> >> All recent intel based systems with iommu support does not need low. > > All Intel-IOMMU systems have the iommu disabled by default (at least > that is the default in most distros). So low memory is definitly needed > by those systems too. Do those systems need crashkernel=,high? Do you mean BIOS have that disabled with not exposing DMAR table ? kernel for RHEL 6 and RHEL7 have them enabled. Also opensuse kernel have that enabled too. > >> that reserve 256M low always. and those 256M get wasted. >> >> That commit should only be used to workaround some systems that >> have partial iommu support. > > We currently lack the infrastructure for that, but I am happy to review > patches. How about letting subsystems announce their need for low > crash-kernel memory and allocate based on that? > > The subsystems (like iommu or swiotlb code, for example) could even > announce how much memory they need and we base our allocation on that. That would be hard, as we don't know if second kernel could take what kernel parameters. user could disable iommu etc from command kernel for second kernel. Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2] Do not reserve crashkernel high memory if crashkernel low memory reserving failed
On Tue, Jul 21, 2015 at 1:58 AM, Baoquan He wrote: > Maybe system which don't need low memory is rare, only for testing? No, it is not rare. All recent intel based systems with iommu support does not need low. And those systems get punished by following patch: | commit 94fb9334182284e8e7e4bcb9125c25dc33af19d4 | Author: Joerg Roedel | Date: Wed Jun 10 17:49:42 2015 +0200 | |x86/crash: Allocate enough low memory when crashkernel=high that reserve 256M low always. and those 256M get wasted. That commit should only be used to workaround some systems that have partial iommu support. Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] x86/mm: Assign the initial value to the pmd_idx
On Sun, Jul 12, 2015 at 5:18 AM, Minfei Huang wrote: > From: Minfei Huang > > The variable pmd_idx is undefined, when we try to start the loop to > calculate the page. > > Assign the proper value which indexes the start address to make it work > well. > > Signed-off-by: Minfei Huang > --- > arch/x86/mm/init_32.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c > index 8340e45..68aec42 100644 > --- a/arch/x86/mm/init_32.c > +++ b/arch/x86/mm/init_32.c > @@ -137,6 +137,7 @@ page_table_range_init_count(unsigned long start, unsigned > long end) > > vaddr = start; > pgd_idx = pgd_index(vaddr); > + pmd_idx = pmd_index(vaddr); > > for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { > for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); Reviewed-by: Yinghai Lu -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 00/49] PCI: Resource allocation cleanup for v4.3
On Thu, Jul 16, 2015 at 6:51 PM, Wei Yang wrote: > Yinghai, > > Tested your latest for for-pci-v4.3-next branch, it works fine on my P8 > machine. Thanks for testing. > > BTW, the SRIOV works fine too. Previously failure is based on my mistake, I > have disabled SRIOV :-( Good. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3] [SCSI] mpt2sas, mpt3sas: Abort initialization if no memory I/O resources detected
On Wed, Jul 15, 2015 at 6:52 AM, Timothy Pearson wrote: >> I have just kept the same description provide by Timothy in his >> initial patch. >> >> But I observe that their may be chance of getting "unable to handle >> kernel NULL pointer dereference" kernel panic if no Memory Resource >> available in the PCI subsystem. So agreed to the Timothy proposal of >> aborting the driver initialization if it doesn't detect any Memory >> resource instead of whole system get into panic state. >> > On some systems Linux is unable / unwilling to assign a BAR if the BIOS > does not assign one at startup. I didn't look into the Linux allocator > side of things in much detail, but it is quite possible that Linux is > unaware the device only has partial resources assigned. > It would be great if you could post the boot log so we can figure out why those BARs are not assigned. Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3] [SCSI] mpt2sas, mpt3sas: Abort initialization if no memory I/O resources detected
On Tue, Jul 14, 2015 at 9:49 PM, Sreekanth Reddy wrote: > Driver crashes if the BIOS do not set up at least one > memory I/O resource. This failure can happen if the device is too > slow to respond during POST and is missed by the BIOS, but Linux > then detects the device later in the boot process. But the PCI subsystem should assign resources to those unassigned BARs. Do you mean that even the kernel can not assign resources to them? Or does it take so long for the mpt FW to get ready? Thanks Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 17/36] PCI: Add support for more than two alt_size under same bridge
On Tue, Jul 14, 2015 at 8:07 PM, Yijing Wang wrote: > On 2015/7/7 7:39, Yinghai Lu wrote: >> Need to increase size to make sure it could fit all alt entries. >> >> So at last, we use 8M/17M as parent bridge alt_align/alt_size. > > Tested-by: Yijing Wang Thanks for testing. > > Hi Yinghai, does this patch depend on the previous items in this patchset ? Yes, it depends most of patches from patch1 to this patch. > Could you provide another version of this patch for stable branch, eg. 3.10 > stable ? That is RHEL 7 kernel, right ? After those patches get into upstream, I will try to port them to 3.10 stable. Thanks Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 35/42] x86, boot, PCI: Convert SETUP_PCI data to list
On Tue, Jul 14, 2015 at 3:35 PM, Bjorn Helgaas wrote: >> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c >> index 16ace12..32d4f21 100644 >> --- a/arch/x86/pci/common.c >> +++ b/arch/x86/pci/common.c > >> +struct firmware_setup_pci_entry { >> + struct list_head list; >> + uint16_t vendor; >> + uint16_t devid; >> + uint64_t pcilen; > > Is there a reason to use uint16_t and uint64_t instead of u16 and u64? keep them same as arch/x86/include/asm/pci.h::pci_setup_rom. and we have that from: commit dd5fc854de5fd37adfcef8a366cd21a55aa01d3d Author: Matthew Garrett Date: Wed Dec 5 14:33:26 2012 -0700 EFI: Stash ROMs if they're not in the PCI BAR EFI provides support for providing PCI ROMs via means other than the ROM BAR. This support vanishes after we've exited boot services, so add support for stashing copies of the ROMs in setup_data if they're not otherwise available. Signed-off-by: Matthew Garrett Signed-off-by: Bjorn Helgaas Tested-by: Seth Forshee diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 6e41b93..dba7805 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -171,4 +171,16 @@ cpumask_of_pcibus(const struct pci_bus *bus) } #endif +struct pci_setup_rom { + struct setup_data data; + uint16_t vendor; + uint16_t devid; + uint64_t pcilen; + unsigned long segment; + unsigned long bus; + unsigned long device; + unsigned long function; + uint8_t romdata[0]; +}; + #endif /* _ASM_X86_PCI_H */ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 27/49] PCI: Unify calculate_size for io port and mmio
We should check size+size1 with min_size for io port. For example, when hotplug bridge has two children bridges, every child bridge will need 0x1000, so size1 will be 0x2000 and size is 0. The min_size for the hotplug bridge is 0x100. with old version calculate_iosize, we get 0x3000 for final size because we are using size to compare with min_size. That is not right, we should use 0x2000 instead. After this change, calculate_memsize and calculate_iosize is the same. Change them to calculate_size. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 27 ++- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 9d5e550..969a0b1 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1116,23 +1116,7 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, return NULL; } -static resource_size_t calculate_iosize(resource_size_t size, - resource_size_t min_size, - resource_size_t size1, - resource_size_t old_size, - resource_size_t align) -{ - if (size < min_size) - size = min_size; - if (old_size == 1) - old_size = 0; - size = ALIGN(size + size1, align); - if (size < old_size) - size = old_size; - return size; -} - -static resource_size_t calculate_memsize(resource_size_t size, +static resource_size_t calculate_size(resource_size_t size, resource_size_t min_size, resource_size_t old_size, resource_size_t align) @@ -1257,14 +1241,15 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, } size = size_aligned_for_isa(size); - size0 = calculate_iosize(size, min_size, size1, + size += size1; + size0 = calculate_size(size, min_size, resource_size(b_res), min_align); sum_add_size = size_aligned_for_isa(sum_add_size); sum_add_size += sum_add_size1; if (sum_add_size < min_sum_size) sum_add_size = min_sum_size; size1 = !realloc_head ? 
size0 : - calculate_iosize(sum_add_size, min_size, 0, + calculate_size(sum_add_size, min_size, resource_size(b_res), min_align); if (!size0 && !size1) { if (b_res->start || b_res->end) @@ -1617,7 +1602,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, if (size || min_size) { min_align = calculate_mem_align(&align_test_list, max_align, size, window_align); - size0 = calculate_memsize(size, min_size, + size0 = calculate_size(size, min_size, resource_size(b_res), min_align); } free_align_test_list(&align_test_list); @@ -1642,7 +1627,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_add_align = calculate_mem_align(&align_test_add_list, max_add_align, sum_add_size, window_align); - size1 = calculate_memsize(sum_add_size, min_size, + size1 = calculate_size(sum_add_size, min_size, resource_size(b_res), min_add_align); } free_align_test_list(&align_test_add_list); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 10/49] PCI: Treat ROM resource as optional during realloc
So will try to allocate them together with must-have ones, if can not assign them, could go with must-have one only, and just skip ROM resources. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 37 - 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 292f2a5..3abf249 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -372,18 +372,10 @@ static void assign_requested_resources_sorted(struct list_head *head, idx = res - &dev_res->dev->resource[0]; if (resource_size(res) && pci_assign_resource(dev_res->dev, idx)) { - if (fail_head) { - /* -* if the failed res is for ROM BAR, and it will -* be enabled later, don't add it to the list -*/ - if (!((idx == PCI_ROM_RESOURCE) && - (!(res->flags & IORESOURCE_ROM_ENABLE - add_to_list(fail_head, - dev_res->dev, res, - 0 /* don't care */, - 0 /* don't care */); - } + if (fail_head) + add_to_list(fail_head, dev_res->dev, res, + 0 /* don't care */, + 0 /* don't care */); reset_resource(res); } } @@ -1143,6 +1135,19 @@ out: return good_align; } +static inline bool is_optional(int i) +{ + + if (i == PCI_ROM_RESOURCE) + return true; + +#ifdef CONFIG_PCI_IOV + if (i >= PCI_IOV_RESOURCES && i <= PCI_IOV_RESOURCE_END) + return true; +#endif + + return false; +} /** * pbus_size_mem() - size the memory window of a given bus * @@ -1199,10 +1204,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, r_size = resource_size(r); align = pci_resource_alignment(dev, r); -#ifdef CONFIG_PCI_IOV - /* put SRIOV requested res to the optional list */ - if (realloc_head && i >= PCI_IOV_RESOURCES && - i <= PCI_IOV_RESOURCE_END) { + /* put SRIOV/ROM res to realloc list */ + if (realloc_head && is_optional(i)) { add_to_align_test_list(&align_test_add_list, align, r_size); r->end = r->start - 1; @@ -1212,7 +1215,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, max_add_align = align; continue; } -#endif + if (align > (1ULL<<37)) 
{ /*128 Gb*/ dev_warn(&dev->dev, "disabling BAR %d: %pR (bad alignment %#llx)\n", i, r, (unsigned long long) align); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 07/49] PCI: Reorder resources list for must/optional resources
After we update size and alignment for must+optional resource, we reorder them with new alignment, but this is only for STARTALIGN. For SIZEALIGN type resource, after add back add_size, the alignment get changed, so need to do the sorting like STARTALIGN type resources. Also we need to reorder the sorting back after we restore resource to must only when must+optional fail to allocate for all. So move out the reordering code from the loop to separated function, and call it two times accordingly. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 62 + 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 7346bbf..6f2d508 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -287,6 +287,31 @@ static inline void reset_resource(struct resource *res) res->flags = 0; } +static void __sort_resources(struct list_head *head) +{ + struct pci_dev_resource *res1, *tmp_res, *res2; + + list_for_each_entry_safe(res1, tmp_res, head, list) { + resource_size_t align1, size1, align2, size2; + + align1 = pci_resource_alignment(res1->dev, res1->res); + size1 = resource_size(res1->res); + + /* reorder it */ + list_for_each_entry(res2, head, list) { + if (res2 == res1) + break; + + align2 = pci_resource_alignment(res2->dev, res2->res); + size2 = resource_size(res2->res); + if (is_before(align1, size1, align2, size2)) { + list_move_tail(&res1->list, &res2->list); + break; + } + } + } +} + /** * reassign_resources_sorted() - satisfy any additional resource requests * @@ -449,9 +474,9 @@ static void __assign_resources_sorted(struct list_head *head, LIST_HEAD(save_head); LIST_HEAD(local_fail_head); struct pci_dev_resource *save_res; - struct pci_dev_resource *dev_res, *tmp_res, *dev_res2; + struct pci_dev_resource *dev_res, *tmp_res; unsigned long fail_type; - resource_size_t add_align, align; + resource_size_t add_align; /* Check if optional add_size is there */ if (!realloc_head || 
list_empty(realloc_head)) @@ -466,47 +491,32 @@ static void __assign_resources_sorted(struct list_head *head, } /* Update res in head list with add_size in realloc_head list */ - list_for_each_entry_safe(dev_res, tmp_res, head, list) { + list_for_each_entry(dev_res, head, list) { dev_res->res->end += get_res_add_size(realloc_head, dev_res->res); /* * There are two kinds of additional resources in the list: -* 1. bridge resource -- IORESOURCE_STARTALIGN -* 2. SR-IOV resource -- IORESOURCE_SIZEALIGN -* Here just fix the additional alignment for bridge +* 1. bridge resource with IORESOURCE_STARTALIGN +*need to update start to change alignment +* 2. resource with IORESOURCE_SIZEALIGN +*update size above already change alignment. */ if (!(dev_res->res->flags & IORESOURCE_STARTALIGN)) continue; add_align = get_res_add_align(realloc_head, dev_res->res); - /* -* The "head" list is sorted by the alignment to make sure -* resources with bigger alignment will be assigned first. -* After we change the alignment of a dev_res in "head" list, -* we need to reorder the list by alignment to make it -* consistent. -*/ - if (add_align > dev_res->res->start) { + if (add_align) { resource_size_t r_size = resource_size(dev_res->res); dev_res->res->start = add_align; dev_res->res->end = add_align + r_size - 1; - - list_for_each_entry(dev_res2, head, list) { - align = pci_resource_alignment(dev_res2->dev, - dev_res2->res); - if (add_align > align) { - list_move_tail(&dev_res->list, - &dev_res2->list); - break; - } - } } - } + __so
[PATCH v2 01/49] PCI: Cleanup res_to_dev_res() printout for addon resources
Now get_res_add_size and get_res_add_align all have same printout from res_to_dev_res(), and it is confusing. Move out debug messages printout from res_to_dev_res(), and later we will reuse res_to_dev_res() in other functions. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 34 -- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 508cc56..f0fa705 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -104,19 +104,9 @@ static struct pci_dev_resource *res_to_dev_res(struct list_head *head, { struct pci_dev_resource *dev_res; - list_for_each_entry(dev_res, head, list) { - if (dev_res->res == res) { - int idx = res - &dev_res->dev->resource[0]; - - dev_printk(KERN_DEBUG, &dev_res->dev->dev, -"res[%d]=%pR res_to_dev_res add_size %llx min_align %llx\n", -idx, dev_res->res, -(unsigned long long)dev_res->add_size, -(unsigned long long)dev_res->min_align); - + list_for_each_entry(dev_res, head, list) + if (dev_res->res == res) return dev_res; - } - } return NULL; } @@ -127,7 +117,15 @@ static resource_size_t get_res_add_size(struct list_head *head, struct pci_dev_resource *dev_res; dev_res = res_to_dev_res(head, res); - return dev_res ? dev_res->add_size : 0; + if (!dev_res || !dev_res->add_size) + return 0; + + dev_printk(KERN_DEBUG, &dev_res->dev->dev, + "BAR %d: %pR get_res_add_size add_size %llx\n", + (int)(res - &dev_res->dev->resource[0]), + res, (unsigned long long)dev_res->add_size); + + return dev_res->add_size; } static resource_size_t get_res_add_align(struct list_head *head, @@ -136,7 +134,15 @@ static resource_size_t get_res_add_align(struct list_head *head, struct pci_dev_resource *dev_res; dev_res = res_to_dev_res(head, res); - return dev_res ? 
dev_res->min_align : 0; + if (!dev_res || !dev_res->min_align) + return 0; + + dev_printk(KERN_DEBUG, &dev_res->dev->dev, + "BAR %d: %pR get_res_add_align min_align %llx\n", + (int)(res - &dev_res->dev->resource[0]), + res, (unsigned long long)dev_res->min_align); + + return dev_res->min_align; } -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 00/49] PCI: Resource allocation cleanup for v4.3
Hi, After 5b28541552ef (PCI: Restrict 64-bit prefetchable bridge windows to 64-bit resources), we have several reports on resource allocation failure, and we try to fix the problem with resource clip, and get more problems. One is realloc fail with two graphics cards above 4G. One is from sparc that have problem with clip as we don't parse mem64 for it. Other report is about pci remove/rescan does not work on some setup when BIOS tend to allocate small bus size. This patchset enhance resource allocation to address those problems. 1. optimize bus mmio alignment calculation. 2. optimize bus mmio optional alignment calculation. 3. add support for alt size to prefer small bus size to small bus alignment. when we have small resource window on parent bridges. 4. treat ROM bar as optional resource. 5. during allocation, will pick up just fit resource. 6. parse MEM64 for sparc and other system with OF. 7. treat non-pref mmio64 if parent bridges are all pcie. 8. restore old pref allocation logic if hostbridge does not support mmio64 really. 9. don't realloc resource if device firmware does not support bar change. 10. add pci=assign_pref_bars to clear and assign pref bars. 11. don't clear resource when allocation fails. I put latest copy at: git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git for-pci-v4.3-next That is based on kind of v4.2-rc2. v2: - fix resource_disabled allocation from powerpc - other warnings that were found by Fengguang's build robot. - io port alignment clean up. - rebased to v4.2-rc2 - better for two alt_size support. 
Thanks Yinghai Yinghai Lu (49): PCI: Cleanup res_to_dev_res() printout for addon resources PCI: Reuse res_to_dev_res in reassign_resources_sorted PCI: Use correct align for optional only resources during sorting PCI: Optimize bus align/size calculation during sizing PCI: Optimize bus align/size calculation for optional during sizing PCI: Don't add too much optional size for hotplug bridge mmio PCI: Reorder resources list for must/optional resources PCI: Remove duplicated code for resource sorting PCI: Rename pdev_sort_resources to pdev_check_resources PCI: Treat ROM resource as optional during realloc PCI: Add debug printout during releasing partial assigned resources PCI: Simplify res reference using in __assign_resourcs_sorted PCI: Separate realloc list checking after allocation PCI: Add __add_to_list() PCI: Cache window alignment value PCI: Check if resource is allocated before pci_assign PCI: Separate out save_resources/restore_resource PCI: Move comment to pci_need_to_release() PCI: Separate must+optional assigning to another function PCI: Skip must+optional if there is no optional addon PCI: Move saved required resource list out of must+optional assigning PCI: Add alt_size allocation support PCI: Add support for more than two alt_size under same bridge PCI: Better support for two alt_size PCI: Don't add too much optional size for hotplug bridge io PCI: Move ISA ioport align out of calculate_iosize PCI: Unifiy calculate_size for io port and mmio PCI: Allow optional only io resource must size to be 0 PCI: Unify skip_ioresource_align() PCI: Kill macro checking for bus io port sizing resources: Split out __allocate_resource() resources: Make allocate_resource return just fit resource PCI: Check pref compatible bit for mem64 resource of pcie device PCI: Only treat non-pef mmio64 as pref if all bridges has MEM_64 PCI: Add has_mem64 for host_bridge PCI: Only treat non-pef mmio64 as pref if host-bridge has_mem64 PCI: Restore pref mmio allocation logic for hostbridge 
without mmio64 sparc/PCI: Add mem64 resource parsing for root bus sparc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in of parsing powerpc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in of parsing of/PCI: Add IORESOURCE_MEM_64 for 64-bit resource PCI: Treat optional as must in first try for bridge rescan PCI: Get new realloc size for bridge for last try PCI: Don't release sibiling bridge resources during hotplug PCI: Don't release fixed resource for realloc PCI: Set resource to FIXED for lsi devices PCI, x86: Add pci=assign_pref_bars to re-allocate pref bars PCI: Introduce resource_disabled() PCI: Don't set flags to 0 when assign resource fail arch/alpha/kernel/pci.c |2 +- arch/ia64/pci/pci.c |4 +- arch/microblaze/pci/pci-common.c | 23 +- arch/mn10300/unit-asb2305/pci-asb2305.c |4 +- arch/mn10300/unit-asb2305/pci.c |4 +- arch/powerpc/kernel/pci-common.c | 27 +- arch/powerpc/kernel/pci_of_scan.c |4 +- arch/powerpc/platforms/powernv/pci-ioda.c | 12 +- arch/s390/pci/pci.c |2 +- arch/sparc/kernel/of_device_32.c |5 +- arch/sparc/kernel/of_device_64.
[PATCH v2 15/49] PCI: Cache window alignment value
There are several calls to window_alignment(), and we will have more for alt_size support. Cache the value instead of recomputing it each time. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 57b5c09..1b5fbca 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1203,6 +1203,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, mask | IORESOURCE_PREFETCH, type); LIST_HEAD(align_test_list); LIST_HEAD(align_test_add_list); + resource_size_t window_align; if (!b_res) return -ENOSPC; @@ -1212,6 +1213,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_size = 0; } + window_align = window_alignment(bus, b_res->flags); + list_for_each_entry(dev, &bus->devices, bus_list) { int i; @@ -1272,10 +1275,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, } } - max_align = max(max_align, window_alignment(bus, b_res->flags)); + max_align = max(max_align, window_align); if (size || min_size) { min_align = calculate_mem_align(&align_test_list, max_align, -size, window_alignment(bus, b_res->flags)); + size, window_align); size0 = calculate_memsize(size, min_size, resource_size(b_res), min_align); } @@ -1286,7 +1289,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, if (sum_add_size > size && realloc_head) { min_add_align = calculate_mem_align(&align_test_add_list, max_add_align, sum_add_size, - window_alignment(bus, b_res->flags)); + window_align); size1 = calculate_memsize(sum_add_size, min_size, resource_size(b_res), min_add_align); } -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 06/49] PCI: Don't add too much optional size for hotplug bridge mmio
Current code will always add 2M for hotplug bridge mmio even there is child device under it. For example: 40:03.0 --- 43:00.0 --- 44:02.0 -+- 45:00.0 \- 45:00.1 44:02.0 will need 1M as must for 45:00.0 and 45:00.1 When we calculate add_size for 44:02.0, we pass 2M as additional size for hotplug bridge, total will be 3M. That is different from code before changes for optional support, or even current code that treat optional as must directly by not passing realloc head. We only need 2M as total. The optional size should be 1M, and total size should be 2M. This patch change to comparing must+optional with min_sum_size to get smaller optional size. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 28 +++- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 4c7f25f..7346bbf 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1156,7 +1156,6 @@ out: * @type2: second match type * @type3: third match type * @min_size : the minimum memory window that must to be allocated - * @add_size : additional optional memory window * @realloc_head : track the additional memory window on this list * * Calculate the size of the bus and minimal alignment which @@ -1169,10 +1168,11 @@ out: static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long type, unsigned long type2, unsigned long type3, -resource_size_t min_size, resource_size_t add_size, +resource_size_t min_size, struct list_head *realloc_head) { struct pci_dev *dev; + resource_size_t min_sum_size = 0; resource_size_t min_align = 0, min_add_align = 0; resource_size_t max_align = 0, max_add_align = 0; resource_size_t size = 0, size0 = 0, size1 = 0, sum_add_size = 0; @@ -1184,6 +1184,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, if (!b_res) return -ENOSPC; + if (realloc_head) { + min_sum_size = min_size; + min_size = 0; + } + list_for_each_entry(dev, &bus->devices, bus_list) { int i; @@ -1254,8 
+1259,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, } free_align_test_list(&align_test_list); - if ((sum_add_size - size) < add_size) - sum_add_size = size + add_size; + if (sum_add_size < min_sum_size) + sum_add_size = min_sum_size; if (sum_add_size > size && realloc_head) { min_add_align = calculate_mem_align(&align_test_add_list, max_add_align, sum_add_size, @@ -1392,7 +1397,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) { struct pci_dev *dev; unsigned long mask, prefmask, type2 = 0, type3 = 0; - resource_size_t additional_mem_size = 0, additional_io_size = 0; + resource_size_t min_mem_size = 0, additional_io_size = 0; struct resource *b_res; int ret; @@ -1426,7 +1431,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) pci_bridge_check_ranges(bus); if (bus->self->is_hotplug_bridge) { additional_io_size = pci_hotplug_io_size; - additional_mem_size = pci_hotplug_mem_size; + min_mem_size = pci_hotplug_mem_size; } /* Fall through */ default: @@ -1445,8 +1450,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) prefmask |= IORESOURCE_MEM_64; ret = pbus_size_mem(bus, prefmask, prefmask, prefmask, prefmask, - realloc_head ? 0 : additional_mem_size, - additional_mem_size, realloc_head); + min_mem_size, realloc_head); /* * If successful, all non-prefetchable resources @@ -1469,8 +1473,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) prefmask &= ~IORESOURCE_MEM_64; ret = pbus_size_mem(bus, prefmask, prefmask, prefmask, prefmask, -realloc_head ? 0 : additional_mem_size, -additional_mem_size, realloc_head); +min_mem_size, realloc_head); /* * If successful, only non-prefetchable resources @@ -1479,7 +1482,7 @@ void
[PATCH v2 08/49] PCI: Remove duplicated code for resource sorting
Both __sort_resources and pdev_sort_resources now contain sorting code. As we are going to call __sort_resources from several places later, keep __sort_resources and remove the duplicated sorting code from pdev_sort_resources. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 22 +++--- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 6f2d508..6642a60 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -215,9 +215,8 @@ static void pdev_sort_resources(struct pci_dev *dev, for (i = 0; i < PCI_NUM_RESOURCES; i++) { struct resource *r; - struct pci_dev_resource *dev_res, *tmp; + struct pci_dev_resource *tmp; resource_size_t r_align; - struct list_head *n; r = &dev->resource[i]; @@ -240,22 +239,7 @@ static void pdev_sort_resources(struct pci_dev *dev, tmp->res = r; tmp->dev = dev; - /* fallback is smallest one or list is empty*/ - n = head; - list_for_each_entry(dev_res, head, list) { - resource_size_t align; - - align = __pci_resource_alignment(dev_res->dev, -dev_res->res, -realloc_head); - - if (r_align > align) { - n = &dev_res->list; - break; - } - } - /* Insert it just before n*/ - list_add_tail(&tmp->list, n); + list_add_tail(&tmp->list, head); } } @@ -558,9 +542,9 @@ static void __assign_resources_sorted(struct list_head *head, } free_list(&save_head); +requested_and_reassign: __sort_resources(head); -requested_and_reassign: /* Satisfy the must-have resource requests */ assign_requested_resources_sorted(head, fail_head); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 19/49] PCI: Separate must+optional assigning to another function
__assign_resources_sorted() is getting too big if we put alt_size support into it. Split must_add assigning code out to another function. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 47 +++ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index f8b9a24..d1f9e19 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -473,20 +473,9 @@ static void restore_resource(struct pci_dev_resource *save_res, res->flags = save_res->flags; } -static void __assign_resources_sorted(struct list_head *head, -struct list_head *realloc_head, -struct list_head *fail_head) +static bool __assign_resources_must_add_sorted(struct list_head *head, +struct list_head *realloc_head) { - /* -* Should not assign requested resources at first. -* they could be adjacent, so later reassign can not reallocate -* them one by one in parent resource window. -* Try to assign requested + add_size at beginning -* if could do that, could get out early. -* if could not do that, we still try to assign requested at first, -*then try to reassign add_size for some resources. 
-*/ - LIST_HEAD(save_head); LIST_HEAD(local_fail_head); struct pci_dev_resource *save_res; @@ -495,12 +484,8 @@ static void __assign_resources_sorted(struct list_head *head, resource_size_t add_align; struct resource *res; - /* Check if optional add_size is there */ - if (!realloc_head || list_empty(realloc_head)) - goto requested_and_reassign; - if (!save_resources(head, &save_head)) - goto requested_and_reassign; + return false; /* Update res in head list with add_size in realloc_head list */ list_for_each_entry(dev_res, head, list) { @@ -539,7 +524,8 @@ static void __assign_resources_sorted(struct list_head *head, remove_from_list(realloc_head, dev_res->res); free_list(&save_head); free_list(head); - return; + + return true; } /* check failed type */ @@ -574,7 +560,28 @@ static void __assign_resources_sorted(struct list_head *head, free_list(&save_head); -requested_and_reassign: + return false; +} + +static void __assign_resources_sorted(struct list_head *head, +struct list_head *realloc_head, +struct list_head *fail_head) +{ + /* +* Should not assign requested resources at first. +* they could be adjacent, so later reassign can not reallocate +* them one by one in parent resource window. +* Try to assign requested + add_size at beginning +* if could do that, could get out early. +* if could not do that, we still try to assign requested at first, +*then try to reassign add_size for some resources. +*/ + + /* Check must+optional add */ + if (realloc_head && !list_empty(realloc_head) && + __assign_resources_must_add_sorted(head, realloc_head)) + return; + __sort_resources(head); /* Satisfy the must-have resource requests */ -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 02/49] PCI: Reuse res_to_dev_res in reassign_resources_sorted
Now that res_to_dev_res() no longer prints a debug message, reuse it in reassign_resources_sorted(). Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 11 +-- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index f0fa705..247d8fe 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -240,26 +240,17 @@ static void reassign_resources_sorted(struct list_head *realloc_head, { struct resource *res; struct pci_dev_resource *add_res, *tmp; - struct pci_dev_resource *dev_res; resource_size_t add_size, align; int idx; list_for_each_entry_safe(add_res, tmp, realloc_head, list) { - bool found_match = false; - res = add_res->res; /* skip resource that has been reset */ if (!res->flags) goto out; /* skip this resource if not found in head list */ - list_for_each_entry(dev_res, head, list) { - if (dev_res->res == res) { - found_match = true; - break; - } - } - if (!found_match)/* just skip */ + if (!res_to_dev_res(head, res)) continue; idx = res - &add_res->dev->resource[0]; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 13/49] PCI: Separate realloc list checking after allocation
We check the realloc list, as list must be empty after allocation. Separate the realloc list checking to another function. Add checking that is missed in acpiphp driver. Signed-off-by: Yinghai Lu Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: linux-a...@vger.kernel.org --- drivers/pci/hotplug/acpiphp_glue.c | 1 + drivers/pci/pci.h | 1 + drivers/pci/setup-bus.c| 11 --- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index ff53856..134caee 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -507,6 +507,7 @@ static void enable_slot(struct acpiphp_slot *slot) } } __pci_bus_assign_resources(bus, &add_list, NULL); + __pci_bus_check_realloc(&add_list); acpiphp_sanitize_bus(bus); pcie_bus_configure_settings(bus); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 4ff0ff1..2b83977 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -235,6 +235,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, void __pci_bus_assign_resources(const struct pci_bus *bus, struct list_head *realloc_head, struct list_head *fail_head); +void __pci_bus_check_realloc(struct list_head *realloc_head); bool pci_bus_clip_resource(struct pci_dev *dev, int idx); void pci_reassigndev_resource_alignment(struct pci_dev *dev); diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 06664db..f30225c 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -350,6 +350,11 @@ out: } } +void __pci_bus_check_realloc(struct list_head *realloc_head) +{ + BUG_ON(!list_empty(realloc_head)); +} + /** * assign_requested_resources_sorted() - satisfy resource requests * @@ -1861,7 +1866,7 @@ again: /* Depth last, allocate resources and update the hardware. */ __pci_bus_assign_resources(bus, add_list, &fail_head); if (add_list) - BUG_ON(!list_empty(add_list)); + __pci_bus_check_realloc(add_list); tried_times++; /* any device complain? 
*/ @@ -1936,7 +1941,7 @@ void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge) again: __pci_bus_size_bridges(parent, &add_list); __pci_bridge_assign_resources(bridge, &add_list, &fail_head); - BUG_ON(!list_empty(&add_list)); + __pci_bus_check_realloc(&add_list); tried_times++; if (list_empty(&fail_head)) @@ -1995,6 +2000,6 @@ void pci_assign_unassigned_bus_resources(struct pci_bus *bus) &add_list); up_read(&pci_bus_sem); __pci_bus_assign_resources(bus, &add_list, NULL); - BUG_ON(!list_empty(&add_list)); + __pci_bus_check_realloc(&add_list); } EXPORT_SYMBOL_GPL(pci_assign_unassigned_bus_resources); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 12/49] PCI: Simplify res reference using in __assign_resources_sorted
Use res instead of dev_res->res. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 32 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 6dff258..06664db 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -453,6 +453,7 @@ static void __assign_resources_sorted(struct list_head *head, struct pci_dev_resource *dev_res, *tmp_res; unsigned long fail_type; resource_size_t add_align; + struct resource *res; /* Check if optional add_size is there */ if (!realloc_head || list_empty(realloc_head)) @@ -468,8 +469,8 @@ static void __assign_resources_sorted(struct list_head *head, /* Update res in head list with add_size in realloc_head list */ list_for_each_entry(dev_res, head, list) { - dev_res->res->end += get_res_add_size(realloc_head, - dev_res->res); + res = dev_res->res; + res->end += get_res_add_size(realloc_head, res); /* * There are two kinds of additional resources in the list: @@ -478,16 +479,16 @@ static void __assign_resources_sorted(struct list_head *head, * 2. resource with IORESOURCE_SIZEALIGN *update size above already change alignment. 
*/ - if (!(dev_res->res->flags & IORESOURCE_STARTALIGN)) + if (!(res->flags & IORESOURCE_STARTALIGN)) continue; - add_align = get_res_add_align(realloc_head, dev_res->res); + add_align = get_res_add_align(realloc_head, res); if (add_align) { - resource_size_t r_size = resource_size(dev_res->res); + resource_size_t r_size = resource_size(res); - dev_res->res->start = add_align; - dev_res->res->end = add_align + r_size - 1; + res->start = add_align; + res->end = add_align + r_size - 1; } } @@ -509,21 +510,21 @@ static void __assign_resources_sorted(struct list_head *head, /* check failed type */ fail_type = pci_fail_res_type_mask(&local_fail_head); /* remove not need to be released assigned res from head list etc */ - list_for_each_entry_safe(dev_res, tmp_res, head, list) - if (dev_res->res->parent && - !pci_need_to_release(fail_type, dev_res->res)) { + list_for_each_entry_safe(dev_res, tmp_res, head, list) { + res = dev_res->res; + if (res->parent && !pci_need_to_release(fail_type, res)) { /* remove it from realloc_head list */ - remove_from_list(realloc_head, dev_res->res); - remove_from_list(&save_head, dev_res->res); + remove_from_list(realloc_head, res); + remove_from_list(&save_head, res); list_del(&dev_res->list); kfree(dev_res); } + } free_list(&local_fail_head); /* Release assigned resource */ list_for_each_entry(dev_res, head, list) { - struct resource *res = dev_res->res; - + res = dev_res->res; if (res->parent) { dev_printk(KERN_DEBUG, &dev_res->dev->dev, "BAR %d: released %pR\n", @@ -534,8 +535,7 @@ static void __assign_resources_sorted(struct list_head *head, } /* Restore start/end/flags from saved list */ list_for_each_entry(save_res, &save_head, list) { - struct resource *res = save_res->res; - + res = save_res->res; res->start = save_res->start; res->end = save_res->end; res->flags = save_res->flags; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More 
majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 03/49] PCI: Use correct align for optional only resources during sorting
During sorting before assign, we only put resource with non-zero align in the sorted list, so for optional resources that must size is 0 and only have addon parts, we need to have correct align. While treating SRIOV as optional resources, we always read alignment for SRIOV bars, so they are ok. Hotplug bridge resources are using STARTALIGN so it is ok when size is 0 if we have correct start for them. Later we want to treat the ROM BAR as optional resource, and it has have SIZEALIGN, we need to find a way to get align for them. We can use addon resource align instead in that case, and it will be ok for SRIOV path and hotplug bridge resource path. Sorted list will contain must resource align/size to 0/0 to hold spot for optional resources. We need to pass realloc_head from sizing stage to sorting stage, and get entry from realloc list and calculate align from the entry. Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431 Reported-by: TJ Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 50 ++--- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 247d8fe..27cb0f0 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -145,9 +145,43 @@ static resource_size_t get_res_add_align(struct list_head *head, return dev_res->min_align; } +static resource_size_t __pci_resource_alignment( + struct pci_dev *dev, + struct resource *r, + struct list_head *realloc_head) +{ + resource_size_t r_align = pci_resource_alignment(dev, r); + resource_size_t orig_start, orig_end; + struct pci_dev_resource *dev_res; + + if (r_align || !realloc_head) + return r_align; + + dev_res = res_to_dev_res(realloc_head, r); + if (!dev_res || !dev_res->add_size) + return r_align; + + orig_start = r->start; + orig_end = r->end; + r->end += dev_res->add_size; + if ((r->flags & IORESOURCE_STARTALIGN)) { + resource_size_t r_size = resource_size(r); + resource_size_t add_align = dev_res->min_align; + + r->start = 
add_align; + r->end = add_align + r_size - 1; + } + r_align = pci_resource_alignment(dev, r); + r->start = orig_start; + r->end = orig_end; + + return r_align; +} /* Sort resources by alignment */ -static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head) +static void pdev_sort_resources(struct pci_dev *dev, +struct list_head *realloc_head, +struct list_head *head) { int i; @@ -165,7 +199,7 @@ static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head) if (!(r->flags) || r->parent) continue; - r_align = pci_resource_alignment(dev, r); + r_align = __pci_resource_alignment(dev, r, realloc_head); if (!r_align) { dev_warn(&dev->dev, "BAR %d: %pR has bogus alignment\n", i, r); @@ -183,8 +217,9 @@ static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head) list_for_each_entry(dev_res, head, list) { resource_size_t align; - align = pci_resource_alignment(dev_res->dev, -dev_res->res); + align = __pci_resource_alignment(dev_res->dev, +dev_res->res, +realloc_head); if (r_align > align) { n = &dev_res->list; @@ -197,6 +232,7 @@ static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head) } static void __dev_sort_resources(struct pci_dev *dev, +struct list_head *realloc_head, struct list_head *head) { u16 class = dev->class >> 8; @@ -213,7 +249,7 @@ static void __dev_sort_resources(struct pci_dev *dev, return; } - pdev_sort_resources(dev, head); + pdev_sort_resources(dev, realloc_head, head); } static inline void reset_resource(struct resource *res) @@ -501,7 +537,7 @@ static void pdev_assign_resources_sorted(struct pci_dev *dev, { LIST_HEAD(head); - __dev_sort_resources(dev, &head); + __dev_sort_resources(dev, add_head, &head); __assign_resources_sorted(&head, add_head, fail_head); } @@ -514,7 +550,7 @@ static void pbus_assign_resources_sorted(const
[PATCH v2 04/49] PCI: Optimize bus align/size calculation during sizing
The current code tries to get the alignment as small as possible and uses that to align the final size. But it does not handle resources whose size is bigger than their alignment in an optimal way; the kernel only uses the max alignment for them. For example: when we have resources with align/size: 1M/2M, 512M/512M, bus resource min_align/size0 will be 512M/1024M, but the optimal value should be 256M/768M. The following cases have a resource size that is bigger than the resource alignment: 1. SRIOV bar. 2. PCI bridges with several bridges or devices as children. We can keep on trying to allocate the children devices' resources under the range [half_align, half_align + aligned_size). If that succeeds, we can use that half_align as the new min_align. After this patch, we get: align/size: 1M/2M, 2M/4M, 4M/8M, 8M/16M new min_align/min_size: 4M/32M, and old is 8M/32M align/size: 1M/2M, 2M/4M, 4M/8M new min_align/min_size: 2M/14M, and old is 4M/16M align/size: 1M/2M, 512M/512M new min_align/min_size: 256M/768M, and old is 512M/1024M The real result from one system with one PCIe card that has four functions that support SR-IOV: align/size: 0080/0080 0080/0080 0080/0080 0080/0080 0001/0020 0001/0020 0001/0020 0001/0020 8000/8000 8000/8000 8000/8000 8000/8000 4000/0008 4000/0008 4000/0008 4000/0008 old min_align/min_size: 0040/02c0 min_align/min_size: 0010/02b0 So align will be 1M instead of 4M. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431 Reported-by: TJ Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 195 ++-- 1 file changed, 157 insertions(+), 38 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 27cb0f0..ecdf011 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -30,6 +30,34 @@ unsigned int pci_flags; +static inline bool is_before(resource_size_t align1, resource_size_t size1, +resource_size_t align2, resource_size_t size2) +{ + resource_size_t size1_left, size2_left; + + /* big align is before small align */ + if (align1 > align2) + return true; + + /* +* for same align: +* aligned is before not aligned +* for not aligned, big remainder is before small remainder +*/ + if (align1 == align2) { + size1_left = size1 & (align1 - 1); + if (!size1_left) + size1_left = align1; + size2_left = size2 & (align2 - 1); + if (!size2_left) + size2_left = align2; + if (size1_left > size2_left) + return true; + } + + return false; +} + struct pci_dev_resource { struct list_head list; struct resource *res; @@ -999,26 +1027,125 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, } } -static inline resource_size_t calculate_mem_align(resource_size_t *aligns, - int max_order) +struct align_test_res { + struct list_head list; + struct resource res; + resource_size_t size; + resource_size_t align; +}; + +static void free_align_test_list(struct list_head *head) { - resource_size_t align = 0; - resource_size_t min_align = 0; - int order; + struct align_test_res *p, *tmp; - for (order = 0; order <= max_order; order++) { - resource_size_t align1 = 1; + list_for_each_entry_safe(p, tmp, head, list) { + list_del(&p->list); + kfree(p); + } +} - align1 <<= (order + 20); +static int add_to_align_test_list(struct list_head *head, + resource_size_t align, resource_size_t size) +{ + struct align_test_res *tmp; + + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + tmp->align 
= align; + tmp->size = size; + + list_add_tail(&tmp->list, head); + + return 0; +} + +static void __sort_align_test(struct list_head *head) +{ + struct align_test_res *res1, *tmp_res, *res2; - if (!align) - min_align = align1; - else if (ALIGN(align + min_align, min_align) < align1) - min_align = align1 >> 1; - align += aligns[order]; + list_for_each_entry_safe(res1, tmp_res, head, list) { + /* reorder it */ + list_for_each_entry(res2, head, list) { + if (res2 == res1) + break; + + if (is_before(res1->align, res1->size, +
[PATCH v2 17/49] PCI: Separate out save_resources/restore_resource
will reuse it in alt_size support. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 49 ++--- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 1622ad2..2e3d00b 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -436,6 +436,29 @@ static bool pci_need_to_release(unsigned long mask, struct resource *res) return false; /* should not get here */ } +static bool save_resources(struct list_head *head, + struct list_head *save_head) +{ + struct pci_dev_resource *dev_res; + + /* Save original start, end, flags etc at first */ + list_for_each_entry(dev_res, head, list) + if (add_to_list(save_head, dev_res->dev, dev_res->res)) { + free_list(save_head); + return false; + } + + return true; +} + +static void restore_resource(struct pci_dev_resource *save_res, +struct resource *res) +{ + res->start = save_res->start; + res->end = save_res->end; + res->flags = save_res->flags; +} + static void __assign_resources_sorted(struct list_head *head, struct list_head *realloc_head, struct list_head *fail_head) @@ -473,13 +496,8 @@ static void __assign_resources_sorted(struct list_head *head, if (!realloc_head || list_empty(realloc_head)) goto requested_and_reassign; - /* Save original start, end, flags etc at first */ - list_for_each_entry(dev_res, head, list) { - if (add_to_list(&save_head, dev_res->dev, dev_res->res)) { - free_list(&save_head); - goto requested_and_reassign; - } - } + if (!save_resources(head, &save_head)) + goto requested_and_reassign; /* Update res in head list with add_size in realloc_head list */ list_for_each_entry(dev_res, head, list) { @@ -548,12 +566,9 @@ static void __assign_resources_sorted(struct list_head *head, } } /* Restore start/end/flags from saved list */ - list_for_each_entry(save_res, &save_head, list) { - res = save_res->res; - res->start = save_res->start; - res->end = save_res->end; - res->flags = save_res->flags; - } + list_for_each_entry(save_res, 
&save_head, list) + restore_resource(save_res, save_res->res); + free_list(&save_head); requested_and_reassign: @@ -1917,9 +1932,7 @@ again: list_for_each_entry(fail_res, &fail_head, list) { struct resource *res = fail_res->res; - res->start = fail_res->start; - res->end = fail_res->end; - res->flags = fail_res->flags; + restore_resource(fail_res, res); if (fail_res->dev->subordinate) res->flags = 0; } @@ -1983,9 +1996,7 @@ again: list_for_each_entry(fail_res, &fail_head, list) { struct resource *res = fail_res->res; - res->start = fail_res->start; - res->end = fail_res->end; - res->flags = fail_res->flags; + restore_resource(fail_res, res); if (fail_res->dev->subordinate) res->flags = 0; } -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 05/49] PCI: Optimize bus align/size calculation for optional during sizing
Current add_align always use max align, that make must+optional to get allocated more than needed in some cases. Now we have new calculate_mem_align, we could use it for add_align calculation. Need to create separated list for must+optional align/size info. After that we can get smaller add_align/size, we have more chance to make must+optional to get allocated. The result for bridge that have Intel 4x10g card installed. pci :20:03.2: bridge window [mem 0x-0x000f 64bit pref] to [bus 2a-31] calculate_mem for must ===BEGIN align/size: 0080/0080 0080/0080 0080/0080 0080/0080 8000/8000 8000/8000 8000/8000 8000/8000 old min_align/min_size: 0040/0240 min_align/min_size: 0040/0240 ===END pci :20:03.2: bridge window [mem 0x-0x000f 64bit pref] to [bus 2a-31] calculate_mem for add ===BEGIN align/size: 0080/0080 0080/0080 0080/0080 0080/0080 0001/0020 0001/0020 0001/0020 0001/0020 8000/8000 8000/8000 8000/8000 8000/8000 4000/0008 4000/0008 4000/0008 4000/0008 old min_align/min_size: 0080/0300 min_align/min_size: 0010/02b0 ===END so must align/size: 0x40/0x240, and new must+optional align/size: 0x10/0x2b0, and it is better than old must+optional align/size: 0x80/0x300 Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431 Reported-by: TJ Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 82 ++--- 1 file changed, 51 insertions(+), 31 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index ecdf011..4c7f25f 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -901,7 +901,6 @@ static resource_size_t calculate_iosize(resource_size_t size, static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, resource_size_t old_size, resource_size_t align) { @@ -911,7 +910,7 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + size = ALIGN(size, align); return size; } @@ -1174,44 +1173,45 
@@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, struct list_head *realloc_head) { struct pci_dev *dev; - resource_size_t min_align, align, size, size0, size1; - resource_size_t max_align = 0; + resource_size_t min_align = 0, min_add_align = 0; + resource_size_t max_align = 0, max_add_align = 0; + resource_size_t size = 0, size0 = 0, size1 = 0, sum_add_size = 0; struct resource *b_res = find_free_bus_resource(bus, mask | IORESOURCE_PREFETCH, type); - resource_size_t children_add_size = 0; - resource_size_t children_add_align = 0; - resource_size_t add_align = 0; LIST_HEAD(align_test_list); + LIST_HEAD(align_test_add_list); if (!b_res) return -ENOSPC; - size = 0; - list_for_each_entry(dev, &bus->devices, bus_list) { int i; for (i = 0; i < PCI_NUM_RESOURCES; i++) { struct resource *r = &dev->resource[i]; - resource_size_t r_size; + resource_size_t r_size, align; if (r->parent || ((r->flags & mask) != type && (r->flags & mask) != type2 && (r->flags & mask) != type3)) continue; + r_size = resource_size(r); + align = pci_resource_alignment(dev, r); #ifdef CONFIG_PCI_IOV /* put SRIOV requested res to the optional list */ if (realloc_head && i >= PCI_IOV_RESOURCES && i <= PCI_IOV_RESOURCE_END) { - add_align = max(pci_resource_alignment(dev, r), add_align); + add_to_align_test_list(&align_test_add_list, + align, r_size); r->end = r->start - 1; add_to_list(realloc_head, dev, r, r_size, 0/* don't care */); -
[PATCH v2 26/49] PCI: Move ISA ioport align out of calculate_iosize
So we could unify calculate_iosize and calculate_memsize later. when one bridge have several children devices, and every devices have several io port resources and resource size < 0x400. We need to check size, and add extra size to make sure bit8/9 to be zero. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 39 +++ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index de55e07..9d5e550 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1126,11 +1126,6 @@ static resource_size_t calculate_iosize(resource_size_t size, size = min_size; if (old_size == 1) old_size = 0; - /* To be fixed in 2.5: we should have sort of HAVE_ISA - flag in the struct pci_bus. */ -#if defined(CONFIG_ISA) || defined(CONFIG_EISA) - size = (size & 0xff) + ((size & ~0xffUL) << 2); -#endif size = ALIGN(size + size1, align); if (size < old_size) size = old_size; @@ -1184,6 +1179,18 @@ static resource_size_t window_alignment(struct pci_bus *bus, return max(align, arch_align); } +static resource_size_t size_aligned_for_isa(resource_size_t size) +{ + /* +* To be fixed in 2.5: we should have sort of HAVE_ISA +* flag in the struct pci_bus. 
+*/ +#if defined(CONFIG_ISA) || defined(CONFIG_EISA) + size = (size & 0xff) + ((size & ~0xffUL) << 2); +#endif + return size; +} + /** * pbus_size_io() - size the io window of a given bus * @@ -1201,11 +1208,10 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, { struct pci_dev *dev; resource_size_t min_sum_size = 0; - resource_size_t sum_add_size; struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO, IORESOURCE_IO); resource_size_t size = 0, size0 = 0, size1 = 0; - resource_size_t children_add_size = 0; + resource_size_t sum_add_size = 0, sum_add_size1 = 0; resource_size_t min_align, align; if (!b_res) @@ -1222,7 +1228,7 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, for (i = 0; i < PCI_NUM_RESOURCES; i++) { struct resource *r = &dev->resource[i]; - unsigned long r_size; + unsigned long r_size, r_add_size; if (r->parent || !(r->flags & IORESOURCE_IO)) continue; @@ -1238,18 +1244,27 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, if (align > min_align) min_align = align; - if (realloc_head) - children_add_size += get_res_add_size(realloc_head, r); + if (realloc_head) { + r_add_size = get_res_add_size(realloc_head, r); + r_add_size += r_size; + if (r_add_size < 0x400) + /* Might be re-aligned for ISA */ + sum_add_size += r_add_size; + else + sum_add_size1 += r_add_size; + } } } + size = size_aligned_for_isa(size); size0 = calculate_iosize(size, min_size, size1, resource_size(b_res), min_align); - sum_add_size = children_add_size + size + size1; + sum_add_size = size_aligned_for_isa(sum_add_size); + sum_add_size += sum_add_size1; if (sum_add_size < min_sum_size) sum_add_size = min_sum_size; size1 = !realloc_head ? 
size0 : - calculate_iosize(size, min_size, sum_add_size - size, + calculate_iosize(sum_add_size, min_size, 0, resource_size(b_res), min_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 41/49] of/PCI: Add IORESOURCE_MEM_64 for 64-bit resource
For device resource PREF bit setting under a bridge 64-bit pref resource, we need to make sure to only set PREF for 64-bit resources, so set IORESOURCE_MEM_64 for 64-bit resources during OF device resource flags parsing. Link: https://bugzilla.kernel.org/show_bug.cgi?id=96261 Link: https://bugzilla.kernel.org/show_bug.cgi?id=96241 Signed-off-by: Yinghai Lu Cc: Grant Likely Cc: Rob Herring Cc: devicet...@vger.kernel.org --- drivers/of/address.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index 8bfda6a..073125f 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -128,9 +128,11 @@ static unsigned int of_bus_pci_get_flags(const __be32 *addr) flags |= IORESOURCE_IO; break; case 0x02: /* 32 bits */ - case 0x03: /* 64 bits */ flags |= IORESOURCE_MEM; break; + case 0x03: /* 64 bits */ + flags |= IORESOURCE_MEM | IORESOURCE_MEM_64; + break; } if (w & 0x4000) flags |= IORESOURCE_PREFETCH; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 11/49] PCI: Add debug printout during releasing partial assigned resources
We try to assign must+optional at first, and we only accept the result if all resources get allocated. Otherwise will release assigned in the list, and try to assign must and expand to optional. We have to do that to make sure any must has priority than any optional addon. When that happens, we only print out "assigned" info, that is confusing as it looks like same range is assigned to two peer resources at the same time. Add printout for releasing so we have whole picture in debug messages. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 3abf249..6dff258 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -521,9 +521,17 @@ static void __assign_resources_sorted(struct list_head *head, free_list(&local_fail_head); /* Release assigned resource */ - list_for_each_entry(dev_res, head, list) - if (dev_res->res->parent) - release_resource(dev_res->res); + list_for_each_entry(dev_res, head, list) { + struct resource *res = dev_res->res; + + if (res->parent) { + dev_printk(KERN_DEBUG, &dev_res->dev->dev, + "BAR %d: released %pR\n", + (int)(res - &dev_res->dev->resource[0]), + res); + release_resource(res); + } + } /* Restore start/end/flags from saved list */ list_for_each_entry(save_res, &save_head, list) { struct resource *res = save_res->res; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 31/49] resources: Split out __allocate_resource()
It will not hold lock, so we could use it in other functions that hold the resource lock already. -v2: according to Linus, using "bool lock" as parameter aka "conditionally take lock" is *wrong*. Signed-off-by: Yinghai Lu Acked-by: Linus Torvalds --- kernel/resource.c | 70 +++ 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 90552aa..830cc11 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -619,7 +619,7 @@ static int find_resource(struct resource *root, struct resource *new, } /** - * reallocate_resource - allocate a slot in the resource tree given range & alignment. + * __reallocate_resource - allocate a slot in the resource tree given range & alignment. * The resource will be relocated if the new size cannot be reallocated in the * current location. * @@ -628,7 +628,7 @@ static int find_resource(struct resource *root, struct resource *new, * @newsize: new size of the resource descriptor * @constraint: the size and alignment constraints to be met. */ -static int reallocate_resource(struct resource *root, struct resource *old, +static int __reallocate_resource(struct resource *root, struct resource *old, resource_size_t newsize, struct resource_constraint *constraint) { @@ -636,8 +636,6 @@ static int reallocate_resource(struct resource *root, struct resource *old, struct resource new = *old; struct resource *conflict; - write_lock(&resource_lock); - if ((err = __find_resource(root, old, &new, newsize, constraint))) goto out; @@ -662,14 +660,13 @@ static int reallocate_resource(struct resource *root, struct resource *old, BUG_ON(conflict); } out: - write_unlock(&resource_lock); return err; } - /** - * allocate_resource - allocate empty slot in the resource tree given range & alignment. - * The resource will be reallocated with a new size if it was already allocated + * __allocate_resource - allocate empty slot in the resource tree given range & alignment. 
+ * The resource will be reallocated with a new size if it was already + * allocated * @root: root resource descriptor * @new: resource descriptor desired by caller * @size: requested resource region size @@ -678,15 +675,17 @@ out: * @align: alignment requested, in bytes * @alignf: alignment function, optional, called if not NULL * @alignf_data: arbitrary data to pass to the @alignf function + * + * Caller need to hold resource_lock if needed. */ -int allocate_resource(struct resource *root, struct resource *new, - resource_size_t size, resource_size_t min, - resource_size_t max, resource_size_t align, - resource_size_t (*alignf)(void *, - const struct resource *, - resource_size_t, - resource_size_t), - void *alignf_data) +static int __allocate_resource(struct resource *root, struct resource *new, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, + resource_size_t (*alignf)(void *, + const struct resource *, + resource_size_t, + resource_size_t), + void *alignf_data) { int err; struct resource_constraint constraint; @@ -700,20 +699,51 @@ int allocate_resource(struct resource *root, struct resource *new, constraint.alignf = alignf; constraint.alignf_data = alignf_data; - if ( new->parent ) { + if (new->parent) { /* resource is already allocated, try reallocating with the new constraints */ - return reallocate_resource(root, new, size, &constraint); + return __reallocate_resource(root, new, size, &constraint); } - write_lock(&resource_lock); err = find_resource(root, new, size, &constraint); if (err >= 0 && __request_resource(root, new)) err = -EBUSY; - write_unlock(&resource_lock); + return err; } +/** + * allocate_resource - allocate empty slot in the resource tree given range & alignment. 
+ * The resource will be reallocated with a new size if it was already + * allocated + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * @size: requested resource region size + * @min: minimum boundary to allocate + *
[PATCH v2 39/49] sparc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in of parsing
For device resource PREF bit setting under a bridge 64-bit pref resource, we need to make sure to only set PREF for 64-bit resources, so set IORESOURCE_MEM_64 for 64-bit resources during OF device resource flags parsing. Link: https://bugzilla.kernel.org/show_bug.cgi?id=96261 Link: https://bugzilla.kernel.org/show_bug.cgi?id=96241 Signed-off-by: Yinghai Lu Cc: "David S. Miller" Cc: sparcli...@vger.kernel.org --- arch/sparc/kernel/of_device_32.c | 5 +++-- arch/sparc/kernel/of_device_64.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/sparc/kernel/of_device_32.c b/arch/sparc/kernel/of_device_32.c index 185aa96..3e9f273 100644 --- a/arch/sparc/kernel/of_device_32.c +++ b/arch/sparc/kernel/of_device_32.c @@ -83,11 +83,12 @@ static unsigned long of_bus_pci_get_flags(const u32 *addr, unsigned long flags) case 0x01: flags |= IORESOURCE_IO; break; - case 0x02: /* 32 bits */ - case 0x03: /* 64 bits */ flags |= IORESOURCE_MEM; break; + case 0x03: /* 64 bits */ + flags |= IORESOURCE_MEM | IORESOURCE_MEM_64; + break; } if (w & 0x4000) flags |= IORESOURCE_PREFETCH; diff --git a/arch/sparc/kernel/of_device_64.c b/arch/sparc/kernel/of_device_64.c index 7bbdc26..defee61 100644 --- a/arch/sparc/kernel/of_device_64.c +++ b/arch/sparc/kernel/of_device_64.c @@ -146,11 +146,12 @@ static unsigned long of_bus_pci_get_flags(const u32 *addr, unsigned long flags) case 0x01: flags |= IORESOURCE_IO; break; - case 0x02: /* 32 bits */ - case 0x03: /* 64 bits */ flags |= IORESOURCE_MEM; break; + case 0x03: /* 64 bits */ + flags |= IORESOURCE_MEM | IORESOURCE_MEM_64; + break; } if (w & 0x4000) flags |= IORESOURCE_PREFETCH; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 32/49] resources: Make allocate_resource return just fit resource
Find all suitable empty slots and pick one just fit, so we could save the big slot for needed ones later when we have several pcie switches and some bridges get assigned bios and we need to assign others in kernel. Signed-off-by: Yinghai Lu --- kernel/resource.c | 81 ++- 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 830cc11..c630ef1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -48,6 +48,7 @@ struct resource_constraint { resource_size_t (*alignf)(void *, const struct resource *, resource_size_t, resource_size_t); void *alignf_data; + bool fit; }; static DEFINE_RWLOCK(resource_lock); @@ -554,12 +555,15 @@ static void resource_clip(struct resource *res, resource_size_t min, * alignment constraints */ static int __find_resource(struct resource *root, struct resource *old, -struct resource *new, +struct resource *new, struct resource *avail, resource_size_t size, struct resource_constraint *constraint) { struct resource *this = root->child; - struct resource tmp = *new, avail, alloc; + struct resource tmp = *new, availx, alloc; + + if (!avail || avail == new) + avail = &availx; tmp.start = root->start; /* @@ -583,15 +587,16 @@ static int __find_resource(struct resource *root, struct resource *old, arch_remove_reservations(&tmp); /* Check for overflow after ALIGN() */ - avail.start = ALIGN(tmp.start, constraint->align); - avail.end = tmp.end; - avail.flags = new->flags & ~IORESOURCE_UNSET; - if (avail.start >= tmp.start) { - alloc.flags = avail.flags; - alloc.start = constraint->alignf(constraint->alignf_data, &avail, + avail->start = ALIGN(tmp.start, constraint->align); + avail->end = tmp.end; + avail->flags = new->flags & ~IORESOURCE_UNSET; + if (avail->start >= tmp.start) { + alloc.flags = avail->flags; + alloc.start = constraint->alignf( + constraint->alignf_data, avail, size, constraint->align); alloc.end = alloc.start + size - 1; - if (resource_contains(&avail, &alloc)) { + if 
(resource_contains(avail, &alloc)) { new->start = alloc.start; new->end = alloc.end; return 0; @@ -608,6 +613,11 @@ next: if (!this || this->end == root->end) return -EBUSY; } +struct good_resource { + struct list_head list; + struct resource avail; + struct resource new; +}; /* * Find empty slot in the resource tree given range and alignment. */ @@ -615,7 +625,49 @@ static int find_resource(struct resource *root, struct resource *new, resource_size_t size, struct resource_constraint *constraint) { - return __find_resource(root, NULL, new, size, constraint); + int ret = -1; + LIST_HEAD(head); + struct good_resource *good, *tmp; + resource_size_t avail_size = (resource_size_t)-1ULL; + + if (!constraint->fit) + return __find_resource(root, NULL, new, NULL, size, + constraint); + + /* find all suitable ones and add to the list */ + for (;;) { + good = kzalloc(sizeof(*good), GFP_KERNEL); + if (!good) + break; + + good->new.start = new->start; + good->new.end = new->end; + good->new.flags = new->flags; + ret = __find_resource(root, NULL, &good->new, &good->avail, + size, constraint); + if (ret || __request_resource(root, &good->avail)) { + ret = -EBUSY; + kfree(good); + break; + } + + list_add(&good->list, &head); + } + + /* pick up the smallest one and delete the list */ + list_for_each_entry_safe(good, tmp, &head, list) { + if (resource_size(&good->avail) < avail_size) { + avail_size = resource_size(&good->avail); + new->start = good->new.start; + new->end = good->new.end; + ret = 0; +
[PATCH v2 34/49] PCI: Only treat non-pref mmio64 as pref if all bridges have MEM_64
If any bridge up to root only have 32bit pref mmio, We don't need to treat device non-pref mmio64 as as pref mmio64. We need to move pci_bridge_check_ranges calling early. for parent bridges pref mmio BAR may not allocated by BIOS, res flags is still 0, we need to have it correct set before we check them for child device resources. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 3a1d659..8a8e5a7 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1038,6 +1038,18 @@ int pci_claim_bridge_resource(struct pci_dev *bridge, int i) return -EINVAL; } +static bool pci_up_path_over_pref_mem64(struct pci_bus *bus) +{ + if (pci_is_root_bus(bus)) + return true; + + if (bus->self && !(bus->self->resource[PCI_BRIDGE_RESOURCES + 2].flags & + IORESOURCE_MEM_64)) + return false; + + return pci_up_path_over_pref_mem64(bus->parent); +} + int pci_resource_pref_compatible(const struct pci_dev *dev, struct resource *res) { @@ -1046,7 +1058,8 @@ int pci_resource_pref_compatible(const struct pci_dev *dev, if ((res->flags & IORESOURCE_MEM) && (res->flags & IORESOURCE_MEM_64) && - dev->on_all_pcie_path) + dev->on_all_pcie_path && + pci_up_path_over_pref_mem64(dev->bus)) return res->flags | IORESOURCE_PREFETCH; return res->flags; @@ -1816,6 +1829,10 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) struct resource *b_res; int ret; + if (!pci_is_root_bus(bus) && + (bus->self->class >> 8) == PCI_CLASS_BRIDGE_PCI) + pci_bridge_check_ranges(bus); + list_for_each_entry(dev, &bus->devices, bus_list) { struct pci_bus *b = dev->subordinate; if (!b) @@ -1843,7 +1860,6 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) break; case PCI_CLASS_BRIDGE_PCI: - pci_bridge_check_ranges(bus); if (bus->self->is_hotplug_bridge) { min_io_size = pci_hotplug_io_size; min_mem_size = 
pci_hotplug_mem_size; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 36/49] PCI: Only treat non-pref mmio64 as pref if host-bridge has_mem64
If the host bridge does not have mmio64 above 4G, we don't need to treat device non-pref mmio64 as pref mmio64. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 8a8e5a7..37d5a48 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1041,7 +1041,7 @@ int pci_claim_bridge_resource(struct pci_dev *bridge, int i) static bool pci_up_path_over_pref_mem64(struct pci_bus *bus) { if (pci_is_root_bus(bus)) - return true; + return to_pci_host_bridge(bus->bridge)->has_mem64; if (bus->self && !(bus->self->resource[PCI_BRIDGE_RESOURCES + 2].flags & IORESOURCE_MEM_64)) -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 49/49] PCI: Don't set flags to 0 when assign resource fail
make flags take IORESOURCE_UNSET | IORESOURCE_DISABLED instead. Signed-off-by: Yinghai Lu --- drivers/pci/bus.c | 2 +- drivers/pci/setup-bus.c | 45 +++-- drivers/pci/setup-res.c | 3 ++- 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index b043bdf..b68f1cd 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -140,7 +140,7 @@ static int pci_bus_alloc_from_region(struct pci_bus *bus, struct resource *res, type_mask |= IORESOURCE_TYPE_BITS; pci_bus_for_each_resource(bus, r, i) { - if (!r) + if (!r || resource_disabled(r)) continue; /* type_mask must match */ diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 7734be6..a7fbdcc 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -289,13 +289,6 @@ static void __dev_check_resources(struct pci_dev *dev, pdev_check_resources(dev, realloc_head, head); } -static inline void reset_resource(struct resource *res) -{ - res->start = 0; - res->end = 0; - res->flags = 0; -} - static void __sort_resources(struct list_head *head) { struct pci_dev_resource *res1, *tmp_res, *res2; @@ -398,7 +391,7 @@ static void reassign_resources_sorted(struct list_head *realloc_head, res->start = align; res->end = res->start + add_size - 1; if (pci_assign_resource(add_res->dev, idx)) - reset_resource(res); + res->flags |= IORESOURCE_DISABLED; } else { /* could just assigned with alt, add difference ? 
*/ resource_size_t must_size; @@ -451,7 +444,7 @@ static void assign_requested_resources_sorted(struct list_head *head, pci_assign_resource(dev_res->dev, idx)) { if (fail_head) add_to_list(fail_head, dev_res->dev, res); - reset_resource(res); + res->flags |= IORESOURCE_DISABLED; } } } @@ -737,7 +730,7 @@ static void __assign_resources_alt_sorted(struct list_head *head, if (!res_to_dev_res(local_fail_head, res)) add_to_list(local_fail_head, fail_res->dev, res); - reset_resource(res); + res->flags |= IORESOURCE_DISABLED; } free_list(&local_alt_fail_head); } @@ -903,7 +896,7 @@ static void pci_setup_bridge_io(struct pci_dev *bridge) /* Set up the top and bottom of the PCI I/O segment for this bus. */ res = &bridge->resource[PCI_BRIDGE_RESOURCES + 0]; pcibios_resource_to_bus(bridge->bus, &region, res); - if (res->flags & IORESOURCE_IO) { + if ((res->flags & IORESOURCE_IO) && !(res->flags & IORESOURCE_UNSET)) { pci_read_config_word(bridge, PCI_IO_BASE, &l); io_base_lo = (region.start >> 8) & io_mask; io_limit_lo = (region.end >> 8) & io_mask; @@ -933,7 +926,8 @@ static void pci_setup_bridge_mmio(struct pci_dev *bridge) /* Set up the top and bottom of the PCI Memory segment for this bus. 
*/ res = &bridge->resource[PCI_BRIDGE_RESOURCES + 1]; pcibios_resource_to_bus(bridge->bus, &region, res); - if (res->flags & IORESOURCE_MEM) { + if ((res->flags & IORESOURCE_MEM) && + !(res->flags & IORESOURCE_UNSET)) { l = (region.start >> 16) & 0xfff0; l |= region.end & 0xfff0; dev_info(&bridge->dev, " bridge window %pR\n", res); @@ -958,7 +952,8 @@ static void pci_setup_bridge_mmio_pref(struct pci_dev *bridge) bu = lu = 0; res = &bridge->resource[PCI_BRIDGE_RESOURCES + 2]; pcibios_resource_to_bus(bridge->bus, &region, res); - if (res->flags & IORESOURCE_PREFETCH) { + if ((res->flags & IORESOURCE_PREFETCH) && + !(res->flags & IORESOURCE_UNSET)) { l = (region.start >> 16) & 0xfff0; l |= region.end & 0xfff0; if (res->flags & IORESOURCE_MEM_64) { @@ -1077,6 +1072,7 @@ static void pci_bridge_check_ranges(struct pci_bus *bus) b_res = &bridge->resource[PCI_BRIDGE_RESOURCES]; b_res[1].flags |= IORESOURCE_MEM; + b_res[1].flags &= ~IORESOURCE_DISABLED; pci_read_config_word(bridge, PCI_IO_BASE, &io); if (!io) { @@ -1084,8 +1080,10 @@ static void pci_bridge_check_ranges(struct pci_bus *bus) pci_read_config_word(bridge, PCI_IO_BASE, &io);
[PATCH v2 45/49] PCI: Don't release fixed resource for realloc
We should not release bridge resource if there is fixed resources under it, otherwise the children firmware would stop working. Reported-by: Paul Johnson Suggested-by: Bjorn Helgaas Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=92351 Signed-off-by: Yinghai Lu Cc: sta...@vger.kernel.org --- drivers/pci/setup-bus.c | 6 -- include/linux/ioport.h | 2 +- kernel/resource.c | 28 ++-- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index dc9ba41..9d5423c 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -2059,14 +2059,16 @@ static void pci_bridge_release_resources(struct pci_bus *bus, r = &b_res[idx]; - if (!r->parent) + if (!r->parent || r->flags & IORESOURCE_PCI_FIXED) return; /* * if there are children under that, we should release them * all */ - release_child_resources(r); + if (!release_child_resources(r)) + return; + if (!release_resource(r)) { type = old_flags = r->flags & type_mask; dev_printk(KERN_DEBUG, &dev->dev, "resource %d %pR released\n", diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 388e3ae..27dbb18 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -141,7 +141,7 @@ extern struct resource iomem_resource; extern struct resource *request_resource_conflict(struct resource *root, struct resource *new); extern int request_resource(struct resource *root, struct resource *new); extern int release_resource(struct resource *new); -void release_child_resources(struct resource *new); +bool release_child_resources(struct resource *new); extern void reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name); diff --git a/kernel/resource.c b/kernel/resource.c index c630ef1..0285f11 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -276,11 +276,35 @@ static void __release_child_resources(struct resource *r) } } -void release_child_resources(struct resource *r) +static bool 
__has_fixed_child_resources(struct resource *r) { + struct resource *p; + + p = r->child; + while (p) { + if (p->flags & IORESOURCE_PCI_FIXED) + return true; + + if (__has_fixed_child_resources(p)) + return true; + + p = p->sibling; + } + + return false; +} + +bool release_child_resources(struct resource *r) +{ + bool fixed; + write_lock(&resource_lock); - __release_child_resources(r); + fixed = __has_fixed_child_resources(r); + if (!fixed) + __release_child_resources(r); write_unlock(&resource_lock); + + return !fixed; } /** -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 43/49] PCI: Get new realloc size for bridge for last try
Current realloc path would not shrink bridge resource through pbus_size_mem() checking with the old size. That cause problem: when "must+optional" resource allocation fails, the cached bridge resource size will prevent "must" resource to get allocated smaller resource. Clear the old resource size for last try or third and later try. Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431 Tested-by: TJ Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 15 +-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index da0a259..7ffb113 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -2327,8 +2327,15 @@ again: struct resource *res = fail_res->res; restore_resource(fail_res, res); - if (fail_res->dev->subordinate) + if (fail_res->dev->subordinate) { res->flags = 0; + /* last or third times and later */ + if (tried_times + 1 == pci_try_num || + tried_times + 1 > 2) { + res->start = 0; + res->end = res->start - 1; + } + } } free_list(&fail_head); @@ -2400,8 +2407,12 @@ again: struct resource *res = fail_res->res; restore_resource(fail_res, res); - if (fail_res->dev->subordinate) + if (fail_res->dev->subordinate) { res->flags = 0; + /* last time */ + res->start = 0; + res->end = res->start - 1; + } } free_list(&fail_head); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 42/49] PCI: Treat optional as must in first try for bridge rescan
For rescan bridge/bus that children are removed before, we should treat optional as must just like root bus the boot time in 19aa7ee432ce (PCI: make re-allocation try harder by reassigning ranges higher in the heirarchy). The reason: allocate must and expand to optional path do not put failed resource to fail list, so will lose must info before next try. So we are using following way: 1. First and following try before last try: We don't keep realloc list so treat every optional as must. allocate for must+optional and put failed in the fail list. then size info (include must and optonal separatedly) will be kept for next try. 2. last try: a: try to allocate must+optional to see if all get allocated. b: try to allocate must then expand to optional. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 19 ++- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index f5b07d8..da0a259 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -2350,25 +2350,34 @@ void __init pci_assign_unassigned_resources(void) void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge) { struct pci_bus *parent = bridge->subordinate; - LIST_HEAD(add_list); /* list of resources that + LIST_HEAD(realloc_head); /* list of resources that want additional resources */ + struct list_head *add_list = NULL; int tried_times = 0; LIST_HEAD(fail_head); struct pci_dev_resource *fail_res; int retval; unsigned long type_mask = IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH | IORESOURCE_MEM_64; + int pci_try_num = 2; again: - __pci_bus_size_bridges(parent, &add_list); - __pci_bridge_assign_resources(bridge, &add_list, &fail_head); - __pci_bus_check_realloc(&add_list); + /* +* last try will use add_list, otherwise will try good to have as +* must have, so can realloc parent bridge resource +*/ + if (tried_times + 1 == pci_try_num) + add_list = &realloc_head; + __pci_bus_size_bridges(parent, add_list); + 
__pci_bridge_assign_resources(bridge, add_list, &fail_head); + if (add_list) + __pci_bus_check_realloc(add_list); tried_times++; if (list_empty(&fail_head)) goto enable_all; - if (tried_times >= 2) { + if (tried_times >= pci_try_num) { /* still fail, don't need to try more */ free_list(&fail_head); goto enable_all; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 29/49] PCI: Unify skip_ioresource_align()
There are powerpc generic version and x86 local version. Move the powerpc version to setup-bus.c, and kill x86 local version. Also kill dummy version in microblaze. Cc: Michal Simek Cc: Paul Mackerras Cc: Michael Ellerman Cc: Arnd Bergmann Cc: linuxppc-...@lists.ozlabs.org Cc: linux-a...@vger.kernel.org Signed-off-by: Yinghai Lu --- arch/microblaze/pci/pci-common.c | 8 arch/powerpc/kernel/pci-common.c | 11 +-- arch/x86/include/asm/pci_x86.h | 1 - arch/x86/pci/common.c| 4 ++-- arch/x86/pci/i386.c | 12 ++-- drivers/pci/setup-bus.c | 9 + include/asm-generic/pci-bridge.h | 2 ++ 7 files changed, 16 insertions(+), 31 deletions(-) diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index ae838ed..09b1af6 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -878,11 +878,6 @@ void pcibios_fixup_bus(struct pci_bus *bus) } EXPORT_SYMBOL(pcibios_fixup_bus); -static int skip_isa_ioresource_align(struct pci_dev *dev) -{ - return 0; -} - /* * We need to avoid collisions with `mirrored' VGA ports * and other strange ISA hardware, so we always want the @@ -899,12 +894,9 @@ static int skip_isa_ioresource_align(struct pci_dev *dev) resource_size_t pcibios_align_resource(void *data, const struct resource *res, resource_size_t size, resource_size_t align) { - struct pci_dev *dev = data; resource_size_t start = res->start; if (res->flags & IORESOURCE_IO) { - if (skip_isa_ioresource_align(dev)) - return start; if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; } diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index b9de34d..2d8d654 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1064,15 +1064,6 @@ void pci_fixup_cardbus(struct pci_bus *bus) pcibios_setup_bus_devices(bus); } - -static int skip_isa_ioresource_align(struct pci_dev *dev) -{ - if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) && - !(dev->bus->bridge_ctl & PCI_BRIDGE_CTL_ISA)) - return 1; - return 
0; -} - /* * We need to avoid collisions with `mirrored' VGA ports * and other strange ISA hardware, so we always want the @@ -1093,7 +1084,7 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, resource_size_t start = res->start; if (res->flags & IORESOURCE_IO) { - if (skip_isa_ioresource_align(dev)) + if (skip_isa_ioresource_align(dev->bus)) return start; if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 164e3f8..ddac225 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -28,7 +28,6 @@ do { \ #define PCI_ASSIGN_ROMS0x1000 #define PCI_BIOS_IRQ_SCAN 0x2000 #define PCI_ASSIGN_ALL_BUSSES 0x4000 -#define PCI_CAN_SKIP_ISA_ALIGN 0x8000 #define PCI_USE__CRS 0x1 #define PCI_CHECK_ENABLE_AMD_MMCONF0x2 #define PCI_HAS_IO_ECS 0x4 diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 8fd6f44..e8df922 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -83,7 +83,7 @@ DEFINE_RAW_SPINLOCK(pci_config_lock); static int __init can_skip_ioresource_align(const struct dmi_system_id *d) { - pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; + pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN); printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident); return 0; } @@ -619,7 +619,7 @@ char *__init pcibios_setup(char *str) pci_routeirq = 1; return NULL; } else if (!strcmp(str, "skip_isa_align")) { - pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; + pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN); return NULL; } else if (!strcmp(str, "noioapicquirk")) { noioapicquirk = 1; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 0a9f2ca..3f17726 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -128,15 +129,6 @@ static void __init pcibios_fw_addr_list_del(void) pcibios_fw_addr_done = true; } -static int -skip_isa_ioresource_align(struct pci_dev *dev) { - - if 
((pci_probe & PCI_CAN_SKIP_ISA_ALIGN) && - !(dev->bus->bridge_ctl & PCI_BRIDGE_CTL_ISA)) - return 1; - return 0; -} - /* * We need to avoid collisions with `mir
[PATCH v2 40/49] powerpc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in of parsing
For device resource PREF bit setting under a bridge's 64-bit pref resource, we need to make sure we only set PREF for 64-bit resources, so set IORESOURCE_MEM_64 for 64-bit resources during OF device resource flags parsing. Link: https://bugzilla.kernel.org/show_bug.cgi?id=96261 Link: https://bugzilla.kernel.org/show_bug.cgi?id=96241 Signed-off-by: Yinghai Lu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Gavin Shan Cc: Yijing Wang Cc: Anton Blanchard Cc: linuxppc-...@lists.ozlabs.org --- arch/powerpc/kernel/pci_of_scan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index 42e02a2..f31bfd0 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -44,8 +44,10 @@ static unsigned int pci_parse_of_flags(u32 addr0, int bridge) if (addr0 & 0x0200) { flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; - flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; + if (addr0 & 0x0100) + flags |= IORESOURCE_MEM_64 +| PCI_BASE_ADDRESS_MEM_TYPE_64; if (addr0 & 0x4000) flags |= IORESOURCE_PREFETCH | PCI_BASE_ADDRESS_MEM_PREFETCH; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 09/49] PCI: Rename pdev_sort_resources to pdev_check_resources
We don't do sorting in those functions anymore, so change "sort" to "check" instead. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 6642a60..292f2a5 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -206,8 +206,8 @@ static resource_size_t __pci_resource_alignment( return r_align; } -/* Sort resources by alignment */ -static void pdev_sort_resources(struct pci_dev *dev, +/* check resources and save to the list */ +static void pdev_check_resources(struct pci_dev *dev, struct list_head *realloc_head, struct list_head *head) { @@ -243,7 +243,7 @@ static void pdev_sort_resources(struct pci_dev *dev, } } -static void __dev_sort_resources(struct pci_dev *dev, +static void __dev_check_resources(struct pci_dev *dev, struct list_head *realloc_head, struct list_head *head) { @@ -261,7 +261,7 @@ static void __dev_sort_resources(struct pci_dev *dev, return; } - pdev_sort_resources(dev, realloc_head, head); + pdev_check_resources(dev, realloc_head, head); } static inline void reset_resource(struct resource *res) @@ -561,7 +561,7 @@ static void pdev_assign_resources_sorted(struct pci_dev *dev, { LIST_HEAD(head); - __dev_sort_resources(dev, add_head, &head); + __dev_check_resources(dev, add_head, &head); __assign_resources_sorted(&head, add_head, fail_head); } @@ -574,7 +574,7 @@ static void pbus_assign_resources_sorted(const struct pci_bus *bus, LIST_HEAD(head); list_for_each_entry(dev, &bus->devices, bus_list) - __dev_sort_resources(dev, realloc_head, &head); + __dev_check_resources(dev, realloc_head, &head); __assign_resources_sorted(&head, realloc_head, fail_head); } -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 33/49] PCI: Check pref compatible bit for mem64 resource of pcie device
We still get "no compatible bridge window" warning on sparc T5-8 after we add support for 64bit resource parsing for root bus. PCI: scan_bus[/pci@300/pci@1/pci@0/pci@6] bus no 8 PCI: Claiming :00:01.0: Resource 15: 8001..8004afff [220c] PCI: Claiming :01:00.0: Resource 15: 8001..8004afff [220c] PCI: Claiming :02:04.0: Resource 15: 8001..80012fff [220c] PCI: Claiming :03:00.0: Resource 15: 8001..80012fff [220c] PCI: Claiming :04:06.0: Resource 14: 8001..80010fff [220c] PCI: Claiming :05:00.0: Resource 0: 8001..80011fff [204] pci :05:00.0: can't claim BAR 0 [mem 0x8001-0x80011fff]: no compatible bridge window All the bridges' 64-bit resources have the pref bit set, but the device resource does not have pref set, so we cannot find a parent for the device resource, as we cannot put non-pref mem under pref mem. According to the PCIe spec errata https://www.pcisig.com/specifications/pciexpress/base2/PCIe_Base_r2.1_Errata_08Jun10.pdf page 13, in some cases it is OK to mark some as pref. Mark if the entire path from the host to the adapter is over PCI Express. Then set pref compatible bit for claim/sizing/assign for 64bit mem resource on that pcie device. 
Fixes: commit d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") Link: http://lkml.kernel.org/r/cae9fiqu1gjy1lyrxs+ma5lcteee4xmtjrg0axj9k_tsu+m9...@mail.gmail.com Reported-by: David Ahern Tested-by: David Ahern Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431 Tested-by: TJ Signed-off-by: Yinghai Lu Cc: #3.19 --- drivers/pci/pci.c | 3 ++- drivers/pci/pci.h | 2 ++ drivers/pci/probe.c | 33 + drivers/pci/setup-bus.c | 21 ++--- drivers/pci/setup-res.c | 4 include/linux/pci.h | 1 + 6 files changed, 60 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 0008c95..ff1192a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -414,6 +414,7 @@ EXPORT_SYMBOL_GPL(pci_find_ht_capability); struct resource *pci_find_parent_resource(const struct pci_dev *dev, struct resource *res) { + int flags = pci_resource_pref_compatible(dev, res); const struct pci_bus *bus = dev->bus; struct resource *r; int i; @@ -428,7 +429,7 @@ struct resource *pci_find_parent_resource(const struct pci_dev *dev, * not, the allocator made a mistake. 
*/ if (r->flags & IORESOURCE_PREFETCH && - !(res->flags & IORESOURCE_PREFETCH)) + !(flags & IORESOURCE_PREFETCH)) return NULL; /* diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2b83977..1804d44 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -336,4 +336,6 @@ static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe) struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus); +int pci_resource_pref_compatible(const struct pci_dev *dev, +struct resource *res); #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index cefd636..010d8d9 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1544,6 +1544,36 @@ static void pci_init_capabilities(struct pci_dev *dev) pci_enable_acs(dev); } +static bool pci_up_path_over_pcie(struct pci_bus *bus) +{ + if (pci_is_root_bus(bus)) + return true; + + if (bus->self && !pci_is_pcie(bus->self)) + return false; + + return pci_up_path_over_pcie(bus->parent); +} + +/* + * According to + * https://www.pcisig.com/specifications/pciexpress/base2/PCIe_Base_r2.1_Errata_08Jun10.pdf + * page 13, system firmware could put some 64bit non-pref under 64bit pref, + * on some cases. + * Let's mark if entire path from the host to the adapter is over PCI + * Express. later will use that compute pref compaitable bit. + */ +static void pci_set_on_all_pcie_path(struct pci_dev *dev) +{ + if (!pci_is_pcie(dev)) + return; + + if (!pci_up_path_over_pcie(dev->bus)) + return; + + dev->on_all_pcie_path = 1; +} + void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) { int ret; @@ -1574,6 +1604,9 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) /* Initialize various capabilities */ pci_init_capabilities(dev); + /* After pcie_cap is assigned */ + pci_set_on_all_pcie_path(dev); + /* * Add the device to our list of discovered devices * and the bus list for fixup functions, etc. diff --git a/dri
[PATCH v2 44/49] PCI: Don't release sibling bridge resources during hotplug
On the hotplug path, we cannot touch sibling bridges that are outside of the slot. That could happen when the BIOS does not assign some bridge BARs and we later cannot assign resources to them on the first try. Check whether the failed device is the parent bridge; if so, use its subordinate bus instead of its parent bus. Reported-by: Andreas Noever Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 7ffb113..dc9ba41 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -2397,10 +2397,16 @@ again: * Try to release leaf bridge's resources that doesn't fit resource of * child device under that bridge */ - list_for_each_entry(fail_res, &fail_head, list) - pci_bus_release_bridge_resources(fail_res->dev->bus, + list_for_each_entry(fail_res, &fail_head, list) { + struct pci_bus *bus = fail_res->dev->bus; + + if (fail_res->dev == bridge) + bus = bridge->subordinate; + + pci_bus_release_bridge_resources(bus, fail_res->flags & type_mask, whole_subtree); + } /* restore size and flags */ list_for_each_entry(fail_res, &fail_head, list) { -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 47/49] PCI, x86: Add pci=assign_pref_bars to re-allocate pref bars
So could reallocate pref mmio64 above 4G later. Signed-off-by: Yinghai Lu --- arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/common.c | 3 +++ arch/x86/pci/i386.c| 56 ++ 3 files changed, 39 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index ddac225..7b634b8 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -34,6 +34,7 @@ do { \ #define PCI_NOASSIGN_ROMS 0x8 #define PCI_ROOT_NO_CRS0x10 #define PCI_NOASSIGN_BARS 0x20 +#define PCI_ASSIGN_PREF_BARS 0x40 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index e8df922..dcc7c48 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -606,6 +606,9 @@ char *__init pcibios_setup(char *str) } else if (!strcmp(str, "assign-busses")) { pci_probe |= PCI_ASSIGN_ALL_BUSSES; return NULL; + } else if (!strcmp(str, "assign_pref_bars")) { + pci_probe |= PCI_ASSIGN_PREF_BARS; + return NULL; } else if (!strcmp(str, "use_crs")) { pci_probe |= PCI_USE__CRS; return NULL; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 3f17726..0b74efe 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -208,16 +208,25 @@ static void pcibios_allocate_bridge_resources(struct pci_dev *dev) continue; if (r->parent) /* Already allocated */ continue; - if (!r->start || pci_claim_bridge_resource(dev, idx) < 0) { - /* -* Something is wrong with the region. -* Invalidate the resource to prevent -* child resource allocations in this -* range. -*/ - r->start = r->end = 0; - r->flags = 0; - } + + if ((r->flags & IORESOURCE_PREFETCH) && + (pci_probe & PCI_ASSIGN_PREF_BARS)) + goto clear; + + if (!r->start) + goto clear; + + if (pci_claim_bridge_resource(dev, idx) == 0) + continue; + +clear: + /* +* Something is wrong with the region. +* Invalidate the resource to prevent +* child resource allocations in this range. 
+*/ + r->start = r->end = 0; + r->flags = 0; } } @@ -263,21 +272,26 @@ static void pcibios_allocate_dev_resources(struct pci_dev *dev, int pass) else disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { + if ((r->flags & IORESOURCE_PREFETCH) && + (pci_probe & PCI_ASSIGN_PREF_BARS)) + goto clear; + dev_dbg(&dev->dev, "BAR %d: reserving %pr (d=%d, p=%d)\n", idx, r, disabled, pass); - if (pci_claim_resource(dev, idx) < 0) { - if (r->flags & IORESOURCE_PCI_FIXED) { - dev_info(&dev->dev, "BAR %d %pR is immovable\n", -idx, r); - } else { - /* We'll assign a new address later */ - pcibios_save_fw_addr(dev, - idx, r->start); - r->end -= r->start; - r->start = 0; - } + if (pci_claim_resource(dev, idx) == 0) + continue; + if (r->flags & IORESOURCE_PCI_FIXED) { + dev_info(&dev->dev, "BAR %d %pR is immovable\n", +idx, r); + continue; } + +clear: + /* We'll assign a new address later */ + pcibios_save_fw_addr(dev, idx, r->start); +
[PATCH v2 48/49] PCI: Introduce resource_disabled()
so we can cover !flags and IORESOURCE_DISABLED both. Cc: linux-al...@vger.kernel.org Cc: linux-i...@vger.kernel.org Cc: linux-am33-l...@redhat.com Cc: linuxppc-...@lists.ozlabs.org Cc: linux-s...@vger.kernel.org Cc: sparcli...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: linux-xte...@linux-xtensa.org Cc: io...@lists.linux-foundation.org Cc: linux...@vger.kernel.org Signed-off-by: Yinghai Lu --- arch/alpha/kernel/pci.c | 2 +- arch/ia64/pci/pci.c | 4 ++-- arch/microblaze/pci/pci-common.c | 15 --- arch/mn10300/unit-asb2305/pci-asb2305.c | 4 ++-- arch/mn10300/unit-asb2305/pci.c | 4 ++-- arch/powerpc/kernel/pci-common.c | 16 +--- arch/powerpc/platforms/powernv/pci-ioda.c | 12 ++-- arch/s390/pci/pci.c | 2 +- arch/sparc/kernel/pci.c | 2 +- arch/x86/pci/i386.c | 4 ++-- arch/xtensa/kernel/pci.c | 4 ++-- drivers/iommu/intel-iommu.c | 3 ++- drivers/pci/host/pcie-rcar.c | 2 +- drivers/pci/iov.c | 2 +- drivers/pci/probe.c | 2 +- drivers/pci/quirks.c | 2 +- drivers/pci/rom.c | 2 +- drivers/pci/setup-bus.c | 8 drivers/pci/setup-res.c | 2 +- include/linux/ioport.h| 4 20 files changed, 52 insertions(+), 44 deletions(-) diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 82f738e..91a7153 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -282,7 +282,7 @@ pcibios_claim_one_bus(struct pci_bus *b) for (i = 0; i < PCI_NUM_RESOURCES; i++) { struct resource *r = &dev->resource[i]; - if (r->parent || !r->start || !r->flags) + if (r->parent || !r->start || resource_disabled(r)) continue; if (pci_has_flag(PCI_PROBE_ONLY) || (r->flags & IORESOURCE_PCI_FIXED)) { diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 7cc3be9..cc293ea 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -501,7 +501,7 @@ void pcibios_fixup_device_resources(struct pci_dev *dev) for (idx = 0; idx < PCI_BRIDGE_RESOURCES; idx++) { struct resource *r = &dev->resource[idx]; - if (!r->flags || r->parent || !r->start) + if (resource_disabled(r) || r->parent || 
!r->start) continue; pci_claim_resource(dev, idx); @@ -519,7 +519,7 @@ static void pcibios_fixup_bridge_resources(struct pci_dev *dev) for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) { struct resource *r = &dev->resource[idx]; - if (!r->flags || r->parent || !r->start) + if (resource_disabled(r) || r->parent || !r->start) continue; pci_claim_bridge_resource(dev, idx); diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index 09b1af6..c123d3c 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -705,7 +705,7 @@ static void pcibios_fixup_resources(struct pci_dev *dev) } for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { struct resource *res = dev->resource + i; - if (!res->flags) + if (resource_disabled(res)) continue; if (res->start == 0) { pr_debug("PCI:%s Resource %d %016llx-%016llx [%x]", @@ -806,7 +806,7 @@ static void pcibios_fixup_bridge(struct pci_bus *bus) pci_bus_for_each_resource(bus, res, i) { if (!res) continue; - if (!res->flags) + if (resource_disabled(res)) continue; if (i >= 3 && bus->self->transparent) continue; @@ -985,7 +985,7 @@ static void pcibios_allocate_bus_resources(struct pci_bus *bus) pci_domain_nr(bus), bus->number); pci_bus_for_each_resource(bus, res, i) { - if (!res || !res->flags + if (!res || resource_disabled(res) || res->start > res->end || res->parent) continue; if (bus->parent == NULL) @@ -1087,7 +1087,8 @@ static void __init pcibios_allocate_resources(int pass) r = &dev->resource[idx]; if (r->parent) /* Already allocated */ continue; - i
[PATCH v2 21/49] PCI: Move saved required resource list out of must+optional assigning
We will need to share that saved list for alt_size support. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 30 -- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 64ef516..1c0b4c5 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -456,6 +456,9 @@ static bool __has_addon(struct list_head *head, int add_count = 0; struct pci_dev_resource *dev_res, *tmp_res; + if (!realloc_head) + return false; + /* check if we have add really */ list_for_each_entry(dev_res, head, list) { tmp_res = res_to_dev_res(realloc_head, dev_res->res); @@ -492,9 +495,9 @@ static void restore_resource(struct pci_dev_resource *save_res, } static bool __assign_resources_must_add_sorted(struct list_head *head, +struct list_head *save_head, struct list_head *realloc_head) { - LIST_HEAD(save_head); LIST_HEAD(local_fail_head); struct pci_dev_resource *save_res; struct pci_dev_resource *dev_res, *tmp_res; @@ -502,12 +505,6 @@ static bool __assign_resources_must_add_sorted(struct list_head *head, resource_size_t add_align, add_size; struct resource *res; - if (!__has_addon(head, realloc_head)) - return false; - - if (!save_resources(head, &save_head)) - return false; - /* Update res in head list with add_size in realloc_head list */ list_for_each_entry(dev_res, head, list) { res = dev_res->res; @@ -548,7 +545,6 @@ static bool __assign_resources_must_add_sorted(struct list_head *head, /* Remove head list from realloc_head list */ list_for_each_entry(dev_res, head, list) remove_from_list(realloc_head, dev_res->res); - free_list(&save_head); free_list(head); return true; @@ -562,7 +558,7 @@ static bool __assign_resources_must_add_sorted(struct list_head *head, if (res->parent && !pci_need_to_release(fail_type, res)) { /* remove it from realloc_head list */ remove_from_list(realloc_head, res); - remove_from_list(&save_head, res); + remove_from_list(save_head, res); list_del(&dev_res->list); kfree(dev_res); } @@ 
-581,11 +577,9 @@ static bool __assign_resources_must_add_sorted(struct list_head *head, } } /* Restore start/end/flags from saved list */ - list_for_each_entry(save_res, &save_head, list) + list_for_each_entry(save_res, save_head, list) restore_resource(save_res, save_res->res); - free_list(&save_head); - return false; } @@ -603,16 +597,24 @@ static void __assign_resources_sorted(struct list_head *head, *then try to reassign add_size for some resources. */ + LIST_HEAD(save_head); + /* Check must+optional add */ - if (realloc_head && - __assign_resources_must_add_sorted(head, realloc_head)) + if (__has_addon(head, realloc_head) && + save_resources(head, &save_head) && + __assign_resources_must_add_sorted(head, &save_head, + realloc_head)) { + free_list(&save_head); return; + } __sort_resources(head); /* Satisfy the must-have resource requests */ assign_requested_resources_sorted(head, fail_head); + free_list(&save_head); + /* Try to satisfy any additional optional resource requests */ if (realloc_head) -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 28/49] PCI: Allow optional only io resource must size to be 0
When there is no child device under a non-hotplug bridge, we can use 0 for the must size, and do not use the old size as the must size. When there is a child device, size will not be 0. When the bridge is not hotplug, min_size will not be 0. So they will still honor the old size as the must size. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 969a0b1..0420d27 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1242,8 +1242,9 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, size = size_aligned_for_isa(size); size += size1; - size0 = calculate_size(size, min_size, - resource_size(b_res), min_align); + if (size || min_size) + size0 = calculate_size(size, min_size, + resource_size(b_res), min_align); sum_add_size = size_aligned_for_isa(sum_add_size); sum_add_size += sum_add_size1; if (sum_add_size < min_sum_size) @@ -1259,7 +1260,7 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, return; } - b_res->start = min_align; + b_res->start = size0 ? min_align : 0; b_res->end = b_res->start + size0 - 1; b_res->flags |= IORESOURCE_STARTALIGN; if (size1 > size0 && realloc_head) { -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 25/49] PCI: Don't add too much optional size for hotplug bridge io
Same as patch for mmio (PCI: Don't add too much optional size for hotplug bridge mmio), and this one is addressing io port. It will compare must+optional with min_sum_size to get smaller optional size. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 26 -- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 8999ead..de55e07 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1189,7 +1189,6 @@ static resource_size_t window_alignment(struct pci_bus *bus, * * @bus : the bus * @min_size : the minimum io window that must to be allocated - * @add_size : additional optional io window * @realloc_head : track the additional io window on this list * * Sizing the IO windows of the PCI-PCI bridge is trivial, @@ -1198,9 +1197,11 @@ static resource_size_t window_alignment(struct pci_bus *bus, * We must be careful with the ISA aliasing though. */ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, - resource_size_t add_size, struct list_head *realloc_head) +struct list_head *realloc_head) { struct pci_dev *dev; + resource_size_t min_sum_size = 0; + resource_size_t sum_add_size; struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO, IORESOURCE_IO); resource_size_t size = 0, size0 = 0, size1 = 0; @@ -1210,6 +1211,11 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, if (!b_res) return; + if (realloc_head) { + min_sum_size = min_size; + min_size = 0; + } + min_align = window_alignment(bus, IORESOURCE_IO); list_for_each_entry(dev, &bus->devices, bus_list) { int i; @@ -1239,10 +1245,11 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, size0 = calculate_iosize(size, min_size, size1, resource_size(b_res), min_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? 
size0 : - calculate_iosize(size, min_size, add_size + size1, + sum_add_size = children_add_size + size + size1; + if (sum_add_size < min_sum_size) + sum_add_size = min_sum_size; + size1 = !realloc_head ? size0 : + calculate_iosize(size, min_size, sum_add_size - size, resource_size(b_res), min_align); if (!size0 && !size1) { if (b_res->start || b_res->end) @@ -1783,7 +1790,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) { struct pci_dev *dev; unsigned long mask, prefmask, type2 = 0, type3 = 0; - resource_size_t min_mem_size = 0, additional_io_size = 0; + resource_size_t min_mem_size = 0, min_io_size = 0; struct resource *b_res; int ret; @@ -1816,13 +1823,12 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) case PCI_CLASS_BRIDGE_PCI: pci_bridge_check_ranges(bus); if (bus->self->is_hotplug_bridge) { - additional_io_size = pci_hotplug_io_size; + min_io_size = pci_hotplug_io_size; min_mem_size = pci_hotplug_mem_size; } /* Fall through */ default: - pbus_size_io(bus, realloc_head ? 0 : additional_io_size, -additional_io_size, realloc_head); + pbus_size_io(bus, min_io_size, realloc_head); /* * If there's a 64-bit prefetchable MMIO window, compute -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 14/49] PCI: Add __add_to_list()
to take alt_size, alt_align. preparation patch for alt_size support. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 51 ++--- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index f30225c..57b5c09 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -66,6 +66,8 @@ struct pci_dev_resource { resource_size_t end; resource_size_t add_size; resource_size_t min_align; + resource_size_t alt_size; + resource_size_t alt_align; unsigned long flags; }; @@ -88,15 +90,16 @@ static void free_list(struct list_head *head) * @add_size: additional size to be optionally added * to the resource */ -static int add_to_list(struct list_head *head, +static int __add_to_list(struct list_head *head, struct pci_dev *dev, struct resource *res, -resource_size_t add_size, resource_size_t min_align) +resource_size_t add_size, resource_size_t min_align, +resource_size_t alt_size, resource_size_t alt_align) { struct pci_dev_resource *tmp; tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); if (!tmp) { - pr_warn("add_to_list: kmalloc() failed!\n"); + pr_warn("__add_to_list: kmalloc() failed!\n"); return -ENOMEM; } @@ -107,12 +110,20 @@ static int add_to_list(struct list_head *head, tmp->flags = res->flags; tmp->add_size = add_size; tmp->min_align = min_align; + tmp->alt_size = alt_size; + tmp->alt_align = alt_align; list_add(&tmp->list, head); return 0; } +static int add_to_list(struct list_head *head, +struct pci_dev *dev, struct resource *res) +{ + return __add_to_list(head, dev, res, 0, 0, 0, 0); +} + static void remove_from_list(struct list_head *head, struct resource *res) { @@ -378,9 +389,7 @@ static void assign_requested_resources_sorted(struct list_head *head, if (resource_size(res) && pci_assign_resource(dev_res->dev, idx)) { if (fail_head) - add_to_list(fail_head, dev_res->dev, res, - 0 /* don't care */, - 0 /* don't care */); + add_to_list(fail_head, dev_res->dev, res); reset_resource(res); } } @@ -466,7 
+475,7 @@ static void __assign_resources_sorted(struct list_head *head, /* Save original start, end, flags etc at first */ list_for_each_entry(dev_res, head, list) { - if (add_to_list(&save_head, dev_res->dev, dev_res->res, 0, 0)) { + if (add_to_list(&save_head, dev_res->dev, dev_res->res)) { free_list(&save_head); goto requested_and_reassign; } @@ -1019,8 +1028,8 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, b_res->end = b_res->start + size0 - 1; b_res->flags |= IORESOURCE_STARTALIGN; if (size1 > size0 && realloc_head) { - add_to_list(realloc_head, bus->self, b_res, size1-size0, - min_align); + __add_to_list(realloc_head, bus->self, b_res, + size1 - size0, min_align, 0, 0); dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n", b_res, &bus->busn_res, (unsigned long long)size1-size0); @@ -1222,7 +1231,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, add_to_align_test_list(&align_test_add_list, align, r_size); r->end = r->start - 1; - add_to_list(realloc_head, dev, r, r_size, 0/* don't care */); + __add_to_list(realloc_head, dev, r, + r_size, align, 0, 0); sum_add_size += r_size; if (align > max_add_align) max_add_align = align; @@ -1293,8 +1303,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, b_res->end = size0 + min_align - 1; b_res->flags |= IORESOURCE_STARTALIGN; if (size1 > size0 && realloc_head) { - add_to_list(realloc_head, bus->self, b_res, size1 - size0, - min
[PATCH v2 18/49] PCI: Move comment to pci_need_to_release()
Move comment from caller to callee, as we will have one new caller for alt_size support later. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 27 +++ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 2e3d00b..f8b9a24 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -415,6 +415,20 @@ static unsigned long pci_fail_res_type_mask(struct list_head *fail_head) static bool pci_need_to_release(unsigned long mask, struct resource *res) { + /* +* Separate three resource type checking if we need to release +* assigned resource. +* 1. if there is io port assign fail, will release assigned +* io port. +* 2. if there is pref mmio assign fail, release assigned +* pref mmio. +* if assigned pref mmio's parent is non-pref mmio and there +* is non-pref mmio assign fail, will release that assigned +* pref mmio. +* 3. if there is non-pref mmio assign fail or pref mmio +* assigned fail, will release assigned non-pref mmio. +*/ + if (res->flags & IORESOURCE_IO) return !!(mask & IORESOURCE_IO); @@ -471,19 +485,8 @@ static void __assign_resources_sorted(struct list_head *head, * if could do that, could get out early. * if could not do that, we still try to assign requested at first, *then try to reassign add_size for some resources. -* -* Separate three resource type checking if we need to release -* assigned resource after requested + add_size try. -* 1. if there is io port assign fail, will release assigned -* io port. -* 2. if there is pref mmio assign fail, release assigned -* pref mmio. -* if assigned pref mmio's parent is non-pref mmio and there -* is non-pref mmio assign fail, will release that assigned -* pref mmio. -* 3. if there is non-pref mmio assign fail or pref mmio -* assigned fail, will release assigned non-pref mmio. 
*/ + LIST_HEAD(save_head); LIST_HEAD(local_fail_head); struct pci_dev_resource *save_res; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 23/49] PCI: Add support for more than two alt_size under same bridge
Need to increase size to make sure it could fit all alt entries. In the patch, we first select one big size, and then keep reducing the size and retrying to get the minimum value for alt_size. Example: two bridges: one has 8M/8M and 1M/1M children res; the other has 4M/4M and 1M/1M children res. Then we have child bridges alt_align/alt_size: 8M/9M, 4M/5M. Before this patch, parent bridge alt_align/alt_size is 8M/14M, which is wrong. With this patch, parent bridge alt_align/alt_size is 8M/17M. At the same time, child bridges must align/size: 4M/12M, 2M/6M, and parent bridge must align/size: 4M/20M. So in the end, we use 8M/17M as parent bridge alt_align/alt_size. Link: https://bugzilla.kernel.org/show_bug.cgi?id=100451 Reported-by: Yijing Wang Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 56 +++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 9da8b23..bf28f32 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1335,6 +1335,47 @@ out: return good_align; } +static resource_size_t calculate_mem_alt_size(struct list_head *head, + resource_size_t max_align, resource_size_t size, + resource_size_t align_low) +{ + struct align_test_res *p; + resource_size_t tmp; + resource_size_t good_size, bad_size; + int count = 0, order; + + good_size = ALIGN(size, align_low); + + list_for_each_entry(p, head, list) + count++; + + if (count <= 1) + goto out; + + __sort_align_test(head); + + tmp = max(size, max_align); + order = __fls(count); + if ((1ULL << order) < count) + order++; + good_size = ALIGN((tmp << order), align_low); + bad_size = ALIGN(size, align_low) - align_low; + size = good_size; + while (size > bad_size) { + /* check if align/size fit all entries */ + if (is_align_size_good(head, max_align, size, 0)) + good_size = size; + else + bad_size = size; + + size = bad_size + ((good_size - bad_size) >> 1); + size = round_down(size, align_low); + } + +out: + return good_size; +} + static 
inline bool is_optional(int i) { @@ -1381,6 +1422,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, mask | IORESOURCE_PREFETCH, type); LIST_HEAD(align_test_list); LIST_HEAD(align_test_add_list); + LIST_HEAD(align_test_alt_list); resource_size_t alt_size = 0, alt_align = 0; resource_size_t window_align; @@ -1454,10 +1496,17 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, dev_res = res_to_dev_res(realloc_head, r); if (dev_res && dev_res->alt_size) { + add_to_align_test_list( + &align_test_alt_list, + dev_res->alt_align, + dev_res->alt_size); alt_size += dev_res->alt_size; if (alt_align < dev_res->alt_align) alt_align = dev_res->alt_align; } else if (r_size > 1) { + add_to_align_test_list( + &align_test_alt_list, + align, r_size); alt_size += r_size; if (alt_align < align) alt_align = align; @@ -1477,14 +1526,17 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, if (size0 && realloc_head) { alt_align = max(alt_align, window_align); - alt_size = calculate_memsize(alt_size, min_size, -0, window_align); + /* need to increase size to fit more alt */ + alt_size = calculate_mem_alt_size(&align_test_alt_list, + alt_align, alt_size, + window_align); /* must is better ? */ if (alt_size >= size0) { alt_align = 0; alt_size = 0; } } + free_align_test_list(&align_test_alt_list); if (sum_add_size < min_sum_size) sum_add_size = min_sum_size; -- 1.8.4.5
[PATCH v2 20/49] PCI: Skip must+optional if there is no optional addon
If the bridge does not support hotplug or there is no child with sriov support, we could get out early and not try the must+optional allocation. Also, in the loop that updates res with optional add info, skip resources whose add_size is 0. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 32 +--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index d1f9e19..64ef516 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -450,6 +450,24 @@ static bool pci_need_to_release(unsigned long mask, struct resource *res) return false; /* should not get here */ } +static bool __has_addon(struct list_head *head, + struct list_head *realloc_head) +{ + int add_count = 0; + struct pci_dev_resource *dev_res, *tmp_res; + + /* check if we have add really */ + list_for_each_entry(dev_res, head, list) { + tmp_res = res_to_dev_res(realloc_head, dev_res->res); + if (!tmp_res || !tmp_res->add_size) + continue; + + add_count++; + } + + return add_count != 0; +} + static bool save_resources(struct list_head *head, struct list_head *save_head) { @@ -481,16 +499,24 @@ static bool __assign_resources_must_add_sorted(struct list_head *head, struct pci_dev_resource *save_res; struct pci_dev_resource *dev_res, *tmp_res; unsigned long fail_type; - resource_size_t add_align; + resource_size_t add_align, add_size; struct resource *res; + if (!__has_addon(head, realloc_head)) + return false; + if (!save_resources(head, &save_head)) return false; /* Update res in head list with add_size in realloc_head list */ list_for_each_entry(dev_res, head, list) { res = dev_res->res; - res->end += get_res_add_size(realloc_head, res); + add_size = get_res_add_size(realloc_head, res); + + if (!add_size) + continue; + + res->end += add_size; /* * There are two kinds of additional resources in the list: @@ -578,7 +604,7 @@ static void __assign_resources_sorted(struct list_head *head, */ /* Check must+optional add */ - if (realloc_head && 
!list_empty(realloc_head) && + if (realloc_head && __assign_resources_must_add_sorted(head, realloc_head)) return; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 35/49] PCI: Add has_mem64 for host_bridge
On systems that do not support mmio64 above 4G, has_mem64 will not be set. We will use that info in the following two patches: 1. Don't treat non-pref mmio64 as pref mmio, so it will not be put under the bridge's pref range when rescanning the devices. 2. Keep pref mmio64 and pref mmio32 under the bridge pref bar. Signed-off-by: Yinghai Lu --- drivers/pci/probe.c | 9 + include/linux/pci.h | 1 + 2 files changed, 10 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 010d8d9..14bdbca 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2051,6 +2051,15 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, dev_info(&b->dev, "root bus resource %pR%s\n", res, bus_addr); } + resource_list_for_each_entry(window, &bridge->windows) { + res = window->res; + if (resource_type(res) == IORESOURCE_MEM && + (res->end - window->offset) > 0xffffffff) { + bridge->has_mem64 = 1; + break; + } + } + down_write(&pci_bus_sem); list_add_tail(&b->node, &pci_root_buses); up_write(&pci_bus_sem); diff --git a/include/linux/pci.h b/include/linux/pci.h index 33ef25f..0771b37 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -409,6 +409,7 @@ struct pci_host_bridge { void (*release_fn)(struct pci_host_bridge *); void *release_data; unsigned int ignore_reset_delay:1; /* for entire hierarchy */ + unsigned int has_mem64:1; }; #define to_pci_host_bridge(n) container_of(n, struct pci_host_bridge, dev) -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 24/49] PCI: Better support for two alt_size
Need to put aligned with max align before not aligned. For example: alt align/size: 8M/9M, 4M/8M before this patch we have 8M/20M. After this patch we will have 8M/17M. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 80 +++-- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index bf28f32..8999ead 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -58,6 +58,20 @@ static inline bool is_before(resource_size_t align1, resource_size_t size1, return false; } +static inline bool is_before_alt(resource_size_t align, resource_size_t size1, +resource_size_t size2) +{ + resource_size_t size1_left, size2_left; + + /* aligned is before not aligned */ + size1_left = size1 & (align - 1); + size2_left = size2 & (align - 1); + if (!size1_left && size2_left) + return true; + + return false; +} + struct pci_dev_resource { struct list_head list; struct resource *res; @@ -307,6 +321,42 @@ static void __sort_resources(struct list_head *head) } } +static void __sort_resources_alt(struct list_head *head) +{ + struct pci_dev_resource *res1, *tmp_res, *res2; + resource_size_t align = 0; + + + __sort_resources(head); + + /* get max align at first */ + list_for_each_entry(res1, head, list) { + resource_size_t align1; + + align1 = pci_resource_alignment(res1->dev, res1->res); + if (align1 > align) + align = align1; + } + + list_for_each_entry_safe(res1, tmp_res, head, list) { + resource_size_t size1, size2; + + size1 = resource_size(res1->res); + + /* reorder it */ + list_for_each_entry(res2, head, list) { + if (res2 == res1) + break; + + size2 = resource_size(res2->res); + if (is_before_alt(align, size1, size2)) { + list_move_tail(&res1->list, &res2->list); + break; + } + } + } +} + /** * reassign_resources_sorted() - satisfy any additional resource requests * @@ -673,7 +723,7 @@ static void __assign_resources_alt_sorted(struct list_head *head, res->end = res->start + alt_res->alt_size - 1; } - 
__sort_resources(head); + __sort_resources_alt(head); /* Satisfy the alt resource requests */ assign_requested_resources_sorted(head, &local_alt_fail_head); @@ -1267,6 +1317,32 @@ static void __sort_align_test(struct list_head *head) } } +static void __sort_align_test_alt(struct list_head *head) +{ + struct align_test_res *res1, *tmp_res, *res2; + resource_size_t align = 0; + + __sort_align_test(head); + + /* get max align at first */ + list_for_each_entry(res1, head, list) + if (res1->align > align) + align = res1->align; + + list_for_each_entry_safe(res1, tmp_res, head, list) { + /* reorder it */ + list_for_each_entry(res2, head, list) { + if (res2 == res1) + break; + + if (is_before_alt(align, res1->size, res2->size)) { + list_move_tail(&res1->list, &res2->list); + break; + } + } + } +} + static bool is_align_size_good(struct list_head *head, resource_size_t min_align, resource_size_t size, resource_size_t start) @@ -1352,7 +1428,7 @@ static resource_size_t calculate_mem_alt_size(struct list_head *head, if (count <= 1) goto out; - __sort_align_test(head); + __sort_align_test_alt(head); tmp = max(size, max_align); order = __fls(count); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 46/49] PCI: Set resource to FIXED for lsi devices
LSI HBA firmware stops responding to PCI reads from the host if the PCI core ever changes the PCI device BAR values. Set their resources to FIXED, so that realloc will skip them. Reported-by: Paul Johnson Suggested-by: Bjorn Helgaas Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=92351 Signed-off-by: Yinghai Lu Cc: sta...@vger.kernel.org --- drivers/pci/pci.h | 1 + drivers/pci/quirks.c| 20 drivers/pci/setup-bus.c | 4 3 files changed, 25 insertions(+) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1804d44..dec1c18 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -168,6 +168,7 @@ static inline void pci_msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u } void pci_realloc_get_opt(char *); +bool pci_realloc_user_enabled(void); static inline int pci_no_d1d2(struct pci_dev *dev) { diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index e9fd0e9..184a09e 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -324,6 +324,26 @@ static void quirk_s3_64M(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_868, quirk_s3_64M); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_968, quirk_s3_64M); +/* + * LSI devices firmware does not like BAR get changed + */ +static void quirk_bar_fixed(struct pci_dev *dev) +{ + int i; + + if (pci_realloc_user_enabled()) + return; + + for (i = 0; i < PCI_STD_RESOURCE_END; i++) { + struct resource *r = &dev->resource[i]; + + if (!r->flags || r->flags & IORESOURCE_UNSET) + continue; + r->flags |= IORESOURCE_PCI_FIXED; + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_LSI_LOGIC, PCI_ANY_ID, quirk_bar_fixed); + static void quirk_io(struct pci_dev *dev, int pos, unsigned size, const char *name) { diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 9d5423c..d9cfb55 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -2196,6 +2196,10 @@ void __init pci_realloc_get_opt(char *str) else if (!strncmp(str, "on", 2)) pci_realloc_enable = user_enabled; } 
+bool pci_realloc_user_enabled(void) +{ + return pci_realloc_enable == user_enabled; +} static bool pci_realloc_enabled(enum enable_type enable) { return enable >= user_enabled; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 37/49] PCI: Restore pref mmio allocation logic for hostbridge without mmio64
Since commit 5b2854155 (PCI: Restrict 64-bit prefetchable bridge windows to 64-bit resources), we changed the logic for pref mmio allocation: when the bridge pref supports mmio64, we only put children pref that support mmio64 into it, and put children pref mmio32 into the bridge's non-pref mmio32. That could leave the bridge pref bar unused when that pref bar is mmio64 and the children res only have mmio32. It could also cause allocation failure when non-pref mmio32 does not have enough space for those children pref mmio32. That is not rational when the host bridge does not have 64bit mmio above 4g at all. The patch restores the old logic: when the host bridge does not have has_mem64 set, put children pref mmio64 and pref mmio32 all under the bridge's pref bars. Signed-off-by: Yinghai Lu --- drivers/pci/bus.c | 4 +++- drivers/pci/setup-bus.c | 13 + drivers/pci/setup-res.c | 9 ++--- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 6fbd3f2..b043bdf 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -202,8 +202,10 @@ int pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res, { #ifdef CONFIG_PCI_BUS_ADDR_T_64BIT int rc; + unsigned long mmio64 = pci_find_host_bridge(bus)->has_mem64 ? 
+ IORESOURCE_MEM_64 : 0; - if (res->flags & IORESOURCE_MEM_64) { + if (res->flags & mmio64) { rc = pci_bus_alloc_from_region(bus, res, size, align, min, type_mask, alignf, alignf_data, &pci_high); diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 37d5a48..f5b07d8 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1876,7 +1876,8 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) b_res = &bus->self->resource[PCI_BRIDGE_RESOURCES]; mask = IORESOURCE_MEM; prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH; - if (b_res[2].flags & IORESOURCE_MEM_64) { + if ((b_res[2].flags & IORESOURCE_MEM_64) && + pci_find_host_bridge(bus)->has_mem64) { prefmask |= IORESOURCE_MEM_64; ret = pbus_size_mem(bus, prefmask, prefmask, prefmask, prefmask, @@ -2032,17 +2033,21 @@ static void pci_bridge_release_resources(struct pci_bus *bus, *io port. * 2. if there is non pref mmio assign fail, release bridge *nonpref mmio. -* 3. if there is 64bit pref mmio assign fail, and bridge pref +* 3. if there is pref mmio assign fail, and host bridge does +*have 64bit mmio, release bridge pref mmio. +* 4. if there is 64bit pref mmio assign fail, and bridge pref *is 64bit, release bridge pref mmio. -* 4. if there is pref mmio assign fail, and bridge pref is +* 5. if there is pref mmio assign fail, and bridge pref is *32bit mmio, release bridge pref mmio -* 5. if there is pref mmio assign fail, and bridge pref is not +* 6. if there is pref mmio assign fail, and bridge pref is not *assigned, release bridge nonpref mmio. 
*/ if (type & IORESOURCE_IO) idx = 0; else if (!(type & IORESOURCE_PREFETCH)) idx = 1; + else if (!pci_find_host_bridge(bus)->has_mem64) + idx = 2; else if ((type & IORESOURCE_MEM_64) && (b_res[2].flags & IORESOURCE_MEM_64)) idx = 2; diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index b19aa5b..26aedde 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -205,6 +205,8 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev, struct resource *res = dev->resource + resno; resource_size_t min; int ret; + unsigned long mmio64 = pci_find_host_bridge(bus)->has_mem64 ? + IORESOURCE_MEM_64 : 0; min = (res->flags & IORESOURCE_IO) ? PCIBIOS_MIN_IO : PCIBIOS_MIN_MEM; @@ -216,7 +218,7 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev, * things differently than they were sized, not everything will fit. */ ret = pci_bus_alloc_resource(bus, res, size, align, min, -IORESOURCE_PREFETCH | IORESOURCE_MEM_64, +IORESOURCE_PREFETCH | mmio64, pcibios_align_resource, dev); if (ret == 0) return 0; @@ -225,7 +227,8 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_
[PATCH v2 38/49] sparc/PCI: Add mem64 resource parsing for root bus
Found "no compatible bridge window" warning in boot log from T5-8. pci :00:01.0: can't claim BAR 15 [mem 0x1-0x4afff pref]: no compatible bridge window That resource is above 4G, but does not get offset correctly as root bus only report io and mem32. pci_sun4v f02dbcfc: PCI host bridge to bus :00 pci_bus :00: root bus resource [io 0x8040-0x80400fff] (bus address [0x-0xfff]) pci_bus :00: root bus resource [mem 0x8000-0x80007eff] (bus address [0x-0x7eff]) pci_bus :00: root bus resource [bus 00-77] Add mem64 handling in pci_common for sparc, so we can have 64bit resource registered for root bus at first. After patch, will have: pci_sun4v f02dbcfc: PCI host bridge to bus :00 pci_bus :00: root bus resource [io 0x8040-0x80400fff] (bus address [0x-0xfff]) pci_bus :00: root bus resource [mem 0x8000-0x80007eff] (bus address [0x-0x7eff]) pci_bus :00: root bus resource [mem 0x8001-0x8007] (bus address [0x1-0x7]) pci_bus :00: root bus resource [bus 00-77] Fixes: commit d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") Link: http://lkml.kernel.org/r/cae9fiqu1gjy1lyrxs+ma5lcteee4xmtjrg0axj9k_tsu+m9...@mail.gmail.com Reported-by: David Ahern Tested-by: David Ahern Signed-off-by: Yinghai Lu Cc: #3.19 --- arch/sparc/kernel/pci.c| 7 ++- arch/sparc/kernel/pci_common.c | 15 +-- arch/sparc/kernel/pci_impl.h | 1 + 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index c928bc6..bfd0b70 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -185,8 +185,10 @@ static unsigned long pci_parse_of_flags(u32 addr0) if (addr0 & 0x0200) { flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; - flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; + if (addr0 & 0x0100) + flags |= IORESOURCE_MEM_64 +| PCI_BASE_ADDRESS_MEM_TYPE_64; if (addr0 & 0x4000) flags |= IORESOURCE_PREFETCH | PCI_BASE_ADDRESS_MEM_PREFETCH; @@ -660,6 +662,9 @@ struct 
pci_bus *pci_scan_one_pbm(struct pci_pbm_info *pbm, pbm->io_space.start); pci_add_resource_offset(&resources, &pbm->mem_space, pbm->mem_space.start); + if (pbm->mem64_space.flags) + pci_add_resource_offset(&resources, &pbm->mem64_space, + pbm->mem_space.start); pbm->busn.start = pbm->pci_first_busno; pbm->busn.end = pbm->pci_last_busno; pbm->busn.flags = IORESOURCE_BUS; diff --git a/arch/sparc/kernel/pci_common.c b/arch/sparc/kernel/pci_common.c index 944a065..a859a86 100644 --- a/arch/sparc/kernel/pci_common.c +++ b/arch/sparc/kernel/pci_common.c @@ -406,6 +406,7 @@ void pci_determine_mem_io_space(struct pci_pbm_info *pbm) } num_pbm_ranges = i / sizeof(*pbm_ranges); + memset(&pbm->mem64_space, 0, sizeof(struct resource)); for (i = 0; i < num_pbm_ranges; i++) { const struct linux_prom_pci_ranges *pr = &pbm_ranges[i]; @@ -451,7 +452,11 @@ void pci_determine_mem_io_space(struct pci_pbm_info *pbm) break; case 3: - /* XXX 64-bit MEM handling XXX */ + /* 64-bit MEM handling */ + pbm->mem64_space.start = a; + pbm->mem64_space.end = a + size - 1UL; + pbm->mem64_space.flags = IORESOURCE_MEM; + break; default: break; @@ -465,15 +470,21 @@ void pci_determine_mem_io_space(struct pci_pbm_info *pbm) prom_halt(); } - printk("%s: PCI IO[%llx] MEM[%llx]\n", + printk("%s: PCI IO[%llx] MEM[%llx]", pbm->name, pbm->io_space.start, pbm->mem_space.start); + if (pbm->mem64_space.flags) + printk(" MEM64[%llx]", + pbm->mem64_space.start); + printk("\n"); pbm->io_space.name = pbm->mem_space.name = pbm->name; request_resource(&ioport_resource, &pbm->io_space); request_resource(&iomem_resource, &pbm->mem_space); + if (pbm->mem64_space.flags) + request_resource(&iomem_resource, &pbm->mem64_space); pci_register_legacy_regions(&a
[PATCH v2 22/49] PCI: Add alt_size allocation support
On system with several pcie switches, BIOS allocate very tight resources to the bridge bar, and it is not aligned to min_align as kernel allocation code. For example: 02:03.0---0c:00.0---0d:04.0---18:00.0 18:00.0 need 0x1000, and 0x0001. BIOS only allocate 0x1010 to 0d:04.0 and above bridges. Later after using /sys/bus/pci/devices/:0c:00.0/remove to remove 0c:00.0, rescan with /sys/bus/pci/rescan can not allocate 0x1800 to 0c:00.0. another example: 00:1c.0-[02-21]00.0-[03-21]--+-01.0-[04-12]00.0-[05-12]19.0-[06-12]00.0 +-05.0-[13]-- +-07.0-[14-20]00.0-[15-20]--+-08.0-[16]--+-00.0 | | \-00.1 | +-14.0-[17]00.0 | \-19.0-[18-20]00.0 \-09.0-[21]-- 06:00.0 need 0x400 and 0x80. BIOS only allocate 0x480 to 05:19.0 and 04:00.0. when 05:19.0 get removed via /sys/bus/pci/devices/:05:19.0/remove, rescan with /sys/bus/pci/rescan will fail. pci :05:19.0: BAR 14: no space for [mem size 0x0600] pci :05:19.0: BAR 14: failed to assign [mem size 0x0600] pci :06:00.0: BAR 2: no space for [mem size 0x0400 64bit] pci :06:00.0: BAR 2: failed to assign [mem size 0x0400 64bit] pci :06:00.0: BAR 0: no space for [mem size 0x0080] pci :06:00.0: BAR 0: failed to assign [mem size 0x0080] current code try to use align 0x200 and size 0x600, but parent bridge only have 0x480. Introduce alt_align/alt_size and store them in realloc list in addition to addon info, and will try it after min_align/min_size allocation fails. The alt_align is max_align, and alt_size is aligned size with bridge minimum window alignment. on my test setup: 00:1c.7---61:00.0---62:00.0 62:00.0 needs 0x80 and 0x2. 
and 00:1c.7 only have 9M allocated for mmio, with this patch we have pci :61:00.0: bridge window [mem 0x0040-0x00ff] to [bus 62] add_size 0 add_align 0 alt_size 90 alt_align 80 must_size c0 must_align 40 pci :61:00.0: BAR 14: no space for [mem size 0x00c0] pci :61:00.0: BAR 14: failed to assign [mem size 0x00c0] pci :61:00.0: BAR 14: assigned [mem 0xdf00-0xdf8f] pci :62:00.0: BAR 0: assigned [mem 0xdf00-0xdf7f pref] pci :62:00.0: BAR 1: assigned [mem 0xdf80-0xdf81] pci :61:00.0: PCI bridge to [bus 62] pci :61:00.0: bridge window [io 0x6000-0x6fff] pci :61:00.0: bridge window [mem 0xdf00-0xdf8f] pci :00:1c.7: PCI bridge to [bus 61-68] pci :00:1c.7: bridge window [io 0x6000-0x6fff] pci :00:1c.7: bridge window [mem 0xdf00-0xdf8f] so for 61:00.0 first try with 12M fails, and second try with 9M the alt_size works. Later 62:00.0 get correct resource allocated too. Link: https://bugzilla.kernel.org/show_bug.cgi?id=100451 Reported-by: Yijing Wang Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 203 +--- 1 file changed, 191 insertions(+), 12 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 1c0b4c5..9da8b23 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -324,7 +324,7 @@ static void reassign_resources_sorted(struct list_head *realloc_head, { struct resource *res; struct pci_dev_resource *add_res, *tmp; - resource_size_t add_size, align; + resource_size_t add_size, align, r_size; int idx; list_for_each_entry_safe(add_res, tmp, realloc_head, list) { @@ -340,12 +340,23 @@ static void reassign_resources_sorted(struct list_head *realloc_head, idx = res - &add_res->dev->resource[0]; add_size = add_res->add_size; align = add_res->min_align; - if (!resource_size(res)) { + if (!add_size || !align) /* alt_size only */ + goto out; + + r_size = resource_size(res); + if (!r_size) { res->start = align; res->end = res->start + add_size - 1; if (pci_assign_resource(add_res->dev, idx)) reset_resource(res); } else { + /* could just 
assigned with alt, add difference ? */ + resource_size_t must_size; + + must_size = add_res->end - add_res->start + 1; + if (r_size < must_size) + add_size += must_size - r_size; + res->flags |= add_res->flags & (IORESOURCE_STARTALIGN|IO
[PATCH v2 30/49] PCI: Kill macro checking for bus io port sizing
Use new generic version skip_isa_ioresource_align() instead. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 17 +++-- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 9478e91..f3bb309 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1172,15 +1172,12 @@ int skip_isa_ioresource_align(struct pci_bus *bus) return 0; } -static resource_size_t size_aligned_for_isa(resource_size_t size) +static resource_size_t size_aligned_for_isa(resource_size_t size, + struct pci_bus *bus) { - /* -* To be fixed in 2.5: we should have sort of HAVE_ISA -* flag in the struct pci_bus. -*/ -#if defined(CONFIG_ISA) || defined(CONFIG_EISA) - size = (size & 0xff) + ((size & ~0xffUL) << 2); -#endif + if (!skip_isa_ioresource_align(bus)) + size = (size & 0xff) + ((size & ~0xffUL) << 2); + return size; } @@ -1249,12 +1246,12 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, } } - size = size_aligned_for_isa(size); + size = size_aligned_for_isa(size, bus); size += size1; if (size || min_size) size0 = calculate_size(size, min_size, resource_size(b_res), min_align); - sum_add_size = size_aligned_for_isa(sum_add_size); + sum_add_size = size_aligned_for_isa(sum_add_size, bus); sum_add_size += sum_add_size1; if (sum_add_size < min_sum_size) sum_add_size = min_sum_size; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 16/49] PCI: Check if resource is allocated before pci_assign
Skip already-allocated resources in the list, as pci_assign_resource() can only handle unassigned resources. And we could already have assigned resources in the list before trying alt_size. Signed-off-by: Yinghai Lu --- drivers/pci/setup-bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 1b5fbca..1622ad2 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -386,7 +386,7 @@ static void assign_requested_resources_sorted(struct list_head *head, list_for_each_entry(dev_res, head, list) { res = dev_res->res; idx = res - &dev_res->dev->resource[0]; - if (resource_size(res) && + if (!res->parent && resource_size(res) && pci_assign_resource(dev_res->dev, idx)) { if (fail_head) add_to_list(fail_head, dev_res->dev, res); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [lkp] [PCI] c0d1a185278: EDAC sbridge: Couldn't enable 8086:6fa0
On Tue, Jul 14, 2015 at 6:13 AM, Huang Ying wrote: > FYI, we noticed the below changes on > > git://internal_merge_and_test_tree > revert-c0d1a18527806a3938e76a0e648cae690510b6a3-c0d1a18527806a3938e76a0e648cae690510b6a3 > commit c0d1a18527806a3938e76a0e648cae690510b6a3 ("PCI: Don't set flags to 0 > when assign resource fail") > > > [ 30.350859] EDAC sbridge: Seeking for: PCI ID 8086:6fa0 > [ 30.350867] sbridge_edac :ff:12.0: can't enable device: BAR 1 [mem > size 0x0010 disabled] not assigned > [ 30.350867] EDAC sbridge: Couldn't enable 8086:6fa0 > [ 30.350901] EDAC sbridge: Some needed devices are missing > [ 30.350904] EDAC sbridge: Couldn't find mci handler > Hi Ying, I updated the branch, it should fix the problem. Can you test that again ? BTW, there should be BIOS problem with it. [4.144987] pci :ff:12.0: [8086:6fa0] type 00 class 0x088000 [4.151702] pci :ff:12.0: reg 0x14: [mem 0x-0x000f] [4.158703] pci :ff:12.0: reg 0x18: [mem 0x-0x003f] [4.165705] pci :ff:12.0: reg 0x1c: [mem 0x-0x000f] [4.172706] pci :ff:12.0: reg 0x20: [mem 0x-0x003f] [4.179708] pci :ff:12.0: reg 0x24: [mem 0x-0x000f] [4.186739] pci :ff:12.1: [8086:6f30] type 00 class 0x110100 [4.193493] pci :ff:12.4: [8086:6f60] type 00 class 0x088000 [4.200208] pci :ff:12.4: reg 0x14: [mem 0x-0x000f] [4.207210] pci :ff:12.4: reg 0x18: [mem 0x-0x003f] [4.214212] pci :ff:12.4: reg 0x1c: [mem 0x-0x000f] [4.221214] pci :ff:12.4: reg 0x20: [mem 0x-0x003f] [4.228215] pci :ff:12.4: reg 0x24: [mem 0x-0x000f] but for bios 0xff, there is no _CRS mmio. also [4.876484] pci :7f:1e.3: [Firmware Bug]: reg 0x10: invalid BAR (can't size) so it has silicon problem ? Thanks Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail
On Fri, Jul 10, 2015 at 5:03 PM, Wei Yang wrote: > On Thu, Jul 09, 2015 at 10:49:06PM -0700, Yinghai Lu wrote: > I have tested you latest branch with this one as the last commit: > > ec94cc7 PCI: Don't set flags to 0 when assign resource fail > > My P8 machine boots up. Good. > > Another issue is the SRIOV couldn't be enabled, I am checking the reason. > This may not related to this patch series. wonder if could be related to : https://git.kernel.org/cgit/linux/kernel/git/yinghai/linux-yinghai.git/patch/?id=c642f79dcd6becbb92741816e0b5e81f7664acc7 PCI: Restore pref mmio allocation logic for hostbridge without mmio64 Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail
On Thu, Jul 9, 2015 at 7:48 PM, Yinghai Lu wrote: > On Thu, Jul 9, 2015 at 7:30 PM, Wei Yang wrote: >> If you could update your for-pci-v4.3-next branch, that would be more >> convenient for me to do the test. > > Just updated that branch, please check it. > just updated the branch again. If you don't want to re get it again, please apply attached patch. --- drivers/pci/bus.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-2.6/drivers/pci/bus.c === --- linux-2.6.orig/drivers/pci/bus.c +++ linux-2.6/drivers/pci/bus.c @@ -140,7 +140,7 @@ static int pci_bus_alloc_from_region(str type_mask |= IORESOURCE_TYPE_BITS; pci_bus_for_each_resource(bus, r, i) { - if (!r) + if (!r || resource_disabled(r)) continue; /* type_mask must match */
Re: [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail
On Thu, Jul 9, 2015 at 7:30 PM, Wei Yang wrote: > If you could update your for-pci-v4.3-next branch, that would be more > convenient for me to do the test. Just updated that branch, please check it. Thanks Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail
On Wed, Jul 8, 2015 at 11:04 PM, Wei Yang wrote: > This one is on top of the last one ? or replace the last one? should be just before last one. Yinghai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail
On Wed, Jul 8, 2015 at 8:30 PM, Wei Yang wrote: > Hi, Yinghai > > This patch may introduce some problem. > > On my P8 machine, after applying this patch, I see following error: > > [0.589948] pnv_ioda_setup_pe_seg: trigger IO SEG 0 > [0.589992] pnv_ioda_setup_pe_seg: res[io 0x1000-0x3fff] 100 > > The last 0x100 is the res->flags, which indicates the UNSET and DISABLED bit > is not set. Maybe we should introduce resource_disabled() for that. Please check if attached patch would fix the problem. Thanks Yinghai Subject: [PATCH] PCI: Introduce resource_disabled() so we can cover !flags and IORESOURCE_DISABLED both. Signed-off-by: Yinghai Lu diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 82f738e..91a7153 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -282,7 +282,7 @@ pcibios_claim_one_bus(struct pci_bus *b) for (i = 0; i < PCI_NUM_RESOURCES; i++) { struct resource *r = &dev->resource[i]; - if (r->parent || !r->start || !r->flags) + if (r->parent || !r->start || resource_disabled(r)) continue; if (pci_has_flag(PCI_PROBE_ONLY) || (r->flags & IORESOURCE_PCI_FIXED)) { diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 7cc3be9..cc293ea 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -501,7 +501,7 @@ void pcibios_fixup_device_resources(struct pci_dev *dev) for (idx = 0; idx < PCI_BRIDGE_RESOURCES; idx++) { struct resource *r = &dev->resource[idx]; - if (!r->flags || r->parent || !r->start) + if (resource_disabled(r) || r->parent || !r->start) continue; pci_claim_resource(dev, idx); @@ -519,7 +519,7 @@ static void pcibios_fixup_bridge_resources(struct pci_dev *dev) for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) { struct resource *r = &dev->resource[idx]; - if (!r->flags || r->parent || !r->start) + if (resource_disabled(r) || r->parent || !r->start) continue; pci_claim_bridge_resource(dev, idx); diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index 
ae838ed..67848f8 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -705,7 +705,7 @@ static void pcibios_fixup_resources(struct pci_dev *dev) } for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { struct resource *res = dev->resource + i; - if (!res->flags) + if (resource_disabled(res)) continue; if (res->start == 0) { pr_debug("PCI:%s Resource %d %016llx-%016llx [%x]", @@ -806,7 +806,7 @@ static void pcibios_fixup_bridge(struct pci_bus *bus) pci_bus_for_each_resource(bus, res, i) { if (!res) continue; - if (!res->flags) + if (resource_disabled(res)) continue; if (i >= 3 && bus->self->transparent) continue; @@ -993,7 +993,7 @@ static void pcibios_allocate_bus_resources(struct pci_bus *bus) pci_domain_nr(bus), bus->number); pci_bus_for_each_resource(bus, res, i) { - if (!res || !res->flags + if (!res || resource_disabled(res) || res->start > res->end || res->parent) continue; if (bus->parent == NULL) @@ -1095,7 +1095,8 @@ static void __init pcibios_allocate_resources(int pass) r = &dev->resource[idx]; if (r->parent) /* Already allocated */ continue; - if (!r->flags || (r->flags & IORESOURCE_UNSET)) + if (resource_disabled(r) || + (r->flags & IORESOURCE_UNSET)) continue; /* Not assigned at all */ /* We only allocate ROMs on pass 1 just in case they * have been screwed up by firmware @@ -1226,7 +1227,7 @@ void pcibios_claim_one_bus(struct pci_bus *bus) for (i = 0; i < PCI_NUM_RESOURCES; i++) { struct resource *r = &dev->resource[i]; - if (r->parent || !r->start || !r->flags) + if (r->parent || !r->start || resource_disabled(r)) continue; pr_debug("PCI: Claiming %s: ", pci_name(dev)); @@ -1286,7 +1287,7 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose, res->start = (res->start + io_offset) & 0xu; res->end = (res->end + io_offset) & 0xu; - if (!res->flags) { + if (resource_disabled(res)) { pr_warn("PCI: I/O resource not set for host "); pr_cont("bridge %s (domain %d)\n", hose->dn->full_name, hose->global_number); @@ 
-1306,7 +1307,7 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose, /* Hookup PHB Memory resources */ for (i = 0; i < 3; ++i) { res = &hose->mem_resources[i]; - if (!res->flags) { + if (resource_disabled(res)) { if (i > 0) continue; pr_err("PCI: Memory resource 0 not set for "); diff --git a/arch/mn10300/unit-asb2305/pci-asb2305.c b/arch/mn10300/unit-asb2305/pci-asb2305.c index b5b036f..a249821 100644
[PATCH 15/42] x86, kaslr: Introduce find_random_virt_offset to randomize the kernel text mapping address
From: Baoquan He KASLR extended the kernel text mapping region size from 512M to 1G, namely CONFIG_RANDOMIZE_BASE_MAX_OFFSET. This means the kernel text can be mapped into the following region: [__START_KERNEL_map + LOAD_PHYSICAL_ADDR, __START_KERNEL_map + 1G] Introduce a function find_random_virt_offset() to get a random value between LOAD_PHYSICAL_ADDR and CONFIG_RANDOMIZE_BASE_MAX_OFFSET. This random value will be added to __START_KERNEL_map to get the starting address from which the kernel text is mapped. Since the slot can be anywhere in this region, i.e. the region is a single independent slot_area, it is simple to pick a slot according to the random value. Signed-off-by: Baoquan He --- arch/x86/boot/compressed/aslr.c | 21 + 1 file changed, 21 insertions(+) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 81070e9..775c6f9 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -366,6 +366,27 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } +static unsigned long find_random_virt_offset(unsigned long minimum, + unsigned long image_size) +{ + unsigned long slot_num, random; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + + if (image_size <= CONFIG_PHYSICAL_ALIGN) + slot_num = (CONFIG_RANDOMIZE_BASE_MAX_OFFSET - minimum) / + CONFIG_PHYSICAL_ALIGN; + else + slot_num = (CONFIG_RANDOMIZE_BASE_MAX_OFFSET - + minimum - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + random = get_random_long() % slot_num; + + return random * CONFIG_PHYSICAL_ALIGN + minimum; +} + unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 11/42] x86, boot: Add checking for memcpy
parse_elf is using local memcpy to move section to running position. That memcpy actually only support no overlapping or dest < src. Add checking in memcpy to find out wrong with future use, at that time we will need to have backward memcpy for it. Also put comments in parse_elf about the fact. Signed-off-by: Yinghai Lu --- arch/x86/boot/compressed/misc.c | 14 +++--- arch/x86/boot/compressed/misc.h | 2 ++ arch/x86/boot/compressed/string.c | 28 ++-- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 8fb74ba..83f98a5 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -106,9 +106,6 @@ #undef memset #define memzero(s, n) memset((s), 0, (n)) - -static void error(char *m); - /* * This is set up by the setup-routine at boot-time */ @@ -218,7 +215,7 @@ void __putstr(const char *s) outb(0xff & (pos >> 1), vidport+1); } -static void error(char *x) +void error(char *x) { error_putstr("\n\n"); error_putstr(x); @@ -353,9 +350,12 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memcpy(dest, - output + phdr->p_offset, - phdr->p_filesz); + /* +* simple version memcpy only can work when dest is +* smaller than src or no overlapping. +* Here dest is smaller than src always. 
+*/ + memcpy(dest, output + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 0104c0be..af135b7 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -36,6 +36,8 @@ extern struct boot_params *real_mode; /* Pointer to real-mode data */ void __putstr(const char *s); #define error_putstr(__x) __putstr(__x) +void error(char *x); + #ifdef CONFIG_X86_VERBOSE_BOOTUP #define debug_putstr(__x) __putstr(__x) diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 00e788b..03805a4 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,7 +1,7 @@ #include "../string.c" #ifdef CONFIG_X86_32 -void *memcpy(void *dest, const void *src, size_t n) +void *__memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( @@ -15,7 +15,7 @@ void *memcpy(void *dest, const void *src, size_t n) return dest; } #else -void *memcpy(void *dest, const void *src, size_t n) +void *__memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( @@ -30,6 +30,30 @@ void *memcpy(void *dest, const void *src, size_t n) } #endif +void *memcpy(void *dest, const void *src, size_t n) +{ + unsigned long start_dest, end_dest; + unsigned long start_src, end_src; + unsigned long max_start, min_end; + + if (dest < src) + return __memcpy(dest, src, n); + + start_dest = (unsigned long)dest; + end_dest = (unsigned long)dest + n; + start_src = (unsigned long)src; + end_src = (unsigned long)src + n; + max_start = (start_dest > start_src) ? start_dest : start_src; + min_end = (end_dest < end_src) ? 
end_dest : end_src; + + if (max_start >= min_end) + return __memcpy(dest, src, n); + + error("memcpy does not support overlapping with dest > src!\n"); + + return dest; +} + void *memset(void *s, int c, size_t n) { int i; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 08/42] x86, kaslr: Get correct max_addr for relocs pointer
There is boundary checking for pointer in kaslr relocation handling. Current code is using output_len, and that is VO (vmlinux after objcopy) file size plus vmlinux.relocs file size. That is not right, as we should use loaded address for running. At that time parse_elf already move the sections according to ELF headers. The valid range should be VO [_text, __bss_start) loaded physical addresses. In the patch, add export for __bss_start to voffset.h and use it to get max_addr. Signed-off-by: Yinghai Lu --- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/misc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 50daea7..e12a93c 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -40,7 +40,7 @@ LDFLAGS_vmlinux := -T hostprogs-y:= mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index db97bdf..8fb74ba 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -234,7 +234,7 @@ static void handle_relocations(void *output, unsigned long output_len) int *reloc; unsigned long delta, map, ptr; unsigned long min_addr = (unsigned long)output; - unsigned long max_addr = min_addr + output_len; + unsigned long max_addr = min_addr + (VO___bss_start - VO__text); /* * Calculate the delta between where vmlinux was linked to load -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html 
Please read the FAQ at http://www.tux.org/lkml/
[PATCH 14/42] x86, kaslr: Add two functions which will be used later
From: Baoquan He Add two functions mem_min_overlap() and store_slot_info() which will be used later. Given a memory region mem_min_overlap will iterate all avoid region to find the first one which overlap with it. store_slot_info() calculates the slot info of passed in region and store it into slot_areas[]. Signed-off-by: Baoquan He --- arch/x86/boot/compressed/aslr.c | 51 + 1 file changed, 51 insertions(+) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index e3995f1..81070e9 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -214,6 +214,40 @@ static bool mem_avoid_overlap(struct mem_vector *img) return false; } +static unsigned long +mem_min_overlap(struct mem_vector *img, struct mem_vector *out) +{ + int i; + struct setup_data *ptr; + unsigned long min = img->start + img->size; + + for (i = 0; i < MEM_AVOID_MAX; i++) { + if (mem_overlaps(img, &mem_avoid[i]) && + (mem_avoid[i].start < min)) { + *out = mem_avoid[i]; + min = mem_avoid[i].start; + } + } + + /* Check all entries in the setup_data linked list. 
*/ + ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data; + while (ptr) { + struct mem_vector avoid; + + avoid.start = (unsigned long)ptr; + avoid.size = sizeof(*ptr) + ptr->len; + + if (mem_overlaps(img, &avoid) && (avoid.start < min)) { + *out = avoid; + min = avoid.start; + } + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + return min; +} + static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / CONFIG_PHYSICAL_ALIGN]; @@ -230,6 +264,23 @@ static unsigned long slot_max; static unsigned long slot_area_index; +static void store_slot_info(struct mem_vector *region, unsigned long image_size) +{ + struct slot_area slot_area; + + slot_area.addr = region->start; + if (image_size <= CONFIG_PHYSICAL_ALIGN) + slot_area.num = region->size / CONFIG_PHYSICAL_ALIGN; + else + slot_area.num = (region->size - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + if (slot_area.num > 0) { + slot_areas[slot_area_index++] = slot_area; + slot_max += slot_area.num; + } +} + static void slots_append(unsigned long addr) { /* Overflowing the slots list should be impossible. */ -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 12/42] x86, kaslr: Fix a bug that relocation can not be handled when kernel is loaded above 2G
From: Baoquan He When processing the 32-bit relocation table, a local variable 'extended' is defined to calculate the physical address of each relocs entry. However, its type is int, which is enough for i386 but not for x86_64. That is why relocations can only be handled when the kernel is loaded below 2G; otherwise an overflow will happen and cause a system hang. Change it to long, as the 32-bit inverse relocation processing does; this change is safe for i386 relocation handling too. Signed-off-by: Baoquan He --- arch/x86/boot/compressed/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 83f98a5..bfa4f0a 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -273,7 +273,7 @@ static void handle_relocations(void *output, unsigned long output_len) * So we work backwards from the end of the decompressed image. */ for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { - int extended = *reloc; + long extended = *reloc; extended += map; ptr = (unsigned long)extended; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 18/42] x86, kaslr: Remove useless code
From: Baoquan He Several auxiliary functions and slots[] are not needed any more since struct slot_area is used to store the slot info of kaslr now. Hence remove them in this patch. Signed-off-by: Baoquan He --- arch/x86/boot/compressed/aslr.c | 24 1 file changed, 24 deletions(-) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 9158882..7c0e1da 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -112,17 +112,6 @@ struct mem_vector { #define MEM_AVOID_MAX 4 static struct mem_vector mem_avoid[MEM_AVOID_MAX]; -static bool mem_contains(struct mem_vector *region, struct mem_vector *item) -{ - /* Item at least partially before region. */ - if (item->start < region->start) - return false; - /* Item at least partially after region. */ - if (item->start + item->size > region->start + region->size) - return false; - return true; -} - static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) { /* Item one is entirely before item two. */ @@ -248,9 +237,6 @@ mem_min_overlap(struct mem_vector *img, struct mem_vector *out) return min; } -static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN]; - struct slot_area { unsigned long addr; int num; @@ -281,16 +267,6 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) } } -static void slots_append(unsigned long addr) -{ - /* Overflowing the slots list should be impossible. */ - if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN) - return; - - slots[slot_max++] = addr; -} - static unsigned long slots_fetch_random(void) { unsigned long random; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 19/42] x86, kaslr: Allow random address could be below loaded address
Now new output buffer is always after current one. With correct tracking in mem_avoid, we can buffer below that. That would make sure when bootloader like patched grub2 or kexec have put output rather near the end of ram, we still can get random base below output. Now just pick 512M as min_addr. with this patch, will get: early console in decompress_kernel decompress_kernel: input: [0x13e9ee3b4-0x13f36b9df], output: [0x13c00-0x13f394fff], heap: [0x13f376ac0-0x13f37eabf] boot via startup_64 KASLR using RDTSC... KASLR using RDTSC... new output: [0x6f00-0x72394fff] Decompressing Linux... xz... Parsing ELF... Performing relocations... done. Booting the kernel. [0.00] bootconsole [uart0] enabled [0.00] Kernel Layout: [0.00] .text: [0x6f00-0x70096a9c] [0.00] .rodata: [0x7020-0x70a4efff] [0.00] .data: [0x70c0-0x70e4e9bf] [0.00] .init: [0x70e5-0x7120bfff] [0.00].bss: [0x71219000-0x7234efff] [0.00].brk: [0x7234f000-0x72374fff] Signed-off-by: Yinghai Lu --- arch/x86/boot/compressed/aslr.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 7c0e1da..a1535c1 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -403,7 +403,8 @@ void choose_kernel_location(unsigned char *input, unsigned long output_run_size, unsigned char **virt_offset) { - unsigned long random; + unsigned long random, min_addr; + *virt_offset = (unsigned char *)LOAD_PHYSICAL_ADDR; #ifdef CONFIG_HIBERNATION @@ -424,8 +425,13 @@ void choose_kernel_location(unsigned char *input, mem_avoid_init((unsigned long)input, input_size, (unsigned long)*output); + /* start from 512M */ + min_addr = (unsigned long)*output; + if (min_addr > (512UL<<20)) + min_addr = 512UL<<20; + /* Walk e820 and find a random address. 
*/ - random = find_random_phy_addr((unsigned long)*output, output_run_size); + random = find_random_phy_addr(min_addr, output_run_size); if (!random) debug_putstr("KASLR could not find suitable E820 region...\n"); else { -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 13/42] x86, kaslr: Introduce struct slot_area to manage randomization slot info
From: Baoquan He The kernel is expected to be randomly relocated anywhere in the whole physical memory area, which could be nearly 64T at most. In this case there could be about 4*1024*1024 randomization slots. The old slot array would therefore cost too much memory, and storing the slot information one by one into the slot array is also inefficient. Introduce struct slot_area to manage the randomization slot info of one contiguous memory area, excluding the avoid areas. slot_areas[] is used to store all slot area info. Since setup_data is a linked list that can contain many entries chained by pointers, excluding them can split RAM into many smaller areas; only the first 100 slot areas are kept if there are too many of them. Signed-off-by: Baoquan He --- arch/x86/boot/compressed/aslr.c | 12 1 file changed, 12 insertions(+) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 0990c78..e3995f1 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -216,8 +216,20 @@ static bool mem_avoid_overlap(struct mem_vector *img) static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / CONFIG_PHYSICAL_ALIGN]; + +struct slot_area { + unsigned long addr; + int num; +}; + +#define MAX_SLOT_AREA 100 + +static struct slot_area slot_areas[MAX_SLOT_AREA]; + static unsigned long slot_max; +static unsigned long slot_area_index; + static void slots_append(unsigned long addr) { /* Overflowing the slots list should be impossible. */ -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 20/42] x86, boot: Add printf support for early console in compressed/misc.c
Reuse printf.c in x86 setup code. And print out decompress_kernel input and output info. Later decompresser code could print out more info for debug info. Signed-off-by: Yinghai Lu --- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/misc.c | 38 ++ arch/x86/boot/compressed/misc.h | 7 +++ arch/x86/boot/compressed/printf.c | 5 + 4 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 arch/x86/boot/compressed/printf.c diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 66461b4..8fc7dd9 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -54,7 +54,7 @@ $(obj)/misc.o: $(obj)/../voffset.h vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/string.o $(obj)/cmdline.o \ - $(obj)/piggy.o $(obj)/cpuflags.o + $(obj)/printf.o $(obj)/piggy.o $(obj)/cpuflags.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 6b2a308..ee73b7b 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -387,6 +387,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, unsigned char *output_orig = output; unsigned long output_run_size; unsigned char *virt_offset; + unsigned long init_size; real_mode = rmode; @@ -414,6 +415,37 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, output_run_size = output_len > run_size ? 
output_len : run_size; + init_size = real_mode->hdr.init_size; + debug_putstr("decompress_kernel:\n"); + debug_printf(" input: [0x%010lx-0x%010lx]\n", +(unsigned long)input_data, +(unsigned long)input_data + input_len - 1); + debug_printf(" output: [0x%010lx-0x%010lx] 0x%08lx: output_len\n", +(unsigned long)output, +(unsigned long)output + output_len - 1, +(unsigned long)output_len); + debug_printf(" [0x%010lx-0x%010lx] 0x%08lx: run_size\n", +(unsigned long)output, +(unsigned long)output + run_size - 1, +(unsigned long)run_size); + debug_printf(" [0x%010lx-0x%010lx] 0x%08lx: output_run_size\n", +(unsigned long)output, +(unsigned long)output + output_run_size - 1, +(unsigned long)output_run_size); + debug_printf(" [0x%010lx-0x%010lx] 0x%08lx: init_size\n", +(unsigned long)output, +(unsigned long)output + init_size - 1, +(unsigned long)init_size); + debug_printf("ZO text/data: [0x%010lx-0x%010lx]\n", +(unsigned long)input_data + input_len, +(unsigned long)output + init_size - 1); + debug_printf(" ZO heap: [0x%010lx-0x%010lx]\n", +(unsigned long)heap, +(unsigned long)heap + BOOT_HEAP_SIZE - 1); + debug_printf(" VO bss/brk: [0x%010lx-0x%010lx]\n", +(unsigned long)output + (VO___bss_start - VO__text), +(unsigned long)output + run_size - 1); + /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the @@ -422,6 +454,12 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, choose_kernel_location(input_data, input_len, &output, output_run_size, &virt_offset); + if (output != output_orig) + debug_printf(" new output: [0x%010lx-0x%010lx] 0x%08lx: output_run_size\n", +(unsigned long)output, +(unsigned long)output + output_run_size - 1, +(unsigned long)output_run_size); + /* Validate memory location choices. 
*/ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) error("Destination address inappropriately aligned"); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index b44a7c0..410e5d3 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -38,14 +38,21 @@ void __putstr(const char *s); void error(char *x); +/* printf.c */ +int sprintf(char *buf, const char *fmt, ...); +int printf(const char *fmt, ...); + #ifdef CONFIG_X86_VERBOSE_BOOTUP #define debug_putstr(__x) __putstr(__x) +#define debug_printf printf #else static inline void debug_putstr(const char *s) { } +static inline int debug_printf(const char *fmt, ...) +{ } #endif
[PATCH 33/42] x86, boot: Add add_pci handler for SETUP_PCI
Let it reserve setup_data, and keep it's own list. Also clear the hdr.setup_data, as all handler now handle or reserve setup_data locally already. Cc: Bjorn Helgaas Cc: Matt Fleming Cc: linux-...@vger.kernel.org Signed-off-by: Yinghai Lu --- arch/x86/include/asm/pci.h | 2 ++ arch/x86/kernel/setup.c| 8 arch/x86/pci/common.c | 42 -- 3 files changed, 38 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 4625943..7d2468c 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -80,8 +80,10 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, #ifdef CONFIG_PCI extern void early_quirks(void); +void add_pci(u64 pa_data); #else static inline void early_quirks(void) { } +static inline void add_pci(u64 pa_data) { } #endif extern void pci_iommu_alloc(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a3b65f1..de0f830 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -440,6 +440,8 @@ static void __init parse_setup_data(void) pa_next = data->next; early_memunmap(data, sizeof(*data)); + printk(KERN_DEBUG "setup_data type: %d @ %#010llx\n", + data_type, pa_data); switch (data_type) { case SETUP_E820_EXT: parse_e820_ext(pa_data, data_len); @@ -447,14 +449,20 @@ static void __init parse_setup_data(void) case SETUP_DTB: add_dtb(pa_data); break; + case SETUP_PCI: + add_pci(pa_data); + break; case SETUP_EFI: parse_efi_setup(pa_data, data_len); break; default: + pr_warn("Unknown setup_data type: %d @ %#010llx ignored!\n", + data_type, pa_data); break; } pa_data = pa_next; } + boot_params.hdr.setup_data = 0; /* all done */ } static void __init memblock_x86_reserve_range_setup_data(void) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 8fd6f44..16ace12 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -641,31 +642,44 @@ unsigned int 
pcibios_assign_all_busses(void) return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0; } +static u64 pci_setup_data; +void __init add_pci(u64 pa_data) +{ + struct setup_data *data; + + data = early_memremap(pa_data, sizeof(*data)); + memblock_reserve(pa_data, sizeof(*data) + data->len); + data->next = pci_setup_data; + pci_setup_data = pa_data; + early_memunmap(data, sizeof(*data)); +} + int pcibios_add_device(struct pci_dev *dev) { struct setup_data *data; struct pci_setup_rom *rom; u64 pa_data; - pa_data = boot_params.hdr.setup_data; + pa_data = pci_setup_data; while (pa_data) { data = ioremap(pa_data, sizeof(*rom)); if (!data) return -ENOMEM; - if (data->type == SETUP_PCI) { - rom = (struct pci_setup_rom *)data; - - if ((pci_domain_nr(dev->bus) == rom->segment) && - (dev->bus->number == rom->bus) && - (PCI_SLOT(dev->devfn) == rom->device) && - (PCI_FUNC(dev->devfn) == rom->function) && - (dev->vendor == rom->vendor) && - (dev->device == rom->devid)) { - dev->rom = pa_data + - offsetof(struct pci_setup_rom, romdata); - dev->romlen = rom->pcilen; - } + rom = (struct pci_setup_rom *)data; + + if ((pci_domain_nr(dev->bus) == rom->segment) && + (dev->bus->number == rom->bus) && + (PCI_SLOT(dev->devfn) == rom->device) && + (PCI_FUNC(dev->devfn) == rom->function) && + (dev->vendor == rom->vendor) && + (dev->device == rom->devid)) { + dev->rom = pa_data + + offsetof(struct pci_setup_rom, romdata); + dev->romlen = rom->pcilen; + dev_printk(KERN_DEBUG, &dev->dev, "set rom to [%#010lx, %#010lx] via SETUP_PCI\n", +
[PATCH 31/42] x86, efi: Copy SETUP_EFI data and access directly
The copy will be in __initdata, and it is small. We can use pointer to access the setup_data instead of using early_memmap everywhere. Cc: Matt Fleming Cc: linux-...@vger.kernel.org Signed-off-by: Yinghai Lu --- arch/x86/include/asm/efi.h | 2 +- arch/x86/platform/efi/efi.c| 13 ++--- arch/x86/platform/efi/efi_64.c | 10 +- arch/x86/platform/efi/quirks.c | 23 ++- 4 files changed, 18 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 155162e..a3e3aee 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -116,7 +116,7 @@ struct efi_setup_data { u64 reserved[8]; }; -extern u64 efi_setup; +extern struct efi_setup_data *efi_setup; #ifdef CONFIG_EFI diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index cfba30f..33036ce 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -68,7 +68,7 @@ static efi_config_table_type_t arch_tables[] __initdata = { {NULL_GUID, NULL, NULL}, }; -u64 efi_setup; /* efi setup_data physical address */ +struct efi_setup_data *efi_setup __initdata; /* cached efi setup_data pointer */ static int add_efi_memmap __initdata; static int __init setup_add_efi_memmap(char *arg) @@ -257,20 +257,13 @@ static int __init efi_systab_init(void *phys) { if (efi_enabled(EFI_64BIT)) { efi_system_table_64_t *systab64; - struct efi_setup_data *data = NULL; + struct efi_setup_data *data = efi_setup; u64 tmp = 0; - if (efi_setup) { - data = early_memremap(efi_setup, sizeof(*data)); - if (!data) - return -ENOMEM; - } systab64 = early_memremap((unsigned long)phys, sizeof(*systab64)); if (systab64 == NULL) { pr_err("Couldn't map the system table!\n"); - if (data) - early_memunmap(data, sizeof(*data)); return -ENOMEM; } @@ -303,8 +296,6 @@ static int __init efi_systab_init(void *phys) tmp |= data ? 
data->tables : systab64->tables; early_memunmap(systab64, sizeof(*systab64)); - if (data) - early_memunmap(data, sizeof(*data)); #ifdef CONFIG_X86_32 if (tmp >> 32) { pr_err("EFI data located above 4GB, disabling EFI.\n"); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a0ac0f9..a255491 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -295,9 +295,17 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, return (void __iomem *)__va(phys_addr); } +static struct efi_setup_data efi_setup_data __initdata; + void __init parse_efi_setup(u64 phys_addr, u32 data_len) { - efi_setup = phys_addr + sizeof(struct setup_data); + struct efi_setup_data *data; + + data = early_memremap(phys_addr + sizeof(struct setup_data), + sizeof(*data)); + efi_setup_data = *data; + early_memunmap(data, sizeof(*data)); + efi_setup = &efi_setup_data; } void __init efi_runtime_mkexec(void) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 1c7380d..45fec7d 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -203,9 +203,8 @@ void __init efi_free_boot_services(void) */ int __init efi_reuse_config(u64 tables, int nr_tables) { - int i, sz, ret = 0; + int i, sz; void *p, *tablep; - struct efi_setup_data *data; if (!efi_setup) return 0; @@ -213,22 +212,15 @@ int __init efi_reuse_config(u64 tables, int nr_tables) if (!efi_enabled(EFI_64BIT)) return 0; - data = early_memremap(efi_setup, sizeof(*data)); - if (!data) { - ret = -ENOMEM; - goto out; - } - - if (!data->smbios) - goto out_memremap; + if (!efi_setup->smbios) + return 0; sz = sizeof(efi_config_table_64_t); p = tablep = early_memremap(tables, nr_tables * sz); if (!p) { pr_err("Could not map Configuration table!\n"); - ret = -ENOMEM; - goto out_memremap; + return -ENOMEM; } for (i = 0; i < efi.systab->nr_tables; i++) { @@ -237,15 +229,12 @@ int __init efi_reuse_config(u64 tables, int 
nr_tables) guid = ((efi_config_table_64_t *)p)->guid; if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) -
[PATCH 40/42] x86, 64bit: remove highmap for not needed ranges
add cleanup_highmap_late to remove highmap for initmem, around rodata, and [_brk_end, all_end). Kernel Layout: [0.00] .text: [0x0100-0x0200df88] [0.00] .rodata: [0x0220-0x02a1dfff] [0.00] .data: [0x02c0-0x02e510ff] [0.00] .init: [0x02e53000-0x03213fff] [0.00].bss: [0x03222000-0x0437cfff] [0.00].brk: [0x0437d000-0x043a2fff] Actually used brk: [0.270365] memblock_reserve: [0x000437d000-0x0004383fff] flags 0x0 BRK Before patch: ---[ High Kernel Mapping ]--- 0x8000-0x8100 16M pmd 0x8100-0x8200 16M ro PSE GLB x pmd 0x8200-0x82011000 68K ro GLB x pte 0x82011000-0x82201980K RW GLB x pte 0x8220-0x82a0 8M ro PSE GLB NX pmd 0x82a0-0x82a1e000 120K ro GLB NX pte 0x82a1e000-0x82c01928K RW GLB NX pte 0x82c0-0x82e0 2M RW PSE GLB NX pmd 0x82e0-0x8300 2M RW GLB NX pte 0x8300-0x8320 2M RW PSE GLB NX pmd 0x8320-0x8340 2M RW GLB NX pte 0x8340-0x8440 16M RW PSE GLB NX pmd 0x8440-0xa000 444M pmd After patch: ---[ High Kernel Mapping ]--- 0x8000-0x8100 16M pmd 0x8100-0x8200 16M ro PSE GLB x pmd 0x8200-0x82012000 72K ro GLB x pte 0x82012000-0x82201976K pte 0x8220-0x82a0 8M ro PSE GLB NX pmd 0x82a0-0x82a1e000 120K ro GLB NX pte 0x82a1e000-0x82c01928K pte 0x82c0-0x82e0 2M RW PSE GLB NX pmd 0x82e0-0x82e53000 332K RW GLB NX pte 0x82e53000-0x83001716K pte 0x8300-0x8320 2M pmd 0x8320-0x83214000 80K pte 0x83214000-0x83401968K RW GLB NX pte 0x8340-0x8420 14M RW PSE GLB NX pmd 0x8420-0x843840001552K RW GLB NX pte 0x84384000-0x8440 496K pte 0x8440-0xa000 444M pmd So remove some range around rodata. -v4: adapt it to all_end change. 
Signed-off-by: Yinghai Lu --- arch/x86/mm/init_64.c | 62 +++ 1 file changed, 62 insertions(+) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 2507b98..38aa59c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1010,6 +1010,61 @@ void __init mem_init(void) } #ifdef CONFIG_DEBUG_RODATA +static void remove_highmap_2m(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + pud_t *pud = (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr); + pmd_t *pmd = (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr); + + set_pmd(pmd, __pmd(0)); +} + +static void remove_highmap_2m_partial(unsigned long addr, unsigned long end) +{ + int i; + pgd_t *pgd = pgd_offset_k(addr); + pud_t *pud = (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr); + pmd_t *pmd = (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr); + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd) + pte_index(addr); + + for (i = pte_index(addr); i < pte_index(end - 1) + 1; i++, pte++) + set_pte(pte, __pte(0)); +} + +static void cleanup_highmap_late(unsigned long start, unsigned long end) +{ + unsigned long addr; + unsigned long start_2m_aligned = roundup(start, PMD_SIZE); + unsigned long end_2m_aligned = rounddown(end, PMD_SIZE); + + start = PFN_ALIGN(start); + end &= PAGE_MASK; + + if (start >= end) + return; + + if (start < start_2m_aligned) { + unsigned long tmp = min(start_2m_aligned, end); + + set_memory_4k(start, (tmp - start) >> PAGE_SHIFT); + remove_highmap_2m_partial(start, tmp); + } + + for (addr = start_2m_aligned; addr < end_2m_aligned; addr += PMD_SIZE) + remove_highmap_2m(addr); + + if (start <= end_2m_aligne
[PATCH 29/42] x86: Find correct 64 bit ramdisk address for microcode early update
When using kexec with a 64bit kernel, bzImage and ramdisk could be loaded above 4G. We need this to get the correct ramdisk address. Make get_ramdisk_image() and get_ramdisk_size() global and use them for early microcode updating. -v2: update changelog. Signed-off-by: Yinghai Lu --- arch/x86/include/asm/setup.h| 3 +++ arch/x86/kernel/cpu/microcode/amd_early.c | 10 +- arch/x86/kernel/cpu/microcode/intel_early.c | 8 arch/x86/kernel/setup.c | 28 ++-- 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 3e5aa41..496515b 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -119,6 +119,9 @@ void *extend_brk(size_t size, size_t align); RESERVE_BRK(name, sizeof(type) * entries) extern void probe_roms(void); +u64 get_ramdisk_image(struct boot_params *bp); +u64 get_ramdisk_size(struct boot_params *bp); + #ifdef __i386__ asmlinkage void __init i386_start_kernel(void); diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index e8a215a..4c579c7 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -51,12 +51,12 @@ static struct cpio_data __init find_ucode_in_initrd(void) */ p = (struct boot_params *)__pa_nodebug(&boot_params); path= (char *)__pa_nodebug(ucode_path); - start = (void *)p->hdr.ramdisk_image; - size= p->hdr.ramdisk_size; + start = (void *)(unsigned long)get_ramdisk_image(p); + size= get_ramdisk_size(p); #else path= ucode_path; - start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); - size= boot_params.hdr.ramdisk_size; + start = (void *)(get_ramdisk_image(&boot_params) + PAGE_OFFSET); + size= get_ramdisk_size(&boot_params); #endif return find_cpio_data(path, start, size, &offset); @@ -396,7 +396,7 @@ int __init save_microcode_in_initrd_amd(void) */ if (relocated_ramdisk) container = (u8 *)(__va(relocated_ramdisk) + -(cont - boot_params.hdr.ramdisk_image)); +(cont - 
get_ramdisk_size(&boot_params))); else container = cont_va; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 8187b72..c85dcb2 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -736,16 +736,16 @@ void __init load_ucode_intel_bsp(void) struct boot_params *p; p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; - size= p->hdr.ramdisk_size; + start = get_ramdisk_image(p); + size= get_ramdisk_size(p); _load_ucode_intel_bsp( (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), start, size); #else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; - size= boot_params.hdr.ramdisk_size; + start = get_ramdisk_image(&boot_params) + PAGE_OFFSET; + size= get_ramdisk_size(&boot_params); _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80f874b..2d808e6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -300,19 +300,19 @@ u64 relocated_ramdisk; #ifdef CONFIG_BLK_DEV_INITRD -static u64 __init get_ramdisk_image(void) +u64 __init get_ramdisk_image(struct boot_params *bp) { - u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_image = bp->hdr.ramdisk_image; - ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + ramdisk_image |= (u64)bp->ext_ramdisk_image << 32; return ramdisk_image; } -static u64 __init get_ramdisk_size(void) +u64 __init get_ramdisk_size(struct boot_params *bp) { - u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_size = bp->hdr.ramdisk_size; - ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + ramdisk_size |= (u64)bp->ext_ramdisk_size << 32; return ramdisk_size; } @@ -321,8 +321,8 @@ static u64 __init get_ramdisk_size(void) static void __init relocate_initrd(void) { /* Assume only end is not page aligned 
*/ - u64 ramdisk_image = get_ramdisk_image(); - u64 ramdisk_size = get_ramdisk_size(); + u64 ramdisk_image = get_ramdisk_image(&boot_params); + u64 ramdisk_size = get_ramdisk_size(&boot_params); u64 area_size = PAGE_ALIGN(ramdisk_size);
[PATCH 03/42] x86, boot: Fix run_size calculation
While looking at the boot code to add mem mapping for kasl with 64bit above 4G support, I found that e6023367d779 ("x86, kaslr: Prevent .bss from overlaping initrd") and later introduced way to get kernel run_size and pass it around. At first run_size calculation is via perl and then changed to shell scripts. At first, that calculation is not right in the shell scripts: it is using bss offset in the file plus bss/brk section size. run_size=$(( $offsetA + $sizeA + $sizeB )) Idx Name Size VMA LMA File off Algn ... 24 .bss 000a1000 825e 025e 019e 2**12 ALLOC 25 .brk 00026000 82681000 02681000 019e 2**0 ALLOC that run_size will be 27947008. it has extra not needed size as 1. we have hole between the sections in file to get aligned in file. 2. start of text is from 0x20 in elf file. [Nr] Name Type Address Offset Size EntSize Flags Link Info Align ... [25] .bss NOBITS 825e 019e 000a1000 WA 0 0 4096 [26] .brk NOBITS 82681000 019e 00026000 WA 0 0 1 Program Headers: Type Offset VirtAddr PhysAddr FileSizMemSiz Flags Align LOAD 0x0020 0x8100 0x0100 0x013a9000 0x013a9000 R E20 LOAD 0x0160 0x8240 0x0240 0x000ed000 0x000ed000 RW 20 LOAD 0x0180 0x 0x024ed000 0x00013698 0x00013698 RW 20 LOAD 0x01901000 0x82501000 0x02501000 0x000df000 0x001a6000 RWE20 NOTE 0x00e9d7dc 0x81c9d7dc 0x01c9d7dc 0x0024 0x0024 4 Section to Segment mapping: Segment Sections... 00 .text .notes .. 01 .data .vvar 02 .data..percpu 03 .init.text ... .bss .brk 04 .notes During decompress_kernel, parse_elf will move forward section to run time position. parse_elf: [0x009a00-0x009b3a8fff] <=== [0x009a20-0x009b5a8fff] parse_elf: [0x009b40-0x009b4ecfff] <=== [0x009b60-0x009b6ecfff] parse_elf: [0x009b4ed000-0x009b500697] <=== [0x009b80-0x009b813697] parse_elf: [0x009b501000-0x009b5d] <=== [0x009b901000-0x009b9d] Secondly it is not necessary. As run_size is simple constant, we don't need to pass it around and we already have voffset.h for that. 
We can share voffset.h between misc.c and header.S instead of adding other way to get run_size. In this patch, we move voffset.h creation code to boot/compressed/Makefile. Dependence was: boot/header.S ==> boot/voffset.h ==> vmlinux boot/header.S ==> compressed/vmlinux ==> compressed/misc.c Now become: boot/header.S ==> compressed/vmlinux ==> compressed/misc.c ==> boot/voffset.h ==> vmlinux Use macro in misc.c to replace passed run_size. Fixes: e6023367d779 ("x86, kaslr: Prevent .bss from overlaping initrd") Cc: Junjie Mao Cc: Kees Cook Cc: Josh Triplett Cc: Matt Fleming Cc: Andrew Morton Signed-off-by: Yinghai Lu --- arch/x86/boot/Makefile| 11 +-- arch/x86/boot/compressed/Makefile | 12 arch/x86/boot/compressed/misc.c | 3 +++ 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 57bbf2f..4d27e8b 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -77,15 +77,6 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' - -quiet_cmd_voffset = VOFFSET $@ - cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ - -targets += voffset.h -$(obj)/voffset.h: vmlinux FORCE - $(call if_changed,voffset) - sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ @@ -97,7 +88,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE AFLAGS_header.o += -I$(obj) -$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h +$(obj)/header.o: $(obj)/zoffset.h LDFLAGS_setup.elf := -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile in
[PATCH 21/42] x86, boot: Add more debug printout in compressed/misc.c
With the support that uses printf.c from the x86 setup code, print out more info for debugging. Signed-off-by: Yinghai Lu --- arch/x86/boot/compressed/misc.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index ee73b7b..a428c03 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -344,7 +344,7 @@ static void parse_elf(void *output) return; } - debug_putstr("Parsing ELF... "); + debug_putstr("Parsing ELF...\n"); phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum); if (!phdrs) @@ -369,6 +369,11 @@ static void parse_elf(void *output) * Here dest is smaller than src always. */ memcpy(dest, output + phdr->p_offset, phdr->p_filesz); + debug_printf(" parse_elf: [0x%010lx-0x%010lx] <=== [0x%010lx-0x%010lx]\n", + (unsigned long)dest, + (unsigned long)dest + phdr->p_filesz - 1, + (unsigned long)output + phdr->p_offset, + (unsigned long)output + phdr->p_offset + phdr->p_filesz - 1); break; default: /* Ignore other PT_* */ break; } @@ -475,6 +480,11 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, error("Wrong destination address"); #endif + debug_printf(" decompress: [0x%010lx-0x%010lx] <=== [0x%010lx-0x%010lx]\n", + (unsigned long)output, + (unsigned long)output + output_len - 1, + (unsigned long)input_data, + (unsigned long)input_data + input_len - 1); debug_putstr("\nDecompressing Linux... "); decompress(input_data, input_len, NULL, NULL, output, NULL, error); parse_elf(output); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 32/42] x86, of: Let add_dtb reserve setup_data locally
We will not reserve setup_data in generic code. Every handler needs to reserve and copy setup_data locally. The current dtb handling already has code for copying, so just add the reserve code. Also simplify the code a bit by storing the real dtb size. Cc: Rob Herring Cc: David Vrabel Signed-off-by: Yinghai Lu --- arch/x86/include/asm/prom.h | 9 ++--- arch/x86/kernel/devicetree.c | 39 +-- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 1d081ac..fb716eddc 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -24,17 +24,20 @@ #ifdef CONFIG_OF extern int of_ioapic; -extern u64 initial_dtb; -extern void add_dtb(u64 data); void x86_of_pci_init(void); void x86_dtb_init(void); #else -static inline void add_dtb(u64 data) { } static inline void x86_of_pci_init(void) { } static inline void x86_dtb_init(void) { } #define of_ioapic 0 #endif +#ifdef CONFIG_OF_FLATTREE +extern void add_dtb(u64 data); +#else +static inline void add_dtb(u64 data) { } +#endif + extern char cmd_line[COMMAND_LINE_SIZE]; #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 1f4acd6..19fb3cf 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -2,6 +2,7 @@ * Architecture specific OF callbacks. 
*/ #include +#include #include #include #include @@ -23,7 +24,6 @@ #include #include -__initdata u64 initial_dtb; char __initdata cmd_line[COMMAND_LINE_SIZE]; int __initdata of_ioapic; @@ -43,11 +43,23 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)); } +#ifdef CONFIG_OF_FLATTREE +static u64 initial_dtb __initdata; +static u32 initial_dtb_size __initdata; void __init add_dtb(u64 data) { + u32 map_len; + initial_dtb = data + offsetof(struct setup_data, data); -} + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); + initial_boot_params = early_memremap(initial_dtb, map_len); + initial_dtb_size = of_get_flat_dt_size(); + early_memunmap(initial_boot_params, map_len); + initial_boot_params = NULL; + memblock_reserve(initial_dtb, initial_dtb_size); +} +#endif /* * CE4100 ids. Will be moved to machine_device_initcall() once we have it. */ @@ -265,31 +277,22 @@ static void __init dtb_apic_setup(void) dtb_ioapic_setup(); } -#ifdef CONFIG_OF_FLATTREE static void __init x86_flattree_get_config(void) { - u32 size, map_len; +#ifdef CONFIG_OF_FLATTREE void *dt; if (!initial_dtb) return; - map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); - - initial_boot_params = dt = early_memremap(initial_dtb, map_len); - size = of_get_flat_dt_size(); - if (map_len < size) { - early_memunmap(dt, map_len); - initial_boot_params = dt = early_memremap(initial_dtb, size); - map_len = size; - } - + initial_boot_params = dt = early_memremap(initial_dtb, + initial_dtb_size); unflatten_and_copy_device_tree(); - early_memunmap(dt, map_len); -} -#else -static inline void x86_flattree_get_config(void) { } + early_memunmap(dt, initial_dtb_size); + + memblock_free(initial_dtb, initial_dtb_size); #endif +} void __init x86_dtb_init(void) { -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 10/42] x86, 64bit: Set ident_mapping for kaslr
The current aslr only supports randomization within a near range, and the new range still uses the old mapping. It also does not support a new range above 4G. We need to have an ident mapping for the new range before we can decompress to the new output and later run from it. In this patch, we add ident mappings for all needed ranges. First, to let aslr put a random VO above 4G, we must set an ident mapping for the new range when we come in via the startup_32 path. Second, when booting from a 64bit bootloader, the bootloader sets the ident mapping and boots via ZO (arch/x86/boot/compressed/vmlinux) startup_64. The pages used for that pagetable need to be avoided when we select the new random VO (vmlinux) base; otherwise the decompressor would overwrite them during decompression. One way would be to walk through the pagetable and find every page that is used by the pagetable for every mem_avoid check, but that needs extra code and may require increasing the mem_avoid array size to hold them. The other way is to create a new ident mapping instead: the pages for the pagetable come from the _pgtable section of ZO, and they are in the mem_avoid array already. In this way, we can reuse the code for ident mapping. The _pgtable will be shared by the 32bit and 64bit paths to reduce init_size, as ZO _rodata to _end now contributes to init_size. We need to increase the pgt buffer size. When booting via startup_64, we need to cover the old VO, params, cmdline and the new VO; in the extreme case they could all cross a 512G boundary, which needs (2+2)*4 pages with 2M mapping. We also need 2 pages for the first 2M for vga ram, plus one for level4. The total is 19 pages. When booting via startup_32, aslr could move the new VO above 4G, so we need to set an extra ident mapping for the new VO; the pgt buffer comes from _pgtable at an offset of 6 pages. It should only need (2+2) pages at most when it crosses a 512G boundary. So 19 pages make both paths happy. 
Cc: Kees Cook Cc: Jiri Kosina Cc: Matt Fleming Signed-off-by: Yinghai Lu --- arch/x86/boot/compressed/Makefile | 3 ++ arch/x86/boot/compressed/aslr.c | 14 ++ arch/x86/boot/compressed/head_64.S | 4 +- arch/x86/boot/compressed/misc.h | 11 + arch/x86/boot/compressed/misc_pgt.c | 91 + arch/x86/include/asm/boot.h | 19 6 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 arch/x86/boot/compressed/misc_pgt.c diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e12a93c..66461b4 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -58,6 +58,9 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o +ifdef CONFIG_X86_64 + vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/misc_pgt.o +endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index d753fb3..0990c78 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -151,6 +151,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, */ mem_avoid[0].start = input; mem_avoid[0].size = (output + init_size) - input; + fill_pagetable(input, (output + init_size) - input); /* Avoid initrd. */ initrd_start = (u64)real_mode->ext_ramdisk_image << 32; @@ -159,6 +160,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, initrd_size |= real_mode->hdr.ramdisk_size; mem_avoid[1].start = initrd_start; mem_avoid[1].size = initrd_size; + /* don't need to set mapping for initrd */ /* Avoid kernel command line. 
*/ cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32; @@ -169,10 +171,19 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, ; mem_avoid[2].start = cmd_line; mem_avoid[2].size = cmd_line_size; + fill_pagetable(cmd_line, cmd_line_size); /* Avoid params */ mem_avoid[3].start = (unsigned long)real_mode; mem_avoid[3].size = sizeof(*real_mode); + fill_pagetable((unsigned long)real_mode, sizeof(*real_mode)); + + /* don't need to set mapping for setup_data */ + +#ifdef CONFIG_X86_VERBOSE_BOOTUP + /* for video ram */ + fill_pagetable(0, PMD_SIZE); +#endif } /* Does this memory vector overlap a known avoided area? */ @@ -330,6 +341,9 @@ unsigned char *choose_kernel_location(unsigned char *input, goto out; choice = random; + + fill_pagetable(choice, output_run_size); + switch_pagetable(); out: return (unsigned char *)choice; } diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 3691451..075bb15 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arc
[PATCH 23/42] x86, setup: Use puts() instead of printf() in edd code
These messages contain no format specifiers, so there is no need to use printf() there; puts() is sufficient. Signed-off-by: Yinghai Lu --- arch/x86/boot/edd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 223e425..88d7c7f 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -157,7 +157,7 @@ void query_edd(void) */ if (!be_quiet) - printf("Probing EDD (edd=off to disable)... "); + puts("Probing EDD (edd=off to disable)... "); for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) { /* @@ -176,7 +176,7 @@ void query_edd(void) } if (!be_quiet) - printf("ok\n"); + puts("ok\n"); } #endif -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 28/42] x86, boot: Allow 64bit EFI kernel to be loaded above 4G
It is now possible to use kexec to place kernel/boot_params/cmd_line/initrd above 4G, but only through the legacy interface that enters startup_64 directly. This patch allows a 64bit EFI kernel to be loaded above 4G and to use the EFI HANDOVER PROTOCOL to start the kernel. The current 32bit code32_start is used for passing around the load address, so it will overflow when the kernel is loaded above 4G. The patch mainly adds ext_code32_start to hold the high 32 bits of the load address. After this patch, a patched grub2-x86_64.efi can place kernel/boot_params/cmd_line/initrd all above 4G and execute the kernel above 4G. The boot log looks like: kernel: done [ linux 9.25MiB 100% 6.66MiB/s ] params: [1618fc000,1618f] cmdline: [1618fb000,1618fb7fe] kernel: [15e00,161385fff] initrd: [15bcbe000,15dbb] initrd: 1 file done [ initrd.img 35.26MiB 100% 11.93MiB/s ] early console in decompress_kernel decompress_kernel: input: [0x15fd0b3b4-0x16063c803], output: 0x15e00, heap: [0x160645b00-0x16064daff] Decompressing Linux... xz... Parsing ELF... done. Booting the kernel. [0.00] bootconsole [uart0] enabled [0.00]real_mode_data : phys 0001618fc000 [0.00]real_mode_data : virt 8801618fc000 [0.00] Kernel Layout: [0.00] .text: [0x15e00-0x15f08f72c] [0.00] .rodata: [0x15f20-0x15fa44fff] [0.00] .data: [0x15fc0-0x15fe545ff] [0.00] .init: [0x15fe56000-0x16021afff] [0.00].bss: [0x160229000-0x16135] [0.00].brk: [0x16136-0x161385fff] [0.00] memblock_reserve: [0x09f000-0x0f] flags 0x0 * BIOS reserved ... [0.00] memblock_reserve: [0x015e00-0x016135] flags 0x0 TEXT DATA BSS [0.00] memblock_reserve: [0x015bcbe000-0x015dff] flags 0x0 RAMDISK -v2: add cast to avoid warning with 32bit, also update description for ext_code32_start in boot.txt -v3: change to 4.0 from 3.20. 
Signed-off-by: Yinghai Lu --- Documentation/x86/boot.txt| 19 +++ arch/x86/boot/compressed/eboot.c | 15 ++- arch/x86/boot/compressed/head_64.S| 7 ++- arch/x86/boot/header.S| 3 ++- arch/x86/include/uapi/asm/bootparam.h | 1 + arch/x86/kernel/asm-offsets.c | 1 + 6 files changed, 39 insertions(+), 7 deletions(-) diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 9da6f35..90efaa2 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -61,6 +61,9 @@ Protocol 2.12:(Kernel 3.8) Added the xloadflags field and extension fields to struct boot_params for loading bzImage and ramdisk above 4G in 64bit. +Protocol 2.14: (Kernel 4.0) Added the ext_code32_start to support 64bit + EFI kernel to be loaded above 4G. + MEMORY LAYOUT The traditional memory map for the kernel loader, used for Image or @@ -197,6 +200,7 @@ Offset Proto NameMeaning 0258/8 2.10+ pref_addressPreferred loading address 0260/4 2.10+ init_size Linear memory required during initialization 0264/4 2.11+ handover_offset Offset of handover entry point +0268/4 2.14+ ext_code32_startExtended part for code32_start (1) For backwards compatibility, if the setup_sects field contains 0, the real value is 4. @@ -744,6 +748,14 @@ Offset/size: 0x264/4 See EFI HANDOVER PROTOCOL below for more details. +Field name:ext_code32_start +Type: modify (optional, reloc) +Offset/size: 0x268/4 +Protocol: 2.14+ + + This field is the upper 32bits of load address when EFI 64bit kernel + is loaded above 4G. And it is used with code32_start to compare to + pref_address to decide if kernel need to be relocated further. 
THE IMAGE CHECKSUM @@ -1127,4 +1139,11 @@ The boot loader *must* fill out the following fields in bp, o hdr.ramdisk_image (if applicable) o hdr.ramdisk_size (if applicable) +for 64bit, when loading above 4G, *must* fill out the following fields, + +o hdr.ext_code32_start +o ext_cmd_line_ptr +o ext_ramdisk_image (if applicable) +o ext_ramdisk_size (if applicable) + All other fields should be zero. diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 2c82bd1..05d77a5 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1394,6 +1394,7 @@ struct boot_params *efi_main(struct efi_config *c, void *handle; efi_system_table_t *_table; bool is64; + unsigned long loaded_addr; efi_early = c; @@ -1435,9 +1436,12 @@ struct boot_params *efi_main(struct efi_config *c, * If the kernel isn't already loaded at the preferred load * address, relocate it. */ - if (hdr->pref_address != hdr->code32_start) { - unsigned long bzimage
[PATCH 17/42] x86, kaslr: Add support of kernel physical address randomization above 4G
From: Baoquan He In kaslr implementation mechanism, mainly process_e820_entry and slots_fetch_random do the job. process_e820_entry is responsible for storing the slot information. slots_fetch_random takes care of fetching slot information. In this patch, for adding support of kernel physical address randomization above 4G, both of these two functions are changed based on the new slot_area data structure. Now kernel can be reloaded and decompressed anywhere of the whole physical memory, even near 64T at most. Signed-off-by: Baoquan He --- arch/x86/boot/compressed/aslr.c | 68 ++--- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 554b637..9158882 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -293,27 +293,40 @@ static void slots_append(unsigned long addr) static unsigned long slots_fetch_random(void) { + unsigned long random; + int i; + /* Handle case of no slots stored. */ if (slot_max == 0) return 0; - return slots[get_random_long() % slot_max]; + random = get_random_long() % slot_max; + + for (i = 0; i < slot_area_index; i++) { + if (random >= slot_areas[i].num) { + random -= slot_areas[i].num; + continue; + } + return slot_areas[i].addr + random * CONFIG_PHYSICAL_ALIGN; + } + + if (i == slot_area_index) + debug_putstr("Something wrong happened in slots_fetch_random()...\n"); + return 0; } static void process_e820_entry(struct e820entry *entry, unsigned long minimum, unsigned long image_size) { - struct mem_vector region, img; + struct mem_vector region, out; + struct slot_area slot_area; + unsigned long min, start_orig; /* Skip non-RAM entries. */ if (entry->type != E820_RAM) return; - /* Ignore entries entirely above our maximum. */ - if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - return; - /* Ignore entries entirely below our minimum. 
*/ if (entry->addr + entry->size < minimum) return; @@ -321,10 +334,17 @@ static void process_e820_entry(struct e820entry *entry, region.start = entry->addr; region.size = entry->size; +repeat: + start_orig = region.start; + /* Potentially raise address to minimum location. */ if (region.start < minimum) region.start = minimum; + /* Return if slot area array is full */ + if (slot_area_index == MAX_SLOT_AREA) + return; + /* Potentially raise address to meet alignment requirements. */ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); @@ -333,20 +353,30 @@ static void process_e820_entry(struct e820entry *entry, return; /* Reduce size by any delta from the original address. */ - region.size -= region.start - entry->addr; + region.size -= region.start - start_orig; - /* Reduce maximum size to fit end of image within maximum limit. */ - if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start; + /* Return if region can't contain decompressed kernel */ + if (region.size < image_size) + return; - /* Walk each aligned slot and check for avoided areas. */ - for (img.start = region.start, img.size = image_size ; -mem_contains(&region, &img) ; -img.start += CONFIG_PHYSICAL_ALIGN) { - if (mem_avoid_overlap(&img)) - continue; - slots_append(img.start); + if (!mem_avoid_overlap(&region)) { + store_slot_info(&region, image_size); + return; } + + min = mem_min_overlap(&region, &out); + + if (min > region.start + image_size) { + struct mem_vector tmp; + + tmp.start = region.start; + tmp.size = min - region.start; + store_slot_info(&tmp, image_size); + } + + region.size -= out.start - region.start + out.size; + region.start = out.start + out.size; + goto repeat; } static unsigned long find_random_phy_addr(unsigned long minimum, @@ -361,6 +391,10 @@ static unsigned long find_random_phy_addr(unsigned long minimum, /* Verify potential e820 positions, appending to slots list. 
*/ for (i = 0; i < real_mode->e820_entries; i++) { process_e820_entry(&real_mode->e820_map[i], minimum, size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Stop processing e820 since slot_areas is full...\n"); +
[PATCH 36/42] x86, boot, PCI: Copy SETUP_PCI rom to kernel space
As EFI stub code could put them high when on 32bit or with exactmap= on 64bit conf. Check if the range is mapped, otherwise allocate new one and have the rom data copied. So we could access them directly. Signed-off-by: Yinghai Lu --- arch/x86/pci/common.c | 47 +-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 32d4f21..4d6b128 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -668,6 +668,48 @@ struct firmware_setup_pci_entry { static LIST_HEAD(setup_pci_entries); +static phys_addr_t check_copy(phys_addr_t start, unsigned long size) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(start + size); + unsigned char *p, *q; + phys_addr_t pa_p, pa_q; + long sz = size; + + if (pfn_range_is_mapped(start_pfn, end_pfn)) + return start; + + /* allocate and copy */ + pa_p = memblock_alloc(size, PAGE_SIZE); + if (!pa_p) + return start; + + p = phys_to_virt(pa_p); + + pa_q = start; + while (sz > 0) { + long chunk_size = 64<<10; + + if (chunk_size > sz) + chunk_size = sz; + + q = early_memremap(pa_q, chunk_size); + if (!q) { + memblock_free(pa_p, size); + return start; + } + memcpy(p, q, chunk_size); + early_memunmap(q, chunk_size); + p += chunk_size; + pa_q += chunk_size; + sz -= chunk_size; + } + + memblock_free(start, size); + + return pa_p; +} + int __init fill_setup_pci_entries(void) { struct setup_data *data; @@ -697,8 +739,9 @@ int __init fill_setup_pci_entries(void) entry->vendor = rom->vendor; entry->devid = rom->devid; entry->pcilen = rom->pcilen; - entry->romdata = pa_data + -offsetof(struct pci_setup_rom, romdata); + entry->romdata = check_copy(pa_data + + offsetof(struct pci_setup_rom, romdata), + rom->pcilen); list_add(&entry->list, &setup_pci_entries); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 34/42] x86: Kill not used setup_data handling code
Cc: Matt Fleming Signed-off-by: Yinghai Lu --- arch/x86/kernel/kdebugfs.c | 142 - arch/x86/kernel/setup.c| 17 -- 2 files changed, 159 deletions(-) diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index dc1404b..c8ca86c 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -21,142 +21,6 @@ struct dentry *arch_debugfs_dir; EXPORT_SYMBOL(arch_debugfs_dir); #ifdef CONFIG_DEBUG_BOOT_PARAMS -struct setup_data_node { - u64 paddr; - u32 type; - u32 len; -}; - -static ssize_t setup_data_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct setup_data_node *node = file->private_data; - unsigned long remain; - loff_t pos = *ppos; - struct page *pg; - void *p; - u64 pa; - - if (pos < 0) - return -EINVAL; - - if (pos >= node->len) - return 0; - - if (count > node->len - pos) - count = node->len - pos; - - pa = node->paddr + sizeof(struct setup_data) + pos; - pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - p = ioremap_cache(pa, count); - if (!p) - return -ENXIO; - } else - p = __va(pa); - - remain = copy_to_user(user_buf, p, count); - - if (PageHighMem(pg)) - iounmap(p); - - if (remain) - return -EFAULT; - - *ppos = pos + count; - - return count; -} - -static const struct file_operations fops_setup_data = { - .read = setup_data_read, - .open = simple_open, - .llseek = default_llseek, -}; - -static int __init -create_setup_data_node(struct dentry *parent, int no, - struct setup_data_node *node) -{ - struct dentry *d, *type, *data; - char buf[16]; - - sprintf(buf, "%d", no); - d = debugfs_create_dir(buf, parent); - if (!d) - return -ENOMEM; - - type = debugfs_create_x32("type", S_IRUGO, d, &node->type); - if (!type) - goto err_dir; - - data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); - if (!data) - goto err_type; - - return 0; - -err_type: - debugfs_remove(type); -err_dir: - debugfs_remove(d); - return -ENOMEM; -} - -static int __init 
create_setup_data_nodes(struct dentry *parent) -{ - struct setup_data_node *node; - struct setup_data *data; - int error; - struct dentry *d; - struct page *pg; - u64 pa_data; - int no = 0; - - d = debugfs_create_dir("setup_data", parent); - if (!d) - return -ENOMEM; - - pa_data = boot_params.hdr.setup_data; - - while (pa_data) { - node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) { - error = -ENOMEM; - goto err_dir; - } - - pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - data = ioremap_cache(pa_data, sizeof(*data)); - if (!data) { - kfree(node); - error = -ENXIO; - goto err_dir; - } - } else - data = __va(pa_data); - - node->paddr = pa_data; - node->type = data->type; - node->len = data->len; - error = create_setup_data_node(d, no, node); - pa_data = data->next; - - if (PageHighMem(pg)) - iounmap(data); - if (error) - goto err_dir; - no++; - } - - return 0; - -err_dir: - debugfs_remove(d); - return error; -} - static struct debugfs_blob_wrapper boot_params_blob = { .data = &boot_params, .size = sizeof(boot_params), @@ -181,14 +45,8 @@ static int __init boot_params_kdebugfs_init(void) if (!data) goto err_version; - error = create_setup_data_nodes(dbp); - if (error) - goto err_data; - return 0; -err_data: - debugfs_remove(data); err_version: debugfs_remove(version); err_dir: diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index de0f830..35d9ff5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -465,20 +465,6 @@ static void __init parse_setup_data(void) boot_params.hdr.setup_data = 0; /* all done */ } -static void __init memblock_x86_reserve_range_setup_data(void) -{ -
[PATCH 38/42] x86: Fix typo in mark_rodata_ro
The comment should refer to cleanup_highmap(), not cleanup_highmem(). Also remove the unneeded cast of _brk_end, as it is already an unsigned long. Signed-off-by: Yinghai Lu --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 257ba4b..3b7453a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1054,9 +1054,9 @@ void mark_rodata_ro(void) * of the PMD will remain mapped executable. * * Any PMD which was setup after the one which covers _brk_end -* has been zapped already via cleanup_highmem(). +* has been zapped already via cleanup_highmap(). */ - all_end = roundup((unsigned long)_brk_end, PMD_SIZE); + all_end = roundup(_brk_end, PMD_SIZE); set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 30/42] x86: Kill E820_RESERVED_KERN
Now we are using memblock to do early resource reservation/allocation instead of using the e820 map directly, and setup_data is already reserved in memblock early. Also, kexec generates setup_data and passes a pointer to the second kernel, so the second kernel reserves setup_data on its own. (Now kexec-tools creates SETUP_EFI and SETUP_E820_EXT.) We can kill E820_RESERVED_KERN and not touch the e820 map at all. That will fix a bug in e820_mark_nosave_regions(), which cannot handle this case: E820_RAM and E820_RESERVED_KERN ranges are contiguous and the boundary is not page aligned. Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=913885 Reported-by: "Lee, Chun-Yi" Tested-by: "Lee, Chun-Yi" Cc: "Lee, Chun-Yi" Signed-off-by: Yinghai Lu Cc: sta...@vger.kernel.org --- arch/x86/include/uapi/asm/e820.h | 8 arch/x86/kernel/e820.c | 6 ++ arch/x86/kernel/setup.c | 25 - arch/x86/kernel/tboot.c | 3 +-- arch/x86/mm/init_64.c| 11 --- 5 files changed, 7 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 0f457e6..a9216a1 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -45,14 +45,6 @@ */ #define E820_PRAM 12 -/* - * reserved RAM used by kernel itself - * if CONFIG_INTEL_TXT is enabled, memory of this type will be - * included in the S3 integrity calculation and so should not include - * any memory that BIOS might alter over the S3 transition - */ -#define E820_RESERVED_KERN128 - #ifndef __ASSEMBLY__ #include struct e820entry { diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 46ec08d..49d8c50 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -134,7 +134,6 @@ static void __init e820_print_type(u32 type) { switch (type) { case E820_RAM: - case E820_RESERVED_KERN: printk(KERN_CONT "usable"); break; case E820_RESERVED: @@ -693,7 +692,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) pfn = PFN_DOWN(ei->addr + ei->size); - if (ei->type != E820_RAM && ei->type != 
E820_RESERVED_KERN) + if (ei->type != E820_RAM) register_nosave_region(PFN_UP(ei->addr), pfn); if (pfn >= limit_pfn) @@ -910,7 +909,6 @@ void __init finish_e820_parsing(void) static inline const char *e820_type_to_string(int e820_type) { switch (e820_type) { - case E820_RESERVED_KERN: case E820_RAM: return "System RAM"; case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; @@ -1107,7 +1105,7 @@ void __init memblock_x86_fill(void) if (end != (resource_size_t)end) continue; - if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + if (ei->type != E820_RAM) continue; memblock_add(ei->addr, ei->size); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2d808e6..a3b65f1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -457,29 +457,6 @@ static void __init parse_setup_data(void) } } -static void __init e820_reserve_setup_data(void) -{ - struct setup_data *data; - u64 pa_data; - - pa_data = boot_params.hdr.setup_data; - if (!pa_data) - return; - - while (pa_data) { - data = early_memremap(pa_data, sizeof(*data)); - e820_update_range(pa_data, sizeof(*data)+data->len, -E820_RAM, E820_RESERVED_KERN); - pa_data = data->next; - early_memunmap(data, sizeof(*data)); - } - - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - memcpy(&e820_saved, &e820, sizeof(struct e820map)); - printk(KERN_INFO "extended physical RAM map:\n"); - e820_print_map("reserve setup_data"); -} - static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; @@ -1018,8 +995,6 @@ void __init setup_arch(char **cmdline_p) early_dump_pci_devices(); #endif - /* update the e820_saved too */ - e820_reserve_setup_data(); finish_e820_parsing(); if (efi_enabled(EFI_BOOT)) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 91a4496..3c2752a 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -195,8 +195,7 @@ static int tboot_setup_sleep(void) 
tboot->num_mac_regions = 0; for (i = 0; i < e820.nr_map; i++) { - if ((e820.map[i].type != E820_RAM) -&& (e820.map[i].type != E820_RESERVED
[PATCH 41/42] x86, 64bit: Add __pa_high/__va_high
and use it to make the early page table setup code more readable, as we are using kernel high mapping address. Signed-off-by: Yinghai Lu --- arch/x86/kernel/head64.c | 15 +-- 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index a9f0299..cd0a820 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -37,6 +37,9 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt = 2; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); +#define __va_high(x) ((void *)((unsigned long)(x) + __START_KERNEL_map - phys_base)) +#define __pa_high(x) ((unsigned long)(x) - __START_KERNEL_map + phys_base) + /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { @@ -47,7 +50,7 @@ static void __init reset_early_page_tables(void) next_early_pgt = 0; - write_cr3(__pa_nodebug(early_level4_pgt)); + write_cr3(__pa_high(early_level4_pgt)); } /* Create a new PMD entry */ @@ -60,7 +63,7 @@ int __init early_make_pgtable(unsigned long address) pmdval_t pmd, *pmd_p; /* Invalid address or early pgt is done ? */ - if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) + if (physaddr >= MAXMEM || read_cr3() != __pa_high(early_level4_pgt)) return -1; again: @@ -73,7 +76,7 @@ again: * range and we might end up looping forever... 
*/ if (pgd) - pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + pud_p = (pudval_t *)__va_high(pgd & PTE_PFN_MASK); else { if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { reset_early_page_tables(); @@ -83,13 +86,13 @@ again: pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; for (i = 0; i < PTRS_PER_PUD; i++) pud_p[i] = 0; - *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + *pgd_p = __pa_high(pud_p) + _KERNPG_TABLE; } pud_p += pud_index(address); pud = *pud_p; if (pud) - pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + pmd_p = (pmdval_t *)__va_high(pud & PTE_PFN_MASK); else { if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { reset_early_page_tables(); @@ -99,7 +102,7 @@ again: pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; for (i = 0; i < PTRS_PER_PMD; i++) pmd_p[i] = 0; - *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + *pud_p = __pa_high(pmd_p) + _KERNPG_TABLE; } pmd = (physaddr & PMD_MASK) + early_pmd_flags; pmd_p[pmd_index(address)] = pmd; -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 24/42] x86: Setup early console as early as possible in x86_start_kernel()
Analyze "console=uart8250,io,0x3f8,115200n8" in i386_start_kernel/x86_64_start_kernel, and call setup_early_serial8250_console() to init early serial console. Only can handle io port kind of 8250, because mmio need ioremap. Use boot_params.hdr.version instead of adding another variable, Suggested by hpa. Also need to apply this one after x86 memblock patchset. Signed-off-by: Yinghai Lu --- arch/x86/include/asm/setup.h | 2 ++ arch/x86/kernel/head.c | 26 ++ arch/x86/kernel/head32.c | 1 + arch/x86/kernel/head64.c | 5 - drivers/tty/serial/8250/8250_early.c | 17 + kernel/printk/printk.c | 11 +++ 6 files changed, 57 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 11af24e..3e5aa41 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -40,6 +40,8 @@ static inline void vsmp_init(void) { } void setup_bios_corruption_check(void); extern unsigned long saved_video_mode; +int setup_early_serial8250_console(char *cmdline); +void setup_early_console(void); extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 992f442..cc0cd83 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -69,3 +69,29 @@ void __init reserve_ebda_region(void) /* reserve all memory between lowmem and the 1MB mark */ memblock_reserve(lowmem, 0x10 - lowmem); } + +void __init setup_early_console(void) +{ +#ifdef CONFIG_SERIAL_8250_CONSOLE + char constr[64], *p, *q; + + /* Can not handle mmio type 8250 uart yet, too early */ + p = strstr(boot_command_line, "console=uart8250,io,"); + if (!p) + p = strstr(boot_command_line, "console=uart,io,"); + if (!p) + return; + + p += 8; /* sizeof "console=" */ + q = strchrnul(p, ' '); + if ((q - p) >= sizeof(constr)) + return; + + memset(constr, 0, sizeof(constr)); + memcpy(constr, p, q - p); + + lockdep_init(); + + setup_early_serial8250_console(constr); +#endif +} diff 
--git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2911ef3..87ddca1 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -33,6 +33,7 @@ asmlinkage __visible void __init i386_start_kernel(void) { cr4_init_shadow(); sanitize_boot_params(&boot_params); + setup_early_console(); /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5a46681..44dc63b 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -171,6 +171,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) load_idt((const struct desc_ptr *)&idt_descr); copy_bootdata(__va(real_mode_data)); + setup_early_console(); /* * Load microcode early on BSP. @@ -189,8 +190,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) void __init x86_64_start_reservations(char *real_mode_data) { /* version is always not zero if it is copied */ - if (!boot_params.hdr.version) + if (!boot_params.hdr.version) { copy_bootdata(__va(real_mode_data)); + setup_early_console(); + } reserve_ebda_region(); diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c index 771dda2..8a7fe75 100644 --- a/drivers/tty/serial/8250/8250_early.c +++ b/drivers/tty/serial/8250/8250_early.c @@ -152,3 +152,20 @@ int __init early_serial8250_setup(struct earlycon_device *device, } EARLYCON_DECLARE(uart8250, early_serial8250_setup); EARLYCON_DECLARE(uart, early_serial8250_setup); + +/* for x86 early early console */ +int __init setup_early_serial8250_console(char *cmdline) +{ + char *options; + + options = strstr(cmdline, "uart8250,"); + if (options) + return setup_earlycon(options); + + options = strstr(cmdline, "uart,"); + if (options) + return setup_earlycon(options); + + return 0; +} + diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index cf8c242..f554c5f 100644 --- a/kernel/printk/printk.c +++ 
b/kernel/printk/printk.c @@ -2454,11 +2454,14 @@ void register_console(struct console *newcon) struct console_cmdline *c; if (console_drivers) - for_each_console(bcon) - if (WARN(bcon == newcon, - "console '%s%d' already registered\n", - bcon->name, bcon->index))
[PATCH 16/42] x86, kaslr: Randomize physical and virtual address of kernel separately
From: Baoquan He On x86_64, in the old kaslr implementation only the physical address where the kernel is loaded is randomized. The delta between the physical address where vmlinux was linked to load and where it is finally loaded is then calculated. If the delta is not equal to 0, namely there's a new physical address where the kernel is actually decompressed, relocation handling needs to be done. The delta is then added to the offset of each kernel symbol relocation, which moves the address of the kernel text mapping by delta. Here the behavior is changed. Randomize both the physical address where the kernel is decompressed and the virtual address where the kernel text is mapped. Relocation handling now depends only on virtual address randomization: if and only if the virtual address is randomized to a different value, we add the delta to the offset of kernel relocs. Note that up to now neither the virtual offset nor the physical address randomization can exceed CONFIG_RANDOMIZE_BASE_MAX_OFFSET, namely 1G. Signed-off-by: Baoquan He --- arch/x86/boot/compressed/aslr.c | 46 + arch/x86/boot/compressed/misc.c | 39 -- arch/x86/boot/compressed/misc.h | 19 + 3 files changed, 58 insertions(+), 46 deletions(-) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 775c6f9..554b637 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -349,7 +349,7 @@ static void process_e820_entry(struct e820entry *entry, } } -static unsigned long find_random_addr(unsigned long minimum, +static unsigned long find_random_phy_addr(unsigned long minimum, unsigned long size) { int i; @@ -387,23 +387,24 @@ static unsigned long find_random_virt_offset(unsigned long minimum, return random * CONFIG_PHYSICAL_ALIGN + minimum; } -unsigned char *choose_kernel_location(unsigned char *input, - unsigned long input_size, - unsigned char *output, - unsigned long output_run_size) +void choose_kernel_location(unsigned char *input, + unsigned long input_size, + unsigned char **output, + unsigned long output_run_size, 
+ unsigned char **virt_offset) { - unsigned long choice = (unsigned long)output; unsigned long random; + *virt_offset = (unsigned char *)LOAD_PHYSICAL_ADDR; #ifdef CONFIG_HIBERNATION if (!cmdline_find_option_bool("kaslr")) { debug_putstr("KASLR disabled by default...\n"); - goto out; + return; } #else if (cmdline_find_option_bool("nokaslr")) { debug_putstr("KASLR disabled by cmdline...\n"); - goto out; + return; } #endif @@ -411,23 +412,24 @@ unsigned char *choose_kernel_location(unsigned char *input, /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, - (unsigned long)output); + (unsigned long)*output); /* Walk e820 and find a random address. */ - random = find_random_addr(choice, output_run_size); - if (!random) { + random = find_random_phy_addr((unsigned long)*output, output_run_size); + if (!random) debug_putstr("KASLR could not find suitable E820 region...\n"); - goto out; + else { + if ((unsigned long)*output != random) { + fill_pagetable(random, output_run_size); + switch_pagetable(); + *output = (unsigned char *)random; + } } - /* Always enforce the minimum. 
*/ - if (random < choice) - goto out; - - choice = random; - - fill_pagetable(choice, output_run_size); - switch_pagetable(); -out: - return (unsigned char *)choice; + /* +* Get a random address between LOAD_PHYSICAL_ADDR and +* CONFIG_RANDOMIZE_BASE_MAX_OFFSET +*/ + random = find_random_virt_offset(LOAD_PHYSICAL_ADDR, output_run_size); + *virt_offset = (unsigned char *)random; } diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index bfa4f0a..6b2a308 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -226,7 +226,8 @@ void error(char *x) } #if CONFIG_X86_NEED_RELOCS -static void handle_relocations(void *output, unsigned long output_len) +static void handle_relocations(void *output, unsigned long output_len, + void *virt_offset) { int *reloc; unsigned long delta, map, ptr; @@ -238,11 +239,6 @@ static void handle_relocations(void *output, unsigned long output_len)
[PATCH 07/42] x86, boot: Move z_extract_offset calculation to header.S
Old extract_offset calculation is done without knowledge of decompressor size. so it guess one big size. We can move it to header.S, where we have exact decompressor size. We save 8 pages for init_size with this patch. before patch: kernel: [13e00,13fa1dfff] input: [0x13f32d3b4-0x13fa01cc7], output: [0x13e00-0x13f9ef81f], heap: [0x13fa0b680-0x13fa1367f] after patch: kernel: [13e00,13fa15fff] input: [0x13f3253b4-0x13f9f9cc7], output: [0x13e00-0x13f9ef81f], heap: [0x13fa03680-0x13fa0b67f] Signed-off-by: Yinghai Lu --- arch/x86/boot/Makefile | 2 +- arch/x86/boot/compressed/misc.c| 5 + arch/x86/boot/compressed/mkpiggy.c | 16 +--- arch/x86/boot/header.S | 29 + 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 4d27e8b..e7196cf 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -77,7 +77,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 1c03098..db97bdf 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -84,13 +84,10 @@ * To avoid problems with the compressed data's meta information an extra 18 * bytes are needed. Leading to the formula: * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. + * extra_bytes = (uncompressed_size >> 12) + 32768 + 18. * * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. 
* Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actual data and 4K of bss. * */ diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index c03b009..c5148642 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -21,8 +21,7 @@ * --- */ /* - * Compute the desired load offset from a compressed program; outputs - * a small assembly wrapper with the appropriate symbols defined. + * outputs a small assembly wrapper with the appropriate symbols defined. */ #include @@ -35,7 +34,6 @@ int main(int argc, char *argv[]) { uint32_t olen; long ilen; - unsigned long offs; FILE *f = NULL; int retval = 1; @@ -65,23 +63,11 @@ int main(int argc, char *argv[]) ilen = ftell(f); olen = get_unaligned_le32(&olen); - /* -* Now we have the input (compressed) and output (uncompressed) -* sizes, compute the necessary decompression offset... -*/ - - offs = (olen > ilen) ? 
olen - ilen : 0; - offs += olen >> 12; /* Add 8 bytes for each 32K block */ - offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ - offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ - printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); printf("z_input_len = %lu\n", ilen); printf(".globl z_output_len\n"); printf("z_output_len = %lu\n", (unsigned long)olen); - printf(".globl z_min_extract_offset\n"); - printf("z_min_extract_offset = 0x%lx\n", offs); printf(".globl input_data, input_data_end\n"); printf("input_data:\n"); diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 9bfab22..99204e5 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -440,7 +440,36 @@ setup_data:.quad 0 # 64-bit physical pointer to pref_address: .quad LOAD_PHYSICAL_ADDR# preferred load addr +/* check arch/x86/boot/compressed/misc.c for the formula about extra_bytes. */ +#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 32768 + 18) +#if ZO_z_output_len > ZO_z_input_len +#define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - ZO_z_input_len) +#else +#define ZO_z_extract_offset ZO_z_extra_bytes +#endif + +/* + * extract_offset has to be bigger than ZO head section. + * otherwise during head code
[PATCH 04/42] x86, kaslr: Kill not needed and wrong run_size calculation code.
We use simple and correct version to get run_size now, remove code for wrong run_size calculation. Fixes: e6023367d779 ("x86, kaslr: Prevent .bss from overlaping initrd") Cc: "H. Peter Anvin" Cc: Josh Triplett Cc: Matt Fleming Cc: Kees Cook Cc: Andrew Morton Cc: Ard Biesheuvel Cc: Junjie Mao Signed-off-by: Yinghai Lu --- arch/x86/boot/compressed/Makefile | 4 +--- arch/x86/boot/compressed/head_32.S | 3 +-- arch/x86/boot/compressed/head_64.S | 3 --- arch/x86/boot/compressed/misc.c| 6 ++ arch/x86/boot/compressed/mkpiggy.c | 9 ++-- arch/x86/tools/calc_run_size.sh| 42 -- 6 files changed, 6 insertions(+), 61 deletions(-) delete mode 100644 arch/x86/tools/calc_run_size.sh diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index d9fee82..50daea7 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -104,10 +104,8 @@ suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO):= lzo suffix-$(CONFIG_KERNEL_LZ4):= lz4 -RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \ -$(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh) quiet_cmd_mkpiggy = MKPIGGY $@ - cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false ) + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) targets += piggy.S $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 0c140f9..122b32f 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -210,7 +210,6 @@ relocated: * Do the decompression, and jump to the new kernel.. 
*/ /* push arguments for decompress_kernel: */ - pushl $z_run_size /* size of kernel with .bss and .brk */ pushl $z_output_len /* decompressed length, end of relocs */ movlBP_init_size(%esi), %eax @@ -226,7 +225,7 @@ relocated: pushl %eax/* heap area */ pushl %esi/* real mode pointer */ calldecompress_kernel /* returns kernel location in %eax */ - addl$28, %esp + addl$24, %esp /* * Jump to the decompressed kernel. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 67dd8d3..3691451 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -407,8 +407,6 @@ relocated: * Do the decompression, and jump to the new kernel.. */ pushq %rsi/* Save the real mode argument */ - movq$z_run_size, %r9/* size of kernel with .bss and .brk */ - pushq %r9 movq%rsi, %rdi /* real mode address */ leaqboot_heap(%rip), %rsi /* malloc area for uncompression */ leaqinput_data(%rip), %rdx /* input_data */ @@ -416,7 +414,6 @@ relocated: movq%rbp, %r8 /* output target address */ movq$z_output_len, %r9 /* decompressed length, end of relocs */ calldecompress_kernel /* returns kernel location in %rax */ - popq%r9 popq%rsi /* diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index a88b591..96201aa 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -371,9 +371,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, - unsigned long output_len, - unsigned long run_size) + unsigned long output_len) { + unsigned long run_size = VO__end - VO__text; unsigned char *output_orig = output; real_mode = rmode; @@ -394,8 +394,6 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, lines = real_mode->screen_info.orig_video_lines; cols = real_mode->screen_info.orig_video_cols; - run_size = VO__end - VO__text; - console_init(); debug_putstr("early console in 
decompress_kernel\n"); diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index 5faad09..c03b009 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -36,13 +36,11 @@ int main(int argc, char *argv[]) uint32_t olen; long ilen; unsigned long offs; - unsigned long run_size; FILE *f = NULL; int retval = 1; - if (argc < 3) { - fprintf(stderr, "Usage: %s compressed_file run_size\n", - argv[0]); +
[PATCH 06/42] x86, kaslr: Consolidate mem_avoid array filling
We are going to support kaslr with 64bit above 4G, and new random output buffer could be anywhere. mem_avoid array is used for kaslr to search new output buffer. Current code only track range that is after output+output_run_size. We need to track all range instead of just after output+output_run_size. Current code has first entry is extra bytes after input+input_size, and it is according to output_run_size. Other entries are for initrd, cmdline, and heap/stack for ZO running. At first, check the first entry that should be in the mem_avoid array. Now ZO sit end of the buffer always, we can find out where is ZO text and data/bss etc. output+run_size | 0 output input input+input_size | output+init_size | || | | | |-|-|--|---|--|---|--| | | output+init_size-ZO_SIZE output+output_size [output, output+init_size) is the buffer for decompress. [output, output+run_size) is for VO run size. [output, output+output_size) is (VO (vmlinux after objcopy) plus relocs) [output+init_size-ZO_SIZE, output+init_size) is copied ZO. [input, input+input_size) is copied compressed (VO (vmlinux after objcopy) plus relocs), not the ZO. [input+input_size, output+init_size) is [_text, _end) for ZO. that could be first range in mem_avoid. That new first entry already include heap and stack for ZO running. So we don't need to put them separatedly into mem_avoid array. Also we need to put [input, input+input_size) in mem_avoid array, ant it is connected to first one, so merge them. At last we need to put boot_params into the mem_avoid too. As with 64bit bootloader could put it anywhere. After those changes, we have all range needed to be avoided in mem_avoid array. 
Cc: Kees Cook Signed-off-by: Yinghai Lu --- arch/x86/boot/compressed/aslr.c | 29 + 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 0e1dac0..d753fb3 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -109,7 +109,7 @@ struct mem_vector { unsigned long size; }; -#define MEM_AVOID_MAX 5 +#define MEM_AVOID_MAX 4 static struct mem_vector mem_avoid[MEM_AVOID_MAX]; static bool mem_contains(struct mem_vector *region, struct mem_vector *item) @@ -135,21 +135,22 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) } static void mem_avoid_init(unsigned long input, unsigned long input_size, - unsigned long output, unsigned long output_run_size) + unsigned long output) { + unsigned long init_size = real_mode->hdr.init_size; u64 initrd_start, initrd_size; u64 cmd_line, cmd_line_size; - unsigned long unsafe, unsafe_len; char *ptr; /* * Avoid the region that is unsafe to overlap during -* decompression (see calculations at top of misc.c). +* decompression. +* As we already move ZO (arch/x86/boot/compressed/vmlinux) +* to the end of buffer, [input+input_size, output+init_size) +* has [_text, _end) for ZO. */ - unsafe_len = (output_run_size >> 12) + 32768 + 18; - unsafe = (unsigned long)input + input_size - unsafe_len; - mem_avoid[0].start = unsafe; - mem_avoid[0].size = unsafe_len; + mem_avoid[0].start = input; + mem_avoid[0].size = (output + init_size) - input; /* Avoid initrd. */ initrd_start = (u64)real_mode->ext_ramdisk_image << 32; @@ -169,13 +170,9 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, mem_avoid[2].start = cmd_line; mem_avoid[2].size = cmd_line_size; - /* Avoid heap memory. */ - mem_avoid[3].start = (unsigned long)free_mem_ptr; - mem_avoid[3].size = BOOT_HEAP_SIZE; - - /* Avoid stack memory. 
*/ - mem_avoid[4].start = (unsigned long)free_mem_end_ptr; - mem_avoid[4].size = BOOT_STACK_SIZE; + /* Avoid params */ + mem_avoid[3].start = (unsigned long)real_mode; + mem_avoid[3].size = sizeof(*real_mode); } /* Does this memory vector overlap a known avoided area? */ @@ -319,7 +316,7 @@ unsigned char *choose_kernel_location(unsigned char *input, /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, - (unsigned long)output, output_run_size); + (unsigned long)output); /* Walk e820 and find a random address. */ random = find_random_addr(choice, output_run_size); -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsu
[PATCH 09/42] x86, boot: Split kernel_ident_mapping_init to another file
We need to include this code in the boot::decompress_kernel stage to set up the new identity mapping. Also add checks for existing __pa/__va macro definitions, as we need to override them in the boot::decompress_kernel stage. Reviewed-by: Kees Cook Signed-off-by: Yinghai Lu --- arch/x86/include/asm/page.h | 5 +++ arch/x86/mm/ident_map.c | 74 + arch/x86/mm/init_64.c | 74 + 3 files changed, 80 insertions(+), 73 deletions(-) create mode 100644 arch/x86/mm/ident_map.c diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 802dde3..cf8f619 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -37,7 +37,10 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#ifndef __pa #define __pa(x)__phys_addr((unsigned long)(x)) +#endif + #define __pa_nodebug(x)__phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. 
*/ @@ -51,7 +54,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) +#ifndef __va #define __va(x)((void *)((unsigned long)(x)+PAGE_OFFSET)) +#endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c new file mode 100644 index 000..751ca92 --- /dev/null +++ b/arch/x86/mm/ident_map.c @@ -0,0 +1,74 @@ + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? 
pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3fba623..6f457a4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -56,79 +56,7 @@ #include "mm_internal.h" -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, - unsigned long addr, unsigned long end) -{ - addr &= PMD_MASK; - for (; addr < end; addr += PMD_SIZE) { - pmd_t *pmd = pmd_page + pmd_index(addr); - - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | pmd_flag)); - } -} -static int ident_pud_init(struct x86_mapping_info *in
[PATCH 42/42] x86: fix msr print again
The early MSR printout got broken again; fix it. Signed-off-by: Yinghai Lu --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/cpu/common.c | 61 +--- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 43e6519..3a7bd35 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -177,7 +177,6 @@ extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); -void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 922c5e0..3c87e75 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1016,27 +1016,6 @@ out: } #endif -void __init identify_boot_cpu(void) -{ - identify_cpu(&boot_cpu_data); - init_amd_e400_c1e_mask(); -#ifdef CONFIG_X86_32 - sysenter_setup(); - enable_sep_cpu(); -#endif - cpu_detect_tlb(&boot_cpu_data); -} - -void identify_secondary_cpu(struct cpuinfo_x86 *c) -{ - BUG_ON(c == &boot_cpu_data); - identify_cpu(c); -#ifdef CONFIG_X86_32 - enable_sep_cpu(); -#endif - mtrr_ap_init(); -} - struct msr_range { unsignedmin; unsignedmax; @@ -1082,6 +1061,38 @@ static __init int setup_show_msr(char *arg) } __setup("show_msr=", setup_show_msr); +static void print_cpu_msr(struct cpuinfo_x86 *c) +{ + if (c->cpu_index < show_msr) + __print_cpu_msr(); +} + +void __init identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); + init_amd_e400_c1e_mask(); +#ifdef CONFIG_X86_32 + sysenter_setup(); + enable_sep_cpu(); +#endif + cpu_detect_tlb(&boot_cpu_data); + + print_cpu_msr(&boot_cpu_data); +} + +void identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == 
&boot_cpu_data); + identify_cpu(c); +#ifdef CONFIG_X86_32 + enable_sep_cpu(); +#endif + + print_cpu_msr(c); + + mtrr_ap_init(); +} + static __init int setup_noclflush(char *arg) { setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); @@ -1115,14 +1126,6 @@ void print_cpu_info(struct cpuinfo_x86 *c) printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); else printk(KERN_CONT ")\n"); - - print_cpu_msr(c); -} - -void print_cpu_msr(struct cpuinfo_x86 *c) -{ - if (c->cpu_index < show_msr) - __print_cpu_msr(); } static __init int setup_disablecpuid(char *arg) -- 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/