[PATCH v3 10/51] PCI: Treat ROM resource as optional during realloc

2015-07-27 Thread Yinghai Lu
Try to allocate ROM resources together with the must-have ones; if
they cannot be assigned, fall back to the must-have ones only and
just skip the ROM resources.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 37 -
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 292f2a5..3abf249 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -372,18 +372,10 @@ static void assign_requested_resources_sorted(struct 
list_head *head,
idx = res - &dev_res->dev->resource[0];
if (resource_size(res) &&
pci_assign_resource(dev_res->dev, idx)) {
-   if (fail_head) {
-   /*
-* if the failed res is for ROM BAR, and it will
-* be enabled later, don't add it to the list
-*/
-   if (!((idx == PCI_ROM_RESOURCE) &&
- (!(res->flags & IORESOURCE_ROM_ENABLE
-   add_to_list(fail_head,
-   dev_res->dev, res,
-   0 /* don't care */,
-   0 /* don't care */);
-   }
+   if (fail_head)
+   add_to_list(fail_head, dev_res->dev, res,
+   0 /* don't care */,
+   0 /* don't care */);
reset_resource(res);
}
}
@@ -1143,6 +1135,19 @@ out:
return good_align;
 }
 
+static inline bool is_optional(int i)
+{
+
+   if (i == PCI_ROM_RESOURCE)
+   return true;
+
+#ifdef CONFIG_PCI_IOV
+   if (i >= PCI_IOV_RESOURCES && i <= PCI_IOV_RESOURCE_END)
+   return true;
+#endif
+
+   return false;
+}
 /**
  * pbus_size_mem() - size the memory window of a given bus
  *
@@ -1199,10 +1204,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 
r_size = resource_size(r);
align = pci_resource_alignment(dev, r);
-#ifdef CONFIG_PCI_IOV
-   /* put SRIOV requested res to the optional list */
-   if (realloc_head && i >= PCI_IOV_RESOURCES &&
-   i <= PCI_IOV_RESOURCE_END) {
+   /* put SRIOV/ROM res to realloc list */
+   if (realloc_head && is_optional(i)) {
add_to_align_test_list(&align_test_add_list,
align, r_size);
r->end = r->start - 1;
@@ -1212,7 +1215,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
max_add_align = align;
continue;
}
-#endif
+
if (align > (1ULL<<37)) { /*128 Gb*/
dev_warn(&dev->dev, "disabling BAR %d: %pR (bad 
alignment %#llx)\n",
i, r, (unsigned long long) align);
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] Do not reserve crashkernel high memory if crashkernel low memory reserving failed

2015-07-27 Thread Yinghai Lu
On Tue, Jul 21, 2015 at 12:31 AM, Dave Young  wrote:
>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
>> index 80f874b..36aeac3 100644
>> --- a/arch/x86/kernel/setup.c
>> +++ b/arch/x86/kernel/setup.c
>> @@ -513,7 +513,7 @@ static void __init 
>> memblock_x86_reserve_range_setup_data(void)
>>  # define CRASH_KERNEL_ADDR_HIGH_MAX  MAXMEM
>>  #endif
>>
>> -static void __init reserve_crashkernel_low(void)
>> +static int __init reserve_crashkernel_low(void)
>>  {
>>  #ifdef CONFIG_X86_64
>>   const unsigned long long alignment = 16<<20;/* 16M */
>> @@ -542,7 +542,7 @@ static void __init reserve_crashkernel_low(void)
>>   } else {
>>   /* passed with crashkernel=0,low ? */
>>   if (!low_size)
>> - return;
>> + return 0;
>>   }
>>
>>   low_base = memblock_find_in_range(low_size, (1ULL<<32),
>> @@ -552,7 +552,7 @@ static void __init reserve_crashkernel_low(void)
>>   if (!auto_set)
>>   pr_info("crashkernel low reservation failed - No 
>> suitable area found.\n");
>>
>> - return;
>> + return -EINVAL;
>>   }
>>
>>   memblock_reserve(low_base, low_size);
>> @@ -564,6 +564,7 @@ static void __init reserve_crashkernel_low(void)
>>   crashk_low_res.end   = low_base + low_size - 1;
>>   insert_resource(&iomem_resource, &crashk_low_res);
>>  #endif
>> + return 0;
>>  }
>>
>>  static void __init reserve_crashkernel(void)
>> @@ -613,6 +614,10 @@ static void __init reserve_crashkernel(void)
>>   return;
>>   }
>>   }
>> +
>> + if (crash_base >= (1ULL<<32) && reserve_crashkernel_low())
>> + return;
>> +
>>   memblock_reserve(crash_base, crash_size);
>>
>>   printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
>> @@ -624,9 +629,6 @@ static void __init reserve_crashkernel(void)
>>   crashk_res.start = crash_base;
>>   crashk_res.end   = crash_base + crash_size - 1;
>>   insert_resource(&iomem_resource, &crashk_res);
>> -
>> - if (crash_base >= (1ULL<<32))
>> - reserve_crashkernel_low();
>>  }
>>  #else
>>  static void __init reserve_crashkernel(void)

No, you cannot move the call site of reserve_crashkernel_low().

old sequence:

memblock_find_in_range  for high
memblock_reserve for high
memblock_find_in_range  for low
memblock_reserve for low

now you change to:
memblock_find_in_range  for high
memblock_find_in_range  for low
memblock_reserve for low
memblock_reserve for high

During memblock_reserve, we may double the memblock reserve array.
So there is a possibility that the new memblock reserve array overlaps
with the range for crashdump high.

So you should keep the old sequence, and if reserve_crashkernel_low()
fails, just call memblock_free() to free the high range that was
reserved before.

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] Do not reserve crashkernel high memory if crashkernel low memory reserving failed

2015-07-22 Thread Yinghai Lu
On Tue, Jul 21, 2015 at 5:59 PM, Baoquan He  wrote:
>> That commit should only be used to workaround some systems that
>> have partial iommu support.
>
> Those big servers mostly has hardware iommu. But they still can
> enable swiotlb suport. Then low memory is needed.

Do you have the whole boot log? I don't understand why those systems
cannot use the full iommu. Is it a BIOS problem or a HW/silicon limitation?

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] Do not reserve crashkernel high memory if crashkernel low memory reserving failed

2015-07-22 Thread Yinghai Lu
On Tue, Jul 21, 2015 at 9:47 PM, Minfei Huang  wrote:
>
> Since low memory does not need for some machines, how about kexec does
> not allocate low memory automatically, if cmdline does not specify the
> option ",low". User shall know well, if they specify the cmdline with
> option ",high".

That was what I tried to do at that time.  Some others thought that
automatically setting a small value would be friendlier to users.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] Do not reserve crashkernel high memory if crashkernel low memory reserving failed

2015-07-22 Thread Yinghai Lu
On Wed, Jul 22, 2015 at 3:11 AM, Joerg Roedel  wrote:
> On Tue, Jul 21, 2015 at 12:22:53PM -0700, Yinghai Lu wrote:
>> On Tue, Jul 21, 2015 at 1:58 AM, Baoquan He  wrote:
>>
>> > Maybe system which don't need low memory is rare, only for testing?
>>
>> No, it is not rare.
>>
>> All recent intel based systems with iommu support does not need low.
>
> All Intel-IOMMU systems have the iommu disabled by default (at least
> that is the default in most distros). So low memory is definitly needed
> by those systems too.

Do those systems need crashkernel=,high?

Do you mean the BIOS has it disabled by not exposing the DMAR table?

The kernels for RHEL 6 and RHEL 7 have it enabled.
The openSUSE kernel has it enabled too.


>
>> that reserve 256M low always. and those 256M get wasted.
>>
>> That commit should only be used to workaround some systems that
>> have partial iommu support.
>
> We currently lack the infrastructure for that, but I am happy to review
> patches. How about letting subsystems announce their need for low
> crash-kernel memory and allocate based on that?
>
> The subsystems (like iommu or swiotlb code, for example) could even
> announce how much memory they need and we base our allocation on that.

That would be hard, as we don't know which kernel parameters the
second kernel will be given.
The user could disable the iommu etc. on the kernel command line for
the second kernel.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] Do not reserve crashkernel high memory if crashkernel low memory reserving failed

2015-07-21 Thread Yinghai Lu
On Tue, Jul 21, 2015 at 1:58 AM, Baoquan He  wrote:

> Maybe system which don't need low memory is rare, only for testing?

No, it is not rare.

All recent Intel-based systems with iommu support do not need low.

And those systems get punished by following patch:

| commit 94fb9334182284e8e7e4bcb9125c25dc33af19d4
| Author: Joerg Roedel 
| Date:   Wed Jun 10 17:49:42 2015 +0200
|
|x86/crash: Allocate enough low memory when crashkernel=high

which always reserves 256M low, and those 256M get wasted.

That commit should only be used to work around some systems that
have partial iommu support.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86/mm: Assign the initail value to the pmd_idx

2015-07-20 Thread Yinghai Lu
On Sun, Jul 12, 2015 at 5:18 AM, Minfei Huang  wrote:
> From: Minfei Huang 
>
> The variable pmd_idx is undefined, when we try to start the loop to
> calculate the page.
>
> Assign the proper value which indexes the start address to make it work
> well.
>
> Signed-off-by: Minfei Huang 
> ---
>  arch/x86/mm/init_32.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
> index 8340e45..68aec42 100644
> --- a/arch/x86/mm/init_32.c
> +++ b/arch/x86/mm/init_32.c
> @@ -137,6 +137,7 @@ page_table_range_init_count(unsigned long start, unsigned 
> long end)
>
> vaddr = start;
> pgd_idx = pgd_index(vaddr);
> +   pmd_idx = pmd_index(vaddr);
>
> for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
>     for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);

Reviewed-by: Yinghai Lu 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 00/49] PCI: Resource allocation cleanup for v4.3

2015-07-16 Thread Yinghai Lu
On Thu, Jul 16, 2015 at 6:51 PM, Wei Yang  wrote:
> Yinghai,
>
> Tested your latest for for-pci-v4.3-next branch, it works fine on my P8
> machine.

Thanks for testing.

>
> BTW, the SRIOV works fine too. Previously failure is based on my mistake, I
> have disabled SRIOV :-(

Good.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3] [SCSI] mpt2sas, mpt3sas: Abort initialization if no memory I/O resources detected

2015-07-15 Thread Yinghai Lu
On Wed, Jul 15, 2015 at 6:52 AM, Timothy Pearson
 wrote:
>> I have just kept the same description provide by Timothy in his
>> initial patch.
>>
>> But I observe that their may be chance of getting "unable to handle
>> kernel NULL pointer dereference" kernel panic if no Memory Resource
>> available in the PCI subsystem. So agreed to the Timothy proposal of
>> aborting the driver initialization if it doesn't detect any Memory
>> resource instead of whole system get into panic state.
>>
> On some systems Linux is unable / unwilling to assign a BAR if the BIOS
> does not assign one at startup.  I didn't look into the Linux allocator
> side of things in much detail, but it is quite possible that Linux is
> unaware the device only has partial resources assigned.
>

It would be great if you could post the boot log so we can figure out
why those BARs are not assigned.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3] [SCSI] mpt2sas, mpt3sas: Abort initialization if no memory I/O resources detected

2015-07-14 Thread Yinghai Lu
On Tue, Jul 14, 2015 at 9:49 PM, Sreekanth Reddy
 wrote:
> Driver crashes if the BIOS do not set up at least one
> memory I/O resource. This failure can happen if the device is too
> slow to respond during POST and is missed by the BIOS, but Linux
> then detects the device later in the boot process.

But the PCI subsystem should assign resources to those unassigned BARs.

Do you mean that even the kernel cannot assign resources to them? Or
does it take too long for the mpt FW to get ready?

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 17/36] PCI: Add support for more than two alt_size under same bridge

2015-07-14 Thread Yinghai Lu
On Tue, Jul 14, 2015 at 8:07 PM, Yijing Wang  wrote:
> On 2015/7/7 7:39, Yinghai Lu wrote:
>> Need to increase size to make sure it could fit all alt entries.
>>
>> So at last, we use 8M/17M as parent bridge alt_align/alt_size.
>
> Tested-by: Yijing Wang 

Thanks for testing.

>
> Hi Yinghai, does this patch depend on the previous items in this patchset ?

Yes, it depends on most of the patches from patch 1 up to this patch.

> Could you provide another version of this patch for stable branch, eg. 3.10 
> stable ?

That is RHEL 7 kernel, right ?

After those patches get into upstream, I will try to port them to 3.10 stable.

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 35/42] x86, boot, PCI: Convert SETUP_PCI data to list

2015-07-14 Thread Yinghai Lu
On Tue, Jul 14, 2015 at 3:35 PM, Bjorn Helgaas  wrote:
>> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
>> index 16ace12..32d4f21 100644
>> --- a/arch/x86/pci/common.c
>> +++ b/arch/x86/pci/common.c
>
>> +struct firmware_setup_pci_entry {
>> + struct list_head list;
>> + uint16_t vendor;
>> + uint16_t devid;
>> + uint64_t pcilen;
>
> Is there a reason to use uint16_t and uint64_t instead of u16 and u64?

keep them same as arch/x86/include/asm/pci.h::pci_setup_rom.

and we have that from:

commit dd5fc854de5fd37adfcef8a366cd21a55aa01d3d
Author: Matthew Garrett 
Date:   Wed Dec 5 14:33:26 2012 -0700

EFI: Stash ROMs if they're not in the PCI BAR

EFI provides support for providing PCI ROMs via means other than the ROM
BAR. This support vanishes after we've exited boot services, so add support
for stashing copies of the ROMs in setup_data if they're not otherwise
available.

Signed-off-by: Matthew Garrett 
Signed-off-by: Bjorn Helgaas 
Tested-by: Seth Forshee 

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 6e41b93..dba7805 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -171,4 +171,16 @@ cpumask_of_pcibus(const struct pci_bus *bus)
 }
 #endif

+struct pci_setup_rom {
+   struct setup_data data;
+   uint16_t vendor;
+   uint16_t devid;
+   uint64_t pcilen;
+   unsigned long segment;
+   unsigned long bus;
+   unsigned long device;
+   unsigned long function;
+   uint8_t romdata[0];
+};
+
 #endif /* _ASM_X86_PCI_H */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 27/49] PCI: Unifiy calculate_size for io port and mmio

2015-07-14 Thread Yinghai Lu
We should compare size + size1 against min_size for I/O ports.
For example, when a hotplug bridge has two child bridges and
each child bridge needs 0x1000, size1 will be 0x2000
and size will be 0. The min_size for the hotplug bridge is 0x100.
With the old calculate_iosize(), we get 0x3000 for the final
size because only size is compared with min_size. That is
not right; we should get 0x2000 instead.

After this change, calculate_memsize() and calculate_iosize()
are the same.

Change them to calculate_size.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 27 ++-
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 9d5e550..969a0b1 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1116,23 +1116,7 @@ static struct resource *find_free_bus_resource(struct 
pci_bus *bus,
return NULL;
 }
 
-static resource_size_t calculate_iosize(resource_size_t size,
-   resource_size_t min_size,
-   resource_size_t size1,
-   resource_size_t old_size,
-   resource_size_t align)
-{
-   if (size < min_size)
-   size = min_size;
-   if (old_size == 1)
-   old_size = 0;
-   size = ALIGN(size + size1, align);
-   if (size < old_size)
-   size = old_size;
-   return size;
-}
-
-static resource_size_t calculate_memsize(resource_size_t size,
+static resource_size_t calculate_size(resource_size_t size,
resource_size_t min_size,
resource_size_t old_size,
resource_size_t align)
@@ -1257,14 +1241,15 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
}
 
size = size_aligned_for_isa(size);
-   size0 = calculate_iosize(size, min_size, size1,
+   size += size1;
+   size0 = calculate_size(size, min_size,
resource_size(b_res), min_align);
sum_add_size = size_aligned_for_isa(sum_add_size);
sum_add_size += sum_add_size1;
if (sum_add_size < min_sum_size)
sum_add_size = min_sum_size;
size1 = !realloc_head ? size0 :
-   calculate_iosize(sum_add_size, min_size, 0,
+   calculate_size(sum_add_size, min_size,
resource_size(b_res), min_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
@@ -1617,7 +1602,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
if (size || min_size) {
min_align = calculate_mem_align(&align_test_list, max_align,
size, window_align);
-   size0 = calculate_memsize(size, min_size,
+   size0 = calculate_size(size, min_size,
  resource_size(b_res), min_align);
}
free_align_test_list(&align_test_list);
@@ -1642,7 +1627,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
min_add_align = calculate_mem_align(&align_test_add_list,
max_add_align, sum_add_size,
window_align);
-   size1 = calculate_memsize(sum_add_size, min_size,
+   size1 = calculate_size(sum_add_size, min_size,
 resource_size(b_res), min_add_align);
}
free_align_test_list(&align_test_add_list);
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 10/49] PCI: Treat ROM resource as optional during realloc

2015-07-14 Thread Yinghai Lu
Try to allocate ROM resources together with the must-have ones; if
they cannot be assigned, fall back to the must-have ones only and
just skip the ROM resources.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 37 -
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 292f2a5..3abf249 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -372,18 +372,10 @@ static void assign_requested_resources_sorted(struct 
list_head *head,
idx = res - &dev_res->dev->resource[0];
if (resource_size(res) &&
pci_assign_resource(dev_res->dev, idx)) {
-   if (fail_head) {
-   /*
-* if the failed res is for ROM BAR, and it will
-* be enabled later, don't add it to the list
-*/
-   if (!((idx == PCI_ROM_RESOURCE) &&
- (!(res->flags & IORESOURCE_ROM_ENABLE
-   add_to_list(fail_head,
-   dev_res->dev, res,
-   0 /* don't care */,
-   0 /* don't care */);
-   }
+   if (fail_head)
+   add_to_list(fail_head, dev_res->dev, res,
+   0 /* don't care */,
+   0 /* don't care */);
reset_resource(res);
}
}
@@ -1143,6 +1135,19 @@ out:
return good_align;
 }
 
+static inline bool is_optional(int i)
+{
+
+   if (i == PCI_ROM_RESOURCE)
+   return true;
+
+#ifdef CONFIG_PCI_IOV
+   if (i >= PCI_IOV_RESOURCES && i <= PCI_IOV_RESOURCE_END)
+   return true;
+#endif
+
+   return false;
+}
 /**
  * pbus_size_mem() - size the memory window of a given bus
  *
@@ -1199,10 +1204,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 
r_size = resource_size(r);
align = pci_resource_alignment(dev, r);
-#ifdef CONFIG_PCI_IOV
-   /* put SRIOV requested res to the optional list */
-   if (realloc_head && i >= PCI_IOV_RESOURCES &&
-   i <= PCI_IOV_RESOURCE_END) {
+   /* put SRIOV/ROM res to realloc list */
+   if (realloc_head && is_optional(i)) {
add_to_align_test_list(&align_test_add_list,
align, r_size);
r->end = r->start - 1;
@@ -1212,7 +1215,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
max_add_align = align;
continue;
}
-#endif
+
if (align > (1ULL<<37)) { /*128 Gb*/
dev_warn(&dev->dev, "disabling BAR %d: %pR (bad 
alignment %#llx)\n",
i, r, (unsigned long long) align);
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 07/49] PCI: Reorder resources list for must/optional resources

2015-07-14 Thread Yinghai Lu
After we update the size and alignment for must+optional resources, we
reorder them by the new alignment, but this is only done for STARTALIGN.

For SIZEALIGN type resources, the alignment changes after add_size is
added back, so they need to be sorted just like STARTALIGN type resources.

Also, we need to restore the original sort order after we restore the
resources to must-only when must+optional fails to allocate for all of them.

So move the reordering code out of the loop into a separate function,
and call it twice accordingly.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 62 +
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 7346bbf..6f2d508 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -287,6 +287,31 @@ static inline void reset_resource(struct resource *res)
res->flags = 0;
 }
 
+static void __sort_resources(struct list_head *head)
+{
+   struct pci_dev_resource *res1, *tmp_res, *res2;
+
+   list_for_each_entry_safe(res1, tmp_res, head, list) {
+   resource_size_t align1, size1, align2, size2;
+
+   align1 = pci_resource_alignment(res1->dev, res1->res);
+   size1 = resource_size(res1->res);
+
+   /* reorder it */
+   list_for_each_entry(res2, head, list) {
+   if (res2 == res1)
+   break;
+
+   align2 = pci_resource_alignment(res2->dev, res2->res);
+   size2 = resource_size(res2->res);
+   if (is_before(align1, size1, align2, size2)) {
+   list_move_tail(&res1->list, &res2->list);
+   break;
+   }
+   }
+   }
+}
+
 /**
  * reassign_resources_sorted() - satisfy any additional resource requests
  *
@@ -449,9 +474,9 @@ static void __assign_resources_sorted(struct list_head 
*head,
LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
-   struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
+   struct pci_dev_resource *dev_res, *tmp_res;
unsigned long fail_type;
-   resource_size_t add_align, align;
+   resource_size_t add_align;
 
/* Check if optional add_size is there */
if (!realloc_head || list_empty(realloc_head))
@@ -466,47 +491,32 @@ static void __assign_resources_sorted(struct list_head 
*head,
}
 
/* Update res in head list with add_size in realloc_head list */
-   list_for_each_entry_safe(dev_res, tmp_res, head, list) {
+   list_for_each_entry(dev_res, head, list) {
dev_res->res->end += get_res_add_size(realloc_head,
dev_res->res);
 
/*
 * There are two kinds of additional resources in the list:
-* 1. bridge resource  -- IORESOURCE_STARTALIGN
-* 2. SR-IOV resource   -- IORESOURCE_SIZEALIGN
-* Here just fix the additional alignment for bridge
+* 1. bridge resource with IORESOURCE_STARTALIGN
+*need to update start to change alignment
+* 2. resource with IORESOURCE_SIZEALIGN
+*update size above already change alignment.
 */
if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
continue;
 
add_align = get_res_add_align(realloc_head, dev_res->res);
 
-   /*
-* The "head" list is sorted by the alignment to make sure
-* resources with bigger alignment will be assigned first.
-* After we change the alignment of a dev_res in "head" list,
-* we need to reorder the list by alignment to make it
-* consistent.
-*/
-   if (add_align > dev_res->res->start) {
+   if (add_align) {
resource_size_t r_size = resource_size(dev_res->res);
 
dev_res->res->start = add_align;
dev_res->res->end = add_align + r_size - 1;
-
-   list_for_each_entry(dev_res2, head, list) {
-   align = pci_resource_alignment(dev_res2->dev,
-  dev_res2->res);
-   if (add_align > align) {
-   list_move_tail(&dev_res->list,
-  &dev_res2->list);
-   break;
-   }
-   }
}
-
}
 
+   __so

[PATCH v2 01/49] PCI: Cleanup res_to_dev_res() printout for addon resources

2015-07-14 Thread Yinghai Lu
Currently get_res_add_size() and get_res_add_align() both emit the same
printout from res_to_dev_res(), which is confusing.

Move the debug message printout out of res_to_dev_res();
later we will reuse res_to_dev_res() in other functions.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 34 --
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 508cc56..f0fa705 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -104,19 +104,9 @@ static struct pci_dev_resource *res_to_dev_res(struct 
list_head *head,
 {
struct pci_dev_resource *dev_res;
 
-   list_for_each_entry(dev_res, head, list) {
-   if (dev_res->res == res) {
-   int idx = res - &dev_res->dev->resource[0];
-
-   dev_printk(KERN_DEBUG, &dev_res->dev->dev,
-"res[%d]=%pR res_to_dev_res add_size %llx 
min_align %llx\n",
-idx, dev_res->res,
-(unsigned long long)dev_res->add_size,
-(unsigned long long)dev_res->min_align);
-
+   list_for_each_entry(dev_res, head, list)
+   if (dev_res->res == res)
return dev_res;
-   }
-   }
 
return NULL;
 }
@@ -127,7 +117,15 @@ static resource_size_t get_res_add_size(struct list_head 
*head,
struct pci_dev_resource *dev_res;
 
dev_res = res_to_dev_res(head, res);
-   return dev_res ? dev_res->add_size : 0;
+   if (!dev_res || !dev_res->add_size)
+   return 0;
+
+   dev_printk(KERN_DEBUG, &dev_res->dev->dev,
+  "BAR %d: %pR get_res_add_size add_size   %llx\n",
+  (int)(res - &dev_res->dev->resource[0]),
+  res, (unsigned long long)dev_res->add_size);
+
+   return dev_res->add_size;
 }
 
 static resource_size_t get_res_add_align(struct list_head *head,
@@ -136,7 +134,15 @@ static resource_size_t get_res_add_align(struct list_head 
*head,
struct pci_dev_resource *dev_res;
 
dev_res = res_to_dev_res(head, res);
-   return dev_res ? dev_res->min_align : 0;
+   if (!dev_res || !dev_res->min_align)
+   return 0;
+
+   dev_printk(KERN_DEBUG, &dev_res->dev->dev,
+  "BAR %d: %pR get_res_add_align min_align %llx\n",
+  (int)(res - &dev_res->dev->resource[0]),
+  res, (unsigned long long)dev_res->min_align);
+
+   return dev_res->min_align;
 }
 
 
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 00/49] PCI: Resource allocation cleanup for v4.3

2015-07-14 Thread Yinghai Lu
Hi,

After 5b28541552ef (PCI: Restrict 64-bit prefetchable bridge windows
to 64-bit resources), we got several reports of resource allocation
failures; we tried to fix the problem with resource clipping, and got
more problems.

One is a realloc failure with two graphics cards above 4G.
One is from sparc, which has problems with clipping as we don't parse
mem64 for it.

Another report is that PCI remove/rescan does not work on some setups
where the BIOS tends to allocate a small bus size.

This patchset enhances resource allocation to address those problems.
1. optimize bus mmio alignment calculation.
2. optimize bus mmio optional alignment calculation.
3. add support for alt size to prefer small bus size over small bus alignment
   when we have a small resource window on parent bridges.
4. treat ROM bar as optional resource.
5. during allocation, will pick up just fit resource.
6. parse MEM64 for sparc and other system with OF.
7. treat non-pref mmio64 if parent bridges are all pcie.
8. restore old pref allocation logic if hostbridge does not support mmio64 
really.
9. don't realloc resource if device firmware does not support bar change.
10. add pci=assign_pref_bars to clear and assign pref bars.
11. don't clear resource when allocation fails.

I put latest copy at:
  git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git 
for-pci-v4.3-next

That is based on kind of v4.2-rc2.

v2:
- fix resource_disabled allocation from powerpc
- other warnings that were found by Fengguang's build robot.
- io port alignment clean up.
- rebased to v4.2-rc2
- better for two alt_size support.

Thanks

Yinghai

Yinghai Lu (49):
  PCI: Cleanup res_to_dev_res() printout for addon resources
  PCI: Reuse res_to_dev_res in reassign_resources_sorted
  PCI: Use correct align for optional only resources during sorting
  PCI: Optimize bus align/size calculation during sizing
  PCI: Optimize bus align/size calculation for optional during sizing
  PCI: Don't add too much optional size for hotplug bridge mmio
  PCI: Reorder resources list for must/optional resources
  PCI: Remove duplicated code for resource sorting
  PCI: Rename pdev_sort_resources to pdev_check_resources
  PCI: Treat ROM resource as optional during realloc
  PCI: Add debug printout during releasing partial assigned resources
  PCI: Simplify res reference using in __assign_resourcs_sorted
  PCI: Separate realloc list checking after allocation
  PCI: Add __add_to_list()
  PCI: Cache window alignment value
  PCI: Check if resource is allocated before pci_assign
  PCI: Separate out save_resources/restore_resource
  PCI: Move comment to pci_need_to_release()
  PCI: Separate must+optional assigning to another function
  PCI: Skip must+optional if there is no optional addon
  PCI: Move saved required resource list out of must+optional assigning
  PCI: Add alt_size allocation support
  PCI: Add support for more than two alt_size under same bridge
  PCI: Better support for two alt_size
  PCI: Don't add too much optional size for hotplug bridge io
  PCI: Move ISA ioport align out of calculate_iosize
  PCI: Unifiy calculate_size for io port and mmio
  PCI: Allow optional only io resource must size to be 0
  PCI: Unify skip_ioresource_align()
  PCI: Kill macro checking for bus io port sizing
  resources: Split out __allocate_resource()
  resources: Make allocate_resource return just fit resource
  PCI: Check pref compatible bit for mem64 resource of pcie device
  PCI: Only treat non-pef mmio64 as pref if all bridges has MEM_64
  PCI: Add has_mem64 for host_bridge
  PCI: Only treat non-pef mmio64 as pref if host-bridge has_mem64
  PCI: Restore pref mmio allocation logic for hostbridge without mmio64
  sparc/PCI: Add mem64 resource parsing for root bus
  sparc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in of parsing
  powerpc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in of parsing
  of/PCI: Add IORESOURCE_MEM_64 for 64-bit resource
  PCI: Treat optional as must in first try for bridge rescan
  PCI: Get new realloc size for bridge for last try
  PCI: Don't release sibiling bridge resources during hotplug
  PCI: Don't release fixed resource for realloc
  PCI: Set resource to FIXED for lsi devices
  PCI, x86: Add pci=assign_pref_bars to re-allocate pref bars
  PCI: Introduce resource_disabled()
  PCI: Don't set flags to 0 when assign resource fail

 arch/alpha/kernel/pci.c   |2 +-
 arch/ia64/pci/pci.c   |4 +-
 arch/microblaze/pci/pci-common.c  |   23 +-
 arch/mn10300/unit-asb2305/pci-asb2305.c   |4 +-
 arch/mn10300/unit-asb2305/pci.c   |4 +-
 arch/powerpc/kernel/pci-common.c  |   27 +-
 arch/powerpc/kernel/pci_of_scan.c |4 +-
 arch/powerpc/platforms/powernv/pci-ioda.c |   12 +-
 arch/s390/pci/pci.c   |2 +-
 arch/sparc/kernel/of_device_32.c  |5 +-
 arch/sparc/kernel/of_device_64.

[PATCH v2 15/49] PCI: Cache window alignment value

2015-07-14 Thread Yinghai Lu
There are several calls to window_alignment(),
and we will have more for alt_size support.

Cache the value instead of fetching it repeatedly.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 57b5c09..1b5fbca 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1203,6 +1203,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
mask | IORESOURCE_PREFETCH, type);
LIST_HEAD(align_test_list);
LIST_HEAD(align_test_add_list);
+   resource_size_t window_align;
 
if (!b_res)
return -ENOSPC;
@@ -1212,6 +1213,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
min_size = 0;
}
 
+   window_align = window_alignment(bus, b_res->flags);
+
list_for_each_entry(dev, &bus->devices, bus_list) {
int i;
 
@@ -1272,10 +1275,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
}
}
 
-   max_align = max(max_align, window_alignment(bus, b_res->flags));
+   max_align = max(max_align, window_align);
if (size || min_size) {
min_align = calculate_mem_align(&align_test_list, max_align,
-size, window_alignment(bus, b_res->flags));
+   size, window_align);
size0 = calculate_memsize(size, min_size,
  resource_size(b_res), min_align);
}
@@ -1286,7 +1289,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
if (sum_add_size > size && realloc_head) {
min_add_align = calculate_mem_align(&align_test_add_list,
max_add_align, sum_add_size,
-   window_alignment(bus, b_res->flags));
+   window_align);
size1 = calculate_memsize(sum_add_size, min_size,
 resource_size(b_res), min_add_align);
}
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 06/49] PCI: Don't add too much optional size for hotplug bridge mmio

2015-07-14 Thread Yinghai Lu
The current code always adds 2M for hotplug bridge mmio, even when
there is already a child device under the bridge.

For example:
40:03.0 --- 43:00.0 --- 44:02.0 -+- 45:00.0
 \- 45:00.1

44:02.0 needs 1M as its must-have size for 45:00.0 and 45:00.1.
When we calculate add_size for 44:02.0, we pass 2M as the additional
size for the hotplug bridge, so the total becomes 3M.

That differs from the code before the optional-resource support was added,
and even from the current code path that treats optional as must-have by
not passing realloc_head. We only need 2M in total.

The optional size should be 1M, and total size should be 2M.

This patch changes the code to compare must+optional with min_sum_size,
which yields a smaller optional size.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 28 +++-
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 4c7f25f..7346bbf 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1156,7 +1156,6 @@ out:
  * @type2: second match type
  * @type3: third match type
  * @min_size : the minimum memory window that must to be allocated
- * @add_size : additional optional memory window
  * @realloc_head : track the additional memory window on this list
  *
  * Calculate the size of the bus and minimal alignment which
@@ -1169,10 +1168,11 @@ out:
 static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 unsigned long type, unsigned long type2,
 unsigned long type3,
-resource_size_t min_size, resource_size_t add_size,
+resource_size_t min_size,
 struct list_head *realloc_head)
 {
struct pci_dev *dev;
+   resource_size_t min_sum_size = 0;
resource_size_t min_align = 0, min_add_align = 0;
resource_size_t max_align = 0, max_add_align = 0;
resource_size_t size = 0, size0 = 0, size1 = 0, sum_add_size = 0;
@@ -1184,6 +1184,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
if (!b_res)
return -ENOSPC;
 
+   if (realloc_head) {
+   min_sum_size = min_size;
+   min_size = 0;
+   }
+
list_for_each_entry(dev, &bus->devices, bus_list) {
int i;
 
@@ -1254,8 +1259,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
}
free_align_test_list(&align_test_list);
 
-   if ((sum_add_size - size) < add_size)
-   sum_add_size = size + add_size;
+   if (sum_add_size < min_sum_size)
+   sum_add_size = min_sum_size;
if (sum_add_size > size && realloc_head) {
min_add_align = calculate_mem_align(&align_test_add_list,
max_add_align, sum_add_size,
@@ -1392,7 +1397,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
 {
struct pci_dev *dev;
unsigned long mask, prefmask, type2 = 0, type3 = 0;
-   resource_size_t additional_mem_size = 0, additional_io_size = 0;
+   resource_size_t min_mem_size = 0, additional_io_size = 0;
struct resource *b_res;
int ret;
 
@@ -1426,7 +1431,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
pci_bridge_check_ranges(bus);
if (bus->self->is_hotplug_bridge) {
additional_io_size  = pci_hotplug_io_size;
-   additional_mem_size = pci_hotplug_mem_size;
+   min_mem_size = pci_hotplug_mem_size;
}
/* Fall through */
default:
@@ -1445,8 +1450,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
prefmask |= IORESOURCE_MEM_64;
ret = pbus_size_mem(bus, prefmask, prefmask,
  prefmask, prefmask,
- realloc_head ? 0 : additional_mem_size,
- additional_mem_size, realloc_head);
+ min_mem_size, realloc_head);
 
/*
 * If successful, all non-prefetchable resources
@@ -1469,8 +1473,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
prefmask &= ~IORESOURCE_MEM_64;
ret = pbus_size_mem(bus, prefmask, prefmask,
 prefmask, prefmask,
-realloc_head ? 0 : additional_mem_size,
-additional_mem_size, realloc_head);
+min_mem_size, realloc_head);
 
/*
 * If successful, only non-prefetchable resources
@@ -1479,7 +1482,7 @@ void 

[PATCH v2 08/49] PCI: Remove duplicated code for resource sorting

2015-07-14 Thread Yinghai Lu
Now __sort_resources() and pdev_sort_resources() both contain sorting
code.

As we are going to call __sort_resources() from several places later,
keep __sort_resources() and remove the duplicated sorting code from
pdev_sort_resources().


Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 22 +++---
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 6f2d508..6642a60 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -215,9 +215,8 @@ static void pdev_sort_resources(struct pci_dev *dev,
 
for (i = 0; i < PCI_NUM_RESOURCES; i++) {
struct resource *r;
-   struct pci_dev_resource *dev_res, *tmp;
+   struct pci_dev_resource *tmp;
resource_size_t r_align;
-   struct list_head *n;
 
r = &dev->resource[i];
 
@@ -240,22 +239,7 @@ static void pdev_sort_resources(struct pci_dev *dev,
tmp->res = r;
tmp->dev = dev;
 
-   /* fallback is smallest one or list is empty*/
-   n = head;
-   list_for_each_entry(dev_res, head, list) {
-   resource_size_t align;
-
-   align = __pci_resource_alignment(dev_res->dev,
-dev_res->res,
-realloc_head);
-
-   if (r_align > align) {
-   n = &dev_res->list;
-   break;
-   }
-   }
-   /* Insert it just before n*/
-   list_add_tail(&tmp->list, n);
+   list_add_tail(&tmp->list, head);
}
 }
 
@@ -558,9 +542,9 @@ static void __assign_resources_sorted(struct list_head 
*head,
}
free_list(&save_head);
 
+requested_and_reassign:
__sort_resources(head);
 
-requested_and_reassign:
/* Satisfy the must-have resource requests */
assign_requested_resources_sorted(head, fail_head);
 
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 19/49] PCI: Separate must+optional assigning to another function

2015-07-14 Thread Yinghai Lu
__assign_resources_sorted() is getting too big if we put alt_size support
into it.  Split must_add assigning code out to another function.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 47 +++
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index f8b9a24..d1f9e19 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -473,20 +473,9 @@ static void restore_resource(struct pci_dev_resource 
*save_res,
res->flags = save_res->flags;
 }
 
-static void __assign_resources_sorted(struct list_head *head,
-struct list_head *realloc_head,
-struct list_head *fail_head)
+static bool __assign_resources_must_add_sorted(struct list_head *head,
+struct list_head *realloc_head)
 {
-   /*
-* Should not assign requested resources at first.
-*   they could be adjacent, so later reassign can not reallocate
-*   them one by one in parent resource window.
-* Try to assign requested + add_size at beginning
-*  if could do that, could get out early.
-*  if could not do that, we still try to assign requested at first,
-*then try to reassign add_size for some resources.
-*/
-
LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
@@ -495,12 +484,8 @@ static void __assign_resources_sorted(struct list_head 
*head,
resource_size_t add_align;
struct resource *res;
 
-   /* Check if optional add_size is there */
-   if (!realloc_head || list_empty(realloc_head))
-   goto requested_and_reassign;
-
if (!save_resources(head, &save_head))
-   goto requested_and_reassign;
+   return false;
 
/* Update res in head list with add_size in realloc_head list */
list_for_each_entry(dev_res, head, list) {
@@ -539,7 +524,8 @@ static void __assign_resources_sorted(struct list_head 
*head,
remove_from_list(realloc_head, dev_res->res);
free_list(&save_head);
free_list(head);
-   return;
+
+   return true;
}
 
/* check failed type */
@@ -574,7 +560,28 @@ static void __assign_resources_sorted(struct list_head 
*head,
 
free_list(&save_head);
 
-requested_and_reassign:
+   return false;
+}
+
+static void __assign_resources_sorted(struct list_head *head,
+struct list_head *realloc_head,
+struct list_head *fail_head)
+{
+   /*
+* Should not assign requested resources at first.
+*   they could be adjacent, so later reassign can not reallocate
+*   them one by one in parent resource window.
+* Try to assign requested + add_size at beginning
+*  if could do that, could get out early.
+*  if could not do that, we still try to assign requested at first,
+*then try to reassign add_size for some resources.
+*/
+
+   /* Check must+optional add */
+   if (realloc_head && !list_empty(realloc_head) &&
+   __assign_resources_must_add_sorted(head, realloc_head))
+   return;
+
__sort_resources(head);
 
/* Satisfy the must-have resource requests */
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 02/49] PCI: Reuse res_to_dev_res in reassign_resources_sorted

2015-07-14 Thread Yinghai Lu
Now that res_to_dev_res() no longer prints a debug message, reuse it
in reassign_resources_sorted().

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index f0fa705..247d8fe 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -240,26 +240,17 @@ static void reassign_resources_sorted(struct list_head 
*realloc_head,
 {
struct resource *res;
struct pci_dev_resource *add_res, *tmp;
-   struct pci_dev_resource *dev_res;
resource_size_t add_size, align;
int idx;
 
list_for_each_entry_safe(add_res, tmp, realloc_head, list) {
-   bool found_match = false;
-
res = add_res->res;
/* skip resource that has been reset */
if (!res->flags)
goto out;
 
/* skip this resource if not found in head list */
-   list_for_each_entry(dev_res, head, list) {
-   if (dev_res->res == res) {
-   found_match = true;
-   break;
-   }
-   }
-   if (!found_match)/* just skip */
+   if (!res_to_dev_res(head, res))
continue;
 
idx = res - &add_res->dev->resource[0];
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 13/49] PCI: Separate realloc list checking after allocation

2015-07-14 Thread Yinghai Lu
The realloc list must be empty after allocation, and we check for that.

Move the realloc list check into a separate function.

Also add the check that was missing in the acpiphp driver.

Signed-off-by: Yinghai Lu 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: linux-a...@vger.kernel.org
---
 drivers/pci/hotplug/acpiphp_glue.c |  1 +
 drivers/pci/pci.h  |  1 +
 drivers/pci/setup-bus.c| 11 ---
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/hotplug/acpiphp_glue.c 
b/drivers/pci/hotplug/acpiphp_glue.c
index ff53856..134caee 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -507,6 +507,7 @@ static void enable_slot(struct acpiphp_slot *slot)
}
}
__pci_bus_assign_resources(bus, &add_list, NULL);
+   __pci_bus_check_realloc(&add_list);
 
acpiphp_sanitize_bus(bus);
pcie_bus_configure_settings(bus);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 4ff0ff1..2b83977 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -235,6 +235,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus,
 void __pci_bus_assign_resources(const struct pci_bus *bus,
struct list_head *realloc_head,
struct list_head *fail_head);
+void __pci_bus_check_realloc(struct list_head *realloc_head);
 bool pci_bus_clip_resource(struct pci_dev *dev, int idx);
 
 void pci_reassigndev_resource_alignment(struct pci_dev *dev);
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 06664db..f30225c 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -350,6 +350,11 @@ out:
}
 }
 
+void __pci_bus_check_realloc(struct list_head *realloc_head)
+{
+   BUG_ON(!list_empty(realloc_head));
+}
+
 /**
  * assign_requested_resources_sorted() - satisfy resource requests
  *
@@ -1861,7 +1866,7 @@ again:
/* Depth last, allocate resources and update the hardware. */
__pci_bus_assign_resources(bus, add_list, &fail_head);
if (add_list)
-   BUG_ON(!list_empty(add_list));
+   __pci_bus_check_realloc(add_list);
tried_times++;
 
/* any device complain? */
@@ -1936,7 +1941,7 @@ void pci_assign_unassigned_bridge_resources(struct 
pci_dev *bridge)
 again:
__pci_bus_size_bridges(parent, &add_list);
__pci_bridge_assign_resources(bridge, &add_list, &fail_head);
-   BUG_ON(!list_empty(&add_list));
+   __pci_bus_check_realloc(&add_list);
tried_times++;
 
if (list_empty(&fail_head))
@@ -1995,6 +2000,6 @@ void pci_assign_unassigned_bus_resources(struct pci_bus 
*bus)
 &add_list);
up_read(&pci_bus_sem);
__pci_bus_assign_resources(bus, &add_list, NULL);
-   BUG_ON(!list_empty(&add_list));
+   __pci_bus_check_realloc(&add_list);
 }
 EXPORT_SYMBOL_GPL(pci_assign_unassigned_bus_resources);
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 12/49] PCI: Simplify res reference using in __assign_resourcs_sorted

2015-07-14 Thread Yinghai Lu
Use res instead of dev_res->res.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 32 
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 6dff258..06664db 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -453,6 +453,7 @@ static void __assign_resources_sorted(struct list_head 
*head,
struct pci_dev_resource *dev_res, *tmp_res;
unsigned long fail_type;
resource_size_t add_align;
+   struct resource *res;
 
/* Check if optional add_size is there */
if (!realloc_head || list_empty(realloc_head))
@@ -468,8 +469,8 @@ static void __assign_resources_sorted(struct list_head 
*head,
 
/* Update res in head list with add_size in realloc_head list */
list_for_each_entry(dev_res, head, list) {
-   dev_res->res->end += get_res_add_size(realloc_head,
-   dev_res->res);
+   res = dev_res->res;
+   res->end += get_res_add_size(realloc_head, res);
 
/*
 * There are two kinds of additional resources in the list:
@@ -478,16 +479,16 @@ static void __assign_resources_sorted(struct list_head 
*head,
 * 2. resource with IORESOURCE_SIZEALIGN
 *update size above already change alignment.
 */
-   if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
+   if (!(res->flags & IORESOURCE_STARTALIGN))
continue;
 
-   add_align = get_res_add_align(realloc_head, dev_res->res);
+   add_align = get_res_add_align(realloc_head, res);
 
if (add_align) {
-   resource_size_t r_size = resource_size(dev_res->res);
+   resource_size_t r_size = resource_size(res);
 
-   dev_res->res->start = add_align;
-   dev_res->res->end = add_align + r_size - 1;
+   res->start = add_align;
+   res->end = add_align + r_size - 1;
}
}
 
@@ -509,21 +510,21 @@ static void __assign_resources_sorted(struct list_head 
*head,
/* check failed type */
fail_type = pci_fail_res_type_mask(&local_fail_head);
/* remove not need to be released assigned res from head list etc */
-   list_for_each_entry_safe(dev_res, tmp_res, head, list)
-   if (dev_res->res->parent &&
-   !pci_need_to_release(fail_type, dev_res->res)) {
+   list_for_each_entry_safe(dev_res, tmp_res, head, list) {
+   res = dev_res->res;
+   if (res->parent && !pci_need_to_release(fail_type, res)) {
/* remove it from realloc_head list */
-   remove_from_list(realloc_head, dev_res->res);
-   remove_from_list(&save_head, dev_res->res);
+   remove_from_list(realloc_head, res);
+   remove_from_list(&save_head, res);
list_del(&dev_res->list);
kfree(dev_res);
}
+   }
 
free_list(&local_fail_head);
/* Release assigned resource */
list_for_each_entry(dev_res, head, list) {
-   struct resource *res = dev_res->res;
-
+   res = dev_res->res;
if (res->parent) {
dev_printk(KERN_DEBUG, &dev_res->dev->dev,
   "BAR %d: released %pR\n",
@@ -534,8 +535,7 @@ static void __assign_resources_sorted(struct list_head 
*head,
}
/* Restore start/end/flags from saved list */
list_for_each_entry(save_res, &save_head, list) {
-   struct resource *res = save_res->res;
-
+   res = save_res->res;
res->start = save_res->start;
res->end = save_res->end;
res->flags = save_res->flags;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 03/49] PCI: Use correct align for optional only resources during sorting

2015-07-14 Thread Yinghai Lu
When sorting before assignment, we only put resources with a non-zero
alignment into the sorted list, so optional resources whose must-have size
is 0 and that only have an add-on part need to report a correct alignment.

SRIOV BARs are treated as optional resources, but we always read the
alignment for SRIOV BARs, so they are ok.
Hotplug bridge resources use STARTALIGN, so they are ok when the size is 0
as long as we have the correct start for them.

Later we want to treat the ROM BAR as an optional resource as well, and it
uses SIZEALIGN, so we need a way to get the alignment for it.

We can use the add-on resource alignment in that case, and that also
works for the SRIOV path and the hotplug bridge resource path.

The sorted list will contain must-have resources with align/size of 0/0 to
hold a spot for the optional resources.

We need to pass realloc_head from the sizing stage to the sorting stage,
look up the entry in the realloc list, and calculate the alignment from it.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431
Reported-by: TJ 
Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 50 ++---
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 247d8fe..27cb0f0 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -145,9 +145,43 @@ static resource_size_t get_res_add_align(struct list_head 
*head,
return dev_res->min_align;
 }
 
+static resource_size_t __pci_resource_alignment(
+   struct pci_dev *dev,
+   struct resource *r,
+   struct list_head *realloc_head)
+{
+   resource_size_t r_align = pci_resource_alignment(dev, r);
+   resource_size_t orig_start, orig_end;
+   struct pci_dev_resource *dev_res;
+
+   if (r_align || !realloc_head)
+   return r_align;
+
+   dev_res = res_to_dev_res(realloc_head, r);
+   if (!dev_res || !dev_res->add_size)
+   return r_align;
+
+   orig_start = r->start;
+   orig_end = r->end;
+   r->end += dev_res->add_size;
+   if ((r->flags & IORESOURCE_STARTALIGN)) {
+   resource_size_t r_size = resource_size(r);
+   resource_size_t add_align = dev_res->min_align;
+
+   r->start = add_align;
+   r->end = add_align + r_size - 1;
+   }
+   r_align = pci_resource_alignment(dev, r);
+   r->start = orig_start;
+   r->end = orig_end;
+
+   return r_align;
+}
 
 /* Sort resources by alignment */
-static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
+static void pdev_sort_resources(struct pci_dev *dev,
+struct list_head *realloc_head,
+struct list_head *head)
 {
int i;
 
@@ -165,7 +199,7 @@ static void pdev_sort_resources(struct pci_dev *dev, struct 
list_head *head)
if (!(r->flags) || r->parent)
continue;
 
-   r_align = pci_resource_alignment(dev, r);
+   r_align = __pci_resource_alignment(dev, r, realloc_head);
if (!r_align) {
dev_warn(&dev->dev, "BAR %d: %pR has bogus alignment\n",
 i, r);
@@ -183,8 +217,9 @@ static void pdev_sort_resources(struct pci_dev *dev, struct 
list_head *head)
list_for_each_entry(dev_res, head, list) {
resource_size_t align;
 
-   align = pci_resource_alignment(dev_res->dev,
-dev_res->res);
+   align = __pci_resource_alignment(dev_res->dev,
+dev_res->res,
+realloc_head);
 
if (r_align > align) {
n = &dev_res->list;
@@ -197,6 +232,7 @@ static void pdev_sort_resources(struct pci_dev *dev, struct 
list_head *head)
 }
 
 static void __dev_sort_resources(struct pci_dev *dev,
+struct list_head *realloc_head,
 struct list_head *head)
 {
u16 class = dev->class >> 8;
@@ -213,7 +249,7 @@ static void __dev_sort_resources(struct pci_dev *dev,
return;
}
 
-   pdev_sort_resources(dev, head);
+   pdev_sort_resources(dev, realloc_head, head);
 }
 
 static inline void reset_resource(struct resource *res)
@@ -501,7 +537,7 @@ static void pdev_assign_resources_sorted(struct pci_dev 
*dev,
 {
LIST_HEAD(head);
 
-   __dev_sort_resources(dev, &head);
+   __dev_sort_resources(dev, add_head, &head);
__assign_resources_sorted(&head, add_head, fail_head);
 
 }
@@ -514,7 +550,7 @@ static void pbus_assign_resources_sorted(const

[PATCH v2 04/49] PCI: Optimize bus align/size calculation during sizing

2015-07-14 Thread Yinghai Lu
The current code tries to get the alignment as small as possible and uses
it to align the final size. But it does not handle resources whose size is
bigger than their alignment in an optimal way; the kernel only uses the max
alignment for them.

For example:
 when we have resources with align/size: 1M/2M, 512M/512M,
   bus resource min_align/size0 will be 512M/1024M,
   but optimal value should be 256M/768M.

The resource size is bigger than the resource alignment in the following
cases:
1. SRIOV BARs.
2. PCI bridges with several bridges or devices as children.

We can keep trying to allocate the children devices' resources within the
range [half_align, half_align + aligned_size).
If that succeeds, we can use that half_align as the new min_align.

After this patch, we get:
 align/size: 1M/2M, 2M/4M, 4M/8M, 8M/16M
 new min_align/min_size: 4M/32M, and old is 8M/32M

 align/size: 1M/2M, 2M/4M, 4M/8M
 new min_align/min_size: 2M/14M, and old is 4M/16M

 align/size: 1M/2M, 512M/512M
 new min_align/min_size: 256M/768M, and old is 512M/1024M

The real result from one system with one pcie card that has
four functions that support sriov:
 align/size:
   0080/0080
   0080/0080
   0080/0080
   0080/0080
   0001/0020
   0001/0020
   0001/0020
   0001/0020
   8000/8000
   8000/8000
   8000/8000
   8000/8000
   4000/0008
   4000/0008
   4000/0008
   4000/0008
 old min_align/min_size: 0040/02c0
 min_align/min_size: 0010/02b0

So align will be 1M instead of 4M.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431
Reported-by: TJ 
Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 195 ++--
 1 file changed, 157 insertions(+), 38 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 27cb0f0..ecdf011 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -30,6 +30,34 @@
 
 unsigned int pci_flags;
 
+static inline bool is_before(resource_size_t align1, resource_size_t size1,
+resource_size_t align2, resource_size_t size2)
+{
+   resource_size_t size1_left, size2_left;
+
+   /* big align is before small align */
+   if (align1 > align2)
+   return true;
+
+   /*
+* for same align:
+*   aligned is before not aligned
+*   for not aligned, big remainder is before small remainder
+*/
+   if (align1 == align2) {
+   size1_left = size1 & (align1 - 1);
+   if (!size1_left)
+   size1_left = align1;
+   size2_left = size2 & (align2 - 1);
+   if (!size2_left)
+   size2_left = align2;
+   if (size1_left > size2_left)
+   return true;
+   }
+
+   return false;
+}
+
 struct pci_dev_resource {
struct list_head list;
struct resource *res;
@@ -999,26 +1027,125 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
}
 }
 
-static inline resource_size_t calculate_mem_align(resource_size_t *aligns,
- int max_order)
+struct align_test_res {
+   struct list_head list;
+   struct resource res;
+   resource_size_t size;
+   resource_size_t align;
+};
+
+static void free_align_test_list(struct list_head *head)
 {
-   resource_size_t align = 0;
-   resource_size_t min_align = 0;
-   int order;
+   struct align_test_res *p, *tmp;
 
-   for (order = 0; order <= max_order; order++) {
-   resource_size_t align1 = 1;
+   list_for_each_entry_safe(p, tmp, head, list) {
+   list_del(&p->list);
+   kfree(p);
+   }
+}
 
-   align1 <<= (order + 20);
+static int add_to_align_test_list(struct list_head *head,
+ resource_size_t align, resource_size_t size)
+{
+   struct align_test_res *tmp;
+
+   tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+   if (!tmp)
+   return -ENOMEM;
+
+   tmp->align = align;
+   tmp->size = size;
+
+   list_add_tail(&tmp->list, head);
+
+   return 0;
+}
+
+static void __sort_align_test(struct list_head *head)
+{
+   struct align_test_res *res1, *tmp_res, *res2;
 
-   if (!align)
-   min_align = align1;
-   else if (ALIGN(align + min_align, min_align) < align1)
-   min_align = align1 >> 1;
-   align += aligns[order];
+   list_for_each_entry_safe(res1, tmp_res, head, list) {
+   /* reorder it */
+   list_for_each_entry(res2, head, list) {
+   if (res2 == res1)
+   break;
+
+   if (is_before(res1->align, res1->size,
+  

[PATCH v2 17/49] PCI: Separate out save_resources/restore_resource

2015-07-14 Thread Yinghai Lu
Separate save_resources() and restore_resource() out into helpers so that
they can be reused by the alt_size support.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 49 ++---
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 1622ad2..2e3d00b 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -436,6 +436,29 @@ static bool pci_need_to_release(unsigned long mask, struct 
resource *res)
return false;   /* should not get here */
 }
 
+static bool save_resources(struct list_head *head,
+  struct list_head *save_head)
+{
+   struct pci_dev_resource *dev_res;
+
+   /* Save original start, end, flags etc at first */
+   list_for_each_entry(dev_res, head, list)
+   if (add_to_list(save_head, dev_res->dev, dev_res->res)) {
+   free_list(save_head);
+   return false;
+   }
+
+   return true;
+}
+
+static void restore_resource(struct pci_dev_resource *save_res,
+struct resource *res)
+{
+   res->start = save_res->start;
+   res->end = save_res->end;
+   res->flags = save_res->flags;
+}
+
 static void __assign_resources_sorted(struct list_head *head,
 struct list_head *realloc_head,
 struct list_head *fail_head)
@@ -473,13 +496,8 @@ static void __assign_resources_sorted(struct list_head 
*head,
if (!realloc_head || list_empty(realloc_head))
goto requested_and_reassign;
 
-   /* Save original start, end, flags etc at first */
-   list_for_each_entry(dev_res, head, list) {
-   if (add_to_list(&save_head, dev_res->dev, dev_res->res)) {
-   free_list(&save_head);
-   goto requested_and_reassign;
-   }
-   }
+   if (!save_resources(head, &save_head))
+   goto requested_and_reassign;
 
/* Update res in head list with add_size in realloc_head list */
list_for_each_entry(dev_res, head, list) {
@@ -548,12 +566,9 @@ static void __assign_resources_sorted(struct list_head 
*head,
}
}
/* Restore start/end/flags from saved list */
-   list_for_each_entry(save_res, &save_head, list) {
-   res = save_res->res;
-   res->start = save_res->start;
-   res->end = save_res->end;
-   res->flags = save_res->flags;
-   }
+   list_for_each_entry(save_res, &save_head, list)
+   restore_resource(save_res, save_res->res);
+
free_list(&save_head);
 
 requested_and_reassign:
@@ -1917,9 +1932,7 @@ again:
list_for_each_entry(fail_res, &fail_head, list) {
struct resource *res = fail_res->res;
 
-   res->start = fail_res->start;
-   res->end = fail_res->end;
-   res->flags = fail_res->flags;
+   restore_resource(fail_res, res);
if (fail_res->dev->subordinate)
res->flags = 0;
}
@@ -1983,9 +1996,7 @@ again:
list_for_each_entry(fail_res, &fail_head, list) {
struct resource *res = fail_res->res;
 
-   res->start = fail_res->start;
-   res->end = fail_res->end;
-   res->flags = fail_res->flags;
+   restore_resource(fail_res, res);
if (fail_res->dev->subordinate)
res->flags = 0;
}
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 05/49] PCI: Optimize bus align/size calculation for optional during sizing

2015-07-14 Thread Yinghai Lu
The current add_align always uses the max alignment, which makes
must+optional get allocated more than needed in some cases.

Now that we have the new calculate_mem_align(), we can use it for the
add_align calculation.

This requires a separate list for the must+optional align/size info.

With that we get a smaller add_align/size, so there is a better chance
that must+optional can be allocated.

The result for a bridge that has an Intel 4x10g card installed:

 pci :20:03.2: bridge window [mem 0x-0x000f 64bit pref]
to [bus 2a-31] calculate_mem for must
 ===BEGIN
 align/size:
   0080/0080
   0080/0080
   0080/0080
   0080/0080
   8000/8000
   8000/8000
   8000/8000
   8000/8000
 old min_align/min_size: 0040/0240
 min_align/min_size: 0040/0240
 ===END

 pci :20:03.2: bridge window [mem 0x-0x000f 64bit pref]
to [bus 2a-31] calculate_mem for add
 ===BEGIN
 align/size:
   0080/0080
   0080/0080
   0080/0080
   0080/0080
   0001/0020
   0001/0020
   0001/0020
   0001/0020
   8000/8000
   8000/8000
   8000/8000
   8000/8000
   4000/0008
   4000/0008
   4000/0008
   4000/0008
 old min_align/min_size: 0080/0300
 min_align/min_size: 0010/02b0
 ===END

so must align/size: 0x40/0x240, and
 new must+optional align/size: 0x10/0x2b0, and it is better
than old must+optional align/size: 0x80/0x300

Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431
Reported-by: TJ 
Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 82 ++---
 1 file changed, 51 insertions(+), 31 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index ecdf011..4c7f25f 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -901,7 +901,6 @@ static resource_size_t calculate_iosize(resource_size_t 
size,
 
 static resource_size_t calculate_memsize(resource_size_t size,
resource_size_t min_size,
-   resource_size_t size1,
resource_size_t old_size,
resource_size_t align)
 {
@@ -911,7 +910,7 @@ static resource_size_t calculate_memsize(resource_size_t 
size,
old_size = 0;
if (size < old_size)
size = old_size;
-   size = ALIGN(size + size1, align);
+   size = ALIGN(size, align);
return size;
 }
 
@@ -1174,44 +1173,45 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 struct list_head *realloc_head)
 {
struct pci_dev *dev;
-   resource_size_t min_align, align, size, size0, size1;
-   resource_size_t max_align = 0;
+   resource_size_t min_align = 0, min_add_align = 0;
+   resource_size_t max_align = 0, max_add_align = 0;
+   resource_size_t size = 0, size0 = 0, size1 = 0, sum_add_size = 0;
struct resource *b_res = find_free_bus_resource(bus,
mask | IORESOURCE_PREFETCH, type);
-   resource_size_t children_add_size = 0;
-   resource_size_t children_add_align = 0;
-   resource_size_t add_align = 0;
LIST_HEAD(align_test_list);
+   LIST_HEAD(align_test_add_list);
 
if (!b_res)
return -ENOSPC;
 
-   size = 0;
-
list_for_each_entry(dev, &bus->devices, bus_list) {
int i;
 
for (i = 0; i < PCI_NUM_RESOURCES; i++) {
struct resource *r = &dev->resource[i];
-   resource_size_t r_size;
+   resource_size_t r_size, align;
 
if (r->parent || ((r->flags & mask) != type &&
  (r->flags & mask) != type2 &&
  (r->flags & mask) != type3))
continue;
+
r_size = resource_size(r);
+   align = pci_resource_alignment(dev, r);
 #ifdef CONFIG_PCI_IOV
/* put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
i <= PCI_IOV_RESOURCE_END) {
-   add_align = max(pci_resource_alignment(dev, r), 
add_align);
+   add_to_align_test_list(&align_test_add_list,
+   align, r_size);
r->end = r->start - 1;
add_to_list(realloc_head, dev, r, r_size, 0/* 
don't care */);
-   

[PATCH v2 26/49] PCI: Move ISA ioport align out of calculate_iosize

2015-07-14 Thread Yinghai Lu
So we could unify calculate_iosize and calculate_memsize later.

When one bridge has several child devices, and every device
has several io port resources with resource size < 0x400,
we need to check the size and add extra size to make sure bit8/9
are zero.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 39 +++
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index de55e07..9d5e550 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1126,11 +1126,6 @@ static resource_size_t calculate_iosize(resource_size_t 
size,
size = min_size;
if (old_size == 1)
old_size = 0;
-   /* To be fixed in 2.5: we should have sort of HAVE_ISA
-  flag in the struct pci_bus. */
-#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
-   size = (size & 0xff) + ((size & ~0xffUL) << 2);
-#endif
size = ALIGN(size + size1, align);
if (size < old_size)
size = old_size;
@@ -1184,6 +1179,18 @@ static resource_size_t window_alignment(struct pci_bus 
*bus,
return max(align, arch_align);
 }
 
+static resource_size_t size_aligned_for_isa(resource_size_t size)
+{
+   /*
+* To be fixed in 2.5: we should have sort of HAVE_ISA
+*  flag in the struct pci_bus.
+*/
+#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
+   size = (size & 0xff) + ((size & ~0xffUL) << 2);
+#endif
+   return size;
+}
+
 /**
  * pbus_size_io() - size the io window of a given bus
  *
@@ -1201,11 +1208,10 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
 {
struct pci_dev *dev;
resource_size_t min_sum_size = 0;
-   resource_size_t sum_add_size;
struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO,
IORESOURCE_IO);
resource_size_t size = 0, size0 = 0, size1 = 0;
-   resource_size_t children_add_size = 0;
+   resource_size_t sum_add_size = 0, sum_add_size1 = 0;
resource_size_t min_align, align;
 
if (!b_res)
@@ -1222,7 +1228,7 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
 
for (i = 0; i < PCI_NUM_RESOURCES; i++) {
struct resource *r = &dev->resource[i];
-   unsigned long r_size;
+   unsigned long r_size, r_add_size;
 
if (r->parent || !(r->flags & IORESOURCE_IO))
continue;
@@ -1238,18 +1244,27 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
if (align > min_align)
min_align = align;
 
-   if (realloc_head)
-   children_add_size += 
get_res_add_size(realloc_head, r);
+   if (realloc_head) {
+   r_add_size = get_res_add_size(realloc_head, r);
+   r_add_size += r_size;
+   if (r_add_size < 0x400)
+   /* Might be re-aligned for ISA */
+   sum_add_size += r_add_size;
+   else
+   sum_add_size1 += r_add_size;
+   }
}
}
 
+   size = size_aligned_for_isa(size);
size0 = calculate_iosize(size, min_size, size1,
resource_size(b_res), min_align);
-   sum_add_size = children_add_size + size + size1;
+   sum_add_size = size_aligned_for_isa(sum_add_size);
+   sum_add_size += sum_add_size1;
if (sum_add_size < min_sum_size)
sum_add_size = min_sum_size;
size1 = !realloc_head ? size0 :
-   calculate_iosize(size, min_size, sum_add_size - size,
+   calculate_iosize(sum_add_size, min_size, 0,
resource_size(b_res), min_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 41/49] of/PCI: Add IORESOURCE_MEM_64 for 64-bit resource

2015-07-14 Thread Yinghai Lu
For device resource PREF bit setting under a bridge 64-bit pref resource,
we need to make sure to only set PREF for 64-bit resources, so set
IORESOURCE_MEM_64 for 64-bit resources during OF device resource flags
parsing.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=96261
Link: https://bugzilla.kernel.org/show_bug.cgi?id=96241
Signed-off-by: Yinghai Lu 
Cc: Grant Likely 
Cc: Rob Herring 
Cc: devicet...@vger.kernel.org
---
 drivers/of/address.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/of/address.c b/drivers/of/address.c
index 8bfda6a..073125f 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -128,9 +128,11 @@ static unsigned int of_bus_pci_get_flags(const __be32 
*addr)
flags |= IORESOURCE_IO;
break;
case 0x02: /* 32 bits */
-   case 0x03: /* 64 bits */
flags |= IORESOURCE_MEM;
break;
+   case 0x03: /* 64 bits */
+   flags |= IORESOURCE_MEM | IORESOURCE_MEM_64;
+   break;
}
if (w & 0x4000)
flags |= IORESOURCE_PREFETCH;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 11/49] PCI: Add debug printout during releasing partial assigned resources

2015-07-14 Thread Yinghai Lu
We try to assign must+optional at first, and we only accept the result if
all resources get allocated. Otherwise we release the assigned resources in
the list, and try to assign must and then expand to optional.

We have to do that to make sure any must resource has priority over any
optional addon.

When that happens, we only print out "assigned" info, which is confusing
as it looks like the same range is assigned to two peer resources at the
same time.

Add a printout for releasing so we have the whole picture in the debug messages.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 3abf249..6dff258 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -521,9 +521,17 @@ static void __assign_resources_sorted(struct list_head 
*head,
 
free_list(&local_fail_head);
/* Release assigned resource */
-   list_for_each_entry(dev_res, head, list)
-   if (dev_res->res->parent)
-   release_resource(dev_res->res);
+   list_for_each_entry(dev_res, head, list) {
+   struct resource *res = dev_res->res;
+
+   if (res->parent) {
+   dev_printk(KERN_DEBUG, &dev_res->dev->dev,
+  "BAR %d: released %pR\n",
+  (int)(res - &dev_res->dev->resource[0]),
+  res);
+   release_resource(res);
+   }
+   }
/* Restore start/end/flags from saved list */
list_for_each_entry(save_res, &save_head, list) {
struct resource *res = save_res->res;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 31/49] resources: Split out __allocate_resource()

2015-07-14 Thread Yinghai Lu
It does not take the lock, so we can use it in other functions that
already hold the resource lock.

-v2: according to Linus, using "bool lock" as parameter
 aka "conditionally take lock" is *wrong*.

Signed-off-by: Yinghai Lu 
Acked-by: Linus Torvalds 
---
 kernel/resource.c | 70 +++
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/kernel/resource.c b/kernel/resource.c
index 90552aa..830cc11 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -619,7 +619,7 @@ static int find_resource(struct resource *root, struct 
resource *new,
 }
 
 /**
- * reallocate_resource - allocate a slot in the resource tree given range & 
alignment.
+ * __reallocate_resource - allocate a slot in the resource tree given range & 
alignment.
  * The resource will be relocated if the new size cannot be reallocated in 
the
  * current location.
  *
@@ -628,7 +628,7 @@ static int find_resource(struct resource *root, struct 
resource *new,
  * @newsize: new size of the resource descriptor
  * @constraint: the size and alignment constraints to be met.
  */
-static int reallocate_resource(struct resource *root, struct resource *old,
+static int __reallocate_resource(struct resource *root, struct resource *old,
resource_size_t newsize,
struct resource_constraint  *constraint)
 {
@@ -636,8 +636,6 @@ static int reallocate_resource(struct resource *root, 
struct resource *old,
struct resource new = *old;
struct resource *conflict;
 
-   write_lock(&resource_lock);
-
if ((err = __find_resource(root, old, &new, newsize, constraint)))
goto out;
 
@@ -662,14 +660,13 @@ static int reallocate_resource(struct resource *root, 
struct resource *old,
BUG_ON(conflict);
}
 out:
-   write_unlock(&resource_lock);
return err;
 }
 
-
 /**
- * allocate_resource - allocate empty slot in the resource tree given range & 
alignment.
- * The resource will be reallocated with a new size if it was already 
allocated
+ * __allocate_resource - allocate empty slot in the resource tree given range 
& alignment.
+ * The resource will be reallocated with a new size if it was already
+ * allocated
  * @root: root resource descriptor
  * @new: resource descriptor desired by caller
  * @size: requested resource region size
@@ -678,15 +675,17 @@ out:
  * @align: alignment requested, in bytes
  * @alignf: alignment function, optional, called if not NULL
  * @alignf_data: arbitrary data to pass to the @alignf function
+ *
+ * Caller need to hold resource_lock if needed.
  */
-int allocate_resource(struct resource *root, struct resource *new,
- resource_size_t size, resource_size_t min,
- resource_size_t max, resource_size_t align,
- resource_size_t (*alignf)(void *,
-   const struct resource *,
-   resource_size_t,
-   resource_size_t),
- void *alignf_data)
+static int __allocate_resource(struct resource *root, struct resource *new,
+   resource_size_t size, resource_size_t min,
+   resource_size_t max, resource_size_t align,
+   resource_size_t (*alignf)(void *,
+ const struct resource *,
+ resource_size_t,
+ resource_size_t),
+   void *alignf_data)
 {
int err;
struct resource_constraint constraint;
@@ -700,20 +699,51 @@ int allocate_resource(struct resource *root, struct 
resource *new,
constraint.alignf = alignf;
constraint.alignf_data = alignf_data;
 
-   if ( new->parent ) {
+   if (new->parent) {
/* resource is already allocated, try reallocating with
   the new constraints */
-   return reallocate_resource(root, new, size, &constraint);
+   return __reallocate_resource(root, new, size, &constraint);
}
 
-   write_lock(&resource_lock);
err = find_resource(root, new, size, &constraint);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
-   write_unlock(&resource_lock);
+
return err;
 }
 
+/**
+ * allocate_resource - allocate empty slot in the resource tree given range & 
alignment.
+ * The resource will be reallocated with a new size if it was already
+ * allocated
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ * @size: requested resource region size
+ * @min: minimum boundary to allocate
+ *

[PATCH v2 39/49] sparc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in of parsing

2015-07-14 Thread Yinghai Lu
For device resource PREF bit setting under a bridge 64-bit pref resource,
we need to make sure to only set PREF for 64-bit resources, so set
IORESOURCE_MEM_64 for 64-bit resources during OF device resource flags
parsing.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=96261
Link: https://bugzilla.kernel.org/show_bug.cgi?id=96241
Signed-off-by: Yinghai Lu 
Cc: "David S. Miller" 
Cc: sparcli...@vger.kernel.org
---
 arch/sparc/kernel/of_device_32.c | 5 +++--
 arch/sparc/kernel/of_device_64.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/sparc/kernel/of_device_32.c b/arch/sparc/kernel/of_device_32.c
index 185aa96..3e9f273 100644
--- a/arch/sparc/kernel/of_device_32.c
+++ b/arch/sparc/kernel/of_device_32.c
@@ -83,11 +83,12 @@ static unsigned long of_bus_pci_get_flags(const u32 *addr, 
unsigned long flags)
case 0x01:
flags |= IORESOURCE_IO;
break;
-
case 0x02: /* 32 bits */
-   case 0x03: /* 64 bits */
flags |= IORESOURCE_MEM;
break;
+   case 0x03: /* 64 bits */
+   flags |= IORESOURCE_MEM | IORESOURCE_MEM_64;
+   break;
}
if (w & 0x4000)
flags |= IORESOURCE_PREFETCH;
diff --git a/arch/sparc/kernel/of_device_64.c b/arch/sparc/kernel/of_device_64.c
index 7bbdc26..defee61 100644
--- a/arch/sparc/kernel/of_device_64.c
+++ b/arch/sparc/kernel/of_device_64.c
@@ -146,11 +146,12 @@ static unsigned long of_bus_pci_get_flags(const u32 
*addr, unsigned long flags)
case 0x01:
flags |= IORESOURCE_IO;
break;
-
case 0x02: /* 32 bits */
-   case 0x03: /* 64 bits */
flags |= IORESOURCE_MEM;
break;
+   case 0x03: /* 64 bits */
+   flags |= IORESOURCE_MEM | IORESOURCE_MEM_64;
+   break;
}
if (w & 0x4000)
flags |= IORESOURCE_PREFETCH;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 32/49] resources: Make allocate_resource return just fit resource

2015-07-14 Thread Yinghai Lu
Find all suitable empty slots and pick the one that just fits, so we can save
the big slots for the ones that need them later, when we have several PCIe
switches and some bridges get assigned by the BIOS and we need to assign the
others in the kernel.

Signed-off-by: Yinghai Lu 
---
 kernel/resource.c | 81 ++-
 1 file changed, 68 insertions(+), 13 deletions(-)

diff --git a/kernel/resource.c b/kernel/resource.c
index 830cc11..c630ef1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -48,6 +48,7 @@ struct resource_constraint {
resource_size_t (*alignf)(void *, const struct resource *,
resource_size_t, resource_size_t);
void *alignf_data;
+   bool fit;
 };
 
 static DEFINE_RWLOCK(resource_lock);
@@ -554,12 +555,15 @@ static void resource_clip(struct resource *res, 
resource_size_t min,
  * alignment constraints
  */
 static int __find_resource(struct resource *root, struct resource *old,
-struct resource *new,
+struct resource *new, struct resource *avail,
 resource_size_t  size,
 struct resource_constraint *constraint)
 {
struct resource *this = root->child;
-   struct resource tmp = *new, avail, alloc;
+   struct resource tmp = *new, availx, alloc;
+
+   if (!avail || avail == new)
+   avail = &availx;
 
tmp.start = root->start;
/*
@@ -583,15 +587,16 @@ static int __find_resource(struct resource *root, struct 
resource *old,
arch_remove_reservations(&tmp);
 
/* Check for overflow after ALIGN() */
-   avail.start = ALIGN(tmp.start, constraint->align);
-   avail.end = tmp.end;
-   avail.flags = new->flags & ~IORESOURCE_UNSET;
-   if (avail.start >= tmp.start) {
-   alloc.flags = avail.flags;
-   alloc.start = 
constraint->alignf(constraint->alignf_data, &avail,
+   avail->start = ALIGN(tmp.start, constraint->align);
+   avail->end = tmp.end;
+   avail->flags = new->flags & ~IORESOURCE_UNSET;
+   if (avail->start >= tmp.start) {
+   alloc.flags = avail->flags;
+   alloc.start = constraint->alignf(
+   constraint->alignf_data, avail,
size, constraint->align);
alloc.end = alloc.start + size - 1;
-   if (resource_contains(&avail, &alloc)) {
+   if (resource_contains(avail, &alloc)) {
new->start = alloc.start;
new->end = alloc.end;
return 0;
@@ -608,6 +613,11 @@ next:  if (!this || this->end == root->end)
return -EBUSY;
 }
 
+struct good_resource {
+   struct list_head list;
+   struct resource avail;
+   struct resource new;
+};
 /*
  * Find empty slot in the resource tree given range and alignment.
  */
@@ -615,7 +625,49 @@ static int find_resource(struct resource *root, struct 
resource *new,
resource_size_t size,
struct resource_constraint  *constraint)
 {
-   return  __find_resource(root, NULL, new, size, constraint);
+   int ret = -1;
+   LIST_HEAD(head);
+   struct good_resource *good, *tmp;
+   resource_size_t avail_size = (resource_size_t)-1ULL;
+
+   if (!constraint->fit)
+   return __find_resource(root, NULL, new, NULL, size,
+   constraint);
+
+   /* find all suitable ones and add to the list */
+   for (;;) {
+   good = kzalloc(sizeof(*good), GFP_KERNEL);
+   if (!good)
+   break;
+
+   good->new.start = new->start;
+   good->new.end = new->end;
+   good->new.flags = new->flags;
+   ret = __find_resource(root, NULL, &good->new, &good->avail,
+   size, constraint);
+   if (ret || __request_resource(root, &good->avail)) {
+   ret = -EBUSY;
+   kfree(good);
+   break;
+   }
+
+   list_add(&good->list, &head);
+   }
+
+   /* pick up the smallest one and delete the list */
+   list_for_each_entry_safe(good, tmp, &head, list) {
+   if (resource_size(&good->avail) < avail_size) {
+   avail_size = resource_size(&good->avail);
+   new->start = good->new.start;
+   new->end = good->new.end;
+   ret = 0;
+  

[PATCH v2 34/49] PCI: Only treat non-pef mmio64 as pref if all bridges has MEM_64

2015-07-14 Thread Yinghai Lu
If any bridge up to the root only has 32-bit pref mmio, we don't need to
treat a device's non-pref mmio64 as pref mmio64.

We need to move the pci_bridge_check_ranges call earlier: a parent
bridge's pref mmio BAR may not have been allocated by the BIOS, so its
res flags are still 0, and we need to have them correctly set before we
check them for child device resources.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 3a1d659..8a8e5a7 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1038,6 +1038,18 @@ int pci_claim_bridge_resource(struct pci_dev *bridge, 
int i)
return -EINVAL;
 }
 
+static bool pci_up_path_over_pref_mem64(struct pci_bus *bus)
+{
+   if (pci_is_root_bus(bus))
+   return true;
+
+   if (bus->self && !(bus->self->resource[PCI_BRIDGE_RESOURCES + 2].flags &
+  IORESOURCE_MEM_64))
+   return false;
+
+   return pci_up_path_over_pref_mem64(bus->parent);
+}
+
 int pci_resource_pref_compatible(const struct pci_dev *dev,
 struct resource *res)
 {
@@ -1046,7 +1058,8 @@ int pci_resource_pref_compatible(const struct pci_dev 
*dev,
 
if ((res->flags & IORESOURCE_MEM) &&
(res->flags & IORESOURCE_MEM_64) &&
-   dev->on_all_pcie_path)
+   dev->on_all_pcie_path &&
+   pci_up_path_over_pref_mem64(dev->bus))
return res->flags | IORESOURCE_PREFETCH;
 
return res->flags;
@@ -1816,6 +1829,10 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
struct resource *b_res;
int ret;
 
+   if (!pci_is_root_bus(bus) &&
+   (bus->self->class >> 8) == PCI_CLASS_BRIDGE_PCI)
+   pci_bridge_check_ranges(bus);
+
list_for_each_entry(dev, &bus->devices, bus_list) {
struct pci_bus *b = dev->subordinate;
if (!b)
@@ -1843,7 +1860,6 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
break;
 
case PCI_CLASS_BRIDGE_PCI:
-   pci_bridge_check_ranges(bus);
if (bus->self->is_hotplug_bridge) {
min_io_size  = pci_hotplug_io_size;
min_mem_size = pci_hotplug_mem_size;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 36/49] PCI: Only treat non-pef mmio64 as pref if host-bridge has_mem64

2015-07-14 Thread Yinghai Lu
If the host bridge does not have mmio64 above 4G, we don't need to
treat a device's non-pref mmio64 as pref mmio64.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 8a8e5a7..37d5a48 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1041,7 +1041,7 @@ int pci_claim_bridge_resource(struct pci_dev *bridge, int 
i)
 static bool pci_up_path_over_pref_mem64(struct pci_bus *bus)
 {
if (pci_is_root_bus(bus))
-   return true;
+   return to_pci_host_bridge(bus->bridge)->has_mem64;
 
if (bus->self && !(bus->self->resource[PCI_BRIDGE_RESOURCES + 2].flags &
   IORESOURCE_MEM_64))
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 49/49] PCI: Don't set flags to 0 when assign resource fail

2015-07-14 Thread Yinghai Lu
Make the flags take IORESOURCE_UNSET | IORESOURCE_DISABLED instead.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/bus.c   |  2 +-
 drivers/pci/setup-bus.c | 45 +++--
 drivers/pci/setup-res.c |  3 ++-
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index b043bdf..b68f1cd 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -140,7 +140,7 @@ static int pci_bus_alloc_from_region(struct pci_bus *bus, 
struct resource *res,
type_mask |= IORESOURCE_TYPE_BITS;
 
pci_bus_for_each_resource(bus, r, i) {
-   if (!r)
+   if (!r || resource_disabled(r))
continue;
 
/* type_mask must match */
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 7734be6..a7fbdcc 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -289,13 +289,6 @@ static void __dev_check_resources(struct pci_dev *dev,
pdev_check_resources(dev, realloc_head, head);
 }
 
-static inline void reset_resource(struct resource *res)
-{
-   res->start = 0;
-   res->end = 0;
-   res->flags = 0;
-}
-
 static void __sort_resources(struct list_head *head)
 {
struct pci_dev_resource *res1, *tmp_res, *res2;
@@ -398,7 +391,7 @@ static void reassign_resources_sorted(struct list_head 
*realloc_head,
res->start = align;
res->end = res->start + add_size - 1;
if (pci_assign_resource(add_res->dev, idx))
-   reset_resource(res);
+   res->flags |= IORESOURCE_DISABLED;
} else {
/* could just assigned with alt, add difference ? */
resource_size_t must_size;
@@ -451,7 +444,7 @@ static void assign_requested_resources_sorted(struct 
list_head *head,
pci_assign_resource(dev_res->dev, idx)) {
if (fail_head)
add_to_list(fail_head, dev_res->dev, res);
-   reset_resource(res);
+   res->flags |= IORESOURCE_DISABLED;
}
}
 }
@@ -737,7 +730,7 @@ static void __assign_resources_alt_sorted(struct list_head 
*head,
 
if (!res_to_dev_res(local_fail_head, res))
add_to_list(local_fail_head, fail_res->dev, res);
-   reset_resource(res);
+   res->flags |= IORESOURCE_DISABLED;
}
free_list(&local_alt_fail_head);
 }
@@ -903,7 +896,7 @@ static void pci_setup_bridge_io(struct pci_dev *bridge)
/* Set up the top and bottom of the PCI I/O segment for this bus. */
res = &bridge->resource[PCI_BRIDGE_RESOURCES + 0];
pcibios_resource_to_bus(bridge->bus, ®ion, res);
-   if (res->flags & IORESOURCE_IO) {
+   if ((res->flags & IORESOURCE_IO) && !(res->flags & IORESOURCE_UNSET)) {
pci_read_config_word(bridge, PCI_IO_BASE, &l);
io_base_lo = (region.start >> 8) & io_mask;
io_limit_lo = (region.end >> 8) & io_mask;
@@ -933,7 +926,8 @@ static void pci_setup_bridge_mmio(struct pci_dev *bridge)
/* Set up the top and bottom of the PCI Memory segment for this bus. */
res = &bridge->resource[PCI_BRIDGE_RESOURCES + 1];
pcibios_resource_to_bus(bridge->bus, ®ion, res);
-   if (res->flags & IORESOURCE_MEM) {
+   if ((res->flags & IORESOURCE_MEM) &&
+   !(res->flags & IORESOURCE_UNSET)) {
l = (region.start >> 16) & 0xfff0;
l |= region.end & 0xfff0;
dev_info(&bridge->dev, "  bridge window %pR\n", res);
@@ -958,7 +952,8 @@ static void pci_setup_bridge_mmio_pref(struct pci_dev 
*bridge)
bu = lu = 0;
res = &bridge->resource[PCI_BRIDGE_RESOURCES + 2];
pcibios_resource_to_bus(bridge->bus, ®ion, res);
-   if (res->flags & IORESOURCE_PREFETCH) {
+   if ((res->flags & IORESOURCE_PREFETCH) &&
+   !(res->flags & IORESOURCE_UNSET)) {
l = (region.start >> 16) & 0xfff0;
l |= region.end & 0xfff0;
if (res->flags & IORESOURCE_MEM_64) {
@@ -1077,6 +1072,7 @@ static void pci_bridge_check_ranges(struct pci_bus *bus)
 
b_res = &bridge->resource[PCI_BRIDGE_RESOURCES];
b_res[1].flags |= IORESOURCE_MEM;
+   b_res[1].flags &= ~IORESOURCE_DISABLED;
 
pci_read_config_word(bridge, PCI_IO_BASE, &io);
if (!io) {
@@ -1084,8 +1080,10 @@ static void pci_bridge_check_ranges(struct pci_bus *bus)
pci_read_config_word(bridge, PCI_IO_BASE, &io);
   

[PATCH v2 45/49] PCI: Don't release fixed resource for realloc

2015-07-14 Thread Yinghai Lu
We should not release a bridge resource if there are fixed resources
under it, otherwise the children's firmware would stop working.

Reported-by: Paul Johnson 
Suggested-by: Bjorn Helgaas 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=92351
Signed-off-by: Yinghai Lu 
Cc: sta...@vger.kernel.org
---
 drivers/pci/setup-bus.c |  6 --
 include/linux/ioport.h  |  2 +-
 kernel/resource.c   | 28 ++--
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index dc9ba41..9d5423c 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -2059,14 +2059,16 @@ static void pci_bridge_release_resources(struct pci_bus 
*bus,
 
r = &b_res[idx];
 
-   if (!r->parent)
+   if (!r->parent || r->flags & IORESOURCE_PCI_FIXED)
return;
 
/*
 * if there are children under that, we should release them
 *  all
 */
-   release_child_resources(r);
+   if (!release_child_resources(r))
+   return;
+
if (!release_resource(r)) {
type = old_flags = r->flags & type_mask;
dev_printk(KERN_DEBUG, &dev->dev, "resource %d %pR released\n",
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 388e3ae..27dbb18 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -141,7 +141,7 @@ extern struct resource iomem_resource;
 extern struct resource *request_resource_conflict(struct resource *root, 
struct resource *new);
 extern int request_resource(struct resource *root, struct resource *new);
 extern int release_resource(struct resource *new);
-void release_child_resources(struct resource *new);
+bool release_child_resources(struct resource *new);
 extern void reserve_region_with_split(struct resource *root,
 resource_size_t start, resource_size_t end,
 const char *name);
diff --git a/kernel/resource.c b/kernel/resource.c
index c630ef1..0285f11 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -276,11 +276,35 @@ static void __release_child_resources(struct resource *r)
}
 }
 
-void release_child_resources(struct resource *r)
+static bool __has_fixed_child_resources(struct resource *r)
 {
+   struct resource *p;
+
+   p = r->child;
+   while (p) {
+   if (p->flags & IORESOURCE_PCI_FIXED)
+   return true;
+
+   if (__has_fixed_child_resources(p))
+   return true;
+
+   p = p->sibling;
+   }
+
+   return false;
+}
+
+bool release_child_resources(struct resource *r)
+{
+   bool fixed;
+
write_lock(&resource_lock);
-   __release_child_resources(r);
+   fixed = __has_fixed_child_resources(r);
+   if (!fixed)
+   __release_child_resources(r);
write_unlock(&resource_lock);
+
+   return !fixed;
 }
 
 /**
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 43/49] PCI: Get new realloc size for bridge for last try

2015-07-14 Thread Yinghai Lu
The current realloc path will not shrink a bridge resource, because
pbus_size_mem() checks against the old size.

That causes a problem: when "must+optional" resource allocation fails,
the cached bridge resource size prevents the "must" resources from being
allocated within a smaller resource.

Clear the old resource size for the last try, or for the third and later tries.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431
Tested-by: TJ 
Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index da0a259..7ffb113 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -2327,8 +2327,15 @@ again:
struct resource *res = fail_res->res;
 
restore_resource(fail_res, res);
-   if (fail_res->dev->subordinate)
+   if (fail_res->dev->subordinate) {
res->flags = 0;
+   /* last or third times and later */
+   if (tried_times + 1 == pci_try_num ||
+   tried_times + 1 > 2) {
+   res->start = 0;
+   res->end = res->start - 1;
+   }
+   }
}
free_list(&fail_head);
 
@@ -2400,8 +2407,12 @@ again:
struct resource *res = fail_res->res;
 
restore_resource(fail_res, res);
-   if (fail_res->dev->subordinate)
+   if (fail_res->dev->subordinate) {
res->flags = 0;
+   /* last time */
+   res->start = 0;
+   res->end = res->start - 1;
+   }
}
free_list(&fail_head);
 
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 42/49] PCI: Treat optional as must in first try for bridge rescan

2015-07-14 Thread Yinghai Lu
When rescanning a bridge/bus whose children were removed before, we should
treat optional as must, just like for the root bus at boot time in 19aa7ee432ce
(PCI: make re-allocation try harder by reassigning ranges higher in
the heirarchy).

The reason: the "allocate must and expand to optional" path does not
put failed resources on the fail list, so we would lose the must info
before the next try.

So we are using following way:
1. First and following tries before the last try:
   We don't keep the realloc list, so treat every optional as must.
   Allocate for must+optional and put failures in the fail list;
   then the size info (including must and optional separately) will be kept
   for the next try.
2. last try:
   a: try to allocate must+optional to see if all get allocated.
   b: try to allocate must then expand to optional.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index f5b07d8..da0a259 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -2350,25 +2350,34 @@ void __init pci_assign_unassigned_resources(void)
 void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge)
 {
struct pci_bus *parent = bridge->subordinate;
-   LIST_HEAD(add_list); /* list of resources that
+   LIST_HEAD(realloc_head); /* list of resources that
want additional resources */
+   struct list_head *add_list = NULL;
int tried_times = 0;
LIST_HEAD(fail_head);
struct pci_dev_resource *fail_res;
int retval;
unsigned long type_mask = IORESOURCE_IO | IORESOURCE_MEM |
  IORESOURCE_PREFETCH | IORESOURCE_MEM_64;
+   int pci_try_num = 2;
 
 again:
-   __pci_bus_size_bridges(parent, &add_list);
-   __pci_bridge_assign_resources(bridge, &add_list, &fail_head);
-   __pci_bus_check_realloc(&add_list);
+   /*
+* last try will use add_list, otherwise will try good to have as
+* must have, so can realloc parent bridge resource
+*/
+   if (tried_times + 1 == pci_try_num)
+   add_list = &realloc_head;
+   __pci_bus_size_bridges(parent, add_list);
+   __pci_bridge_assign_resources(bridge, add_list, &fail_head);
+   if (add_list)
+   __pci_bus_check_realloc(add_list);
tried_times++;
 
if (list_empty(&fail_head))
goto enable_all;
 
-   if (tried_times >= 2) {
+   if (tried_times >= pci_try_num) {
/* still fail, don't need to try more */
free_list(&fail_head);
goto enable_all;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 29/49] PCI: Unify skip_ioresource_align()

2015-07-14 Thread Yinghai Lu
There is a powerpc generic version and an x86 local version.

Move the powerpc version to setup-bus.c, and kill x86 local version.

Also kill dummy version in microblaze.

Cc: Michal Simek 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Arnd Bergmann 
Cc: linuxppc-...@lists.ozlabs.org
Cc: linux-a...@vger.kernel.org
Signed-off-by: Yinghai Lu 
---
 arch/microblaze/pci/pci-common.c |  8 
 arch/powerpc/kernel/pci-common.c | 11 +--
 arch/x86/include/asm/pci_x86.h   |  1 -
 arch/x86/pci/common.c|  4 ++--
 arch/x86/pci/i386.c  | 12 ++--
 drivers/pci/setup-bus.c  |  9 +
 include/asm-generic/pci-bridge.h |  2 ++
 7 files changed, 16 insertions(+), 31 deletions(-)

diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index ae838ed..09b1af6 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -878,11 +878,6 @@ void pcibios_fixup_bus(struct pci_bus *bus)
 }
 EXPORT_SYMBOL(pcibios_fixup_bus);
 
-static int skip_isa_ioresource_align(struct pci_dev *dev)
-{
-   return 0;
-}
-
 /*
  * We need to avoid collisions with `mirrored' VGA ports
  * and other strange ISA hardware, so we always want the
@@ -899,12 +894,9 @@ static int skip_isa_ioresource_align(struct pci_dev *dev)
 resource_size_t pcibios_align_resource(void *data, const struct resource *res,
resource_size_t size, resource_size_t align)
 {
-   struct pci_dev *dev = data;
resource_size_t start = res->start;
 
if (res->flags & IORESOURCE_IO) {
-   if (skip_isa_ioresource_align(dev))
-   return start;
if (start & 0x300)
start = (start + 0x3ff) & ~0x3ff;
}
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index b9de34d..2d8d654 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1064,15 +1064,6 @@ void pci_fixup_cardbus(struct pci_bus *bus)
pcibios_setup_bus_devices(bus);
 }
 
-
-static int skip_isa_ioresource_align(struct pci_dev *dev)
-{
-   if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) &&
-   !(dev->bus->bridge_ctl & PCI_BRIDGE_CTL_ISA))
-   return 1;
-   return 0;
-}
-
 /*
  * We need to avoid collisions with `mirrored' VGA ports
  * and other strange ISA hardware, so we always want the
@@ -1093,7 +1084,7 @@ resource_size_t pcibios_align_resource(void *data, const 
struct resource *res,
resource_size_t start = res->start;
 
if (res->flags & IORESOURCE_IO) {
-   if (skip_isa_ioresource_align(dev))
+   if (skip_isa_ioresource_align(dev->bus))
return start;
if (start & 0x300)
start = (start + 0x3ff) & ~0x3ff;
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 164e3f8..ddac225 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -28,7 +28,6 @@ do {  \
 #define PCI_ASSIGN_ROMS0x1000
 #define PCI_BIOS_IRQ_SCAN  0x2000
 #define PCI_ASSIGN_ALL_BUSSES  0x4000
-#define PCI_CAN_SKIP_ISA_ALIGN 0x8000
 #define PCI_USE__CRS   0x1
 #define PCI_CHECK_ENABLE_AMD_MMCONF0x2
 #define PCI_HAS_IO_ECS 0x4
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 8fd6f44..e8df922 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -83,7 +83,7 @@ DEFINE_RAW_SPINLOCK(pci_config_lock);
 
 static int __init can_skip_ioresource_align(const struct dmi_system_id *d)
 {
-   pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
+   pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", 
d->ident);
return 0;
 }
@@ -619,7 +619,7 @@ char *__init pcibios_setup(char *str)
pci_routeirq = 1;
return NULL;
} else if (!strcmp(str, "skip_isa_align")) {
-   pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
+   pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
return NULL;
} else if (!strcmp(str, "noioapicquirk")) {
noioapicquirk = 1;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 0a9f2ca..3f17726 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -128,15 +129,6 @@ static void __init pcibios_fw_addr_list_del(void)
pcibios_fw_addr_done = true;
 }
 
-static int
-skip_isa_ioresource_align(struct pci_dev *dev) {
-
-   if ((pci_probe & PCI_CAN_SKIP_ISA_ALIGN) &&
-   !(dev->bus->bridge_ctl & PCI_BRIDGE_CTL_ISA))
-   return 1;
-   return 0;
-}
-
 /*
  * We need to avoid collisions with `mir

[PATCH v2 40/49] powerpc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in of parsing

2015-07-14 Thread Yinghai Lu
When setting the PREF bit on a device resource under a bridge 64-bit pref
resource, we need to make sure PREF is set only for 64-bit resources, so set
IORESOURCE_MEM_64 for 64-bit resources during OF (Open Firmware) device
resource flags parsing.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=96261
Link: https://bugzilla.kernel.org/show_bug.cgi?id=96241
Signed-off-by: Yinghai Lu 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Gavin Shan 
Cc: Yijing Wang 
Cc: Anton Blanchard 
Cc: linuxppc-...@lists.ozlabs.org
---
 arch/powerpc/kernel/pci_of_scan.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/pci_of_scan.c 
b/arch/powerpc/kernel/pci_of_scan.c
index 42e02a2..f31bfd0 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -44,8 +44,10 @@ static unsigned int pci_parse_of_flags(u32 addr0, int bridge)
 
if (addr0 & 0x0200) {
flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY;
-   flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64;
flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M;
+   if (addr0 & 0x0100)
+   flags |= IORESOURCE_MEM_64
+| PCI_BASE_ADDRESS_MEM_TYPE_64;
if (addr0 & 0x4000)
flags |= IORESOURCE_PREFETCH
 | PCI_BASE_ADDRESS_MEM_PREFETCH;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 09/49] PCI: Rename pdev_sort_resources to pdev_check_resources

2015-07-14 Thread Yinghai Lu
We don't do sorting in those functions anymore, so change "sort" to "check"
instead.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 6642a60..292f2a5 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -206,8 +206,8 @@ static resource_size_t __pci_resource_alignment(
return r_align;
 }
 
-/* Sort resources by alignment */
-static void pdev_sort_resources(struct pci_dev *dev,
+/* check resources and save to the list */
+static void pdev_check_resources(struct pci_dev *dev,
 struct list_head *realloc_head,
 struct list_head *head)
 {
@@ -243,7 +243,7 @@ static void pdev_sort_resources(struct pci_dev *dev,
}
 }
 
-static void __dev_sort_resources(struct pci_dev *dev,
+static void __dev_check_resources(struct pci_dev *dev,
 struct list_head *realloc_head,
 struct list_head *head)
 {
@@ -261,7 +261,7 @@ static void __dev_sort_resources(struct pci_dev *dev,
return;
}
 
-   pdev_sort_resources(dev, realloc_head, head);
+   pdev_check_resources(dev, realloc_head, head);
 }
 
 static inline void reset_resource(struct resource *res)
@@ -561,7 +561,7 @@ static void pdev_assign_resources_sorted(struct pci_dev 
*dev,
 {
LIST_HEAD(head);
 
-   __dev_sort_resources(dev, add_head, &head);
+   __dev_check_resources(dev, add_head, &head);
__assign_resources_sorted(&head, add_head, fail_head);
 
 }
@@ -574,7 +574,7 @@ static void pbus_assign_resources_sorted(const struct 
pci_bus *bus,
LIST_HEAD(head);
 
list_for_each_entry(dev, &bus->devices, bus_list)
-   __dev_sort_resources(dev, realloc_head, &head);
+   __dev_check_resources(dev, realloc_head, &head);
 
__assign_resources_sorted(&head, realloc_head, fail_head);
 }
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 33/49] PCI: Check pref compatible bit for mem64 resource of pcie device

2015-07-14 Thread Yinghai Lu
We still get "no compatible bridge window" warning on sparc T5-8
after we add support for 64bit resource parsing for root bus.

 PCI: scan_bus[/pci@300/pci@1/pci@0/pci@6] bus no 8
 PCI: Claiming :00:01.0: Resource 15: 8001..8004afff 
[220c]
 PCI: Claiming :01:00.0: Resource 15: 8001..8004afff 
[220c]
 PCI: Claiming :02:04.0: Resource 15: 8001..80012fff 
[220c]
 PCI: Claiming :03:00.0: Resource 15: 8001..80012fff 
[220c]
 PCI: Claiming :04:06.0: Resource 14: 8001..80010fff 
[220c]
 PCI: Claiming :05:00.0: Resource 0: 8001..80011fff 
[204]
 pci :05:00.0: can't claim BAR 0 [mem 0x8001-0x80011fff]: no 
compatible bridge window

All the bridges' 64-bit resources have the pref bit set, but the device
resource does not, so we cannot find a parent for the device resource,
as we cannot put non-pref mem under pref mem.

According to the PCIe spec errata
https://www.pcisig.com/specifications/pciexpress/base2/PCIe_Base_r2.1_Errata_08Jun10.pdf
page 13, in some cases it is OK to mark some as pref.

Mark if the entire path from the host to the adapter is over PCI Express.
Then set pref compatible bit for claim/sizing/assign for 64bit mem resource
on that pcie device.

Fixes: commit d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream 
windows")
Link: 
http://lkml.kernel.org/r/cae9fiqu1gjy1lyrxs+ma5lcteee4xmtjrg0axj9k_tsu+m9...@mail.gmail.com
Reported-by: David Ahern 
Tested-by: David Ahern 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431
Tested-by: TJ 
Signed-off-by: Yinghai Lu 
Cc:  #3.19
---
 drivers/pci/pci.c   |  3 ++-
 drivers/pci/pci.h   |  2 ++
 drivers/pci/probe.c | 33 +
 drivers/pci/setup-bus.c | 21 ++---
 drivers/pci/setup-res.c |  4 
 include/linux/pci.h |  1 +
 6 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 0008c95..ff1192a 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -414,6 +414,7 @@ EXPORT_SYMBOL_GPL(pci_find_ht_capability);
 struct resource *pci_find_parent_resource(const struct pci_dev *dev,
  struct resource *res)
 {
+   int flags = pci_resource_pref_compatible(dev, res);
const struct pci_bus *bus = dev->bus;
struct resource *r;
int i;
@@ -428,7 +429,7 @@ struct resource *pci_find_parent_resource(const struct 
pci_dev *dev,
 * not, the allocator made a mistake.
 */
if (r->flags & IORESOURCE_PREFETCH &&
-   !(res->flags & IORESOURCE_PREFETCH))
+   !(flags & IORESOURCE_PREFETCH))
return NULL;
 
/*
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 2b83977..1804d44 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -336,4 +336,6 @@ static inline int pci_dev_specific_reset(struct pci_dev 
*dev, int probe)
 
 struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus);
 
+int pci_resource_pref_compatible(const struct pci_dev *dev,
+struct resource *res);
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index cefd636..010d8d9 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1544,6 +1544,36 @@ static void pci_init_capabilities(struct pci_dev *dev)
pci_enable_acs(dev);
 }
 
+static bool pci_up_path_over_pcie(struct pci_bus *bus)
+{
+   if (pci_is_root_bus(bus))
+   return true;
+
+   if (bus->self && !pci_is_pcie(bus->self))
+   return false;
+
+   return pci_up_path_over_pcie(bus->parent);
+}
+
+/*
+ * According to
+ * 
https://www.pcisig.com/specifications/pciexpress/base2/PCIe_Base_r2.1_Errata_08Jun10.pdf
+ * page 13, system firmware could put some 64bit non-pref under 64bit pref,
+ * on some cases.
+ * Let's mark if entire path from the host to the adapter is over PCI
+ * Express. later will use that compute pref compaitable bit.
+ */
+static void pci_set_on_all_pcie_path(struct pci_dev *dev)
+{
+   if (!pci_is_pcie(dev))
+   return;
+
+   if (!pci_up_path_over_pcie(dev->bus))
+   return;
+
+   dev->on_all_pcie_path = 1;
+}
+
 void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
 {
int ret;
@@ -1574,6 +1604,9 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus 
*bus)
/* Initialize various capabilities */
pci_init_capabilities(dev);
 
+   /* After pcie_cap is assigned */
+   pci_set_on_all_pcie_path(dev);
+
/*
 * Add the device to our list of discovered devices
 * and the bus list for fixup functions, etc.
diff --git a/dri

[PATCH v2 44/49] PCI: Don't release sibiling bridge resources during hotplug

2015-07-14 Thread Yinghai Lu
On the hotplug path, we cannot touch sibling bridges that are
outside of the slot.

That could happen when BIOS does not assign some bridge BARs and
later can not assign resource to them in first try.

Check whether the failed device is the parent bridge itself; if so, use its
subordinate bus instead of its parent bus.

Reported-by: Andreas Noever 
Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 7ffb113..dc9ba41 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -2397,10 +2397,16 @@ again:
 * Try to release leaf bridge's resources that doesn't fit resource of
 * child device under that bridge
 */
-   list_for_each_entry(fail_res, &fail_head, list)
-   pci_bus_release_bridge_resources(fail_res->dev->bus,
+   list_for_each_entry(fail_res, &fail_head, list) {
+   struct pci_bus *bus = fail_res->dev->bus;
+
+   if (fail_res->dev == bridge)
+   bus = bridge->subordinate;
+
+   pci_bus_release_bridge_resources(bus,
 fail_res->flags & type_mask,
 whole_subtree);
+   }
 
/* restore size and flags */
list_for_each_entry(fail_res, &fail_head, list) {
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 47/49] PCI, x86: Add pci=assign_pref_bars to re-allocate pref bars

2015-07-14 Thread Yinghai Lu
So could reallocate pref mmio64 above 4G later.

Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/pci_x86.h |  1 +
 arch/x86/pci/common.c  |  3 +++
 arch/x86/pci/i386.c| 56 ++
 3 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index ddac225..7b634b8 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -34,6 +34,7 @@ do {  \
 #define PCI_NOASSIGN_ROMS  0x8
 #define PCI_ROOT_NO_CRS0x10
 #define PCI_NOASSIGN_BARS  0x20
+#define PCI_ASSIGN_PREF_BARS   0x40
 
 extern unsigned int pci_probe;
 extern unsigned long pirq_table_addr;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index e8df922..dcc7c48 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -606,6 +606,9 @@ char *__init pcibios_setup(char *str)
} else if (!strcmp(str, "assign-busses")) {
pci_probe |= PCI_ASSIGN_ALL_BUSSES;
return NULL;
+   } else if (!strcmp(str, "assign_pref_bars")) {
+   pci_probe |= PCI_ASSIGN_PREF_BARS;
+   return NULL;
} else if (!strcmp(str, "use_crs")) {
pci_probe |= PCI_USE__CRS;
return NULL;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 3f17726..0b74efe 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -208,16 +208,25 @@ static void pcibios_allocate_bridge_resources(struct 
pci_dev *dev)
continue;
if (r->parent)  /* Already allocated */
continue;
-   if (!r->start || pci_claim_bridge_resource(dev, idx) < 0) {
-   /*
-* Something is wrong with the region.
-* Invalidate the resource to prevent
-* child resource allocations in this
-* range.
-*/
-   r->start = r->end = 0;
-   r->flags = 0;
-   }
+
+   if ((r->flags & IORESOURCE_PREFETCH) &&
+   (pci_probe & PCI_ASSIGN_PREF_BARS))
+   goto clear;
+
+   if (!r->start)
+   goto clear;
+
+   if (pci_claim_bridge_resource(dev, idx) == 0)
+   continue;
+
+clear:
+   /*
+* Something is wrong with the region.
+* Invalidate the resource to prevent
+* child resource allocations in this range.
+*/
+   r->start = r->end = 0;
+   r->flags = 0;
}
 }
 
@@ -263,21 +272,26 @@ static void pcibios_allocate_dev_resources(struct pci_dev 
*dev, int pass)
else
disabled = !(command & PCI_COMMAND_MEMORY);
if (pass == disabled) {
+   if ((r->flags & IORESOURCE_PREFETCH) &&
+   (pci_probe & PCI_ASSIGN_PREF_BARS))
+   goto clear;
+
dev_dbg(&dev->dev,
"BAR %d: reserving %pr (d=%d, p=%d)\n",
idx, r, disabled, pass);
-   if (pci_claim_resource(dev, idx) < 0) {
-   if (r->flags & IORESOURCE_PCI_FIXED) {
-   dev_info(&dev->dev, "BAR %d %pR 
is immovable\n",
-idx, r);
-   } else {
-   /* We'll assign a new address 
later */
-   pcibios_save_fw_addr(dev,
-   idx, r->start);
-   r->end -= r->start;
-   r->start = 0;
-   }
+   if (pci_claim_resource(dev, idx) == 0)
+   continue;
+   if (r->flags & IORESOURCE_PCI_FIXED) {
+   dev_info(&dev->dev, "BAR %d %pR is 
immovable\n",
+idx, r);
+   continue;
}
+
+clear:
+   /* We'll assign a new address later */
+   pcibios_save_fw_addr(dev, idx, r->start);
+  

[PATCH v2 48/49] PCI: Introduce resource_disabled()

2015-07-14 Thread Yinghai Lu
so we can cover !flags and IORESOURCE_DISABLED both.

Cc: linux-al...@vger.kernel.org
Cc: linux-i...@vger.kernel.org
Cc: linux-am33-l...@redhat.com
Cc: linuxppc-...@lists.ozlabs.org
Cc: linux-s...@vger.kernel.org
Cc: sparcli...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-xte...@linux-xtensa.org
Cc: io...@lists.linux-foundation.org
Cc: linux...@vger.kernel.org
Signed-off-by: Yinghai Lu 
---
 arch/alpha/kernel/pci.c   |  2 +-
 arch/ia64/pci/pci.c   |  4 ++--
 arch/microblaze/pci/pci-common.c  | 15 ---
 arch/mn10300/unit-asb2305/pci-asb2305.c   |  4 ++--
 arch/mn10300/unit-asb2305/pci.c   |  4 ++--
 arch/powerpc/kernel/pci-common.c  | 16 +---
 arch/powerpc/platforms/powernv/pci-ioda.c | 12 ++--
 arch/s390/pci/pci.c   |  2 +-
 arch/sparc/kernel/pci.c   |  2 +-
 arch/x86/pci/i386.c   |  4 ++--
 arch/xtensa/kernel/pci.c  |  4 ++--
 drivers/iommu/intel-iommu.c   |  3 ++-
 drivers/pci/host/pcie-rcar.c  |  2 +-
 drivers/pci/iov.c |  2 +-
 drivers/pci/probe.c   |  2 +-
 drivers/pci/quirks.c  |  2 +-
 drivers/pci/rom.c |  2 +-
 drivers/pci/setup-bus.c   |  8 
 drivers/pci/setup-res.c   |  2 +-
 include/linux/ioport.h|  4 
 20 files changed, 52 insertions(+), 44 deletions(-)

diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c
index 82f738e..91a7153 100644
--- a/arch/alpha/kernel/pci.c
+++ b/arch/alpha/kernel/pci.c
@@ -282,7 +282,7 @@ pcibios_claim_one_bus(struct pci_bus *b)
for (i = 0; i < PCI_NUM_RESOURCES; i++) {
struct resource *r = &dev->resource[i];
 
-   if (r->parent || !r->start || !r->flags)
+   if (r->parent || !r->start || resource_disabled(r))
continue;
if (pci_has_flag(PCI_PROBE_ONLY) ||
(r->flags & IORESOURCE_PCI_FIXED)) {
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 7cc3be9..cc293ea 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -501,7 +501,7 @@ void pcibios_fixup_device_resources(struct pci_dev *dev)
for (idx = 0; idx < PCI_BRIDGE_RESOURCES; idx++) {
struct resource *r = &dev->resource[idx];
 
-   if (!r->flags || r->parent || !r->start)
+   if (resource_disabled(r) || r->parent || !r->start)
continue;
 
pci_claim_resource(dev, idx);
@@ -519,7 +519,7 @@ static void pcibios_fixup_bridge_resources(struct pci_dev 
*dev)
for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) {
struct resource *r = &dev->resource[idx];
 
-   if (!r->flags || r->parent || !r->start)
+   if (resource_disabled(r) || r->parent || !r->start)
continue;
 
pci_claim_bridge_resource(dev, idx);
diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index 09b1af6..c123d3c 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -705,7 +705,7 @@ static void pcibios_fixup_resources(struct pci_dev *dev)
}
for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
struct resource *res = dev->resource + i;
-   if (!res->flags)
+   if (resource_disabled(res))
continue;
if (res->start == 0) {
pr_debug("PCI:%s Resource %d %016llx-%016llx [%x]",
@@ -806,7 +806,7 @@ static void pcibios_fixup_bridge(struct pci_bus *bus)
pci_bus_for_each_resource(bus, res, i) {
if (!res)
continue;
-   if (!res->flags)
+   if (resource_disabled(res))
continue;
if (i >= 3 && bus->self->transparent)
continue;
@@ -985,7 +985,7 @@ static void pcibios_allocate_bus_resources(struct pci_bus 
*bus)
 pci_domain_nr(bus), bus->number);
 
pci_bus_for_each_resource(bus, res, i) {
-   if (!res || !res->flags
+   if (!res || resource_disabled(res)
|| res->start > res->end || res->parent)
continue;
if (bus->parent == NULL)
@@ -1087,7 +1087,8 @@ static void __init pcibios_allocate_resources(int pass)
r = &dev->resource[idx];
if (r->parent)  /* Already allocated */
continue;
-   i

[PATCH v2 21/49] PCI: Move saved required resource list out of must+optional assigning

2015-07-14 Thread Yinghai Lu
We will need to share that saved list for alt_size support.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 64ef516..1c0b4c5 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -456,6 +456,9 @@ static bool __has_addon(struct list_head *head,
int add_count = 0;
struct pci_dev_resource *dev_res, *tmp_res;
 
+   if (!realloc_head)
+   return false;
+
/* check if we have add really */
list_for_each_entry(dev_res, head, list) {
tmp_res = res_to_dev_res(realloc_head, dev_res->res);
@@ -492,9 +495,9 @@ static void restore_resource(struct pci_dev_resource 
*save_res,
 }
 
 static bool __assign_resources_must_add_sorted(struct list_head *head,
+struct list_head *save_head,
 struct list_head *realloc_head)
 {
-   LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
struct pci_dev_resource *dev_res, *tmp_res;
@@ -502,12 +505,6 @@ static bool __assign_resources_must_add_sorted(struct 
list_head *head,
resource_size_t add_align, add_size;
struct resource *res;
 
-   if (!__has_addon(head, realloc_head))
-   return false;
-
-   if (!save_resources(head, &save_head))
-   return false;
-
/* Update res in head list with add_size in realloc_head list */
list_for_each_entry(dev_res, head, list) {
res = dev_res->res;
@@ -548,7 +545,6 @@ static bool __assign_resources_must_add_sorted(struct 
list_head *head,
/* Remove head list from realloc_head list */
list_for_each_entry(dev_res, head, list)
remove_from_list(realloc_head, dev_res->res);
-   free_list(&save_head);
free_list(head);
 
return true;
@@ -562,7 +558,7 @@ static bool __assign_resources_must_add_sorted(struct 
list_head *head,
if (res->parent && !pci_need_to_release(fail_type, res)) {
/* remove it from realloc_head list */
remove_from_list(realloc_head, res);
-   remove_from_list(&save_head, res);
+   remove_from_list(save_head, res);
list_del(&dev_res->list);
kfree(dev_res);
}
@@ -581,11 +577,9 @@ static bool __assign_resources_must_add_sorted(struct 
list_head *head,
}
}
/* Restore start/end/flags from saved list */
-   list_for_each_entry(save_res, &save_head, list)
+   list_for_each_entry(save_res, save_head, list)
restore_resource(save_res, save_res->res);
 
-   free_list(&save_head);
-
return false;
 }
 
@@ -603,16 +597,24 @@ static void __assign_resources_sorted(struct list_head 
*head,
 *then try to reassign add_size for some resources.
 */
 
+   LIST_HEAD(save_head);
+
/* Check must+optional add */
-   if (realloc_head &&
-   __assign_resources_must_add_sorted(head, realloc_head))
+   if (__has_addon(head, realloc_head) &&
+   save_resources(head, &save_head) &&
+   __assign_resources_must_add_sorted(head, &save_head,
+  realloc_head)) {
+   free_list(&save_head);
return;
+   }
 
__sort_resources(head);
 
/* Satisfy the must-have resource requests */
assign_requested_resources_sorted(head, fail_head);
 
+   free_list(&save_head);
+
/* Try to satisfy any additional optional resource
requests */
if (realloc_head)
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 28/49] PCI: Allow optional only io resource must size to be 0

2015-07-14 Thread Yinghai Lu
When there is no child device under a non-hotplug bridge,
we can use 0 for must size instead of using the old size as must size.

When there is child device, size will not be 0.
when the bridge is not hotplug, min_size will not be 0.
So they will still honor the old size as must size.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 969a0b1..0420d27 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1242,8 +1242,9 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
 
size = size_aligned_for_isa(size);
size += size1;
-   size0 = calculate_size(size, min_size,
-   resource_size(b_res), min_align);
+   if (size || min_size)
+   size0 = calculate_size(size, min_size,
+   resource_size(b_res), min_align);
sum_add_size = size_aligned_for_isa(sum_add_size);
sum_add_size += sum_add_size1;
if (sum_add_size < min_sum_size)
@@ -1259,7 +1260,7 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
return;
}
 
-   b_res->start = min_align;
+   b_res->start = size0 ? min_align : 0;
b_res->end = b_res->start + size0 - 1;
b_res->flags |= IORESOURCE_STARTALIGN;
if (size1 > size0 && realloc_head) {
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 25/49] PCI: Don't add too much optional size for hotplug bridge io

2015-07-14 Thread Yinghai Lu
Same as patch for mmio (PCI: Don't add too much optional size for hotplug
bridge mmio), and this one is addressing io port.

It will compare must+optional with min_sum_size to get smaller
optional size.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 26 --
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 8999ead..de55e07 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1189,7 +1189,6 @@ static resource_size_t window_alignment(struct pci_bus 
*bus,
  *
  * @bus : the bus
  * @min_size : the minimum io window that must to be allocated
- * @add_size : additional optional io window
  * @realloc_head : track the additional io window on this list
  *
  * Sizing the IO windows of the PCI-PCI bridge is trivial,
@@ -1198,9 +1197,11 @@ static resource_size_t window_alignment(struct pci_bus 
*bus,
  * We must be careful with the ISA aliasing though.
  */
 static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
-   resource_size_t add_size, struct list_head *realloc_head)
+struct list_head *realloc_head)
 {
struct pci_dev *dev;
+   resource_size_t min_sum_size = 0;
+   resource_size_t sum_add_size;
struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO,
IORESOURCE_IO);
resource_size_t size = 0, size0 = 0, size1 = 0;
@@ -1210,6 +1211,11 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
if (!b_res)
return;
 
+   if (realloc_head) {
+   min_sum_size = min_size;
+   min_size = 0;
+   }
+
min_align = window_alignment(bus, IORESOURCE_IO);
list_for_each_entry(dev, &bus->devices, bus_list) {
int i;
@@ -1239,10 +1245,11 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
 
size0 = calculate_iosize(size, min_size, size1,
resource_size(b_res), min_align);
-   if (children_add_size > add_size)
-   add_size = children_add_size;
-   size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
-   calculate_iosize(size, min_size, add_size + size1,
+   sum_add_size = children_add_size + size + size1;
+   if (sum_add_size < min_sum_size)
+   sum_add_size = min_sum_size;
+   size1 = !realloc_head ? size0 :
+   calculate_iosize(size, min_size, sum_add_size - size,
resource_size(b_res), min_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
@@ -1783,7 +1790,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
 {
struct pci_dev *dev;
unsigned long mask, prefmask, type2 = 0, type3 = 0;
-   resource_size_t min_mem_size = 0, additional_io_size = 0;
+   resource_size_t min_mem_size = 0, min_io_size = 0;
struct resource *b_res;
int ret;
 
@@ -1816,13 +1823,12 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
case PCI_CLASS_BRIDGE_PCI:
pci_bridge_check_ranges(bus);
if (bus->self->is_hotplug_bridge) {
-   additional_io_size  = pci_hotplug_io_size;
+   min_io_size  = pci_hotplug_io_size;
min_mem_size = pci_hotplug_mem_size;
}
/* Fall through */
default:
-   pbus_size_io(bus, realloc_head ? 0 : additional_io_size,
-additional_io_size, realloc_head);
+   pbus_size_io(bus, min_io_size, realloc_head);
 
/*
 * If there's a 64-bit prefetchable MMIO window, compute
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 14/49] PCI: Add __add_to_list()

2015-07-14 Thread Yinghai Lu
to take alt_size, alt_align.

preparation patch for alt_size support.


Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 51 ++---
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index f30225c..57b5c09 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -66,6 +66,8 @@ struct pci_dev_resource {
resource_size_t end;
resource_size_t add_size;
resource_size_t min_align;
+   resource_size_t alt_size;
+   resource_size_t alt_align;
unsigned long flags;
 };
 
@@ -88,15 +90,16 @@ static void free_list(struct list_head *head)
  * @add_size:  additional size to be optionally added
  *  to the resource
  */
-static int add_to_list(struct list_head *head,
+static int __add_to_list(struct list_head *head,
 struct pci_dev *dev, struct resource *res,
-resource_size_t add_size, resource_size_t min_align)
+resource_size_t add_size, resource_size_t min_align,
+resource_size_t alt_size, resource_size_t alt_align)
 {
struct pci_dev_resource *tmp;
 
tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
if (!tmp) {
-   pr_warn("add_to_list: kmalloc() failed!\n");
+   pr_warn("__add_to_list: kmalloc() failed!\n");
return -ENOMEM;
}
 
@@ -107,12 +110,20 @@ static int add_to_list(struct list_head *head,
tmp->flags = res->flags;
tmp->add_size = add_size;
tmp->min_align = min_align;
+   tmp->alt_size = alt_size;
+   tmp->alt_align = alt_align;
 
list_add(&tmp->list, head);
 
return 0;
 }
 
+static int add_to_list(struct list_head *head,
+struct pci_dev *dev, struct resource *res)
+{
+   return __add_to_list(head, dev, res, 0, 0, 0, 0);
+}
+
 static void remove_from_list(struct list_head *head,
 struct resource *res)
 {
@@ -378,9 +389,7 @@ static void assign_requested_resources_sorted(struct 
list_head *head,
if (resource_size(res) &&
pci_assign_resource(dev_res->dev, idx)) {
if (fail_head)
-   add_to_list(fail_head, dev_res->dev, res,
-   0 /* don't care */,
-   0 /* don't care */);
+   add_to_list(fail_head, dev_res->dev, res);
reset_resource(res);
}
}
@@ -466,7 +475,7 @@ static void __assign_resources_sorted(struct list_head 
*head,
 
/* Save original start, end, flags etc at first */
list_for_each_entry(dev_res, head, list) {
-   if (add_to_list(&save_head, dev_res->dev, dev_res->res, 0, 0)) {
+   if (add_to_list(&save_head, dev_res->dev, dev_res->res)) {
free_list(&save_head);
goto requested_and_reassign;
}
@@ -1019,8 +1028,8 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
b_res->end = b_res->start + size0 - 1;
b_res->flags |= IORESOURCE_STARTALIGN;
if (size1 > size0 && realloc_head) {
-   add_to_list(realloc_head, bus->self, b_res, size1-size0,
-   min_align);
+   __add_to_list(realloc_head, bus->self, b_res,
+ size1 - size0, min_align, 0, 0);
dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to 
%pR add_size %llx\n",
   b_res, &bus->busn_res,
   (unsigned long long)size1-size0);
@@ -1222,7 +1231,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
add_to_align_test_list(&align_test_add_list,
align, r_size);
r->end = r->start - 1;
-   add_to_list(realloc_head, dev, r, r_size, 0/* 
don't care */);
+   __add_to_list(realloc_head, dev, r,
+ r_size, align, 0, 0);
sum_add_size += r_size;
if (align > max_add_align)
max_add_align = align;
@@ -1293,8 +1303,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
b_res->end = size0 + min_align - 1;
b_res->flags |= IORESOURCE_STARTALIGN;
if (size1 > size0 && realloc_head) {
-   add_to_list(realloc_head, bus->self, b_res, size1 - size0,
-   min

[PATCH v2 18/49] PCI: Move comment to pci_need_to_release()

2015-07-14 Thread Yinghai Lu
Move comment from caller to callee, as we will have one new caller
for alt_size support later.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 27 +++
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 2e3d00b..f8b9a24 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -415,6 +415,20 @@ static unsigned long pci_fail_res_type_mask(struct 
list_head *fail_head)
 
 static bool pci_need_to_release(unsigned long mask, struct resource *res)
 {
+   /*
+* Separate three resource type checking if we need to release
+* assigned resource.
+*  1. if there is io port assign fail, will release assigned
+* io port.
+*  2. if there is pref mmio assign fail, release assigned
+* pref mmio.
+* if assigned pref mmio's parent is non-pref mmio and there
+* is non-pref mmio assign fail, will release that assigned
+* pref mmio.
+*  3. if there is non-pref mmio assign fail or pref mmio
+* assigned fail, will release assigned non-pref mmio.
+*/
+
if (res->flags & IORESOURCE_IO)
return !!(mask & IORESOURCE_IO);
 
@@ -471,19 +485,8 @@ static void __assign_resources_sorted(struct list_head 
*head,
 *  if could do that, could get out early.
 *  if could not do that, we still try to assign requested at first,
 *then try to reassign add_size for some resources.
-*
-* Separate three resource type checking if we need to release
-* assigned resource after requested + add_size try.
-*  1. if there is io port assign fail, will release assigned
-* io port.
-*  2. if there is pref mmio assign fail, release assigned
-* pref mmio.
-* if assigned pref mmio's parent is non-pref mmio and there
-* is non-pref mmio assign fail, will release that assigned
-* pref mmio.
-*  3. if there is non-pref mmio assign fail or pref mmio
-* assigned fail, will release assigned non-pref mmio.
 */
+
LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 23/49] PCI: Add support for more than two alt_size under same bridge

2015-07-14 Thread Yinghai Lu
Need to increase size to make sure it could fit all alt entries.

In the patch, we first select one big size, and then keep reducing
the size and retrying to get the minimum value for alt_size.

Example:
two bridges: one has 8M/8M and 1M/1M children res,
 the other has 4M/4M and 1M/1M children res.

Then the child bridges have alt_align/alt_size: 8M/9M, 4M/5M.
Before this patch, the parent bridge alt_align/alt_size is 8M/14M,
which is wrong.
With this patch the parent bridge alt_align/alt_size is 8M/17M.

At the same time, the child bridges' must align/size are 4M/12M and 2M/6M,
and the parent bridge's must align/size is 4M/20M.

So at last, we use 8M/17M as parent bridge alt_align/alt_size.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=100451
Reported-by: Yijing Wang 
Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 56 +++--
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 9da8b23..bf28f32 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1335,6 +1335,47 @@ out:
return good_align;
 }
 
+static resource_size_t calculate_mem_alt_size(struct list_head *head,
+   resource_size_t max_align, resource_size_t size,
+   resource_size_t align_low)
+{
+   struct align_test_res *p;
+   resource_size_t tmp;
+   resource_size_t good_size, bad_size;
+   int count = 0, order;
+
+   good_size = ALIGN(size, align_low);
+
+   list_for_each_entry(p, head, list)
+   count++;
+
+   if (count <= 1)
+   goto out;
+
+   __sort_align_test(head);
+
+   tmp = max(size, max_align);
+   order = __fls(count);
+   if ((1ULL << order) < count)
+   order++;
+   good_size = ALIGN((tmp << order), align_low);
+   bad_size = ALIGN(size, align_low) - align_low;
+   size = good_size;
+   while (size > bad_size) {
+   /* check if align/size fit all entries */
+   if (is_align_size_good(head, max_align, size, 0))
+   good_size = size;
+   else
+   bad_size = size;
+
+   size = bad_size + ((good_size - bad_size) >> 1);
+   size = round_down(size, align_low);
+   }
+
+out:
+   return good_size;
+}
+
 static inline bool is_optional(int i)
 {
 
@@ -1381,6 +1422,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
mask | IORESOURCE_PREFETCH, type);
LIST_HEAD(align_test_list);
LIST_HEAD(align_test_add_list);
+   LIST_HEAD(align_test_alt_list);
resource_size_t alt_size = 0, alt_align = 0;
resource_size_t window_align;
 
@@ -1454,10 +1496,17 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 
dev_res = res_to_dev_res(realloc_head, r);
if (dev_res && dev_res->alt_size) {
+   add_to_align_test_list(
+   &align_test_alt_list,
+   dev_res->alt_align,
+   dev_res->alt_size);
alt_size += dev_res->alt_size;
if (alt_align < dev_res->alt_align)
alt_align = dev_res->alt_align;
} else if (r_size > 1) {
+   add_to_align_test_list(
+   &align_test_alt_list,
+   align, r_size);
alt_size += r_size;
if (alt_align < align)
alt_align = align;
@@ -1477,14 +1526,17 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 
if (size0 && realloc_head) {
alt_align = max(alt_align, window_align);
-   alt_size = calculate_memsize(alt_size, min_size,
-0, window_align);
+   /* need to increase size to fit more alt */
+   alt_size = calculate_mem_alt_size(&align_test_alt_list,
+ alt_align, alt_size,
+ window_align);
/* must is better ? */
if (alt_size >= size0) {
alt_align = 0;
alt_size = 0;
}
}
+   free_align_test_list(&align_test_alt_list);
 
if (sum_add_size < min_sum_size)
sum_add_size = min_sum_size;
-- 
1.8.4.5

[PATCH v2 20/49] PCI: Skip must+optional if there is no optional addon

2015-07-14 Thread Yinghai Lu
If there is no optional addon (the bridge does not support hotplug and
no child has SR-IOV support), we can get out early and skip the
must+optional allocation.

Also, in the loop that updates res with the optional add info, skip
resources whose add_size is 0.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 32 +---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index d1f9e19..64ef516 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -450,6 +450,24 @@ static bool pci_need_to_release(unsigned long mask, struct 
resource *res)
return false;   /* should not get here */
 }
 
+static bool __has_addon(struct list_head *head,
+   struct list_head *realloc_head)
+{
+   int add_count = 0;
+   struct pci_dev_resource *dev_res, *tmp_res;
+
+   /* check if we have add really */
+   list_for_each_entry(dev_res, head, list) {
+   tmp_res = res_to_dev_res(realloc_head, dev_res->res);
+   if (!tmp_res || !tmp_res->add_size)
+   continue;
+
+   add_count++;
+   }
+
+   return add_count != 0;
+}
+
 static bool save_resources(struct list_head *head,
   struct list_head *save_head)
 {
@@ -481,16 +499,24 @@ static bool __assign_resources_must_add_sorted(struct 
list_head *head,
struct pci_dev_resource *save_res;
struct pci_dev_resource *dev_res, *tmp_res;
unsigned long fail_type;
-   resource_size_t add_align;
+   resource_size_t add_align, add_size;
struct resource *res;
 
+   if (!__has_addon(head, realloc_head))
+   return false;
+
if (!save_resources(head, &save_head))
return false;
 
/* Update res in head list with add_size in realloc_head list */
list_for_each_entry(dev_res, head, list) {
res = dev_res->res;
-   res->end += get_res_add_size(realloc_head, res);
+   add_size = get_res_add_size(realloc_head, res);
+
+   if (!add_size)
+   continue;
+
+   res->end += add_size;
 
/*
 * There are two kinds of additional resources in the list:
@@ -578,7 +604,7 @@ static void __assign_resources_sorted(struct list_head 
*head,
 */
 
/* Check must+optional add */
-   if (realloc_head && !list_empty(realloc_head) &&
+   if (realloc_head &&
__assign_resources_must_add_sorted(head, realloc_head))
return;
 
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 35/49] PCI: Add has_mem64 for host_bridge

2015-07-14 Thread Yinghai Lu
On systems that do not support mmio64 above 4G, has_mem64 will not be set.
We will use that info in the next two patches:
1. Don't treat non-pref mmio64 as pref mmio, so it will not be put
   under the bridge's pref range when rescanning the devices.
2. Keep pref mmio64 and pref mmio32 under the bridge pref BAR.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/probe.c | 9 +
 include/linux/pci.h | 1 +
 2 files changed, 10 insertions(+)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 010d8d9..14bdbca 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2051,6 +2051,15 @@ struct pci_bus *pci_create_root_bus(struct device 
*parent, int bus,
dev_info(&b->dev, "root bus resource %pR%s\n", res, bus_addr);
}
 
+   resource_list_for_each_entry(window, &bridge->windows) {
+   res = window->res;
+   if (resource_type(res) == IORESOURCE_MEM &&
+   (res->end - window->offset) > 0x) {
+   bridge->has_mem64 = 1;
+   break;
+   }
+   }
+
down_write(&pci_bus_sem);
list_add_tail(&b->node, &pci_root_buses);
up_write(&pci_bus_sem);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 33ef25f..0771b37 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -409,6 +409,7 @@ struct pci_host_bridge {
void (*release_fn)(struct pci_host_bridge *);
void *release_data;
unsigned int ignore_reset_delay:1;  /* for entire hierarchy */
+   unsigned int has_mem64:1;
 };
 
 #defineto_pci_host_bridge(n) container_of(n, struct pci_host_bridge, 
dev)
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 24/49] PCI: Better support for two alt_size

2015-07-14 Thread Yinghai Lu
Need to put entries whose size is aligned to the max align before those
whose size is not aligned.

For example:
alt align/size: 8M/9M, 4M/8M
before this patch we have 8M/20M.
After this patch we will have 8M/17M.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 80 +++--
 1 file changed, 78 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index bf28f32..8999ead 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -58,6 +58,20 @@ static inline bool is_before(resource_size_t align1, 
resource_size_t size1,
return false;
 }
 
+static inline bool is_before_alt(resource_size_t align, resource_size_t size1,
+resource_size_t size2)
+{
+   resource_size_t size1_left, size2_left;
+
+   /*  aligned is before not aligned */
+   size1_left = size1 & (align - 1);
+   size2_left = size2 & (align - 1);
+   if (!size1_left && size2_left)
+   return true;
+
+   return false;
+}
+
 struct pci_dev_resource {
struct list_head list;
struct resource *res;
@@ -307,6 +321,42 @@ static void __sort_resources(struct list_head *head)
}
 }
 
+static void __sort_resources_alt(struct list_head *head)
+{
+   struct pci_dev_resource *res1, *tmp_res, *res2;
+   resource_size_t align = 0;
+
+
+   __sort_resources(head);
+
+   /* get max align at first */
+   list_for_each_entry(res1, head, list) {
+   resource_size_t align1;
+
+   align1 = pci_resource_alignment(res1->dev, res1->res);
+   if (align1 > align)
+   align = align1;
+   }
+
+   list_for_each_entry_safe(res1, tmp_res, head, list) {
+   resource_size_t size1, size2;
+
+   size1 = resource_size(res1->res);
+
+   /* reorder it */
+   list_for_each_entry(res2, head, list) {
+   if (res2 == res1)
+   break;
+
+   size2 = resource_size(res2->res);
+   if (is_before_alt(align, size1, size2)) {
+   list_move_tail(&res1->list, &res2->list);
+   break;
+   }
+   }
+   }
+}
+
 /**
  * reassign_resources_sorted() - satisfy any additional resource requests
  *
@@ -673,7 +723,7 @@ static void __assign_resources_alt_sorted(struct list_head 
*head,
res->end = res->start + alt_res->alt_size - 1;
}
 
-   __sort_resources(head);
+   __sort_resources_alt(head);
/* Satisfy the alt resource requests */
assign_requested_resources_sorted(head, &local_alt_fail_head);
 
@@ -1267,6 +1317,32 @@ static void __sort_align_test(struct list_head *head)
}
 }
 
+static void __sort_align_test_alt(struct list_head *head)
+{
+   struct align_test_res *res1, *tmp_res, *res2;
+   resource_size_t align = 0;
+
+   __sort_align_test(head);
+
+   /* get max align at first */
+   list_for_each_entry(res1, head, list)
+   if (res1->align > align)
+   align = res1->align;
+
+   list_for_each_entry_safe(res1, tmp_res, head, list) {
+   /* reorder it */
+   list_for_each_entry(res2, head, list) {
+   if (res2 == res1)
+   break;
+
+   if (is_before_alt(align, res1->size, res2->size)) {
+   list_move_tail(&res1->list, &res2->list);
+   break;
+   }
+   }
+   }
+}
+
 static bool is_align_size_good(struct list_head *head,
resource_size_t min_align, resource_size_t size,
resource_size_t start)
@@ -1352,7 +1428,7 @@ static resource_size_t calculate_mem_alt_size(struct 
list_head *head,
if (count <= 1)
goto out;
 
-   __sort_align_test(head);
+   __sort_align_test_alt(head);
 
tmp = max(size, max_align);
order = __fls(count);
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 46/49] PCI: Set resource to FIXED for lsi devices

2015-07-14 Thread Yinghai Lu
LSI HBA firmware stops responding to PCI reads from the host if the PCI
core ever changes the PCI device BAR values.

Set their resources to FIXED, so that realloc will skip them.

Reported-by: Paul Johnson 
Suggested-by: Bjorn Helgaas 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=92351
Signed-off-by: Yinghai Lu 
Cc: sta...@vger.kernel.org
---
 drivers/pci/pci.h   |  1 +
 drivers/pci/quirks.c| 20 
 drivers/pci/setup-bus.c |  4 
 3 files changed, 25 insertions(+)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 1804d44..dec1c18 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -168,6 +168,7 @@ static inline void pci_msix_clear_and_set_ctrl(struct 
pci_dev *dev, u16 clear, u
 }
 
 void pci_realloc_get_opt(char *);
+bool pci_realloc_user_enabled(void);
 
 static inline int pci_no_d1d2(struct pci_dev *dev)
 {
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index e9fd0e9..184a09e 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -324,6 +324,26 @@ static void quirk_s3_64M(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_868,   
quirk_s3_64M);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_968,   
quirk_s3_64M);
 
+/*
+ * LSI devices firmware does not like BAR get changed
+ */
+static void quirk_bar_fixed(struct pci_dev *dev)
+{
+   int i;
+
+   if (pci_realloc_user_enabled())
+   return;
+
+   for (i = 0; i < PCI_STD_RESOURCE_END; i++) {
+   struct resource *r = &dev->resource[i];
+
+   if (!r->flags || r->flags & IORESOURCE_UNSET)
+   continue;
+   r->flags |= IORESOURCE_PCI_FIXED;
+   }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_LSI_LOGIC,  PCI_ANY_ID, 
quirk_bar_fixed);
+
 static void quirk_io(struct pci_dev *dev, int pos, unsigned size,
 const char *name)
 {
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 9d5423c..d9cfb55 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -2196,6 +2196,10 @@ void __init pci_realloc_get_opt(char *str)
else if (!strncmp(str, "on", 2))
pci_realloc_enable = user_enabled;
 }
+bool pci_realloc_user_enabled(void)
+{
+   return pci_realloc_enable == user_enabled;
+}
 static bool pci_realloc_enabled(enum enable_type enable)
 {
return enable >= user_enabled;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 37/49] PCI: Restore pref mmio allocation logic for hostbridge without mmio64

2015-07-14 Thread Yinghai Lu
Since commit 5b2854155 (PCI: Restrict 64-bit prefetchable bridge windows to
64-bit resources), the logic for pref mmio allocation is:
When the bridge pref window supports mmio64, we only put children pref
that support mmio64 into it, and put children pref mmio32
into the bridge's non-pref mmio32.

That could leave the bridge pref BAR unused when that pref BAR is mmio64
and the children res only have mmio32.
It could also cause allocation failure when the non-pref mmio32 window is
not big enough for those children pref mmio32.

That is not rational when the host bridge does not have 64bit mmio above 4G
at all.

This patch restores the old logic:
when the host bridge does not have has_mem64, put children pref mmio64 and
pref mmio32 all under the bridge's pref BARs.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/bus.c   |  4 +++-
 drivers/pci/setup-bus.c | 13 +
 drivers/pci/setup-res.c |  9 ++---
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 6fbd3f2..b043bdf 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -202,8 +202,10 @@ int pci_bus_alloc_resource(struct pci_bus *bus, struct 
resource *res,
 {
 #ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
int rc;
+   unsigned long mmio64 = pci_find_host_bridge(bus)->has_mem64 ?
+   IORESOURCE_MEM_64 : 0;
 
-   if (res->flags & IORESOURCE_MEM_64) {
+   if (res->flags & mmio64) {
rc = pci_bus_alloc_from_region(bus, res, size, align, min,
   type_mask, alignf, alignf_data,
   &pci_high);
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 37d5a48..f5b07d8 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1876,7 +1876,8 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
b_res = &bus->self->resource[PCI_BRIDGE_RESOURCES];
mask = IORESOURCE_MEM;
prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH;
-   if (b_res[2].flags & IORESOURCE_MEM_64) {
+   if ((b_res[2].flags & IORESOURCE_MEM_64) &&
+   pci_find_host_bridge(bus)->has_mem64) {
prefmask |= IORESOURCE_MEM_64;
ret = pbus_size_mem(bus, prefmask, prefmask,
  prefmask, prefmask,
@@ -2032,17 +2033,21 @@ static void pci_bridge_release_resources(struct pci_bus 
*bus,
 *io port.
 * 2. if there is non pref mmio assign fail, release bridge
 *nonpref mmio.
-* 3. if there is 64bit pref mmio assign fail, and bridge pref
+* 3. if there is pref mmio assign fail, and host bridge does
+*not have 64bit mmio, release bridge pref mmio.
+* 4. if there is 64bit pref mmio assign fail, and bridge pref
 *is 64bit, release bridge pref mmio.
-* 4. if there is pref mmio assign fail, and bridge pref is
+* 5. if there is pref mmio assign fail, and bridge pref is
 *32bit mmio, release bridge pref mmio
-* 5. if there is pref mmio assign fail, and bridge pref is not
+* 6. if there is pref mmio assign fail, and bridge pref is not
 *assigned, release bridge nonpref mmio.
 */
if (type & IORESOURCE_IO)
idx = 0;
else if (!(type & IORESOURCE_PREFETCH))
idx = 1;
+   else if (!pci_find_host_bridge(bus)->has_mem64)
+   idx = 2;
else if ((type & IORESOURCE_MEM_64) &&
 (b_res[2].flags & IORESOURCE_MEM_64))
idx = 2;
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index b19aa5b..26aedde 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -205,6 +205,8 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
struct resource *res = dev->resource + resno;
resource_size_t min;
int ret;
+   unsigned long mmio64 = pci_find_host_bridge(bus)->has_mem64 ?
+   IORESOURCE_MEM_64 : 0;
 
min = (res->flags & IORESOURCE_IO) ? PCIBIOS_MIN_IO : PCIBIOS_MIN_MEM;
 
@@ -216,7 +218,7 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
 * things differently than they were sized, not everything will fit.
 */
ret = pci_bus_alloc_resource(bus, res, size, align, min,
-IORESOURCE_PREFETCH | IORESOURCE_MEM_64,
+IORESOURCE_PREFETCH | mmio64,
 pcibios_align_resource, dev);
if (ret == 0)
return 0;
@@ -225,7 +227,8 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_

[PATCH v2 38/49] sparc/PCI: Add mem64 resource parsing for root bus

2015-07-14 Thread Yinghai Lu
Found "no compatible bridge window" warning in boot log from T5-8.

pci :00:01.0: can't claim BAR 15 [mem 0x1-0x4afff pref]: no 
compatible bridge window

That resource is above 4G, but does not get offset correctly as
the root bus only reports io and mem32.

pci_sun4v f02dbcfc: PCI host bridge to bus :00
pci_bus :00: root bus resource [io  0x8040-0x80400fff] (bus 
address [0x-0xfff])
pci_bus :00: root bus resource [mem 0x8000-0x80007eff] (bus 
address [0x-0x7eff])
pci_bus :00: root bus resource [bus 00-77]

Add mem64 handling in pci_common for sparc, so we can have 64bit resource
registered for root bus at first.

After patch, will have:
pci_sun4v f02dbcfc: PCI host bridge to bus :00
pci_bus :00: root bus resource [io  0x8040-0x80400fff] (bus 
address [0x-0xfff])
pci_bus :00: root bus resource [mem 0x8000-0x80007eff] (bus 
address [0x-0x7eff])
pci_bus :00: root bus resource [mem 0x8001-0x8007] (bus 
address [0x1-0x7])
pci_bus :00: root bus resource [bus 00-77]

Fixes: commit d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream 
windows")
Link: 
http://lkml.kernel.org/r/cae9fiqu1gjy1lyrxs+ma5lcteee4xmtjrg0axj9k_tsu+m9...@mail.gmail.com
Reported-by: David Ahern 
Tested-by: David Ahern 
Signed-off-by: Yinghai Lu 
Cc:  #3.19
---
 arch/sparc/kernel/pci.c|  7 ++-
 arch/sparc/kernel/pci_common.c | 15 +--
 arch/sparc/kernel/pci_impl.h   |  1 +
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index c928bc6..bfd0b70 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -185,8 +185,10 @@ static unsigned long pci_parse_of_flags(u32 addr0)
 
if (addr0 & 0x0200) {
flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY;
-   flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64;
flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M;
+   if (addr0 & 0x0100)
+   flags |= IORESOURCE_MEM_64
+| PCI_BASE_ADDRESS_MEM_TYPE_64;
if (addr0 & 0x4000)
flags |= IORESOURCE_PREFETCH
 | PCI_BASE_ADDRESS_MEM_PREFETCH;
@@ -660,6 +662,9 @@ struct pci_bus *pci_scan_one_pbm(struct pci_pbm_info *pbm,
pbm->io_space.start);
pci_add_resource_offset(&resources, &pbm->mem_space,
pbm->mem_space.start);
+   if (pbm->mem64_space.flags)
+   pci_add_resource_offset(&resources, &pbm->mem64_space,
+   pbm->mem_space.start);
pbm->busn.start = pbm->pci_first_busno;
pbm->busn.end   = pbm->pci_last_busno;
pbm->busn.flags = IORESOURCE_BUS;
diff --git a/arch/sparc/kernel/pci_common.c b/arch/sparc/kernel/pci_common.c
index 944a065..a859a86 100644
--- a/arch/sparc/kernel/pci_common.c
+++ b/arch/sparc/kernel/pci_common.c
@@ -406,6 +406,7 @@ void pci_determine_mem_io_space(struct pci_pbm_info *pbm)
}
 
num_pbm_ranges = i / sizeof(*pbm_ranges);
+   memset(&pbm->mem64_space, 0, sizeof(struct resource));
 
for (i = 0; i < num_pbm_ranges; i++) {
const struct linux_prom_pci_ranges *pr = &pbm_ranges[i];
@@ -451,7 +452,11 @@ void pci_determine_mem_io_space(struct pci_pbm_info *pbm)
break;
 
case 3:
-   /* XXX 64-bit MEM handling XXX */
+   /* 64-bit MEM handling */
+   pbm->mem64_space.start = a;
+   pbm->mem64_space.end = a + size - 1UL;
+   pbm->mem64_space.flags = IORESOURCE_MEM;
+   break;
 
default:
break;
@@ -465,15 +470,21 @@ void pci_determine_mem_io_space(struct pci_pbm_info *pbm)
prom_halt();
}
 
-   printk("%s: PCI IO[%llx] MEM[%llx]\n",
+   printk("%s: PCI IO[%llx] MEM[%llx]",
   pbm->name,
   pbm->io_space.start,
   pbm->mem_space.start);
+   if (pbm->mem64_space.flags)
+   printk(" MEM64[%llx]",
+  pbm->mem64_space.start);
+   printk("\n");
 
pbm->io_space.name = pbm->mem_space.name = pbm->name;
 
request_resource(&ioport_resource, &pbm->io_space);
request_resource(&iomem_resource, &pbm->mem_space);
+   if (pbm->mem64_space.flags)
+   request_resource(&iomem_resource, &pbm->mem64_space);
 
pci_register_legacy_regions(&a

[PATCH v2 22/49] PCI: Add alt_size allocation support

2015-07-14 Thread Yinghai Lu
On systems with several PCIe switches, the BIOS allocates very tight
resources to the bridge BARs, and they are not aligned to min_align the way
the kernel allocation code expects.

For example:
02:03.0---0c:00.0---0d:04.0---18:00.0
18:00.0 need 0x1000, and 0x0001.
BIOS only allocate 0x1010 to 0d:04.0 and above bridges.
Later after using /sys/bus/pci/devices/:0c:00.0/remove to remove 0c:00.0,
rescan with /sys/bus/pci/rescan can not allocate 0x1800 to 0c:00.0.

another example:
00:1c.0-[02-21]00.0-[03-21]--+-01.0-[04-12]00.0-[05-12]19.0-[06-12]00.0
 +-05.0-[13]--
 
+-07.0-[14-20]00.0-[15-20]--+-08.0-[16]--+-00.0
 |   |
\-00.1
 |   
+-14.0-[17]00.0
 |   
\-19.0-[18-20]00.0
 \-09.0-[21]--
06:00.0 need 0x400 and 0x80.
BIOS only allocate 0x480 to 05:19.0 and 04:00.0.
when 05:19.0 get removed via /sys/bus/pci/devices/:05:19.0/remove,
rescan with /sys/bus/pci/rescan will fail.
 pci :05:19.0: BAR 14: no space for [mem size 0x0600]
 pci :05:19.0: BAR 14: failed to assign [mem size 0x0600]
 pci :06:00.0: BAR 2: no space for [mem size 0x0400 64bit]
 pci :06:00.0: BAR 2: failed to assign [mem size 0x0400 64bit]
 pci :06:00.0: BAR 0: no space for [mem size 0x0080]
 pci :06:00.0: BAR 0: failed to assign [mem size 0x0080]
The current code tries to use align 0x200 and size 0x600, but the parent
bridge only has 0x480.

Introduce alt_align/alt_size and store them in the realloc list in addition
to the addon info; they will be tried after the min_align/min_size
allocation fails.

The alt_align is max_align, and alt_size is the size aligned to the bridge
minimum window alignment.

on my test setup:
00:1c.7---61:00.0---62:00.0
62:00.0 needs 0x80 and 0x2.
and 00:1c.7 only have 9M allocated for mmio, with this patch we have

 pci :61:00.0: bridge window [mem 0x0040-0x00ff] to [bus 62] 
add_size 0 add_align 0 alt_size 90 alt_align 80 must_size c0 
must_align 40
 pci :61:00.0: BAR 14: no space for [mem size 0x00c0]
 pci :61:00.0: BAR 14: failed to assign [mem size 0x00c0]
 pci :61:00.0: BAR 14: assigned [mem 0xdf00-0xdf8f]
 pci :62:00.0: BAR 0: assigned [mem 0xdf00-0xdf7f pref]
 pci :62:00.0: BAR 1: assigned [mem 0xdf80-0xdf81]
 pci :61:00.0: PCI bridge to [bus 62]
 pci :61:00.0:   bridge window [io  0x6000-0x6fff]
 pci :61:00.0:   bridge window [mem 0xdf00-0xdf8f]
 pci :00:1c.7: PCI bridge to [bus 61-68]
 pci :00:1c.7:   bridge window [io  0x6000-0x6fff]
 pci :00:1c.7:   bridge window [mem 0xdf00-0xdf8f]

So for 61:00.0 the first try with 12M fails, and the second try with the 9M
alt_size works. Later 62:00.0 gets the correct resources allocated too.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=100451
Reported-by: Yijing Wang 
Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 203 +---
 1 file changed, 191 insertions(+), 12 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 1c0b4c5..9da8b23 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -324,7 +324,7 @@ static void reassign_resources_sorted(struct list_head 
*realloc_head,
 {
struct resource *res;
struct pci_dev_resource *add_res, *tmp;
-   resource_size_t add_size, align;
+   resource_size_t add_size, align, r_size;
int idx;
 
list_for_each_entry_safe(add_res, tmp, realloc_head, list) {
@@ -340,12 +340,23 @@ static void reassign_resources_sorted(struct list_head 
*realloc_head,
idx = res - &add_res->dev->resource[0];
add_size = add_res->add_size;
align = add_res->min_align;
-   if (!resource_size(res)) {
+   if (!add_size || !align) /* alt_size only */
+   goto out;
+
+   r_size = resource_size(res);
+   if (!r_size) {
res->start = align;
res->end = res->start + add_size - 1;
if (pci_assign_resource(add_res->dev, idx))
reset_resource(res);
} else {
+   /* could just assigned with alt, add difference ? */
+   resource_size_t must_size;
+
+   must_size = add_res->end - add_res->start + 1;
+   if (r_size < must_size)
+   add_size += must_size - r_size;
+
res->flags |= add_res->flags &
 (IORESOURCE_STARTALIGN|IO

[PATCH v2 30/49] PCI: Kill macro checking for bus io port sizing

2015-07-14 Thread Yinghai Lu
Use new generic version skip_isa_ioresource_align() instead.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 17 +++--
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 9478e91..f3bb309 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1172,15 +1172,12 @@ int skip_isa_ioresource_align(struct pci_bus *bus)
return 0;
 }
 
-static resource_size_t size_aligned_for_isa(resource_size_t size)
+static resource_size_t size_aligned_for_isa(resource_size_t size,
+   struct pci_bus *bus)
 {
-   /*
-* To be fixed in 2.5: we should have sort of HAVE_ISA
-*  flag in the struct pci_bus.
-*/
-#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
-   size = (size & 0xff) + ((size & ~0xffUL) << 2);
-#endif
+   if (!skip_isa_ioresource_align(bus))
+   size = (size & 0xff) + ((size & ~0xffUL) << 2);
+
return size;
 }
 
@@ -1249,12 +1246,12 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
}
}
 
-   size = size_aligned_for_isa(size);
+   size = size_aligned_for_isa(size, bus);
size += size1;
if (size || min_size)
size0 = calculate_size(size, min_size,
resource_size(b_res), min_align);
-   sum_add_size = size_aligned_for_isa(sum_add_size);
+   sum_add_size = size_aligned_for_isa(sum_add_size, bus);
sum_add_size += sum_add_size1;
if (sum_add_size < min_sum_size)
sum_add_size = min_sum_size;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 16/49] PCI: Check if resource is allocated before pci_assign

2015-07-14 Thread Yinghai Lu
Skip already-assigned resources in the list, as pci_assign_resource()
can only handle unassigned resources. The list may already contain
assigned resources before we try alt_size.

Signed-off-by: Yinghai Lu 
---
 drivers/pci/setup-bus.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 1b5fbca..1622ad2 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -386,7 +386,7 @@ static void assign_requested_resources_sorted(struct 
list_head *head,
list_for_each_entry(dev_res, head, list) {
res = dev_res->res;
idx = res - &dev_res->dev->resource[0];
-   if (resource_size(res) &&
+   if (!res->parent && resource_size(res) &&
pci_assign_resource(dev_res->dev, idx)) {
if (fail_head)
add_to_list(fail_head, dev_res->dev, res);
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [lkp] [PCI] c0d1a185278: EDAC sbridge: Couldn't enable 8086:6fa0

2015-07-14 Thread Yinghai Lu
On Tue, Jul 14, 2015 at 6:13 AM, Huang Ying  wrote:
> FYI, we noticed the below changes on
>
> git://internal_merge_and_test_tree 
> revert-c0d1a18527806a3938e76a0e648cae690510b6a3-c0d1a18527806a3938e76a0e648cae690510b6a3
> commit c0d1a18527806a3938e76a0e648cae690510b6a3 ("PCI: Don't set flags to 0 
> when assign resource fail")
>
>
> [   30.350859] EDAC sbridge: Seeking for: PCI ID 8086:6fa0
> [   30.350867] sbridge_edac :ff:12.0: can't enable device: BAR 1 [mem 
> size 0x0010 disabled] not assigned
> [   30.350867] EDAC sbridge: Couldn't enable 8086:6fa0
> [   30.350901] EDAC sbridge: Some needed devices are missing
> [   30.350904] EDAC sbridge: Couldn't find mci handler
>

Hi Ying,

I updated the branch, it should fix the problem.

Can you test that again ?

BTW, there should be BIOS problem with it.

[4.144987] pci :ff:12.0: [8086:6fa0] type 00 class 0x088000
[4.151702] pci :ff:12.0: reg 0x14: [mem 0x-0x000f]
[4.158703] pci :ff:12.0: reg 0x18: [mem 0x-0x003f]
[4.165705] pci :ff:12.0: reg 0x1c: [mem 0x-0x000f]
[4.172706] pci :ff:12.0: reg 0x20: [mem 0x-0x003f]
[4.179708] pci :ff:12.0: reg 0x24: [mem 0x-0x000f]
[4.186739] pci :ff:12.1: [8086:6f30] type 00 class 0x110100
[4.193493] pci :ff:12.4: [8086:6f60] type 00 class 0x088000
[4.200208] pci :ff:12.4: reg 0x14: [mem 0x-0x000f]
[4.207210] pci :ff:12.4: reg 0x18: [mem 0x-0x003f]
[4.214212] pci :ff:12.4: reg 0x1c: [mem 0x-0x000f]
[4.221214] pci :ff:12.4: reg 0x20: [mem 0x-0x003f]
[4.228215] pci :ff:12.4: reg 0x24: [mem 0x-0x000f]

but for bios 0xff, there is no _CRS mmio.


also
[4.876484] pci :7f:1e.3: [Firmware Bug]: reg 0x10: invalid BAR
(can't size)
so it has silicon problem ?

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail

2015-07-11 Thread Yinghai Lu
On Fri, Jul 10, 2015 at 5:03 PM, Wei Yang  wrote:
> On Thu, Jul 09, 2015 at 10:49:06PM -0700, Yinghai Lu wrote:

> I have tested you latest branch with this one as the last commit:
>
> ec94cc7 PCI: Don't set flags to 0 when assign resource fail
>
> My P8 machine boots up.

Good.

>
> Another issue is the SRIOV couldn't be enabled, I am checking the reason.
> This may not related to this patch series.

I wonder if it could be related to:

https://git.kernel.org/cgit/linux/kernel/git/yinghai/linux-yinghai.git/patch/?id=c642f79dcd6becbb92741816e0b5e81f7664acc7
PCI: Restore pref mmio allocation logic for hostbridge without mmio64

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail

2015-07-09 Thread Yinghai Lu
On Thu, Jul 9, 2015 at 7:48 PM, Yinghai Lu  wrote:
> On Thu, Jul 9, 2015 at 7:30 PM, Wei Yang  wrote:
>> If you could update your for-pci-v4.3-next branch, that would be more
>> convenient for me to do the test.
>
> Just updated that branch, please check it.
>

just updated the branch again.

If you don't want to fetch it again, please apply the attached patch.
---
 drivers/pci/bus.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6/drivers/pci/bus.c
===
--- linux-2.6.orig/drivers/pci/bus.c
+++ linux-2.6/drivers/pci/bus.c
@@ -140,7 +140,7 @@ static int pci_bus_alloc_from_region(str
 	type_mask |= IORESOURCE_TYPE_BITS;
 
 	pci_bus_for_each_resource(bus, r, i) {
-		if (!r)
+		if (!r || resource_disabled(r))
 			continue;
 
 		/* type_mask must match */


Re: [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail

2015-07-09 Thread Yinghai Lu
On Thu, Jul 9, 2015 at 7:30 PM, Wei Yang  wrote:
> If you could update your for-pci-v4.3-next branch, that would be more
> convenient for me to do the test.

Just updated that branch, please check it.

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail

2015-07-09 Thread Yinghai Lu
On Wed, Jul 8, 2015 at 11:04 PM, Wei Yang  wrote:
> This one is on top of the last one ? or replace the last one?

should be just before last one.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail

2015-07-08 Thread Yinghai Lu
On Wed, Jul 8, 2015 at 8:30 PM, Wei Yang  wrote:
> Hi, Yinghai
>
> This patch may introduce some problem.
>
> On my P8 machine, after applying this patch, I see following error:
>
> [0.589948] pnv_ioda_setup_pe_seg: trigger IO SEG 0
> [0.589992] pnv_ioda_setup_pe_seg: res[io  0x1000-0x3fff] 100
>
> The last 0x100 is the res->flags, which indicates the UNSET and DISABLED bit
> is not set.

Maybe we should introduce resource_disabled() for that.

Please check if attached patch would fix the problem.

Thanks

Yinghai
Subject: [PATCH] PCI: Introduce resource_disabled()

so we can cover !flags and IORESOURCE_DISABLED both.

Signed-off-by: Yinghai Lu 

diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c
index 82f738e..91a7153 100644
--- a/arch/alpha/kernel/pci.c
+++ b/arch/alpha/kernel/pci.c
@@ -282,7 +282,7 @@ pcibios_claim_one_bus(struct pci_bus *b)
 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
 			struct resource *r = &dev->resource[i];
 
-			if (r->parent || !r->start || !r->flags)
+			if (r->parent || !r->start || resource_disabled(r))
 continue;
 			if (pci_has_flag(PCI_PROBE_ONLY) ||
 			(r->flags & IORESOURCE_PCI_FIXED)) {
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 7cc3be9..cc293ea 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -501,7 +501,7 @@ void pcibios_fixup_device_resources(struct pci_dev *dev)
 	for (idx = 0; idx < PCI_BRIDGE_RESOURCES; idx++) {
 		struct resource *r = &dev->resource[idx];
 
-		if (!r->flags || r->parent || !r->start)
+		if (resource_disabled(r) || r->parent || !r->start)
 			continue;
 
 		pci_claim_resource(dev, idx);
@@ -519,7 +519,7 @@ static void pcibios_fixup_bridge_resources(struct pci_dev *dev)
 	for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) {
 		struct resource *r = &dev->resource[idx];
 
-		if (!r->flags || r->parent || !r->start)
+		if (resource_disabled(r) || r->parent || !r->start)
 			continue;
 
 		pci_claim_bridge_resource(dev, idx);
diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index ae838ed..67848f8 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -705,7 +705,7 @@ static void pcibios_fixup_resources(struct pci_dev *dev)
 	}
 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
 		struct resource *res = dev->resource + i;
-		if (!res->flags)
+		if (resource_disabled(res))
 			continue;
 		if (res->start == 0) {
 			pr_debug("PCI:%s Resource %d %016llx-%016llx [%x]",
@@ -806,7 +806,7 @@ static void pcibios_fixup_bridge(struct pci_bus *bus)
 	pci_bus_for_each_resource(bus, res, i) {
 		if (!res)
 			continue;
-		if (!res->flags)
+		if (resource_disabled(res))
 			continue;
 		if (i >= 3 && bus->self->transparent)
 			continue;
@@ -993,7 +993,7 @@ static void pcibios_allocate_bus_resources(struct pci_bus *bus)
 		 pci_domain_nr(bus), bus->number);
 
 	pci_bus_for_each_resource(bus, res, i) {
-		if (!res || !res->flags
+		if (!res || resource_disabled(res)
 		|| res->start > res->end || res->parent)
 			continue;
 		if (bus->parent == NULL)
@@ -1095,7 +1095,8 @@ static void __init pcibios_allocate_resources(int pass)
 			r = &dev->resource[idx];
 			if (r->parent)		/* Already allocated */
 continue;
-			if (!r->flags || (r->flags & IORESOURCE_UNSET))
+			if (resource_disabled(r) ||
+			(r->flags & IORESOURCE_UNSET))
 continue;	/* Not assigned at all */
 			/* We only allocate ROMs on pass 1 just in case they
 			 * have been screwed up by firmware
@@ -1226,7 +1227,7 @@ void pcibios_claim_one_bus(struct pci_bus *bus)
 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
 			struct resource *r = &dev->resource[i];
 
-			if (r->parent || !r->start || !r->flags)
+			if (r->parent || !r->start || resource_disabled(r))
 continue;
 
 			pr_debug("PCI: Claiming %s: ", pci_name(dev));
@@ -1286,7 +1287,7 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose,
 	res->start = (res->start + io_offset) & 0xu;
 	res->end = (res->end + io_offset) & 0xu;
 
-	if (!res->flags) {
+	if (resource_disabled(res)) {
 		pr_warn("PCI: I/O resource not set for host ");
 		pr_cont("bridge %s (domain %d)\n",
 			hose->dn->full_name, hose->global_number);
@@ -1306,7 +1307,7 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose,
 	/* Hookup PHB Memory resources */
 	for (i = 0; i < 3; ++i) {
 		res = &hose->mem_resources[i];
-		if (!res->flags) {
+		if (resource_disabled(res)) {
 			if (i > 0)
 continue;
 			pr_err("PCI: Memory resource 0 not set for ");
diff --git a/arch/mn10300/unit-asb2305/pci-asb2305.c b/arch/mn10300/unit-asb2305/pci-asb2305.c
index b5b036f..a249821 100644

[PATCH 15/42] x86, kaslr: Introduce fetch_random_virt_offset to randomize the kernel text mapping address

2015-07-07 Thread Yinghai Lu
From: Baoquan He 

KASLR extended the kernel text mapping region size from 512M to 1G,
namely CONFIG_RANDOMIZE_BASE_MAX_OFFSET. This means the kernel text
can be mapped into the region below:

[__START_KERNEL_map + LOAD_PHYSICAL_ADDR, __START_KERNEL_map + 1G]

Introduce a function find_random_virt_offset() to get random value
between LOAD_PHYSICAL_ADDR and CONFIG_RANDOMIZE_BASE_MAX_OFFSET.
This random value will be added to __START_KERNEL_map to get the
starting address at which the kernel text is mapped. Since a slot can
be anywhere in this region, the region is one independent slot_area,
so it is simple to pick a slot from the random value.

Signed-off-by: Baoquan He 
---
 arch/x86/boot/compressed/aslr.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 81070e9..775c6f9 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -366,6 +366,27 @@ static unsigned long find_random_addr(unsigned long 
minimum,
return slots_fetch_random();
 }
 
+static unsigned long find_random_virt_offset(unsigned long minimum,
+ unsigned long image_size)
+{
+   unsigned long slot_num, random;
+
+   /* Make sure minimum is aligned. */
+   minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+
+   if (image_size <= CONFIG_PHYSICAL_ALIGN)
+   slot_num = (CONFIG_RANDOMIZE_BASE_MAX_OFFSET - minimum) /
+   CONFIG_PHYSICAL_ALIGN;
+   else
+   slot_num = (CONFIG_RANDOMIZE_BASE_MAX_OFFSET -
+   minimum - image_size) /
+   CONFIG_PHYSICAL_ALIGN + 1;
+
+   random = get_random_long() % slot_num;
+
+   return random * CONFIG_PHYSICAL_ALIGN + minimum;
+}
+
 unsigned char *choose_kernel_location(unsigned char *input,
  unsigned long input_size,
  unsigned char *output,
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 11/42] x86, boot: Add checking for memcpy

2015-07-07 Thread Yinghai Lu
parse_elf is using local memcpy to move section to running position.

That memcpy actually only supports non-overlapping regions or dest < src.

Add a check in memcpy to catch incorrect use in the future; at that point
we will need a backward-copying memcpy for it.

Also put comments in parse_elf about the fact.

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/compressed/misc.c   | 14 +++---
 arch/x86/boot/compressed/misc.h   |  2 ++
 arch/x86/boot/compressed/string.c | 28 ++--
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 8fb74ba..83f98a5 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -106,9 +106,6 @@
 #undef memset
 #define memzero(s, n)  memset((s), 0, (n))
 
-
-static void error(char *m);
-
 /*
  * This is set up by the setup-routine at boot-time
  */
@@ -218,7 +215,7 @@ void __putstr(const char *s)
outb(0xff & (pos >> 1), vidport+1);
 }
 
-static void error(char *x)
+void error(char *x)
 {
error_putstr("\n\n");
error_putstr(x);
@@ -353,9 +350,12 @@ static void parse_elf(void *output)
 #else
dest = (void *)(phdr->p_paddr);
 #endif
-   memcpy(dest,
-  output + phdr->p_offset,
-  phdr->p_filesz);
+   /*
+* simple version memcpy only can work when dest is
+*   smaller than src or no overlapping.
+* Here dest is smaller than src always.
+*/
+   memcpy(dest, output + phdr->p_offset, phdr->p_filesz);
break;
default: /* Ignore other PT_* */ break;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 0104c0be..af135b7 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -36,6 +36,8 @@ extern struct boot_params *real_mode; /* Pointer to 
real-mode data */
 void __putstr(const char *s);
 #define error_putstr(__x)  __putstr(__x)
 
+void error(char *x);
+
 #ifdef CONFIG_X86_VERBOSE_BOOTUP
 
 #define debug_putstr(__x)  __putstr(__x)
diff --git a/arch/x86/boot/compressed/string.c 
b/arch/x86/boot/compressed/string.c
index 00e788b..03805a4 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,7 +1,7 @@
 #include "../string.c"
 
 #ifdef CONFIG_X86_32
-void *memcpy(void *dest, const void *src, size_t n)
+void *__memcpy(void *dest, const void *src, size_t n)
 {
int d0, d1, d2;
asm volatile(
@@ -15,7 +15,7 @@ void *memcpy(void *dest, const void *src, size_t n)
return dest;
 }
 #else
-void *memcpy(void *dest, const void *src, size_t n)
+void *__memcpy(void *dest, const void *src, size_t n)
 {
long d0, d1, d2;
asm volatile(
@@ -30,6 +30,30 @@ void *memcpy(void *dest, const void *src, size_t n)
 }
 #endif
 
+void *memcpy(void *dest, const void *src, size_t n)
+{
+   unsigned long start_dest, end_dest;
+   unsigned long start_src, end_src;
+   unsigned long max_start, min_end;
+
+   if (dest < src)
+   return __memcpy(dest, src, n);
+
+   start_dest = (unsigned long)dest;
+   end_dest = (unsigned long)dest + n;
+   start_src = (unsigned long)src;
+   end_src = (unsigned long)src + n;
+   max_start = (start_dest > start_src) ?  start_dest : start_src;
+   min_end = (end_dest < end_src) ? end_dest : end_src;
+
+   if (max_start >= min_end)
+   return __memcpy(dest, src, n);
+
+   error("memcpy does not support overlapping with dest > src!\n");
+
+   return dest;
+}
+
 void *memset(void *s, int c, size_t n)
 {
int i;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 08/42] x86, kaslr: Get correct max_addr for relocs pointer

2015-07-07 Thread Yinghai Lu
There is boundary checking for pointer in kaslr relocation handling.

The current code uses output_len, which is the VO (vmlinux after objcopy)
file size plus the vmlinux.relocs file size.

That is not right, as we should use the loaded (run-time) addresses.

By that time parse_elf has already moved the sections according to the ELF headers.

The valid range should be VO [_text, __bss_start) loaded physical addresses.

In the patch, add export for __bss_start to voffset.h and use it to get
max_addr.

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/compressed/Makefile | 2 +-
 arch/x86/boot/compressed/misc.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/compressed/Makefile 
b/arch/x86/boot/compressed/Makefile
index 50daea7..e12a93c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -40,7 +40,7 @@ LDFLAGS_vmlinux := -T
 hostprogs-y:= mkpiggy
 HOST_EXTRACFLAGS += -I$(srctree)/tools/include
 
-sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define 
VO_\2 _AC(0x\1,UL)/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] 
\(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
 
 quiet_cmd_voffset = VOFFSET $@
   cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index db97bdf..8fb74ba 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -234,7 +234,7 @@ static void handle_relocations(void *output, unsigned long 
output_len)
int *reloc;
unsigned long delta, map, ptr;
unsigned long min_addr = (unsigned long)output;
-   unsigned long max_addr = min_addr + output_len;
+   unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
 
/*
 * Calculate the delta between where vmlinux was linked to load
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 14/42] x86, kaslr: Add two functions which will be used later

2015-07-07 Thread Yinghai Lu
From: Baoquan He 

Add two functions mem_min_overlap() and store_slot_info() which will be
used later.

Given a memory region, mem_min_overlap() iterates over all avoid regions to
find the overlapping one with the lowest start address.

store_slot_info() calculates the slot info of the passed-in region and
stores it into slot_areas[].

Signed-off-by: Baoquan He 
---
 arch/x86/boot/compressed/aslr.c | 51 +
 1 file changed, 51 insertions(+)

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index e3995f1..81070e9 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -214,6 +214,40 @@ static bool mem_avoid_overlap(struct mem_vector *img)
return false;
 }
 
+static unsigned long
+mem_min_overlap(struct mem_vector *img, struct mem_vector *out)
+{
+   int i;
+   struct setup_data *ptr;
+   unsigned long min = img->start + img->size;
+
+   for (i = 0; i < MEM_AVOID_MAX; i++) {
+   if (mem_overlaps(img, &mem_avoid[i]) &&
+   (mem_avoid[i].start < min)) {
+   *out = mem_avoid[i];
+   min = mem_avoid[i].start;
+   }
+   }
+
+   /* Check all entries in the setup_data linked list. */
+   ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data;
+   while (ptr) {
+   struct mem_vector avoid;
+
+   avoid.start = (unsigned long)ptr;
+   avoid.size = sizeof(*ptr) + ptr->len;
+
+   if (mem_overlaps(img, &avoid) && (avoid.start < min)) {
+   *out = avoid;
+   min = avoid.start;
+   }
+
+   ptr = (struct setup_data *)(unsigned long)ptr->next;
+   }
+
+   return min;
+}
+
 static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
   CONFIG_PHYSICAL_ALIGN];
 
@@ -230,6 +264,23 @@ static unsigned long slot_max;
 
 static unsigned long slot_area_index;
 
+static void store_slot_info(struct mem_vector *region, unsigned long 
image_size)
+{
+   struct slot_area slot_area;
+
+   slot_area.addr = region->start;
+   if (image_size <= CONFIG_PHYSICAL_ALIGN)
+   slot_area.num = region->size / CONFIG_PHYSICAL_ALIGN;
+   else
+   slot_area.num = (region->size - image_size) /
+   CONFIG_PHYSICAL_ALIGN + 1;
+
+   if (slot_area.num > 0) {
+   slot_areas[slot_area_index++] = slot_area;
+   slot_max += slot_area.num;
+   }
+}
+
 static void slots_append(unsigned long addr)
 {
/* Overflowing the slots list should be impossible. */
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 12/42] x86, kaslr: Fix a bug that relocation can not be handled when kernel is loaded above 2G

2015-07-07 Thread Yinghai Lu
From: Baoquan He 

When processing 32-bit relocation tables, a local variable 'extended' is
defined to calculate the physical address of a relocs entry. However,
its type is int, which is enough for i386 but not for x86_64.
That's why relocations can only be handled when the kernel is loaded
below 2G; otherwise an overflow will happen and cause a system hang.

Change it to long, as the 32-bit inverse relocation processing does;
this change is safe for i386 relocation handling too.

Signed-off-by: Baoquan He 
---
 arch/x86/boot/compressed/misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 83f98a5..bfa4f0a 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -273,7 +273,7 @@ static void handle_relocations(void *output, unsigned long 
output_len)
 * So we work backwards from the end of the decompressed image.
 */
for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
-   int extended = *reloc;
+   long extended = *reloc;
extended += map;
 
ptr = (unsigned long)extended;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 18/42] x86, kaslr: Remove useless codes

2015-07-07 Thread Yinghai Lu
From: Baoquan He 

Several auxiliary functions and slots[] are not needed any more since
struct slot_area is used to store the slot info of kaslr now. Hence
remove them in this patch.

Signed-off-by: Baoquan He 
---
 arch/x86/boot/compressed/aslr.c | 24 
 1 file changed, 24 deletions(-)

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 9158882..7c0e1da 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -112,17 +112,6 @@ struct mem_vector {
 #define MEM_AVOID_MAX 4
 static struct mem_vector mem_avoid[MEM_AVOID_MAX];
 
-static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
-{
-   /* Item at least partially before region. */
-   if (item->start < region->start)
-   return false;
-   /* Item at least partially after region. */
-   if (item->start + item->size > region->start + region->size)
-   return false;
-   return true;
-}
-
 static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
 {
/* Item one is entirely before item two. */
@@ -248,9 +237,6 @@ mem_min_overlap(struct mem_vector *img, struct mem_vector 
*out)
return min;
 }
 
-static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
-  CONFIG_PHYSICAL_ALIGN];
-
 struct slot_area {
unsigned long addr;
int num;
@@ -281,16 +267,6 @@ static void store_slot_info(struct mem_vector *region, 
unsigned long image_size)
}
 }
 
-static void slots_append(unsigned long addr)
-{
-   /* Overflowing the slots list should be impossible. */
-   if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
-   CONFIG_PHYSICAL_ALIGN)
-   return;
-
-   slots[slot_max++] = addr;
-}
-
 static unsigned long slots_fetch_random(void)
 {
unsigned long random;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 19/42] x86, kaslr: Allow random address could be below loaded address

2015-07-07 Thread Yinghai Lu
Currently the new output buffer is always placed after the current one.

With correct tracking in mem_avoid, we can also place the buffer below it.

This ensures that when a bootloader such as a patched grub2 or kexec
has put the output near the end of RAM, we can still get a
random base below the output.

For now, just cap min_addr at 512M (the output address is used if it is lower).

with this patch, will get:

early console in decompress_kernel
decompress_kernel:
  input: [0x13e9ee3b4-0x13f36b9df], output: [0x13c00-0x13f394fff], heap: 
[0x13f376ac0-0x13f37eabf]
boot via startup_64
KASLR using RDTSC...
KASLR using RDTSC...
 new output: [0x6f00-0x72394fff]

Decompressing Linux... xz... Parsing ELF... Performing relocations... done.
Booting the kernel.
[0.00] bootconsole [uart0] enabled
[0.00] Kernel Layout:
[0.00]   .text: [0x6f00-0x70096a9c]
[0.00] .rodata: [0x7020-0x70a4efff]
[0.00]   .data: [0x70c0-0x70e4e9bf]
[0.00]   .init: [0x70e5-0x7120bfff]
[0.00].bss: [0x71219000-0x7234efff]
[0.00].brk: [0x7234f000-0x72374fff]

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/compressed/aslr.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 7c0e1da..a1535c1 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -403,7 +403,8 @@ void choose_kernel_location(unsigned char *input,
unsigned long output_run_size,
unsigned char **virt_offset)
 {
-   unsigned long random;
+   unsigned long random, min_addr;
+
*virt_offset = (unsigned char *)LOAD_PHYSICAL_ADDR;
 
 #ifdef CONFIG_HIBERNATION
@@ -424,8 +425,13 @@ void choose_kernel_location(unsigned char *input,
mem_avoid_init((unsigned long)input, input_size,
   (unsigned long)*output);
 
+   /* start from 512M */
+   min_addr = (unsigned long)*output;
+   if (min_addr > (512UL<<20))
+   min_addr = 512UL<<20;
+
/* Walk e820 and find a random address. */
-   random = find_random_phy_addr((unsigned long)*output, output_run_size);
+   random = find_random_phy_addr(min_addr, output_run_size);
if (!random)
debug_putstr("KASLR could not find suitable E820 region...\n");
else {
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 13/42] x86, kaslr: Introduce struct slot_area to manage randomization slot info

2015-07-07 Thread Yinghai Lu
From: Baoquan He 

The kernel is expected to be randomly reloaded anywhere in the whole
physical memory area, which could be close to 64T at most. In this case
there could be about 4*1024*1024 randomization slots. Hence the
old slot array would cost too much memory, and storing the slot
information one by one into the slot array is also inefficient.

Introduce struct slot_area to manage the randomization slot info
for one contiguous memory area, excluding the avoid areas. slot_areas[]
is used to store all slot area info. Since setup_data is a linked
list that may contain many entries chained by pointers, excluding
them can split RAM into many smaller areas; only the first 100 slot
areas are kept if there are too many of them.

Signed-off-by: Baoquan He 
---
 arch/x86/boot/compressed/aslr.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 0990c78..e3995f1 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -216,8 +216,20 @@ static bool mem_avoid_overlap(struct mem_vector *img)
 
 static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
   CONFIG_PHYSICAL_ALIGN];
+
+struct slot_area {
+   unsigned long addr;
+   int num;
+};
+
+#define MAX_SLOT_AREA 100
+
+static struct slot_area slot_areas[MAX_SLOT_AREA];
+
 static unsigned long slot_max;
 
+static unsigned long slot_area_index;
+
 static void slots_append(unsigned long addr)
 {
/* Overflowing the slots list should be impossible. */
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 20/42] x86, boot: Add printf support for early console in compressed/misc.c

2015-07-07 Thread Yinghai Lu
Reuse printf.c in x86 setup code.
And print out decompress_kernel input and output info.

Later, the decompressor code can print more debug info.

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/compressed/Makefile |  2 +-
 arch/x86/boot/compressed/misc.c   | 38 ++
 arch/x86/boot/compressed/misc.h   |  7 +++
 arch/x86/boot/compressed/printf.c |  5 +
 4 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/boot/compressed/printf.c

diff --git a/arch/x86/boot/compressed/Makefile 
b/arch/x86/boot/compressed/Makefile
index 66461b4..8fc7dd9 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -54,7 +54,7 @@ $(obj)/misc.o: $(obj)/../voffset.h
 
 vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
$(obj)/string.o $(obj)/cmdline.o \
-   $(obj)/piggy.o $(obj)/cpuflags.o
+   $(obj)/printf.o $(obj)/piggy.o $(obj)/cpuflags.o
 
 vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 6b2a308..ee73b7b 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -387,6 +387,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, 
memptr heap,
unsigned char *output_orig = output;
unsigned long output_run_size;
unsigned char *virt_offset;
+   unsigned long init_size;
 
real_mode = rmode;
 
@@ -414,6 +415,37 @@ asmlinkage __visible void *decompress_kernel(void *rmode, 
memptr heap,
 
output_run_size = output_len > run_size ? output_len : run_size;
 
+   init_size = real_mode->hdr.init_size;
+   debug_putstr("decompress_kernel:\n");
+   debug_printf("   input: [0x%010lx-0x%010lx]\n",
+(unsigned long)input_data,
+(unsigned long)input_data + input_len - 1);
+   debug_printf("  output: [0x%010lx-0x%010lx] 0x%08lx: output_len\n",
+(unsigned long)output,
+(unsigned long)output + output_len - 1,
+(unsigned long)output_len);
+   debug_printf("  [0x%010lx-0x%010lx] 0x%08lx: run_size\n",
+(unsigned long)output,
+(unsigned long)output + run_size - 1,
+(unsigned long)run_size);
+   debug_printf("  [0x%010lx-0x%010lx] 0x%08lx: 
output_run_size\n",
+(unsigned long)output,
+(unsigned long)output + output_run_size - 1,
+(unsigned long)output_run_size);
+   debug_printf("  [0x%010lx-0x%010lx] 0x%08lx: init_size\n",
+(unsigned long)output,
+(unsigned long)output + init_size - 1,
+(unsigned long)init_size);
+   debug_printf("ZO text/data: [0x%010lx-0x%010lx]\n",
+(unsigned long)input_data + input_len,
+(unsigned long)output + init_size - 1);
+   debug_printf(" ZO heap: [0x%010lx-0x%010lx]\n",
+(unsigned long)heap,
+(unsigned long)heap + BOOT_HEAP_SIZE - 1);
+   debug_printf("  VO bss/brk: [0x%010lx-0x%010lx]\n",
+(unsigned long)output + (VO___bss_start - VO__text),
+(unsigned long)output + run_size - 1);
+
/*
 * The memory hole needed for the kernel is the larger of either
 * the entire decompressed kernel plus relocation table, or the
@@ -422,6 +454,12 @@ asmlinkage __visible void *decompress_kernel(void *rmode, 
memptr heap,
choose_kernel_location(input_data, input_len, &output,
   output_run_size, &virt_offset);
 
+   if (output != output_orig)
+   debug_printf("  new output: [0x%010lx-0x%010lx] 0x%08lx: 
output_run_size\n",
+(unsigned long)output,
+(unsigned long)output + output_run_size - 1,
+(unsigned long)output_run_size);
+
/* Validate memory location choices. */
if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
error("Destination address inappropriately aligned");
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index b44a7c0..410e5d3 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -38,14 +38,21 @@ void __putstr(const char *s);
 
 void error(char *x);
 
+/* printf.c */
+int sprintf(char *buf, const char *fmt, ...);
+int printf(const char *fmt, ...);
+
 #ifdef CONFIG_X86_VERBOSE_BOOTUP
 
 #define debug_putstr(__x)  __putstr(__x)
+#define debug_printf printf
 
 #else
 
 static inline void debug_putstr(const char *s)
 { }
+static inline int debug_printf(const char *fmt, ...)
+{ }
 
 #endif
 

[PATCH 33/42] x86, boot: Add add_pci handler for SETUP_PCI

2015-07-07 Thread Yinghai Lu
Let it reserve setup_data, and keep its own list.

Also clear hdr.setup_data, as all handlers now handle or
reserve setup_data locally.

Cc: Bjorn Helgaas 
Cc: Matt Fleming 
Cc: linux-...@vger.kernel.org
Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/pci.h |  2 ++
 arch/x86/kernel/setup.c|  8 
 arch/x86/pci/common.c  | 42 --
 3 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 4625943..7d2468c 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -80,8 +80,10 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct 
vm_area_struct *vma,
 
 #ifdef CONFIG_PCI
 extern void early_quirks(void);
+void add_pci(u64 pa_data);
 #else
 static inline void early_quirks(void) { }
+static inline void add_pci(u64 pa_data) { }
 #endif
 
 extern void pci_iommu_alloc(void);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a3b65f1..de0f830 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -440,6 +440,8 @@ static void __init parse_setup_data(void)
pa_next = data->next;
early_memunmap(data, sizeof(*data));
 
+   printk(KERN_DEBUG "setup_data type: %d @ %#010llx\n",
+   data_type, pa_data);
switch (data_type) {
case SETUP_E820_EXT:
parse_e820_ext(pa_data, data_len);
@@ -447,14 +449,20 @@ static void __init parse_setup_data(void)
case SETUP_DTB:
add_dtb(pa_data);
break;
+   case SETUP_PCI:
+   add_pci(pa_data);
+   break;
case SETUP_EFI:
parse_efi_setup(pa_data, data_len);
break;
default:
+   pr_warn("Unknown setup_data type: %d @ %#010llx 
ignored!\n",
+   data_type, pa_data);
break;
}
pa_data = pa_next;
}
+   boot_params.hdr.setup_data = 0; /* all done */
 }
 
 static void __init memblock_x86_reserve_range_setup_data(void)
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 8fd6f44..16ace12 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -641,31 +642,44 @@ unsigned int pcibios_assign_all_busses(void)
return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0;
 }
 
+static u64 pci_setup_data;
+void __init add_pci(u64 pa_data)
+{
+   struct setup_data *data;
+
+   data = early_memremap(pa_data, sizeof(*data));
+   memblock_reserve(pa_data, sizeof(*data) + data->len);
+   data->next = pci_setup_data;
+   pci_setup_data = pa_data;
+   early_memunmap(data, sizeof(*data));
+}
+
 int pcibios_add_device(struct pci_dev *dev)
 {
struct setup_data *data;
struct pci_setup_rom *rom;
u64 pa_data;
 
-   pa_data = boot_params.hdr.setup_data;
+   pa_data = pci_setup_data;
while (pa_data) {
data = ioremap(pa_data, sizeof(*rom));
if (!data)
return -ENOMEM;
 
-   if (data->type == SETUP_PCI) {
-   rom = (struct pci_setup_rom *)data;
-
-   if ((pci_domain_nr(dev->bus) == rom->segment) &&
-   (dev->bus->number == rom->bus) &&
-   (PCI_SLOT(dev->devfn) == rom->device) &&
-   (PCI_FUNC(dev->devfn) == rom->function) &&
-   (dev->vendor == rom->vendor) &&
-   (dev->device == rom->devid)) {
-   dev->rom = pa_data +
- offsetof(struct pci_setup_rom, romdata);
-   dev->romlen = rom->pcilen;
-   }
+   rom = (struct pci_setup_rom *)data;
+
+   if ((pci_domain_nr(dev->bus) == rom->segment) &&
+   (dev->bus->number == rom->bus) &&
+   (PCI_SLOT(dev->devfn) == rom->device) &&
+   (PCI_FUNC(dev->devfn) == rom->function) &&
+   (dev->vendor == rom->vendor) &&
+   (dev->device == rom->devid)) {
+   dev->rom = pa_data +
+ offsetof(struct pci_setup_rom, romdata);
+   dev->romlen = rom->pcilen;
+   dev_printk(KERN_DEBUG, &dev->dev, "set rom to [%#010lx, 
%#010lx] via SETUP_PCI\n",
+   

[PATCH 31/42] x86, efi: Copy SETUP_EFI data and access directly

2015-07-07 Thread Yinghai Lu
The copy will be in __initdata, and it is small.

We can use a pointer to access the setup_data instead of using early_memremap
everywhere.

Cc: Matt Fleming 
Cc: linux-...@vger.kernel.org
Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/efi.h |  2 +-
 arch/x86/platform/efi/efi.c| 13 ++---
 arch/x86/platform/efi/efi_64.c | 10 +-
 arch/x86/platform/efi/quirks.c | 23 ++-
 4 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 155162e..a3e3aee 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -116,7 +116,7 @@ struct efi_setup_data {
u64 reserved[8];
 };
 
-extern u64 efi_setup;
+extern struct efi_setup_data *efi_setup;
 
 #ifdef CONFIG_EFI
 
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index cfba30f..33036ce 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -68,7 +68,7 @@ static efi_config_table_type_t arch_tables[] __initdata = {
{NULL_GUID, NULL, NULL},
 };
 
-u64 efi_setup; /* efi setup_data physical address */
+struct efi_setup_data *efi_setup __initdata; /* cached efi setup_data pointer 
*/
 
 static int add_efi_memmap __initdata;
 static int __init setup_add_efi_memmap(char *arg)
@@ -257,20 +257,13 @@ static int __init efi_systab_init(void *phys)
 {
if (efi_enabled(EFI_64BIT)) {
efi_system_table_64_t *systab64;
-   struct efi_setup_data *data = NULL;
+   struct efi_setup_data *data = efi_setup;
u64 tmp = 0;
 
-   if (efi_setup) {
-   data = early_memremap(efi_setup, sizeof(*data));
-   if (!data)
-   return -ENOMEM;
-   }
systab64 = early_memremap((unsigned long)phys,
 sizeof(*systab64));
if (systab64 == NULL) {
pr_err("Couldn't map the system table!\n");
-   if (data)
-   early_memunmap(data, sizeof(*data));
return -ENOMEM;
}
 
@@ -303,8 +296,6 @@ static int __init efi_systab_init(void *phys)
tmp |= data ? data->tables : systab64->tables;
 
early_memunmap(systab64, sizeof(*systab64));
-   if (data)
-   early_memunmap(data, sizeof(*data));
 #ifdef CONFIG_X86_32
if (tmp >> 32) {
pr_err("EFI data located above 4GB, disabling EFI.\n");
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index a0ac0f9..a255491 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -295,9 +295,17 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, 
unsigned long size,
return (void __iomem *)__va(phys_addr);
 }
 
+static struct efi_setup_data efi_setup_data __initdata;
+
 void __init parse_efi_setup(u64 phys_addr, u32 data_len)
 {
-   efi_setup = phys_addr + sizeof(struct setup_data);
+   struct efi_setup_data *data;
+
+   data = early_memremap(phys_addr + sizeof(struct setup_data),
+ sizeof(*data));
+   efi_setup_data = *data;
+   early_memunmap(data, sizeof(*data));
+   efi_setup = &efi_setup_data;
 }
 
 void __init efi_runtime_mkexec(void)
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 1c7380d..45fec7d 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -203,9 +203,8 @@ void __init efi_free_boot_services(void)
  */
 int __init efi_reuse_config(u64 tables, int nr_tables)
 {
-   int i, sz, ret = 0;
+   int i, sz;
void *p, *tablep;
-   struct efi_setup_data *data;
 
if (!efi_setup)
return 0;
@@ -213,22 +212,15 @@ int __init efi_reuse_config(u64 tables, int nr_tables)
if (!efi_enabled(EFI_64BIT))
return 0;
 
-   data = early_memremap(efi_setup, sizeof(*data));
-   if (!data) {
-   ret = -ENOMEM;
-   goto out;
-   }
-
-   if (!data->smbios)
-   goto out_memremap;
+   if (!efi_setup->smbios)
+   return 0;
 
sz = sizeof(efi_config_table_64_t);
 
p = tablep = early_memremap(tables, nr_tables * sz);
if (!p) {
pr_err("Could not map Configuration table!\n");
-   ret = -ENOMEM;
-   goto out_memremap;
+   return -ENOMEM;
}
 
for (i = 0; i < efi.systab->nr_tables; i++) {
@@ -237,15 +229,12 @@ int __init efi_reuse_config(u64 tables, int nr_tables)
guid = ((efi_config_table_64_t *)p)->guid;
 
if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
-   

[PATCH 40/42] x86, 64bit: remove highmap for not needed ranges

2015-07-07 Thread Yinghai Lu
Add cleanup_highmap_late() to remove the highmap for initmem, around rodata, and
for [_brk_end, all_end).

Kernel Layout:

[0.00]   .text: [0x0100-0x0200df88]
[0.00] .rodata: [0x0220-0x02a1dfff]
[0.00]   .data: [0x02c0-0x02e510ff]
[0.00]   .init: [0x02e53000-0x03213fff]
[0.00].bss: [0x03222000-0x0437cfff]
[0.00].brk: [0x0437d000-0x043a2fff]

Actually used brk:
[0.270365] memblock_reserve: [0x000437d000-0x0004383fff] flags 0x0 
BRK

Before patch:
---[ High Kernel Mapping ]---
0x8000-0x8100  16M   pmd
0x8100-0x8200  16M ro PSE GLB x  pmd
0x8200-0x82011000  68K ro GLB x  pte
0x82011000-0x82201980K RW GLB x  pte
0x8220-0x82a0   8M ro PSE GLB NX pmd
0x82a0-0x82a1e000 120K ro GLB NX pte
0x82a1e000-0x82c01928K RW GLB NX pte
0x82c0-0x82e0   2M RW PSE GLB NX pmd
0x82e0-0x8300   2M RW GLB NX pte
0x8300-0x8320   2M RW PSE GLB NX pmd
0x8320-0x8340   2M RW GLB NX pte
0x8340-0x8440  16M RW PSE GLB NX pmd
0x8440-0xa000 444M   pmd

After patch:
---[ High Kernel Mapping ]---
0x8000-0x8100  16M   pmd
0x8100-0x8200  16M ro PSE GLB x  pmd
0x8200-0x82012000  72K ro GLB x  pte
0x82012000-0x82201976K   pte
0x8220-0x82a0   8M ro PSE GLB NX pmd
0x82a0-0x82a1e000 120K ro GLB NX pte
0x82a1e000-0x82c01928K   pte
0x82c0-0x82e0   2M RW PSE GLB NX pmd
0x82e0-0x82e53000 332K RW GLB NX pte
0x82e53000-0x83001716K   pte
0x8300-0x8320   2M   pmd
0x8320-0x83214000  80K   pte
0x83214000-0x83401968K RW GLB NX pte
0x8340-0x8420  14M RW PSE GLB NX pmd
0x8420-0x843840001552K RW GLB NX pte
0x84384000-0x8440 496K   pte
0x8440-0xa000 444M   pmd

So remove some range around rodata.

-v4: adapt it to all_end change.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init_64.c | 62 +++
 1 file changed, 62 insertions(+)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 2507b98..38aa59c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1010,6 +1010,61 @@ void __init mem_init(void)
 }
 
 #ifdef CONFIG_DEBUG_RODATA
+static void remove_highmap_2m(unsigned long addr)
+{
+   pgd_t *pgd = pgd_offset_k(addr);
+   pud_t *pud = (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr);
+   pmd_t *pmd = (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr);
+
+   set_pmd(pmd, __pmd(0));
+}
+
+static void remove_highmap_2m_partial(unsigned long addr, unsigned long end)
+{
+   int i;
+   pgd_t *pgd = pgd_offset_k(addr);
+   pud_t *pud = (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr);
+   pmd_t *pmd = (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr);
+   pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd) + pte_index(addr);
+
+   for (i = pte_index(addr); i < pte_index(end - 1) + 1; i++, pte++)
+   set_pte(pte, __pte(0));
+}
+
+static void cleanup_highmap_late(unsigned long start, unsigned long end)
+{
+   unsigned long addr;
+   unsigned long start_2m_aligned = roundup(start, PMD_SIZE);
+   unsigned long end_2m_aligned = rounddown(end, PMD_SIZE);
+
+   start = PFN_ALIGN(start);
+   end &= PAGE_MASK;
+
+   if (start >= end)
+   return;
+
+   if (start < start_2m_aligned) {
+   unsigned long tmp = min(start_2m_aligned, end);
+
+   set_memory_4k(start, (tmp - start) >> PAGE_SHIFT);
+   remove_highmap_2m_partial(start, tmp);
+   }
+
+   for (addr = start_2m_aligned; addr < end_2m_aligned; addr += PMD_SIZE)
+   remove_highmap_2m(addr);
+
+   if (start <= end_2m_aligne

[PATCH 29/42] x86: Find correct 64 bit ramdisk address for microcode early update

2015-07-07 Thread Yinghai Lu
When using kexec with a 64bit kernel, bzImage and ramdisk could be
loaded above 4G. We need this to get the correct ramdisk address.

Make get_ramdisk_image() global and use it for early microcode updating.

-v2: update changelog.

Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/setup.h|  3 +++
 arch/x86/kernel/cpu/microcode/amd_early.c   | 10 +-
 arch/x86/kernel/cpu/microcode/intel_early.c |  8 
 arch/x86/kernel/setup.c | 28 ++--
 4 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 3e5aa41..496515b 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -119,6 +119,9 @@ void *extend_brk(size_t size, size_t align);
RESERVE_BRK(name, sizeof(type) * entries)
 
 extern void probe_roms(void);
+u64 get_ramdisk_image(struct boot_params *bp);
+u64 get_ramdisk_size(struct boot_params *bp);
+
 #ifdef __i386__
 
 asmlinkage void __init i386_start_kernel(void);
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c 
b/arch/x86/kernel/cpu/microcode/amd_early.c
index e8a215a..4c579c7 100644
--- a/arch/x86/kernel/cpu/microcode/amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -51,12 +51,12 @@ static struct cpio_data __init find_ucode_in_initrd(void)
 */
p   = (struct boot_params *)__pa_nodebug(&boot_params);
path= (char *)__pa_nodebug(ucode_path);
-   start   = (void *)p->hdr.ramdisk_image;
-   size= p->hdr.ramdisk_size;
+   start   = (void *)(unsigned long)get_ramdisk_image(p);
+   size= get_ramdisk_size(p);
 #else
path= ucode_path;
-   start   = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
-   size= boot_params.hdr.ramdisk_size;
+   start   = (void *)(get_ramdisk_image(&boot_params) + PAGE_OFFSET);
+   size= get_ramdisk_size(&boot_params);
 #endif
 
return find_cpio_data(path, start, size, &offset);
@@ -396,7 +396,7 @@ int __init save_microcode_in_initrd_amd(void)
 */
if (relocated_ramdisk)
container = (u8 *)(__va(relocated_ramdisk) +
-(cont - boot_params.hdr.ramdisk_image));
+(cont - get_ramdisk_size(&boot_params)));
else
container = cont_va;
 
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c 
b/arch/x86/kernel/cpu/microcode/intel_early.c
index 8187b72..c85dcb2 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -736,16 +736,16 @@ void __init load_ucode_intel_bsp(void)
struct boot_params *p;
 
p   = (struct boot_params *)__pa_nodebug(&boot_params);
-   start   = p->hdr.ramdisk_image;
-   size= p->hdr.ramdisk_size;
+   start   = get_ramdisk_image(p);
+   size= get_ramdisk_size(p);
 
_load_ucode_intel_bsp(
(struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
(unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
start, size);
 #else
-   start   = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
-   size= boot_params.hdr.ramdisk_size;
+   start   = get_ramdisk_image(&boot_params) + PAGE_OFFSET;
+   size= get_ramdisk_size(&boot_params);
 
_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
 #endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 80f874b..2d808e6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -300,19 +300,19 @@ u64 relocated_ramdisk;
 
 #ifdef CONFIG_BLK_DEV_INITRD
 
-static u64 __init get_ramdisk_image(void)
+u64 __init get_ramdisk_image(struct boot_params *bp)
 {
-   u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+   u64 ramdisk_image = bp->hdr.ramdisk_image;
 
-   ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
+   ramdisk_image |= (u64)bp->ext_ramdisk_image << 32;
 
return ramdisk_image;
 }
-static u64 __init get_ramdisk_size(void)
+u64 __init get_ramdisk_size(struct boot_params *bp)
 {
-   u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+   u64 ramdisk_size = bp->hdr.ramdisk_size;
 
-   ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
+   ramdisk_size |= (u64)bp->ext_ramdisk_size << 32;
 
return ramdisk_size;
 }
@@ -321,8 +321,8 @@ static u64 __init get_ramdisk_size(void)
 static void __init relocate_initrd(void)
 {
/* Assume only end is not page aligned */
-   u64 ramdisk_image = get_ramdisk_image();
-   u64 ramdisk_size  = get_ramdisk_size();
+   u64 ramdisk_image = get_ramdisk_image(&boot_params);
+   u64 ramdisk_size  = get_ramdisk_size(&boot_params);
u64 area_size = PAGE_ALIGN(ramdisk_size);
  

[PATCH 03/42] x86, boot: Fix run_size calculation

2015-07-07 Thread Yinghai Lu
While looking at the boot code to add mem mapping for kaslr
with 64bit above 4G support, I found that e6023367d779 ("x86, kaslr: Prevent
.bss from overlaping initrd") and later commits introduced a way to get the
kernel run_size and pass it around.  At first the run_size calculation was done
via perl, and it was then changed to shell scripts.

First, that calculation is not right in the shell scripts:
it uses the bss offset in the file plus the bss/brk section sizes.

   run_size=$(( $offsetA + $sizeA + $sizeB ))

Idx Name  Size  VMA   LMA   File off  Algn
...
 24 .bss  000a1000  825e  025e  019e  2**12
  ALLOC
 25 .brk  00026000  82681000  02681000  019e  2**0
  ALLOC

that run_size will be 27947008.

It includes extra size that is not needed, because:
1. we have holes between the sections in the file to keep them aligned in the file.
2. the start of text is at 0x20 in the elf file.

  [Nr] Name  Type Address   Offset
   Size  EntSize  Flags  Link  Info  Align
  ...
  [25] .bss  NOBITS   825e  019e
   000a1000    WA   0 0 4096
  [26] .brk  NOBITS   82681000  019e
   00026000    WA   0 0 1

Program Headers:
  Type   Offset VirtAddr   PhysAddr
 FileSizMemSiz  Flags  Align
  LOAD   0x0020 0x8100 0x0100
 0x013a9000 0x013a9000  R E20
  LOAD   0x0160 0x8240 0x0240
 0x000ed000 0x000ed000  RW 20
  LOAD   0x0180 0x 0x024ed000
 0x00013698 0x00013698  RW 20
  LOAD   0x01901000 0x82501000 0x02501000
 0x000df000 0x001a6000  RWE20
  NOTE   0x00e9d7dc 0x81c9d7dc 0x01c9d7dc
 0x0024 0x0024 4

 Section to Segment mapping:
  Segment Sections...
   00 .text .notes ..
   01 .data .vvar
   02 .data..percpu
   03 .init.text ... .bss .brk
   04 .notes

During decompress_kernel, parse_elf will move the sections forward to their
run time positions.

   parse_elf: [0x009a00-0x009b3a8fff] <=== [0x009a20-0x009b5a8fff]
   parse_elf: [0x009b40-0x009b4ecfff] <=== [0x009b60-0x009b6ecfff]
   parse_elf: [0x009b4ed000-0x009b500697] <=== [0x009b80-0x009b813697]
   parse_elf: [0x009b501000-0x009b5d] <=== [0x009b901000-0x009b9d]

Secondly, it is not necessary. As run_size is a simple constant, we don't
need to pass it around, and we already have voffset.h for that.

We can share voffset.h between misc.c and header.S instead of adding
other way to get run_size.

In this patch, we move the voffset.h creation code to boot/compressed/Makefile.

The dependency was:
boot/header.S ==> boot/voffset.h ==> vmlinux
boot/header.S ==> compressed/vmlinux ==> compressed/misc.c
Now it becomes:
boot/header.S ==> compressed/vmlinux ==> compressed/misc.c ==> boot/voffset.h ==> vmlinux

Use a macro in misc.c to replace the passed-in run_size.

Fixes: e6023367d779 ("x86, kaslr: Prevent .bss from overlaping initrd")
Cc: Junjie Mao 
Cc: Kees Cook 
Cc: Josh Triplett 
Cc: Matt Fleming 
Cc: Andrew Morton 
Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/Makefile| 11 +--
 arch/x86/boot/compressed/Makefile | 12 
 arch/x86/boot/compressed/misc.c   |  3 +++
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 57bbf2f..4d27e8b 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -77,15 +77,6 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
 
 SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
 
-sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define 
VO_\2 0x\1/p'
-
-quiet_cmd_voffset = VOFFSET $@
-  cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
-
-targets += voffset.h
-$(obj)/voffset.h: vmlinux FORCE
-   $(call if_changed,voffset)
-
 sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] 
\(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define
 ZO_\2 0x\1/p'
 
 quiet_cmd_zoffset = ZOFFSET $@
@@ -97,7 +88,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
 
 
 AFLAGS_header.o += -I$(obj)
-$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
+$(obj)/header.o: $(obj)/zoffset.h
 
 LDFLAGS_setup.elf  := -T
 $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
diff --git a/arch/x86/boot/compressed/Makefile 
b/arch/x86/boot/compressed/Makefile
in

[PATCH 21/42] x86, boot: Add more debug printout in compressed/misc.c

2015-07-07 Thread Yinghai Lu
With the support for using printf.c from the x86 setup code in place,
print out more debug info.

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/compressed/misc.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index ee73b7b..a428c03 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -344,7 +344,7 @@ static void parse_elf(void *output)
return;
}
 
-   debug_putstr("Parsing ELF... ");
+   debug_putstr("Parsing ELF...\n");
 
phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
if (!phdrs)
@@ -369,6 +369,11 @@ static void parse_elf(void *output)
 * Here dest is smaller than src always.
 */
memcpy(dest, output + phdr->p_offset, phdr->p_filesz);
+   debug_printf("   parse_elf: [0x%010lx-0x%010lx] <=== 
[0x%010lx-0x%010lx]\n",
+   (unsigned long)dest,
+   (unsigned long)dest + phdr->p_filesz - 1,
+   (unsigned long)output + phdr->p_offset,
+   (unsigned long)output + phdr->p_offset + 
phdr->p_filesz - 1);
break;
default: /* Ignore other PT_* */ break;
}
@@ -475,6 +480,11 @@ asmlinkage __visible void *decompress_kernel(void *rmode, 
memptr heap,
error("Wrong destination address");
 #endif
 
+   debug_printf("  decompress: [0x%010lx-0x%010lx] <=== 
[0x%010lx-0x%010lx]\n",
+   (unsigned long)output,
+   (unsigned long)output + output_len - 1,
+   (unsigned long)input_data,
+   (unsigned long)input_data + input_len - 1);
debug_putstr("\nDecompressing Linux... ");
decompress(input_data, input_len, NULL, NULL, output, NULL, error);
parse_elf(output);
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 32/42] x86, of: Let add_dtb reserve setup_data locally

2015-07-07 Thread Yinghai Lu
We will not reserve setup_data in generic code. Every handler needs to
reserve and copy setup_data locally.

The current dtb handling already has code for copying, so just add the reserve code.

Also simplify the code a bit by storing the real dtb size.

Cc: Rob Herring 
Cc: David Vrabel 
Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/prom.h  |  9 ++---
 arch/x86/kernel/devicetree.c | 39 +--
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 1d081ac..fb716eddc 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -24,17 +24,20 @@
 
 #ifdef CONFIG_OF
 extern int of_ioapic;
-extern u64 initial_dtb;
-extern void add_dtb(u64 data);
 void x86_of_pci_init(void);
 void x86_dtb_init(void);
 #else
-static inline void add_dtb(u64 data) { }
 static inline void x86_of_pci_init(void) { }
 static inline void x86_dtb_init(void) { }
 #define of_ioapic 0
 #endif
 
+#ifdef CONFIG_OF_FLATTREE
+extern void add_dtb(u64 data);
+#else
+static inline void add_dtb(u64 data) { }
+#endif
+
 extern char cmd_line[COMMAND_LINE_SIZE];
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 1f4acd6..19fb3cf 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -2,6 +2,7 @@
  * Architecture specific OF callbacks.
  */
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -23,7 +24,6 @@
 #include 
 #include 
 
-__initdata u64 initial_dtb;
 char __initdata cmd_line[COMMAND_LINE_SIZE];
 
 int __initdata of_ioapic;
@@ -43,11 +43,23 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 
align)
return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
 }
 
+#ifdef CONFIG_OF_FLATTREE
+static u64 initial_dtb __initdata;
+static u32 initial_dtb_size __initdata;
 void __init add_dtb(u64 data)
 {
+   u32 map_len;
+
initial_dtb = data + offsetof(struct setup_data, data);
-}
 
+   map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+   initial_boot_params = early_memremap(initial_dtb, map_len);
+   initial_dtb_size = of_get_flat_dt_size();
+   early_memunmap(initial_boot_params, map_len);
+   initial_boot_params = NULL;
+   memblock_reserve(initial_dtb, initial_dtb_size);
+}
+#endif
 /*
  * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
  */
@@ -265,31 +277,22 @@ static void __init dtb_apic_setup(void)
dtb_ioapic_setup();
 }
 
-#ifdef CONFIG_OF_FLATTREE
 static void __init x86_flattree_get_config(void)
 {
-   u32 size, map_len;
+#ifdef CONFIG_OF_FLATTREE
void *dt;
 
if (!initial_dtb)
return;
 
-   map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
-
-   initial_boot_params = dt = early_memremap(initial_dtb, map_len);
-   size = of_get_flat_dt_size();
-   if (map_len < size) {
-   early_memunmap(dt, map_len);
-   initial_boot_params = dt = early_memremap(initial_dtb, size);
-   map_len = size;
-   }
-
+   initial_boot_params = dt = early_memremap(initial_dtb,
+ initial_dtb_size);
unflatten_and_copy_device_tree();
-   early_memunmap(dt, map_len);
-}
-#else
-static inline void x86_flattree_get_config(void) { }
+   early_memunmap(dt, initial_dtb_size);
+
+   memblock_free(initial_dtb, initial_dtb_size);
 #endif
+}
 
 void __init x86_dtb_init(void)
 {
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 10/42] x86, 64bit: Set ident_mapping for kaslr

2015-07-07 Thread Yinghai Lu
Current aslr only supports randomization within a near range, and the new range
still uses the old mapping. Also it does not support a new range above 4G.

We need to have ident mapping for the new range before we can decompress
to the new output and later run from it.

In this patch, we add ident mapping for all needed ranges.

First, to allow aslr to put the random VO above 4G, we must set up ident
mapping for the new range when it comes via the startup_32 path.

Secondly, when booting from a 64bit bootloader, the bootloader sets up ident mapping
and boots via ZO (arch/x86/boot/compressed/vmlinux) startup_64.
The pages used for that pagetable need to be avoided when we select a new random
VO (vmlinux) base. Otherwise the decompressor would overwrite them during
decompression.
One way would be to walk through the pagetable and find every page that is used
by the pagetable for every mem_avoid check, but we would need extra code, and
might need to increase the mem_avoid array size to hold them.
The other way would be to create a new ident mapping instead; the pages for the
pagetable will come from the _pagetable section of ZO, and they are in the
mem_avoid array already. In this way, we can reuse the code for ident
mapping.

The _pgtable will be shared by the 32bit and 64bit paths to reduce init_size,
as ZO _rodata to _end now contributes to init_size.

We need to increase the pgt buffer size.
When booting via startup_64, we need to cover the old VO, params, cmdline
and the new VO; in the extreme case they could all cross a 512G boundary,
which will need (2+2)*4 pages with 2M mapping. We also need 2 pages for the
first 2M for vga ram, plus one for level4. The total will be 19 pages.
When booting via startup_32, aslr could move the new VO above 4G, so we need to
set an extra ident mapping for the new VO; the pgt buffer comes from _pgtable at
an offset of 6 pages. That should only need (2+2) pages at most when it crosses
a 512G boundary.
So 19 pages can make both paths happy.

Cc: Kees Cook 
Cc: Jiri Kosina 
Cc: Matt Fleming 
Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/compressed/Makefile   |  3 ++
 arch/x86/boot/compressed/aslr.c | 14 ++
 arch/x86/boot/compressed/head_64.S  |  4 +-
 arch/x86/boot/compressed/misc.h | 11 +
 arch/x86/boot/compressed/misc_pgt.c | 91 +
 arch/x86/include/asm/boot.h | 19 
 6 files changed, 140 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/boot/compressed/misc_pgt.c

diff --git a/arch/x86/boot/compressed/Makefile 
b/arch/x86/boot/compressed/Makefile
index e12a93c..66461b4 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -58,6 +58,9 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o 
$(obj)/misc.o \
 
 vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o
+ifdef CONFIG_X86_64
+   vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/misc_pgt.o
+endif
 
 $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
 
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index d753fb3..0990c78 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -151,6 +151,7 @@ static void mem_avoid_init(unsigned long input, unsigned 
long input_size,
 */
mem_avoid[0].start = input;
mem_avoid[0].size = (output + init_size) - input;
+   fill_pagetable(input, (output + init_size) - input);
 
/* Avoid initrd. */
initrd_start  = (u64)real_mode->ext_ramdisk_image << 32;
@@ -159,6 +160,7 @@ static void mem_avoid_init(unsigned long input, unsigned 
long input_size,
initrd_size |= real_mode->hdr.ramdisk_size;
mem_avoid[1].start = initrd_start;
mem_avoid[1].size = initrd_size;
+   /* don't need to set mapping for initrd */
 
/* Avoid kernel command line. */
cmd_line  = (u64)real_mode->ext_cmd_line_ptr << 32;
@@ -169,10 +171,19 @@ static void mem_avoid_init(unsigned long input, unsigned 
long input_size,
;
mem_avoid[2].start = cmd_line;
mem_avoid[2].size = cmd_line_size;
+   fill_pagetable(cmd_line, cmd_line_size);
 
/* Avoid params */
mem_avoid[3].start = (unsigned long)real_mode;
mem_avoid[3].size = sizeof(*real_mode);
+   fill_pagetable((unsigned long)real_mode, sizeof(*real_mode));
+
+   /* don't need to set mapping for setup_data */
+
+#ifdef CONFIG_X86_VERBOSE_BOOTUP
+   /* for video ram */
+   fill_pagetable(0, PMD_SIZE);
+#endif
 }
 
 /* Does this memory vector overlap a known avoided area? */
@@ -330,6 +341,9 @@ unsigned char *choose_kernel_location(unsigned char *input,
goto out;
 
choice = random;
+
+   fill_pagetable(choice, output_run_size);
+   switch_pagetable();
 out:
return (unsigned char *)choice;
 }
diff --git a/arch/x86/boot/compressed/head_64.S 
b/arch/x86/boot/compressed/head_64.S
index 3691451..075bb15 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arc

[PATCH 23/42] x86, setup: Use puts() instead of printf() in edd code

2015-07-07 Thread Yinghai Lu
There is no need to use printf() there: the strings take no format
arguments, so puts() is enough.

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/edd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 223e425..88d7c7f 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -157,7 +157,7 @@ void query_edd(void)
 */
 
if (!be_quiet)
-   printf("Probing EDD (edd=off to disable)... ");
+   puts("Probing EDD (edd=off to disable)... ");
 
for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) {
/*
@@ -176,7 +176,7 @@ void query_edd(void)
}
 
if (!be_quiet)
-   printf("ok\n");
+   puts("ok\n");
 }
 
 #endif
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 28/42] x86, boot: Allow 64bit EFI kernel to be loaded above 4G

2015-07-07 Thread Yinghai Lu
We can now use kexec to place kernel/boot_params/cmd_line/initrd
above 4G, but only via the legacy interface that enters startup_64 directly.

This patch will allow 64bit EFI kernel to be loaded above 4G
and use EFI HANDOVER PROTOCOL to start the kernel.

Currently the 32bit code32_start field is used for passing around the load
address, so it will overflow when the kernel is loaded above 4G.

The patch mainly adds ext_code32_start to hold the high 32 bits of the load
address.

After this patch, a patched grub2-x86_64.efi can be used to place
kernel/boot_params/cmd_line/initrd all above 4G and execute the kernel
above 4G.

bootlog like:

kernel: done   [ linux  9.25MiB  100%  6.66MiB/s ]
params: [1618fc000,1618f]
cmdline: [1618fb000,1618fb7fe]
kernel: [15e00,161385fff]
initrd: [15bcbe000,15dbb]
initrd: 1 file done [ initrd.img  35.26MiB  100%  11.93MiB/s ]
early console in decompress_kernel
decompress_kernel:
  input: [0x15fd0b3b4-0x16063c803], output: 0x15e00, heap: 
[0x160645b00-0x16064daff]

Decompressing Linux... xz... Parsing ELF... done.
Booting the kernel.
[0.00] bootconsole [uart0] enabled
[0.00]real_mode_data :  phys 0001618fc000
[0.00]real_mode_data :  virt 8801618fc000
[0.00] Kernel Layout:
[0.00]   .text: [0x15e00-0x15f08f72c]
[0.00] .rodata: [0x15f20-0x15fa44fff]
[0.00]   .data: [0x15fc0-0x15fe545ff]
[0.00]   .init: [0x15fe56000-0x16021afff]
[0.00].bss: [0x160229000-0x16135]
[0.00].brk: [0x16136-0x161385fff]
[0.00] memblock_reserve: [0x09f000-0x0f] flags 0x0 
* BIOS reserved
...
[0.00] memblock_reserve: [0x015e00-0x016135] flags 0x0 
TEXT DATA BSS
[0.00] memblock_reserve: [0x015bcbe000-0x015dff] flags 0x0 
RAMDISK

-v2: add cast to avoid warning with 32bit, also update description for
 ext_code32_start in boot.txt
-v3: change to 4.0 from 3.20.

Signed-off-by: Yinghai Lu 
---
 Documentation/x86/boot.txt| 19 +++
 arch/x86/boot/compressed/eboot.c  | 15 ++-
 arch/x86/boot/compressed/head_64.S|  7 ++-
 arch/x86/boot/header.S|  3 ++-
 arch/x86/include/uapi/asm/bootparam.h |  1 +
 arch/x86/kernel/asm-offsets.c |  1 +
 6 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 9da6f35..90efaa2 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -61,6 +61,9 @@ Protocol 2.12:(Kernel 3.8) Added the xloadflags field 
and extension fields
to struct boot_params for loading bzImage and ramdisk
above 4G in 64bit.
 
+Protocol 2.14: (Kernel 4.0) Added the ext_code32_start to support 64bit
+   EFI kernel to be loaded above 4G.
+
  MEMORY LAYOUT
 
 The traditional memory map for the kernel loader, used for Image or
@@ -197,6 +200,7 @@ Offset  Proto   NameMeaning
 0258/8 2.10+   pref_addressPreferred loading address
 0260/4 2.10+   init_size   Linear memory required during initialization
 0264/4 2.11+   handover_offset Offset of handover entry point
+0268/4 2.14+   ext_code32_startExtended part for code32_start
 
 (1) For backwards compatibility, if the setup_sects field contains 0, the
 real value is 4.
@@ -744,6 +748,14 @@ Offset/size:   0x264/4
 
   See EFI HANDOVER PROTOCOL below for more details.
 
+Field name:ext_code32_start
+Type:  modify (optional, reloc)
+Offset/size:   0x268/4
+Protocol:  2.14+
+
+  This field is the upper 32bits of load address when EFI 64bit kernel
+  is loaded above 4G. And it is used with code32_start to compare to
+  pref_address to decide if kernel need to be relocated further.
 
  THE IMAGE CHECKSUM
 
@@ -1127,4 +1139,11 @@ The boot loader *must* fill out the following fields in 
bp,
 o hdr.ramdisk_image (if applicable)
 o hdr.ramdisk_size  (if applicable)
 
+for 64bit, when loading above 4G, *must* fill out the following fields,
+
+o hdr.ext_code32_start
+o ext_cmd_line_ptr
+o ext_ramdisk_image (if applicable)
+o ext_ramdisk_size  (if applicable)
+
 All other fields should be zero.
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 2c82bd1..05d77a5 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1394,6 +1394,7 @@ struct boot_params *efi_main(struct efi_config *c,
void *handle;
efi_system_table_t *_table;
bool is64;
+   unsigned long loaded_addr;
 
efi_early = c;
 
@@ -1435,9 +1436,12 @@ struct boot_params *efi_main(struct efi_config *c,
 * If the kernel isn't already loaded at the preferred load
 * address, relocate it.
 */
-   if (hdr->pref_address != hdr->code32_start) {
-   unsigned long bzimage

[PATCH 17/42] x86, kaslr: Add support of kernel physical address randomization above 4G

2015-07-07 Thread Yinghai Lu
From: Baoquan He 

In the kaslr implementation, process_e820_entry() and slots_fetch_random()
do most of the work. process_e820_entry() is responsible for storing the
slot information, and slots_fetch_random() takes care of fetching it.
To add support for kernel physical address randomization above 4G, this
patch changes both functions to use the new slot_area data structure.

Now the kernel can be reloaded and decompressed anywhere in the whole
physical memory, up to nearly 64T.

Signed-off-by: Baoquan He 
---
 arch/x86/boot/compressed/aslr.c | 68 ++---
 1 file changed, 51 insertions(+), 17 deletions(-)

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 554b637..9158882 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -293,27 +293,40 @@ static void slots_append(unsigned long addr)
 
 static unsigned long slots_fetch_random(void)
 {
+   unsigned long random;
+   int i;
+
/* Handle case of no slots stored. */
if (slot_max == 0)
return 0;
 
-   return slots[get_random_long() % slot_max];
+   random = get_random_long() % slot_max;
+
+   for (i = 0; i < slot_area_index; i++) {
+   if (random >= slot_areas[i].num) {
+   random -= slot_areas[i].num;
+   continue;
+   }
+   return slot_areas[i].addr + random * CONFIG_PHYSICAL_ALIGN;
+   }
+
+   if (i == slot_area_index)
+   debug_putstr("Something wrong happened in 
slots_fetch_random()...\n");
+   return 0;
 }
 
 static void process_e820_entry(struct e820entry *entry,
   unsigned long minimum,
   unsigned long image_size)
 {
-   struct mem_vector region, img;
+   struct mem_vector region, out;
+   struct slot_area slot_area;
+   unsigned long min, start_orig;
 
/* Skip non-RAM entries. */
if (entry->type != E820_RAM)
return;
 
-   /* Ignore entries entirely above our maximum. */
-   if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
-   return;
-
/* Ignore entries entirely below our minimum. */
if (entry->addr + entry->size < minimum)
return;
@@ -321,10 +334,17 @@ static void process_e820_entry(struct e820entry *entry,
region.start = entry->addr;
region.size = entry->size;
 
+repeat:
+   start_orig = region.start;
+
/* Potentially raise address to minimum location. */
if (region.start < minimum)
region.start = minimum;
 
+   /* Return if slot area array is full */
+   if (slot_area_index == MAX_SLOT_AREA)
+   return;
+
/* Potentially raise address to meet alignment requirements. */
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
 
@@ -333,20 +353,30 @@ static void process_e820_entry(struct e820entry *entry,
return;
 
/* Reduce size by any delta from the original address. */
-   region.size -= region.start - entry->addr;
+   region.size -= region.start - start_orig;
 
-   /* Reduce maximum size to fit end of image within maximum limit. */
-   if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
-   region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start;
+   /* Return if region can't contain decompressed kernel */
+   if (region.size < image_size)
+   return;
 
-   /* Walk each aligned slot and check for avoided areas. */
-   for (img.start = region.start, img.size = image_size ;
-mem_contains(®ion, &img) ;
-img.start += CONFIG_PHYSICAL_ALIGN) {
-   if (mem_avoid_overlap(&img))
-   continue;
-   slots_append(img.start);
+   if (!mem_avoid_overlap(®ion)) {
+   store_slot_info(®ion, image_size);
+   return;
}
+
+   min = mem_min_overlap(®ion, &out);
+
+   if (min > region.start + image_size) {
+   struct mem_vector tmp;
+
+   tmp.start = region.start;
+   tmp.size = min - region.start;
+   store_slot_info(&tmp, image_size);
+   }
+
+   region.size -= out.start - region.start + out.size;
+   region.start = out.start + out.size;
+   goto repeat;
 }
 
 static unsigned long find_random_phy_addr(unsigned long minimum,
@@ -361,6 +391,10 @@ static unsigned long find_random_phy_addr(unsigned long 
minimum,
/* Verify potential e820 positions, appending to slots list. */
for (i = 0; i < real_mode->e820_entries; i++) {
process_e820_entry(&real_mode->e820_map[i], minimum, size);
+   if (slot_area_index == MAX_SLOT_AREA) {
+   debug_putstr("Stop processing e820 since slot_areas is 
full...\n");
+ 

[PATCH 36/42] x86, boot, PCI: Copy SETUP_PCI rom to kernel space

2015-07-07 Thread Yinghai Lu
The EFI stub code could put the SETUP_PCI ROM data high in memory on 32bit,
or on a 64bit configuration with exactmap=.

Check whether the range is mapped; if it is not, allocate a new buffer and
copy the ROM data into it, so that we can access the data directly.

Signed-off-by: Yinghai Lu 
---
 arch/x86/pci/common.c | 47 +--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 32d4f21..4d6b128 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -668,6 +668,48 @@ struct firmware_setup_pci_entry {
 
 static LIST_HEAD(setup_pci_entries);
 
+static phys_addr_t check_copy(phys_addr_t start, unsigned long size)
+{
+   unsigned long start_pfn = PFN_DOWN(start);
+   unsigned long end_pfn = PFN_UP(start + size);
+   unsigned char *p, *q;
+   phys_addr_t pa_p, pa_q;
+   long sz = size;
+
+   if (pfn_range_is_mapped(start_pfn, end_pfn))
+   return start;
+
+   /* allocate and copy */
+   pa_p = memblock_alloc(size, PAGE_SIZE);
+   if (!pa_p)
+   return start;
+
+   p = phys_to_virt(pa_p);
+
+   pa_q = start;
+   while (sz > 0) {
+   long chunk_size = 64<<10;
+
+   if (chunk_size > sz)
+   chunk_size = sz;
+
+   q = early_memremap(pa_q, chunk_size);
+   if (!q) {
+   memblock_free(pa_p, size);
+   return start;
+   }
+   memcpy(p, q, chunk_size);
+   early_memunmap(q, chunk_size);
+   p += chunk_size;
+   pa_q += chunk_size;
+   sz -= chunk_size;
+   }
+
+   memblock_free(start, size);
+
+   return pa_p;
+}
+
 int __init fill_setup_pci_entries(void)
 {
struct setup_data *data;
@@ -697,8 +739,9 @@ int __init fill_setup_pci_entries(void)
entry->vendor = rom->vendor;
entry->devid = rom->devid;
entry->pcilen = rom->pcilen;
-   entry->romdata = pa_data +
-offsetof(struct pci_setup_rom, romdata);
+   entry->romdata = check_copy(pa_data +
+ offsetof(struct pci_setup_rom, romdata),
+ rom->pcilen);
 
list_add(&entry->list, &setup_pci_entries);
 
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 34/42] x86: Kill not used setup_data handling code

2015-07-07 Thread Yinghai Lu
Cc: Matt Fleming 
Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/kdebugfs.c | 142 -
 arch/x86/kernel/setup.c|  17 --
 2 files changed, 159 deletions(-)

diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index dc1404b..c8ca86c 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -21,142 +21,6 @@ struct dentry *arch_debugfs_dir;
 EXPORT_SYMBOL(arch_debugfs_dir);
 
 #ifdef CONFIG_DEBUG_BOOT_PARAMS
-struct setup_data_node {
-   u64 paddr;
-   u32 type;
-   u32 len;
-};
-
-static ssize_t setup_data_read(struct file *file, char __user *user_buf,
-  size_t count, loff_t *ppos)
-{
-   struct setup_data_node *node = file->private_data;
-   unsigned long remain;
-   loff_t pos = *ppos;
-   struct page *pg;
-   void *p;
-   u64 pa;
-
-   if (pos < 0)
-   return -EINVAL;
-
-   if (pos >= node->len)
-   return 0;
-
-   if (count > node->len - pos)
-   count = node->len - pos;
-
-   pa = node->paddr + sizeof(struct setup_data) + pos;
-   pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
-   if (PageHighMem(pg)) {
-   p = ioremap_cache(pa, count);
-   if (!p)
-   return -ENXIO;
-   } else
-   p = __va(pa);
-
-   remain = copy_to_user(user_buf, p, count);
-
-   if (PageHighMem(pg))
-   iounmap(p);
-
-   if (remain)
-   return -EFAULT;
-
-   *ppos = pos + count;
-
-   return count;
-}
-
-static const struct file_operations fops_setup_data = {
-   .read   = setup_data_read,
-   .open   = simple_open,
-   .llseek = default_llseek,
-};
-
-static int __init
-create_setup_data_node(struct dentry *parent, int no,
-  struct setup_data_node *node)
-{
-   struct dentry *d, *type, *data;
-   char buf[16];
-
-   sprintf(buf, "%d", no);
-   d = debugfs_create_dir(buf, parent);
-   if (!d)
-   return -ENOMEM;
-
-   type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
-   if (!type)
-   goto err_dir;
-
-   data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
-   if (!data)
-   goto err_type;
-
-   return 0;
-
-err_type:
-   debugfs_remove(type);
-err_dir:
-   debugfs_remove(d);
-   return -ENOMEM;
-}
-
-static int __init create_setup_data_nodes(struct dentry *parent)
-{
-   struct setup_data_node *node;
-   struct setup_data *data;
-   int error;
-   struct dentry *d;
-   struct page *pg;
-   u64 pa_data;
-   int no = 0;
-
-   d = debugfs_create_dir("setup_data", parent);
-   if (!d)
-   return -ENOMEM;
-
-   pa_data = boot_params.hdr.setup_data;
-
-   while (pa_data) {
-   node = kmalloc(sizeof(*node), GFP_KERNEL);
-   if (!node) {
-   error = -ENOMEM;
-   goto err_dir;
-   }
-
-   pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
-   if (PageHighMem(pg)) {
-   data = ioremap_cache(pa_data, sizeof(*data));
-   if (!data) {
-   kfree(node);
-   error = -ENXIO;
-   goto err_dir;
-   }
-   } else
-   data = __va(pa_data);
-
-   node->paddr = pa_data;
-   node->type = data->type;
-   node->len = data->len;
-   error = create_setup_data_node(d, no, node);
-   pa_data = data->next;
-
-   if (PageHighMem(pg))
-   iounmap(data);
-   if (error)
-   goto err_dir;
-   no++;
-   }
-
-   return 0;
-
-err_dir:
-   debugfs_remove(d);
-   return error;
-}
-
 static struct debugfs_blob_wrapper boot_params_blob = {
.data   = &boot_params,
.size   = sizeof(boot_params),
@@ -181,14 +45,8 @@ static int __init boot_params_kdebugfs_init(void)
if (!data)
goto err_version;
 
-   error = create_setup_data_nodes(dbp);
-   if (error)
-   goto err_data;
-
return 0;
 
-err_data:
-   debugfs_remove(data);
 err_version:
debugfs_remove(version);
 err_dir:
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index de0f830..35d9ff5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -465,20 +465,6 @@ static void __init parse_setup_data(void)
boot_params.hdr.setup_data = 0; /* all done */
 }
 
-static void __init memblock_x86_reserve_range_setup_data(void)
-{
- 

[PATCH 38/42] x86: Fix typo in mark_rodata_ro

2015-07-07 Thread Yinghai Lu
The comment should refer to cleanup_highmap().
Also remove the unneeded cast of _brk_end, as it is already
unsigned long.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 257ba4b..3b7453a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1054,9 +1054,9 @@ void mark_rodata_ro(void)
 * of the PMD will remain mapped executable.
 *
 * Any PMD which was setup after the one which covers _brk_end
-* has been zapped already via cleanup_highmem().
+* has been zapped already via cleanup_highmap().
 */
-   all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
+   all_end = roundup(_brk_end, PMD_SIZE);
set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
 
rodata_test();
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 30/42] x86: Kill E820_RESERVED_KERN

2015-07-07 Thread Yinghai Lu
We now use memblock to do early resource reservation/allocation
instead of using the e820 map directly, and setup_data is already
reserved in memblock early on.
Also, kexec generates setup_data and passes a pointer to it to the second
kernel, so the second kernel reserves setup_data on its own.
(kexec-tools now creates SETUP_EFI and SETUP_E820_EXT.)

We can kill E820_RESERVED_KERN and not touch e820 map at all.

That will fix a bug in e820_mark_nosave_regions(), which cannot handle the
case where E820_RAM and E820_RESERVED_KERN ranges are contiguous and the
boundary between them is not page aligned.

Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=913885
Reported-by: "Lee, Chun-Yi" 
Tested-by: "Lee, Chun-Yi" 
Cc: "Lee, Chun-Yi" 
Signed-off-by: Yinghai Lu 
Cc: sta...@vger.kernel.org
---
 arch/x86/include/uapi/asm/e820.h |  8 
 arch/x86/kernel/e820.c   |  6 ++
 arch/x86/kernel/setup.c  | 25 -
 arch/x86/kernel/tboot.c  |  3 +--
 arch/x86/mm/init_64.c| 11 ---
 5 files changed, 7 insertions(+), 46 deletions(-)

diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 0f457e6..a9216a1 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -45,14 +45,6 @@
  */
 #define E820_PRAM  12
 
-/*
- * reserved RAM used by kernel itself
- * if CONFIG_INTEL_TXT is enabled, memory of this type will be
- * included in the S3 integrity calculation and so should not include
- * any memory that BIOS might alter over the S3 transition
- */
-#define E820_RESERVED_KERN128
-
 #ifndef __ASSEMBLY__
 #include 
 struct e820entry {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46ec08d..49d8c50 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -134,7 +134,6 @@ static void __init e820_print_type(u32 type)
 {
switch (type) {
case E820_RAM:
-   case E820_RESERVED_KERN:
printk(KERN_CONT "usable");
break;
case E820_RESERVED:
@@ -693,7 +692,7 @@ void __init e820_mark_nosave_regions(unsigned long 
limit_pfn)
 
pfn = PFN_DOWN(ei->addr + ei->size);
 
-   if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+   if (ei->type != E820_RAM)
register_nosave_region(PFN_UP(ei->addr), pfn);
 
if (pfn >= limit_pfn)
@@ -910,7 +909,6 @@ void __init finish_e820_parsing(void)
 static inline const char *e820_type_to_string(int e820_type)
 {
switch (e820_type) {
-   case E820_RESERVED_KERN:
case E820_RAM:  return "System RAM";
case E820_ACPI: return "ACPI Tables";
case E820_NVS:  return "ACPI Non-volatile Storage";
@@ -1107,7 +1105,7 @@ void __init memblock_x86_fill(void)
if (end != (resource_size_t)end)
continue;
 
-   if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+   if (ei->type != E820_RAM)
continue;
 
memblock_add(ei->addr, ei->size);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2d808e6..a3b65f1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -457,29 +457,6 @@ static void __init parse_setup_data(void)
}
 }
 
-static void __init e820_reserve_setup_data(void)
-{
-   struct setup_data *data;
-   u64 pa_data;
-
-   pa_data = boot_params.hdr.setup_data;
-   if (!pa_data)
-   return;
-
-   while (pa_data) {
-   data = early_memremap(pa_data, sizeof(*data));
-   e820_update_range(pa_data, sizeof(*data)+data->len,
-E820_RAM, E820_RESERVED_KERN);
-   pa_data = data->next;
-   early_memunmap(data, sizeof(*data));
-   }
-
-   sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-   memcpy(&e820_saved, &e820, sizeof(struct e820map));
-   printk(KERN_INFO "extended physical RAM map:\n");
-   e820_print_map("reserve setup_data");
-}
-
 static void __init memblock_x86_reserve_range_setup_data(void)
 {
struct setup_data *data;
@@ -1018,8 +995,6 @@ void __init setup_arch(char **cmdline_p)
early_dump_pci_devices();
 #endif
 
-   /* update the e820_saved too */
-   e820_reserve_setup_data();
finish_e820_parsing();
 
if (efi_enabled(EFI_BOOT))
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 91a4496..3c2752a 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -195,8 +195,7 @@ static int tboot_setup_sleep(void)
tboot->num_mac_regions = 0;
 
for (i = 0; i < e820.nr_map; i++) {
-   if ((e820.map[i].type != E820_RAM)
-&& (e820.map[i].type != E820_RESERVED

[PATCH 41/42] x86, 64bit: Add __pa_high/__va_high

2015-07-07 Thread Yinghai Lu
and use it to make the early page table setup code more readable,
as we are using kernel high mapping address.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/head64.c | 15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index a9f0299..cd0a820 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -37,6 +37,9 @@ extern pmd_t 
early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
 static unsigned int __initdata next_early_pgt = 2;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 
+#define __va_high(x) ((void *)((unsigned long)(x) + __START_KERNEL_map - 
phys_base))
+#define __pa_high(x) ((unsigned long)(x) - __START_KERNEL_map + phys_base)
+
 /* Wipe all early page tables except for the kernel symbol map */
 static void __init reset_early_page_tables(void)
 {
@@ -47,7 +50,7 @@ static void __init reset_early_page_tables(void)
 
next_early_pgt = 0;
 
-   write_cr3(__pa_nodebug(early_level4_pgt));
+   write_cr3(__pa_high(early_level4_pgt));
 }
 
 /* Create a new PMD entry */
@@ -60,7 +63,7 @@ int __init early_make_pgtable(unsigned long address)
pmdval_t pmd, *pmd_p;
 
/* Invalid address or early pgt is done ?  */
-   if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+   if (physaddr >= MAXMEM || read_cr3() != __pa_high(early_level4_pgt))
return -1;
 
 again:
@@ -73,7 +76,7 @@ again:
 * range and we might end up looping forever...
 */
if (pgd)
-   pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map 
- phys_base);
+   pud_p = (pudval_t *)__va_high(pgd & PTE_PFN_MASK);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
@@ -83,13 +86,13 @@ again:
pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
for (i = 0; i < PTRS_PER_PUD; i++)
pud_p[i] = 0;
-   *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + 
_KERNPG_TABLE;
+   *pgd_p = __pa_high(pud_p) + _KERNPG_TABLE;
}
pud_p += pud_index(address);
pud = *pud_p;
 
if (pud)
-   pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map 
- phys_base);
+   pmd_p = (pmdval_t *)__va_high(pud & PTE_PFN_MASK);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
@@ -99,7 +102,7 @@ again:
pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
for (i = 0; i < PTRS_PER_PMD; i++)
pmd_p[i] = 0;
-   *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + 
_KERNPG_TABLE;
+   *pud_p = __pa_high(pmd_p) + _KERNPG_TABLE;
}
pmd = (physaddr & PMD_MASK) + early_pmd_flags;
pmd_p[pmd_index(address)] = pmd;
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 24/42] x86: Setup early console as early as possible in x86_start_kernel()

2015-07-07 Thread Yinghai Lu
Parse "console=uart8250,io,0x3f8,115200n8" in
i386_start_kernel()/x86_64_start_kernel(), and call
setup_early_serial8250_console() to initialize the early serial console.

Only the I/O port kind of 8250 can be handled, because mmio needs ioremap,
which is not available this early.

Use boot_params.hdr.version instead of adding another variable, as suggested
by hpa.
This patch also needs to be applied after the x86 memblock patchset.

Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/setup.h |  2 ++
 arch/x86/kernel/head.c   | 26 ++
 arch/x86/kernel/head32.c |  1 +
 arch/x86/kernel/head64.c |  5 -
 drivers/tty/serial/8250/8250_early.c | 17 +
 kernel/printk/printk.c   | 11 +++
 6 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 11af24e..3e5aa41 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -40,6 +40,8 @@ static inline void vsmp_init(void) { }
 void setup_bios_corruption_check(void);
 
 extern unsigned long saved_video_mode;
+int setup_early_serial8250_console(char *cmdline);
+void setup_early_console(void);
 
 extern void reserve_standard_io_resources(void);
 extern void i386_reserve_resources(void);
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 992f442..cc0cd83 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -69,3 +69,29 @@ void __init reserve_ebda_region(void)
/* reserve all memory between lowmem and the 1MB mark */
memblock_reserve(lowmem, 0x10 - lowmem);
 }
+
+void __init setup_early_console(void)
+{
+#ifdef CONFIG_SERIAL_8250_CONSOLE
+   char constr[64], *p, *q;
+
+   /* Can not handle mmio type 8250 uart yet, too early */
+   p = strstr(boot_command_line, "console=uart8250,io,");
+   if (!p)
+   p = strstr(boot_command_line, "console=uart,io,");
+   if (!p)
+   return;
+
+   p += 8; /* sizeof "console=" */
+   q = strchrnul(p, ' ');
+   if ((q - p) >= sizeof(constr))
+   return;
+
+   memset(constr, 0, sizeof(constr));
+   memcpy(constr, p, q - p);
+
+   lockdep_init();
+
+   setup_early_serial8250_console(constr);
+#endif
+}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 2911ef3..87ddca1 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -33,6 +33,7 @@ asmlinkage __visible void __init i386_start_kernel(void)
 {
cr4_init_shadow();
sanitize_boot_params(&boot_params);
+   setup_early_console();
 
/* Call the subarch specific early setup function */
switch (boot_params.hdr.hardware_subarch) {
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5a46681..44dc63b 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -171,6 +171,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * 
real_mode_data)
load_idt((const struct desc_ptr *)&idt_descr);
 
copy_bootdata(__va(real_mode_data));
+   setup_early_console();
 
/*
 * Load microcode early on BSP.
@@ -189,8 +190,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char 
* real_mode_data)
 void __init x86_64_start_reservations(char *real_mode_data)
 {
/* version is always not zero if it is copied */
-   if (!boot_params.hdr.version)
+   if (!boot_params.hdr.version) {
copy_bootdata(__va(real_mode_data));
+   setup_early_console();
+   }
 
reserve_ebda_region();
 
diff --git a/drivers/tty/serial/8250/8250_early.c 
b/drivers/tty/serial/8250/8250_early.c
index 771dda2..8a7fe75 100644
--- a/drivers/tty/serial/8250/8250_early.c
+++ b/drivers/tty/serial/8250/8250_early.c
@@ -152,3 +152,20 @@ int __init early_serial8250_setup(struct earlycon_device 
*device,
 }
 EARLYCON_DECLARE(uart8250, early_serial8250_setup);
 EARLYCON_DECLARE(uart, early_serial8250_setup);
+
+/* for x86 early early console */
+int __init setup_early_serial8250_console(char *cmdline)
+{
+   char *options;
+
+   options = strstr(cmdline, "uart8250,");
+   if (options)
+   return setup_earlycon(options);
+
+   options = strstr(cmdline, "uart,");
+   if (options)
+   return setup_earlycon(options);
+
+   return 0;
+}
+
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cf8c242..f554c5f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2454,11 +2454,14 @@ void register_console(struct console *newcon)
struct console_cmdline *c;
 
if (console_drivers)
-   for_each_console(bcon)
-   if (WARN(bcon == newcon,
-   "console '%s%d' already registered\n",
-   bcon->name, bcon->index))

[PATCH 16/42] x86, kaslr: Randomize physical and virtual address of kernel separately

2015-07-07 Thread Yinghai Lu
From: Baoquan He 

On x86_64, the old kaslr implementation randomizes only the physical
address at which the kernel is loaded. It then calculates the delta between
the physical address vmlinux was linked to load at and the address where it
is finally loaded. If the delta is not 0, i.e. the kernel is actually
decompressed at a new physical address, relocation handling needs to be
done: the delta is added to the offset of each kernel symbol relocation,
which moves the address of the kernel text mapping by delta.

Here the behavior is changed. Randomize both the physical address
where kernel is decompressed and the virtual address where kernel text
is mapped. And relocation handling only depends on virtual address
randomization. Means if and only if virtual address is randomized to
a different value, we add the delta to the offset of kernel relocs.

Note that, for now, neither the virtual offset nor the physical address
randomization can exceed CONFIG_RANDOMIZE_BASE_MAX_OFFSET, namely 1G.

Signed-off-by: Baoquan He 
---
 arch/x86/boot/compressed/aslr.c | 46 +
 arch/x86/boot/compressed/misc.c | 39 --
 arch/x86/boot/compressed/misc.h | 19 +
 3 files changed, 58 insertions(+), 46 deletions(-)

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 775c6f9..554b637 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -349,7 +349,7 @@ static void process_e820_entry(struct e820entry *entry,
}
 }
 
-static unsigned long find_random_addr(unsigned long minimum,
+static unsigned long find_random_phy_addr(unsigned long minimum,
  unsigned long size)
 {
int i;
@@ -387,23 +387,24 @@ static unsigned long find_random_virt_offset(unsigned 
long minimum,
return random * CONFIG_PHYSICAL_ALIGN + minimum;
 }
 
-unsigned char *choose_kernel_location(unsigned char *input,
- unsigned long input_size,
- unsigned char *output,
- unsigned long output_run_size)
+void choose_kernel_location(unsigned char *input,
+   unsigned long input_size,
+   unsigned char **output,
+   unsigned long output_run_size,
+   unsigned char **virt_offset)
 {
-   unsigned long choice = (unsigned long)output;
unsigned long random;
+   *virt_offset = (unsigned char *)LOAD_PHYSICAL_ADDR;
 
 #ifdef CONFIG_HIBERNATION
if (!cmdline_find_option_bool("kaslr")) {
debug_putstr("KASLR disabled by default...\n");
-   goto out;
+   return;
}
 #else
if (cmdline_find_option_bool("nokaslr")) {
debug_putstr("KASLR disabled by cmdline...\n");
-   goto out;
+   return;
}
 #endif
 
@@ -411,23 +412,24 @@ unsigned char *choose_kernel_location(unsigned char 
*input,
 
/* Record the various known unsafe memory ranges. */
mem_avoid_init((unsigned long)input, input_size,
-  (unsigned long)output);
+  (unsigned long)*output);
 
/* Walk e820 and find a random address. */
-   random = find_random_addr(choice, output_run_size);
-   if (!random) {
+   random = find_random_phy_addr((unsigned long)*output, output_run_size);
+   if (!random)
debug_putstr("KASLR could not find suitable E820 region...\n");
-   goto out;
+   else {
+   if ((unsigned long)*output != random) {
+   fill_pagetable(random, output_run_size);
+   switch_pagetable();
+   *output = (unsigned char *)random;
+   }
}
 
-   /* Always enforce the minimum. */
-   if (random < choice)
-   goto out;
-
-   choice = random;
-
-   fill_pagetable(choice, output_run_size);
-   switch_pagetable();
-out:
-   return (unsigned char *)choice;
+   /*
+* Get a random address between LOAD_PHYSICAL_ADDR and
+* CONFIG_RANDOMIZE_BASE_MAX_OFFSET
+*/
+   random = find_random_virt_offset(LOAD_PHYSICAL_ADDR, output_run_size);
+   *virt_offset = (unsigned char *)random;
 }
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index bfa4f0a..6b2a308 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -226,7 +226,8 @@ void error(char *x)
 }
 
 #if CONFIG_X86_NEED_RELOCS
-static void handle_relocations(void *output, unsigned long output_len)
+static void handle_relocations(void *output, unsigned long output_len,
+  void *virt_offset)
 {
int *reloc;
unsigned long delta, map, ptr;
@@ -238,11 +239,6 @@ static void handle_relocations(void *output, unsigned long 
output_len)
   

[PATCH 07/42] x86, boot: Move z_extract_offset calculation to header.S

2015-07-07 Thread Yinghai Lu
The old extract_offset calculation is done without knowledge of the
decompressor size, so it guesses one big size.

We can move it to header.S, where we have exact decompressor size.

We save 8 pages for init_size with this patch.

before patch:
kernel: [13e00,13fa1dfff]
  input: [0x13f32d3b4-0x13fa01cc7], output: [0x13e00-0x13f9ef81f], heap: 
[0x13fa0b680-0x13fa1367f]

after patch:
kernel: [13e00,13fa15fff]
  input: [0x13f3253b4-0x13f9f9cc7], output: [0x13e00-0x13f9ef81f], heap: 
[0x13fa03680-0x13fa0b67f]

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/Makefile |  2 +-
 arch/x86/boot/compressed/misc.c|  5 +
 arch/x86/boot/compressed/mkpiggy.c | 16 +---
 arch/x86/boot/header.S | 29 +
 4 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 4d27e8b..e7196cf 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -77,7 +77,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
 
 SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
 
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] 
\(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define
 ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] 
\(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define
 ZO_\2 0x\1/p'
 
 quiet_cmd_zoffset = ZOFFSET $@
   cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 1c03098..db97bdf 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -84,13 +84,10 @@
  * To avoid problems with the compressed data's meta information an extra 18
  * bytes are needed.  Leading to the formula:
  *
- * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
+ * extra_bytes = (uncompressed_size >> 12) + 32768 + 18.
  *
  * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
  * Adding 32768 instead of 32767 just makes for round numbers.
- * Adding the decompressor_size is necessary as it musht live after all
- * of the data as well.  Last I measured the decompressor is about 14K.
- * 10K of actual data and 4K of bss.
  *
  */
 
diff --git a/arch/x86/boot/compressed/mkpiggy.c 
b/arch/x86/boot/compressed/mkpiggy.c
index c03b009..c5148642 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -21,8 +21,7 @@
  * --- */
 
 /*
- * Compute the desired load offset from a compressed program; outputs
- * a small assembly wrapper with the appropriate symbols defined.
+ * outputs a small assembly wrapper with the appropriate symbols defined.
  */
 
 #include 
@@ -35,7 +34,6 @@ int main(int argc, char *argv[])
 {
uint32_t olen;
long ilen;
-   unsigned long offs;
FILE *f = NULL;
int retval = 1;
 
@@ -65,23 +63,11 @@ int main(int argc, char *argv[])
ilen = ftell(f);
olen = get_unaligned_le32(&olen);
 
-   /*
-* Now we have the input (compressed) and output (uncompressed)
-* sizes, compute the necessary decompression offset...
-*/
-
-   offs = (olen > ilen) ? olen - ilen : 0;
-   offs += olen >> 12; /* Add 8 bytes for each 32K block */
-   offs += 64*1024 + 128;  /* Add 64K + 128 bytes slack */
-   offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
-
printf(".section \".rodata..compressed\",\"a\",@progbits\n");
printf(".globl z_input_len\n");
printf("z_input_len = %lu\n", ilen);
printf(".globl z_output_len\n");
printf("z_output_len = %lu\n", (unsigned long)olen);
-   printf(".globl z_min_extract_offset\n");
-   printf("z_min_extract_offset = 0x%lx\n", offs);
 
printf(".globl input_data, input_data_end\n");
printf("input_data:\n");
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 9bfab22..99204e5 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -440,7 +440,36 @@ setup_data:.quad 0 # 
64-bit physical pointer to
 
 pref_address:  .quad LOAD_PHYSICAL_ADDR# preferred load addr
 
+/* check arch/x86/boot/compressed/misc.c for the formula about extra_bytes.  */
+#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 32768 + 18)
+#if ZO_z_output_len > ZO_z_input_len
+#define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - 
ZO_z_input_len)
+#else
+#define ZO_z_extract_offset ZO_z_extra_bytes
+#endif
+
+/*
+ * extract_offset has to be bigger than ZO head section.
+ * otherwise during head code

[PATCH 04/42] x86, kaslr: Kill not needed and wrong run_size calculation code.

2015-07-07 Thread Yinghai Lu
We now use a simple and correct way to get run_size, so remove the code
for the wrong run_size calculation.

Fixes: e6023367d779 ("x86, kaslr: Prevent .bss from overlaping initrd")
Cc: "H. Peter Anvin" 
Cc: Josh Triplett 
Cc: Matt Fleming 
Cc: Kees Cook 
Cc: Andrew Morton 
Cc: Ard Biesheuvel 
Cc: Junjie Mao 
Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/compressed/Makefile  |  4 +---
 arch/x86/boot/compressed/head_32.S |  3 +--
 arch/x86/boot/compressed/head_64.S |  3 ---
 arch/x86/boot/compressed/misc.c|  6 ++
 arch/x86/boot/compressed/mkpiggy.c |  9 ++--
 arch/x86/tools/calc_run_size.sh| 42 --
 6 files changed, 6 insertions(+), 61 deletions(-)
 delete mode 100644 arch/x86/tools/calc_run_size.sh

diff --git a/arch/x86/boot/compressed/Makefile 
b/arch/x86/boot/compressed/Makefile
index d9fee82..50daea7 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -104,10 +104,8 @@ suffix-$(CONFIG_KERNEL_XZ) := xz
 suffix-$(CONFIG_KERNEL_LZO):= lzo
 suffix-$(CONFIG_KERNEL_LZ4):= lz4
 
-RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \
-$(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh)
 quiet_cmd_mkpiggy = MKPIGGY $@
-  cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false )
+  cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
 
 targets += piggy.S
 $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
diff --git a/arch/x86/boot/compressed/head_32.S 
b/arch/x86/boot/compressed/head_32.S
index 0c140f9..122b32f 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -210,7 +210,6 @@ relocated:
  * Do the decompression, and jump to the new kernel..
  */
/* push arguments for decompress_kernel: */
-   pushl   $z_run_size /* size of kernel with .bss and .brk */
pushl   $z_output_len   /* decompressed length, end of relocs */
 
movlBP_init_size(%esi), %eax
@@ -226,7 +225,7 @@ relocated:
pushl   %eax/* heap area */
pushl   %esi/* real mode pointer */
calldecompress_kernel /* returns kernel location in %eax */
-   addl$28, %esp
+   addl$24, %esp
 
 /*
  * Jump to the decompressed kernel.
diff --git a/arch/x86/boot/compressed/head_64.S 
b/arch/x86/boot/compressed/head_64.S
index 67dd8d3..3691451 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -407,8 +407,6 @@ relocated:
  * Do the decompression, and jump to the new kernel..
  */
pushq   %rsi/* Save the real mode argument */
-   movq$z_run_size, %r9/* size of kernel with .bss and .brk */
-   pushq   %r9
movq%rsi, %rdi  /* real mode address */
leaqboot_heap(%rip), %rsi   /* malloc area for uncompression */
leaqinput_data(%rip), %rdx  /* input_data */
@@ -416,7 +414,6 @@ relocated:
movq%rbp, %r8   /* output target address */
movq$z_output_len, %r9  /* decompressed length, end of relocs */
calldecompress_kernel   /* returns kernel location in %rax */
-   popq%r9
popq%rsi
 
 /*
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a88b591..96201aa 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -371,9 +371,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, 
memptr heap,
  unsigned char *input_data,
  unsigned long input_len,
  unsigned char *output,
- unsigned long output_len,
- unsigned long run_size)
+ unsigned long output_len)
 {
+   unsigned long run_size = VO__end - VO__text;
unsigned char *output_orig = output;
 
real_mode = rmode;
@@ -394,8 +394,6 @@ asmlinkage __visible void *decompress_kernel(void *rmode, 
memptr heap,
lines = real_mode->screen_info.orig_video_lines;
cols = real_mode->screen_info.orig_video_cols;
 
-   run_size = VO__end - VO__text;
-
console_init();
debug_putstr("early console in decompress_kernel\n");
 
diff --git a/arch/x86/boot/compressed/mkpiggy.c 
b/arch/x86/boot/compressed/mkpiggy.c
index 5faad09..c03b009 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -36,13 +36,11 @@ int main(int argc, char *argv[])
uint32_t olen;
long ilen;
unsigned long offs;
-   unsigned long run_size;
FILE *f = NULL;
int retval = 1;
 
-   if (argc < 3) {
-   fprintf(stderr, "Usage: %s compressed_file run_size\n",
-   argv[0]);
+

[PATCH 06/42] x86, kaslr: Consolidate mem_avoid array filling

2015-07-07 Thread Yinghai Lu
We are going to support KASLR on 64-bit above 4G, so the new random output
buffer could be anywhere.

The mem_avoid array is used by KASLR when searching for a new output buffer.
The current code only tracks ranges that are after output+output_run_size.

We need to track all ranges instead of just those after output+output_run_size.

In the current code the first entry is the extra bytes after
input+input_size, sized according to output_run_size. The other entries are
for the initrd, the cmdline, and the heap/stack used while ZO is running.

First, work out what the first entry in the mem_avoid array should be.

Now that ZO always sits at the end of the buffer, we can find out where the
ZO text and data/bss etc. are.
                                                       output+run_size
                                                              |
0   output                input           input+input_size    | output+init_size
|     |                     |                     |           |         |
|-----|-----------|---------|-----------|---------|-----------|---------|
                  |                     |
      output+init_size-ZO_SIZE   output+output_size

[output, output+init_size) is the buffer for decompress.

[output, output+run_size) is for VO run size.
[output, output+output_size) is (VO (vmlinux after objcopy) plus relocs)

[output+init_size-ZO_SIZE, output+init_size) is copied ZO.
[input, input+input_size) is copied compressed (VO (vmlinux after objcopy)
plus relocs), not the ZO.

[input+input_size, output+init_size) is [_text, _end) for ZO. That could be
the first range in mem_avoid.

That new first entry already includes the heap and stack for running ZO, so
we don't need to put them separately into the mem_avoid array.

We also need to put [input, input+input_size) in the mem_avoid array, and it
is adjacent to the first one, so merge them.

Finally, we need to put boot_params into mem_avoid too, as a 64-bit
bootloader could put it anywhere.

After those changes, all ranges that need to be avoided are in the mem_avoid array.

Cc: Kees Cook 
Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/compressed/aslr.c | 29 +
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 0e1dac0..d753fb3 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -109,7 +109,7 @@ struct mem_vector {
unsigned long size;
 };
 
-#define MEM_AVOID_MAX 5
+#define MEM_AVOID_MAX 4
 static struct mem_vector mem_avoid[MEM_AVOID_MAX];
 
 static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
@@ -135,21 +135,22 @@ static bool mem_overlaps(struct mem_vector *one, struct 
mem_vector *two)
 }
 
 static void mem_avoid_init(unsigned long input, unsigned long input_size,
-  unsigned long output, unsigned long output_run_size)
+  unsigned long output)
 {
+   unsigned long init_size = real_mode->hdr.init_size;
u64 initrd_start, initrd_size;
u64 cmd_line, cmd_line_size;
-   unsigned long unsafe, unsafe_len;
char *ptr;
 
/*
 * Avoid the region that is unsafe to overlap during
-* decompression (see calculations at top of misc.c).
+* decompression.
+* As we already move ZO (arch/x86/boot/compressed/vmlinux)
+* to the end of buffer, [input+input_size, output+init_size)
+* has [_text, _end) for ZO.
 */
-   unsafe_len = (output_run_size >> 12) + 32768 + 18;
-   unsafe = (unsigned long)input + input_size - unsafe_len;
-   mem_avoid[0].start = unsafe;
-   mem_avoid[0].size = unsafe_len;
+   mem_avoid[0].start = input;
+   mem_avoid[0].size = (output + init_size) - input;
 
/* Avoid initrd. */
initrd_start  = (u64)real_mode->ext_ramdisk_image << 32;
@@ -169,13 +170,9 @@ static void mem_avoid_init(unsigned long input, unsigned 
long input_size,
mem_avoid[2].start = cmd_line;
mem_avoid[2].size = cmd_line_size;
 
-   /* Avoid heap memory. */
-   mem_avoid[3].start = (unsigned long)free_mem_ptr;
-   mem_avoid[3].size = BOOT_HEAP_SIZE;
-
-   /* Avoid stack memory. */
-   mem_avoid[4].start = (unsigned long)free_mem_end_ptr;
-   mem_avoid[4].size = BOOT_STACK_SIZE;
+   /* Avoid params */
+   mem_avoid[3].start = (unsigned long)real_mode;
+   mem_avoid[3].size = sizeof(*real_mode);
 }
 
 /* Does this memory vector overlap a known avoided area? */
@@ -319,7 +316,7 @@ unsigned char *choose_kernel_location(unsigned char *input,
 
/* Record the various known unsafe memory ranges. */
mem_avoid_init((unsigned long)input, input_size,
-  (unsigned long)output, output_run_size);
+  (unsigned long)output);
 
/* Walk e820 and find a random address. */
random = find_random_addr(choice, output_run_size);
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsu

[PATCH 09/42] x86, boot: Split kernel_ident_mapping_init to another file

2015-07-07 Thread Yinghai Lu
We need to include it in the boot::decompress_kernel stage to set up a new
ident mapping.

Also add checks for existing __pa/__va macro definitions, as we need to
override them in the boot::decompress_kernel stage.

Reviewed-by: Kees Cook 
Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/page.h |  5 +++
 arch/x86/mm/ident_map.c | 74 +
 arch/x86/mm/init_64.c   | 74 +
 3 files changed, 80 insertions(+), 73 deletions(-)
 create mode 100644 arch/x86/mm/ident_map.c

diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 802dde3..cf8f619 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -37,7 +37,10 @@ static inline void copy_user_page(void *to, void *from, 
unsigned long vaddr,
alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
+#ifndef __pa
 #define __pa(x)__phys_addr((unsigned long)(x))
+#endif
+
 #define __pa_nodebug(x)__phys_addr_nodebug((unsigned long)(x))
 /* __pa_symbol should be used for C visible symbols.
This seems to be the official gcc blessed way to do such arithmetic. */
@@ -51,7 +54,9 @@ static inline void copy_user_page(void *to, void *from, 
unsigned long vaddr,
 #define __pa_symbol(x) \
__phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))
 
+#ifndef __va
 #define __va(x)((void *)((unsigned 
long)(x)+PAGE_OFFSET))
+#endif
 
 #define __boot_va(x)   __va(x)
 #define __boot_pa(x)   __pa(x)
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
new file mode 100644
index 000..751ca92
--- /dev/null
+++ b/arch/x86/mm/ident_map.c
@@ -0,0 +1,74 @@
+
+static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+  unsigned long addr, unsigned long end)
+{
+   addr &= PMD_MASK;
+   for (; addr < end; addr += PMD_SIZE) {
+   pmd_t *pmd = pmd_page + pmd_index(addr);
+
+   if (!pmd_present(*pmd))
+   set_pmd(pmd, __pmd(addr | pmd_flag));
+   }
+}
+static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
+ unsigned long addr, unsigned long end)
+{
+   unsigned long next;
+
+   for (; addr < end; addr = next) {
+   pud_t *pud = pud_page + pud_index(addr);
+   pmd_t *pmd;
+
+   next = (addr & PUD_MASK) + PUD_SIZE;
+   if (next > end)
+   next = end;
+
+   if (pud_present(*pud)) {
+   pmd = pmd_offset(pud, 0);
+   ident_pmd_init(info->pmd_flag, pmd, addr, next);
+   continue;
+   }
+   pmd = (pmd_t *)info->alloc_pgt_page(info->context);
+   if (!pmd)
+   return -ENOMEM;
+   ident_pmd_init(info->pmd_flag, pmd, addr, next);
+   set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+   }
+
+   return 0;
+}
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long addr, unsigned long end)
+{
+   unsigned long next;
+   int result;
+   int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
+
+   for (; addr < end; addr = next) {
+   pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+   pud_t *pud;
+
+   next = (addr & PGDIR_MASK) + PGDIR_SIZE;
+   if (next > end)
+   next = end;
+
+   if (pgd_present(*pgd)) {
+   pud = pud_offset(pgd, 0);
+   result = ident_pud_init(info, pud, addr, next);
+   if (result)
+   return result;
+   continue;
+   }
+
+   pud = (pud_t *)info->alloc_pgt_page(info->context);
+   if (!pud)
+   return -ENOMEM;
+   result = ident_pud_init(info, pud, addr, next);
+   if (result)
+   return result;
+   set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+   }
+
+   return 0;
+}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3fba623..6f457a4 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -56,79 +56,7 @@
 
 #include "mm_internal.h"
 
-static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
-  unsigned long addr, unsigned long end)
-{
-   addr &= PMD_MASK;
-   for (; addr < end; addr += PMD_SIZE) {
-   pmd_t *pmd = pmd_page + pmd_index(addr);
-
-   if (!pmd_present(*pmd))
-   set_pmd(pmd, __pmd(addr | pmd_flag));
-   }
-}
-static int ident_pud_init(struct x86_mapping_info *in

[PATCH 42/42] x86: fix msr print again

2015-07-07 Thread Yinghai Lu
The early MSR printout got broken again; fix it.

Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/processor.h |  1 -
 arch/x86/kernel/cpu/common.c | 61 +---
 2 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519..3a7bd35 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -177,7 +177,6 @@ extern void early_cpu_init(void);
 extern void identify_boot_cpu(void);
 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
-void print_cpu_msr(struct cpuinfo_x86 *);
 extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 922c5e0..3c87e75 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1016,27 +1016,6 @@ out:
 }
 #endif
 
-void __init identify_boot_cpu(void)
-{
-   identify_cpu(&boot_cpu_data);
-   init_amd_e400_c1e_mask();
-#ifdef CONFIG_X86_32
-   sysenter_setup();
-   enable_sep_cpu();
-#endif
-   cpu_detect_tlb(&boot_cpu_data);
-}
-
-void identify_secondary_cpu(struct cpuinfo_x86 *c)
-{
-   BUG_ON(c == &boot_cpu_data);
-   identify_cpu(c);
-#ifdef CONFIG_X86_32
-   enable_sep_cpu();
-#endif
-   mtrr_ap_init();
-}
-
 struct msr_range {
unsignedmin;
unsignedmax;
@@ -1082,6 +1061,38 @@ static __init int setup_show_msr(char *arg)
 }
 __setup("show_msr=", setup_show_msr);
 
+static void print_cpu_msr(struct cpuinfo_x86 *c)
+{
+   if (c->cpu_index < show_msr)
+   __print_cpu_msr();
+}
+
+void __init identify_boot_cpu(void)
+{
+   identify_cpu(&boot_cpu_data);
+   init_amd_e400_c1e_mask();
+#ifdef CONFIG_X86_32
+   sysenter_setup();
+   enable_sep_cpu();
+#endif
+   cpu_detect_tlb(&boot_cpu_data);
+
+   print_cpu_msr(&boot_cpu_data);
+}
+
+void identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+   BUG_ON(c == &boot_cpu_data);
+   identify_cpu(c);
+#ifdef CONFIG_X86_32
+   enable_sep_cpu();
+#endif
+
+   print_cpu_msr(c);
+
+   mtrr_ap_init();
+}
+
 static __init int setup_noclflush(char *arg)
 {
setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
@@ -1115,14 +1126,6 @@ void print_cpu_info(struct cpuinfo_x86 *c)
printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);
else
printk(KERN_CONT ")\n");
-
-   print_cpu_msr(c);
-}
-
-void print_cpu_msr(struct cpuinfo_x86 *c)
-{
-   if (c->cpu_index < show_msr)
-   __print_cpu_msr();
 }
 
 static __init int setup_disablecpuid(char *arg)
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


<    4   5   6   7   8   9   10   11   12   13   >