Re: [RFC FIX v1 1/2] powerpc: Discover radix availability before scanning the memory nodes

2018-01-05 Thread Michael Ellerman
Bharata B Rao  writes:

> Currently device tree nodes for memory are scanned before the
> radix feature is discovered in mmu_early_init_devtree(). Move this
> routine ahead of scanning memory nodes so that we know if the
> guest is radix or not when scanning ibm,dynamic-reconfiguration-memory.

Sorry this doesn't work.

> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index b15bae2..079d893 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -722,6 +722,8 @@ void __init early_init_devtree(void *params)
>*/
>   of_scan_flat_dt(early_init_dt_scan_chosen_ppc, boot_command_line);
>  
> + mmu_early_init_devtree();
> +

You've moved this above parse_early_param(), but
mmu_early_init_devtree() uses disable_radix, which is an early param. So
this will break disable_radix handling.

It will probably break other things too because the ordering of this
init code is very fragile - bootstrapping is hard :)
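
For anyone following along, the plumbing is roughly this (a minimal
sketch, not the exact kernel source):

/* disable_radix is wired up as an early param; the callback only
 * runs when parse_early_param() walks the command line. */
static bool disable_radix;

static int __init parse_disable_radix(char *p)
{
	disable_radix = true;
	return 0;
}
early_param("disable_radix", parse_disable_radix);

/* mmu_early_init_devtree() consults disable_radix, so calling it
 * before parse_early_param() always sees the default value. */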

>   /* Scan memory nodes and rebuild MEMBLOCKs */
>   of_scan_flat_dt(early_init_dt_scan_root, NULL);
>   of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
> @@ -783,8 +785,6 @@ void __init early_init_devtree(void *params)
>   spinning_secondaries = boot_cpu_count - 1;
>  #endif
>  
> - mmu_early_init_devtree();
> -
>  #ifdef CONFIG_PPC_POWERNV
>   /* Scan and build the list of machine check recoverable ranges */
>   of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL);

cheers


Re: [PATCH 09/67] arc: remove CONFIG_ARC_PLAT_NEEDS_PHYS_TO_DMA

2018-01-05 Thread Vineet Gupta

On 12/29/2017 12:25 AM, Christoph Hellwig wrote:

We always use the stub definitions, so remove the unused other code.

Signed-off-by: Christoph Hellwig 


Acked-by: Vineet Gupta 

FWIW, it was removed and then reintroduced because one of the customers
wanted it, which is no longer relevant!


Thx,
-Vineet


---
  arch/arc/Kconfig   |  3 ---
  arch/arc/include/asm/dma-mapping.h |  7 ---
  arch/arc/mm/dma.c  | 14 +++---
  3 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 9d5fd00d9e91..f3a80cf164cc 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -463,9 +463,6 @@ config ARCH_PHYS_ADDR_T_64BIT
  config ARCH_DMA_ADDR_T_64BIT
bool
  
-config ARC_PLAT_NEEDS_PHYS_TO_DMA
-   bool
-
  config ARC_KVADDR_SIZE
int "Kernel Virtual Address Space size (MB)"
range 0 512
diff --git a/arch/arc/include/asm/dma-mapping.h 
b/arch/arc/include/asm/dma-mapping.h
index 94285031c4fb..7a16824bfe98 100644
--- a/arch/arc/include/asm/dma-mapping.h
+++ b/arch/arc/include/asm/dma-mapping.h
@@ -11,13 +11,6 @@
  #ifndef ASM_ARC_DMA_MAPPING_H
  #define ASM_ARC_DMA_MAPPING_H
  
-#ifndef CONFIG_ARC_PLAT_NEEDS_PHYS_TO_DMA
-#define plat_dma_to_phys(dev, dma_handle) ((phys_addr_t)(dma_handle))
-#define plat_phys_to_dma(dev, paddr) ((dma_addr_t)(paddr))
-#else
-#include 
-#endif
-
  extern const struct dma_map_ops arc_dma_ops;
  
  static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)

diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index fad18261ef6a..1d405b86250c 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -60,7 +60,7 @@ static void *arc_dma_alloc(struct device *dev, size_t size,
/* This is linear addr (0x8000_ based) */
paddr = page_to_phys(page);
  
-	*dma_handle = plat_phys_to_dma(dev, paddr);
+   *dma_handle = paddr;
  
  	/* This is kernel Virtual address (0x7000_ based) */
if (need_kvaddr) {
@@ -92,7 +92,7 @@ static void *arc_dma_alloc(struct device *dev, size_t size,
  static void arc_dma_free(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, unsigned long attrs)
  {
-   phys_addr_t paddr = plat_dma_to_phys(dev, dma_handle);
+   phys_addr_t paddr = dma_handle;
struct page *page = virt_to_page(paddr);
int is_non_coh = 1;
  
@@ -111,7 +111,7 @@ static int arc_dma_mmap(struct device *dev, struct vm_area_struct *vma,
  {
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-   unsigned long pfn = __phys_to_pfn(plat_dma_to_phys(dev, dma_addr));
+   unsigned long pfn = __phys_to_pfn(dma_addr);
unsigned long off = vma->vm_pgoff;
int ret = -ENXIO;
  
@@ -175,7 +175,7 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
_dma_cache_sync(paddr, size, dir);
  
-	return plat_phys_to_dma(dev, paddr);
+   return paddr;
  }
  
  /*

@@ -190,7 +190,7 @@ static void arc_dma_unmap_page(struct device *dev, 
dma_addr_t handle,
   size_t size, enum dma_data_direction dir,
   unsigned long attrs)
  {
-   phys_addr_t paddr = plat_dma_to_phys(dev, handle);
+   phys_addr_t paddr = handle;
  
  	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
_dma_cache_sync(paddr, size, dir);
@@ -224,13 +224,13 @@ static void arc_dma_unmap_sg(struct device *dev, struct 
scatterlist *sg,
  static void arc_dma_sync_single_for_cpu(struct device *dev,
dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
  {
-   _dma_cache_sync(plat_dma_to_phys(dev, dma_handle), size, 
DMA_FROM_DEVICE);
+   _dma_cache_sync(dma_handle, size, DMA_FROM_DEVICE);
  }
  
  static void arc_dma_sync_single_for_device(struct device *dev,
dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
  {
-   _dma_cache_sync(plat_dma_to_phys(dev, dma_handle), size, DMA_TO_DEVICE);
+   _dma_cache_sync(dma_handle, size, DMA_TO_DEVICE);
  }
  
  static void arc_dma_sync_sg_for_cpu(struct device *dev,




Re: [PATCH v7 07/10] kernel/jump_label: abstract jump_entry member accessors

2018-01-05 Thread Ard Biesheuvel
On 5 January 2018 at 18:22, Catalin Marinas  wrote:
> On Fri, Jan 05, 2018 at 06:01:33PM +, Ard Biesheuvel wrote:
>> On 5 January 2018 at 17:58, Catalin Marinas  wrote:
>> > On Tue, Jan 02, 2018 at 08:05:46PM +, Ard Biesheuvel wrote:
>> >> diff --git a/arch/arm/include/asm/jump_label.h 
>> >> b/arch/arm/include/asm/jump_label.h
>> >> index e12d7d096fc0..7b05b404063a 100644
>> >> --- a/arch/arm/include/asm/jump_label.h
>> >> +++ b/arch/arm/include/asm/jump_label.h
>> >> @@ -45,5 +45,32 @@ struct jump_entry {
>> >>   jump_label_t key;
>> >>  };
>> >>
>> >> +static inline jump_label_t jump_entry_code(const struct jump_entry 
>> >> *entry)
>> >> +{
>> >> + return entry->code;
>> >> +}
>> >> +
>> >> +static inline struct static_key *jump_entry_key(const struct jump_entry 
>> >> *entry)
>> >> +{
>> >> + return (struct static_key *)((unsigned long)entry->key & ~1UL);
>> >> +}
>> >> +
>> >> +static inline bool jump_entry_is_branch(const struct jump_entry *entry)
>> >> +{
>> >> + return (unsigned long)entry->key & 1UL;
>> >> +}
>> >> +
>> >> +static inline bool jump_entry_is_module_init(const struct jump_entry 
>> >> *entry)
>> >> +{
>> >> + return entry->code == 0;
>> >> +}
>> >> +
>> >> +static inline void jump_entry_set_module_init(struct jump_entry *entry)
>> >> +{
>> >> + entry->code = 0;
>> >> +}
>> >> +
>> >> +#define jump_label_swap  NULL
>> >
>> > Is there any difference between these functions on any of the
>> > architectures touched? Even with the relative offset, arm64 and x86
>> > looked the same to me (well, I may have missed some detail).
>>
>> No, the latter two are identical everywhere, and the others are the
>> same modulo absolute vs relative.
>>
>> The issue is that the struct definition is per-arch so the accessors
>> should be as well.
>
> Up to this patch, even the jump_entry structure is the same on all
> architectures (the jump_label_t type differs).
>
> With relative offset, can you not just define jump_label_t to s32? At a
> quick grep in mainline, it doesn't seem to be used outside the structure
> definition.
>

I think we can just remove jump_label_t entirely, and replace it with
unsigned long for absolute, and s32 for relative. Maybe I am missing
something, but things like

#ifdef CONFIG_X86_64
typedef u64 jump_label_t;
#else
typedef u32 jump_label_t;
#endif

seem a bit pointless to me anyway.
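
Concretely, the two layouts would be something along these lines
(sketch only):

/* absolute: pointer-sized fields */
struct jump_entry {
	unsigned long code;
	unsigned long target;
	unsigned long key;
};

/* relative: 32-bit self-relative offsets */
struct jump_entry {
	s32 code;	/* offset from &entry->code to the insn */
	s32 target;	/* offset from &entry->target */
	s32 key;	/* offset from &entry->key; low bit still
			 * carries the branch flag */
};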


>> Perhaps I should introduce two variants to asm-generic, similar to
>> how we have different flavors of unaligned accessors.
>
> You could as well define them directly in kernel/jump_label.h or, if
> used outside this file, include/linux/jump_label.h.
>

Perhaps I should define a Kconfig symbol after all for relative jump
labels, and just keep everything in the same file. The question is
whether I should use CONFIG_HAVE_ARCH_PREL32_RELOCATIONS for this as
well.


Re: [PATCH v7 07/10] kernel/jump_label: abstract jump_entry member accessors

2018-01-05 Thread Catalin Marinas
On Fri, Jan 05, 2018 at 06:01:33PM +, Ard Biesheuvel wrote:
> On 5 January 2018 at 17:58, Catalin Marinas  wrote:
> > On Tue, Jan 02, 2018 at 08:05:46PM +, Ard Biesheuvel wrote:
> >> diff --git a/arch/arm/include/asm/jump_label.h 
> >> b/arch/arm/include/asm/jump_label.h
> >> index e12d7d096fc0..7b05b404063a 100644
> >> --- a/arch/arm/include/asm/jump_label.h
> >> +++ b/arch/arm/include/asm/jump_label.h
> >> @@ -45,5 +45,32 @@ struct jump_entry {
> >>   jump_label_t key;
> >>  };
> >>
> >> +static inline jump_label_t jump_entry_code(const struct jump_entry *entry)
> >> +{
> >> + return entry->code;
> >> +}
> >> +
> >> +static inline struct static_key *jump_entry_key(const struct jump_entry 
> >> *entry)
> >> +{
> >> + return (struct static_key *)((unsigned long)entry->key & ~1UL);
> >> +}
> >> +
> >> +static inline bool jump_entry_is_branch(const struct jump_entry *entry)
> >> +{
> >> + return (unsigned long)entry->key & 1UL;
> >> +}
> >> +
> >> +static inline bool jump_entry_is_module_init(const struct jump_entry 
> >> *entry)
> >> +{
> >> + return entry->code == 0;
> >> +}
> >> +
> >> +static inline void jump_entry_set_module_init(struct jump_entry *entry)
> >> +{
> >> + entry->code = 0;
> >> +}
> >> +
> >> +#define jump_label_swap  NULL
> >
> > Is there any difference between these functions on any of the
> > architectures touched? Even with the relative offset, arm64 and x86
> > looked the same to me (well, I may have missed some detail).
> 
> No, the latter two are identical everywhere, and the others are the
> same modulo absolute vs relative.
> 
> The issue is that the struct definition is per-arch so the accessors
> should be as well.

Up to this patch, even the jump_entry structure is the same on all
architectures (the jump_label_t type differs).

With relative offset, can you not just define jump_label_t to s32? At a
quick grep in mainline, it doesn't seem to be used outside the structure
definition.

> Perhaps I should introduce two variants to asm-generic, similar to
> how we have different flavors of unaligned accessors.

You could as well define them directly in kernel/jump_label.h or, if
used outside this file, include/linux/jump_label.h.

-- 
Catalin


Re: [PATCH v7 07/10] kernel/jump_label: abstract jump_entry member accessors

2018-01-05 Thread Ard Biesheuvel
On 5 January 2018 at 17:58, Catalin Marinas  wrote:
> On Tue, Jan 02, 2018 at 08:05:46PM +, Ard Biesheuvel wrote:
>> diff --git a/arch/arm/include/asm/jump_label.h 
>> b/arch/arm/include/asm/jump_label.h
>> index e12d7d096fc0..7b05b404063a 100644
>> --- a/arch/arm/include/asm/jump_label.h
>> +++ b/arch/arm/include/asm/jump_label.h
>> @@ -45,5 +45,32 @@ struct jump_entry {
>>   jump_label_t key;
>>  };
>>
>> +static inline jump_label_t jump_entry_code(const struct jump_entry *entry)
>> +{
>> + return entry->code;
>> +}
>> +
>> +static inline struct static_key *jump_entry_key(const struct jump_entry 
>> *entry)
>> +{
>> + return (struct static_key *)((unsigned long)entry->key & ~1UL);
>> +}
>> +
>> +static inline bool jump_entry_is_branch(const struct jump_entry *entry)
>> +{
>> + return (unsigned long)entry->key & 1UL;
>> +}
>> +
>> +static inline bool jump_entry_is_module_init(const struct jump_entry *entry)
>> +{
>> + return entry->code == 0;
>> +}
>> +
>> +static inline void jump_entry_set_module_init(struct jump_entry *entry)
>> +{
>> + entry->code = 0;
>> +}
>> +
>> +#define jump_label_swap  NULL
>
> Is there any difference between these functions on any of the
> architectures touched? Even with the relative offset, arm64 and x86
> looked the same to me (well, I may have missed some detail).
>

No, the latter two are identical everywhere, and the others are the
same modulo absolute vs relative.

The issue is that the struct definition is per-arch so the accessors
should be as well. Perhaps I should introduce two variants to
asm-generic, similar to how we have different flavors of unaligned
accessors.


Re: [PATCH v7 07/10] kernel/jump_label: abstract jump_entry member accessors

2018-01-05 Thread Catalin Marinas
On Tue, Jan 02, 2018 at 08:05:46PM +, Ard Biesheuvel wrote:
> diff --git a/arch/arm/include/asm/jump_label.h 
> b/arch/arm/include/asm/jump_label.h
> index e12d7d096fc0..7b05b404063a 100644
> --- a/arch/arm/include/asm/jump_label.h
> +++ b/arch/arm/include/asm/jump_label.h
> @@ -45,5 +45,32 @@ struct jump_entry {
>   jump_label_t key;
>  };
>  
> +static inline jump_label_t jump_entry_code(const struct jump_entry *entry)
> +{
> + return entry->code;
> +}
> +
> +static inline struct static_key *jump_entry_key(const struct jump_entry 
> *entry)
> +{
> + return (struct static_key *)((unsigned long)entry->key & ~1UL);
> +}
> +
> +static inline bool jump_entry_is_branch(const struct jump_entry *entry)
> +{
> + return (unsigned long)entry->key & 1UL;
> +}
> +
> +static inline bool jump_entry_is_module_init(const struct jump_entry *entry)
> +{
> + return entry->code == 0;
> +}
> +
> +static inline void jump_entry_set_module_init(struct jump_entry *entry)
> +{
> + entry->code = 0;
> +}
> +
> +#define jump_label_swap  NULL

Is there any difference between these functions on any of the
architectures touched? Even with the relative offset, arm64 and x86
looked the same to me (well, I may have missed some detail).

-- 
Catalin


Re: [PATCH v7 05/10] PCI: Add support for relative addressing in quirk tables

2018-01-05 Thread Ard Biesheuvel
On 5 January 2018 at 17:41, Catalin Marinas  wrote:
> On Tue, Jan 02, 2018 at 08:05:44PM +, Ard Biesheuvel wrote:
>> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
>> index 10684b17d0bd..b6d51b4d5ce1 100644
>> --- a/drivers/pci/quirks.c
>> +++ b/drivers/pci/quirks.c
>> @@ -3556,9 +3556,16 @@ static void pci_do_fixups(struct pci_dev *dev, struct 
>> pci_fixup *f,
>>f->vendor == (u16) PCI_ANY_ID) &&
>>   (f->device == dev->device ||
>>f->device == (u16) PCI_ANY_ID)) {
>> - calltime = fixup_debug_start(dev, f->hook);
>> - f->hook(dev);
>> - fixup_debug_report(dev, calltime, f->hook);
>> + void (*hook)(struct pci_dev *dev);
>> +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
>> + hook = (void *)((unsigned long)&f->hook_offset +
>> + f->hook_offset);
>> +#else
>> + hook = f->hook;
>> +#endif
>
> More of a nitpick but I've seen this pattern in several places in your
> code, maybe worth defining a macro (couldn't come up with a better
> name):
>
> #define offset_to_ptr(off) \
> ((void *)((unsigned long)&(off) + (off)))
>

Yeah, good point. Or even

static inline void *offset_to_ptr(const s32 *off)
{
return (void *)((unsigned long)off + *off);
}
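
With that helper, the quirk loop above would reduce to something like
(sketch, assuming the hook_offset field from the patch):

#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
	hook = offset_to_ptr(&f->hook_offset);
#else
	hook = f->hook;
#endif
	calltime = fixup_debug_start(dev, hook);
	hook(dev);
	fixup_debug_report(dev, calltime, hook);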


Re: [PATCH v7 05/10] PCI: Add support for relative addressing in quirk tables

2018-01-05 Thread Catalin Marinas
On Tue, Jan 02, 2018 at 08:05:44PM +, Ard Biesheuvel wrote:
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index 10684b17d0bd..b6d51b4d5ce1 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -3556,9 +3556,16 @@ static void pci_do_fixups(struct pci_dev *dev, struct 
> pci_fixup *f,
>f->vendor == (u16) PCI_ANY_ID) &&
>   (f->device == dev->device ||
>f->device == (u16) PCI_ANY_ID)) {
> - calltime = fixup_debug_start(dev, f->hook);
> - f->hook(dev);
> - fixup_debug_report(dev, calltime, f->hook);
> + void (*hook)(struct pci_dev *dev);
> +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
> + hook = (void *)((unsigned long)&f->hook_offset +
> + f->hook_offset);
> +#else
> + hook = f->hook;
> +#endif

More of a nitpick but I've seen this pattern in several places in your
code, maybe worth defining a macro (couldn't come up with a better
name):

#define offset_to_ptr(off) \
((void *)((unsigned long)&(off) + (off)))

-- 
Catalin


[PATCH 1/3] powerpc/32: Fix hugepage allocation on 8xx at hint address

2018-01-05 Thread Christophe Leroy
When an app has some regular pages allocated (e.g. see below) and tries
to mmap() a huge page at a hint address covered by the same PMD entry,
the kernel accepts the hint although the 8xx cannot handle different
page sizes in the same PMD entry.

1000-10001000 r-xp  00:0f 2597 /root/malloc
1001-10011000 rwxp  00:0f 2597 /root/malloc

mmap(0x1008, 524288, PROT_READ|PROT_WRITE,
 MAP_PRIVATE|MAP_ANONYMOUS|0x4, -1, 0) = 0x1008
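
Assuming the elided addresses above are 0x10010000 and 0x10080000, and a
4M PMD span (PMD_SHIFT = 22 with 4k pages), both ranges do land in the
same PMD entry:

0x10010000 >> 22 = 0x40   (existing 4k pages)
0x10080000 >> 22 = 0x40   (hinted 512k huge mapping)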

This results in the following warning, and the app remains forever in
do_page_fault()/hugetlb_fault()

[162980.035629] WARNING: CPU: 0 PID: 2777 at arch/powerpc/mm/hugetlbpage.c:354 
hugetlb_free_pgd_range+0xc8/0x1e4
[162980.035699] CPU: 0 PID: 2777 Comm: malloc Tainted: G W   4.14.6 #85
[162980.035744] task: c67e2c00 task.stack: c668e000
[162980.035783] NIP:  c000fe18 LR: c00e1eec CTR: c00f90c0
[162980.035830] REGS: c668fc20 TRAP: 0700   Tainted: G W(4.14.6)
[162980.035854] MSR:  00029032   CR: 24044224 XER: 2000
[162980.036003]
[162980.036003] GPR00: c00e1eec c668fcd0 c67e2c00 0010 c6869410 1008 
 77fb4000
[162980.036003] GPR08: 0001 0683c001  ff80 44028228 10018a34 
4008 418004fc
[162980.036003] GPR16: c668e000 00040100 c668e000 c06c c668fe78 c668e000 
c6835ba0 c668fd48
[162980.036003] GPR24:  73ff 7400 0001 77fb4000 100f 
1010 1010
[162980.036743] NIP [c000fe18] hugetlb_free_pgd_range+0xc8/0x1e4
[162980.036839] LR [c00e1eec] free_pgtables+0x12c/0x150
[162980.036861] Call Trace:
[162980.036939] [c668fcd0] [c00f0774] unlink_anon_vmas+0x1c4/0x214 (unreliable)
[162980.037040] [c668fd10] [c00e1eec] free_pgtables+0x12c/0x150
[162980.037118] [c668fd40] [c00eabac] exit_mmap+0xe8/0x1b4
[162980.037210] [c668fda0] [c0019710] mmput.part.9+0x20/0xd8
[162980.037301] [c668fdb0] [c001ecb0] do_exit+0x1f0/0x93c
[162980.037386] [c668fe00] [c001f478] do_group_exit+0x40/0xcc
[162980.037479] [c668fe10] [c002a76c] get_signal+0x47c/0x614
[162980.037570] [c668fe70] [c0007840] do_signal+0x54/0x244
[162980.037654] [c668ff30] [c0007ae8] do_notify_resume+0x34/0x88
[162980.037744] [c668ff40] [c000dae8] do_user_signal+0x74/0xc4
[162980.037781] Instruction dump:
[162980.037821] 7fdff378 8137 54a3463a 80890020 7d24182e 7c841a14 712a0004 
4082ff94
[162980.038014] 2f89 419e0010 712a0ff0 408200e0 <0fe0> 54a9000a 
7f984840 419d0094
[162980.038216] ---[ end trace c0ceeca8e7a5800a ]---
[162980.038754] BUG: non-zero nr_ptes on freeing mm: 1
[162985.363322] BUG: non-zero nr_ptes on freeing mm: -1

In order to fix this, the address space "slices" implementation
from BOOK3S/64 is reused.

This patch:
1/ Modifies the "slices" implementation to support 32-bit CPUs,
based on using only the low slices.
2/ Moves "slices" function prototypes from page64.h to page.h
3/ Modifies the context.id on the 8xx to be in the range [1:16]
instead of [0:15] in order to identify context.id == 0 as
an uninitialised context
4/ Activates CONFIG_PPC_MM_SLICES when CONFIG_HUGETLB_PAGE is
selected for the 8xx

Although we could in theory have as many slices as PMD entries, the current
slices implementation limits the number of low slices to 16.
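
For reference, a slice is just a fixed power-of-two chunk of the address
space, so the lookup is a plain shift (sketch, using the existing macro
name):

/* with SLICE_LOW_SHIFT = 28, the 4G low range holds 16 slices */
#define GET_LOW_SLICE_INDEX(addr)	((addr) >> SLICE_LOW_SHIFT)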

Fixes: 4b91428699477 ("powerpc/8xx: Implement support of hugepages")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/mmu-8xx.h |  6 
 arch/powerpc/include/asm/page.h| 14 
 arch/powerpc/include/asm/page_32.h | 19 +++
 arch/powerpc/include/asm/page_64.h | 21 ++--
 arch/powerpc/kernel/setup-common.c |  2 +-
 arch/powerpc/mm/8xx_mmu.c  |  2 +-
 arch/powerpc/mm/hash_utils_64.c|  2 +-
 arch/powerpc/mm/hugetlbpage.c  |  2 ++
 arch/powerpc/mm/mmu_context_nohash.c   | 11 +--
 arch/powerpc/mm/slice.c| 58 +++---
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 11 files changed, 95 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-8xx.h 
b/arch/powerpc/include/asm/mmu-8xx.h
index 5bb3dbede41a..5f89b6010453 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -169,6 +169,12 @@ typedef struct {
unsigned int id;
unsigned int active;
unsigned long vdso_base;
+#ifdef CONFIG_PPC_MM_SLICES
+   u16 user_psize; /* page size index */
+   u64 low_slices_psize;   /* page size encodings */
+   unsigned char high_slices_psize[0];
+   unsigned long slb_addr_limit;
+#endif
 } mm_context_t;
 
 #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff8)
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 8da5d4c1cab2..d0384f9db9eb 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -342,6 +342,20 @@ typedef struct page *pgtable_t;
 #endif
 #endif
 
+#ifdef CONFIG_PPC_MM_SLICES
+struct mm_struct;
+
+unsigned long slice_get_unmapped_area(unsigned long 

[PATCH 3/3] powerpc/8xx: Increase the number of mm slices

2018-01-05 Thread Christophe Leroy
On the 8xx, we can have as many slices as PMD entries.
This means we could have 1024 slices in 4k page-size mode
and 64 slices in 16k page-size mode.

However, due to a stack overflow in slice_get_unmapped_area(),
we limit to 512 slices.
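
The arithmetic behind those numbers, assuming PMD_SHIFT = 22 for 4k
pages and 26 for 16k pages on the 8xx:

4k pages:   4G >> 22 = 1024 possible slices
16k pages:  4G >> 26 =   64 possible slices
stack cap:  SLICE_LOW_SHIFT >= 23, i.e. 4G >> 23 = 512 slices max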

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/mmu-8xx.h | 6 +-
 arch/powerpc/include/asm/page_32.h | 3 ++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-8xx.h 
b/arch/powerpc/include/asm/mmu-8xx.h
index d669d0062da4..40aa7b0cd0dc 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -171,7 +171,11 @@ typedef struct {
unsigned long vdso_base;
 #ifdef CONFIG_PPC_MM_SLICES
u16 user_psize; /* page size index */
-   unsigned char low_slices_psize[8]; /* 16 slices */
+#if defined(CONFIG_PPC_16K_PAGES)
+   unsigned char low_slices_psize[32]; /* 64 slices */
+#else
+   unsigned char low_slices_psize[256]; /* 512 slices */
+#endif
unsigned char high_slices_psize[0];
unsigned long slb_addr_limit;
 #endif
diff --git a/arch/powerpc/include/asm/page_32.h 
b/arch/powerpc/include/asm/page_32.h
index f7d1bd1183c8..43695ce7ee07 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -62,7 +62,8 @@ extern void copy_page(void *to, void *from);
 
 #ifdef CONFIG_PPC_MM_SLICES
 
-#define SLICE_LOW_SHIFT 28
+/* SLICE_LOW_SHIFT >= 23 to avoid stack overflow in slice_get_unmapped_area() 
*/
+#define SLICE_LOW_SHIFT (PMD_SHIFT > 23 ? PMD_SHIFT : 23)
 #define SLICE_HIGH_SHIFT   0
 
 #define SLICE_LOW_TOP  (0xul)
-- 
2.13.3



[PATCH v4 1/7] platform/pseries: Update VF config space after EEH

2018-01-05 Thread Bryant G. Ly
Add EEH platform operations for pseries to update VF
config space. With this change, after EEH the VF
will have its config space updated on the pseries platform.

Signed-off-by: Bryant G. Ly 
Signed-off-by: Juan J. Alvarez 
---
 arch/powerpc/include/asm/eeh.h   |  1 +
 arch/powerpc/kernel/eeh.c| 59 +
 arch/powerpc/platforms/powernv/eeh-powernv.c | 65 ++--
 arch/powerpc/platforms/pseries/eeh_pseries.c | 26 ++-
 4 files changed, 88 insertions(+), 63 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 5161c37dd039..82829c65f31a 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -297,6 +297,7 @@ int eeh_pe_reset(struct eeh_pe *pe, int option);
 int eeh_pe_configure(struct eeh_pe *pe);
 int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func,
  unsigned long addr, unsigned long mask);
+int eeh_restore_vf_config(struct pci_dn *pdn);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index cbca0a667682..cc649809885e 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -740,6 +740,65 @@ static void *eeh_restore_dev_state(void *data, void 
*userdata)
return NULL;
 }
 
+int eeh_restore_vf_config(struct pci_dn *pdn)
+{
+   struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+   u32 devctl, cmd, cap2, aer_capctl;
+   int old_mps;
+
+   if (edev->pcie_cap) {
+   /* Restore MPS */
+   old_mps = (ffs(pdn->mps) - 8) << 5;
+   eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+2, &devctl);
+   devctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
+   devctl |= old_mps;
+   eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 2, devctl);
+
+   /* Disable Completion Timeout */
+   eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP2,
+4, &cap2);
+   if (cap2 & 0x10) {
+   eeh_ops->read_config(pdn,
+edev->pcie_cap + PCI_EXP_DEVCTL2,
+4, &cap2);
+   cap2 |= 0x10;
+   eeh_ops->write_config(pdn,
+ edev->pcie_cap + PCI_EXP_DEVCTL2,
+ 4, cap2);
+   }
+   }
+
+   /* Enable SERR and parity checking */
+   eeh_ops->read_config(pdn, PCI_COMMAND, 2, &cmd);
+   cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR);
+   eeh_ops->write_config(pdn, PCI_COMMAND, 2, cmd);
+
+   /* Enable report various errors */
+   if (edev->pcie_cap) {
+   eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+2, &devctl);
+   devctl &= ~PCI_EXP_DEVCTL_CERE;
+   devctl |= (PCI_EXP_DEVCTL_NFERE |
+  PCI_EXP_DEVCTL_FERE |
+  PCI_EXP_DEVCTL_URRE);
+   eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 2, devctl);
+   }
+
+   /* Enable ECRC generation and check */
+   if (edev->pcie_cap && edev->aer_cap) {
+   eeh_ops->read_config(pdn, edev->aer_cap + PCI_ERR_CAP,
+4, &aer_capctl);
+   aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
+   eeh_ops->write_config(pdn, edev->aer_cap + PCI_ERR_CAP,
+ 4, aer_capctl);
+   }
+
+   return 0;
+}
+
 /**
  * pcibios_set_pcie_reset_state - Set PCI-E reset state
  * @dev: pci device struct
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 961e64115d92..0665b6d03cb3 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1655,70 +1655,11 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
return ret;
 }
 
-static int pnv_eeh_restore_vf_config(struct pci_dn *pdn)
-{
-   struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
-   u32 devctl, cmd, cap2, aer_capctl;
-   int old_mps;
-
-   if (edev->pcie_cap) {
-   /* Restore MPS */
-   old_mps = (ffs(pdn->mps) - 8) << 5;
-   eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
-2, &devctl);
-   devctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
-   devctl |= old_mps;
-   eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
- 2, devctl);
-
-   /* Disable Completion Timeout */
-

[PATCH v4 3/7] platforms/pseries: Set eeh_pe of EEH_PE_VF type

2018-01-05 Thread Bryant G. Ly
To correctly use the EEH code one has to make
sure that EEH_PE_VF is set for dynamically created
VFs. Therefore this patch allocates an eeh_pe of
EEH type EEH_PE_VF and associates the PE with its parent.

Signed-off-by: Bryant G. Ly 
Signed-off-by: Juan J. Alvarez 
---
 arch/powerpc/include/asm/pci-bridge.h|  5 -
 arch/powerpc/platforms/pseries/eeh_pseries.c | 17 +
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 9f66ddebb799..16d70740a76f 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -211,7 +211,10 @@ struct pci_dn {
unsigned int *pe_num_map;   /* PE# for the first VF PE or array */
boolm64_single_mode;/* Use M64 BAR in Single Mode */
#define IODA_INVALID_M64 (-1)
-   int (*m64_map)[PCI_SRIOV_NUM_BARS];
+   union {
+   int (*m64_map)[PCI_SRIOV_NUM_BARS]; /*Only used in powernv 
*/
+   int last_allow_rc;  /* Only used in pSeries */
+   };
 #endif /* CONFIG_PCI_IOV */
int mps;/* Maximum Payload Size */
struct list_head child_list;
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index a671ef4f57f5..ca6bbfd83701 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -55,9 +55,12 @@ static int ibm_get_config_addr_info;
 static int ibm_get_config_addr_info2;
 static int ibm_configure_pe;
 
+#ifdef CONFIG_PCI_IOV
 void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
 {
struct pci_dn *pdn = pci_get_pdn(pdev);
+   struct pci_dn *physfn_pdn;
+   struct eeh_dev *edev;
 
if (!pdev->is_virtfn)
return;
@@ -65,6 +68,14 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
pdn->device_id  =  pdev->device;
pdn->vendor_id  =  pdev->vendor;
pdn->class_code =  pdev->class;
+   /* Last allow unfreeze return code used for retrieval
+* by user space in eeh-sysfs to show the last command
+* completion from platform
+*/
+   pdn->last_allow_rc =  0;
+   physfn_pdn  =  pci_get_pdn(pdev->physfn);
+   pdn->pe_number  =  physfn_pdn->pe_num_map[pdn->vf_index];
+   edev = pdn_to_eeh_dev(pdn);
 
/*
 * The following operations will fail if VF's sysfs files
@@ -72,9 +83,13 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
 */
eeh_add_device_early(pdn);
eeh_add_device_late(pdev);
+   edev->pe_config_addr =  (pdn->busno << 16) | (pdn->devfn << 8);
+   eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */
+   eeh_add_to_parent_pe(edev);   /* Add as VF PE type */
eeh_sysfs_add_device(pdev);
 
 }
+#endif
 
 /*
  * Buffer for reporting slot-error-detail rtas calls. Its here
@@ -141,8 +156,10 @@ static int pseries_eeh_init(void)
/* Set EEH probe mode */
eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG);
 
+#ifdef CONFIG_PCI_IOV
/* Set EEH machine dependent code */
ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device;
+#endif
 
return 0;
 }
-- 
2.14.3 (Apple Git-98)



[PATCH v4 6/7] pseries/pci: Associate PEs to VFs in configure SR-IOV

2018-01-05 Thread Bryant G. Ly
After initial validation of SR-IOV resources, firmware will
associate PEs to the dynamic VFs created within this call. This
patch adds the association of PEs to the PF array of PE numbers
indexed by VF.

Signed-off-by: Bryant G. Ly 
Signed-off-by: Juan J. Alvarez 
---
 arch/powerpc/platforms/pseries/pci.c | 150 ++-
 1 file changed, 148 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/pci.c 
b/arch/powerpc/platforms/pseries/pci.c
index 48d3af026f90..eab96637d6cf 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -59,16 +59,162 @@ DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, 
pcibios_name_device);
 #endif
 
 #ifdef CONFIG_PCI_IOV
+#define MAX_VFS_FOR_MAP_PE 256
+struct pe_map_bar_entry {
+   __be64 bar;   /* Input:  Virtual Function BAR */
+   __be16 rid;   /* Input:  Virtual Function Router ID */
+   __be16 pe_num;/* Output: Virtual Function PE Number */
+   __be32 reserved;  /* Reserved Space */
+};
+
+int pseries_send_map_pe(struct pci_dev *pdev,
+   u16 num_vfs,
+   struct pe_map_bar_entry *vf_pe_array)
+{
+   struct pci_dn *pdn;
+   int rc;
+   unsigned long buid, addr;
+   int ibm_map_pes = rtas_token("ibm,open-sriov-map-pe-number");
+
+   if (ibm_map_pes == RTAS_UNKNOWN_SERVICE)
+   return -EINVAL;
+
+   pdn = pci_get_pdn(pdev);
+   addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+   buid = pdn->phb->buid;
+   spin_lock(&rtas_data_buf_lock);
+   memcpy(rtas_data_buf, vf_pe_array,
+  RTAS_DATA_BUF_SIZE);
+   rc = rtas_call(ibm_map_pes, 5, 1, NULL, addr,
+  BUID_HI(buid), BUID_LO(buid),
+  rtas_data_buf,
+  num_vfs * sizeof(struct pe_map_bar_entry));
+   memcpy(vf_pe_array, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+   spin_unlock(&rtas_data_buf_lock);
+
+   if (rc)
+   dev_err(&pdev->dev,
+   "%s: Failed to associate pes PE#%lx, rc=%x\n",
+   __func__,  addr, rc);
+
+   return rc;
+}
+
+void pseries_set_pe_num(struct pci_dev *pdev, u16 vf_index, __be16 pe_num)
+{
+   struct pci_dn *pdn;
+
+   pdn = pci_get_pdn(pdev);
+   pdn->pe_num_map[vf_index] = be16_to_cpu(pe_num);
+   dev_dbg(&pdev->dev, "VF %04x:%02x:%02x.%x associated with PE#%x\n",
+   pci_domain_nr(pdev->bus),
+   pdev->bus->number,
+   PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
+   PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)),
+   pdn->pe_num_map[vf_index]);
+}
+
+int pseries_associate_pes(struct pci_dev *pdev, u16 num_vfs)
+{
+   struct pci_dn *pdn;
+   int i, rc, vf_index;
+   struct pe_map_bar_entry *vf_pe_array;
+   struct resource *res;
+   u64 size;
+
+   vf_pe_array = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+   if (!vf_pe_array)
+   return -ENOMEM;
+
+   pdn = pci_get_pdn(pdev);
+   /* create firmware structure to associate pes */
+   for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+   pdn->pe_num_map[vf_index] = IODA_INVALID_PE;
+   for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+   res = &pdev->resource[i + PCI_IOV_RESOURCES];
+   if (!res->parent)
+   continue;
+   size = pcibios_iov_resource_alignment(pdev, i +
+   PCI_IOV_RESOURCES);
+   vf_pe_array[vf_index].bar =
+   cpu_to_be64(res->start + size * vf_index);
+   vf_pe_array[vf_index].rid =
+   cpu_to_be16((pci_iov_virtfn_bus(pdev, vf_index)
+   << 8) | pci_iov_virtfn_devfn(pdev,
+   vf_index));
+   vf_pe_array[vf_index].pe_num =
+   cpu_to_be16(IODA_INVALID_PE);
+   }
+   }
+
+   rc = pseries_send_map_pe(pdev, num_vfs, vf_pe_array);
+   /* Only zero is success */
+   if (!rc)
+   for (vf_index = 0; vf_index < num_vfs; vf_index++)
+   pseries_set_pe_num(pdev, vf_index,
+  vf_pe_array[vf_index].pe_num);
+
+   kfree(vf_pe_array);
+   return rc;
+}
+
+int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+   struct pci_dn *pdn;
+   intrc;
+   const int *max_vfs;
+   int max_config_vfs;
+   struct device_node *dn = pci_device_to_OF_node(pdev);
+
+   max_vfs = of_get_property(dn, "ibm,number-of-configurable-vfs", NULL);
+
+   if (!max_vfs)
+   return -EINVAL;
+
+   /* First integer 

[PATCH v4 4/7] powerpc/kernel: Add EEH operations to notify resume

2018-01-05 Thread Bryant G. Ly
When pseries SR-IOV is enabled and after a PF driver
has resumed from EEH, platform has to be notified
of the event so the child VFs can be allowed to
resume their normal recovery path.

This patch makes the EEH allow-unfreeze operation
platform dependent and adds the call to the
pseries EEH code.

Signed-off-by: Bryant G. Ly 
Signed-off-by: Juan J. Alvarez 
---
 arch/powerpc/include/asm/eeh.h   |  1 +
 arch/powerpc/platforms/powernv/eeh-powernv.c |  3 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 96 +++-
 3 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 82829c65f31a..fd37cc101f4f 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -214,6 +214,7 @@ struct eeh_ops {
int (*write_config)(struct pci_dn *pdn, int where, int size, u32 val);
int (*next_error)(struct eeh_pe **pe);
int (*restore_config)(struct pci_dn *pdn);
+   int (*notify_resume)(struct pci_dn *pdn);
 };
 
 extern int eeh_subsystem_flags;
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 0665b6d03cb3..33c86c1a1720 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1704,7 +1704,8 @@ static struct eeh_ops pnv_eeh_ops = {
.read_config= pnv_eeh_read_config,
.write_config   = pnv_eeh_write_config,
.next_error = pnv_eeh_next_error,
-   .restore_config = pnv_eeh_restore_config
+   .restore_config = pnv_eeh_restore_config,
+   .notify_resume  = NULL
 };
 
 #ifdef CONFIG_PCI_IOV
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index ca6bbfd83701..898bb055cb19 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -749,6 +749,97 @@ static int pseries_eeh_restore_config(struct pci_dn *pdn)
return ret;
 }
 
+#ifdef CONFIG_PCI_IOV
+int pseries_send_allow_unfreeze(struct pci_dn *pdn,
+   u16 *vf_pe_array, int cur_vfs)
+{
+   int rc;
+   int ibm_allow_unfreeze = rtas_token("ibm,open-sriov-allow-unfreeze");
+   unsigned long buid, addr;
+
+   addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+   buid = pdn->phb->buid;
+   spin_lock(&rtas_data_buf_lock);
+   memcpy(rtas_data_buf, vf_pe_array, RTAS_DATA_BUF_SIZE);
+   rc = rtas_call(ibm_allow_unfreeze, 5, 1, NULL,
+  addr,
+  BUID_HI(buid),
+  BUID_LO(buid),
+  rtas_data_buf, cur_vfs * sizeof(u16));
+   spin_unlock(&rtas_data_buf_lock);
+   if (rc)
+   pr_warn("%s: Failed to allow unfreeze for PHB#%x-PE#%lx, 
rc=%x\n",
+   __func__,
+   pdn->phb->global_number, addr, rc);
+   return rc;
+}
+
+static int pseries_call_allow_unfreeze(struct eeh_dev *edev)
+{
+   struct pci_dn *pdn, *tmp, *parent, *physfn_pdn;
+   int cur_vfs = 0, rc = 0, vf_index, bus, devfn;
+   u16 *vf_pe_array;
+
+   vf_pe_array = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+   if (!vf_pe_array)
+   return -ENOMEM;
+   if (pci_num_vf(edev->physfn ? edev->physfn : edev->pdev)) {
+   if (edev->pdev->is_physfn) {
+   cur_vfs = pci_num_vf(edev->pdev);
+   pdn = eeh_dev_to_pdn(edev);
+   parent = pdn->parent;
+   for (vf_index = 0; vf_index < cur_vfs; vf_index++)
+   vf_pe_array[vf_index] =
+   cpu_to_be16(pdn->pe_num_map[vf_index]);
+   rc = pseries_send_allow_unfreeze(pdn, vf_pe_array,
+cur_vfs);
+   pdn->last_allow_rc = rc;
+   for (vf_index = 0; vf_index < cur_vfs; vf_index++) {
+   list_for_each_entry_safe(pdn, tmp,
+&parent->child_list,
+list) {
+   bus = pci_iov_virtfn_bus(edev->pdev,
+vf_index);
+   devfn = pci_iov_virtfn_devfn(edev->pdev,
+vf_index);
+   if (pdn->busno != bus ||
+   pdn->devfn != devfn)
+   continue;
+   pdn->last_allow_rc = rc;
+   }
+  

[PATCH 2/3] powerpc/mm: Allow more than 16 low slices

2018-01-05 Thread Christophe Leroy
While the implementation of the "slices" address space allows
a significant number of high slices, it limits the number of
low slices to 16 due to the use of a single u64 low_slices element
in struct slice_mask.

In order to overcome this limitation, this patch switches the
handling of low_slices to BITMAPs as done already for high_slices.
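
In other words, roughly (illustrative sketch; the real declarations are
in the diff below):

/* before: a fixed-width integer in struct slice_mask */
u64 low_slices;
if (low_slices & (1ul << index))
	...

/* after: a bitmap sized by SLICE_NUM_LOW, handled like high_slices */
DECLARE_BITMAP(low_slices, SLICE_NUM_LOW);
if (test_bit(index, low_slices))
	...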

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/64/mmu.h |   2 +-
 arch/powerpc/include/asm/mmu-8xx.h   |   2 +-
 arch/powerpc/include/asm/paca.h  |   2 +-
 arch/powerpc/kernel/paca.c   |   3 +-
 arch/powerpc/mm/hash_utils_64.c  |  13 ++--
 arch/powerpc/mm/slb_low.S|   8 ++-
 arch/powerpc/mm/slice.c  | 102 +--
 7 files changed, 73 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index c9448e19847a..27e7e9732ea1 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -91,7 +91,7 @@ typedef struct {
struct npu_context *npu_context;
 
 #ifdef CONFIG_PPC_MM_SLICES
-   u64 low_slices_psize;   /* SLB page size encodings */
+   unsigned char low_slices_psize[8]; /* SLB page size encodings */
unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
unsigned long slb_addr_limit;
 #else
diff --git a/arch/powerpc/include/asm/mmu-8xx.h 
b/arch/powerpc/include/asm/mmu-8xx.h
index 5f89b6010453..d669d0062da4 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -171,7 +171,7 @@ typedef struct {
unsigned long vdso_base;
 #ifdef CONFIG_PPC_MM_SLICES
u16 user_psize; /* page size index */
-   u64 low_slices_psize;   /* page size encodings */
+   unsigned char low_slices_psize[8]; /* 16 slices */
unsigned char high_slices_psize[0];
unsigned long slb_addr_limit;
 #endif
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 3892db93b837..612017054825 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -141,7 +141,7 @@ struct paca_struct {
 #ifdef CONFIG_PPC_BOOK3S
mm_context_id_t mm_ctx_id;
 #ifdef CONFIG_PPC_MM_SLICES
-   u64 mm_ctx_low_slices_psize;
+   unsigned char mm_ctx_low_slices_psize[8];
unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE];
unsigned long mm_ctx_slb_addr_limit;
 #else
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index d6597038931d..8e1566bf82b8 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -264,7 +264,8 @@ void copy_mm_to_paca(struct mm_struct *mm)
 #ifdef CONFIG_PPC_MM_SLICES
VM_BUG_ON(!mm->context.slb_addr_limit);
get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit;
-   get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
+   memcpy(&get_paca()->mm_ctx_low_slices_psize,
+  &context->low_slices_psize, sizeof(context->low_slices_psize));
memcpy(&get_paca()->mm_ctx_high_slices_psize,
   &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
 #else /* CONFIG_PPC_MM_SLICES */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 3266b3326088..2f0c6b527a83 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1097,19 +1097,18 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, 
pte_t pte, int trap)
 #ifdef CONFIG_PPC_MM_SLICES
 static unsigned int get_paca_psize(unsigned long addr)
 {
-   u64 lpsizes;
-   unsigned char *hpsizes;
+   unsigned char *psizes;
unsigned long index, mask_index;
 
if (addr <= SLICE_LOW_TOP) {
-   lpsizes = get_paca()->mm_ctx_low_slices_psize;
+   psizes = get_paca()->mm_ctx_low_slices_psize;
index = GET_LOW_SLICE_INDEX(addr);
-   return (lpsizes >> (index * 4)) & 0xF;
+   } else {
+   psizes = get_paca()->mm_ctx_high_slices_psize;
+   index = GET_HIGH_SLICE_INDEX(addr);
}
-   hpsizes = get_paca()->mm_ctx_high_slices_psize;
-   index = GET_HIGH_SLICE_INDEX(addr);
mask_index = index & 0x1;
-   return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF;
+   return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
 }
 
 #else
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index 2cf5ef3fc50d..2c7c717fd2ea 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -200,10 +200,12 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
 5:
/*
 * Handle lpsizes
-* r9 is get_paca()->context.low_slices_psize, r11 is index
+* r9 is get_paca()->context.low_slices_psize[index], r11 is mask_index
 */
-   ld  r9,PACALOWSLICESPSIZE(r13)
-   mr  r11,r10
+   srdi r11,r10,1 /* index */
+   

[PATCH v4 2/7] linux/pci: Add uevents in AER and EEH error/resume

2018-01-05 Thread Bryant G. Ly
Devices can go offline when errors are reported. This
patch adds a change event to the kernel object and lets udev
know of the error. When the device resumes, a change event is
also emitted reporting the device as online. EEH and AER events
are thereby better propagated to user space for PCI devices on
all arches.
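
As an illustration (not part of this patch), user space could watch for
these events with libudev along these lines; a minimal sketch, built
with -ludev:

#include <libudev.h>
#include <poll.h>
#include <stdio.h>

int main(void)
{
	struct udev *udev = udev_new();
	struct udev_monitor *mon = udev_monitor_new_from_netlink(udev, "udev");
	struct pollfd pfd;

	udev_monitor_filter_add_match_subsystem_devtype(mon, "pci", NULL);
	udev_monitor_enable_receiving(mon);
	pfd.fd = udev_monitor_get_fd(mon);
	pfd.events = POLLIN;

	for (;;) {
		struct udev_device *dev;
		const char *ev;

		if (poll(&pfd, 1, -1) <= 0)
			continue;
		dev = udev_monitor_receive_device(mon);
		if (!dev)
			continue;
		ev = udev_device_get_property_value(dev, "ERROR_EVENT");
		if (ev)
			printf("%s: ERROR_EVENT=%s\n",
			       udev_device_get_syspath(dev), ev);
		udev_device_unref(dev);
	}
}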

Signed-off-by: Bryant G. Ly 
Signed-off-by: Juan J. Alvarez 
Acked-by: Bjorn Helgaas 
---
 arch/powerpc/kernel/eeh_driver.c   |  6 ++
 drivers/pci/pcie/aer/aerdrv_core.c |  3 +++
 include/linux/pci.h| 36 
 3 files changed, 45 insertions(+)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 3c0fa99c5533..beea2182d754 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -228,6 +228,7 @@ static void *eeh_report_error(void *data, void *userdata)
 
edev->in_error = true;
eeh_pcid_put(dev);
+   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
return NULL;
 }
 
@@ -381,6 +382,10 @@ static void *eeh_report_resume(void *data, void *userdata)
driver->err_handler->resume(dev);
 
eeh_pcid_put(dev);
+   pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
+#ifdef CONFIG_PCI_IOV
+   eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
+#endif
return NULL;
 }
 
@@ -416,6 +421,7 @@ static void *eeh_report_failure(void *data, void *userdata)
driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
 
eeh_pcid_put(dev);
+   pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
return NULL;
 }
 
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 744805232155..8d7448063fd1 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -278,6 +278,7 @@ static int report_error_detected(struct pci_dev *dev, void 
*data)
} else {
err_handler = dev->driver->err_handler;
vote = err_handler->error_detected(dev, result_data->state);
+   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
}
 
result_data->result = merge_result(result_data->result, vote);
@@ -341,6 +342,7 @@ static int report_resume(struct pci_dev *dev, void *data)
 
err_handler = dev->driver->err_handler;
err_handler->resume(dev);
+   pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
 out:
device_unlock(>dev);
return 0;
@@ -541,6 +543,7 @@ static void do_recovery(struct pci_dev *dev, int severity)
return;
 
 failed:
+   pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
/* TODO: Should kernel panic here? */
dev_info(>dev, "AER: Device recovery failed\n");
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e3e94467687a..405630441b74 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2277,6 +2277,42 @@ static inline bool pci_is_thunderbolt_attached(struct 
pci_dev *pdev)
return false;
 }
 
+/**
+ * pci_uevent_ers - emit a uevent during recovery path of pci device
+ * @pdev: pci device to check
+ * @err_type: type of error event
+ *
+ */
+static inline void pci_uevent_ers(struct pci_dev *pdev,
+ enum  pci_ers_result err_type)
+{
+   int idx = 0;
+   char *envp[3];
+
+   switch (err_type) {
+   case PCI_ERS_RESULT_NONE:
+   case PCI_ERS_RESULT_CAN_RECOVER:
+   envp[idx++] = "ERROR_EVENT=BEGIN_RECOVERY";
+   envp[idx++] = "DEVICE_ONLINE=0";
+   break;
+   case PCI_ERS_RESULT_RECOVERED:
+   envp[idx++] = "ERROR_EVENT=SUCCESSFUL_RECOVERY";
+   envp[idx++] = "DEVICE_ONLINE=1";
+   break;
+   case PCI_ERS_RESULT_DISCONNECT:
+   envp[idx++] = "ERROR_EVENT=FAILED_RECOVERY";
+   envp[idx++] = "DEVICE_ONLINE=0";
+   break;
+   default:
+   break;
+   }
+
+   if (idx > 0) {
+   envp[idx++] = NULL;
+   kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, envp);
+   }
+}
+
 /* provide the legacy pci_dma_* API */
 #include 
 
-- 
2.14.3 (Apple Git-98)



[PATCH v4 0/7] SR-IOV Enablement on PowerVM

2018-01-05 Thread Bryant G. Ly
This patch series will enable SR-IOV on PowerVM. A specific set of
lids for PFW/PHYP is required. They are currently planned to
release with 920.

For IBM internal testers let me know of a system you want to test on
and we can put on the lids required or we can provide a system to run
the tests.

This patch depends on the three patches:
988fc3ba5653278a8c14d6ccf687371775930d2b
dae7253f9f78a731755ca20c66b2d2c40b86baea
608c0d8804ef3ca4cda8ec6ad914e47deb283d7b

v1 - Initial Patch
v2 - Addressed Alexey and Russell's comments
v3 - Unify the call of .error_detected()
v4 - Fixed subject and change log per Bjorn's comments and
 fixed Alexey's comments

Bryant G. Ly (7):
  platform/pseries: Update VF config space after EEH
  linux/pci: Add uevents in AER and EEH error/resume
  platforms/pseries: Set eeh_pe of EEH_PE_VF type
  powerpc/kernel: Add EEH operations to notify resume
  powerpc/kernel: Add EEH notify resume sysfs
  pseries/pci: Associate PEs to VFs in configure SR-IOV
  pseries/setup: Add Initialization of VF Bars

 arch/powerpc/include/asm/eeh.h   |   2 +
 arch/powerpc/include/asm/pci-bridge.h|   5 +-
 arch/powerpc/include/asm/pci.h   |   2 +
 arch/powerpc/kernel/eeh.c|  59 ++
 arch/powerpc/kernel/eeh_driver.c |   6 +
 arch/powerpc/kernel/eeh_sysfs.c  |  45 
 arch/powerpc/kernel/pci_of_scan.c|   2 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  68 +--
 arch/powerpc/platforms/pseries/eeh_pseries.c | 137 +-
 arch/powerpc/platforms/pseries/pci.c | 150 +++-
 arch/powerpc/platforms/pseries/setup.c   | 164 +++
 drivers/pci/pcie/aer/aerdrv_core.c   |   3 +
 include/linux/pci.h  |  36 ++
 13 files changed, 611 insertions(+), 68 deletions(-)

-- 
2.14.3 (Apple Git-98)



[PATCH v4 7/7] pseries/setup: Add Initialization of VF Bars

2018-01-05 Thread Bryant G. Ly
When enabling SR-IOV on the pseries platform,
the VF BAR properties for a PF are reported in
the device node in the device tree.

This patch adds the IOV BAR resources from the
device tree to Linux structures for later use
when SR-IOV is configured by the PF driver.

Signed-off-by: Bryant G. Ly 
Signed-off-by: Juan J. Alvarez 
---
 arch/powerpc/include/asm/pci.h |   2 +
 arch/powerpc/kernel/pci_of_scan.c  |   2 +-
 arch/powerpc/platforms/pseries/setup.c | 164 +
 3 files changed, 167 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 8dc32eacc97c..d82802ff5088 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -121,6 +121,8 @@ extern int remove_phb_dynamic(struct pci_controller *phb);
 extern struct pci_dev *of_create_pci_dev(struct device_node *node,
struct pci_bus *bus, int devfn);
 
+extern unsigned int pci_parse_of_flags(u32 addr0, int bridge);
+
 extern void of_scan_pci_bridge(struct pci_dev *dev);
 
 extern void of_scan_bus(struct device_node *node, struct pci_bus *bus);
diff --git a/arch/powerpc/kernel/pci_of_scan.c 
b/arch/powerpc/kernel/pci_of_scan.c
index 0d790f8432d2..20ceec4a5f5e 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -38,7 +38,7 @@ static u32 get_int_prop(struct device_node *np, const char 
*name, u32 def)
  * @addr0: value of 1st cell of a device tree PCI address.
  * @bridge: Set this flag if the address is from a bridge 'ranges' property
  */
-static unsigned int pci_parse_of_flags(u32 addr0, int bridge)
+unsigned int pci_parse_of_flags(u32 addr0, int bridge)
 {
unsigned int flags = 0;
 
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 1d6e2de2445c..e8f523cb5526 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -459,6 +459,162 @@ static void __init find_and_init_phbs(void)
of_pci_check_probe_only();
 }
 
+#ifdef CONFIG_PCI_IOV
+enum rtas_iov_fw_value_map {
+   NUM_RES_PROPERTY  = 0, /* Number of Resources */
+   LOW_INT   = 1, /* Lowest 32 bits of Address */
+   START_OF_ENTRIES  = 2, /* Always start of entry */
+   APERTURE_PROPERTY = 2, /* Start of entry+ to  Aperture Size */
+   WDW_SIZE_PROPERTY = 4, /* Start of entry+ to Window Size */
+   NEXT_ENTRY= 7  /* Go to next entry on array */
+};
+
+enum get_iov_fw_value_index {
+   BAR_ADDRS = 1,/*  Get Bar Address */
+   APERTURE_SIZE = 2,/*  Get Aperture Size */
+   WDW_SIZE  = 3 /*  Get Window Size */
+};
+
+resource_size_t pseries_get_iov_fw_value(struct pci_dev *dev, int resno,
+enum get_iov_fw_value_index value)
+{
+   const int *indexes;
+   struct device_node *dn = pci_device_to_OF_node(dev);
+   int i, num_res, ret = 0;
+
+   indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
+   if (!indexes)
+   return  0;
+
+   /*
+* First element in the array is the number of Bars
+* returned.  Search through the list to find the matching
+* bar
+*/
+   num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
+   if (resno >= num_res)
+   return 0; /* or an error */
+
+   i = START_OF_ENTRIES + NEXT_ENTRY * resno;
+   switch (value) {
+   case BAR_ADDRS:
+   ret = of_read_number(&indexes[i], 2);
+   break;
+   case APERTURE_SIZE:
+   ret = of_read_number(&indexes[i + APERTURE_PROPERTY], 2);
+   break;
+   case WDW_SIZE:
+   ret = of_read_number(&indexes[i + WDW_SIZE_PROPERTY], 2);
+   break;
+   }
+
+   return ret;
+}
+
+void of_pci_set_vf_bar_size(struct pci_dev *dev, const int *indexes)
+{
+   struct resource *res;
+   resource_size_t base, size;
+   int i, r, num_res;
+
+   num_res = of_read_number(&indexes[NUM_RES_PROPERTY], 1);
+   num_res = min_t(int, num_res, PCI_SRIOV_NUM_BARS);
+   for (i = START_OF_ENTRIES, r = 0; r < num_res && r < PCI_SRIOV_NUM_BARS;
+i += NEXT_ENTRY, r++) {
+   res = &dev->resource[r + PCI_IOV_RESOURCES];
+   base = of_read_number(&indexes[i], 2);
+   size = of_read_number(&indexes[i + APERTURE_PROPERTY], 2);
+   res->flags = pci_parse_of_flags(of_read_number
+   (&indexes[i + LOW_INT], 1), 0);
+   res->flags |= (IORESOURCE_MEM_64 | IORESOURCE_PCI_FIXED);
+   res->name = pci_name(dev);
+   res->start = base;
+   res->end = base + size - 1;
+   }
+}
+
+void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes)
+{
+   struct resource *res, *root, *conflict;
+   resource_size_t base, size;
+

[PATCH v4 5/7] powerpc/kernel: Add EEH notify resume sysfs

2018-01-05 Thread Bryant G. Ly
Introduce a method for notify resume to be
called from sysfs. With this patch one can
now call notify resume from sysfs when it
is supported by the platform.

Signed-off-by: Bryant G. Ly 
Signed-off-by: Juan J. Alvarez 
---
 arch/powerpc/kernel/eeh_sysfs.c | 45 +
 1 file changed, 45 insertions(+)

diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c
index 797549289798..9c513abc102c 100644
--- a/arch/powerpc/kernel/eeh_sysfs.c
+++ b/arch/powerpc/kernel/eeh_sysfs.c
@@ -90,6 +90,38 @@ static ssize_t eeh_pe_state_store(struct device *dev,
 
 static DEVICE_ATTR_RW(eeh_pe_state);
 
+#ifdef CONFIG_PCI_IOV
+static ssize_t eeh_notify_resume_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+   struct pci_dev *pdev = to_pci_dev(dev);
+   struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+   struct pci_dn *pdn = pci_get_pdn(pdev);
+
+   if (!edev || !edev->pe)
+   return -ENODEV;
+
+   pdn = pci_get_pdn(pdev);
+   return sprintf(buf, "%d\n", pdn->last_allow_rc);
+}
+
+static ssize_t eeh_notify_resume_store(struct device *dev,
+  struct device_attribute *attr,
+  const char *buf, size_t count)
+{
+   struct pci_dev *pdev = to_pci_dev(dev);
+   struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+
+   if (!edev || !edev->pe)
+   return -ENODEV;
+
+   if (eeh_ops->notify_resume(pci_get_pdn(pdev)))
+   return -EIO;
+   return count;
+}
+static DEVICE_ATTR_RW(eeh_notify_resume);
+#endif
+
 void eeh_sysfs_add_device(struct pci_dev *pdev)
 {
struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
@@ -105,6 +137,13 @@ void eeh_sysfs_add_device(struct pci_dev *pdev)
rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_state);
 
+#ifdef CONFIG_PCI_IOV
+   if (of_get_property(pci_device_to_OF_node
+   ((pdev->is_physfn ? pdev : pdev->physfn)),
+   "ibm,is-open-sriov-pf", NULL))
+   rc += device_create_file(&pdev->dev,
+&dev_attr_eeh_notify_resume);
+#endif
if (rc)
pr_warn("EEH: Unable to create sysfs entries\n");
else if (edev)
@@ -128,6 +167,12 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev)
device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
device_remove_file(&pdev->dev, &dev_attr_eeh_pe_state);
+#ifdef CONFIG_PCI_IOV
+   if (of_get_property(pci_device_to_OF_node
+   ((pdev->is_physfn ? pdev : pdev->physfn)),
+   "ibm,is-open-sriov-pf", NULL))
+   device_remove_file(&pdev->dev, &dev_attr_eeh_notify_resume);
+#endif
 
if (edev)
edev->mode &= ~EEH_DEV_SYSFS;
-- 
2.14.3 (Apple Git-98)



Re: [RFC FIX v1 1/2] powerpc: Discover radix availability before scanning the memory nodes

2018-01-05 Thread Aneesh Kumar K.V



On 01/05/2018 04:35 PM, Bharata B Rao wrote:

Currently device tree nodes for memory are scanned before the
radix feature is discovered in mmu_early_init_devtree(). Move this
routine ahead of scanning memory nodes so that we know if the
guest is radix or not when scanning ibm,dynamic-reconfiguration-memory.

Signed-off-by: Bharata B Rao 
---
  arch/powerpc/kernel/prom.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index b15bae2..079d893 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -722,6 +722,8 @@ void __init early_init_devtree(void *params)
 */
of_scan_flat_dt(early_init_dt_scan_chosen_ppc, boot_command_line);

+   mmu_early_init_devtree();
+
/* Scan memory nodes and rebuild MEMBLOCKs */
of_scan_flat_dt(early_init_dt_scan_root, NULL);
of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
@@ -783,8 +785,6 @@ void __init early_init_devtree(void *params)
spinning_secondaries = boot_cpu_count - 1;
  #endif

-   mmu_early_init_devtree();
-
  #ifdef CONFIG_PPC_POWERNV
/* Scan and build the list of machine check recoverable ranges */
of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL);



I guess that will cause issues with pa-feature scanning. I don't think 
we finalize cpu features/mmu features that early.


-aneesh



[RFC FIX v1 2/2] powerpc: Fix memory unplug failure on radix guest

2018-01-05 Thread Bharata B Rao
For a PowerKVM guest, it is possible to explicitly specify a DIMM device
in addition to the system RAM at boot time. When such a cold plugged DIMM
device is removed from a radix guest, we hit the following warning in the
guest kernel resulting in the eventual failure of memory unplug:

remove_pud_table: unaligned range
WARNING: CPU: 3 PID: 164 at arch/powerpc/mm/pgtable-radix.c:597 
remove_pagetable+0x468/0xca0
Call Trace:
remove_pagetable+0x464/0xca0 (unreliable)
radix__remove_section_mapping+0x24/0x40
remove_section_mapping+0x28/0x60
arch_remove_memory+0xcc/0x120
remove_memory+0x1ac/0x270
dlpar_remove_lmb+0x1ac/0x210
dlpar_memory+0xbc4/0xeb0
pseries_hp_work_fn+0x1a4/0x230
process_one_work+0x1cc/0x660
worker_thread+0xac/0x6d0
kthread+0x16c/0x1b0
ret_from_kernel_thread+0x5c/0x74

The DIMM memory that is cold plugged gets merged to the same memblock
region as RAM and hence gets mapped at 1G alignment. However since the
removal is done for one LMB (lmb size 256MB) at a time, the address
of the LMB (which is 256MB aligned) would get flagged as unaligned
in remove_pud_table() resulting in the above failure.
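
The warning comes from an alignment test on the PUD span, roughly of
this shape (a sketch of the logic, not the exact source):

/* a 256MB-aligned LMB inside a 1G mapping trips this */
if (!IS_ALIGNED(addr, PUD_SIZE) || !IS_ALIGNED(end, PUD_SIZE)) {
	WARN_ONCE(1, "%s: unaligned range\n", __func__);
	continue;
}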

This problem is not seen for hot plugged memory because for the
hot plugged memory, the mappings are created separately for each
LMB and hence they all get aligned at 256MB.

To fix this problem for the cold plugged memory, let us mark the
cold plugged memblock region explicitly as hotplugged so that the
region doesn't get merged with RAM. All the memory that is discovered
via ibm,dynamic-reconfiguration-memory is marked so (1). Next, identify
such regions in radix_init_pgtable() and create separate mappings
within each such region for each LMB so that they don't get aligned
at 1G like the RAM region (2).

(1) The effect of marking the memory as hotplugged is that the
marked memory falls into ZONE_MOVABLE if the movable_node kernel command
line option is enabled. This means no kernel allocations can occur from
this memory. This should be reasonable to expect for hotplugged memory but
has an undesirable effect on PowerVM. On PowerVM, all the memory except RMA
is represented via ibm,dynamic-reconfiguration-memory and hence we can't
mark that entire memory as hotpluggable and movable. However since radix
isn't supported on PowerVM, we make this marking conditional to radix
so that PowerVM isn't affected.

For PowerKVM guests, all boot time memory is represented via
memory@ nodes and hot plugged/pluggable memory is represented via
ibm,dynamic-reconfiguration-memory property. We are marking all
the memory that is in ASSIGNED state during boot as hotplugged.
With this, only cold plugged memory gets marked for PowerKVM.

(2) To create separate mappings for every LMB in the hot plugged
region, we need lmb-size. I am currently using memory_block_size_bytes()
API to get the lmb-size. Since this is early init time code, the
machine type isn't probed yet and hence memory_block_size_bytes()
would return the default LMB size of 16MB. Hence we end up creating
separate mappings at a much lower granularity than what we could ideally
do for a pseries machine.
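
For example (hypothetical numbers): a 2G cold plugged region with 256MB
LMBs would ideally get 2G / 256MB = 8 separate mappings, but with the
default 16MB block size reported this early it gets 2G / 16MB = 128.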

Signed-off-by: Bharata B Rao 
---
 arch/powerpc/kernel/prom.c  |  2 ++
 arch/powerpc/mm/pgtable-radix.c | 17 ++---
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 079d893..2ad8fb1 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -525,6 +525,8 @@ static int __init early_init_dt_scan_drconf_memory(unsigned 
long node)
size = 0x8000ul - base;
}
memblock_add(base, size);
+   if (early_radix_enabled())
+   memblock_mark_hotplug(base, size);
} while (--rngs);
}
memblock_dump_all();
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index cfbbee9..10ceced 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -278,15 +279,25 @@ static void __init radix_init_pgtable(void)
 {
unsigned long rts_field;
struct memblock_region *reg;
+   phys_addr_t addr;
+   u64 lmb_size = memory_block_size_bytes();
 
/* We don't support slb for radix */
mmu_slb_size = 0;
/*
 * Create the linear mapping, using standard page size for now
 */
-   for_each_memblock(memory, reg)
-   WARN_ON(create_physical_mapping(reg->base,
-   reg->base + reg->size));
+   for_each_memblock(memory, reg) {
+   if (memblock_is_hotpluggable(reg)) {
+   for (addr = reg->base; addr < (reg->base + reg->size);
+   addr += lmb_size)
+   

[RFC FIX v1 1/2] powerpc: Discover radix availability before scanning the memory nodes

2018-01-05 Thread Bharata B Rao
Currently device tree nodes for memory are scanned before the
radix feature is discovered in mmu_early_init_devtree(). Move this
routine ahead of scanning memory nodes so that we know if the
guest is radix or not when scanning ibm,dynamic-reconfiguration-memory.

Signed-off-by: Bharata B Rao 
---
 arch/powerpc/kernel/prom.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index b15bae2..079d893 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -722,6 +722,8 @@ void __init early_init_devtree(void *params)
 */
of_scan_flat_dt(early_init_dt_scan_chosen_ppc, boot_command_line);
 
+   mmu_early_init_devtree();
+
/* Scan memory nodes and rebuild MEMBLOCKs */
of_scan_flat_dt(early_init_dt_scan_root, NULL);
of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
@@ -783,8 +785,6 @@ void __init early_init_devtree(void *params)
spinning_secondaries = boot_cpu_count - 1;
 #endif
 
-   mmu_early_init_devtree();
-
 #ifdef CONFIG_PPC_POWERNV
/* Scan and build the list of machine check recoverable ranges */
of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL);
-- 
2.7.4



[RFC FIX v1 0/2] powerpc: Fix memory unplug failure for radix guests

2018-01-05 Thread Bharata B Rao
This is an RFC patchset to fix the call trace observed during memory
unplug for radix guests. The problem and the fix is described in patch 2/2.

The ideal fix for this is to break the bigger radix mapping into smaller
mappings during memory hot removal, but in the meantime I am posting
an alternate and easier solution which, of course, has its own
limitations (mentioned in 2/2).

Changes in v1:
--
- Move mmu_early_init_devtree() ahead of scanning memory DT nodes
  so that we know if the guest is radix or not when scanning
  memory nodes.
- Make the marking of hotpluggable only if the guest is radix.

v0: http://patchwork.ozlabs.org/patch/800142/

Bharata B Rao (2):
  powerpc: Discover radix availability before scanning the memory nodes
  powerpc: Fix memory unplug failure on radix guest

 arch/powerpc/kernel/prom.c  |  6 --
 arch/powerpc/mm/pgtable-radix.c | 17 ++---
 2 files changed, 18 insertions(+), 5 deletions(-)

-- 
2.7.4