[PATCH 07/14] MIPS: memblock: Mark present sparsemem sections

2018-01-17 Thread Serge Semin
If sparsemem is activated, all sections with present pages must
be marked accordingly once memblock is fully initialized.

Signed-off-by: Serge Semin 
---
 arch/mips/kernel/setup.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index b121fa702..6df1eaf38 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -778,7 +778,7 @@ static void __init request_crashkernel(struct resource *res)
 
 static void __init arch_mem_init(char **cmdline_p)
 {
-   struct memblock_region *reg;
+   struct memblock_region *reg __maybe_unused;
extern void plat_mem_setup(void);
 
/* call board setup routine */
@@ -860,6 +860,11 @@ static void __init arch_mem_init(char **cmdline_p)
 crashk_res.end - crashk_res.start + 1);
 #endif
device_tree_init();
+#ifdef CONFIG_SPARSEMEM
+   for_each_memblock(memory, reg)
+   memory_present(0, memblock_region_memory_base_pfn(reg),
+   memblock_region_memory_end_pfn(reg));
+#endif /* CONFIG_SPARSEMEM */
sparse_init();
plat_swiotlb_setup();
 
-- 
2.12.0



[PATCH 05/14] MIPS: memblock: Add reserved memory regions to memblock

2018-01-17 Thread Serge Semin
The memory reservation has to be performed for all the crucial
objects like the kernel itself, its data and the fdt blob. FDT
reserved-memory nodes should also be scanned to declare or discard
reserved memory regions, but this has to be done after memblock is
fully initialized with low/high RAM (see the function
description/code).
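
In short, the ordering established below boils down to this sketch
(condensed from the diff; not a literal quote of the code):

        /* 1. the kernel image and its data */
        memblock_reserve(__pa_symbol(&_text),
                         __pa_symbol(&_end) - __pa_symbol(&_text));
        /* 2. the FDT blob itself */
        early_init_fdt_reserve_self();
        /* 3. only then the /reserved-memory nodes it describes */
        early_init_fdt_scan_reserved_mem();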

Signed-off-by: Serge Semin 
---
 arch/mips/kernel/setup.c | 96 +-
 1 file changed, 54 insertions(+), 42 deletions(-)

diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 0ffbc3bb5..9e14d9833 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -362,6 +362,10 @@ static unsigned long __init init_initrd(void)
 static void __init bootmem_init(void)
 {
init_initrd();
+}
+
+static void __init reservation_init(void)
+{
finalize_initrd();
 }
 
@@ -478,54 +482,58 @@ static void __init bootmem_init(void)
memblock_add_node(PFN_PHYS(start), PFN_PHYS(end - start), 0);
}
memblock_set_current_limit(PFN_PHYS(max_low_pfn));
+}
+
+static void __init reservation_init(void)
+{
+   phys_addr_t size;
+   int i;
 
/*
-* Register fully available low RAM pages with the bootmem allocator.
+* Reserve memory occupied by the kernel and its data
 */
-   for (i = 0; i < boot_mem_map.nr_map; i++) {
-   unsigned long start, end, size;
+   size = __pa_symbol(&_end) - __pa_symbol(&_text);
+   memblock_reserve(__pa_symbol(&_text), size);
 
-   start = PFN_UP(boot_mem_map.map[i].addr);
-   end   = PFN_DOWN(boot_mem_map.map[i].addr
-   + boot_mem_map.map[i].size);
+   /*
+* Handle FDT and its reserved-memory nodes now
+*/
+   early_init_fdt_reserve_self();
+   early_init_fdt_scan_reserved_mem();
 
-   /*
-* Reserve usable memory.
-*/
-   switch (boot_mem_map.map[i].type) {
-   case BOOT_MEM_RAM:
-   break;
-   case BOOT_MEM_INIT_RAM:
-   memory_present(0, start, end);
-   continue;
-   default:
-   /* Not usable memory */
-   if (start > min_low_pfn && end < max_low_pfn)
-   reserve_bootmem(boot_mem_map.map[i].addr,
-   boot_mem_map.map[i].size,
-   BOOTMEM_DEFAULT);
-   continue;
-   }
+   /*
+* Reserve requested memory ranges with the memblock allocator.
+*/
+   for (i = 0; i < boot_mem_map.nr_map; i++) {
+   phys_addr_t start, end;
 
-   /*
-* We are rounding up the start address of usable memory
-* and at the end of the usable range downwards.
-*/
-   if (start >= max_low_pfn)
+   if (boot_mem_map.map[i].type == BOOT_MEM_RAM)
continue;
-   if (end > max_low_pfn)
-   end = max_low_pfn;
+
+   start = boot_mem_map.map[i].addr;
+   end   = boot_mem_map.map[i].addr + boot_mem_map.map[i].size;
+   size  = boot_mem_map.map[i].size;
 
/*
-* ... finally, is the area going away?
+* Make sure the region isn't already reserved
 */
-   if (end <= start)
+   if (memblock_is_region_reserved(start, size)) {
+   pr_warn("Reserved region %08zx @ %pa already in-use\n",
+   (size_t)size, &start);
continue;
-   size = end - start;
+   }
 
-   /* Register lowmem ranges */
-   free_bootmem(PFN_PHYS(start), size << PAGE_SHIFT);
-   memory_present(0, start, end);
+   switch (boot_mem_map.map[i].type) {
+   case BOOT_MEM_ROM_DATA:
+   case BOOT_MEM_RESERVED:
+   case BOOT_MEM_INIT_RAM:
+   memblock_reserve(start, size);
+   break;
+   case BOOT_MEM_RESERVED_NOMAP:
+   default:
+   memblock_remove(start, size);
+   break;
+   }
}
 
 #ifdef CONFIG_RELOCATABLE
@@ -555,6 +563,12 @@ static void __init bootmem_init(void)
 * Reserve initrd memory if needed.
 */
finalize_initrd();
+
+   /*
+* Reserve for hibernation
+*/
+   size = __pa_symbol(&__nosave_end) - __pa_symbol(&__nosave_begin);
+   memblock_reserve(__pa_symbol(&__nosave_begin), size);
 }
 
 #endif /* CONFIG_SGI_IP27 */
@@ -569,6 +583,7 @@ static void __init bootmem_init(void)
  * kernel but generic memory management system is still entirely uninitialized.
  *

[PATCH 11/14] MIPS: memblock: Print out kernel virtual mem layout

2018-01-17 Thread Serge Semin
It is useful to have the kernel virtual memory layout printed
at boot time so that full information about the booted kernel
is available. In some cases it might be unsafe to have virtual
addresses freely visible in logs, so the %pK format specifier is
used if one wants to hide them.
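
As a minimal illustration (not taken from the patch): with
/proc/sys/kernel/kptr_restrict set to 1 or 2, %pK prints zeroed
addresses for unprivileged readers, while plain %p on kernels of
this vintage would print the raw pointer:

        pr_notice("vmalloc : 0x%pK - 0x%pK\n",
                  (void *)VMALLOC_START, (void *)VMALLOC_END);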

Signed-off-by: Serge Semin 
---
 arch/mips/mm/init.c | 47 +++
 1 file changed, 47 insertions(+)

diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 15040266b..d3e6bb531 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -60,6 +61,51 @@ EXPORT_SYMBOL_GPL(empty_zero_page);
 EXPORT_SYMBOL(zero_page_mask);
 
 /*
+ * Print out the kernel virtual memory layout
+ */
+#define MLK(b, t) (void *)b, (void *)t, ((t) - (b)) >> 10
+#define MLM(b, t) (void *)b, (void *)t, ((t) - (b)) >> 20
+#define MLK_ROUNDUP(b, t) (void *)b, (void *)t, DIV_ROUND_UP(((t) - (b)), SZ_1K)
+static void __init __maybe_unused mem_print_kmap_info(void)
+{
+   pr_notice("Kernel virtual memory layout:\n"
+ "lowmem  : 0x%pK - 0x%pK  (%4ld MB)\n"
+ "  .text : 0x%pK - 0x%pK  (%4td kB)\n"
+ "  .data : 0x%pK - 0x%pK  (%4td kB)\n"
+ "  .init : 0x%pK - 0x%pK  (%4td kB)\n"
+ "  .bss  : 0x%pK - 0x%pK  (%4td kB)\n"
+ "vmalloc : 0x%pK - 0x%pK  (%4ld MB)\n"
+#ifdef CONFIG_HIGHMEM
+ "pkmap   : 0x%pK - 0x%pK  (%4ld MB)\n"
+#endif
+ "fixmap  : 0x%pK - 0x%pK  (%4ld kB)\n",
+ MLM(PAGE_OFFSET, (unsigned long)high_memory),
+ MLK_ROUNDUP(_text, _etext),
+ MLK_ROUNDUP(_sdata, _edata),
+ MLK_ROUNDUP(__init_begin, __init_end),
+ MLK_ROUNDUP(__bss_start, __bss_stop),
+ MLM(VMALLOC_START, VMALLOC_END),
+#ifdef CONFIG_HIGHMEM
+ MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP)*(PAGE_SIZE)),
+#endif
+ MLK(FIXADDR_START, FIXADDR_TOP));
+
+   /* Check some fundamental inconsistencies. May add something else? */
+#ifdef CONFIG_HIGHMEM
+   BUILD_BUG_ON(VMALLOC_END < PAGE_OFFSET);
+   BUG_ON(VMALLOC_END < (unsigned long)high_memory);
+   BUILD_BUG_ON((PKMAP_BASE) + (LAST_PKMAP)*(PAGE_SIZE) < PAGE_OFFSET);
+   BUG_ON((PKMAP_BASE) + (LAST_PKMAP)*(PAGE_SIZE) <
+   (unsigned long)high_memory);
+#endif
+   BUILD_BUG_ON(FIXADDR_TOP < PAGE_OFFSET);
+   BUG_ON(FIXADDR_TOP < (unsigned long)high_memory);
+}
+#undef MLK
+#undef MLM
+#undef MLK_ROUNDUP
+
+/*
  * Not static inline because used by IP27 special magic initialization code
  */
 void setup_zero_pages(void)
@@ -468,6 +514,7 @@ void __init mem_init(void)
free_all_bootmem();
setup_zero_pages(); /* Setup zeroed pages.  */
mem_init_free_highmem();
+   mem_print_kmap_info();
mem_init_print_info(NULL);
 
 #ifdef CONFIG_64BIT
-- 
2.12.0



[PATCH 08/14] MIPS: memblock: Simplify DMA contiguous reservation

2018-01-17 Thread Serge Semin
CMA reserves its areas in the memblock allocator. Since we aren't
using bootmem anymore, copying those reservations over to bootmem
should be discarded.

Signed-off-by: Serge Semin 
---
 arch/mips/kernel/setup.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 6df1eaf38..e0ca0d2bc 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -869,10 +869,6 @@ static void __init arch_mem_init(char **cmdline_p)
plat_swiotlb_setup();
 
dma_contiguous_reserve(PFN_PHYS(max_low_pfn));
-   /* Tell bootmem about cma reserved memblock section */
-   for_each_memblock(reserved, reg)
-   if (reg->size != 0)
-   reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT);
 }
 
 static void __init resource_init(void)
-- 
2.12.0



[PATCH 10/14] MIPS: memblock: Perform early low memory test

2018-01-17 Thread Serge Semin
Low memory can be tested at this point, since all the
reservations have just been completed and not many additional
allocations have happened yet.
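
Note that early_memtest() is a no-op unless CONFIG_MEMTEST is
enabled and a pattern count is given on the kernel command line
(e.g. "memtest=4"); regions that fail the test get
memblock_reserve()'d so the page allocator never sees them. A
sketch of the call added below:

        /* walks free low RAM with N test patterns, reserving bad areas */
        early_memtest(PFN_PHYS(min_low_pfn), PFN_PHYS(max_low_pfn));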

Signed-off-by: Serge Semin 
---
 arch/mips/kernel/setup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 82c6b77f6..b65047d85 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -873,6 +873,8 @@ static void __init arch_mem_init(char **cmdline_p)
memblock_allow_resize();
 
memblock_dump_all();
+
+   early_memtest(PFN_PHYS(min_low_pfn), PFN_PHYS(max_low_pfn));
 }
 
 static void __init resource_init(void)
-- 
2.12.0



Re: [RFC 1/2] softirq: Defer net rx/tx processing to ksoftirqd context

2018-01-17 Thread David Miller
From: Eric Dumazet 
Date: Wed, 17 Jan 2018 14:02:43 -0800

> On Wed, Jan 17, 2018 at 2:00 PM, Thomas Gleixner  wrote:
>> On Wed, 17 Jan 2018, Linus Torvalds wrote:
>>
>>> On Wed, Jan 17, 2018 at 1:54 PM, Thomas Gleixner  wrote:
>>> > raise_softirq() -> raise_softirq_irqoff()
>>> >
>>> > set_softirq_bit();
>>> >
>>> > if (!in_interrupt())
>>> > wake_softirqd();
>>> >
>>> > So if the caller is not in hard or soft interrupt context, which includes
>>> > bottom half disabled regions softirqd is woken.
>>>
>>> That does seem unnecessarily expensive, and maybe we could just do it
>>> with thread flag (TIF_NOTIFY_RESUME or whatever).
>>>
>>> In fact, that was what I *thought* we did. Maybe I just remember some
>>> historical behavior.
>>>
>>> Since networking seems to largely prefer softirqd anyway, maybe that
>>> wake_softirqd() is the right thing to do anyway.
>>
>> Well, but we only do it when we are not in a bh disabled region. The places
>> where thread context raises the network softirqs is usually inside a bh
>> disabled region, so the softirq is executed on local_bh_enable(). The
>> thread is woken up rarely.
> 
> There is also the netif_rx_ni() stuff.
> 
> Can't remember right now why it is not using
> local_bh_{diable,enable}() pair instead
> of preempt_disable() ... if (local_softirq_pending()) do_softirq();

Hmmm, that code predates the initial GIT repository build.

I do remember we had some back and forth with that stuff.
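
For context, a minimal sketch of the two patterns being contrasted
(the rx-work helper name is hypothetical; see net/core/dev.c for the
real netif_rx_ni() body):

        /* Pattern A: bh-disabled section; a softirq raised inside is
         * run (or handed to ksoftirqd) from local_bh_enable().
         */
        local_bh_disable();
        queue_rx_work(skb);             /* hypothetical stand-in */
        local_bh_enable();

        /* Pattern B: roughly what netif_rx_ni() does instead */
        preempt_disable();
        ret = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();
        preempt_enable();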



[PATCH 13/14] MIPS: memblock: Discard bootmem from SGI IP27 code

2018-01-17 Thread Serge Semin
SGI IP27 has its own code to set up the early memory allocator since
it's a NUMA-based system. So in order to be compatible with the
NO_BOOTMEM config we need to discard the bootmem allocator
initialization and insert the memblock reservation method. Although
in my opinion the code isn't working anyway, since I couldn't find a
place where prom_meminit() is called, and kernel memory isn't
reserved there. It must have been untested since the time the
arch/mips/mips-boards/generic code was in the kernel.

Signed-off-by: Serge Semin 
---
 arch/mips/sgi-ip27/ip27-memory.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index 8d0eb2643..d25758e25 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -389,7 +389,6 @@ static void __init node_mem_init(cnodeid_t node)
 {
unsigned long slot_firstpfn = slot_getbasepfn(node, 0);
unsigned long slot_freepfn = node_getfirstfree(node);
-   unsigned long bootmap_size;
unsigned long start_pfn, end_pfn;
 
get_pfn_range_for_nid(node, &start_pfn, &end_pfn);
@@ -400,7 +399,6 @@ static void __init node_mem_init(cnodeid_t node)
__node_data[node] = __va(slot_freepfn << PAGE_SHIFT);
memset(__node_data[node], 0, PAGE_SIZE);
 
-   NODE_DATA(node)->bdata = &bootmem_node_data[node];
NODE_DATA(node)->node_start_pfn = start_pfn;
NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn;
 
@@ -409,12 +407,9 @@ static void __init node_mem_init(cnodeid_t node)
slot_freepfn += PFN_UP(sizeof(struct pglist_data) +
   sizeof(struct hub_data));
 
-   bootmap_size = init_bootmem_node(NODE_DATA(node), slot_freepfn,
-   start_pfn, end_pfn);
free_bootmem_with_active_regions(node, end_pfn);
-   reserve_bootmem_node(NODE_DATA(node), slot_firstpfn << PAGE_SHIFT,
-   ((slot_freepfn - slot_firstpfn) << PAGE_SHIFT) + bootmap_size,
-   BOOTMEM_DEFAULT);
+   memblock_reserve(slot_firstpfn << PAGE_SHIFT,
+   ((slot_freepfn - slot_firstpfn) << PAGE_SHIFT));
sparse_memory_present_with_active_regions(node);
 }
 
-- 
2.12.0



[PATCH 09/14] MIPS: memblock: Allow memblock regions resize

2018-01-17 Thread Serge Semin
When all the main reservations are done, the memblock regions
can be dynamically resized. Additionally it is useful to have the
memblock regions dumped for debugging at this point.

Signed-off-by: Serge Semin 
---
 arch/mips/kernel/setup.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index e0ca0d2bc..82c6b77f6 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -869,6 +869,10 @@ static void __init arch_mem_init(char **cmdline_p)
plat_swiotlb_setup();
 
dma_contiguous_reserve(PFN_PHYS(max_low_pfn));
+
+   memblock_allow_resize();
+
+   memblock_dump_all();
 }
 
 static void __init resource_init(void)
-- 
2.12.0



[PATCH 12/14] MIPS: memblock: Discard bootmem from Loongson3 code

2018-01-17 Thread Serge Semin
Loongson64/3 runs its own code to initialize the memory allocator
in case the NUMA configuration is selected. So in order to move to
pure memblock utilization we discard the bootmem allocator usage
and insert the memblock reservation method for the
kernel/addrspace_offset memory regions.

Signed-off-by: Serge Semin 
---
 arch/mips/loongson64/loongson-3/numa.c | 16 +---
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/arch/mips/loongson64/loongson-3/numa.c b/arch/mips/loongson64/loongson-3/numa.c
index 282c5a8c2..902843516 100644
--- a/arch/mips/loongson64/loongson-3/numa.c
+++ b/arch/mips/loongson64/loongson-3/numa.c
@@ -180,7 +180,6 @@ static void __init szmem(unsigned int node)
 
 static void __init node_mem_init(unsigned int node)
 {
-   unsigned long bootmap_size;
unsigned long node_addrspace_offset;
unsigned long start_pfn, end_pfn, freepfn;
 
@@ -197,26 +196,21 @@ static void __init node_mem_init(unsigned int node)
 
__node_data[node] = prealloc__node_data + node;
 
-   NODE_DATA(node)->bdata = &bootmem_node_data[node];
NODE_DATA(node)->node_start_pfn = start_pfn;
NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn;
 
-   bootmap_size = init_bootmem_node(NODE_DATA(node), freepfn,
-   start_pfn, end_pfn);
free_bootmem_with_active_regions(node, end_pfn);
if (node == 0) /* used by finalize_initrd() */
max_low_pfn = end_pfn;
 
-   /* This is reserved for the kernel and bdata->node_bootmem_map */
-   reserve_bootmem_node(NODE_DATA(node), start_pfn << PAGE_SHIFT,
-   ((freepfn - start_pfn) << PAGE_SHIFT) + bootmap_size,
-   BOOTMEM_DEFAULT);
+   /* This is reserved for the kernel only */
+   if (node == 0)
+   memblock_reserve(start_pfn << PAGE_SHIFT,
+   ((freepfn - start_pfn) << PAGE_SHIFT));
 
	if (node == 0 && node_end_pfn(0) >= (0xffffffff >> PAGE_SHIFT)) {
		/* Reserve 0xfe000000~0xffffffff for RS780E integrated GPU */
-		reserve_bootmem_node(NODE_DATA(node),
-			(node_addrspace_offset | 0xfe000000),
-			32 << 20, BOOTMEM_DEFAULT);
+		memblock_reserve(node_addrspace_offset | 0xfe000000, 32 << 20);
}
 
sparse_memory_present_with_active_regions(node);
-- 
2.12.0



[PATCH 01/14] MIPS: memblock: Add RESERVED_NOMAP memory flag

2018-01-17 Thread Serge Semin
Even if the nomap flag is specified, the reserved memory declared in
dts isn't really discarded from the buddy allocator in the current
code. We'll fix it by adding a no-map MIPS memory flag. Additionally,
let's add RESERVED_NOMAP memory region handling to the methods which
aren't going to be changed in the further patches.
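
The semantics this flag ends up with later in the series (see the
reservation code in PATCH 05) can be sketched as follows; this is a
condensed illustration, not a quote:

        /* plain reserved regions stay known to memblock but are marked
         * reserved; no-map regions are removed from memblock entirely,
         * so no linear mapping or struct page is created for them
         */
        static void __init reserve_or_remove(phys_addr_t start,
                                             phys_addr_t size, long type)
        {
                if (type == BOOT_MEM_RESERVED_NOMAP)
                        memblock_remove(start, size);
                else
                        memblock_reserve(start, size);
        }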

Signed-off-by: Serge Semin 
---
 arch/mips/include/asm/bootinfo.h | 1 +
 arch/mips/kernel/prom.c  | 8 ++--
 arch/mips/kernel/setup.c | 8 
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/mips/include/asm/bootinfo.h b/arch/mips/include/asm/bootinfo.h
index b603804ca..f7be3148a 100644
--- a/arch/mips/include/asm/bootinfo.h
+++ b/arch/mips/include/asm/bootinfo.h
@@ -90,6 +90,7 @@ extern unsigned long mips_machtype;
 #define BOOT_MEM_ROM_DATA  2
 #define BOOT_MEM_RESERVED  3
 #define BOOT_MEM_INIT_RAM  4
+#define BOOT_MEM_RESERVED_NOMAP5
 
 /*
  * A memory map that's built upon what was determined
diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c
index 0dbcd152a..b123eb827 100644
--- a/arch/mips/kernel/prom.c
+++ b/arch/mips/kernel/prom.c
@@ -41,7 +41,7 @@ char *mips_get_machine_name(void)
 #ifdef CONFIG_USE_OF
 void __init early_init_dt_add_memory_arch(u64 base, u64 size)
 {
-   return add_memory_region(base, size, BOOT_MEM_RAM);
+   add_memory_region(base, size, BOOT_MEM_RAM);
 }
 
 void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
@@ -52,7 +52,11 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
 int __init early_init_dt_reserve_memory_arch(phys_addr_t base,
phys_addr_t size, bool nomap)
 {
-   add_memory_region(base, size, BOOT_MEM_RESERVED);
+   if (!nomap)
+   add_memory_region(base, size, BOOT_MEM_RESERVED);
+   else
+   add_memory_region(base, size, BOOT_MEM_RESERVED_NOMAP);
+
return 0;
 }
 
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 4020d8f98..76e9e2075 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -172,6 +172,7 @@ bool __init memory_region_available(phys_addr_t start, phys_addr_t size)
in_ram = true;
break;
case BOOT_MEM_RESERVED:
+   case BOOT_MEM_RESERVED_NOMAP:
if ((start >= start_ && start < end_) ||
(start < start_ && start + size >= start_))
free = false;
@@ -207,6 +208,9 @@ static void __init print_memory_map(void)
case BOOT_MEM_RESERVED:
printk(KERN_CONT "(reserved)\n");
break;
+   case BOOT_MEM_RESERVED_NOMAP:
+   printk(KERN_CONT "(reserved nomap)\n");
+   break;
default:
printk(KERN_CONT "type %lu\n", 
boot_mem_map.map[i].type);
break;
@@ -955,9 +969,13 @@ static void __init resource_init(void)
res->name = "System RAM";
res->flags |= IORESOURCE_SYSRAM;
break;
+   case BOOT_MEM_RESERVED_NOMAP:
+   res->name = "reserved nomap";
+   break;
case BOOT_MEM_RESERVED:
default:
res->name = "reserved";
+   break;
}
 
request_resource(&iomem_resource, res);
-- 
2.12.0



[PATCH 06/14] MIPS: memblock: Reserve kdump/crash regions in memblock

2018-01-17 Thread Serge Semin
Kdump/crashkernel memory regions should be reserved in the
memblock allocator so they won't be occupied by any further
allocations.

Signed-off-by: Serge Semin 
---
 arch/mips/kernel/setup.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 9e14d9833..b121fa702 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -849,17 +849,15 @@ static void __init arch_mem_init(char **cmdline_p)
if (setup_elfcorehdr && setup_elfcorehdr_size) {
printk(KERN_INFO "kdump reserved memory at %lx-%lx\n",
   setup_elfcorehdr, setup_elfcorehdr_size);
-   reserve_bootmem(setup_elfcorehdr, setup_elfcorehdr_size,
-   BOOTMEM_DEFAULT);
+   memblock_reserve(setup_elfcorehdr, setup_elfcorehdr_size);
}
 #endif
 
mips_parse_crashkernel();
 #ifdef CONFIG_KEXEC
if (crashk_res.start != crashk_res.end)
-   reserve_bootmem(crashk_res.start,
-   crashk_res.end - crashk_res.start + 1,
-   BOOTMEM_DEFAULT);
+   memblock_reserve(crashk_res.start,
+crashk_res.end - crashk_res.start + 1);
 #endif
device_tree_init();
sparse_init();
-- 
2.12.0



[PATCH 14/14] MIPS: memblock: Deactivate bootmem allocator

2018-01-17 Thread Serge Semin
From now on the memblock allocator can be successfully used for
early memory management.

Signed-off-by: Serge Semin 
---
 arch/mips/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 725b5ece7..a6c4fb6b6 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -4,7 +4,6 @@ config MIPS
default y
select ARCH_BINFMT_ELF_STATE
select ARCH_CLOCKSOURCE_DATA
-   select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_MIGHT_HAVE_PC_PARPORT
@@ -57,6 +57,7 @@ config MIPS
select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_KPROBES
select HAVE_KRETPROBES
+   select NO_BOOTMEM
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MOD_ARCH_SPECIFIC
-- 
2.12.0



[PATCH 00/14] MIPS: memblock: Switch arch code to NO_BOOTMEM

2018-01-17 Thread Serge Semin
Even though it's common to see architecture code using both the
bootmem and memblock early memory allocators, it's not good for
multiple reasons. First of all, it's redundant to have two early
memory allocators when one would be more than enough from the
functionality and stability points of view. Secondly, some new
features introduced in the kernel utilize the methods of the most
modern allocator and ignore the older one. It means the architecture
code must keep both subsystems synchronized with information about
memory regions and reservations, which leads to increased code
complexity, which in turn obviously increases the probability of
bugs. Finally, it's better to keep the code of all the architectures
unified for better readability and code simplification. All these
reasons lead to one conclusion - arch code should use just one memory
allocator, which is supposed to be memblock, as the most modern one
and the one already utilized by most of the kernel platforms. This
patchset is mostly about that.

One more reason why the MIPS arch code should finally move to
memblock is a BUG somewhere in the initialization process, when
CMA is activated:

[0.248762] BUG: Bad page state in process swapper/0  pfn:01f93
[0.255415] page:8205b0ac count:0 mapcount:-127 mapping:  (null) index:0x1
[0.263172] flags: 0x4000()
[0.266723] page dumped because: nonzero mapcount
[0.272049] Modules linked in:
[0.275511] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.4.88-module #5
[0.282900] Stack :   80b6dd6a 003a   8093 8092bff4
  86073a14 80ac88c7 809f21ac  0001 80b6998c 0400
  80a0 801822e8 80b6dd68  0002  809f8024 86077ccc
  80b8 801e9328 809fcbc0  0400 0001 86077ccc 86073a14

  ...
[0.323148] Call Trace:
[0.325935] [<8010e7c4>] show_stack+0x8c/0xa8
[0.330859] [<80404814>] dump_stack+0xd4/0x110
[0.335879] [<801f0bc0>] bad_page+0xfc/0x14c
[0.340710] [<801f0e04>] free_pages_prepare+0x1f4/0x330
[0.346632] [<801f36c4>] __free_pages_ok+0x2c/0x104
[0.352154] [<80b23a40>] init_cma_reserved_pageblock+0x5c/0x74
[0.358761] [<80b29390>] cma_init_reserved_areas+0x1b4/0x240
[0.365170] [<8010058c>] do_one_initcall+0xe8/0x27c
[0.370697] [<80b14e60>] kernel_init_freeable+0x200/0x2c4
[0.376828] [<808faca4>] kernel_init+0x14/0x104
[0.381939] [<80107598>] ret_from_kernel_thread+0x14/0x1c

The bogus pfn seems to be one allocated for the bootmem allocator
pages, and it hasn't been freed before letting CMA work with its
areas. Anyway, the bug is solved by this patchset.

Another reason why this patchset is useful is that it fixes the fdt
reserved-memory node functionality for MIPS. It really is a bug to
have the fdt reserved nodes scanned before memblock is fully
initialized (calling early_init_fdt_scan_reserved_mem before
bootmem_init is called). Additionally, the no-map flag of a
reserved-memory node hasn't been taken into account. This patchset
fixes all of these.
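
In other words, the safe ordering is roughly the following sketch
(ram_base/ram_size/lowmem_limit are placeholders for the values the
platform code determines):

        memblock_add(ram_base, ram_size);          /* 1. RAM known to memblock */
        memblock_set_current_limit(lowmem_limit);  /* 2. bound early allocs */
        early_init_fdt_reserve_self();             /* 3. protect the FDT blob */
        early_init_fdt_scan_reserved_mem();        /* 4. only now scan nodes */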

As you probably remember, I already made another attempt to merge
similar functionality into the kernel. This time the patchset got to
be less complex (14 patches vs 21 last time) and fixes the platform
code like SGI IP27 and Loongson3, which, due to being NUMA, introduce
their own memory initialization process. Although I have much doubt
about the SGI IP27 code's operability in the first place, since it
has a prom_meminit() method of early memory initialization which
isn't called at any other place in the kernel. It must have been left
there unrenamed after the arch/mips/mips-boards/generic code had been
discarded.

Here is the list of folks who agreed to perform some tests of
the patchset:
Alexander Sverdlin  - Octeon2
Matt Redfearn  - Loongson3, etc
Joshua Kinard  - IP27
Marcin Nowakowski 
Thanks to you all, and to everybody who will be involved in
reviewing and testing.

The patchset is applied on top of kernel 4.15-rc8 and can be found
submitted at my repo:
https://github.com/fancer/Linux-kernel-MIPS-memblock-project

Signed-off-by: Serge Semin 

Serge Semin (14):
  MIPS: memblock: Add RESERVED_NOMAP memory flag
  MIPS: memblock: Surely map BSS kernel memory section
  MIPS: memblock: Reserve initrd memory in memblock
  MIPS: memblock: Discard bootmem initialization
  MIPS: memblock: Add reserved memory regions to memblock
  MIPS: memblock: Reserve kdump/crash regions in memblock
  MIPS: memblock: Mark present sparsemem sections
  MIPS: memblock: Simplify DMA contiguous reservation
  MIPS: memblock: Allow memblock regions resize
  MIPS: memblock: Perform early low memory test
  MIPS: memblock: Print out kernel virtual mem layout
  MIPS: memblock: Discard bootmem from Loongson3 code
  MIPS: memblock: Discard bootmem from SGI IP27 code
  MIPS: memblock: Deactivate bootmem allocator

 arch/mips/Kconfig

Re: ipv6 redefinition build issue with 4.15-rc8

2018-01-17 Thread Hauke Mehrtens
On 01/17/2018 08:31 PM, Neil MacLeod wrote:
> All
> 
> Further to my previous reply (reproduced below having been bounced by
> linux-kernel) I have successfully built LibreELEC when using the
> ConnMan patch from Jonas - there were no other failures.
> 
> I have also built a number of network related packages (iftop, iperf,
> ngrp, nmap, sshfs, tcpdump, udpxy, wireless-tools), again without
> issue, so this particular 4.15-rc8 kernel change is only affecting
> ConnMan as far as I can tell.

Thanks for testing.

> Regards
> Neil
> 
>> All
>>
>> Many thanks for the replies.
>>
>> To ensure my build environment is sane I tested again without reverting the 
>> kernel commit, and reproduced the connman build failure.
>>
>> Next I tested the change suggested by Hauke (kernel patch: http://ix.io/Eh5) 
>> and connman fails to build, however it fails with a different error this 
>> time: http://ix.io/Eh2
>>
>> I then tested the change suggested by Jonas (connman patch: 
>> http://ix.io/Eh6) and connman builds successfully, no failure, so this might 
>> be a potential fix.

You should include the libc header files first and then the Linux
header files in user space applications; this is the supported order.

Can you try this patch please:

--- a/src/tethering.c
+++ b/src/tethering.c
@@ -31,11 +31,11 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
-#include 
 #include 
+#include 
+#include 
 #include 

 #include "connman.h"


Do we want to make any changes to the kernel header files? I do not
know of any clean workaround to make this work; we can probably hack
something for connman, but I think it is not worth the trouble.

Hauke

>> I'll now try a clean build with Jonas' patch and see if any other packages 
>> fail to build for the same reason as connman (I'm building a complete 
>> embedded distro with about 700 packages).
>>
>> I'll post again later with an update.
>>
>> Thanks
>> Neil
> 
> On 17 January 2018 at 15:25, Neil MacLeod  wrote:
>> All
>>
>> Many thanks for the replies.
>>
>> To ensure my build environment is sane I tested again without reverting the
>> kernel commit, and reproduced the connman build failure.
>>
>> Next I tested the change suggested by Hauke (kernel patch: http://ix.io/Eh5)
>> and connman fails to build, however it fails with a different error this
>> time: http://ix.io/Eh2
>>
>> I then tested the change suggested by Jonas (connman patch:
>> http://ix.io/Eh6) and connman builds successfully, no failure, so this might
>> be a potential fix.
>>
>> I'll now try a clean build with Jonas' patch and see if any other packages
>> fail to build for the same reason as connman (I'm building a complete
>> embedded distro with about 700 packages).
>>
>> I'll post again later with an update.
>>
>> Thanks
>> Neil
>>
>> On 17 January 2018 at 09:03, Jonas Bonn  wrote:
>>>
>>> On 01/17/2018 08:59 AM, Daniel Wagner wrote:

 Hi Neil,

 On 01/16/2018 07:51 PM, Neil MacLeod wrote:
>
> Since this commit in 4.15-rc8:
>
>
> https://github.com/torvalds/linux/commit/6926e041a8920c8ec27e4e155efa760aa01551fd
>
> building connman 1.35 with glibc 2.26 now fails as follows:
>
> http://ix.io/EbP
>
> I'm not sure if this is a kernel issue, a glibc issue, or a connman
> issue.
>
> Reverting the kernel commit resolves the issue, but isn't ideal (unless
> it's the correct solution, of course).
>
> Does anyone have any better ideas?
>>>
>>>
>>> Try switching the order of these headers around (src/tethering.c)...
>>> netinet/in.h seems to depend on linux/in.h being included _first_ and it's
>>> presumably being pulled in via linux/if_bridge.h now as a result of the
>>> kernel patch (couldn't immediately see why, though... I suspect the
>>> inclusion of libc-compat.h is the culprit.)
>>>
>>> #include <linux/in.h>
>>> #include <netinet/in.h>
>>>
>>> Yes, this is a hack and only masks the issue... nonetheless.
>>>
>>> /Jonas
>>>
>>>

 Since ConnMan does not redefine 'struct in6_addr' and friends I would say
 it is kernel/glibc header include problem. But I might be wrong here.

 @Hauke: Do you happen to know what is going on?

 Thanks,
 Daniel
 ___
 connman mailing list
 conn...@lists.01.org
 https://lists.01.org/mailman/listinfo/connman
>>>
>>>
>>



linux-next: manual merge of the powerpc tree with the powerpc-fixes tree

2018-01-17 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the powerpc tree got a conflict in:

  arch/powerpc/kernel/setup-common.c

between commit:

  349524bc0da6 ("powerpc: Don't preempt_disable() in show_cpuinfo()")

from the powerpc-fixes tree and commit:

  f5f563012a70 ("powerpc: Make newline in cpuinfo unconditional")

from the powerpc tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/kernel/setup-common.c
index 3f33869c6486,24da91768133..
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@@ -346,10 -354,10 +346,7 @@@ static int show_cpuinfo(struct seq_fil
   loops_per_jiffy / (50/HZ),
   (loops_per_jiffy / (5000/HZ)) % 100);
  #endif
- 
- #ifdef CONFIG_SMP
seq_printf(m, "\n");
- #endif
 -
 -  preempt_enable();
 -
/* If this is the last cpu, print the summary */
if (cpumask_next(cpu_id, cpu_online_mask) >= nr_cpu_ids)
show_cpuinfo_summary(m);


Re: [patch -mm 3/4] mm, memcg: replace memory.oom_group with policy tunable

2018-01-17 Thread David Rientjes
On Wed, 17 Jan 2018, Michal Hocko wrote:

> Absolutely agreed! And moreover, there are not all that many ways what
> to do as an action. You just kill a logical entity - be it a process or
> a logical group of processes. But you have way too many policies how
> to select that entity. Do you want to chose the youngest process/group
> because all the older ones have been computing real stuff and you would
> lose days of your cpu time? Or should those who pay more should be
> protected (aka give them static priorities), or you name it...
> 

That's an argument for making the interface extensible, yes.

> I am sorry, I still didn't grasp the full semantic of the proposed
> solution but the mere fact it is starting by conflating selection and the
> action is a no go and a wrong API. This is why I've said that what you
> (David) outlined yesterday is probably going to suffer from a much
> longer discussion and most likely to be not acceptable. Your patchset
> proves me correct...

I'm very happy to change the API if there are better suggestions.  That 
may end up just being a memory.oom_policy file, as this implements, and 
separating out a new memory.oom_action that isn't a boolean value to 
either do a full group kill or only a single process.  Or it could be what 
I suggested in my mail to Tejun, such as "hierarchy killall" written to
memory.oom_policy, which would specify a single policy and then an 
optional mechanism.  With my proposed patchset, there would then be three 
policies: "none", "cgroup", and "tree" and one possible optional 
mechanism: "killall".


Re: [REGRESSION][v4.14.y][v4.15] x86/intel_rdt/cqm: Improve limbo list processing

2018-01-17 Thread Joseph Salisbury
On 01/16/2018 01:59 PM, Thomas Gleixner wrote:
> On Tue, 16 Jan 2018, Yu, Fenghua wrote:
>>> From: Thomas Gleixner [mailto:t...@linutronix.de]
>> Is this a Haswell specific issue?
>>
>> I run the following test forever without issue on Broadwell and 4.15.0-rc6 
>> with rdt mounted:
>> for ((;;)) do
>> for ((i=1;i<88;i++)) do
>> echo 0 >/sys/devices/system/cpu/cpu$i/online
>> done
>> echo "online cpus:"
>> grep processor /proc/cpuinfo |wc
>> for ((i=1;i<88;i++)) do
>> echo 1 >/sys/devices/system/cpu/cpu$i/online
>> done
>> echo "online cpus:"
>> grep processor /proc/cpuinfo|wc
>> done
>>
>> I'm trying to find a Haswell to reproduce the issue.
> Come on. This is crystal clear from the KASAN trace. And the fix is simple 
> enough.
>
> You simply do not run into it because on your machine
>
> is_llc_occupancy_enabled() is false...
>
> Thanks,
>
>   tglx
>   
> 8<
>
> diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
> index 88dcf8479013..99442370de40 100644
> --- a/arch/x86/kernel/cpu/intel_rdt.c
> +++ b/arch/x86/kernel/cpu/intel_rdt.c
> @@ -525,10 +525,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
>*/
>   if (static_branch_unlikely(&rdt_mon_enable_key))
>   rmdir_mondata_subdir_allrdtgrp(r, d->id);
> - kfree(d->ctrl_val);
> - kfree(d->rmid_busy_llc);
> - kfree(d->mbm_total);
> - kfree(d->mbm_local);
>   list_del(&d->list);
>   if (is_mbm_enabled())
>   cancel_delayed_work(&d->mbm_over);
> @@ -545,6 +541,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
>   cancel_delayed_work(&d->cqm_limbo);
>   }
>  
> + kfree(d->ctrl_val);
> + kfree(d->rmid_busy_llc);
> + kfree(d->mbm_total);
> + kfree(d->mbm_local);
>   kfree(d);
>   return;
>   }

Hi Thomas,

Testing of your patch shows that your patch resolves the bug.  Thanks
for the assistance!  Is this something you could submit to mainline?

Thanks,


Joe



Re: [mm 4.15-rc8] Random oopses under memory pressure.

2018-01-17 Thread Linus Torvalds
On Wed, Jan 17, 2018 at 2:00 PM, Dave Hansen
 wrote:
>
> I thought that page_zone_id() stuff was there to prevent this kind of
> cross-zone stuff from happening.

Ahh, that was the part I missed. Yeah looks like that checks things
properly. Although the mask generation is *so* confusing that I
stopped following it and will just take your word for it ;)

 Linus


[PATCH] ubi: fastmap: Don't flush fastmap work on detach

2018-01-17 Thread Richard Weinberger
At this point UBI volumes have already been free()'ed and fastmap can no
longer access these data structures.

Reported-by: Martin Townsend 
Fixes: 74cdaf24004a ("UBI: Fastmap: Fix memory leaks while closing the WL sub-system")
Cc: sta...@vger.kernel.org
Signed-off-by: Richard Weinberger 
---
 drivers/mtd/ubi/fastmap-wl.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/mtd/ubi/fastmap-wl.c b/drivers/mtd/ubi/fastmap-wl.c
index 4f0bd6b4422a..69dd21679a30 100644
--- a/drivers/mtd/ubi/fastmap-wl.c
+++ b/drivers/mtd/ubi/fastmap-wl.c
@@ -362,7 +362,6 @@ static void ubi_fastmap_close(struct ubi_device *ubi)
 {
int i;
 
-   flush_work(&ubi->fm_work);
return_unused_pool_pebs(ubi, &ubi->fm_pool);
return_unused_pool_pebs(ubi, &ubi->fm_wl_pool);
 
-- 
2.13.6



Re: [patch -mm 3/4] mm, memcg: replace memory.oom_group with policy tunable

2018-01-17 Thread David Rientjes
On Wed, 17 Jan 2018, Tejun Heo wrote:

> Hello, David.
> 

Hi Tejun!

> > The behavior of killing an entire indivisible memory consumer, enabled
> > by memory.oom_group, is an oom policy itself.  It specifies that all
> 
> I thought we discussed this before but maybe I'm misremembering.
> There are two parts to the OOM policy.  One is victim selection, the
> other is the action to take thereafter.
> 
> The two are different and conflating the two don't work too well.  For
> example, please consider what should be given to the delegatee when
> delegating a subtree, which often is a good excercise when designing
> these APIs.
> 
> When a given workload is selected for OOM kill (IOW, selected to free
> some memory), whether the workload can handle individual process kills
> or not is the property of the workload itself.  Some applications can
> safely handle some of its processes picked off and killed.  Most
> others can't and want to be handled as a single unit, which makes it a
> property of the workload.
> 

Yes, this is a valid point.  The policy of "tree" and "all" are identical 
policies and then the mechanism differs wrt to whether one process is 
killed or all eligible processes are killed, respectively.  My motivation 
for this was to avoid having two different tunables, especially because 
later we'll need a way for userspace to influence the decisionmaking to 
protect (bias against) important subtrees.  What would really be nice is 
cgroup.subtree_control-type behavior where we could effect a policy and a 
mechanism at the same time.  It's not clear how that would be handled to 
allow only one policy and one mechanism, however, in a clean way.  The 
simplest for the user would be a new file, to specify the mechanism and 
leave memory.oom_policy alone.  Would another file really be warranted?  
Not sure.

> That makes sense in the hierarchy too because whether one process or
> the whole workload is killed doesn't infringe upon the parent's
> authority over resources which in turn implies that there's nothing to
> worry about how the parent's groupoom setting should constrain the
> descendants.
> 
> OOM victim selection policy is a different beast.  As you've mentioned
> multiple times, especially if you're worrying about people abusing OOM
> policies by creating sub-cgroups and so on, the policy, first of all,
> shouldn't be delegatable and secondly should have meaningful
> hierarchical restrictions so that a policy that an ancestor chose
> can't be nullified by a descendant.
> 

The goal here was to require a policy of either "tree" or "all" that the 
user can't change.  They are allowed to have their own oom policies 
internal to their subtree, however, for oom conditions in that subtree 
alone.  However, if the common ancestor hits its limit, it is forced to 
either be "tree" or "all" and require hierarchical usage to be considered 
instead of localized usage.  Either "tree" or "all" is appropriate, and 
this may be why you brought up the point about separating them out, i.e. 
the policy can be demanded by the common ancestor but the actual mechanism 
that the oom killer uses, kill either a single process or the full cgroup, 
is left to the user depending on their workload.  That sounds reasonable 
and I can easily separate the two by introducing a new file, similar to 
memory.oom_group but in a more extensible way so that it is not simply a 
selection of either full cgroup kill or single process.

> I'm not necessarily against adding hierarchical victim selection
> policy tunables; however, I am skeptical whether static tunables on
> cgroup hierarchy (including selectable policies) can be made clean and
> versatile enough, especially because the resource hierarchy doesn't
> necessarily, or rather in most cases, match the OOM victim selection
> decision tree, but I'd be happy to be proven wrong.
> 

Right, and I think that giving users control over their subtrees is a 
powerful tool and one that can lead to very effective use of the cgroup v2 
hierarchy.  Being able to circumvent the oom selection by creating child 
cgroups is certainly something that can trivially be prevented.  The 
argument that users can currently divide their entire processes into 
several different smaller processes to circumvent today's heuristic 
doesn't mean we can't have "tree"-like comparisons between cgroups to 
address that issue itself since all processes charge to the tree itself.

I became convinced of this when I saw the real-world usecases that would 
use such a feature on cgroup v2: we want to have hierarchical usage for 
comparison when full subtrees are dedicated to individual consumers, for 
example, and local mem cgroup usage for comparison when using hierarchies 
for top-level /admins and /students cgroups for which Michal provided an 
example.  These can coexist on systems and it's clear that there needs to 
be a system-wide policy decision for the cgroup aware oom killer (the idea 
behind the current mou

RE: [PATCH] drm/vmwgfx: fix memory corruption with legacy/sou connectors

2018-01-17 Thread Deepak Singh Rawat
Thanks Rob for finding this one.

Reviewed-by: Deepak Rawat 

> From: dri-devel [mailto:dri-devel-boun...@lists.freedesktop.org] On Behalf
> Of Rob Clark
> Sent: Wednesday, January 17, 2018 7:16 AM
> To: dri-de...@lists.freedesktop.org
> Cc: Thomas Hellstrom ; Rob Clark
> ; David Airlie ; linux-
> ker...@vger.kernel.org; sta...@vger.kernel.org; linux-graphics-maintainer
> 
> Subject: [PATCH] drm/vmwgfx: fix memory corruption with legacy/sou
> connectors
> 
> From: Rob Clark 
> 
> It looks like in all cases 'struct vmw_connector_state' is used.  But
> only in stdu connectors, was atomic_{duplicate,destroy}_state() properly
> subclassed.  Leading to writes beyond the end of the allocated connector
> state block and all sorts of fun memory corruption related crashes.
> 
> Fixes: d7721ca71126 "drm/vmwgfx: Connector atomic state"
> Cc: 
> Signed-off-by: Rob Clark 
> ---
>  drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c  | 4 ++--
>  drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c | 4 ++--
>  2 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c
> index b8a09807c5de..3824595fece1 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ldu.c
> @@ -266,8 +266,8 @@ static const struct drm_connector_funcs
> vmw_legacy_connector_funcs = {
>   .set_property = vmw_du_connector_set_property,
>   .destroy = vmw_ldu_connector_destroy,
>   .reset = vmw_du_connector_reset,
> - .atomic_duplicate_state =
> drm_atomic_helper_connector_duplicate_state,
> - .atomic_destroy_state =
> drm_atomic_helper_connector_destroy_state,
> + .atomic_duplicate_state = vmw_du_connector_duplicate_state,
> + .atomic_destroy_state = vmw_du_connector_destroy_state,
>   .atomic_set_property = vmw_du_connector_atomic_set_property,
>   .atomic_get_property = vmw_du_connector_atomic_get_property,
>  };
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
> index bc5f6026573d..63a4cd794b73 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
> @@ -420,8 +420,8 @@ static const struct drm_connector_funcs
> vmw_sou_connector_funcs = {
>   .set_property = vmw_du_connector_set_property,
>   .destroy = vmw_sou_connector_destroy,
>   .reset = vmw_du_connector_reset,
> - .atomic_duplicate_state =
> drm_atomic_helper_connector_duplicate_state,
> - .atomic_destroy_state =
> drm_atomic_helper_connector_destroy_state,
> + .atomic_duplicate_state = vmw_du_connector_duplicate_state,
> + .atomic_destroy_state = vmw_du_connector_destroy_state,
>   .atomic_set_property = vmw_du_connector_atomic_set_property,
>   .atomic_get_property = vmw_du_connector_atomic_get_property,
>  };
> --
> 2.14.3
> 
> ___
> dri-devel mailing list
> dri-de...@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
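
For readers less familiar with atomic state subclassing, the bug
class can be sketched generically (illustrative code, not vmwgfx's):

        struct my_connector_state {
                struct drm_connector_state base;        /* must stay first */
                bool is_implicit;                       /* driver-private */
        };

        static struct drm_connector_state *
        my_connector_duplicate_state(struct drm_connector *connector)
        {
                struct my_connector_state *state;

                /* the generic helper kmalloc()s only sizeof(base); writes
                 * to is_implicit would then land past the end of the
                 * allocation - the corruption fixed by this patch
                 */
                state = kmemdup(connector->state, sizeof(*state), GFP_KERNEL);
                if (!state)
                        return NULL;

                __drm_atomic_helper_connector_duplicate_state(connector,
                                                              &state->base);
                return &state->base;
        }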


Re: [PATCH 1/3] dt-bindings: phy: phy-rockchip-typec: add usb3 otg reset

2018-01-17 Thread Brian Norris
+ Enric

On Fri, Jan 12, 2018 at 06:08:22PM +0800, William Wu wrote:
> This patch adds USB3 OTG reset property for rk3399 Type-C PHY
> to hold the USB3 controller in reset state.
> 
> Signed-off-by: William Wu 
> ---

I was going back and forth on this, since at one point this binding was
merged but had no enabled users...but now I see Heiko has queued up some
of Enric's work for 4.16, and it uses the existing binding.

So, if this reset is added, it should be optional.

Brian

>  Documentation/devicetree/bindings/phy/phy-rockchip-typec.txt | 12 +++++++-----
>  1 file changed, 7 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/devicetree/bindings/phy/phy-rockchip-typec.txt b/Documentation/devicetree/bindings/phy/phy-rockchip-typec.txt
> index 6ea867e..db2902e 100644
> --- a/Documentation/devicetree/bindings/phy/phy-rockchip-typec.txt
> +++ b/Documentation/devicetree/bindings/phy/phy-rockchip-typec.txt
> @@ -13,7 +13,7 @@ Required properties:
>   - assigned-clock-rates : the phy core clk frequency, shall be: 5000
>   - resets : a list of phandle + reset specifier pairs
>   - reset-names : string reset name, must be:
> -  "uphy", "uphy-pipe", "uphy-tcphy"
> +  "uphy", "uphy-pipe", "uphy-tcphy", "usb3-otg"
>   - extcon : extcon specifier for the Power Delivery
>  
>  Note, there are 2 type-c phys for RK3399, and they are almost identical, except
> @@ -56,8 +56,9 @@ Example:
>   assigned-clock-rates = <5000>;
>   resets = <&cru SRST_UPHY0>,
><&cru SRST_UPHY0_PIPE_L00>,
> -  <&cru SRST_P_UPHY0_TCPHY>;
> - reset-names = "uphy", "uphy-pipe", "uphy-tcphy";
> +  <&cru SRST_P_UPHY0_TCPHY>,
> +  <&cru SRST_A_USB3_OTG0>;
> + reset-names = "uphy", "uphy-pipe", "uphy-tcphy", "usb3-otg";
>   rockchip,typec-conn-dir = <0xe580 0 16>;
>   rockchip,usb3tousb2-en = <0xe580 3 19>;
>   rockchip,external-psm = <0xe588 14 30>;
> @@ -84,8 +85,9 @@ Example:
>   assigned-clock-rates = <5000>;
>   resets = <&cru SRST_UPHY1>,
><&cru SRST_UPHY1_PIPE_L00>,
> -  <&cru SRST_P_UPHY1_TCPHY>;
> - reset-names = "uphy", "uphy-pipe", "uphy-tcphy";
> +  <&cru SRST_P_UPHY1_TCPHY>,
> +  <&cru SRST_A_USB3_OTG1>;
> + reset-names = "uphy", "uphy-pipe", "uphy-tcphy", "usb3-otg";
>   rockchip,typec-conn-dir = <0xe58c 0 16>;
>   rockchip,usb3tousb2-en = <0xe58c 3 19>;
>   rockchip,external-psm = <0xe594 14 30>;
> -- 
> 2.0.0
> 
> 


Re: [mm 4.15-rc8] Random oopses under memory pressure.

2018-01-17 Thread Dave Hansen
On 01/17/2018 01:51 PM, Linus Torvalds wrote:
> In fact, it seems to be such a fundamental bug that I suspect I'm
> entirely wrong, and full of shit. So it's an interesting and not
> _obviously_ incorrect theory, but I suspect I must be missing
> something.

I'll just note that a few of the pfns I decoded were smack in the middle
of the zone, not near either the high or low end of ZONE_NORMAL where we
would expect this cross-zone stuff to happen.

But I guess we could get similar wonkiness where 'struct page' is
screwed up in so many different ways if during buddy joining you do:

list_del(&buddy->lru);

and 'buddy' is off in another zone for which you do not hold the
spinlock.  If we are somehow missing some locking, or double-allocating
a page, something like this would help:

 static inline void rmv_page_order(struct page *page)
 {
+WARN_ON_ONCE(!PageBuddy(page));
 __ClearPageBuddy(page);
 set_page_private(page, 0);
 }


[PATCH 4.9] usbip: fix warning in vhci_hcd_probe/lockdep_init_map

2018-01-17 Thread Shuah Khan
commit 918b8ac55b6c809b70aa05c279087109584e393e upstream

vhci_hcd calls sysfs_create_group() with dynamically allocated sysfs
attributes triggering the lock-class key not persistent warning. Call
sysfs_attr_init() for dynamically allocated sysfs attributes to fix it.
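
The rule at work, sketched with an illustrative attribute (the real
driver builds its status attributes similarly):

        /* lockdep keys for statically declared attributes live in .data;
         * a kzalloc()'d attribute has none, so it must be initialized
         * explicitly before sysfs_create_group()/sysfs_create_file()
         */
        struct device_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL);

        sysfs_attr_init(&attr->attr);   /* sets up the lockdep class key */
        attr->attr.name = "status";
        attr->attr.mode = S_IRUGO;
        attr->show = status_show;       /* the driver's show() callback */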

vhci_hcd vhci_hcd: USB/IP Virtual Host Controller
vhci_hcd vhci_hcd: new USB bus registered, assigned bus number 2
BUG: key 88006a7e8d18 not in .data!
[ cut here ]
WARNING: CPU: 0 PID: 1 at kernel/locking/lockdep.c:3131 lockdep_init_map+0x60c/0x770
DEBUG_LOCKS_WARN_ON(1)
[1.567044] Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.9.0-rc7+ #58
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
 88006bce6eb8 81f96c8a 0a02 11000d79cd6a
 ed000d79cd62 00046bce6ed8 41b58ab3 8598af40
 81f969f8  41b58ab3 0200
Call Trace:
 [< inline >] __dump_stack lib/dump_stack.c:15
 [] dump_stack+0x292/0x398 lib/dump_stack.c:51
 [] __warn+0x19f/0x1e0 kernel/panic.c:550
 [] warn_slowpath_fmt+0xc5/0x110 kernel/panic.c:565
 [] lockdep_init_map+0x60c/0x770 kernel/locking/lockdep.c:3131
 [] __kernfs_create_file+0x114/0x2a0 fs/kernfs/file.c:954
 [] sysfs_add_file_mode_ns+0x225/0x520 fs/sysfs/file.c:305
 [< inline >] create_files fs/sysfs/group.c:64
 [] internal_create_group+0x239/0x8f0 fs/sysfs/group.c:134
 [] sysfs_create_group+0x1f/0x30 fs/sysfs/group.c:156
 [] vhci_start+0x5b4/0x7a0 drivers/usb/usbip/vhci_hcd.c:978
 [] usb_add_hcd+0x8da/0x1c60 drivers/usb/core/hcd.c:2867
 [] vhci_hcd_probe+0x97/0x130 drivers/usb/usbip/vhci_hcd.c:1103
 ---
 ---
---[ end trace c33c7b202cf3aac8 ]---

Reported-by: Andrey Konovalov 
Signed-off-by: Shuah Khan 
---

Greg,

Please apply this fix to 4.9 stable. I re-discovered the problem
on 4.9.77-rc1 and re-tested the patch on it.

thanks,
-- Shuah

 drivers/usb/usbip/vhci_sysfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/usb/usbip/vhci_sysfs.c b/drivers/usb/usbip/vhci_sysfs.c
index c404017..b96e5b1 100644
--- a/drivers/usb/usbip/vhci_sysfs.c
+++ b/drivers/usb/usbip/vhci_sysfs.c
@@ -361,6 +361,7 @@ static void set_status_attr(int id)
status->attr.attr.name = status->name;
status->attr.attr.mode = S_IRUGO;
status->attr.show = status_show;
+   sysfs_attr_init(&status->attr.attr);
 }
 
 static int init_status_attrs(void)
-- 
2.7.4



Re: [PATCH v16 01/10] video: backlight: Add helpers to enable and disable backlight

2018-01-17 Thread Noralf Trønnes


On 17.01.2018 18.00, Daniel Thompson wrote:



On 16/01/18 10:31, Meghana Madhyastha wrote:

Add helper functions backlight_enable and backlight_disable to
enable/disable a backlight device. These helper functions can
then be used by different drm and tinydrm drivers to avoid
repetition of code and also to enforce a uniform and consistent
way to enable/disable a backlight device.

Signed-off-by: Meghana Madhyastha 


To be clear I don't disagree with anything Daniel V. said about the
horribly confused (and confusing) power states for backlight.


Nevertheless I don't recall seeing any response (positive or negative) 
to this post from v13:

https://www.spinics.net/lists/dri-devel/msg154459.html



I see that Daniel V has answered while I was chasing this down, but anyways:

A grep suggests that omap1_bl is the only driver that only checks fb_blank.
All the other drivers check both fb_blank and power, a few check state. The
backlight fbdev notifier callback doesn't set power, but sets fb_blank and
state.

fb_blank was marked 'Due to be removed' 9 years ago, so it hasn't been
high priority.

So for completeness I guess it makes sense to set fb_blank.

Noralf.

$ grep -r -C10 "props\.fb_blank" .
./drivers/video/backlight/corgi_lcd.c-  if (bd->props.power != FB_BLANK_UNBLANK)
./drivers/video/backlight/corgi_lcd.c-  intensity = 0;
./drivers/video/backlight/corgi_lcd.c-
./drivers/video/backlight/corgi_lcd.c:  if (bd->props.fb_blank != FB_BLANK_UNBLANK)
./drivers/video/backlight/corgi_lcd.c-  intensity = 0;
--
./drivers/video/backlight/adp8860_bl.c- if (bl->props.power != FB_BLANK_UNBLANK)
./drivers/video/backlight/adp8860_bl.c- brightness = 0;
./drivers/video/backlight/adp8860_bl.c-
./drivers/video/backlight/adp8860_bl.c: if (bl->props.fb_blank != FB_BLANK_UNBLANK)
./drivers/video/backlight/adp8860_bl.c- brightness = 0;
--
./drivers/video/backlight/hp680_bl.c-   if (bd->props.power != FB_BLANK_UNBLANK)
./drivers/video/backlight/hp680_bl.c-   intensity = 0;
./drivers/video/backlight/hp680_bl.c:   if (bd->props.fb_blank != FB_BLANK_UNBLANK)
./drivers/video/backlight/hp680_bl.c-   intensity = 0;
--
./drivers/video/backlight/cr_bllcd.c-static int cr_backlight_set_intensity(struct backlight_device *bd)
./drivers/video/backlight/cr_bllcd.c-{
./drivers/video/backlight/cr_bllcd.c-   int intensity = bd->props.brightness;
./drivers/video/backlight/cr_bllcd.c-   u32 addr = gpio_bar + CRVML_PANEL_PORT;
./drivers/video/backlight/cr_bllcd.c-   u32 cur = inl(addr);
./drivers/video/backlight/cr_bllcd.c-
./drivers/video/backlight/cr_bllcd.c-   if (bd->props.power == FB_BLANK_UNBLANK)
./drivers/video/backlight/cr_bllcd.c-   intensity = FB_BLANK_UNBLANK;
./drivers/video/backlight/cr_bllcd.c:   if (bd->props.fb_blank == FB_BLANK_UNBLANK)
./drivers/video/backlight/cr_bllcd.c-   intensity = FB_BLANK_UNBLANK;
./drivers/video/backlight/cr_bllcd.c-   if (bd->props.power == FB_BLANK_POWERDOWN)
./drivers/video/backlight/cr_bllcd.c-   intensity = FB_BLANK_POWERDOWN;
./drivers/video/backlight/cr_bllcd.c:   if (bd->props.fb_blank == FB_BLANK_POWERDOWN)
./drivers/video/backlight/cr_bllcd.c-   intensity = FB_BLANK_POWERDOWN;
--
./drivers/video/backlight/max8925_bl.c- if (bl->props.power != FB_BLANK_UNBLANK)
./drivers/video/backlight/max8925_bl.c- brightness = 0;
./drivers/video/backlight/max8925_bl.c-
./drivers/video/backlight/max8925_bl.c: if (bl->props.fb_blank != FB_BLANK_UNBLANK)
./drivers/video/backlight/max8925_bl.c- brightness = 0;
./drivers/video/backlight/max8925_bl.c-
./drivers/video/backlight/max8925_bl.c- if (bl->props.state & BL_CORE_SUSPENDED)
./drivers/video/backlight/max8925_bl.c- brightness = 0;
--
./drivers/video/backlight/lv5207lp.c-   if (backlight->props.power != FB_BLANK_UNBLANK ||
./drivers/video/backlight/lv5207lp.c: backlight->props.fb_blank != FB_BLANK_UNBLANK ||
./drivers/video/backlight/lv5207lp.c- backlight->props.state & (BL_CORE_SUSPENDED | BL_CORE_FBBLANK))
./drivers/video/backlight/lv5207lp.c-   brightness = 0;
--
./drivers/video/backlight/lm3533_bl.c-  if (bd->props.power != FB_BLANK_UNBLANK)
./drivers/video/backlight/lm3533_bl.c-  brightness = 0;
./drivers/video/backlight/lm3533_bl.c:  if (bd->props.fb_blank != FB_BLANK_UNBLANK)
./drivers/video/backlight/lm3533_bl.c-  brightness = 0;
--
./drivers/video/backlight/omap1_bl.c-static int omapbl_update_status(struct backlight_device *dev)
./drivers/video/backlight/omap1_bl.c-{
./drivers/video/backlight/omap1_bl.c-   struct omap_backlight *bl = bl_get_data(dev);
./drivers/video/backlight/omap1_bl.c-
./drivers/video/backlight/omap1_bl.c-   if (bl->current_intensity != dev->props.brightness) {
./drivers/video/backlight/omap1_bl.c-   if (bl->powermode == FB_BLANK_UNBLANK)
./drivers/video/backlight/omap1_bl.c- omapbl_send_intensity(dev->props.brightness);
./drivers/video/backlight/omap1
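
For reference, the helpers under discussion amount to roughly the
following sketch (modelled on v16 of the series; the fb_blank write
is the "completeness" point above):

        static inline int backlight_enable(struct backlight_device *bd)
        {
                if (!bd)
                        return 0;

                bd->props.power = FB_BLANK_UNBLANK;
                bd->props.fb_blank = FB_BLANK_UNBLANK;
                bd->props.state &= ~BL_CORE_FBBLANK;

                return backlight_update_status(bd);
        }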

Re: [RFC 1/2] softirq: Defer net rx/tx processing to ksoftirqd context

2018-01-17 Thread Eric Dumazet
On Wed, Jan 17, 2018 at 2:00 PM, Thomas Gleixner  wrote:
> On Wed, 17 Jan 2018, Linus Torvalds wrote:
>
>> On Wed, Jan 17, 2018 at 1:54 PM, Thomas Gleixner  wrote:
>> > raise_softirq() -> raise_softirq_irqoff()
>> >
>> > set_softirq_bit();
>> >
>> > if (!in_interrupt())
>> > wake_softirqd();
>> >
>> > So if the caller is not in hard or soft interrupt context, which includes
>> > bottom half disabled regions softirqd is woken.
>>
>> That does seem unnecessarily expensive, and maybe we could just do it
>> with thread flag (TIF_NOTIFY_RESUME or whatever).
>>
>> In fact, that was what I *thought* we did. Maybe I just remember some
>> historical behavior.
>>
>> Since networking seems to largely prefer softirqd anyway, maybe that
>> wake_softirqd() is the right thing to do anyway.
>
> Well, but we only do it when we are not in a bh disabled region. The places
> where thread context raises the network softirqs is usually inside a bh
> disabled region, so the softirq is executed on local_bh_enable(). The
> thread is woken up rarely.

There is also the netif_rx_ni() stuff.

Can't remember right now why it is not using a
local_bh_{disable,enable}() pair instead
of preempt_disable() ... if (local_softirq_pending()) do_softirq();
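
For reference, the netif_rx_ni() pattern in question looks roughly like
this (a condensed sketch of the 4.15-era net/core/dev.c code, not
verbatim source):

	int netif_rx_ni(struct sk_buff *skb)
	{
		int err;

		preempt_disable();
		err = netif_rx_internal(skb);
		/* run any softirq we just raised before re-enabling
		 * preemption, instead of going through the
		 * local_bh_disable()/local_bh_enable() path */
		if (local_softirq_pending())
			do_softirq();
		preempt_enable();

		return err;
	}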


Re: [RFC 1/2] softirq: Defer net rx/tx processing to ksoftirqd context

2018-01-17 Thread Thomas Gleixner
On Wed, 17 Jan 2018, Linus Torvalds wrote:

> On Wed, Jan 17, 2018 at 1:54 PM, Thomas Gleixner  wrote:
> > raise_softirq() -> raise_softirq_irqoff()
> >
> > set_softirq_bit();
> >
> > if (!in_interrupt())
> > wake_softirqd();
> >
> > So if the caller is not in hard or soft interrupt context, which includes
> > bottom half disabled regions, softirqd is woken.
> 
> That does seem unnecessarily expensive, and maybe we could just do it
> with thread flag (TIF_NOTIFY_RESUME or whatever).
> 
> In fact, that was what I *thought* we did. Maybe I just remember some
> historical behavior.
> 
> Since networking seems to largely prefer softirqd anyway, maybe that
> wake_softirqd() is the right thing to do anyway.

Well, but we only do it when we are not in a bh disabled region. The places
where thread context raises the network softirqs are usually inside a bh
disabled region, so the softirq is executed on local_bh_enable(). The
thread is woken up rarely.

Thanks,

tglx


Re: [mm 4.15-rc8] Random oopses under memory pressure.

2018-01-17 Thread Dave Hansen
On 01/17/2018 01:39 PM, Linus Torvalds wrote:
> 
> So maybe something like this to test the theory?
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 76c9688b6a0a..f919a5548943 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -756,6 +756,8 @@ static inline void rmv_page_order(struct page *page)
>  static inline int page_is_buddy(struct page *page, struct page *buddy,
> unsigned int 
> order)
>  {
> +   if (WARN_ON_ONCE(page_zone(page) != page_zone(buddy)))
> +   return 0;
> if (page_is_guard(buddy) && page_order(buddy) == order) {
> if (page_zone_id(page) != page_zone_id(buddy))
> return 0;

I thought that page_zone_id() stuff was there to prevent this kind of
cross-zone stuff from happening.
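
(The two checks being compared here are subtly different; condensed
sketches of the helpers, per the 4.15-era include/linux/mm.h, not
verbatim source:)

	/* zone id: node+zone bits packed into the upper page->flags;
	 * cheap, but only as wide as the bits stored in flags */
	static inline int page_zone_id(struct page *page)
	{
		return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
	}

	/* full zone lookup through the node data; this is what the
	 * WARN_ON_ONCE() in the patch above compares */
	static inline struct zone *page_zone(const struct page *page)
	{
		return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
	}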


Re: [RFC 1/2] softirq: Defer net rx/tx processing to ksoftirqd context

2018-01-17 Thread Linus Torvalds
On Wed, Jan 17, 2018 at 1:54 PM, Thomas Gleixner  wrote:
> raise_softirq() -> raise_softirq_irqoff()
>
> set_softirq_bit();
>
> if (!in_interrupt())
> wake_softirqd();
>
> So if the caller is not in hard or soft interrupt context, which includes
> bottom half disabled regions, softirqd is woken.

That does seem unnecessarily expensive, and maybe we could just do it
with thread flag (TIF_NOTIFY_RESUME or whatever).

In fact, that was what I *thought* we did. Maybe I just remember some
historical behavior.

Since networking seems to largely prefer softirqd anyway, maybe that
wake_softirqd() is the right thing to do anyway.

 Linus


Re: [RFC 1/2] softirq: Defer net rx/tx processing to ksoftirqd context

2018-01-17 Thread Thomas Gleixner
On Wed, 17 Jan 2018, Linus Torvalds wrote:

> On Wed, Jan 17, 2018 at 1:49 PM, David Miller  wrote:
> >
> >> That said, this made me wonder a bit. I wonder how bounded the latency
> >> is for raising a softirq from process context. We only _check_ the
> >> softirq on the last hardirq exit, I think.
> >
> > System call return checks it, otherwise this situation would be
> > completely bolixed.
> 
> That's what I thought too. But then I went and looked, and I can't find it.
> 
> But you're probably right, and I just missed it.

Not really. There is nothing to see there.

Thanks,

tglx


Re: [RFC 1/2] softirq: Defer net rx/tx processing to ksoftirqd context

2018-01-17 Thread Linus Torvalds
On Wed, Jan 17, 2018 at 1:49 PM, David Miller  wrote:
>
>> That said, this made me wonder a bit. I wonder how bounded the latency
>> is for raising a softirq from process context. We only _check_ the
>> softirq on the last hardirq exit, I think.
>
> System call return checks it, otherwise this situation would be
> completely bolixed.

That's what I thought too. But then I went and looked, and I can't find it.

But you're probably right, and I just missed it.

  Linus


Re: [RFC 1/2] softirq: Defer net rx/tx processing to ksoftirqd context

2018-01-17 Thread Thomas Gleixner
On Wed, 17 Jan 2018, David Miller wrote:

> From: Linus Torvalds 
> Date: Wed, 17 Jan 2018 13:06:58 -0800
> 
> > It was in some way always a "poor mans interrupt thread" (with no
> > blocking like a real thread context, but at least not impacting actual
> > interrupt latency).
> 
> Or in this loopback device case (and tunnel decapsulation) a poor
> man's longjmp, releasing the current stack frame to keep the depth
> in check.
> 
> Anyways...
> 
> > That said, this made me wonder a bit. I wonder how bounded the latency
> > is for raising a softirq from process context. We only _check_ the
> > softirq on the last hardirq exit, I think.
> 
> System call return checks it, otherwise this situation would be
> completely bolixed.

Errm. No.

> 
> > I wonder if we should run softirqs on return to user mode (and make
> > softirq set a thread flag if not in interrupt context).
> 
> I'm pretty sure we already do.

Nope.

raise_softirq() -> raise_softirq_irqoff()

set_softirq_bit();

if (!in_interrupt())
wake_softirqd();

So if the caller is not in hard or soft interrupt context, which includes
bottom half disabled regions, softirqd is woken.

If the caller is in a bottom half disabled region then local_bh_enable()
will run the pending softirqs.

Thanks,

tglx
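
(For completeness, the local_bh_enable() path described above boils down
to roughly the following; a simplified sketch of the 4.15-era
kernel/softirq.c logic, omitting the lockdep and irq-off checks:)

	static void local_bh_enable_sketch(void)
	{
		/* drop all but one level of the softirq-disable count */
		preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
		/* run whatever was raised while bh was disabled */
		if (unlikely(!in_interrupt() && local_softirq_pending()))
			do_softirq();
		preempt_count_dec();
		preempt_check_resched();
	}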



Re: [PATCH] input: joystick: make USB drivers depend on USB

2018-01-17 Thread Dmitry Torokhov
On Wed, Jan 17, 2018 at 10:30:10PM +0100, Marcus Folkesson wrote:
> A driver should not enable an entire subsystem.

I disagree. As you go through menuconfig and you encounter this option
and you have the hardware and you want to enable it, you should be able
to do so. Otherwise you enable a bunch of functionality, then go back,
see what new options appeared, enable them, go back, see if any more new
options appeared, and so on.

What exactly prompted this change?

> 
> Signed-off-by: Marcus Folkesson 
> ---
>  drivers/input/joystick/Kconfig | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/drivers/input/joystick/Kconfig b/drivers/input/joystick/Kconfig
> index 332c0cc1b2ab..4a199cff8c68 100644
> --- a/drivers/input/joystick/Kconfig
> +++ b/drivers/input/joystick/Kconfig
> @@ -279,8 +279,7 @@ config JOYSTICK_JOYDUMP
>  
>  config JOYSTICK_XPAD
>   tristate "X-Box gamepad support"
> - depends on USB_ARCH_HAS_HCD
> - select USB
> + depends on USB
>   help
> Say Y here if you want to use the X-Box pad with your computer.
> Make sure to say Y to "Joystick support" (CONFIG_INPUT_JOYDEV)
> -- 
> 2.15.1
> 

-- 
Dmitry


Re: [PATCH 2/2] drm/msm/adreno: fix nvmem related link error

2018-01-17 Thread Rob Clark
On Mon, Jan 15, 2018 at 11:14 AM, Arnd Bergmann  wrote:
> When NVMEM is configured as a loadable module, and adreno
> is built-in, we get a link failure:
>
> drivers/gpu/drm/msm/adreno/a5xx_gpu.o: In function `a5xx_gpu_init':
> a5xx_gpu.c:(.text+0x15cc): undefined reference to `nvmem_cell_get'
> a5xx_gpu.c:(.text+0x15da): undefined reference to `nvmem_cell_read'
> a5xx_gpu.c:(.text+0x15e4): undefined reference to `nvmem_cell_put'
>
> This adds a Kconfig dependency to enforce valid configurations,
> when NVMEM is a loadable module, adreno now has to also be one.
> The code seems to deal fine with nvmem being completely disabled,
> it will just not set the right speed bin then, so we don't need
> a hard dependency.
>
> Fixes: f56d9df656c4 ("drm/msm/adreno: Read the speed bins for a5xx targets")
> Signed-off-by: Arnd Bergmann 

Arnd, beyond randconfig, I guess there are probably two real-world
scenarios, both =m (distro) and both =y (android/oe/etc)..

Is there a kconfig way to say if nvmem=m then drm_msm must be =n or =m?

BR,
-R


> ---
>  drivers/gpu/drm/msm/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
> index 99d39b2aefa6..74fb1c816da9 100644
> --- a/drivers/gpu/drm/msm/Kconfig
> +++ b/drivers/gpu/drm/msm/Kconfig
> @@ -4,6 +4,7 @@ config DRM_MSM
> depends on DRM
> depends on ARCH_QCOM || (ARM && COMPILE_TEST)
> depends on OF && COMMON_CLK
> +   depends on NVMEM || !NVMEM
> depends on MMU
> select QCOM_MDT_LOADER if ARCH_QCOM
> select REGULATOR
> --
> 2.9.0
>
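
(For the record, the "depends on NVMEM || !NVMEM" line in the patch
quoted above is the standard Kconfig idiom for exactly that; a sketch of
how the expression evaluates:)

	config DRM_MSM
		tristate "MSM DRM"
		# NVMEM=m: "m || m" = m  -> DRM_MSM limited to m or n
		# NVMEM=y: "y || n" = y  -> no restriction
		# NVMEM=n: "n || y" = y  -> no restriction
		depends on NVMEM || !NVMEM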


Re: [mm 4.15-rc8] Random oopses under memory pressure.

2018-01-17 Thread Linus Torvalds
On Wed, Jan 17, 2018 at 1:39 PM, Linus Torvalds
 wrote:
>
> In fact, the whole
>
>pfn_valid_within(buddy_pfn)
>
> test looks very odd. Maybe the pfn of the buddy is valid, but it's not
> in the same zone? Then we'd combine the two pages in two different
> zones into one combined page.

It might also be the same allocation zone, but if the pfn's are in
different sparsemem sections that would also be problematic.

But I hope/assume that all sparsemem sections are always aligned to
(PAGE_SIZE << MAXORDER).

In contrast, the ZONE_HIGHMEM limit really does seems to be
potentially not aligned to anything, ie

 arch/x86/include/asm/pgtable_32_types.h:
 #define MAXMEM  (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)

which I have no idea what the alignment is, but VMALLOC_END at least
does not seem to have any MAXORDER alignment.

So it really does look like the zone for two page orders that would
otherwise be buddies might actually be different.

Interesting if this really is the case. Because afaik, if that
WARN_ON_ONCE actually triggers, it does seem like this bug could go
back pretty much forever.

In fact, it seems to be such a fundamental bug that I suspect I'm
entirely wrong, and full of shit. So it's an interesting and not
_obviously_ incorrect theory, but I suspect I must be missing
something.

  Linus


Re: [RFC 1/2] softirq: Defer net rx/tx processing to ksoftirqd context

2018-01-17 Thread David Miller
From: Linus Torvalds 
Date: Wed, 17 Jan 2018 13:06:58 -0800

> It was in some way always a "poor mans interrupt thread" (with no
> blocking like a real thread context, but at least not impacting actual
> interrupt latency).

Or in this loopback device case (and tunnel decapsulation) a poor
man's longjmp, releasing the current stack frame to keep the depth
in check.

Anyways...

> That said, this made me wonder a bit. I wonder how bounded the latency
> is for raising a softirq from process context. We only _check_ the
> softirq on the last hardirq exit, I think.

System call return checks it, otherwise this situation would be
completely bolixed.

> I wonder if we should run softirqs on return to user mode (and make
> softirq set a thread flag if not in interrupt context).

I'm pretty sure we already do.


[PATCH 3/3] clocksource: timer-dm: Make unexported functions static

2018-01-17 Thread Ladislav Michl
As dmtimer no longer exports functions, make those previously
exported static.

Signed-off-by: Ladislav Michl 
---
 Note: only those functions assigned to timer ops are made static
   as some others will be needed later for event capture.

 drivers/clocksource/timer-dm.c |  218 -
 include/clocksource/dmtimer.h  |   24 
 2 files changed, 109 insertions(+), 133 deletions(-)

diff --git a/drivers/clocksource/timer-dm.c b/drivers/clocksource/timer-dm.c
index 324ec93d3dd2..8de9e543d129 100644
--- a/drivers/clocksource/timer-dm.c
+++ b/drivers/clocksource/timer-dm.c
@@ -163,6 +163,92 @@ static int omap_dm_timer_of_set_source(struct 
omap_dm_timer *timer)
return ret;
 }
 
+static int omap_dm_timer_set_source(struct omap_dm_timer *timer, int source)
+{
+   int ret;
+   char *parent_name = NULL;
+   struct clk *parent;
+   struct dmtimer_platform_data *pdata;
+
+   if (unlikely(!timer))
+   return -EINVAL;
+
+   pdata = timer->pdev->dev.platform_data;
+
+   if (source < 0 || source >= 3)
+   return -EINVAL;
+
+   /*
+* FIXME: Used for OMAP1 devices only because they do not currently
+* use the clock framework to set the parent clock. To be removed
+* once OMAP1 migrated to using clock framework for dmtimers
+*/
+   if (pdata && pdata->set_timer_src)
+   return pdata->set_timer_src(timer->pdev, source);
+
+   if (IS_ERR(timer->fclk))
+   return -EINVAL;
+
+#if defined(CONFIG_COMMON_CLK)
+   /* Check if the clock has configurable parents */
+   if (clk_hw_get_num_parents(__clk_get_hw(timer->fclk)) < 2)
+   return 0;
+#endif
+
+   switch (source) {
+   case OMAP_TIMER_SRC_SYS_CLK:
+   parent_name = "timer_sys_ck";
+   break;
+
+   case OMAP_TIMER_SRC_32_KHZ:
+   parent_name = "timer_32k_ck";
+   break;
+
+   case OMAP_TIMER_SRC_EXT_CLK:
+   parent_name = "timer_ext_ck";
+   break;
+   }
+
+   parent = clk_get(&timer->pdev->dev, parent_name);
+   if (IS_ERR(parent)) {
+   pr_err("%s: %s not found\n", __func__, parent_name);
+   return -EINVAL;
+   }
+
+   ret = clk_set_parent(timer->fclk, parent);
+   if (ret < 0)
+   pr_err("%s: failed to set %s as parent\n", __func__,
+   parent_name);
+
+   clk_put(parent);
+
+   return ret;
+}
+
+static void omap_dm_timer_enable(struct omap_dm_timer *timer)
+{
+   int c;
+
+   pm_runtime_get_sync(&timer->pdev->dev);
+
+   if (!(timer->capability & OMAP_TIMER_ALWON)) {
+   if (timer->get_context_loss_count) {
+   c = timer->get_context_loss_count(&timer->pdev->dev);
+   if (c != timer->ctx_loss_count) {
+   omap_timer_restore_context(timer);
+   timer->ctx_loss_count = c;
+   }
+   } else {
+   omap_timer_restore_context(timer);
+   }
+   }
+}
+
+static void omap_dm_timer_disable(struct omap_dm_timer *timer)
+{
+   pm_runtime_put_sync(&timer->pdev->dev);
+}
+
 static int omap_dm_timer_prepare(struct omap_dm_timer *timer)
 {
int rc;
@@ -298,16 +384,16 @@ static struct omap_dm_timer *_omap_dm_timer_request(int 
req_type, void *data)
return timer;
 }
 
-struct omap_dm_timer *omap_dm_timer_request(void)
+static struct omap_dm_timer *omap_dm_timer_request(void)
 {
return _omap_dm_timer_request(REQUEST_ANY, NULL);
 }
 
-struct omap_dm_timer *omap_dm_timer_request_specific(int id)
+static struct omap_dm_timer *omap_dm_timer_request_specific(int id)
 {
/* Requesting timer by ID is not supported when device tree is used */
if (of_have_populated_dt()) {
-   pr_warn("%s: Please use omap_dm_timer_request_by_cap/node()\n",
+   pr_warn("%s: Please use omap_dm_timer_request_by_node()\n",
__func__);
return NULL;
}
@@ -336,7 +422,7 @@ struct omap_dm_timer *omap_dm_timer_request_by_cap(u32 cap)
  * Request a timer based upon a device node pointer. Returns pointer to
  * timer handle on success and a NULL pointer on failure.
  */
-struct omap_dm_timer *omap_dm_timer_request_by_node(struct device_node *np)
+static struct omap_dm_timer *omap_dm_timer_request_by_node(struct device_node 
*np)
 {
if (!np)
return NULL;
@@ -344,7 +430,7 @@ struct omap_dm_timer *omap_dm_timer_request_by_node(struct 
device_node *np)
return _omap_dm_timer_request(REQUEST_BY_NODE, np);
 }
 
-int omap_dm_timer_free(struct omap_dm_timer *timer)
+static int omap_dm_timer_free(struct omap_dm_timer *timer)
 {
if (unlikely(!timer))
return -EINVAL;
@@ -356,30 +442,6 @@ int omap_dm_timer_free(struct omap_dm_timer *tim

[PATCH 2/3] pwm: pwm-omap-dmtimer: Fix frequency when using prescaler

2018-01-17 Thread Ladislav Michl
The prescaler setting is currently not taken into account.
Fix that by introducing a freq member variable and initializing
it at device probe time. This also avoids recomputing the
frequency at each pwm configure time.

Signed-off-by: Ladislav Michl 
---
 drivers/pwm/pwm-omap-dmtimer.c |   92 +++
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/drivers/pwm/pwm-omap-dmtimer.c b/drivers/pwm/pwm-omap-dmtimer.c
index cc485d9946f3..81c79e41a167 100644
--- a/drivers/pwm/pwm-omap-dmtimer.c
+++ b/drivers/pwm/pwm-omap-dmtimer.c
@@ -40,6 +40,7 @@ struct pwm_omap_dmtimer_chip {
pwm_omap_dmtimer *dm_timer;
struct omap_dm_timer_ops *pdata;
struct platform_device *dm_timer_pdev;
+   unsigned long freq;
 };
 
 static inline struct pwm_omap_dmtimer_chip *
@@ -48,9 +49,10 @@ to_pwm_omap_dmtimer_chip(struct pwm_chip *chip)
return container_of(chip, struct pwm_omap_dmtimer_chip, chip);
 }
 
-static u32 pwm_omap_dmtimer_get_clock_cycles(unsigned long clk_rate, int ns)
+static inline u32
+pwm_omap_dmtimer_get_clock_cycles(struct pwm_omap_dmtimer_chip *omap, int ns)
 {
-   return DIV_ROUND_CLOSEST_ULL((u64)clk_rate * ns, NSEC_PER_SEC);
+   return DIV_ROUND_CLOSEST_ULL((u64)omap->freq * ns, NSEC_PER_SEC);
 }
 
 static void pwm_omap_dmtimer_start(struct pwm_omap_dmtimer_chip *omap)
@@ -99,8 +101,6 @@ static int pwm_omap_dmtimer_config(struct pwm_chip *chip,
struct pwm_omap_dmtimer_chip *omap = to_pwm_omap_dmtimer_chip(chip);
u32 period_cycles, duty_cycles;
u32 load_value, match_value;
-   struct clk *fclk;
-   unsigned long clk_rate;
bool timer_active;
 
dev_dbg(chip->dev, "requested duty cycle: %d ns, period: %d ns\n",
@@ -114,19 +114,6 @@ static int pwm_omap_dmtimer_config(struct pwm_chip *chip,
return 0;
}
 
-   fclk = omap->pdata->get_fclk(omap->dm_timer);
-   if (!fclk) {
-   dev_err(chip->dev, "invalid pmtimer fclk\n");
-   goto err_einval;
-   }
-
-   clk_rate = clk_get_rate(fclk);
-   if (!clk_rate) {
-   dev_err(chip->dev, "invalid pmtimer fclk rate\n");
-   goto err_einval;
-   }
-
-   dev_dbg(chip->dev, "clk rate: %luHz\n", clk_rate);
 
/*
 * Calculate the appropriate load and match values based on the
@@ -144,35 +131,35 @@ static int pwm_omap_dmtimer_config(struct pwm_chip *chip,
 *   OMAP4430/60/70 TRM sections 22.2.4.10 and 22.2.4.11
 *   AM335x Sitara TRM sections 20.1.3.5 and 20.1.3.6
 */
-   period_cycles = pwm_omap_dmtimer_get_clock_cycles(clk_rate, period_ns);
-   duty_cycles = pwm_omap_dmtimer_get_clock_cycles(clk_rate, duty_ns);
+   period_cycles = pwm_omap_dmtimer_get_clock_cycles(omap, period_ns);
+   duty_cycles = pwm_omap_dmtimer_get_clock_cycles(omap, duty_ns);
 
if (period_cycles < 2) {
dev_info(chip->dev,
 "period %d ns too short for clock rate %lu Hz\n",
-period_ns, clk_rate);
+period_ns, omap->freq);
goto err_einval;
}
 
if (duty_cycles < 1) {
dev_dbg(chip->dev,
"duty cycle %d ns is too short for clock rate %lu Hz\n",
-   duty_ns, clk_rate);
+   duty_ns, omap->freq);
dev_dbg(chip->dev, "using minimum of 1 clock cycle\n");
duty_cycles = 1;
} else if (duty_cycles >= period_cycles) {
dev_dbg(chip->dev,
"duty cycle %d ns is too long for period %d ns at clock 
rate %lu Hz\n",
-   duty_ns, period_ns, clk_rate);
+   duty_ns, period_ns, omap->freq);
dev_dbg(chip->dev, "using maximum of 1 clock cycle less than 
period\n");
duty_cycles = period_cycles - 1;
}
 
dev_dbg(chip->dev, "effective duty cycle: %lld ns, period: %lld ns\n",
DIV_ROUND_CLOSEST_ULL((u64)NSEC_PER_SEC * duty_cycles,
- clk_rate),
+ omap->freq),
DIV_ROUND_CLOSEST_ULL((u64)NSEC_PER_SEC * period_cycles,
- clk_rate));
+ omap->freq));
 
load_value = (DM_TIMER_MAX - period_cycles) + 1;
match_value = load_value + duty_cycles - 1;
@@ -248,8 +235,9 @@ static int pwm_omap_dmtimer_probe(struct platform_device 
*pdev)
struct dmtimer_platform_data *timer_pdata;
struct omap_dm_timer_ops *pdata;
pwm_omap_dmtimer *dm_timer;
+   struct clk *fclk;
u32 v;
-   int status, ret;
+   int ret;
 
timer = of_parse_phandle(np, "ti,timers", 0);
if (!timer)
@@ -302,9 +290,8 @@ static int pwm_omap_dmtimer_probe(struct platform_device 
*pdev)
 
omap = devm_kzalloc(&pd

[PATCH 1/3] clocksource: timer-dm: Check prescaler value

2018-01-17 Thread Ladislav Michl
An invalid prescaler value is currently silently treated as
"no prescaler". Use -1 explicitly for that purpose and error
out on any other invalid value.

Signed-off-by: Ladislav Michl 
---
 drivers/clocksource/timer-dm.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/timer-dm.c b/drivers/clocksource/timer-dm.c
index 60db1734ea3b..324ec93d3dd2 100644
--- a/drivers/clocksource/timer-dm.c
+++ b/drivers/clocksource/timer-dm.c
@@ -663,13 +663,13 @@ int omap_dm_timer_set_prescaler(struct omap_dm_timer 
*timer, int prescaler)
 {
u32 l;
 
-   if (unlikely(!timer))
+   if (unlikely(!timer) || prescaler < -1 || prescaler > 7)
return -EINVAL;
 
omap_dm_timer_enable(timer);
l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG);
l &= ~(OMAP_TIMER_CTRL_PRE | (0x07 << 2));
-   if (prescaler >= 0x00 && prescaler <= 0x07) {
+   if (prescaler >= 0) {
l |= OMAP_TIMER_CTRL_PRE;
l |= prescaler << 2;
}
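
(For reference, with the prescaler enabled the effective timer rate on
these parts is fclk / 2^(prescaler + 1), and -1 now explicitly means "no
prescaler". A minimal sketch of the resulting rate calculation; the
helper name here is made up:)

	static unsigned long dmtimer_effective_rate(unsigned long fclk_rate,
						    int prescaler)
	{
		if (prescaler < 0)	/* prescaler bypassed */
			return fclk_rate;
		/* PTV value n divides the functional clock by 2^(n+1) */
		return fclk_rate >> (prescaler + 1);
	}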


[PATCH 0/3] omap: dmtimer: Fix and cleanup moved driver

2018-01-17 Thread Ladislav Michl
This series is built on top of Keerthy's 'omap: dmtimer: Move driver
out of plat-omap' v7.

Ladislav Michl (3):
  clocksource: timer-dm: Check prescaler value
  pwm: pwm-omap-dmtimer: Fix frequency when using prescaler
  clocksource: timer-dm: Make unexported functions static

 drivers/clocksource/timer-dm.c |  222 -
 drivers/pwm/pwm-omap-dmtimer.c |   92 +---
 include/clocksource/dmtimer.h  |   24 
 3 files changed, 164 insertions(+), 174 deletions(-)



Re: [PATCH] usb: dwc3: core: power on PHYs before initializing core

2018-01-17 Thread Brian Norris
On Fri, Jan 12, 2018 at 12:00:16PM +0800, William Wu wrote:
> The dwc3_core_init() gets the PHYs and initializes the PHYs with
> the usb_phy_init() and phy_init() functions before initializing
> core, and power on the PHYs after core initialization is done.
> 
> However, on some platforms (e.g. Rockchip RK3399 DWC3 with Type-C
> USB3 PHY), some special operations need to be done while powering on
> the Type-C PHY before initializing the DWC3 core. This is because
> the RK3399 Type-C PHY requires the DWC3 controller to be held in
> reset state to keep the PIPE power state in P2 while configuring
> the Type-C PHY; otherwise, waiting for the PIPE to become ready may
> time out. In this case, if we power on the PHYs after the DWC3 core
> initialization is done, the core will be reset to an uninitialized
> state after powering on the PHYs.
> 
> Fix this by powering on the PHYs before initializing the core. And
> because the GUID register may also be reset in this case, we
> need to configure the GUID register after powering on the PHYs.
> 
> Signed-off-by: William Wu 

This kinda should be part of your series:

[PATCH 0/3] Reset USB3 controller before initializing Type-C PHY on rk3399

or at least mentioned there, because the series there doesn't quite
work right otherwise, no?

Anyway, I think this patch looks OK. I don't immediately see good
reasons for delaying the PHY init until later, and I do see reasons why
it could be useful earlier:

Reviewed-by: Brian Norris 

> ---
>  drivers/usb/dwc3/core.c | 46 ++
>  1 file changed, 22 insertions(+), 24 deletions(-)


Re: [PATCH v2] KVM: s390: wire up bpb feature

2018-01-17 Thread Christian Borntraeger
On 01/17/2018 02:51 PM, David Hildenbrand wrote:
> On 17.01.2018 14:44, Christian Borntraeger wrote:
>> The new firmware interfaces for branch prediction behaviour changes
>> are transparently available for the guest. Nevertheless, there is
>> new state attached that should be migrated and properly reset.
>> Provide a mechanism for handling reset, migration and VSIE.
>>
>> Signed-off-by: Christian Borntraeger 
>> ---
>> v1->v2: - review feedback from David
>>  - rename seb(c) into bpb(c)
>>  arch/s390/include/asm/kvm_host.h |  3 ++-
>>  arch/s390/include/uapi/asm/kvm.h |  5 -
>>  arch/s390/kvm/kvm-s390.c | 12 
>>  arch/s390/kvm/vsie.c | 10 ++
>>  include/uapi/linux/kvm.h |  1 +
>>  5 files changed, 29 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/s390/include/asm/kvm_host.h 
>> b/arch/s390/include/asm/kvm_host.h
>> index e14f381..c1b0a9a 100644
>> --- a/arch/s390/include/asm/kvm_host.h
>> +++ b/arch/s390/include/asm/kvm_host.h
>> @@ -207,7 +207,8 @@ struct kvm_s390_sie_block {
>>  __u16   ipa;/* 0x0056 */
>>  __u32   ipb;/* 0x0058 */
>>  __u32   scaoh;  /* 0x005c */
>> -__u8 reserved60; /* 0x0060 */
>> +#define FPF_BPBC 0x20
>> +__u8 fpf;/* 0x0060 */
>>  #define ECB_GS  0x40
>>  #define ECB_TE  0x10
>>  #define ECB_SRSI0x04
>> diff --git a/arch/s390/include/uapi/asm/kvm.h 
>> b/arch/s390/include/uapi/asm/kvm.h
>> index 38535a57..4cdaa55 100644
>> --- a/arch/s390/include/uapi/asm/kvm.h
>> +++ b/arch/s390/include/uapi/asm/kvm.h
>> @@ -224,6 +224,7 @@ struct kvm_guest_debug_arch {
>>  #define KVM_SYNC_RICCB  (1UL << 7)
>>  #define KVM_SYNC_FPRS   (1UL << 8)
>>  #define KVM_SYNC_GSCB   (1UL << 9)
>> +#define KVM_SYNC_BPBC   (1UL << 10)
>>  /* length and alignment of the sdnx as a power of two */
>>  #define SDNXC 8
>>  #define SDNXL (1UL << SDNXC)
>> @@ -247,7 +248,9 @@ struct kvm_sync_regs {
>>  };
>>  __u8  reserved[512];/* for future vector expansion */
>>  __u32 fpc;  /* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */
>> -__u8 padding1[52];  /* riccb needs to be 64byte aligned */
>> +__u8 bpbc : 1;  /* bp mode */
>> +__u8 reserved2 : 7;
>> +__u8 padding1[51];  /* riccb needs to be 64byte aligned */
>>  __u8 riccb[64]; /* runtime instrumentation controls block */
>>  __u8 padding2[192]; /* sdnx needs to be 256byte aligned */
>>  union {
>> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
>> index 2c93cbb..2598cf243 100644
>> --- a/arch/s390/kvm/kvm-s390.c
>> +++ b/arch/s390/kvm/kvm-s390.c
>> @@ -421,6 +421,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
>> ext)
>>  case KVM_CAP_S390_GS:
>>  r = test_facility(133);
>>  break;
>> +case KVM_CAP_S390_BPB:
>> +r = test_facility(82);
>> +break;
>>  default:
>>  r = 0;
>>  }
>> @@ -2198,6 +2201,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
>>  kvm_s390_set_prefix(vcpu, 0);
>>  if (test_kvm_facility(vcpu->kvm, 64))
>>  vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
>> +if (test_kvm_facility(vcpu->kvm, 82))
>> +vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC;
>>  if (test_kvm_facility(vcpu->kvm, 133))
>>  vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB;
>>  /* fprs can be synchronized via vrs, even if the guest has no vx. With
>> @@ -2339,6 +2344,7 @@ static void kvm_s390_vcpu_initial_reset(struct 
>> kvm_vcpu *vcpu)
>>  current->thread.fpu.fpc = 0;
>>  vcpu->arch.sie_block->gbea = 1;
>>  vcpu->arch.sie_block->pp = 0;
>> +vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
>>  vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
>>  kvm_clear_async_pf_completion_queue(vcpu);
>>  if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
>> @@ -3298,6 +3304,11 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct 
>> kvm_run *kvm_run)
>>  vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT;
>>  vcpu->arch.gs_enabled = 1;
>>  }
>> +if ((kvm_run->kvm_dirty_regs & KVM_SYNC_BPBC) &&
>> +test_kvm_facility(vcpu->kvm, 82)) {
>> +vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
>> +vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 
>> 0;
>> +}
>>  save_access_regs(vcpu->arch.host_acrs);
>>  restore_access_regs(vcpu->run->s.regs.acrs);
>>  /* save host (userspace) fprs/vrs */
>> @@ -3344,6 +3355,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct 
>> kvm_run *kvm_run)
>>  kvm_run->s.regs.pft = vcpu->arch.pfault_token;
>>  kvm_run->s.regs.pfs = vcpu->arch.pfault_select;
>>  kvm_run->s.regs.pfc = vcpu->arch.pfault_compare;
>> +kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == 
>> FPF_BPBC;
>>  save_access_regs(vcpu->ru

Re: [PATCH v2 2/8] x86/enter: MACROS to set/clear IBRS

2018-01-17 Thread Tim Chen
On 01/06/2018 03:05 PM, Mark Marshall wrote:
> Hi.
> 
> (I've only just subscribed and can't work out how to reply to a message from 
> before I subscribed (on my phone), sorry)
> 
> In the macro WRMSR_ASM you seem to have lost the wrmsr?
> 
>

Yes.  That's a bug noticed by Thomas, and I'll fix it in the update to the IBRS patches.

Tim


[PATCH] input: touchscreen: make USB drivers depend on USB

2018-01-17 Thread Marcus Folkesson
A driver should not enable an entire subsystem.

Signed-off-by: Marcus Folkesson 
---
 drivers/input/touchscreen/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/input/touchscreen/Kconfig 
b/drivers/input/touchscreen/Kconfig
index 38a226f9fcbd..08d2c1434493 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -837,8 +837,7 @@ config TOUCHSCREEN_WM97XX_ZYLONITE
 
 config TOUCHSCREEN_USB_COMPOSITE
tristate "USB Touchscreen Driver"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  USB Touchscreen driver for:
  - eGalax Touchkit USB (also includes eTurboTouch CT-410/510/700)
-- 
2.15.1



Re: [mm 4.15-rc8] Random oopses under memory pressure.

2018-01-17 Thread Linus Torvalds
On Wed, Jan 17, 2018 at 3:08 AM, Tetsuo Handa
 wrote:
>
> I needed to bisect between 4.10 and 4.11, and I got a plausible culprit.
> [...]
> git bisect bad b4fb8f66f1ae2e167d06c12d018025a8d4d3ba7e
> # first bad commit: [b4fb8f66f1ae2e167d06c12d018025a8d4d3ba7e] mm, 
> page_alloc: Add missing check for memory holes

Ok, that is indeed much more likely, and very much matches the whole
"this problem only happens with sparsemem" issue.

In fact, the whole

   pfn_valid_within(buddy_pfn)

test looks very odd. Maybe the pfn of the buddy is valid, but it's not
in the same zone? Then we'd combine the two pages in two different
zones into one combined page.

Maybe that's why HIGHMEM matters? The low DMA zone is obviously
aligned in the whole PAGE_ORDER range. But the highmem zone might not
be. I used to know the highmem code, but I've happily forgotten
everything. But I think we end up deciding on some random non-aligned
number in the 900MB range as being the limit between the regular zone
and the HIGHMEM zone.

So maybe something like this to test the theory?

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 76c9688b6a0a..f919a5548943 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -756,6 +756,8 @@ static inline void rmv_page_order(struct page *page)
 static inline int page_is_buddy(struct page *page, struct page *buddy,
unsigned int order)
 {
+   if (WARN_ON_ONCE(page_zone(page) != page_zone(buddy)))
+   return 0;
if (page_is_guard(buddy) && page_order(buddy) == order) {
if (page_zone_id(page) != page_zone_id(buddy))
return 0;

I don't know. Does that warning trigger for you?

The above is completely untested. It might not compile. If it compiles
it might not work. And even if it "works", it might not matter,
because perhaps the boundary between regular memory and HIGHMEM is
already sufficiently aligned.

Comments?

Linus
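
(Linus's alignment worry can be stated directly: merging is only safe if
no zone boundary falls inside a maximal buddy block. A hedged sketch of
that check, assuming MAX_ORDER from mmzone.h; the helper name is made
up:)

	/* a zone boundary is buddy-safe only if it is aligned to the
	 * largest buddy block, i.e. 2^(MAX_ORDER - 1) pages */
	static inline bool zone_start_buddy_aligned(unsigned long start_pfn)
	{
		return IS_ALIGNED(start_pfn, 1UL << (MAX_ORDER - 1));
	}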


Re: [PATCH 3/6] backlight/pandora: Stop using BL_CORE_DRIVER1

2018-01-17 Thread Daniel Vetter
Thanks a lot for your comments.

On Wed, Jan 17, 2018 at 04:47:41PM +, Daniel Thompson wrote:
> On 17/01/18 14:01, Daniel Vetter wrote:
> > Leaking driver internal tracking into the already massively confusing
> > backlight power tracking is really confusing.
> > 
> > Stop that by allocating a tiny driver private data structure instead.
> > 
> > Cc: Lee Jones 
> > Cc: Daniel Thompson 
> > Cc: Jingoo Han 
> > Signed-off-by: Daniel Vetter 
> > ---
> >   drivers/video/backlight/pandora_bl.c | 26 +++---
> >   1 file changed, 19 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/video/backlight/pandora_bl.c 
> > b/drivers/video/backlight/pandora_bl.c
> > index a186bc677c7d..6bd159946a47 100644
> > --- a/drivers/video/backlight/pandora_bl.c
> > +++ b/drivers/video/backlight/pandora_bl.c
> > @@ -35,11 +35,15 @@
> >   #define MAX_VALUE 63
> >   #define MAX_USER_VALUE (MAX_VALUE - MIN_VALUE)
> > -#define PANDORABL_WAS_OFF BL_CORE_DRIVER1
> > +struct pandora_private {
> > +   unsigned old_state;
> > +#define PANDORABL_WAS_OFF 1
> 
> Nit, but we using old_state like a bitfield so, BIT(0)?
> 
> 
> > +};
> >   static int pandora_backlight_update_status(struct backlight_device *bl)
> >   {
> > int brightness = bl->props.brightness;
> > +   struct pandora_private *priv = bl_get_data(bl);
> > u8 r;
> > if (bl->props.power != FB_BLANK_UNBLANK)
> > @@ -53,7 +57,7 @@ static int pandora_backlight_update_status(struct 
> > backlight_device *bl)
> > brightness = MAX_USER_VALUE;
> > if (brightness == 0) {
> > -   if (bl->props.state & PANDORABL_WAS_OFF)
> > +   if (priv->old_state & PANDORABL_WAS_OFF)
> > goto done;
> > /* first disable PWM0 output, then clock */
> > @@ -66,7 +70,7 @@ static int pandora_backlight_update_status(struct 
> > backlight_device *bl)
> > goto done;
> > }
> > -   if (bl->props.state & PANDORABL_WAS_OFF) {
> > +   if (priv->old_state & PANDORABL_WAS_OFF) {
> > /*
> >  * set PWM duty cycle to max. TPS61161 seems to use this
> >  * to calibrate it's PWM sensitivity when it starts.
> > @@ -93,9 +97,9 @@ static int pandora_backlight_update_status(struct 
> > backlight_device *bl)
> >   done:
> > if (brightness != 0)
> > -   bl->props.state &= ~PANDORABL_WAS_OFF;
> > +   priv->old_state = 0;
> > else
> > -   bl->props.state |= PANDORABL_WAS_OFF;
> > +   priv->old_state = PANDORABL_WAS_OFF;
> 
> Well, we were using it like a bitfield until this bit...

I had a simple boolean first (because that's all we need), but that made
the code less readable. Should I s/1/true/ in the #define? The entire C99
bool tends to be a bit a bikeshed sometimes :-)

> 
> 
> > return 0;
> >   }
> > @@ -109,15 +113,23 @@ static int pandora_backlight_probe(struct 
> > platform_device *pdev)
> >   {
> > struct backlight_properties props;
> > struct backlight_device *bl;
> > +   struct pandora_private *priv;
> > u8 r;
> > +   priv = devm_kmalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
> > +   if (!priv) {
> > +   dev_err(&pdev->dev, "failed to allocate driver private data\n");
> > +   return -ENOMEM;
> > +   }
> > +
> > memset(&props, 0, sizeof(props));
> > props.max_brightness = MAX_USER_VALUE;
> > props.type = BACKLIGHT_RAW;
> > bl = devm_backlight_device_register(&pdev->dev, pdev->name, &pdev->dev,
> > -   NULL, &pandora_backlight_ops, &props);
> > +   priv, &pandora_backlight_ops, &props);
> > if (IS_ERR(bl)) {
> > dev_err(&pdev->dev, "failed to register backlight\n");
> > +   kfree(priv);
> 
> Why can't we rely on devres for cleanup?

Argh, I had kmalloc first and then changed to devm_kmalloc. The kfree here
needs to go indeed.

Cheers, Daniel

> 
> 
> > return PTR_ERR(bl);
> > }
> > @@ -126,7 +138,7 @@ static int pandora_backlight_probe(struct 
> > platform_device *pdev)
> > /* 64 cycle period, ON position 0 */
> > twl_i2c_write_u8(TWL_MODULE_PWM, 0x80, TWL_PWM0_ON);
> > -   bl->props.state |= PANDORABL_WAS_OFF;
> > +   priv->old_state = PANDORABL_WAS_OFF;
> > bl->props.brightness = MAX_USER_VALUE;
> > backlight_update_status(bl);
> > 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


[PATCH] input: tablet: make USB drivers depend on USB

2018-01-17 Thread Marcus Folkesson
A driver should not enable an entire subsystem.

Signed-off-by: Marcus Folkesson 
---
 drivers/input/tablet/Kconfig | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/input/tablet/Kconfig b/drivers/input/tablet/Kconfig
index a2b9f97422ce..379f40858709 100644
--- a/drivers/input/tablet/Kconfig
+++ b/drivers/input/tablet/Kconfig
@@ -13,8 +13,7 @@ if INPUT_TABLET
 
 config TABLET_USB_ACECAD
tristate "Acecad Flair tablet support (USB)"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use the USB version of the Acecad Flair
  tablet.  Make sure to say Y to "Mouse support"
@@ -26,8 +25,7 @@ config TABLET_USB_ACECAD
 
 config TABLET_USB_AIPTEK
tristate "Aiptek 6000U/8000U and Genius G_PEN tablet support (USB)"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use the USB version of the Aiptek 6000U,
  Aiptek 8000U or Genius G-PEN 560 tablet.  Make sure to say Y to
@@ -51,8 +49,7 @@ config TABLET_USB_GTCO
 
 config TABLET_USB_HANWANG
tristate "Hanwang Art Master III tablet support (USB)"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use the USB version of the Hanwang Art
  Master III tablet.
@@ -62,8 +59,7 @@ config TABLET_USB_HANWANG
 
 config TABLET_USB_KBTAB
tristate "KB Gear JamStudio tablet support (USB)"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use the USB version of the KB Gear
  JamStudio tablet.  Make sure to say Y to "Mouse support"
@@ -75,8 +71,7 @@ config TABLET_USB_KBTAB
 
 config TABLET_USB_PEGASUS
tristate "Pegasus Mobile Notetaker Pen input tablet support"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use the Pegasus Mobile Notetaker,
  also known as:
-- 
2.15.1



[PATCH] input: mouse: make USB drivers depend on USB

2018-01-17 Thread Marcus Folkesson
A driver should not enable an entire subsystem.

Signed-off-by: Marcus Folkesson 
---
 drivers/input/mouse/Kconfig | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/input/mouse/Kconfig b/drivers/input/mouse/Kconfig
index 89ebb8f39fee..38aed9b0bcb8 100644
--- a/drivers/input/mouse/Kconfig
+++ b/drivers/input/mouse/Kconfig
@@ -202,8 +202,7 @@ config MOUSE_SERIAL
 
 config MOUSE_APPLETOUCH
tristate "Apple USB Touchpad support"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use an Apple USB Touchpad.
 
@@ -223,8 +222,7 @@ config MOUSE_APPLETOUCH
 
 config MOUSE_BCM5974
tristate "Apple USB BCM5974 Multitouch trackpad support"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you have an Apple USB BCM5974 Multitouch
  trackpad.
@@ -418,8 +416,7 @@ config MOUSE_SYNAPTICS_I2C
 
 config MOUSE_SYNAPTICS_USB
tristate "Synaptics USB device support"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use a Synaptics USB touchpad or pointing
  stick.
-- 
2.15.1



[PATCH] input: misc: make USB drivers depend on USB

2018-01-17 Thread Marcus Folkesson
A driver should not enable an entire subsystem.

Signed-off-by: Marcus Folkesson 
---
 drivers/input/misc/Kconfig | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig
index 9f082a388388..dd04da910bc9 100644
--- a/drivers/input/misc/Kconfig
+++ b/drivers/input/misc/Kconfig
@@ -354,8 +354,7 @@ config INPUT_ATLAS_BTNS
 
 config INPUT_ATI_REMOTE2
tristate "ATI / Philips USB RF remote control"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use an ATI or Philips USB RF remote control.
  These are RF remotes with USB receivers.
@@ -369,8 +368,7 @@ config INPUT_ATI_REMOTE2
 
 config INPUT_KEYSPAN_REMOTE
tristate "Keyspan DMR USB remote control"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use a Keyspan DMR USB remote control.
  Currently only the UIA-11 type of receiver has been tested.  The tag
@@ -401,8 +399,7 @@ config INPUT_KXTJ9_POLLED_MODE
 
 config INPUT_POWERMATE
tristate "Griffin PowerMate and Contour Jog support"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use Griffin PowerMate or Contour Jog 
devices.
  These are aluminum dials which can measure clockwise and anticlockwise
@@ -417,8 +414,7 @@ config INPUT_POWERMATE
 
 config INPUT_YEALINK
tristate "Yealink usb-p1k voip phone"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to enable keyboard and LCD functions of the
  Yealink usb-p1k usb phones. The audio part is enabled by the generic
@@ -432,8 +428,7 @@ config INPUT_YEALINK
 
 config INPUT_CM109
tristate "C-Media CM109 USB I/O Controller"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to enable keyboard and buzzer functions of the
  C-Media CM109 usb phones. The audio part is enabled by the generic
-- 
2.15.1



[PATCH] kernel:bpf Remove structure passing and assignment to save stack and no coping structures

2018-01-17 Thread Karim Eshapa
>On Sun, Jan 14, 2018 at 01:18:35PM +0200, Karim Eshapa wrote:
>> >> Use pointers to structures as arguments to functions instead of copying
>> >> structures and less stack size. Also transfer TNUM(_v, _m) to
>> >> tnum.h file to be used in different files for creating anonymous 
>> >> structures
>> >> statically.
>> >>
>> >> Signed-off-by: Karim Eshapa 
>> ...
>> >> +/* Statically tnum constant */
>> >> +#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}
>> >>  /* Represent a known constant as a tnum. */
>> >>  struct tnum tnum_const(u64 value);
>> >>  /* A completely unknown value */
>> >> @@ -26,7 +28,7 @@ struct tnum tnum_lshift(struct tnum a, u8 shift);
>> >>  /* Shift a tnum right (by a fixed shift) */
>> >>  struct tnum tnum_rshift(struct tnum a, u8 shift);
>> >>  /* Add two tnums, return @a + @b */
>> >> -struct tnum tnum_add(struct tnum a, struct tnum b);
>> >> +void tnum_add(struct tnum *res, struct tnum *a, struct tnum *b);
>> ...
>> >> - reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + 
>> >> off));
>> >> + tnum_add(®_off, ®->var_off, &TNUM(ip_align + reg->off + off, 
>> >> 0));
>> >>   if (!tnum_is_aligned(reg_off, size)) {
>> >>   char tn_buf[48];
>> >>
>> >> @@ -1023,8 +1023,7 @@ static int check_generic_ptr_alignment(struct 
>> >> bpf_verifier_env *env,
>> >>   /* Byte size accesses are always allowed. */
>> >>   if (!strict || size == 1)
>> >>   return 0;
>> >> -
>> >> - reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
>> >> + tnum_add(®_off, ®->var_off, &TNUM(reg->off + off, 0));
>> ...
> >> - dst_reg->var_off = tnum_add(ptr_reg->var_off, 
> >> off_reg->var_off);
>> >> + tnum_add(&dst_reg->var_off, &ptr_reg->var_off,
>> >> + &off_reg->var_off);
>>
>> >Is it gnu or intel style of arguments ? where is src or dest ?
>> >Can the same pointer be used as src and as dst ? etc, etc
>> >I don't think it saves stack either.
>> >I'd rather leave things as-is.
>>
>> It's not a specific style, but it's recommended when passing structures,
>> especially if the structures are large. The arguments are
>> (dest, src0, src1) respectively. Although the tnum structure isn't
>> large, it saves stack: we have 2 structures passed before the call and
>> 1 returned to receive the return value.

>1. your patch has compile time warnings
>2. it doesn't reduce stack size.
>   For two functions that use tnum_add:
>   adjust_ptr_min_max_vals() before and after has exactly the same.
>   check_ptr_alignment() after your patch _increased_ stack size.
>3. text of verifier.o shrank 133 bytes while tnum.o increased 198

>Please do your homework next time.
>tnum code will stay as-is.

Thanks so much for your response. If there are any recommended tools
for testing how a patch affects memory and performance, and for seeing
what's going on, I'd really appreciate hearing about them, since all
the accepted patches I submitted so far were quite trivial.

Karim, 
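
(For the measurement question above: the tree already carries
scripts/bloat-o-meter, which takes two object files and reports
per-symbol text/data size deltas, and scripts/checkstack.pl, which
reports per-function stack usage when fed disassembly, e.g.
"objdump -d kernel/bpf/verifier.o | scripts/checkstack.pl x86_64".
GCC's -fstack-usage option produces per-function .su files as well.)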


[PATCH] input: joystick: make USB drivers depend on USB

2018-01-17 Thread Marcus Folkesson
A driver should not enable an entire subsystem.

Signed-off-by: Marcus Folkesson 
---
 drivers/input/joystick/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/input/joystick/Kconfig b/drivers/input/joystick/Kconfig
index 332c0cc1b2ab..4a199cff8c68 100644
--- a/drivers/input/joystick/Kconfig
+++ b/drivers/input/joystick/Kconfig
@@ -279,8 +279,7 @@ config JOYSTICK_JOYDUMP
 
 config JOYSTICK_XPAD
tristate "X-Box gamepad support"
-   depends on USB_ARCH_HAS_HCD
-   select USB
+   depends on USB
help
  Say Y here if you want to use the X-Box pad with your computer.
  Make sure to say Y to "Joystick support" (CONFIG_INPUT_JOYDEV)
-- 
2.15.1



Re: [PATCH v7 11/14] MIPS: ingenic: Initial JZ4770 support

2018-01-17 Thread James Hogan
On Tue, Jan 16, 2018 at 04:48:01PM +0100, Paul Cercueil wrote:
> Provide just enough bits (clocks, clocksource, uart) to allow a kernel
> to boot on the JZ4770 SoC to an initramfs userspace.
> 
> Signed-off-by: Paul Cercueil 
> Reviewed-by: PrasannaKumar Muralidharan 


> diff --git a/arch/mips/jz4740/time.c b/arch/mips/jz4740/time.c
> index bb1ad5119da4..2ca9160f642a 100644
> --- a/arch/mips/jz4740/time.c
> +++ b/arch/mips/jz4740/time.c
> @@ -113,7 +113,7 @@ static struct clock_event_device jz4740_clockevent = {
>  #ifdef CONFIG_MACH_JZ4740
>   .irq = JZ4740_IRQ_TCU0,
>  #endif
> -#ifdef CONFIG_MACH_JZ4780
> +#if defined(CONFIG_MACH_JZ4770) || defined(CONFIG_MACH_JZ4780)
>   .irq = JZ4780_IRQ_TCU2,
>  #endif
>  };
> -- 
> 2.11.0
> 

MACH_INGENIC selects SYS_SUPPORTS_ZBOOT_UART16550, so I wonder whether
arch/mips/boot/compressed/uart-16550.c needs updating for JZ4770 like
commit ba9e72c2290f ("MIPS: Fix build with DEBUG_ZBOOT and MACH_JZ4780")
does for JZ4780.

Otherwise the non-DT bits look reasonable (I've not really looked
properly at the DT):
Reviewed-by: James Hogan 

Cheers
James


signature.asc
Description: Digital signature


Re: PROBLEM: epoll_wait does not obey edge triggering semantics for hierarchically constructed epoll sets

2018-01-17 Thread Nick Murphy
Thanks.

Yeah, I didn't track it down, but I suspect this behavior has always
been there.  I do think it's ultimately incorrect behavior (i.e., a
violation of edge triggering semantics as I note in the initial
report).  The implication of this is that we'd like the option to
construct hierarchical epoll sets and use them in generic code that
uses epoll_wait and expects edge triggering, but we can't (because
epoll fd's behave differently from other fd's).  I'm a little
surprised others haven't come across this.

I'm not really a kernel developer nor am I particularly familiar with
this code, so I'm unclear how ugly a fix would be...I can imagine
there may be locking issues with recursively traversing child epoll
fd's?...

Nick

On Wed, Jan 17, 2018 at 9:21 AM, Jason Baron  wrote:
>
>
> On 01/12/2018 07:06 PM, Nick Murphy wrote:
>> [1.] One line summary of the problem:
>> epoll_wait does not obey edge triggering semantics for file
>> descriptors which are themselves epoll file descriptors (i.e., epoll
>> fd's added to an epoll set with the EPOLLET flag)
>>
>> [2.] Full description of the problem/report:
>> When executing the following sequence:
>> 1) create and add an event fd (for example) to an inner epoll set
>> 2) add the inner epoll fd to an outer epoll set (with EPOLLET flag set)
>> 3) write to (increase the value of) the event fd
>> 4) epoll_wait on outer fd
>> 5) epoll_wait on outer fd again
>>
>> Edge triggering semantics imply that the epoll_wait in step 5 should
>> block (nothing has changed).  It does not.  It returns immediately.
>>
>> If epoll_wait is called on the inner fd between steps 4 and 5, the
>> epoll_wait in step 5 will then block as expected.
>>
>> Does not seem to matter if the event is added to the inner epoll set
>> with EPOLLET set or not.
>>
>> [3.] Keywords (i.e., modules, networking, kernel): epoll, epoll_wait,
>> edge triggering
>>
>> [4.] Kernel version (from /proc/version): 4.4.0-103-generic (gcc version 
>> 4.8.4)
>>
>> [6.] A small shell script or example program which triggers the
>>  problem (if possible)
>>
>
> Interesting - it seems that epoll can excessively queue wakeup events
> when not desired. Here's a small patch which cures this case, if you
> want to try it out. The semantics around nested edge trigger, though, do
> seem unexpected. For example, in the test case you presented, if one
> does epoll_wait() on the outer first and then the inner, both
> epoll_wait() calls will return; however, if one does the inner first and
> then the outer, only the inner will return an event. This has to do with
> how epoll implements its polling; it seems odd as well, and trickier to
> fix. Afaict it's always acted like this.
>
> Thanks,
>
> -Jason
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index afd548e..6bd1f46 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -713,6 +713,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
> if (!ep_is_linked(&epi->rdllink)) {
> list_add_tail(&epi->rdllink, &ep->rdllist);
> ep_pm_stay_awake(epi);
> +   pwake = 1;
> }
> }
> /*
> @@ -728,7 +729,8 @@ static int ep_scan_ready_list(struct eventpoll *ep,
> list_splice(&txlist, &ep->rdllist);
> __pm_relax(ep->ws);
>
> -   if (!list_empty(&ep->rdllist)) {
> +   if (unlikely(pwake)) {
> +   pwake = 0;
> /*
>  * Wake up (if active) both the eventpoll wait list and
>  * the ->poll() wait list (delayed after we release the
> lock).
> @@ -744,7 +746,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
> mutex_unlock(&ep->mtx);
>
> /* We have to call this outside the lock */
> -   if (pwake)
> +   if (unlikely(pwake))
> ep_poll_safewake(&ep->poll_wait);
>
> return error;
>
>
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>>
>> int main(int argc, char** argv) {
>>   struct epoll_event ev, events[1];
>>   int inner_ep, outer_ep, sem, flags, ret;
>>   long long val = 1;
>>
>>   if ((sem = eventfd(0, 0)) < 0) {
>> fprintf(stderr, "eventfd failed");
>> return -1;
>>   }
>>
>>   if ((inner_ep = epoll_create(1)) < 0) {
>> fprintf(stderr, "inner epoll_create failed");
>> return -1;
>>   }
>>
>>   // Set inner to be non-blocking (probably irrelevant, but...)
>>   if ((flags = fcntl(inner_ep, F_GETFL, 0)) < 0) {
>> fprintf(stderr, "fcntl get failed");
>> return -1;
>>   }
>>   flags |= O_NONBLOCK;
>>   if (fcntl(inner_ep, F_SETFL, flags) < 0) {
>> fprintf(stderr, "fcntl set failed");
>> return -1;
>>   }
>>
>>   // Add the event to the inner epoll instance.
>>   ev.events = EPOLLIN | EPOLLET;
>>   ev.data.fd = sem;
>>   if (epoll_ctl(inner_ep, EPOLL_CTL_ADD, sem, &ev) < 0) {
>> fprintf(stderr, "inner add failed");
>> return -1;
>>   }
>>
>>   if ((outer_ep = epoll_create(1)) < 0) {
>> f

Re: [PATCH v16 01/10] video: backlight: Add helpers to enable and disable backlight

2018-01-17 Thread Daniel Vetter
On Wed, Jan 17, 2018 at 6:00 PM, Daniel Thompson
 wrote:
> On 16/01/18 10:31, Meghana Madhyastha wrote:
>>
>> Add helper functions backlight_enable and backlight_disable to
>> enable/disable a backlight device. These helper functions can
>> then be used by different drm and tinydrm drivers to avoid
>> repetition of code and also to enforce a uniform and consistent
>> way to enable/disable a backlight device.
>>
>> Signed-off-by: Meghana Madhyastha 
>
>
> To be clear I don't disagree with anthing Daniel V. said about the horribly
> confused (and confusing) power states for backlight.
>
> Nevertheless I don't recall seeing any response (positive or negative) to
> this post from v13:
> https://www.spinics.net/lists/dri-devel/msg154459.html

I think also adjusting the fb_blank bits in these new helpers is a
reasonable thing to do. Maybe add a huge TODO comment that this
is all a bit sad ...
-Daniel

> Daniel.
>
>
>
>> ---
>>   include/linux/backlight.h | 30 ++
>>   1 file changed, 30 insertions(+)
>>
>> diff --git a/include/linux/backlight.h b/include/linux/backlight.h
>> index af7003548..7b6a9a2a3 100644
>> --- a/include/linux/backlight.h
>> +++ b/include/linux/backlight.h
>> @@ -130,6 +130,36 @@ static inline int backlight_update_status(struct
>> backlight_device *bd)
>> return ret;
>>   }
>>   +/**
>> +  * backlight_enable - Enable backlight
>> +  * @bd: the backlight device to enable
>> +  */
>> +static inline int backlight_enable(struct backlight_device *bd)
>> +{
>> +   if (!bd)
>> +   return 0;
>> +
>> +   bd->props.power = FB_BLANK_UNBLANK;
>> +   bd->props.state &= ~BL_CORE_FBBLANK;
>> +
>> +   return backlight_update_status(bd);
>> +}
>> +
>> +/**
>> +  * backlight_disable - Disable backlight
>> +  * @bd: the backlight device to disable
>> +  */
>> +static inline int backlight_disable(struct backlight_device *bd)
>> +{
>> +   if (!bd)
>> +   return 0;
>> +
>> +   bd->props.power = FB_BLANK_POWERDOWN;
>> +   bd->props.state |= BL_CORE_FBBLANK;
>> +
>> +   return backlight_update_status(bd);
>> +}
>> +
>>   extern struct backlight_device *backlight_device_register(const char
>> *name,
>> struct device *dev, void *devdata, const struct backlight_ops
>> *ops,
>> const struct backlight_properties *props);
>>
> ___
> dri-devel mailing list
> dri-de...@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel



-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
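
(A sketch of how a panel driver would consume these helpers once merged;
the driver and field names here are hypothetical, only
backlight_enable()/backlight_disable() come from the patch above:)

	static void panel_power_on(struct my_panel *panel)
	{
		/* ... program the controller, start scanout ... */
		backlight_enable(panel->backlight);	/* NULL-safe */
	}

	static void panel_power_off(struct my_panel *panel)
	{
		backlight_disable(panel->backlight);
		/* ... stop scanout, power down the controller ... */
	}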


[PATCH] usb: dwc3: Undo PHY init if soft reset fails

2018-01-17 Thread Brian Norris
In this function, we init the USB2 and USB3 PHYs, but if soft reset
times out, we don't unwind this.

Noticed by inspection.

Signed-off-by: Brian Norris 
---
 drivers/usb/dwc3/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index 07832509584f..1cbbca9fcc52 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -233,6 +233,9 @@ static int dwc3_core_soft_reset(struct dwc3 *dwc)
udelay(1);
} while (--retries);
 
+   phy_exit(dwc->usb3_generic_phy);
+   phy_exit(dwc->usb2_generic_phy);
+
return -ETIMEDOUT;
 }
 
-- 
2.16.0.rc1.238.g530d649a79-goog
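
(A condensed sketch of the control flow after this fix; the real
dwc3_core_soft_reset() also programs the DCTL/GUSB2PHYCFG/GUSB3PIPECTL
registers, and soft_reset_done() here merely stands in for the actual
DCTL CSFTRST poll:)

	static int dwc3_core_soft_reset_sketch(struct dwc3 *dwc)
	{
		int retries = 1000;

		phy_init(dwc->usb2_generic_phy);
		phy_init(dwc->usb3_generic_phy);

		do {
			if (soft_reset_done(dwc))
				return 0;
			udelay(1);
		} while (--retries);

		/* timeout: undo the PHY init -- this is the hunk above */
		phy_exit(dwc->usb3_generic_phy);
		phy_exit(dwc->usb2_generic_phy);
		return -ETIMEDOUT;
	}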



Re: [Freedreno] [PATCH 2/2] drm/msm/adreno: fix nvmem related link error

2018-01-17 Thread Jordan Crouse
On Mon, Jan 15, 2018 at 05:14:05PM +0100, Arnd Bergmann wrote:
> When NVMEM is configured as a loadable module, and adreno
> is built-in, we get a link failure:
> 
> drivers/gpu/drm/msm/adreno/a5xx_gpu.o: In function `a5xx_gpu_init':
> a5xx_gpu.c:(.text+0x15cc): undefined reference to `nvmem_cell_get'
> a5xx_gpu.c:(.text+0x15da): undefined reference to `nvmem_cell_read'
> a5xx_gpu.c:(.text+0x15e4): undefined reference to `nvmem_cell_put'
> 
> This adds a Kconfig dependency to enforce valid configurations,
> when NVMEM is a loadable module, adreno now has to also be one.
> The code seems to deal fine with nvmem being completely disabled,
> it will just not set the right speed bin then, so we don't need
> a hard dependency.
> 
> Fixes: f56d9df656c4 ("drm/msm/adreno: Read the speed bins for a5xx targets")
> Signed-off-by: Arnd Bergmann 

Reviewed-by: Jordan Crouse 
> ---
>  drivers/gpu/drm/msm/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
> index 99d39b2aefa6..74fb1c816da9 100644
> --- a/drivers/gpu/drm/msm/Kconfig
> +++ b/drivers/gpu/drm/msm/Kconfig
> @@ -4,6 +4,7 @@ config DRM_MSM
>   depends on DRM
>   depends on ARCH_QCOM || (ARM && COMPILE_TEST)
>   depends on OF && COMMON_CLK
> + depends on NVMEM || !NVMEM
>   depends on MMU
>   select QCOM_MDT_LOADER if ARCH_QCOM
>   select REGULATOR
> -- 
> 2.9.0
> 
> ___
> Freedreno mailing list
> freedr...@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/freedreno

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project


Re: [PATCH] Net: ethernet: ti: netcp: Fix inbound ping crash if MTU size is greater than 1500

2018-01-17 Thread David Miller
From: Rex Chang 
Date: Tue, 16 Jan 2018 15:16:01 -0500

> In the receive queue for 4096 bytes fragments, the page address
> set in the SW data0 field of the descriptor is not the one we got
> when doing the reassembly in receive. The page structure was retrieved
> from the wrong descriptor into SW data0 which is then causing a
> page fault when UDP checksum is accessing data above 1500.
> 
> Signed-off-by: Rex Chang 

Applied, thank you.


Re: [PATCH] [RESEND] drm/gma500: initialize gma_clock_t structures

2018-01-17 Thread Daniel Vetter
On Wed, Jan 17, 2018 at 8:44 PM, Arnd Bergmann  wrote:
> On Wed, Jan 17, 2018 at 3:55 PM, Daniel Vetter  wrote:
>> On Wed, Jan 17, 2018 at 3:36 PM, Arnd Bergmann  wrote:
>>> On Wed, Jan 17, 2018 at 9:27 AM, Daniel Vetter  wrote:
 On Tue, Jan 16, 2018 at 03:57:10PM +0100, Arnd Bergmann wrote:
> The two functions pass a partially initialized structure back to the
> caller after a memset() on the destination.
>
> This is not entirely well-defined, most compilers are sensible enough
> to either keep the zero-initialization for the uninitialized members,
> but gcc-4.4 does not, and it warns about this:
>
> drivers/gpu/drm/gma500/oaktrail_crtc.c: In function 'mrst_sdvo_find_best_pll':
> drivers/gpu/drm/gma500/oaktrail_crtc.c:175: warning: 'clock.vco' may be used uninitialized in this function
> drivers/gpu/drm/gma500/oaktrail_crtc.c:175: warning: 'clock.dot' may be used uninitialized in this function
> drivers/gpu/drm/gma500/oaktrail_crtc.c:175: warning: 'clock.p2' may be used uninitialized in this function
> drivers/gpu/drm/gma500/oaktrail_crtc.c:175: warning: 'clock.m2' may be used uninitialized in this function
> drivers/gpu/drm/gma500/oaktrail_crtc.c:175: warning: 'clock.m1' may be used uninitialized in this function
> drivers/gpu/drm/gma500/oaktrail_crtc.c: In function 'mrst_lvds_find_best_pll':
> drivers/gpu/drm/gma500/oaktrail_crtc.c:208: warning: 'clock.p' may be used uninitialized in this function
> drivers/gpu/drm/gma500/oaktrail_crtc.c:208: warning: 'clock.vco' may be used uninitialized in this function
> drivers/gpu/drm/gma500/oaktrail_crtc.c:208: warning: 'clock.p2' may be used uninitialized in this function
> drivers/gpu/drm/gma500/oaktrail_crtc.c:208: warning: 'clock.m2' may be used uninitialized in this function
> drivers/gpu/drm/gma500/oaktrail_crtc.c:208: warning: 'clock.m1' may be used uninitialized in this function
> drivers/gpu/drm/gma500/oaktrail_crtc.c:208: warning: 'clock.n' may be used uninitialized in this function
>
> This adds an initialization at declaration time to avoid the warning
> and make it well-defined on all compiler versions.
>
> Signed-off-by: Arnd Bergmann 

 Applied to drm-misc-next-fixes for 4.16, thx for your patch.
>>>
>>> Thanks!
>>>
 Aside: Still don't want commit rights? :-)
>>>
>>> I think I'm fine without. While I do tend to have a backlog on DRM
>>> patches that I'd
>>> like to get merged, they are generally of the kind that I should not
>>> apply myself
>>> without the maintainer being involved in some form, and then they can commit
>>> it themselves.
>>
>> Commit rights isn't for pushing unreviewed stuff (our scripts will
>> remind you of that if you try). But you could just volunteer someone
>> to review the entire pile and then push it, instead of nagging every
>> single slacking maintainer individually.
>
> I understand, but I could also just nag someone to review and apply
> the patches, right? Or do the committer and reviewer also need to
> be separate people?

Among author, committer and ackers/reviewers we just insist on 2
different people. So nagging works too, if you don't find that
frustrating.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch


[PATCH v6 05/99] xarray: Add definition of struct xarray

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

This is a direct replacement for struct radix_tree_root.  Some of the
struct members have changed name; convert those, and use a #define so
that radix_tree users continue to work without change.
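
Existing declarations therefore keep compiling unchanged; a minimal
sketch of what the shim means in practice:

        struct radix_tree_root tree;            /* now really a struct xarray */

        INIT_RADIX_TREE(&tree, GFP_KERNEL);     /* now expands to xa_init_flags() */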

Signed-off-by: Matthew Wilcox 
---
 include/linux/radix-tree.h   | 33 --
 include/linux/xarray.h   | 59 +
 lib/Makefile |  2 +-
 lib/idr.c|  4 +-
 lib/radix-tree.c | 75 
 lib/xarray.c | 42 ++
 tools/include/linux/spinlock.h   |  1 +
 tools/testing/radix-tree/.gitignore  |  1 +
 tools/testing/radix-tree/Makefile|  8 +++-
 tools/testing/radix-tree/linux/bug.h |  1 +
 tools/testing/radix-tree/linux/kconfig.h |  1 +
 tools/testing/radix-tree/linux/xarray.h  |  2 +
 tools/testing/radix-tree/multiorder.c|  6 +--
 tools/testing/radix-tree/test.c  |  6 +--
 14 files changed, 168 insertions(+), 73 deletions(-)
 create mode 100644 lib/xarray.c
 create mode 100644 tools/testing/radix-tree/linux/kconfig.h
 create mode 100644 tools/testing/radix-tree/linux/xarray.h

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 87f35fe00e55..c8a33e9e9a3c 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -30,6 +30,9 @@
 #include 
 #include 
 
+/* Keep unconverted code working */
+#define radix_tree_root xarray
+
 /*
  * The bottom two bits of the slot determine how the remaining bits in the
  * slot are interpreted:
@@ -59,10 +62,7 @@ static inline bool radix_tree_is_internal_node(void *ptr)
 
 #define RADIX_TREE_MAX_TAGS 3
 
-#ifndef RADIX_TREE_MAP_SHIFT
-#define RADIX_TREE_MAP_SHIFT   (CONFIG_BASE_SMALL ? 4 : 6)
-#endif
-
+#define RADIX_TREE_MAP_SHIFT   XA_CHUNK_SHIFT
 #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
 #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
 
@@ -95,36 +95,21 @@ struct radix_tree_node {
unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
-/* The IDR tag is stored in the low bits of the GFP flags */
+/* The IDR tag is stored in the low bits of xa_flags */
 #define ROOT_IS_IDR ((__force gfp_t)4)
-/* The top bits of gfp_mask are used to store the root tags */
+/* The top bits of xa_flags are used to store the root tags */
 #define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT)
 
-struct radix_tree_root {
-   spinlock_t  xa_lock;
-   gfp_t   gfp_mask;
-   struct radix_tree_node  __rcu *rnode;
-};
-
-#define RADIX_TREE_INIT(name, mask){   \
-   .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock),  \
-   .gfp_mask = (mask), \
-   .rnode = NULL,  \
-}
+#define RADIX_TREE_INIT(name, mask) XARRAY_INIT_FLAGS(name, mask)
 
 #define RADIX_TREE(name, mask) \
struct radix_tree_root name = RADIX_TREE_INIT(name, mask)
 
-#define INIT_RADIX_TREE(root, mask)\
-do {   \
-   spin_lock_init(&(root)->xa_lock);   \
-   (root)->gfp_mask = (mask);  \
-   (root)->rnode = NULL;   \
-} while (0)
+#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask)
 
 static inline bool radix_tree_empty(const struct radix_tree_root *root)
 {
-   return root->rnode == NULL;
+   return root->xa_head == NULL;
 }
 
 /**
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index c308152fde7f..3d2f1fafb7ec 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -10,6 +10,8 @@
  */
 
 #include 
+#include 
+#include 
 #include 
 #include 
 
@@ -99,6 +101,63 @@ static inline bool xa_is_internal(const void *entry)
return ((unsigned long)entry & 3) == 2;
 }
 
+/**
+ * struct xarray - The anchor of the XArray.
+ * @xa_lock: Lock that protects the contents of the XArray.
+ *
+ * To use the xarray, define it statically or embed it in your data structure.
+ * It is a very small data structure, so it does not usually make sense to
+ * allocate it separately and keep a pointer to it in your data structure.
+ *
+ * You may use the xa_lock to protect your own data structures as well.
+ */
+/*
+ * If all of the entries in the array are NULL, @xa_head is a NULL pointer.
+ * If the only non-NULL entry in the array is at index 0, @xa_head is that
+ * entry.  If any other entry in the array is non-NULL, @xa_head points
+ * to an @xa_node.
+ */
+struct xarray {
+   spinlock_t  xa_lock;
+/* private: The rest of the data structure is not to be used directly. */
+   gfp_t   xa_flags;
+   void __rcu *xa_head;
+};
+
+#define XARRAY_I

Re: [PATCH v6 20/99] ida: Convert to XArray

2018-01-17 Thread John Paul Adrian Glaubitz
Hi Matthew!

On 01/17/2018 09:20 PM, Matthew Wilcox wrote:
> Use the xarray infrstructure like we used the radix tree infrastructure.
> This lets us get rid of idr_get_free() from the radix tree code.

There's a typo: infrstructure => infrastructure

Cheers,
Adrian

-- 
 .''`.  John Paul Adrian Glaubitz
: :' :  Debian Developer - glaub...@debian.org
`. `'   Freie Universitaet Berlin - glaub...@physik.fu-berlin.de
  `-GPG: 62FF 8A75 84E0 2956 9546  0006 7426 3B37 F5B5 F913


[PATCH v6 04/99] xarray: Change definition of sibling entries

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Instead of storing a pointer to the slot containing the canonical entry,
store the offset of the slot.  Produces slightly more efficient code
(about 300 bytes smaller) and simplifies the implementation.
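
As a quick illustration (a sketch based on the helpers added below):
an internal entry packs its payload into the upper bits and tags the
low two bits as binary 10, so

        xa_mk_internal(3) == (void *)0xe;       /* (3 << 2) | 2 */
        xa_to_internal((void *)0xe) == 3;       /* round-trips */

and a sibling entry is simply the internal encoding of the offset of
the slot holding the canonical entry.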

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h | 90 ++
 lib/radix-tree.c   | 66 +++-
 2 files changed, 109 insertions(+), 47 deletions(-)

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 1aa4ff0c19b6..c308152fde7f 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -22,6 +22,12 @@
  * x1: Value entry
  *
  * Attempting to store internal entries in the XArray is a bug.
+ *
+ * Most internal entries are pointers to the next node in the tree.
+ * The following internal entries have a special meaning:
+ *
+ * 0-62: Sibling entries
+ * 256: Retry entry
  */
 
 #define BITS_PER_XA_VALUE  (BITS_PER_LONG - 1)
@@ -60,6 +66,39 @@ static inline bool xa_is_value(const void *entry)
return (unsigned long)entry & 1;
 }
 
+/*
+ * xa_mk_internal() - Create an internal entry.
+ * @v: Value to turn into an internal entry.
+ *
+ * Return: An XArray internal entry corresponding to this value.
+ */
+static inline void *xa_mk_internal(unsigned long v)
+{
+   return (void *)((v << 2) | 2);
+}
+
+/*
+ * xa_to_internal() - Extract the value from an internal entry.
+ * @entry: XArray entry.
+ *
+ * Return: The value which was stored in the internal entry.
+ */
+static inline unsigned long xa_to_internal(const void *entry)
+{
+   return (unsigned long)entry >> 2;
+}
+
+/*
+ * xa_is_internal() - Is the entry an internal entry?
+ * @entry: XArray entry.
+ *
+ * Return: %true if the entry is an internal entry.
+ */
+static inline bool xa_is_internal(const void *entry)
+{
+   return ((unsigned long)entry & 3) == 2;
+}
+
 #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock)
 #define xa_lock(xa) spin_lock(&(xa)->xa_lock)
 #define xa_unlock(xa)  spin_unlock(&(xa)->xa_lock)
@@ -72,4 +111,55 @@ static inline bool xa_is_value(const void *entry)
 #define xa_unlock_irqrestore(xa, flags) \
spin_unlock_irqrestore(&(xa)->xa_lock, flags)
 
+/* Everything below here is the Advanced API.  Proceed with caution. */
+
+/*
+ * The xarray is constructed out of a set of 'chunks' of pointers.  Choosing
+ * the best chunk size requires some tradeoffs.  A power of two recommends
+ * itself so that we can walk the tree based purely on shifts and masks.
+ * Generally, the larger the better; as the number of slots per level of the
+ * tree increases, the less tall the tree needs to be.  But that needs to be
+ * balanced against the memory consumption of each node.  On a 64-bit system,
+ * xa_node is currently 576 bytes, and we get 7 of them per 4kB page.  If we
+ * doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
+ */
+#ifndef XA_CHUNK_SHIFT
+#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
+#endif
+#define XA_CHUNK_SIZE  (1UL << XA_CHUNK_SHIFT)
+#define XA_CHUNK_MASK  (XA_CHUNK_SIZE - 1)
+
+/* Private */
+static inline bool xa_is_node(const void *entry)
+{
+   return xa_is_internal(entry) && (unsigned long)entry > 4096;
+}
+
+/* Private */
+static inline void *xa_mk_sibling(unsigned int offset)
+{
+   return xa_mk_internal(offset);
+}
+
+/* Private */
+static inline unsigned long xa_to_sibling(const void *entry)
+{
+   return xa_to_internal(entry);
+}
+
+/**
+ * xa_is_sibling() - Is the entry a sibling entry?
+ * @entry: Entry retrieved from the XArray
+ *
+ * Return: %true if the entry is a sibling entry.
+ */
+static inline bool xa_is_sibling(const void *entry)
+{
+   return IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
+   xa_is_internal(entry) &&
+   (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1));
+}
+
+#define XA_RETRY_ENTRY xa_mk_internal(256)
+
 #endif /* _LINUX_XARRAY_H */
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 012e4869f99b..f16f63d15edc 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 
 /* Number of nodes in fully populated tree of given height */
@@ -97,24 +98,7 @@ static inline void *node_to_entry(void *ptr)
return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
 }
 
-#define RADIX_TREE_RETRY   node_to_entry(NULL)
-
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
-/* Sibling slots point directly to another slot in the same node */
-static inline
-bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
-{
-   void __rcu **ptr = node;
-   return (parent->slots <= ptr) &&
-   (ptr < parent->slots + RADIX_TREE_MAP_SIZE);
-}
-#else
-static inline
-bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
-{
-   return false;
-}
-#endif
+#define RADIX_TREE_RETRY   XA_RETRY_ENTRY
 
 static 

[PATCH v6 02/99] page cache: Use xa_lock

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Remove the address_space ->tree_lock and use the xa_lock newly added to
the radix_tree_root.  Rename the address_space ->page_tree to ->pages,
since we don't really care that it's a tree.  Take the opportunity to
rearrange the elements of address_space to pack them better on 64-bit,
and make the comments more useful.
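
The resulting locking idiom is, as a sketch (with mapping->pages being
the renamed field):

        xa_lock_irq(&mapping->pages);
        /* ... modify the page cache ... */
        xa_unlock_irq(&mapping->pages);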

Signed-off-by: Matthew Wilcox 
---
 Documentation/cgroup-v1/memory.txt  |   2 +-
 Documentation/vm/page_migration |  14 +--
 arch/arm/include/asm/cacheflush.h   |   6 +-
 arch/nios2/include/asm/cacheflush.h |   6 +-
 arch/parisc/include/asm/cacheflush.h|   6 +-
 drivers/staging/lustre/lustre/llite/glimpse.c   |   2 +-
 drivers/staging/lustre/lustre/mdc/mdc_request.c |   8 +-
 fs/afs/write.c  |   9 +-
 fs/btrfs/compression.c  |   2 +-
 fs/btrfs/extent_io.c|  16 +--
 fs/btrfs/inode.c|   2 +-
 fs/buffer.c |  13 ++-
 fs/cifs/file.c  |   9 +-
 fs/dax.c| 123 
 fs/f2fs/data.c  |   6 +-
 fs/f2fs/dir.c   |   6 +-
 fs/f2fs/inline.c|   6 +-
 fs/f2fs/node.c  |   8 +-
 fs/fs-writeback.c   |  20 ++--
 fs/inode.c  |  11 +--
 fs/nilfs2/btnode.c  |  20 ++--
 fs/nilfs2/page.c|  22 ++---
 include/linux/backing-dev.h |  12 +--
 include/linux/fs.h  |  17 ++--
 include/linux/mm.h  |   2 +-
 include/linux/pagemap.h |   4 +-
 mm/filemap.c|  84 
 mm/huge_memory.c|  10 +-
 mm/khugepaged.c |  49 +-
 mm/memcontrol.c |   4 +-
 mm/migrate.c|  32 +++---
 mm/page-writeback.c |  42 
 mm/readahead.c  |   2 +-
 mm/rmap.c   |   4 +-
 mm/shmem.c  |  60 ++--
 mm/swap_state.c |  17 ++--
 mm/truncate.c   |  22 ++---
 mm/vmscan.c |  12 +--
 mm/workingset.c |  22 ++---
 39 files changed, 344 insertions(+), 368 deletions(-)

diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
index cefb63639070..f0ba3fc6f2d8 100644
--- a/Documentation/cgroup-v1/memory.txt
+++ b/Documentation/cgroup-v1/memory.txt
@@ -262,7 +262,7 @@ When oom event notifier is registered, event will be delivered.
 2.6 Locking
 
lock_page_cgroup()/unlock_page_cgroup() should not be called under
-   mapping->tree_lock.
+   the mapping's xa_lock.
 
Other lock order is following:
PG_locked.
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index 0478ae2ad44a..faf849596a85 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -90,7 +90,7 @@ Steps:
 
 1. Lock the page to be migrated
 
-2. Insure that writeback is complete.
+2. Ensure that writeback is complete.
 
 3. Lock the new page that we want to move to. It is locked so that accesses to
this (not yet uptodate) page immediately lock while the move is in progress.
@@ -100,8 +100,8 @@ Steps:
mapcount is not zero then we do not migrate the page. All user space
processes that attempt to access the page will now wait on the page lock.
 
-5. The radix tree lock is taken. This will cause all processes trying
-   to access the page via the mapping to block on the radix tree spinlock.
+5. The address space xa_lock is taken. This will cause all processes trying
+   to access the page via the mapping to block on the spinlock.
 
 6. The refcount of the page is examined and we back out if references remain
otherwise we know that we are the only one referencing this page.
@@ -114,12 +114,12 @@ Steps:
 
 9. The radix tree is changed to point to the new page.
 
-10. The reference count of the old page is dropped because the radix tree
+10. The reference count of the old page is dropped because the address space
 reference is gone. A reference to the new page is established because
-the new page is referenced to by the radix tree.
+the new page is referenced by the address space.
 
-11. The radix tree lock is dropped. With that lookups in the mapping
-become possible again. Processes will move from spinning on the tree_lock
+11. The address space xa_lock is dropped

[PATCH v6 03/99] xarray: Replace exceptional entries

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Introduce xarray value entries to replace the radix tree exceptional
entry code.  This is a slight change in encoding to allow the use of an
extra bit (we can now store BITS_PER_LONG - 1 bits in a value entry).
It is also a change in emphasis; exceptional entries are intimidating
and different.  As the comment explains, you can choose to store values
or pointers in the xarray and they are both first-class citizens.
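
A minimal sketch of the value entry round trip, using the helpers this
patch introduces:

        void *entry = xa_mk_value(42);          /* encode an integer */

        if (xa_is_value(entry))
                BUG_ON(xa_to_value(entry) != 42);       /* decodes back */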

Signed-off-by: Matthew Wilcox 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h|   4 +-
 arch/powerpc/include/asm/nohash/64/pgtable.h|   4 +-
 drivers/gpu/drm/i915/i915_gem.c |  17 ++--
 drivers/staging/lustre/lustre/mdc/mdc_request.c |   2 +-
 fs/btrfs/compression.c  |   2 +-
 fs/btrfs/inode.c|   4 +-
 fs/dax.c| 107 
 fs/proc/task_mmu.c  |   2 +-
 include/linux/fs.h  |  48 +++
 include/linux/radix-tree.h  |  36 ++--
 include/linux/swapops.h |  19 ++---
 include/linux/xarray.h  |  51 +++
 lib/idr.c   |  63 ++
 lib/radix-tree.c|  21 ++---
 mm/filemap.c|  10 +--
 mm/khugepaged.c |   2 +-
 mm/madvise.c|   2 +-
 mm/memcontrol.c |   2 +-
 mm/mincore.c|   2 +-
 mm/readahead.c  |   2 +-
 mm/shmem.c  |  10 +--
 mm/swap.c   |   2 +-
 mm/truncate.c   |  12 +--
 mm/workingset.c |  12 ++-
 tools/testing/radix-tree/idr-test.c |   6 +-
 tools/testing/radix-tree/linux/radix-tree.h |   1 +
 tools/testing/radix-tree/multiorder.c   |  47 +--
 tools/testing/radix-tree/test.c |   2 +-
 28 files changed, 256 insertions(+), 236 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 44697817ccc6..5025c26f1acd 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -649,9 +649,7 @@ static inline bool pte_user(pte_t pte)
BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY);   \
} while (0)
-/*
- * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
- */
+
 #define SWP_TYPE_BITS 5
 #define __swp_type(x)  (((x).val >> _PAGE_BIT_SWAP_TYPE) \
& ((1UL << SWP_TYPE_BITS) - 1))
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index abddf5830ad5..f711773568d7 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -329,9 +329,7 @@ static inline void __ptep_set_access_flags(struct mm_struct *mm,
 */ \
BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
} while (0)
-/*
- * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
- */
+
 #define SWP_TYPE_BITS 5
 #define __swp_type(x)  (((x).val >> _PAGE_BIT_SWAP_TYPE) \
& ((1UL << SWP_TYPE_BITS) - 1))
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 5cfba89ed586..25ce7bcf9988 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5369,7 +5369,8 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
count = __sg_page_count(sg);
 
while (idx + count <= n) {
-   unsigned long exception, i;
+   void *entry;
+   unsigned long i;
int ret;
 
/* If we cannot allocate and insert this entry, or the
@@ -5384,12 +5385,9 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
if (ret && ret != -EEXIST)
goto scan;
 
-   exception =
-   RADIX_TREE_EXCEPTIONAL_ENTRY |
-   idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
+   entry = xa_mk_value(idx);
for (i = 1; i < count; i++) {
-   ret = radix_tree_insert(&iter->radix, idx + i,
-   (void *)exception);
+   ret = radix_tree_insert(&iter->radix, idx + i, entry);
if (ret && ret != -EEXIST)
goto scan;
}
@@ -5427,15 +5425,14 @@ i915_gem_object_get_sg(struct drm_i915_gem_objec

[PATCH v6 07/99] xarray: Add documentation

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

This is documentation on how to use the XArray, not details about its
internal implementation.

Signed-off-by: Matthew Wilcox 
---
 Documentation/core-api/index.rst  |   1 +
 Documentation/core-api/xarray.rst | 361 ++
 2 files changed, 362 insertions(+)
 create mode 100644 Documentation/core-api/xarray.rst

diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index d5bbe035316d..eb16ba30aeb6 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -18,6 +18,7 @@ Core utilities
local_ops
workqueue
genericirq
+   xarray
flexible-arrays
librs
genalloc
diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
new file mode 100644
index ..914999c0bf3f
--- /dev/null
+++ b/Documentation/core-api/xarray.rst
@@ -0,0 +1,361 @@
+.. SPDX-License-Identifier: CC-BY-SA-4.0
+
+==
+XArray
+==
+
+:Author: Matthew Wilcox
+
+Overview
+
+
+The XArray is an abstract data type which behaves like a very large array
+of pointers.  It meets many of the same needs as a hash or a conventional
+resizable array.  Unlike a hash, it allows you to sensibly go to the
+next or previous entry in a cache-efficient manner.  In contrast to
+a resizable array, there is no need for copying data or changing MMU
+mappings in order to grow the array.  It is more memory-efficient,
+parallelisable and cache friendly than a doubly-linked list.  It takes
+advantage of RCU to perform lookups without locking.
+
+The XArray implementation is efficient when the indices used are densely
+clustered; hashing the object and using the hash as the index will not
+perform well.  The XArray is optimised for small indices, but still has
+good performance with large indices.  If your index can be larger than
+``ULONG_MAX`` then the XArray is not the data type for you.  The most
+important user of the XArray is the page cache.
+
+A freshly-initialised XArray contains a ``NULL`` pointer at every index.
+Each non-``NULL`` entry in the array has three bits associated with it
+called tags.  Each tag may be set or cleared independently of the others.
+You can iterate over entries which are tagged.
+
+Normal pointers may be stored in the XArray directly.  They must be 4-byte
+aligned, which is true for any pointer returned from :c:func:`kmalloc` and
+:c:func:`alloc_page`.  It isn't true for arbitrary user-space pointers,
+nor for function pointers.  You can store pointers to statically allocated
+objects, as long as those objects have an alignment of at least 4.
+
+You can also store integers between 0 and ``LONG_MAX`` in the XArray.
+You must first convert them into entries using :c:func:`xa_mk_value`.
+When you retrieve an entry from the XArray, you can check whether it is
+a value entry by calling :c:func:`xa_is_value`, and convert it back to
+an integer by calling :c:func:`xa_to_value`.
+
+The XArray does not support storing :c:func:`IS_ERR` pointers as some
+conflict with value entries or internal entries.
+
+An unusual feature of the XArray is the ability to create entries which
+occupy a range of indices.  Once stored to, looking up any index in
+the range will return the same entry as looking up any other index in
+the range.  Setting a tag on one index will set it on all of them.
+Storing to any index will store to all of them.  Multi-index entries can
+be explicitly split into smaller entries, or storing ``NULL`` into any
+entry will cause the XArray to forget about the range.
+
+Normal API
+==
+
+Start by initialising an XArray, either with :c:func:`DEFINE_XARRAY`
+for statically allocated XArrays or :c:func:`xa_init` for dynamically
+allocated ones.
+
+You can then set entries using :c:func:`xa_store` and get entries
+using :c:func:`xa_load`.  xa_store will overwrite any entry with the
+new entry and return the previous entry stored at that index.  You can
+use :c:func:`xa_erase` instead of calling :c:func:`xa_store` with a
+``NULL`` entry.  There is no difference between an entry that has never
+been stored to and one that has most recently had ``NULL`` stored to it.
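+
+A minimal sketch of those calls, on a statically defined array::
+
+    DEFINE_XARRAY(array);
+
+    void *old = xa_store(&array, 7, ptr, GFP_KERNEL);
+    void *cur = xa_load(&array, 7);     /* returns ptr */
+    xa_erase(&array, 7);                /* index 7 is NULL again */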
+
+You can conditionally replace an entry at an index by using
+:c:func:`xa_cmpxchg`.  Like :c:func:`cmpxchg`, it will only succeed if
+the entry at that index has the 'old' value.  It also returns the entry
+which was at that index; if it returns the same entry which was passed as
+'old', then :c:func:`xa_cmpxchg` succeeded.
+
+If you want to only store a new entry to an index if the current entry
+at that index is ``NULL``, you can use :c:func:`xa_insert` which
+returns ``-EEXIST`` if the entry is not empty.
+
+Calling :c:func:`xa_reserve` ensures that there is enough memory allocated
+to store an entry at the specified index.  This is not normally needed,
+but some users have a complicated locking scheme.
+
+You can enquire whether a tag is set on an entry by using
+:c:func:`xa_get_tag`.  If the ent

Re: [PATCH] MIPS: ftrace: Remove pointer comparison to 0 in prepare_ftrace_return

2018-01-17 Thread James Hogan
On Wed, Jan 17, 2018 at 12:31:57PM +0100, Mathieu Malaterre wrote:
> Replace pointer comparison to 0 with NULL in prepare_ftrace_return
> to improve code readability. Identified with coccinelle script
> 'badzero.cocci'.
> 
> Signed-off-by: Mathieu Malaterre 

Reviewed-by: James Hogan 

Cheers
James

> ---
>  arch/mips/kernel/ftrace.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c
> index 99285be0e088..7f3dfdbc3657 100644
> --- a/arch/mips/kernel/ftrace.c
> +++ b/arch/mips/kernel/ftrace.c
> @@ -361,7 +361,7 @@ void prepare_ftrace_return(unsigned long *parent_ra_addr, unsigned long self_ra,
>* If fails when getting the stack address of the non-leaf function's
>* ra, stop function graph tracer and return
>*/
> - if (parent_ra_addr == 0)
> + if (parent_ra_addr == NULL)
>   goto out;
>  #endif
>   /* *parent_ra_addr = return_hooker; */
> -- 
> 2.11.0
> 
> 


signature.asc
Description: Digital signature


[PATCH v6 06/99] xarray: Define struct xa_node

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

This is a direct replacement for struct radix_tree_node.  A couple of
struct members have changed name, so convert those.  Use a #define so
that radix tree users continue to work without change.

Signed-off-by: Matthew Wilcox 
---
 include/linux/radix-tree.h| 29 +++--
 include/linux/xarray.h| 24 ++
 lib/radix-tree.c  | 48 +--
 mm/workingset.c   | 16 ++--
 tools/testing/radix-tree/multiorder.c | 30 +++---
 5 files changed, 74 insertions(+), 73 deletions(-)

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index c8a33e9e9a3c..f64beb9ba175 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -32,6 +32,7 @@
 
 /* Keep unconverted code working */
 #define radix_tree_root xarray
+#define radix_tree_node xa_node
 
 /*
  * The bottom two bits of the slot determine how the remaining bits in the
@@ -60,41 +61,17 @@ static inline bool radix_tree_is_internal_node(void *ptr)
 
 /*** radix-tree API starts here ***/
 
-#define RADIX_TREE_MAX_TAGS 3
-
 #define RADIX_TREE_MAP_SHIFT   XA_CHUNK_SHIFT
 #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
 #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
 
-#define RADIX_TREE_TAG_LONGS   \
-   ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+#define RADIX_TREE_MAX_TAGSXA_MAX_TAGS
+#define RADIX_TREE_TAG_LONGS   XA_TAG_LONGS
 
 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
 #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
  RADIX_TREE_MAP_SHIFT))
 
-/*
- * @count is the count of every non-NULL element in the ->slots array
- * whether that is a data entry, a retry entry, a user pointer,
- * a sibling entry or a pointer to the next level of the tree.
- * @exceptional is the count of every element in ->slots which is
- * either a data entry or a sibling entry for data.
- */
-struct radix_tree_node {
-   unsigned char   shift;  /* Bits remaining in each slot */
-   unsigned char   offset; /* Slot offset in parent */
-   unsigned char   count;  /* Total entry count */
-   unsigned char   exceptional;/* Exceptional entry count */
-   struct radix_tree_node *parent; /* Used when ascending tree */
-   struct radix_tree_root *root;   /* The tree we belong to */
-   union {
-   struct list_head private_list;  /* For tree user */
-   struct rcu_head rcu_head;   /* Used when freeing node */
-   };
-   void __rcu  *slots[RADIX_TREE_MAP_SIZE];
-   unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
-};
-
 /* The IDR tag is stored in the low bits of xa_flags */
 #define ROOT_IS_IDR ((__force gfp_t)4)
 /* The top bits of xa_flags are used to store the root tags */
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 3d2f1fafb7ec..3d5f7804ef45 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -187,6 +187,30 @@ static inline void xa_init(struct xarray *xa)
 #endif
 #define XA_CHUNK_SIZE  (1UL << XA_CHUNK_SHIFT)
 #define XA_CHUNK_MASK  (XA_CHUNK_SIZE - 1)
+#define XA_MAX_TAGS3
+#define XA_TAG_LONGS   DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)
+
+/*
+ * @count is the count of every non-NULL element in the ->slots array
+ * whether that is a value entry, a retry entry, a user pointer,
+ * a sibling entry or a pointer to the next level of the tree.
+ * @nr_values is the count of every element in ->slots which is
+ * either a value entry or a sibling entry to a value entry.
+ */
+struct xa_node {
+   unsigned char   shift;  /* Bits remaining in each slot */
+   unsigned char   offset; /* Slot offset in parent */
+   unsigned char   count;  /* Total entry count */
+   unsigned char   nr_values;  /* Value entry count */
+   struct xa_node __rcu *parent;   /* NULL at top of tree */
+   struct xarray   *array; /* The array we belong to */
+   union {
+   struct list_head private_list;  /* For tree user */
+   struct rcu_head rcu_head;   /* Used when freeing node */
+   };
+   void __rcu  *slots[XA_CHUNK_SIZE];
+   unsigned long   tags[XA_MAX_TAGS][XA_TAG_LONGS];
+};
 
 /* Private */
 static inline bool xa_is_node(const void *entry)
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 126eeb06cfef..74a6ddd1d6ad 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -259,11 +259,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
 {
unsigned long i;
 
-   pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx 
%lx %lx shift %d count %d exceptional %d\n",
+   pr_debug("radix node

[PATCH v6 10/99] xarray: Add xa_store

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

xa_store() differs from radix_tree_insert() in that it will overwrite an
existing element in the array rather than returning an error.  This is
the behaviour which most users want, and those that want more complex
behaviour generally want to use the xas family of routines anyway.

For memory allocation, xa_store() will first attempt to request memory
from the slab allocator; if memory is not immediately available, it will
drop the xa_lock and allocate memory, keeping a pointer in the xa_state.
It does not use the per-CPU cache, although those will continue to exist
until all radix tree users are converted to the xarray.

This patch also includes xa_erase() and __xa_erase() for a streamlined
way to store NULL.  Since there is no need to allocate memory in order
to store a NULL in the XArray, we do not need to trouble the user with
deciding what memory allocation flags to use.
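
As a sketch of the resulting calling convention (assuming the caller
may pass GFP_KERNEL):

        void *old = xa_store(&array, index, item, GFP_KERNEL);

        if (xa_is_err(old))
                return xa_err(old);     /* eg -ENOMEM */
        /* any previous entry at index has been replaced by item */

        xa_erase(&array, index);        /* stores NULL, never allocates */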

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h| 107 +
 lib/radix-tree.c  |   4 +-
 lib/xarray.c  | 642 ++
 tools/include/linux/spinlock.h|   2 +
 tools/testing/radix-tree/linux/kernel.h   |   4 +
 tools/testing/radix-tree/linux/lockdep.h  |  11 +
 tools/testing/radix-tree/linux/rcupdate.h |   1 +
 tools/testing/radix-tree/test.c   |  32 ++
 tools/testing/radix-tree/test.h   |   5 +
 tools/testing/radix-tree/xarray-test.c| 113 +-
 10 files changed, 917 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/radix-tree/linux/lockdep.h

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index ddeb49b8bfc1..139b1c1fd022 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -149,10 +149,17 @@ typedef unsigned __bitwise xa_tag_t;
 #define XA_PRESENT ((__force xa_tag_t)8U)
 #define XA_TAG_MAX XA_TAG_2
 
+enum xa_lock_type {
+   XA_LOCK_IRQ = 1,
+   XA_LOCK_BH = 2,
+};
+
 /*
  * Values for xa_flags.  The radix tree stores its GFP flags in the xa_flags,
  * and we remain compatible with that.
  */
+#define XA_FLAGS_LOCK_IRQ  ((__force gfp_t)XA_LOCK_IRQ)
+#define XA_FLAGS_LOCK_BH   ((__force gfp_t)XA_LOCK_BH)
 #define XA_FLAGS_TAG(tag)  ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
(__force unsigned)(tag)))
 
@@ -202,6 +209,7 @@ struct xarray {
 
 void xa_init_flags(struct xarray *, gfp_t flags);
 void *xa_load(struct xarray *, unsigned long index);
+void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
 bool xa_get_tag(struct xarray *, unsigned long index, xa_tag_t);
 void xa_set_tag(struct xarray *, unsigned long index, xa_tag_t);
 void xa_clear_tag(struct xarray *, unsigned long index, xa_tag_t);
@@ -217,6 +225,33 @@ static inline void xa_init(struct xarray *xa)
xa_init_flags(xa, 0);
 }
 
+/**
+ * xa_erase() - Erase this entry from the XArray.
+ * @xa: XArray.
+ * @index: Index of entry.
+ *
+ * This function is the equivalent of calling xa_store() with %NULL as
+ * the third argument.  The XArray does not need to allocate memory, so
+ * the user does not need to provide GFP flags.
+ *
+ * Return: The entry which used to be at this index.
+ */
+static inline void *xa_erase(struct xarray *xa, unsigned long index)
+{
+   return xa_store(xa, index, NULL, 0);
+}
+
+/**
+ * xa_empty() - Determine if an array has any present entries.
+ * @xa: XArray.
+ *
+ * Return: %true if the array contains only NULL pointers.
+ */
+static inline bool xa_empty(const struct xarray *xa)
+{
+   return xa->xa_head == NULL;
+}
+
 /**
  * xa_tagged() - Inquire whether any entry in this array has a tag set
  * @xa: Array
@@ -243,7 +278,11 @@ static inline bool xa_tagged(const struct xarray *xa, xa_tag_t tag)
 
 /*
  * Versions of the normal API which require the caller to hold the xa_lock.
+ * If the GFP flags allow it, will drop the lock in order to allocate
+ * memory, then reacquire it afterwards.
  */
+void *__xa_erase(struct xarray *, unsigned long index);
+void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
 void __xa_set_tag(struct xarray *, unsigned long index, xa_tag_t);
 void __xa_clear_tag(struct xarray *, unsigned long index, xa_tag_t);
 
@@ -339,6 +378,12 @@ static inline void *xa_entry_locked(struct xarray *xa,
lockdep_is_held(&xa->xa_lock));
 }
 
+/* Private */
+static inline void *xa_mk_node(const struct xa_node *node)
+{
+   return (void *)((unsigned long)node | 2);
+}
+
 /* Private */
 static inline struct xa_node *xa_to_node(const void *entry)
 {
@@ -519,6 +564,12 @@ static inline bool xas_valid(const struct xa_state *xas)
return !xas_invalid(xas);
 }
 
+/* True if the node represents head-of-tree, RESTART or BOUNDS */
+static inline bool xas_top(struct xa_node *node)
+{
+   return node <= XAS_RESTART;
+}
+
 /**
  * xas_retry() - Ha

[PATCH v6 09/99] xarray: Add xa_get_tag, xa_set_tag and xa_clear_tag

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

XArray tags are slightly more strongly typed than the radix tree tags,
but occupy the same bits.  This commit also adds the xas_ family of tag
operations, for cases where the caller is already holding the lock, and
xa_tagged() to ask whether any array member has a particular tag set.
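
A short sketch of the normal API (the xas_ variants take the same tag
argument and are for callers already holding the xa_lock):

        xa_set_tag(&array, index, XA_TAG_0);

        if (xa_get_tag(&array, index, XA_TAG_0))
                /* the entry at index is tagged */;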

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h |  40 +++
 lib/xarray.c   | 229 +
 tools/include/linux/spinlock.h |   6 ++
 3 files changed, 275 insertions(+)

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 54c694e5c33f..ddeb49b8bfc1 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -11,6 +11,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -141,6 +142,20 @@ static inline int xa_err(void *entry)
return 0;
 }
 
+typedef unsigned __bitwise xa_tag_t;
+#define XA_TAG_0   ((__force xa_tag_t)0U)
+#define XA_TAG_1   ((__force xa_tag_t)1U)
+#define XA_TAG_2   ((__force xa_tag_t)2U)
+#define XA_PRESENT ((__force xa_tag_t)8U)
+#define XA_TAG_MAX XA_TAG_2
+
+/*
+ * Values for xa_flags.  The radix tree stores its GFP flags in the xa_flags,
+ * and we remain compatible with that.
+ */
+#define XA_FLAGS_TAG(tag)  ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
+   (__force unsigned)(tag)))
+
 /**
  * struct xarray - The anchor of the XArray.
  * @xa_lock: Lock that protects the contents of the XArray.
@@ -187,6 +202,9 @@ struct xarray {
 
 void xa_init_flags(struct xarray *, gfp_t flags);
 void *xa_load(struct xarray *, unsigned long index);
+bool xa_get_tag(struct xarray *, unsigned long index, xa_tag_t);
+void xa_set_tag(struct xarray *, unsigned long index, xa_tag_t);
+void xa_clear_tag(struct xarray *, unsigned long index, xa_tag_t);
 
 /**
  * xa_init() - Initialise an empty XArray.
@@ -199,6 +217,18 @@ static inline void xa_init(struct xarray *xa)
xa_init_flags(xa, 0);
 }
 
+/**
+ * xa_tagged() - Inquire whether any entry in this array has a tag set
+ * @xa: Array
+ * @tag: Tag value
+ *
+ * Return: %true if any entry has this tag set.
+ */
+static inline bool xa_tagged(const struct xarray *xa, xa_tag_t tag)
+{
+   return xa->xa_flags & XA_FLAGS_TAG(tag);
+}
+
 #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock)
 #define xa_lock(xa) spin_lock(&(xa)->xa_lock)
 #define xa_unlock(xa)  spin_unlock(&(xa)->xa_lock)
@@ -211,6 +241,12 @@ static inline void xa_init(struct xarray *xa)
 #define xa_unlock_irqrestore(xa, flags) \
spin_unlock_irqrestore(&(xa)->xa_lock, flags)
 
+/*
+ * Versions of the normal API which require the caller to hold the xa_lock.
+ */
+void __xa_set_tag(struct xarray *, unsigned long index, xa_tag_t);
+void __xa_clear_tag(struct xarray *, unsigned long index, xa_tag_t);
+
 /* Everything below here is the Advanced API.  Proceed with caution. */
 
 /*
@@ -504,6 +540,10 @@ static inline bool xas_retry(struct xa_state *xas, const void *entry)
 
 void *xas_load(struct xa_state *);
 
+bool xas_get_tag(const struct xa_state *, xa_tag_t);
+void xas_set_tag(const struct xa_state *, xa_tag_t);
+void xas_clear_tag(const struct xa_state *, xa_tag_t);
+
 /**
  * xas_reload() - Refetch an entry from the xarray.
  * @xas: XArray operation state.
diff --git a/lib/xarray.c b/lib/xarray.c
index 83b9c25de415..59b57e6f80de 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -5,6 +5,7 @@
  * Author: Matthew Wilcox 
  */
 
+#include 
 #include 
 #include 
 
@@ -24,6 +25,55 @@
  * @entry refers to something stored in a slot in the xarray
  */
 
+static inline struct xa_node *xa_parent(struct xarray *xa,
+   const struct xa_node *node)
+{
+   return rcu_dereference_check(node->parent,
+   lockdep_is_held(&xa->xa_lock));
+}
+
+static inline struct xa_node *xa_parent_locked(struct xarray *xa,
+   const struct xa_node *node)
+{
+   return rcu_dereference_protected(node->parent,
+   lockdep_is_held(&xa->xa_lock));
+}
+
+static inline void xa_tag_set(struct xarray *xa, xa_tag_t tag)
+{
+   if (!(xa->xa_flags & XA_FLAGS_TAG(tag)))
+   xa->xa_flags |= XA_FLAGS_TAG(tag);
+}
+
+static inline void xa_tag_clear(struct xarray *xa, xa_tag_t tag)
+{
+   if (xa->xa_flags & XA_FLAGS_TAG(tag))
+   xa->xa_flags &= ~(XA_FLAGS_TAG(tag));
+}
+
+static inline bool node_get_tag(const struct xa_node *node, unsigned int offset,
+   xa_tag_t tag)
+{
+   return test_bit(offset, node->tags[(__force unsigned)tag]);
+}
+
+static inline void node_set_tag(struct xa_node *node, unsigned int offset,
+   xa_tag_t tag)
+{
+   __set_bit(offset, node->tags[(__force unsigned)tag

[PATCH] led: core: Fix race on software blink cancellation

2018-01-17 Thread Jacek Anaszewski
Commit d23a22a74fde ("leds: delay led_set_brightness if stopping soft-blink")
modified the LED core to allow led_set_brightness() to be called from
hard-irq context while soft blink is being handled in soft-irq context.

Since then the LED core has undergone modifications related to the addition
of generic support for delegating brightness setting to a workqueue, as well
as subsequent fixes for blink setting use cases.

After that, the LED core code became hard to maintain and analyze, especially
due to the imposed hard-irq context compatibility. It also turned out that in
some cases an LED remained off after executing the following sequence of commands:

1. echo timer > trigger
2. echo 0 > brightness
3. echo 100 > brightness

The reason was the LED_BLINK_DISABLE operation delegated to
set_brightness_work: it was triggered in step 2 but handled only after
step 3 had taken effect.

In order to serialize the above operations and at the same time avoid
overcomplicating the code, the hard-irq context compatibility is removed,
which allows spin_lock_bh() to be used to serialize LED blink setting.

From now on, users in hard-irq context need to delegate led_set_brightness()
to a workqueue at the place of use.
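
As a sketch, such a delegation in a hypothetical driver (all names
below are illustrative, not part of this patch):

        static void my_led_work_fn(struct work_struct *work)
        {
                led_set_brightness(my_led_cdev, LED_FULL);
        }
        static DECLARE_WORK(my_led_work, my_led_work_fn);

        static irqreturn_t my_irq_handler(int irq, void *dev_id)
        {
                schedule_work(&my_led_work);    /* defer out of hard-irq */
                return IRQ_HANDLED;
        }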

Reported-by: Craig McQueen 
Signed-off-by: Jacek Anaszewski 
---
Craig,

It would be great if you could confirm if this fixes your use case.

 drivers/leds/led-core.c  | 79 +++-
 drivers/leds/trigger/ledtrig-activity.c  |  3 --
 drivers/leds/trigger/ledtrig-heartbeat.c |  3 --
 include/linux/leds.h |  5 +-
 4 files changed, 49 insertions(+), 41 deletions(-)

diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index ede4fa0..25d711b 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -22,6 +22,9 @@
 DECLARE_RWSEM(leds_list_lock);
 EXPORT_SYMBOL_GPL(leds_list_lock);
 
+DEFINE_SPINLOCK(leds_soft_blink_lock);
+EXPORT_SYMBOL_GPL(leds_soft_blink_lock);
+
 LIST_HEAD(leds_list);
 EXPORT_SYMBOL_GPL(leds_list);
 
@@ -51,26 +54,31 @@ static void led_timer_function(struct timer_list *t)
unsigned long brightness;
unsigned long delay;
 
+   spin_lock_bh(&leds_soft_blink_lock);
+
+   /*
+* Check if soft blinking wasn't disabled via led_set_brightness()
+* in the meantime.
+*/
+   if (!test_bit(LED_BLINK_SW, &led_cdev->work_flags))
+   goto unlock;
+
if (!led_cdev->blink_delay_on || !led_cdev->blink_delay_off) {
led_set_brightness_nosleep(led_cdev, LED_OFF);
clear_bit(LED_BLINK_SW, &led_cdev->work_flags);
-   return;
+   goto unlock;
}
 
if (test_and_clear_bit(LED_BLINK_ONESHOT_STOP,
   &led_cdev->work_flags)) {
clear_bit(LED_BLINK_SW, &led_cdev->work_flags);
-   return;
+   goto unlock;
}
 
brightness = led_get_brightness(led_cdev);
if (!brightness) {
/* Time to switch the LED on. */
-   if (test_and_clear_bit(LED_BLINK_BRIGHTNESS_CHANGE,
-   &led_cdev->work_flags))
-   brightness = led_cdev->new_blink_brightness;
-   else
-   brightness = led_cdev->blink_brightness;
+   brightness = led_cdev->blink_brightness;
delay = led_cdev->blink_delay_on;
} else {
/* Store the current brightness value to be able
@@ -100,6 +108,9 @@ static void led_timer_function(struct timer_list *t)
}
 
mod_timer(&led_cdev->blink_timer, jiffies + msecs_to_jiffies(delay));
+
+unlock:
+   spin_unlock_bh(&leds_soft_blink_lock);
 }
 
 static void set_brightness_delayed(struct work_struct *ws)
@@ -108,11 +119,6 @@ static void set_brightness_delayed(struct work_struct *ws)
container_of(ws, struct led_classdev, set_brightness_work);
int ret = 0;
 
-   if (test_and_clear_bit(LED_BLINK_DISABLE, &led_cdev->work_flags)) {
-   led_cdev->delayed_set_value = LED_OFF;
-   led_stop_software_blink(led_cdev);
-   }
-
ret = __led_set_brightness(led_cdev, led_cdev->delayed_set_value);
if (ret == -ENOTSUPP)
ret = __led_set_brightness_blocking(led_cdev,
@@ -131,6 +137,8 @@ static void led_set_software_blink(struct led_classdev *led_cdev,
 {
int current_brightness;
 
+   spin_lock_bh(&leds_soft_blink_lock);
+
current_brightness = led_get_brightness(led_cdev);
if (current_brightness)
led_cdev->blink_brightness = current_brightness;
@@ -143,18 +151,21 @@ static void led_set_software_blink(struct led_classdev *led_cdev,
/* never on - just set to off */
if (!delay_on) {
led_set_brightness_nosleep(led_cdev, LED_OFF);
-   return;
+   goto unlock;
}
 
/* never off - just set to brightnes

Re: [PATCH] ARM: make memzero optimization smarter

2018-01-17 Thread Nicolas Pitre
On Wed, 17 Jan 2018, Nicolas Pitre wrote:

> On Wed, 17 Jan 2018, Russell King - ARM Linux wrote:
> 
> > However, __memzero is not safe against being called with a zero length
> > so it's not something we can simply remove.
> 
> The idea is about the possibility of removing __memzero altogether.
> It is not clear that the tiny performance gain from a dedicated memzero 
> implementation is worth the current overhead around it.

This being said, I fail to see how __memzero is not safe against a zero 
length. Are you sure it isn't?


Nicolas


[PATCH v6 08/99] xarray: Add xa_load

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

This first function in the XArray API brings with it a lot of support
infrastructure.  The advanced API is based around the xa_state which is
a more capable version of the radix_tree_iter.

As the test-suite demonstrates, it is possible to use the xarray and
radix tree APIs on the same data structure.
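
A sketch of an advanced API lookup, using names from this series (the
caller provides the RCU protection):

        XA_STATE(xas, &array, index);
        void *entry;

        rcu_read_lock();
        entry = xas_load(&xas);
        rcu_read_unlock();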

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h  | 282 
 lib/radix-tree.c|  43 -
 lib/xarray.c| 190 +++
 tools/testing/radix-tree/.gitignore |   1 +
 tools/testing/radix-tree/Makefile   |   7 +-
 tools/testing/radix-tree/linux/kernel.h |   1 +
 tools/testing/radix-tree/linux/radix-tree.h |   1 -
 tools/testing/radix-tree/linux/rcupdate.h   |   1 +
 tools/testing/radix-tree/linux/xarray.h |   1 +
 tools/testing/radix-tree/xarray-test.c  |  56 ++
 10 files changed, 537 insertions(+), 46 deletions(-)
 create mode 100644 tools/testing/radix-tree/xarray-test.c

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 3d5f7804ef45..54c694e5c33f 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -12,6 +12,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 
@@ -30,6 +32,10 @@
  *
  * 0-62: Sibling entries
  * 256: Retry entry
+ *
+ * Errors are also represented as internal entries, but use the negative
+ * space (-4094 to -2).  They're never stored in the slots array; only
+ * returned by the normal API.
  */
 
 #define BITS_PER_XA_VALUE  (BITS_PER_LONG - 1)
@@ -101,6 +107,40 @@ static inline bool xa_is_internal(const void *entry)
return ((unsigned long)entry & 3) == 2;
 }
 
+/**
+ * xa_is_err() - Report whether an XArray operation returned an error
+ * @entry: Result from calling an XArray function
+ *
+ * If an XArray operation cannot complete an operation, it will return
+ * a special value indicating an error.  This function tells you
+ * whether an error occurred; xa_err() tells you which error occurred.
+ *
+ * Return: %true if the entry indicates an error.
+ */
+static inline bool xa_is_err(const void *entry)
+{
+   return unlikely(xa_is_internal(entry));
+}
+
+/**
+ * xa_err() - Turn an XArray result into an errno.
+ * @entry: Result from calling an XArray function.
+ *
+ * If an XArray operation cannot complete an operation, it will return
+ * a special pointer value which encodes an errno.  This function extracts
+ * the errno from the pointer value, or returns 0 if the pointer does not
+ * represent an errno.
+ *
+ * Return: A negative errno or 0.
+ */
+static inline int xa_err(void *entry)
+{
+   /* xa_to_internal() would not do sign extension. */
+   if (xa_is_err(entry))
+   return (long)entry >> 2;
+   return 0;
+}
+
 /**
  * struct xarray - The anchor of the XArray.
  * @xa_lock: Lock that protects the contents of the XArray.
@@ -146,6 +186,7 @@ struct xarray {
struct xarray name = XARRAY_INIT_FLAGS(name, flags)
 
 void xa_init_flags(struct xarray *, gfp_t flags);
+void *xa_load(struct xarray *, unsigned long index);
 
 /**
  * xa_init() - Initialise an empty XArray.
@@ -212,6 +253,62 @@ struct xa_node {
unsigned long   tags[XA_MAX_TAGS][XA_TAG_LONGS];
 };
 
+#ifdef XA_DEBUG
+void xa_dump(const struct xarray *);
+void xa_dump_node(const struct xa_node *);
+#define XA_BUG_ON(xa, x) do { \
+   if (x) \
+   xa_dump(xa); \
+   BUG_ON(x); \
+   } while (0)
+#define XA_NODE_BUG_ON(node, x) do { \
+   if ((x) && (node)) \
+   xa_dump_node(node); \
+   BUG_ON(x); \
+   } while (0)
+#else
+#define XA_BUG_ON(xa, x)   do { } while (0)
+#define XA_NODE_BUG_ON(node, x)do { } while (0)
+#endif
+
+/* Private */
+static inline void *xa_head(struct xarray *xa)
+{
+   return rcu_dereference_check(xa->xa_head,
+   lockdep_is_held(&xa->xa_lock));
+}
+
+/* Private */
+static inline void *xa_head_locked(struct xarray *xa)
+{
+   return rcu_dereference_protected(xa->xa_head,
+   lockdep_is_held(&xa->xa_lock));
+}
+
+/* Private */
+static inline void *xa_entry(struct xarray *xa,
+   const struct xa_node *node, unsigned int offset)
+{
+   XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
+   return rcu_dereference_check(node->slots[offset],
+   lockdep_is_held(&xa->xa_lock));
+}
+
+/* Private */
+static inline void *xa_entry_locked(struct xarray *xa,
+   const struct xa_node *node, unsigned int offset)
+{
+   XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
+   return rcu_dereference_protected(node->slots[offset],
+   lockdep_is_held(&xa->xa_lock));
+}
+

[PATCH v6 11/99] xarray: Add xa_cmpxchg and xa_insert

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Like cmpxchg(), xa_cmpxchg will only store to the index if the current
entry matches the old entry.  It returns the current entry, which is
usually more useful than the errno returned by radix_tree_insert().
For the users who really only want the errno, the xa_insert() wrapper
provides a more convenient calling convention.
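
A sketch of the two calling conventions:

        /* If you want the existing entry back: */
        void *curr = xa_cmpxchg(&array, index, NULL, item, GFP_KERNEL);
        if (curr == NULL)
                /* the slot was empty and now holds item */;

        /* If you only want an errno: */
        int err = xa_insert(&array, index, item, GFP_KERNEL);
        if (err == -EEXIST)
                /* another entry was already present */;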

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h | 56 
 lib/xarray.c   | 68 ++
 tools/testing/radix-tree/xarray-test.c | 10 +
 3 files changed, 134 insertions(+)

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 139b1c1fd022..fc9ab3b13e60 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -210,6 +210,8 @@ struct xarray {
 void xa_init_flags(struct xarray *, gfp_t flags);
 void *xa_load(struct xarray *, unsigned long index);
 void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
+void *xa_cmpxchg(struct xarray *, unsigned long index,
+   void *old, void *entry, gfp_t);
 bool xa_get_tag(struct xarray *, unsigned long index, xa_tag_t);
 void xa_set_tag(struct xarray *, unsigned long index, xa_tag_t);
 void xa_clear_tag(struct xarray *, unsigned long index, xa_tag_t);
@@ -264,6 +266,32 @@ static inline bool xa_tagged(const struct xarray *xa, xa_tag_t tag)
return xa->xa_flags & XA_FLAGS_TAG(tag);
 }
 
+/**
+ * xa_insert() - Store this entry in the XArray unless another entry is
+ * already present.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * If you would rather see the existing entry in the array, use xa_cmpxchg().
+ * This function is for users who don't care what the entry is, only that
+ * one is present.
+ *
+ * Return: -EEXIST if another entry was present, 0 if the store succeeded,
+ * or another negative errno if a different error happened (eg -ENOMEM).
+ */
+static inline int xa_insert(struct xarray *xa, unsigned long index,
+   void *entry, gfp_t gfp)
+{
+   void *curr = xa_cmpxchg(xa, index, NULL, entry, gfp);
+   if (!curr)
+   return 0;
+   if (xa_is_err(curr))
+   return xa_err(curr);
+   return -EEXIST;
+}
+
 #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock)
 #define xa_lock(xa) spin_lock(&(xa)->xa_lock)
 #define xa_unlock(xa)  spin_unlock(&(xa)->xa_lock)
@@ -283,9 +311,37 @@ static inline bool xa_tagged(const struct xarray *xa, xa_tag_t tag)
  */
 void *__xa_erase(struct xarray *, unsigned long index);
 void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
+void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
+   void *entry, gfp_t);
 void __xa_set_tag(struct xarray *, unsigned long index, xa_tag_t);
 void __xa_clear_tag(struct xarray *, unsigned long index, xa_tag_t);
 
+/**
+ * __xa_insert() - Store this entry in the XArray unless another entry is
+ * already present.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * If you would rather see the existing entry in the array, use __xa_cmpxchg().
+ * This function is for users who don't care what the entry is, only that
+ * one is present.
+ *
+ * Return: -EEXIST if another entry was present, 0 if the store succeeded,
+ * or another negative errno if a different error happened (eg -ENOMEM).
+ */
+static inline int __xa_insert(struct xarray *xa, unsigned long index,
+   void *entry, gfp_t gfp)
+{
+   void *curr = __xa_cmpxchg(xa, index, NULL, entry, gfp);
+   if (!curr)
+   return 0;
+   if (xa_is_err(curr))
+   return xa_err(curr);
+   return -EEXIST;
+}
+
 /* Everything below here is the Advanced API.  Proceed with caution. */
 
 /*
diff --git a/lib/xarray.c b/lib/xarray.c
index 45b70e622bf1..d925a98fb9b8 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -928,6 +928,74 @@ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
 }
 EXPORT_SYMBOL(__xa_store);
 
+/**
+ * xa_cmpxchg() - Conditionally replace an entry in the XArray.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @old: Old value to test against.
+ * @entry: New value to place in array.
+ * @gfp: Memory allocation flags.
+ *
+ * If the entry at @index is the same as @old, replace it with @entry.
+ * If the return value is equal to @old, then the exchange was successful.
+ *
+ * Return: The old value at this index or xa_err() if an error happened.
+ */
+void *xa_cmpxchg(struct xarray *xa, unsigned long index,
+   void *old, void *entry, gfp_t gfp)
+{
+   XA_STATE(xas, xa, index);
+   void *curr;
+
+   if (WARN_ON_ONCE(xa_is_internal(entry)))
+   return XA_ERROR(-EINVAL);
+
+   do {
+   xas_lock(&xas);
+   c

[PATCH v6 12/99] xarray: Add xa_for_each

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

This iterator allows the user to efficiently walk a range of the array,
executing the loop body once for each entry in that range that matches
the filter.  This commit also includes xa_find() and xa_find_above()
which are helper functions for xa_for_each() but may also be useful in
their own right.

In the xas family of functions, we also have xas_for_each(), xas_find(),
xas_next_entry(), xas_for_each_tag(), xas_find_tag(), xas_next_tag()
and xas_pause().
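
A sketch of the convenience iterator (XA_PRESENT selects every
non-NULL entry; process() is illustrative):

        unsigned long index = 0;
        void *entry;

        xa_for_each(&array, entry, index, ULONG_MAX, XA_PRESENT)
                process(index, entry);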

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h | 171 +
 lib/xarray.c   | 272 +
 tools/testing/radix-tree/test.c|  13 ++
 tools/testing/radix-tree/test.h|   1 +
 tools/testing/radix-tree/xarray-test.c | 122 +++
 5 files changed, 579 insertions(+)

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index fc9ab3b13e60..fcd7ef68933a 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -215,6 +215,10 @@ void *xa_cmpxchg(struct xarray *, unsigned long index,
 bool xa_get_tag(struct xarray *, unsigned long index, xa_tag_t);
 void xa_set_tag(struct xarray *, unsigned long index, xa_tag_t);
 void xa_clear_tag(struct xarray *, unsigned long index, xa_tag_t);
+void *xa_find(struct xarray *xa, unsigned long *index,
+   unsigned long max, xa_tag_t) __attribute__((nonnull(2)));
+void *xa_find_after(struct xarray *xa, unsigned long *index,
+   unsigned long max, xa_tag_t) __attribute__((nonnull(2)));
 
 /**
  * xa_init() - Initialise an empty XArray.
@@ -266,6 +270,33 @@ static inline bool xa_tagged(const struct xarray *xa, xa_tag_t tag)
return xa->xa_flags & XA_FLAGS_TAG(tag);
 }
 
+/**
+ * xa_for_each() - Iterate over a portion of an XArray.
+ * @xa: XArray.
+ * @entry: Entry retrieved from array.
+ * @index: Index of @entry.
+ * @max: Maximum index to retrieve from array.
+ * @filter: Selection criterion.
+ *
+ * Initialise @index to the minimum index you want to retrieve from
+ * the array.  During the iteration, @entry will have the value of the
+ * entry stored in @xa at @index.  The iteration will skip all entries in
+ * the array which do not match @filter.  You may modify @index during the
+ * iteration if you want to skip or reprocess indices.  It is safe to modify
+ * the array during the iteration.  At the end of the iteration, @entry will
+ * be set to NULL and @index will have a value less than or equal to max.
+ *
+ * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n).  You have
+ * to handle your own locking with xas_for_each(), and if you have to unlock
+ * after each iteration, it will also end up being O(n.log(n)).  xa_for_each()
+ * will spin if it hits a retry entry; if you intend to see retry entries,
+ * you should use the xas_for_each() iterator instead.  The xas_for_each()
+ * iterator will expand into more inline code than xa_for_each().
+ */
+#define xa_for_each(xa, entry, index, max, filter) \
+   for (entry = xa_find(xa, &index, max, filter); entry; \
+entry = xa_find_after(xa, &index, max, filter))
+
 /**
  * xa_insert() - Store this entry in the XArray unless another entry is
  * already present.
@@ -620,6 +651,12 @@ static inline bool xas_valid(const struct xa_state *xas)
return !xas_invalid(xas);
 }
 
+/* True if the pointer is something other than a node */
+static inline bool xas_not_node(struct xa_node *node)
+{
+   return ((unsigned long)node & 3) || !node;
+}
+
 /* True if the node represents head-of-tree, RESTART or BOUNDS */
 static inline bool xas_top(struct xa_node *node)
 {
@@ -648,13 +685,16 @@ static inline bool xas_retry(struct xa_state *xas, const void *entry)
 void *xas_load(struct xa_state *);
 void *xas_store(struct xa_state *, void *entry);
 void *xas_create(struct xa_state *);
+void *xas_find(struct xa_state *, unsigned long max);
 
 bool xas_get_tag(const struct xa_state *, xa_tag_t);
 void xas_set_tag(const struct xa_state *, xa_tag_t);
 void xas_clear_tag(const struct xa_state *, xa_tag_t);
+void *xas_find_tag(struct xa_state *, unsigned long max, xa_tag_t);
 void xas_init_tags(const struct xa_state *);
 
 bool xas_nomem(struct xa_state *, gfp_t);
+void xas_pause(struct xa_state *);
 
 /**
  * xas_reload() - Refetch an entry from the xarray.
@@ -727,6 +767,137 @@ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
xas->xa_update = update;
 }
 
+/* Skip over any of these entries when iterating */
+static inline bool xa_iter_skip(const void *entry)
+{
+   return unlikely(!entry ||
+   (xa_is_internal(entry) && entry < XA_RETRY_ENTRY));
+}
+
+/**
+ * xas_next_entry() - Advance iterator to next present entry.
+ * @xas: XArray operation state.
+ * @max: Highest index to return.
+ *
+ * xas_next_entry() is an inline function to optimise xarray traversal for
+ * speed.  It is equivalent to calling
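
For illustration only (not part of the patch): a minimal sketch of the new
iterator, assuming the five-argument xa_for_each() added above and the
%XA_PRESENT filter from this series; dump_all() is a made-up helper.

static void dump_all(struct xarray *xa)
{
        void *entry;
        unsigned long index = 0;

        /* Visits every non-NULL entry; index tracks the current slot. */
        xa_for_each(xa, entry, index, ULONG_MAX, XA_PRESENT)
                pr_info("index %lu: %px\n", index, entry);
}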

[PATCH v6 13/99] xarray: Add xa_extract

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

This function combines the functionality of radix_tree_gang_lookup() and
radix_tree_gang_lookup_tagged().  It extracts entries matching the
specified filter into a normal array.

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h |  2 ++
 lib/xarray.c   | 80 ++
 2 files changed, 82 insertions(+)

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index fcd7ef68933a..d79fd48e4957 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -219,6 +219,8 @@ void *xa_find(struct xarray *xa, unsigned long *index,
unsigned long max, xa_tag_t) __attribute__((nonnull(2)));
 void *xa_find_after(struct xarray *xa, unsigned long *index,
unsigned long max, xa_tag_t) __attribute__((nonnull(2)));
+unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
+   unsigned long max, unsigned int n, xa_tag_t);
 
 /**
  * xa_init() - Initialise an empty XArray.
diff --git a/lib/xarray.c b/lib/xarray.c
index 3e6be0a07525..be276618f81b 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1368,6 +1368,86 @@ void *xa_find_after(struct xarray *xa, unsigned long *indexp,
 }
 EXPORT_SYMBOL(xa_find_after);
 
+static unsigned int xas_extract_present(struct xa_state *xas, void **dst,
+   unsigned long max, unsigned int n)
+{
+   void *entry;
+   unsigned int i = 0;
+
+   rcu_read_lock();
+   xas_for_each(xas, entry, max) {
+   if (xas_retry(xas, entry))
+   continue;
+   dst[i++] = entry;
+   if (i == n)
+   break;
+   }
+   rcu_read_unlock();
+
+   return i;
+}
+
+static unsigned int xas_extract_tag(struct xa_state *xas, void **dst,
+   unsigned long max, unsigned int n, xa_tag_t tag)
+{
+   void *entry;
+   unsigned int i = 0;
+
+   rcu_read_lock();
+   xas_for_each_tag(xas, entry, max, tag) {
+   if (xas_retry(xas, entry))
+   continue;
+   dst[i++] = entry;
+   if (i == n)
+   break;
+   }
+   rcu_read_unlock();
+
+   return i;
+}
+
+/**
+ * xa_extract() - Copy selected entries from the XArray into a normal array.
+ * @xa: The source XArray to copy from.
+ * @dst: The buffer to copy entries into.
+ * @start: The first index in the XArray eligible to be selected.
+ * @max: The last index in the XArray eligible to be selected.
+ * @n: The maximum number of entries to copy.
+ * @filter: Selection criterion.
+ *
+ * Copies up to @n entries that match @filter from the XArray.  The
+ * copied entries will have indices between @start and @max, inclusive.
+ *
+ * The @filter may be an XArray tag value, in which case entries which are
+ * tagged with that tag will be copied.  It may also be %XA_PRESENT, in
+ * which case non-NULL entries will be copied.
+ *
+ * This function uses the RCU lock to protect itself.  That means that the
+ * entries returned may not represent a snapshot of the XArray at a moment
+ * in time.  For example, if index 5 is stored to, then index 10 is stored to,
+ * calling xa_extract() may return the old contents of index 5 and the
+ * new contents of index 10.  Indices not modified while this function is
+ * running will not be skipped.
+ *
+ * If you need stronger guarantees, holding the xa_lock across calls to this
+ * function will prevent concurrent modification.
+ *
+ * Return: The number of entries copied.
+ */
+unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start,
+   unsigned long max, unsigned int n, xa_tag_t filter)
+{
+   XA_STATE(xas, xa, start);
+
+   if (!n)
+   return 0;
+
+   if ((__force unsigned int)filter < XA_MAX_TAGS)
+   return xas_extract_tag(&xas, dst, max, n, filter);
+   return xas_extract_present(&xas, dst, max, n);
+}
+EXPORT_SYMBOL(xa_extract);
+
 #ifdef XA_DEBUG
 void xa_dump_node(const struct xa_node *node)
 {
-- 
2.15.1
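
A usage sketch (not from the patch), following the kernel-doc above;
grab_batch() is a made-up helper, and %XA_PRESENT selects all non-NULL
entries:

static unsigned int grab_batch(struct xarray *xa, void **dst, unsigned int n)
{
        /* Copy up to n present entries with indices in [0, ULONG_MAX]. */
        return xa_extract(xa, dst, 0, ULONG_MAX, n, XA_PRESENT);
}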



[PATCH v6 00/99] XArray version 6

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

This version of the XArray has no known bugs.  I have converted the
radix tree test suite entirely over to the XArray and fixed all bugs
that it has uncovered.  There are additional tests in the test suite for
the XArray, so I now claim the XArray has better test coverage than the
Radix Tree did.  Of course, that is not the same thing as fewer bugs,
but it now stands up to the tender embraces of Trinity without crashing.

You can get this version from my git tree here:
http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/xarray-2018-01-09
which includes a number of other patches that are at least tangentially
related to this patch set.

Most of the work I've done recently has been converting additional users
from the radix tree to the XArray.  That's going pretty well; still 24
radix tree users left to convert.  It's been worth doing because I've
spotted several common patterns that have led to changes (the lock_type,
reserve/release) and some common patterns that I'll add support for
later (chaining multiple entries from a single index, wanting to use
64-bit indices on 32-bit machines, having an array of XArrays, various
workarounds for not having range entries yet).

As far as line count goes, for the whole git tree, we're at:
 212 files changed, 7764 insertions(+), 7002 deletions(-)
with another 1376 lines to delete from radix-tree.[ch].  That doesn't take
into account the 371 lines of xarray.rst, the 587 lines of xarray-test.c,
and the fact that almost half of lib/xarray.c and include/linux/xarray.h
is documentation.

Changes since version 5:

 - Rebased to 4.15-rc8

API changes:
 - Renamed __xa_init() to xa_init_flags().
 - Added DEFINE_XARRAY_FLAGS().
 - Renamed xa_ctx to xa_lock_type; store it in the XA_FLAGS and use separate
   locking classes for each type so that lockdep doesn't emit spurious
   warnings.  It also reduces the amount of boilerplate.
 - Combined __xa_store_bh, __xa_store_irq and __xa_store into __xa_store().
 - Ditto for __xa_cmpxchg().
 - Renamed xa_store_empty() to xa_insert().
 - Added __xa_insert().
 - Added xa_reserve() and xa_release().
 - Renamed XA_NO_TAG to XA_PRESENT.
 - Combined xa_get_entries(), xa_get_tagged() and xa_get_maybe_tag()
   into xa_extract().
 - Added 'filter' argument to xa_find(), xa_find_after() and xa_for_each()
   to match xa_extract() and provide the functionality that would
   have otherwise had to be added in the form of xa_find_tag(),
   xa_find_tag_after() and xa_for_each_tag().
 - Replaced workingset_lookup_update() with mapping_set_update().
 - Renamed page_cache_tree_delete() to page_cache_delete().

New xarray users:
 - Converted SuperH interrupt controller radix tree to XArray.
 - Converted blk-cgroup radix tree to XArray.
 - Converted blk-ioc radix tree to XArray.
 - Converted i915 handles_vma radix tree to XArray.
 - Converted s390 gmap radix trees to XArray.
 - Converted hwspinlock to XArray.
 - Converted btrfs fs_roots to XArray.
 - Converted btrfs reada_zones to XArray.
 - Converted btrfs reada_extents to XArray.
 - Converted btrfs reada_tree to XArray.
 - Converted btrfs buffer_radix to XArray.
 - Converted btrfs delayed_nodes to XArray.
 - Converted btrfs name_cache to XArray.
 - Converted f2fs pids radix tree to XArray.
 - Converted f2fs ino_root radix tree to XArray.
 - Converted f2fs extent_tree to XArray.
 - Converted f2fs gclist radix tree to XArray.
 - Converted dma-debug active cacheline radix tree to XArray.
 - Converted Xen pvcalls-back socketpass_mappings to XArray.
 - Converted net/qrtr radix tree to XArray.
 - Converted null_blk radix trees to XArray.

Documentation:
 - Added a bit more internals documentation.
 - Rewrote xa_init_flags documentation.
 - Added the __xa_ functions to the locking table.
 - Rewrote the section on using the __xa_ functions.

Internal changes:
 - Free up the bottom four bits of the xa_flags, since these are not
   valid GFP flags to pass to kmem_cache_alloc().
 - Moved the XA_FLAGS_TRACK_FREE bit to the bottom bits of the flags to leave
   space for more tags (later).
 - Fixed multiple bugs in xas_find() and xas_find_tag().
 - Fixed bug in shrinking XArray (and add a test case that exercises it).
 - Fixed bug in erasing multi-index entries.
 - Fixed a compile warning with CONFIG_RADIX_TREE_MULTIORDER=n.
 - Added an xas_update() helper.
 - Use ->array to track an xa_node's state through its lifecycle
   (allocated -> rcu_free -> actually free).
 - Made XA_BUG_ON dump the entire tree while XA_NODE_BUG_ON dumps only the
   node that appears suspect.
 - Fixed debugging printks to use %px and pr_cont/pr_info etc.
 - Renamed some internal tag functions.
 - Moved xa_track_free() from xarray.h to xarray.c.

Test suite:
 - Added new tests for xas_find() and xas_find_tag().
 - Added new tests for the update_node functionality.
 - Converted the radix tree test suite to the xarray API.

Matthew Wilcox (99):
  xarray: Add the xa_lock to the radix_tree_root
  page cache

Re: [Nouveau] [RFC 0/4] Implement full clockgating for Kepler1 and 2

2018-01-17 Thread Mikko Perttunen

On 01/16/2018 12:06 AM, Lyude Paul wrote:

It's here! After a lot of investigation, rewrites, and traces, I present
the patch series to implement all known levels of clockgating for
Kepler1 and Kepler2 GPUs.

Starting with Fermi GPUs (this is probably present on earlier GPUs as
well, but with a far less easy to manage interface), nvidia added two
clockgating levels that are handled mostly in firmware (with the
exception of course, of the driver initially programming all of the
register values containing engine delays and that stuff):
   - CG_CTRL - Main register for enabling/disabling clockgating for
 engines and hw blocks
   - BLCG - "Block-level clockgating", a deeper level of clockgating
Starting with kepler2 as well, nvidia also introduced:
   - SLCG - "??? clockgating" even deeper level of clockgating


FWIW, SLCG stands for "second level clock gating".

Cheers,
Mikko


[PATCH v6 15/99] xarray: Add xas_next and xas_prev

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

These two functions move the xas index by one position, and adjust the
rest of the iterator state to match it.  This is more efficient than
calling xas_set() as it keeps the iterator at the leaves of the tree
instead of walking the iterator from the root each time.

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h |  67 +
 lib/xarray.c   |  74 ++
 tools/testing/radix-tree/xarray-test.c | 259 +
 3 files changed, 400 insertions(+)

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index d106b2fe4cec..01ce313fc00e 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -660,6 +660,12 @@ static inline bool xas_not_node(struct xa_node *node)
return ((unsigned long)node & 3) || !node;
 }
 
+/* True if the node represents RESTART or an error */
+static inline bool xas_frozen(struct xa_node *node)
+{
+   return (unsigned long)node & 2;
+}
+
 /* True if the node represents head-of-tree, RESTART or BOUNDS */
 static inline bool xas_top(struct xa_node *node)
 {
@@ -901,6 +907,67 @@ enum {
for (entry = xas_find_tag(xas, max, tag); entry; \
 entry = xas_next_tag(xas, max, tag))
 
+void *__xas_next(struct xa_state *);
+void *__xas_prev(struct xa_state *);
+
+/**
+ * xas_prev() - Move iterator to previous index.
+ * @xas: XArray operation state.
+ *
+ * If the @xas was in an error state, it will remain in an error state
+ * and this function will return %NULL.  If the @xas has never been walked,
+ * it will have the effect of calling xas_load().  Otherwise one will be
+ * subtracted from the index and the state will be walked to the correct
+ * location in the array for the next operation.
+ *
+ * If the iterator was referencing index 0, this function wraps
+ * around to %ULONG_MAX.
+ *
+ * Return: The entry at the new index.  This may be %NULL or an internal
+ * entry, although it should never be a node entry.
+ */
+static inline void *xas_prev(struct xa_state *xas)
+{
+   struct xa_node *node = xas->xa_node;
+
+   if (unlikely(xas_not_node(node) || node->shift ||
+   xas->xa_offset == 0))
+   return __xas_prev(xas);
+
+   xas->xa_index--;
+   xas->xa_offset--;
+   return xa_entry(xas->xa, node, xas->xa_offset);
+}
+
+/**
+ * xas_next() - Move state to next index.
+ * @xas: XArray operation state.
+ *
+ * If the @xas was in an error state, it will remain in an error state
+ * and this function will return %NULL.  If the @xas has never been walked,
+ * it will have the effect of calling xas_load().  Otherwise one will be
+ * added to the index and the state will be walked to the correct
+ * location in the array for the next operation.
+ *
+ * If the iterator was referencing index %ULONG_MAX, this function wraps
+ * around to 0.
+ *
+ * Return: The entry at the new index.  This may be %NULL or an internal
+ * entry, although it should never be a node entry.
+ */
+static inline void *xas_next(struct xa_state *xas)
+{
+   struct xa_node *node = xas->xa_node;
+
+   if (unlikely(xas_not_node(node) || node->shift ||
+   xas->xa_offset == XA_CHUNK_MASK))
+   return __xas_next(xas);
+
+   xas->xa_index++;
+   xas->xa_offset++;
+   return xa_entry(xas->xa, node, xas->xa_offset);
+}
+
/* Internal functions, mostly shared between radix-tree.c, xarray.c and idr.c */
 void xas_destroy(struct xa_state *);
 
diff --git a/lib/xarray.c b/lib/xarray.c
index af81d4bf9ae1..e8ece1fff9fd 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -838,6 +838,80 @@ void xas_pause(struct xa_state *xas)
 }
 EXPORT_SYMBOL_GPL(xas_pause);
 
+/*
+ * __xas_prev() - Find the previous entry in the XArray.
+ * @xas: XArray operation state.
+ *
+ * Helper function for xas_prev() which handles all the complex cases
+ * out of line.
+ */
+void *__xas_prev(struct xa_state *xas)
+{
+   void *entry;
+
+   if (!xas_frozen(xas->xa_node))
+   xas->xa_index--;
+   if (xas_not_node(xas->xa_node))
+   return xas_load(xas);
+
+   if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
+   xas->xa_offset--;
+
+   while (xas->xa_offset == 255) {
+   xas->xa_offset = xas->xa_node->offset - 1;
+   xas->xa_node = xa_parent(xas->xa, xas->xa_node);
+   if (!xas->xa_node)
+   return set_bounds(xas);
+   }
+
+   for (;;) {
+   entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
+   if (!xa_is_node(entry))
+   return entry;
+
+   xas->xa_node = xa_to_node(entry);
+   xas_set_offset(xas);
+   }
+}
+EXPORT_SYMBOL_GPL(__xas_prev);
+
+/*
+ * __xas_next() - Find the next entry in the XArray.
+ * @xas: XArray operation state.
+ *
+ * Helper function for xas_next() which handles all the complex cases
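
A sketch of the intended usage (not from the patch): stepping through
adjacent indices with xas_next() without re-walking from the root each
time.  print_window() is a made-up helper and simply skips internal
entries rather than handling retries.

static void print_window(struct xarray *xa, unsigned long first,
                         unsigned long last)
{
        XA_STATE(xas, xa, first);
        void *entry;

        rcu_read_lock();
        for (entry = xas_load(&xas); xas.xa_index < last;
             entry = xas_next(&xas)) {
                if (entry && !xa_is_internal(entry))
                        pr_info("%lu: %px\n", xas.xa_index, entry);
        }
        rcu_read_unlock();
}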

[PATCH v6 14/99] xarray: Add xa_destroy

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

This function frees all the internal memory allocated to the xarray
and reinitialises it to be empty.

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h |  1 +
 lib/xarray.c   | 26 ++
 2 files changed, 27 insertions(+)

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index d79fd48e4957..d106b2fe4cec 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -221,6 +221,7 @@ void *xa_find_after(struct xarray *xa, unsigned long *index,
unsigned long max, xa_tag_t) __attribute__((nonnull(2)));
 unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
unsigned long max, unsigned int n, xa_tag_t);
+void xa_destroy(struct xarray *);
 
 /**
  * xa_init() - Initialise an empty XArray.
diff --git a/lib/xarray.c b/lib/xarray.c
index be276618f81b..af81d4bf9ae1 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1448,6 +1448,32 @@ unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start,
 }
 EXPORT_SYMBOL(xa_extract);
 
+/**
+ * xa_destroy() - Free all internal data structures.
+ * @xa: XArray.
+ *
+ * After calling this function, the XArray is empty and has freed all memory
+ * allocated for its internal data structures.  You are responsible for
+ * freeing the objects referenced by the XArray.
+ */
+void xa_destroy(struct xarray *xa)
+{
+   XA_STATE(xas, xa, 0);
+   unsigned long flags;
+   void *entry;
+
+   xas.xa_node = NULL;
+   xas_lock_irqsave(&xas, flags);
+   entry = xa_head_locked(xa);
+   RCU_INIT_POINTER(xa->xa_head, NULL);
+   xas_init_tags(&xas);
+   /* lockdep checks we're still holding the lock in xas_free_nodes() */
+   if (xa_is_node(entry))
+   xas_free_nodes(&xas, xa_to_node(entry));
+   xas_unlock_irqrestore(&xas, flags);
+}
+EXPORT_SYMBOL(xa_destroy);
+
 #ifdef XA_DEBUG
 void xa_dump_node(const struct xa_node *node)
 {
-- 
2.15.1
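
A teardown sketch (not from the patch), assuming the five-argument
xa_for_each() from this series and entries that were kmalloc()ed by the
caller; as the kernel-doc says, xa_destroy() frees only the internal nodes:

static void teardown(struct xarray *xa)
{
        unsigned long index = 0;
        void *obj;

        xa_for_each(xa, obj, index, ULONG_MAX, XA_PRESENT)
                kfree(obj);     /* our objects; xa_destroy() won't free them */
        xa_destroy(xa);
}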



[PATCH v6 16/99] xarray: Add xas_create_range

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

This hopefully temporary function is useful for users who have not yet
been converted to multi-index entries.

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h |  2 ++
 lib/xarray.c   | 22 ++
 2 files changed, 24 insertions(+)

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 01ce313fc00e..acb6d02ff194 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -705,6 +705,8 @@ void xas_init_tags(const struct xa_state *);
 bool xas_nomem(struct xa_state *, gfp_t);
 void xas_pause(struct xa_state *);
 
+void xas_create_range(struct xa_state *, unsigned long max);
+
 /**
  * xas_reload() - Refetch an entry from the xarray.
  * @xas: XArray operation state.
diff --git a/lib/xarray.c b/lib/xarray.c
index e8ece1fff9fd..c044373d6893 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -612,6 +612,28 @@ void *xas_create(struct xa_state *xas)
 }
 EXPORT_SYMBOL_GPL(xas_create);
 
+/**
+ * xas_create_range() - Ensure that stores to this range will succeed
+ * @xas: XArray operation state.
+ * @max: The highest index to create a slot for.
+ *
+ * Creates all of the slots in the range between the current position of
+ * @xas and @max.  This is for the benefit of users who have not yet been
+ * converted to multi-index entries.
+ *
+ * The implementation is naive.
+ */
+void xas_create_range(struct xa_state *xas, unsigned long max)
+{
+   XA_STATE(tmp, xas->xa, xas->xa_index);
+
+   do {
+   xas_create(&tmp);
+   xas_set(&tmp, tmp.xa_index + XA_CHUNK_SIZE);
+   } while (tmp.xa_index < max);
+}
+EXPORT_SYMBOL_GPL(xas_create_range);
+
 static void store_siblings(struct xa_state *xas, void *entry, void *curr,
int *countp, int *valuesp)
 {
-- 
2.15.1
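
A usage sketch (not from the patch), assuming the usual xas_nomem() retry
pattern used elsewhere in this series; prepare_range() is a made-up helper:

static int prepare_range(struct xarray *xa, unsigned long index,
                         unsigned long last)
{
        XA_STATE(xas, xa, index);

        do {
                xas_lock(&xas);
                xas_create_range(&xas, last);   /* slots for [index, last] */
                xas_unlock(&xas);
        } while (xas_nomem(&xas, GFP_KERNEL));

        return xas_error(&xas);
}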



[PATCH v6 17/99] xarray: Add MAINTAINERS entry

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Add myself as XArray and IDR maintainer.

Signed-off-by: Matthew Wilcox 
---
 MAINTAINERS | 12 
 1 file changed, 12 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 18994806e441..55ae4c0b38d5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14893,6 +14893,18 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/vdso
 S: Maintained
 F: arch/x86/entry/vdso/
 
+XARRAY
+M: Matthew Wilcox 
+M: Matthew Wilcox 
+L: linux-fsde...@vger.kernel.org
+S: Supported
+F: Documentation/core-api/xarray.rst
+F: lib/idr.c
+F: lib/xarray.c
+F: include/linux/idr.h
+F: include/linux/xarray.h
+F: tools/testing/radix-tree
+
 XC2028/3028 TUNER DRIVER
 M: Mauro Carvalho Chehab 
 M: Mauro Carvalho Chehab 
-- 
2.15.1



Re: [PATCH V4 1/2] nvme: add NVME_CTRL_RESET_PREPARE state

2018-01-17 Thread James Smart
I'm having a hard time following why this patch is being requested. Help 
me catch on.


On 1/16/2018 8:54 PM, Jianchao Wang wrote:

Currently, the ctrl->state will be changed to NVME_CTRL_RESETTING
before queueing the reset work. This is not very strict. There could be
a big gap before the reset_work callback is invoked.
OK, so what you're saying is you want something to know that you've 
transitioned to RESETTING but not yet performed an action for the 
reset.  What is that something, and what is it to do?


Guessing - I assume it's the transport's xxx_is_ready() and/or 
nvmf_check_init_req(), which is called at the start of queue_rq(), that 
wants to do something?




  In addition,
there is some disable work in the reset_work callback which, strictly
speaking, is not part of the reset work and could lead to some confusion.


Can you explain this?  What's the confusion?

I assume by "disable" you mean it is quiescing queues?




In addition, after setting the state to RESETTING and the disable procedure,
nvme-rdma/fc use NVME_CTRL_RECONNECTING to mark the setup and
reconnect procedure. The RESETTING state has been narrowed.


I still don't follow. Yes, RECONNECTING is where we repetitively try to 
create a link-side association again: if it fails, delay and try again; 
if it succeeds, reinit the controller and unquiesce all queues - 
allowing full operation again, at which time we transition to LIVE.


by "narrowed" what do you mean ?    what "narrowed" ?

In FC, as we have a lot of work that must occur to terminate io as part 
of the reset, it can be a fairly long window.  I don't know that any 
functionality in this path, regardless of time window, has narrowed.





This patch adds the NVME_CTRL_RESET_PREPARE state to mark the reset_work
or error recovery work, the scheduling gap and the disable procedure.
After that,
  - For nvme-pci, nvmet-loop, set state to RESETTING, start
initialization.
  - For nvme-rdma, nvme-fc, set state to RECONNECTING, start
initialization or reconnect.


So I'm lost - so you've effectively renamed RESETTING to RESET_PREPARE 
for fc/rdma.  What do you define as the actions in RESETTING that went 
away, and why is that different between pci and the other transports?
Why doesn't nvme-pci need to go through RESET_PREPARE?  Doesn't it have 
the same scheduling window for a reset_work thread?



On 1/17/2018 1:06 AM, Max Gurtovoy wrote:



+
+    case NVME_CTRL_RESETTING:
+    switch (old_state) {
+    case NVME_CTRL_RESET_PREPARE:
+    changed = true;
+    /* FALLTHRU */
+    default:
+    break;
+    }
+    break;
  case NVME_CTRL_RECONNECTING:
  switch (old_state) {
  case NVME_CTRL_LIVE:
-    case NVME_CTRL_RESETTING:
+    case NVME_CTRL_RESET_PREPARE:


As I suggested in V3, please don't allow this transition.
We'll move to NVME_CTRL_RECONNECTING from NVME_CTRL_RESETTING.

I look on it like that:

NVME_CTRL_RESET_PREPARE - "suspend" state
NVME_CTRL_RESETTING - "resume" state

you don't reconnect from "suspend" state, you must "resume" before you 
reconnect.


This makes no sense to me.

I could use a definition of what "suspend" and "resume" mean to you.

from what I've seen so far:
NVME_CTRL_RESET_PREPARE:   means I've decided to reset, changed state, 
but the actual work for reset hasn't started yet.   As we haven't 
commonized who does the quiescing of the queues, the queues are still 
live at this state, although some nvme check routine may bounce them. In 
truth, the queues should be quiesced here.


NVME_CTRL_RESETTING: I'm resetting the controller, tearing down 
queues/connections, the link side association.  AFAIK - pci and all the 
other transports have to do these things.   Now is when the blk-mq 
queues get stopped.   We have a variance on whether the queues are 
unquiesced or left quiesced (I think this is what you meant by "resume", 
where resume means unquiesce) at the end of this.   The admin_q is 
unquiesced, meaning new admin cmds should fail.  rdma also has io queues 
unquiesced meaning new ios fail, while fc leaves them quiesced while 
background timers run - meaning no new ios issued, nor any fail back to 
a multipather. With the agreement that we would patch all of the 
transports to leave them quiesced with fast-fail-timeouts occurring to 
unquiesce them and start failing ios.


NVME_RECONNECTING: transitioned to after the link-side association is 
terminated and the transport will now attempt to reconnect (perhaps 
several attempts) to create a new link-side association. Stays in this 
state until the controller is fully reconnected and it transitions to 
NVME_LIVE.   Until the link side association is active, queues do what 
they do (as left by RESETTING and/or updated per timeouts) excepting 
that after an active association, the queues will be unquiesced at the 
time of the LIVE transition.   Note: we grandfathered PCI into not 
needing this state:   As you (almost) can't fail the establis

[PATCH v6 19/99] idr: Convert to XArray

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

The IDR distinguishes between unallocated entries (read as NULL) and
entries where the user has chosen to store NULL.  The radix tree was
modified to consider NULL entries which had tag 0 _clear_ as being
allocated, but it added a lot of complexity.

Instead, the XArray has a 'zero entry', which the normal API will treat
as NULL, but is distinct from NULL when using the advanced API.  The IDR
code converts between NULL and zero entries.

The idr_for_each_entry_ul() iterator becomes an alias for xa_for_each(),
so we drop the idr_get_next_ul() function as it has no users.

The exported IDR API was a weird mix of GPL-only and general symbols;
I converted them all to GPL as there was no way to use the IDR API
without being GPL.

Signed-off-by: Matthew Wilcox 
---
 Documentation/core-api/xarray.rst   |   6 +
 include/linux/idr.h | 156 ---
 include/linux/xarray.h  |  29 +++-
 lib/idr.c   | 298 ++--
 lib/radix-tree.c|  77 +-
 lib/xarray.c|  32 
 tools/testing/radix-tree/idr-test.c |  34 
 7 files changed, 419 insertions(+), 213 deletions(-)

diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
index 0172c7d9e6ea..1dea1c522506 100644
--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@@ -284,6 +284,12 @@ to :c:func:`xas_retry`, and retry the operation if it returns ``true``.
this RCU period.  You should restart the lookup from the head of the
array.
 
+   * - Zero
+ - :c:func:`xa_is_zero`
+ - Zero entries appear as ``NULL`` through the Normal API, but occupy an
+   entry in the XArray which can be tagged or otherwise used to reserve
+   the index.
+
 Other internal entries may be added in the future.  As far as possible, they
 will be handled by :c:func:`xas_retry`.
 
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 11eea38b9629..9064ae5f0abc 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -9,35 +9,35 @@
  * tables.
  */
 
-#ifndef __IDR_H__
-#define __IDR_H__
+#ifndef _LINUX_IDR_H
+#define _LINUX_IDR_H
 
 #include 
 #include 
 #include 
-#include 
+#include 
 
 struct idr {
-   struct radix_tree_root  idr_rt;
-   unsigned intidr_next;
+   struct xarray   idr_xa;
+   unsigned intidr_next;
 };
 
-/*
- * The IDR API does not expose the tagging functionality of the radix tree
- * to users.  Use tag 0 to track whether a node has free space below it.
- */
-#define IDR_FREE   0
-
-/* Set the IDR flag and the IDR_FREE tag */
-#define IDR_RT_MARKER  (ROOT_IS_IDR | (__force gfp_t)  \
-   (1 << (ROOT_TAG_SHIFT + IDR_FREE)))
+#define IDR_INIT_FLAGS (XA_FLAGS_TRACK_FREE | XA_FLAGS_LOCK_IRQ |  \
+XA_FLAGS_TAG(XA_FREE_TAG))
 
 #define IDR_INIT(name) \
 {  \
-   .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER)  \
+   .idr_xa = XARRAY_INIT_FLAGS(name.idr_xa, IDR_INIT_FLAGS),   \
+   .idr_next = 0,  \
 }
 #define DEFINE_IDR(name)   struct idr name = IDR_INIT(name)
 
+static inline void idr_init(struct idr *idr)
+{
+   xa_init_flags(&idr->idr_xa, IDR_INIT_FLAGS);
+   idr->idr_next = 0;
+}
+
 /**
  * idr_get_cursor - Return the current position of the cyclic allocator
  * @idr: idr handle
@@ -66,62 +66,83 @@ static inline void idr_set_cursor(struct idr *idr, unsigned int val)
 
 /**
  * DOC: idr sync
- * idr synchronization (stolen from radix-tree.h)
+ * idr synchronization
  *
- * idr_find() is able to be called locklessly, using RCU. The caller must
- * ensure calls to this function are made within rcu_read_lock() regions.
- * Other readers (lock-free or otherwise) and modifications may be running
- * concurrently.
+ * The IDR manages its own locking, using irqsafe spinlocks for operations
+ * which modify the IDR and RCU for operations which do not.  The user of
+ * the IDR may choose to wrap accesses to it in a lock if it needs to
+ * guarantee the IDR does not change during a read access.  The easiest way
+ * to do this is to grab the same lock the IDR uses for write accesses
+ * using one of the idr_lock() wrappers.
  *
- * It is still required that the caller manage the synchronization and
- * lifetimes of the items. So if RCU lock-free lookups are used, typically
- * this would mean that the items have their own locks, or are amenable to
- * lock-free access; and that the items are freed by RCU (or only freed after
- * having been deleted from the idr tree *and* a synchronize_rcu() grace
- * period).
+ * The caller must still manage the synchronization and lifetimes of the
+ * items. So if RCU lock-free lookups 
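
A sketch of the locking described above (not from the patch); the
idr_lock()/idr_unlock() wrapper names are assumed from the text, and
lookup_pair() is made up:

static void *lookup_pair(struct idr *idr, int a, int b, void **second)
{
        void *first;

        idr_lock(idr);          /* stable view across both lookups */
        first = idr_find(idr, a);
        *second = idr_find(idr, b);
        idr_unlock(idr);

        return first;
}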

[PATCH v6 18/99] xarray: Add ability to store errno values

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

While the radix tree offers no ability to store IS_ERR pointers,
documenting that the XArray does not support them either led to some concern.  Here is a
sanctioned way to store errnos in the XArray.  I'm concerned that it
will confuse people who can't tell the difference between xa_is_err()
and xa_is_errno(), so I've added copious kernel-doc to help them tell
the difference.

Signed-off-by: Matthew Wilcox 
---
 Documentation/core-api/xarray.rst  |  8 +--
 include/linux/xarray.h | 44 ++
 tools/testing/radix-tree/xarray-test.c |  8 ++-
 3 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
index 914999c0bf3f..0172c7d9e6ea 100644
--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@@ -42,8 +42,12 @@ When you retrieve an entry from the XArray, you can check whether it is
 a value entry by calling :c:func:`xa_is_value`, and convert it back to
 an integer by calling :c:func:`xa_to_value`.
 
-The XArray does not support storing :c:func:`IS_ERR` pointers as some
-conflict with value entries or internal entries.
+The XArray does not support storing :c:func:`IS_ERR` pointers because
+some conflict with value entries or internal entries.  If you need
+to store error numbers in the array, you can encode them into error
+entries with :c:func:`xa_mk_errno`, check whether a returned entry is
+an error with :c:func:`xa_is_errno` and convert it back into an errno
+with :c:func:`xa_to_errno`.
 
 An unusual feature of the XArray is the ability to create entries which
 occupy a range of indices.  Once stored to, looking up any index in
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index acb6d02ff194..ca6af6dd42c4 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -75,6 +75,50 @@ static inline bool xa_is_value(const void *entry)
return (unsigned long)entry & 1;
 }
 
+/**
+ * xa_mk_errno() - Create an XArray entry from an error number.
+ * @error: Error number to store in XArray.
+ *
+ * Return: An entry suitable for storing in the XArray.
+ */
+static inline void *xa_mk_errno(long error)
+{
+   return (void *)(error << 2);
+}
+
+/**
+ * xa_to_errno() - Get error number stored in an XArray entry.
+ * @entry: XArray entry.
+ *
+ * Calling this function on an entry which is not an xa_is_errno() will
+ * yield unpredictable results.  Do not confuse this function with xa_err();
+ * this function is for errnos which have been stored in the XArray, and
+ * that function is for errors returned from the XArray implementation.
+ *
+ * Return: The error number stored in the XArray entry.
+ */
+static inline long xa_to_errno(const void *entry)
+{
+   return (long)entry >> 2;
+}
+
+/**
+ * xa_is_errno() - Determine if an entry is an errno.
+ * @entry: XArray entry.
+ *
+ * Do not confuse this function with xa_is_err(); that function tells you
+ * whether the XArray implementation returned an error; this function
+ * tells you whether the entry you successfully stored in the XArray
+ * represented an errno.  If you have never stored an errno in the XArray,
+ * you do not have to check this.
+ *
+ * Return: True if the entry is an errno, false if it is a pointer.
+ */
+static inline bool xa_is_errno(const void *entry)
+{
+   return (((unsigned long)entry & 3) == 0) && (entry > (void *)-4096);
+}
+
 /*
  * xa_mk_internal() - Create an internal entry.
  * @v: Value to turn into an internal entry.
diff --git a/tools/testing/radix-tree/xarray-test.c b/tools/testing/radix-tree/xarray-test.c
index 2ad460c1febf..4d3541ac31e9 100644
--- a/tools/testing/radix-tree/xarray-test.c
+++ b/tools/testing/radix-tree/xarray-test.c
@@ -29,7 +29,13 @@ void check_xa_err(struct xarray *xa)
assert(xa_err(xa_store(xa, 1, xa_mk_value(0), GFP_KERNEL)) == 0);
assert(xa_err(xa_store(xa, 1, NULL, 0)) == 0);
 // kills the test-suite :-(
-// assert(xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) == -EINVAL);
+// assert(xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) == -EINVAL);
+
+   assert(xa_err(xa_store(xa, 0, xa_mk_errno(-ENOMEM), GFP_KERNEL)) == 0);
+   assert(xa_err(xa_load(xa, 0)) == 0);
+   assert(xa_is_errno(xa_load(xa, 0)) == true);
+   assert(xa_to_errno(xa_load(xa, 0)) == -ENOMEM);
+   xa_erase(xa, 0);
 }
 
 void check_xa_tag(struct xarray *xa)
-- 
2.15.1
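
A usage sketch (not from the patch), mirroring the test above;
record_result() and check_result() are made up:

static int record_result(struct xarray *xa, unsigned long index, int err)
{
        /* Store the failure itself so later readers can see why. */
        return xa_err(xa_store(xa, index, xa_mk_errno(err), GFP_KERNEL));
}

static int check_result(struct xarray *xa, unsigned long index)
{
        void *entry = xa_load(xa, index);

        return xa_is_errno(entry) ? xa_to_errno(entry) : 0;
}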



Re: [Suspected-Phishing]Re: [PATCH V3 1/2] nvme: split resetting state into reset_prepate and resetting

2018-01-17 Thread James Smart

On 1/17/2018 2:37 AM, Sagi Grimberg wrote:


After Sagi's nvme-rdma: fix concurrent reset and reconnect, the rdma 
ctrl state is changed to RECONNECTING state
after some clearing and shutdown work, then some initializing 
procedure, regardless of whether it is the reset work path or the error
recovery path.

The fc reset work also does the same thing.
So if we define the ranges such that RESET_PREPARE includes the
scheduling gap and the disable and clear work, and RESETTING includes
the initializing procedure, then RECONNECTING is very similar to
RESETTING.

Maybe we could do like this;
In nvme fc/rdma
- set state to RESET_PREPARE, queue reset_work/err_work
- clear/shutdown works, set state to RECONNECTING


Should be fine.


In nvme pci
- set state to RESET_PREPARE, queue reset_work
- clear/shutdown works, set state to RESETTING
- initialization, set state to LIVE


Given that we split reset state and we have a clear symmetry between
the transports, do we want to maybe come up with a unique state that is
coherent across all transports?

Maybe we rename them to NVME_CTRL_SHUTTING_DOWN and
NVME_CTRL_ESTABLISHING? I'm open for better names..


I'm leaning toward this latter suggestion - we need to define the states 
and the actions they take.  It seems to me that RESETTING became the 
"init controller" part in Jianchao's model.  So maybe it's not the 
shutting down that needs a new state, but rather the REINIT part.


-- james



Re: [PATCH] vxlan: Fix trailing semicolon

2018-01-17 Thread David Miller
From: Luis de Bethencourt 
Date: Tue, 16 Jan 2018 15:03:32 +

> The trailing semicolon is an empty statement that does no operation.
> It is completely stripped out by the compiler. Removing it since it doesn't do
> anything.
> 
> Signed-off-by: Luis de Bethencourt 

Applied to net-next, thanks.


[PATCH v6 21/99] xarray: Add xa_reserve and xa_release

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

This function simply creates a slot in the XArray for users which need
to acquire multiple locks before storing their entry in the tree and
so cannot use a plain xa_store().

Signed-off-by: Matthew Wilcox 
---
 include/linux/xarray.h | 14 ++
 lib/xarray.c   | 51 ++
 tools/testing/radix-tree/xarray-test.c | 25 +
 3 files changed, 90 insertions(+)

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 6f59f1f60205..c3f7405c5517 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -259,6 +259,7 @@ void *xa_load(struct xarray *, unsigned long index);
 void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
 void *xa_cmpxchg(struct xarray *, unsigned long index,
void *old, void *entry, gfp_t);
+int xa_reserve(struct xarray *, unsigned long index, gfp_t);
 bool xa_get_tag(struct xarray *, unsigned long index, xa_tag_t);
 void xa_set_tag(struct xarray *, unsigned long index, xa_tag_t);
 void xa_clear_tag(struct xarray *, unsigned long index, xa_tag_t);
@@ -373,6 +374,19 @@ static inline int xa_insert(struct xarray *xa, unsigned long index,
return -EEXIST;
 }
 
+/**
+ * xa_release() - Release a reserved entry.
+ * @xa: XArray.
+ * @index: Index of entry.
+ *
+ * After calling xa_reserve(), you can call this function to release the
+ * reservation.  It is harmless to call this function if the entry was used.
+ */
+static inline void xa_release(struct xarray *xa, unsigned long index)
+{
+   xa_cmpxchg(xa, index, NULL, NULL, 0);
+}
+
 #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock)
 #define xa_lock(xa)spin_lock(&(xa)->xa_lock)
 #define xa_unlock(xa)  spin_unlock(&(xa)->xa_lock)
diff --git a/lib/xarray.c b/lib/xarray.c
index ace309cc9253..b4dec8e2d202 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1275,6 +1275,8 @@ void *xa_cmpxchg(struct xarray *xa, unsigned long index,
do {
xas_lock(&xas);
curr = xas_load(&xas);
+   if (curr == XA_ZERO_ENTRY)
+   curr = NULL;
if (curr == old)
xas_store(&xas, entry);
xas_unlock(&xas);
@@ -1310,6 +1312,8 @@ void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
 
do {
curr = xas_load(&xas);
+   if (curr == XA_ZERO_ENTRY)
+   curr = NULL;
if (curr == old)
xas_store(&xas, entry);
} while (__xas_nomem(&xas, gfp));
@@ -1318,6 +1322,53 @@ void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
 }
 EXPORT_SYMBOL(__xa_cmpxchg);
 
+/**
+ * xa_reserve() - Reserve this index in the XArray.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @gfp: Memory allocation flags.
+ *
+ * Ensures there is somewhere to store an entry at @index in the array.
+ * If there is already something stored at @index, this function does
+ * nothing.  If there was nothing there, the entry is marked as reserved.
+ * Loads from @index will continue to see a %NULL pointer until a
+ * subsequent store to @index.
+ *
+ * If you do not use the entry that you have reserved, call xa_release()
+ * or xa_erase() to free any unnecessary memory.
+ *
+ * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
+ */
+int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
+{
+   XA_STATE(xas, xa, index);
+   unsigned int lock_type = xa_lock_type(xa);
+   void *curr;
+
+   do {
+   if (lock_type == XA_LOCK_IRQ)
+   xas_lock_irq(&xas);
+   else if (lock_type == XA_LOCK_BH)
+   xas_lock_bh(&xas);
+   else
+   xas_lock(&xas);
+
+   curr = xas_create(&xas);
+   if (!curr)
+   xas_store(&xas, XA_ZERO_ENTRY);
+
+   if (lock_type == XA_LOCK_IRQ)
+   xas_unlock_irq(&xas);
+   else if (lock_type == XA_LOCK_BH)
+   xas_unlock_bh(&xas);
+   else
+   xas_unlock(&xas);
+   } while (xas_nomem(&xas, gfp));
+
+   return xas_error(&xas);
+}
+EXPORT_SYMBOL(xa_reserve);
+
 /**
  * __xa_set_tag() - Set this tag on this entry while locked.
  * @xa: XArray.
diff --git a/tools/testing/radix-tree/xarray-test.c b/tools/testing/radix-tree/xarray-test.c
index 4d3541ac31e9..fe38b53df2ab 100644
--- a/tools/testing/radix-tree/xarray-test.c
+++ b/tools/testing/radix-tree/xarray-test.c
@@ -502,6 +502,29 @@ void check_move(struct xarray *xa)
} while (i < (1 << 16));
 }
 
+void check_reserve(struct xarray *xa)
+{
+   assert(xa_empty(xa));
+   xa_reserve(xa, 12345678, GFP_KERNEL);
+   assert(!xa_empty(xa));
+   assert(!xa_load(xa, 12345678));
+   xa_release(xa, 12345678);
+   assert(xa_emp
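
A sketch of the multiple-lock use case from the commit message (not from
the patch); struct obj and its lock are made up, and storing with a 0 gfp
mask assumes the reserved slot means no further allocation is needed:

struct obj {
        spinlock_t lock;
        bool dead;
};

static int publish(struct xarray *xa, unsigned long index, struct obj *obj)
{
        int ret = xa_reserve(xa, index, GFP_KERNEL);

        if (ret)
                return ret;

        spin_lock(&obj->lock);          /* the second lock */
        if (obj->dead) {
                xa_release(xa, index);  /* give back the unused slot */
                ret = -ENOENT;
        } else {
                xa_store(xa, index, obj, 0);
        }
        spin_unlock(&obj->lock);

        return ret;
}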

Re: [RFC 1/2] softirq: Defer net rx/tx processing to ksoftirqd context

2018-01-17 Thread Linus Torvalds
On Wed, Jan 17, 2018 at 12:30 PM, David Miller  wrote:
>
> I wanted to chime in about this earlier, and make it clear that it isn't
> just IRQs that can trigger softirqs.  User context actions in the kernel
> can trigger softirqs too.

Yes, anybody can do that "raise_softirq()" thing, although the common
pattern is for it to be done from an interrupt that wants to defer
longer-running work to a non-irq-disabled context.

It was in some way always a "poor mans interrupt thread" (with no
blocking like a real thread context, but at least not impacting actual
interrupt latency).

That said, this made me wonder a bit. I wonder how bounded the latency
is for raising a softirq from process context. We only _check_ the
softirq on the last hardirq exit, I think.

The latency was traditionally bounded by the timer irq, and non-idle
CPUs should still be running timers. But I do note that networking
does seem to have some private hacks for the latency problem (ie
net_tx_action())?

I wonder if we should run softirqs on return to user mode (and make
softirq set a thread flag if not in interrupt context).

Although maybe some people actually want the "delay until next
interrupt" kind of behavior to throttle things if they do softirqs
from process context. That sounds unlikely, but who knows.

 Linus


[PATCH v6 20/99] ida: Convert to XArray

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Use the XArray infrastructure like we used the radix tree infrastructure.
This lets us get rid of idr_get_free() from the radix tree code.

Signed-off-by: Matthew Wilcox 
---
 include/linux/idr.h|   8 +-
 include/linux/radix-tree.h |   4 -
 lib/idr.c  | 320 ++---
 lib/radix-tree.c   | 119 -
 4 files changed, 187 insertions(+), 264 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 9064ae5f0abc..ad4199247301 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -232,11 +232,11 @@ struct ida_bitmap {
 DECLARE_PER_CPU(struct ida_bitmap *, ida_bitmap);
 
 struct ida {
-   struct radix_tree_root  ida_rt;
+   struct xarray   ida_xa;
 };
 
 #define IDA_INIT(name) {   \
-   .ida_rt = RADIX_TREE_INIT(name, IDR_INIT_FLAGS | GFP_NOWAIT),   \
+   .ida_xa = XARRAY_INIT_FLAGS(name.ida_xa, IDR_INIT_FLAGS)\
 }
 #define DEFINE_IDA(name)   struct ida name = IDA_INIT(name)
 
@@ -251,7 +251,7 @@ void ida_simple_remove(struct ida *ida, unsigned int id);
 
 static inline void ida_init(struct ida *ida)
 {
-   INIT_RADIX_TREE(&ida->ida_rt, IDR_INIT_FLAGS | GFP_NOWAIT);
+   xa_init_flags(&ida->ida_xa, IDR_INIT_FLAGS);
 }
 
 /**
@@ -268,6 +268,6 @@ static inline int ida_get_new(struct ida *ida, int *p_id)
 
 static inline bool ida_is_empty(const struct ida *ida)
 {
-   return radix_tree_empty(&ida->ida_rt);
+   return xa_empty(&ida->ida_xa);
 }
 #endif /* _LINUX_IDR_H */
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index f64beb9ba175..4c5c36414a80 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -302,10 +302,6 @@ int radix_tree_split(struct radix_tree_root *, unsigned long index,
 int radix_tree_join(struct radix_tree_root *, unsigned long index,
unsigned new_order, void *);
 
-void __rcu **idr_get_free(struct radix_tree_root *root,
- struct radix_tree_iter *iter, gfp_t gfp,
- unsigned long max);
-
 enum {
RADIX_TREE_ITER_TAG_MASK = 0x0f,/* tag index in lower nybble */
RADIX_TREE_ITER_TAGGED   = 0x10,/* lookup tagged slots */
diff --git a/lib/idr.c b/lib/idr.c
index 379eaa8cb75b..7e9a8850b613 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -13,7 +13,6 @@
 #include 
 
 DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap);
-static DEFINE_SPINLOCK(simple_ida_lock);
 
 /* In radix-tree.c temporarily */
 extern bool idr_nomem(struct xa_state *, gfp_t);
@@ -337,26 +336,23 @@ EXPORT_SYMBOL_GPL(idr_replace);
 /*
  * Developer's notes:
  *
- * The IDA uses the functionality provided by the IDR & radix tree to store
- * bitmaps in each entry.  The XA_FREE_TAG tag means there is at least one bit
- * free, unlike the IDR where it means at least one entry is free.
- *
- * I considered telling the radix tree that each slot is an order-10 node
- * and storing the bit numbers in the radix tree, but the radix tree can't
- * allow a single multiorder entry at index 0, which would significantly
- * increase memory consumption for the IDA.  So instead we divide the index
- * by the number of bits in the leaf bitmap before doing a radix tree lookup.
- *
- * As an optimisation, if there are only a few low bits set in any given
- * leaf, instead of allocating a 128-byte bitmap, we store the bits
+ * The IDA uses the functionality provided by the IDR & XArray to store
+ * bitmaps in each entry.  The XA_FREE_TAG tag is used to mean that there
+ * is at least one bit free, unlike the IDR where it means at least one
+ * array entry is free.
+ *
+ * The XArray supports multi-index entries, so I considered teaching the
+ * XArray that each slot is an order-10 node and indexing the XArray by the
+ * ID.  The XArray has the significant optimisation of storing the first
+ * entry in the struct xarray and avoiding allocating an xa_node.
+ * Unfortunately, it can't do that for multi-order entries.
+ * So instead the XArray index is the ID divided by the number of bits in
+ * the bitmap
+ *
+ * As a further optimisation, if there are only a few low bits set in any
+ * given leaf, instead of allocating a 128-byte bitmap, we store the bits
  * directly in the entry.
  *
- * We allow the radix tree 'exceptional' count to get out of date.  Nothing
- * in the IDA nor the radix tree code checks it.  If it becomes important
- * to maintain an accurate exceptional count, switch the rcu_assign_pointer()
- * calls to radix_tree_iter_replace() which will correct the exceptional
- * count.
- *
  * The IDA always requires a lock to alloc/free.  If we add a 'test_bit'
  * equivalent, it will still need locking.  Going to RCU lookup would require
  * using RCU to free bitmaps, and that's not trivial without embedding an
@@ -366,104 +362,114 @@ EXPORT_SYMBOL_GPL(idr_replace);
 
 #define IDA_MAX (0x8000
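
For context, the user-visible allocation pattern is unchanged by this
conversion; a minimal sketch using the ida_simple_*() API declared above
(use_id() is made up):

static int use_id(struct ida *ida)
{
        int id = ida_simple_get(ida, 0, 0, GFP_KERNEL);

        if (id < 0)
                return id;
        pr_info("got id %d\n", id);
        ida_simple_remove(ida, id);
        return 0;
}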

Re: [PATCH] drm/vc4: Flush the caches before the bin jobs, as well.

2018-01-17 Thread Eric Anholt
Eric Anholt  writes:
> If the frame samples from a render target that was just written, its
> cache flush during the binning step may have occurred before the
> previous frame's RCL was completed.  Flush the texture caches again
> before starting each RCL job to make sure that the sampling of the
> previous RCL's output is correct.
>
> Fixes flickering in the top left of 3DMMES Taiji.
>
> Signed-off-by: Eric Anholt 
> Fixes: ca26d28bbaa3 ("drm/vc4: improve throughput by pipelining binning and rendering jobs")

Whoops, in the subject, this should have been "before the *render* jobs."


signature.asc
Description: PGP signature


[PATCH v6 24/99] page cache: Add and replace pages using the XArray

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Use the XArray APIs to add and replace pages in the page cache.  This
removes two uses of the radix tree preload API and is significantly
shorter code.

Signed-off-by: Matthew Wilcox 
---
 include/linux/swap.h |   8 ++-
 mm/filemap.c | 143 ++-
 2 files changed, 67 insertions(+), 84 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index c2b8128799c1..394957963c4b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -299,8 +299,12 @@ void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
 void workingset_activation(struct page *page);
 
-/* Do not use directly, use workingset_lookup_update */
-void workingset_update_node(struct radix_tree_node *node);
+/* Only track the nodes of mappings with shadow entries */
+void workingset_update_node(struct xa_node *node);
+#define mapping_set_update(xas, mapping) do {  \
+   if (!dax_mapping(mapping) && !shmem_mapping(mapping))   \
+   xas_set_update(xas, workingset_update_node);\
+} while (0)
 
 /* Returns workingset_update_node() if the mapping has shadow entries. */
 #define workingset_lookup_update(mapping)  \
diff --git a/mm/filemap.c b/mm/filemap.c
index f1b4480723dd..e6371b551de1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -112,35 +112,6 @@
  *   ->tasklist_lock(memory_failure, collect_procs_ao)
  */
 
-static int page_cache_tree_insert(struct address_space *mapping,
- struct page *page, void **shadowp)
-{
-   struct radix_tree_node *node;
-   void **slot;
-   int error;
-
-   error = __radix_tree_create(&mapping->pages, page->index, 0,
-   &node, &slot);
-   if (error)
-   return error;
-   if (*slot) {
-   void *p;
-
-   p = radix_tree_deref_slot_protected(slot,
-   &mapping->pages.xa_lock);
-   if (!xa_is_value(p))
-   return -EEXIST;
-
-   mapping->nrexceptional--;
-   if (shadowp)
-   *shadowp = p;
-   }
-   __radix_tree_replace(&mapping->pages, node, slot, page,
-workingset_lookup_update(mapping));
-   mapping->nrpages++;
-   return 0;
-}
-
 static void page_cache_tree_delete(struct address_space *mapping,
   struct page *page, void *shadow)
 {
@@ -776,51 +747,44 @@ EXPORT_SYMBOL(file_write_and_wait_range);
  * locked.  This function does not add the new page to the LRU, the
  * caller must do that.
  *
- * The remove + add is atomic.  The only way this function can fail is
- * memory allocation failure.
+ * The remove + add is atomic.  This function cannot fail.
  */
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 {
-   int error;
+   struct address_space *mapping = old->mapping;
+   void (*freepage)(struct page *) = mapping->a_ops->freepage;
+   pgoff_t offset = old->index;
+   XA_STATE(xas, &mapping->pages, offset);
+   unsigned long flags;
 
VM_BUG_ON_PAGE(!PageLocked(old), old);
VM_BUG_ON_PAGE(!PageLocked(new), new);
VM_BUG_ON_PAGE(new->mapping, new);
 
-   error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
-   if (!error) {
-   struct address_space *mapping = old->mapping;
-   void (*freepage)(struct page *);
-   unsigned long flags;
-
-   pgoff_t offset = old->index;
-   freepage = mapping->a_ops->freepage;
-
-   get_page(new);
-   new->mapping = mapping;
-   new->index = offset;
+   get_page(new);
+   new->mapping = mapping;
+   new->index = offset;
 
-   xa_lock_irqsave(&mapping->pages, flags);
-   __delete_from_page_cache(old, NULL);
-   error = page_cache_tree_insert(mapping, new, NULL);
-   BUG_ON(error);
+   xas_lock_irqsave(&xas, flags);
+   xas_store(&xas, new);
 
-   /*
-* hugetlb pages do not participate in page cache accounting.
-*/
-   if (!PageHuge(new))
-   __inc_node_page_state(new, NR_FILE_PAGES);
-   if (PageSwapBacked(new))
-   __inc_node_page_state(new, NR_SHMEM);
-   xa_unlock_irqrestore(&mapping->pages, flags);
-   mem_cgroup_migrate(old, new);
-   radix_tree_preload_end();
-   if (freepage)
-   freepage(old);
-   put_page(old);
-   }
+   old->mapping = NULL;
+   /* hugetlb pages do not participate in page cache accounting. */
+   if (!PageHuge(old))
+   __dec_node_page_st
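
The xas pattern the new replace path uses, reduced to a sketch
(accounting, refcounting and freepage handling omitted; replace_slot()
is made up):

static void replace_slot(struct address_space *mapping, pgoff_t index,
                         struct page *new)
{
        XA_STATE(xas, &mapping->pages, index);
        unsigned long flags;

        xas_lock_irqsave(&xas, flags);
        xas_store(&xas, new);           /* replaces whatever was at index */
        xas_unlock_irqrestore(&xas, flags);
}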

[PATCH v6 23/99] page cache: Add page_cache_range_empty function

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

btrfs has its own custom function for determining whether the page cache
has any pages in a particular range.  Move this functionality to the
page cache, and call it from btrfs.

Signed-off-by: Matthew Wilcox 
---
 fs/btrfs/btrfs_inode.h  |  7 -
 fs/btrfs/inode.c| 70 -
 include/linux/pagemap.h |  2 ++
 mm/filemap.c| 26 ++
 4 files changed, 34 insertions(+), 71 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 63f0ccc92a71..a48bd6e0a0bb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -365,6 +365,11 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
logical_start, csum, csum_expected, mirror_num);
 }
 
-bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
+static inline bool btrfs_page_exists_in_range(struct inode *inode,
+   loff_t start, loff_t end)
+{
+   return page_cache_range_empty(inode->i_mapping, start >> PAGE_SHIFT,
+   end >> PAGE_SHIFT);
+}
 
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dbdb5bf6bca1..d7d2c556d5a2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7541,76 +7541,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
return ret;
 }
 
-bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
-{
-   struct radix_tree_root *root = &inode->i_mapping->pages;
-   bool found = false;
-   void **pagep = NULL;
-   struct page *page = NULL;
-   unsigned long start_idx;
-   unsigned long end_idx;
-
-   start_idx = start >> PAGE_SHIFT;
-
-   /*
-* end is the last byte in the last page.  end == start is legal
-*/
-   end_idx = end >> PAGE_SHIFT;
-
-   rcu_read_lock();
-
-   /* Most of the code in this while loop is lifted from
-* find_get_page.  It's been modified to begin searching from a
-* page and return just the first page found in that range.  If the
-* found idx is less than or equal to the end idx then we know that
-* a page exists.  If no pages are found or if those pages are
-* outside of the range then we're fine (yay!) */
-   while (page == NULL &&
-  radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
-   page = radix_tree_deref_slot(pagep);
-   if (unlikely(!page))
-   break;
-
-   if (radix_tree_exception(page)) {
-   if (radix_tree_deref_retry(page)) {
-   page = NULL;
-   continue;
-   }
-   /*
-* Otherwise, shmem/tmpfs must be storing a swap entry
-* here so return it without attempting to raise page
-* count.
-*/
-   page = NULL;
-   break; /* TODO: Is this relevant for this use case? */
-   }
-
-   if (!page_cache_get_speculative(page)) {
-   page = NULL;
-   continue;
-   }
-
-   /*
-* Has the page moved?
-* This is part of the lockless pagecache protocol. See
-* include/linux/pagemap.h for details.
-*/
-   if (unlikely(page != *pagep)) {
-   put_page(page);
-   page = NULL;
-   }
-   }
-
-   if (page) {
-   if (page->index <= end_idx)
-   found = true;
-   put_page(page);
-   }
-
-   rcu_read_unlock();
-   return found;
-}
-
 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
  struct extent_state **cached_state, int writing)
 {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0db127c3ccac..34d4fa3ad1c5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -245,6 +245,8 @@ pgoff_t page_cache_next_gap(struct address_space *mapping,
 pgoff_t index, unsigned long max_scan);
 pgoff_t page_cache_prev_gap(struct address_space *mapping,
 pgoff_t index, unsigned long max_scan);
+bool page_cache_range_empty(struct address_space *mapping,
+   pgoff_t index, pgoff_t max);
 
 #define FGP_ACCESSED   0x0001
 #define FGP_LOCK   0x0002
diff --git a/mm/filemap.c b/mm/filemap.c
index 146e8ec16ec0..f1b4480723dd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1398,6 +1398,32 @@ pgoff_t page_cache_prev_gap(struct address_space *mapping,
 }
 EXPORT_SYMBOL(page_cache_prev_gap);
 
+

[PATCH v6 22/99] page cache: Convert hole search to XArray

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

The page cache offers the ability to search for a miss in the previous or
next N locations.  Rather than teach the XArray about the page cache's
definition of a miss, use xas_prev() and xas_next() to search the page
array.  This should be more efficient as it does not have to start the
lookup from the top for each index.

Signed-off-by: Matthew Wilcox 
---
 fs/nfs/blocklayout/blocklayout.c |   2 +-
 include/linux/pagemap.h  |   4 +-
 mm/filemap.c | 110 ++-
 mm/readahead.c   |   4 +-
 4 files changed, 55 insertions(+), 65 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 995d707537da..7bd643538cff 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -826,7 +826,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (end != inode->i_mapping->nrpages) {
rcu_read_lock();
-   end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
+   end = page_cache_next_gap(mapping, idx + 1, ULONG_MAX);
rcu_read_unlock();
}
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 80a6149152d4..0db127c3ccac 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -241,9 +241,9 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x)
 
 typedef int filler_t(void *, struct page *);
 
-pgoff_t page_cache_next_hole(struct address_space *mapping,
+pgoff_t page_cache_next_gap(struct address_space *mapping,
 pgoff_t index, unsigned long max_scan);
-pgoff_t page_cache_prev_hole(struct address_space *mapping,
+pgoff_t page_cache_prev_gap(struct address_space *mapping,
 pgoff_t index, unsigned long max_scan);
 
 #define FGP_ACCESSED   0x0001
diff --git a/mm/filemap.c b/mm/filemap.c
index 309be963140c..146e8ec16ec0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1327,86 +1327,76 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 }
 
 /**
- * page_cache_next_hole - find the next hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
- *
- * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
- * lowest indexed hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'return - index >=
- * max_scan' will be true). In rare cases of index wrap-around, 0 will
- * be returned.
- *
- * page_cache_next_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 5, then subsequently a hole is created at
- * index 10, page_cache_next_hole covering both indexes may return 10
- * if called under rcu_read_lock.
+ * page_cache_next_gap() - Find the next gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
+ *
+ * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
+ * gap with the lowest index.
+ *
+ * This function may be called under the rcu_read_lock.  However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 5, then subsequently a gap is
+ * created at index 10, page_cache_next_gap covering both indices may
+ * return 10 if called under the rcu_read_lock.
+ *
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'return - index >= max_scan' will be true).
+ * In the rare case of index wrap-around, 0 will be returned.
  */
-pgoff_t page_cache_next_hole(struct address_space *mapping,
+pgoff_t page_cache_next_gap(struct address_space *mapping,
 pgoff_t index, unsigned long max_scan)
 {
-   unsigned long i;
+   XA_STATE(xas, &mapping->pages, index);
 
-   for (i = 0; i < max_scan; i++) {
-   struct page *page;
-
-   page = radix_tree_lookup(&mapping->pages, index);
-   if (!page || xa_is_value(page))
+   while (max_scan--) {
+   void *entry = xas_next(&xas);
+   if (!entry || xa_is_value(entry))
break;
-   index++;
-   if (index == 0)
+   if (xas.xa_index == 0)
break;
}
 
-   return index;
+   return xas.xa_index;
 }
-EXPORT_SYMBOL(page_cache_next_hole);
+EXPORT_SYMBOL(page_cache_next_gap);
 
 /**
- * page_cache_prev_hole - find the prev hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
- *
- * Search backwards in the range [max(index-max_scan+1, 0)
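
As a usage illustration (not part of the patch), a caller can turn the
gap search into a "how many pages are cached here" question; the helper
name below is hypothetical and min_t() is the usual kernel macro:

	/* Sketch: length of the cached run starting at @index. */
	static unsigned long cached_run_length(struct address_space *mapping,
					       pgoff_t index, unsigned long max_scan)
	{
		pgoff_t gap;

		rcu_read_lock();
		gap = page_cache_next_gap(mapping, index, max_scan);
		rcu_read_unlock();

		/* Per the kernel-doc above, gap - index >= max_scan means no gap. */
		return min_t(unsigned long, gap - index, max_scan);
	}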

Re: [PATCHv4 1/5] net: dsa: Support internal phy on 'cpu' port

2018-01-17 Thread David Miller
From: Sebastian Reichel 
Date: Tue, 16 Jan 2018 11:19:54 +0100

> This adds support for enabling the internal PHY for a 'cpu' port.
> It has been tested on GE B850v3, B650v3 and B450v3, which have a
> built-in MV88E6240 switch hardwired to a PCIe based network card
> making use of the internal PHY. Since the mv88e6xxx driver resets
> the chip during probe, the PHY is disabled without this patch,
> resulting in a missing link and a non-functional switch device.
> 
> Signed-off-by: Sebastian Reichel 

Andrew, Florian, Vivien, please review.


radeon 0000:01:00.0: swiotlb buffer is full (sz: 2097152 bytes)

2018-01-17 Thread Borislav Petkov
Hi guys,

seen this already?

I see it has happened during resume. Kernel is rc7+tip/master.

Box is stable otherwise while I'm working on it...

[66861.818432] usb 10-1: USB disconnect, device number 2
[75380.827447] perf: interrupt took too long (2527 > 2500), lowering kernel.perf_event_max_sample_rate to 79000
[94022.728431] radeon 0000:01:00.0: swiotlb buffer is full (sz: 2097152 bytes)
[94022.735717] swiotlb: coherent allocation failed for device 0000:01:00.0 size=2097152
[94022.743525] CPU: 2 PID: 3069 Comm: Xorg Not tainted 4.15.0-rc7+ #4
[94022.749711] Hardware name: To be filled by O.E.M. To be filled by O.E.M./M5A97 EVO R2.0, BIOS 1503 01/16/2013
[94022.749711] Call Trace:
[94022.749717]  dump_stack+0x67/0x8f
[94022.749720]  swiotlb_alloc_coherent+0x169/0x170
[94022.749726]  ttm_dma_pool_get_pages+0x1ea/0x450 [ttm]
[94022.749731]  ttm_dma_populate+0x248/0x330 [ttm]
[94022.749734]  ttm_tt_bind+0x23/0x50 [ttm]
[94022.749737]  ttm_bo_handle_move_mem+0x3a1/0x3e0 [ttm]
[94022.749741]  ? ttm_bo_mem_space+0x3bc/0x4a0 [ttm]
[94022.749744]  ttm_bo_validate+0x139/0x150 [ttm]
[94022.749746]  ? _raw_write_unlock+0x12/0x30
[94022.749748]  ? drm_vma_offset_add+0x6a/0x90
[94022.749751]  ttm_bo_init_reserved+0x3a5/0x470 [ttm]
[94022.749754]  ttm_bo_init+0x4d/0xb0 [ttm]
[94022.749778]  ? radeon_update_memory_usage.isra.0+0x60/0x60 [radeon]
[94022.749784]  ? drm_gem_object_init+0x31/0x50
[94022.749796]  radeon_bo_create+0x1bf/0x290 [radeon]
[94022.749809]  ? radeon_update_memory_usage.isra.0+0x60/0x60 [radeon]
[94022.749822]  radeon_gem_object_create+0xa9/0x1b0 [radeon]
[94022.749835]  ? radeon_gem_pwrite_ioctl+0x30/0x30 [radeon]
[94022.749848]  radeon_gem_create_ioctl+0x6a/0xf0 [radeon]
[94022.749862]  ? radeon_gem_pwrite_ioctl+0x30/0x30 [radeon]
[94022.749863]  drm_ioctl_kernel+0x6e/0xd0
[94022.749865]  ? unix_state_double_unlock+0x30/0x30
[94022.749866]  drm_ioctl+0x33b/0x3f0
[94022.749879]  ? radeon_gem_pwrite_ioctl+0x30/0x30 [radeon]
[94022.749881]  ? preempt_count_sub+0xa8/0x100
[94022.749882]  ? _raw_spin_unlock_irqrestore+0x25/0x50
[94022.749883]  ? preempt_count_sub+0xa8/0x100
[94022.749894]  radeon_drm_ioctl+0x5d/0xa0 [radeon]
[94022.749896]  do_vfs_ioctl+0xa2/0x600
[94022.749898]  ? __fget+0x67/0xb0
[94022.749899]  SyS_ioctl+0x4c/0x90
[94022.749901]  entry_SYSCALL_64_fastpath+0x22/0x8a
[94022.749902] RIP: 0033:0x7f28b58145e7
[94022.749903] RSP: 002b:00007ffdac6c6948 EFLAGS: 00000246
[94022.770322] radeon 0000:01:00.0: swiotlb buffer is full (sz: 2097152 bytes)
[94022.770323] swiotlb: coherent allocation failed for device 0000:01:00.0 size=2097152

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.


[PATCH v6 29/99] page cache: Convert filemap_range_has_page to XArray

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Instead of calling find_get_pages_range() and putting any reference,
just use xa_find() to look for a page in the right range.

Signed-off-by: Matthew Wilcox 
---
 mm/filemap.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 2536fcacb5bc..cd01f353cf6a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -461,18 +461,11 @@ bool filemap_range_has_page(struct address_space *mapping,
 {
pgoff_t index = start_byte >> PAGE_SHIFT;
pgoff_t end = end_byte >> PAGE_SHIFT;
-   struct page *page;
 
if (end_byte < start_byte)
return false;
 
-   if (mapping->nrpages == 0)
-   return false;
-
-   if (!find_get_pages_range(mapping, &index, end, 1, &page))
-   return false;
-   put_page(page);
-   return true;
+   return xa_find(&mapping->pages, &index, end, XA_PRESENT);
 }
 EXPORT_SYMBOL(filemap_range_has_page);
 
-- 
2.15.1
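
A usage note (illustration, not from the patch): the new one-liner relies
on xa_find() returning the first present entry in the range, or NULL, and
on its updating of the index cursor, which filemap_range_has_page()
simply discards.  A hedged sketch of the underlying call:

	/* Sketch: locate the first present page in [first, last], if any. */
	pgoff_t index = first;
	struct page *page;

	page = xa_find(&mapping->pages, &index, last, XA_PRESENT);
	if (page) {
		/* @index now holds the found page's index */
	}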



[PATCH v6 27/99] page cache: Convert delete_batch to XArray

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Rename the function from page_cache_tree_delete_batch to just
page_cache_delete_batch.

Signed-off-by: Matthew Wilcox 
---
 mm/filemap.c | 28 +---
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 317a89df1945..d2a0031d61f5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -276,7 +276,7 @@ void delete_from_page_cache(struct page *page)
 EXPORT_SYMBOL(delete_from_page_cache);
 
 /*
- * page_cache_tree_delete_batch - delete several pages from page cache
+ * page_cache_delete_batch - delete several pages from page cache
  * @mapping: the mapping to which pages belong
  * @pvec: pagevec with pages to delete
  *
@@ -289,23 +289,18 @@ EXPORT_SYMBOL(delete_from_page_cache);
  *
  * The function expects xa_lock to be held.
  */
-static void
-page_cache_tree_delete_batch(struct address_space *mapping,
+static void page_cache_delete_batch(struct address_space *mapping,
 struct pagevec *pvec)
 {
-   struct radix_tree_iter iter;
-   void **slot;
+   XA_STATE(xas, &mapping->pages, pvec->pages[0]->index);
int total_pages = 0;
int i = 0, tail_pages = 0;
struct page *page;
-   pgoff_t start;
 
-   start = pvec->pages[0]->index;
-   radix_tree_for_each_slot(slot, &mapping->pages, &iter, start) {
+   mapping_set_update(&xas, mapping);
+   xas_for_each(&xas, page, ULONG_MAX) {
if (i >= pagevec_count(pvec) && !tail_pages)
break;
-   page = radix_tree_deref_slot_protected(slot,
-  &mapping->pages.xa_lock);
if (xa_is_value(page))
continue;
if (!tail_pages) {
@@ -314,8 +309,11 @@ page_cache_tree_delete_batch(struct address_space *mapping,
 * have our pages locked so they are protected from
 * being removed.
 */
-   if (page != pvec->pages[i])
+   if (page != pvec->pages[i]) {
+   VM_BUG_ON_PAGE(page->index >
+   pvec->pages[i]->index, page);
continue;
+   }
WARN_ON_ONCE(!PageLocked(page));
if (PageTransHuge(page) && !PageHuge(page))
tail_pages = HPAGE_PMD_NR - 1;
@@ -326,11 +324,11 @@ page_cache_tree_delete_batch(struct address_space *mapping,
 */
i++;
} else {
+   VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
+   != pvec->pages[i]->index, page);
tail_pages--;
}
-   radix_tree_clear_tags(&mapping->pages, iter.node, slot);
-   __radix_tree_replace(&mapping->pages, iter.node, slot, NULL,
-   workingset_lookup_update(mapping));
+   xas_store(&xas, NULL);
total_pages++;
}
mapping->nrpages -= total_pages;
@@ -351,7 +349,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
 
unaccount_page_cache_page(mapping, pvec->pages[i]);
}
-   page_cache_tree_delete_batch(mapping, pvec);
+   page_cache_delete_batch(mapping, pvec);
xa_unlock_irqrestore(&mapping->pages, flags);
 
for (i = 0; i < pagevec_count(pvec); i++)
-- 
2.15.1
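
For readers new to the XArray idiom, a minimal sketch of the
walk-and-erase pattern the converted function uses (xa_lock held, as the
kernel-doc above requires; first_index stands in for
pvec->pages[0]->index):

	XA_STATE(xas, &mapping->pages, first_index);
	struct page *page;

	xas_for_each(&xas, page, ULONG_MAX) {
		if (xa_is_value(page))	/* skip shadow/swap entries */
			continue;
		xas_store(&xas, NULL);	/* erase the slot under the cursor */
	}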



[PATCH v6 30/99] mm: Convert page-writeback to XArray

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Includes moving mapping_tagged() to fs.h as a static inline, and
changing it to return bool.

Signed-off-by: Matthew Wilcox 
---
 include/linux/fs.h  | 17 +--
 mm/page-writeback.c | 62 +++--
 2 files changed, 32 insertions(+), 47 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e4345c13e237..c58bc3c619bf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -470,15 +470,18 @@ struct block_device {
struct mutexbd_fsfreeze_mutex;
 } __randomize_layout;
 
+/* XArray tags, for tagging dirty and writeback pages in the pagecache. */
+#define PAGECACHE_TAG_DIRTYXA_TAG_0
+#define PAGECACHE_TAG_WRITEBACKXA_TAG_1
+#define PAGECACHE_TAG_TOWRITE  XA_TAG_2
+
 /*
- * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
- * radix trees
+ * Returns true if any of the pages in the mapping are marked with the tag.
  */
-#define PAGECACHE_TAG_DIRTY0
-#define PAGECACHE_TAG_WRITEBACK1
-#define PAGECACHE_TAG_TOWRITE  2
-
-int mapping_tagged(struct address_space *mapping, int tag);
+static inline bool mapping_tagged(struct address_space *mapping, xa_tag_t tag)
+{
+   return xa_tagged(&mapping->pages, tag);
+}
 
 static inline void i_mmap_lock_write(struct address_space *mapping)
 {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 588ce729d199..0407436a8305 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2098,33 +2098,25 @@ void __init page_writeback_init(void)
  * dirty pages in the file (thus it is important for this function to be quick
  * so that it can tag pages faster than a dirtying process can create them).
  */
-/*
- * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce xa_lock latency.
- */
 void tag_pages_for_writeback(struct address_space *mapping,
 pgoff_t start, pgoff_t end)
 {
-#define WRITEBACK_TAG_BATCH 4096
-   unsigned long tagged = 0;
-   struct radix_tree_iter iter;
-   void **slot;
+   XA_STATE(xas, &mapping->pages, start);
+   unsigned int tagged = 0;
+   void *page;
 
-   xa_lock_irq(&mapping->pages);
-   radix_tree_for_each_tagged(slot, &mapping->pages, &iter, start,
-   PAGECACHE_TAG_DIRTY) {
-   if (iter.index > end)
-   break;
-   radix_tree_iter_tag_set(&mapping->pages, &iter,
-   PAGECACHE_TAG_TOWRITE);
-   tagged++;
-   if ((tagged % WRITEBACK_TAG_BATCH) != 0)
+   xas_lock_irq(&xas);
+   xas_for_each_tag(&xas, page, end, PAGECACHE_TAG_DIRTY) {
+   xas_set_tag(&xas, PAGECACHE_TAG_TOWRITE);
+   if (++tagged % XA_CHECK_SCHED)
continue;
-   slot = radix_tree_iter_resume(slot, &iter);
-   xa_unlock_irq(&mapping->pages);
+
+   xas_pause(&xas);
+   xas_unlock_irq(&xas);
cond_resched();
-   xa_lock_irq(&mapping->pages);
+   xas_lock_irq(&xas);
}
-   xa_unlock_irq(&mapping->pages);
+   xas_unlock_irq(&xas);
 }
 EXPORT_SYMBOL(tag_pages_for_writeback);
 
@@ -2164,7 +2156,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
-   int tag;
+   xa_tag_t tag;
 
pagevec_init(&pvec);
if (wbc->range_cyclic) {
@@ -2445,7 +2437,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
 
 /*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
- * its radix tree.
+ * the xarray.
  *
  * This is also used when a single buffer is being dirtied: we want to set the
  * page dirty in that case, but not all the buffers.  This is a "bottom-up"
@@ -2471,7 +2463,7 @@ int __set_page_dirty_nobuffers(struct page *page)
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
-   radix_tree_tag_set(&mapping->pages, page_index(page),
+   __xa_set_tag(&mapping->pages, page_index(page),
   PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->pages, flags);
unlock_page_memcg(page);
@@ -2634,13 +2626,13 @@ EXPORT_SYMBOL(__cancel_dirty_page);
  * Returns true if the page was previously dirty.
  *
  * This is for preparing to put the page under writeout.  We leave the page
- * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * tagged as dirty in the xarray so that a concurrent write-for-sync
  * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
  * implementation will run either set_page_writeback() or set_page_dirty(),
- * at which stage we bri
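
An aside on what the TOWRITE tag buys (a sketch from my reading of
write_cache_pages(), whose tag variable becomes xa_tag_t in the hunk
above; treat the condition as an assumption, not part of this diff):

	/* Sync writeback walks the TOWRITE tag set by
	 * tag_pages_for_writeback(); background writeback walks DIRTY. */
	xa_tag_t tag;

	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;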

[PATCH v6 31/99] mm: Convert workingset to XArray

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

We construct a fake XA_STATE and use it to delete the node with xa_store()
rather than adding a special function for this unique use case.

Signed-off-by: Matthew Wilcox 
---
 include/linux/swap.h |  9 -
 mm/workingset.c  | 51 ++-
 2 files changed, 22 insertions(+), 38 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 394957963c4b..e519554730fa 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -306,15 +306,6 @@ void workingset_update_node(struct xa_node *node);
xas_set_update(xas, workingset_update_node);\
 } while (0)
 
-/* Returns workingset_update_node() if the mapping has shadow entries. */
-#define workingset_lookup_update(mapping)  \
-({ \
-   radix_tree_update_node_t __helper = workingset_update_node; \
-   if (dax_mapping(mapping) || shmem_mapping(mapping)) \
-   __helper = NULL;\
-   __helper;   \
-})
-
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
diff --git a/mm/workingset.c b/mm/workingset.c
index 91b6e16ad4c1..f7ca6ea5d8b1 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -148,7 +148,7 @@
  * and activations is maintained (node->inactive_age).
  *
  * On eviction, a snapshot of this counter (along with some bits to
- * identify the node) is stored in the now empty page cache radix tree
+ * identify the node) is stored in the now empty page cache
  * slot of the evicted page.  This is called a shadow entry.
  *
  * On cache misses for which there are shadow entries, an eligible
@@ -162,7 +162,7 @@
 
 /*
  * Eviction timestamps need to be able to cover the full range of
- * actionable refaults. However, bits are tight in the radix tree
+ * actionable refaults. However, bits are tight in the xarray
  * entry, and after storing the identifier for the lruvec there might
  * not be enough left to represent every single actionable refault. In
  * that case, we have to sacrifice granularity for distance, and group
@@ -338,7 +338,7 @@ void workingset_activation(struct page *page)
 
 static struct list_lru shadow_nodes;
 
-void workingset_update_node(struct radix_tree_node *node)
+void workingset_update_node(struct xa_node *node)
 {
/*
 * Track non-empty nodes that contain only shadow entries;
@@ -370,7 +370,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
local_irq_enable();
 
/*
-* Approximate a reasonable limit for the radix tree nodes
+* Approximate a reasonable limit for the nodes
 * containing shadow entries. We don't need to keep more
 * shadow entries than possible pages on the active list,
 * since refault distances bigger than that are dismissed.
@@ -385,11 +385,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 * worst-case density of 1/8th. Below that, not all eligible
 * refaults can be detected anymore.
 *
-* On 64-bit with 7 radix_tree_nodes per page and 64 slots
+* On 64-bit with 7 xa_nodes per page and 64 slots
 * each, this will reclaim shadow entries when they consume
 * ~1.8% of available memory:
 *
-* PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
+* PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
 */
if (sc->memcg) {
cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
@@ -398,7 +398,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
}
-   max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
+   max_nodes = cache >> (XA_CHUNK_SHIFT - 3);
 
if (nodes <= max_nodes)
return 0;
@@ -408,11 +408,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 static enum lru_status shadow_lru_isolate(struct list_head *item,
  struct list_lru_one *lru,
  spinlock_t *lru_lock,
- void *arg)
+ void *arg) __must_hold(lru_lock)
 {
+   XA_STATE(xas, NULL, 0);
struct address_space *mapping;
-   struct radix_tree_node *node;
-   unsigned int i;
+   struct xa_node *node;
int ret;
 
/*
@@ -420,7 +420,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 * the shadow node LRU under the mapping->pages.xa_lock and the
 * lru_lock.  Because the page cache tree is em
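
To make the "~1.8% of available memory" figure concrete, a worked
example with the numbers from the comment above (64 slots and roughly 7
nodes per 4K page; XA_CHUNK_SHIFT = 6 is my assumption for a 64-slot
chunk):

	/*
	 * max_nodes = cache >> (XA_CHUNK_SHIFT - 3) = cache >> 3 = cache / 8
	 *
	 * memory held by shadow nodes at that limit:
	 *     (cache / 8) nodes * (PAGE_SIZE / 7) bytes per node
	 *   = cache * PAGE_SIZE / 56
	 *
	 * i.e. 1/56 ~ 1.8% of the memory the cached pages themselves occupy.
	 */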

[PATCH v6 34/99] mm: Convert delete_from_swap_cache to XArray

2018-01-17 Thread Matthew Wilcox
From: Matthew Wilcox 

Both callers of __delete_from_swap_cache have the swp_entry_t already,
so pass that in to make constructing the XA_STATE easier.

Signed-off-by: Matthew Wilcox 
---
 include/linux/swap.h |  5 +++--
 mm/swap_state.c  | 24 ++--
 mm/vmscan.c  |  2 +-
 3 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index e519554730fa..8eb99229dbc0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -408,7 +408,7 @@ extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *page);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
 extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
-extern void __delete_from_swap_cache(struct page *);
+extern void __delete_from_swap_cache(struct page *, swp_entry_t entry);
 extern void delete_from_swap_cache(struct page *);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
@@ -583,7 +583,8 @@ static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
return -1;
 }
 
-static inline void __delete_from_swap_cache(struct page *page)
+static inline void __delete_from_swap_cache(struct page *page,
+   swp_entry_t entry)
 {
 }
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index a57b5ad4c503..219e3b4f09e6 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -154,23 +154,22 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
  * This must be called only on pages that have
  * been verified to be in the swap cache.
  */
-void __delete_from_swap_cache(struct page *page)
+void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
 {
-   struct address_space *address_space;
+   struct address_space *address_space = swap_address_space(entry);
int i, nr = hpage_nr_pages(page);
-   swp_entry_t entry;
-   pgoff_t idx;
+   pgoff_t idx = swp_offset(entry);
+   XA_STATE(xas, &address_space->pages, idx);
 
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
VM_BUG_ON_PAGE(PageWriteback(page), page);
 
-   entry.val = page_private(page);
-   address_space = swap_address_space(entry);
-   idx = swp_offset(entry);
for (i = 0; i < nr; i++) {
-   radix_tree_delete(&address_space->pages, idx + i);
+   void *entry = xas_store(&xas, NULL);
+   VM_BUG_ON_PAGE(entry != page + i, entry);
set_page_private(page + i, 0);
+   xas_next(&xas);
}
ClearPageSwapCache(page);
address_space->nrpages -= nr;
@@ -246,14 +245,11 @@ int add_to_swap(struct page *page)
  */
 void delete_from_swap_cache(struct page *page)
 {
-   swp_entry_t entry;
-   struct address_space *address_space;
-
-   entry.val = page_private(page);
+   swp_entry_t entry = { .val = page_private(page) };
+   struct address_space *address_space = swap_address_space(entry);
 
-   address_space = swap_address_space(entry);
xa_lock_irq(&address_space->pages);
-   __delete_from_swap_cache(page);
+   __delete_from_swap_cache(page, entry);
xa_unlock_irq(&address_space->pages);
 
put_swap_page(page, entry);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2fa675a2db31..51d437a18db8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -718,7 +718,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
-   __delete_from_swap_cache(page);
+   __delete_from_swap_cache(page, swap);
xa_unlock_irqrestore(&mapping->pages, flags);
put_swap_page(page, swap);
} else {
-- 
2.15.1
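
A note on the new loop (illustrative): xas_store(&xas, NULL) erases the
slot the cursor is on and xas_next() advances one index, so the nr slots
of a compound page are cleared in sequence.  The bare cursor pattern, as
a sketch:

	/* Sketch: clear @nr consecutive entries starting at @first. */
	XA_STATE(xas, &address_space->pages, first);
	int i;

	for (i = 0; i < nr; i++) {
		xas_store(&xas, NULL);	/* erase the slot under the cursor */
		xas_next(&xas);		/* step to the next index */
	}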



Re: Reply: Reply: [PATCH v6] mfd: Add support for RTS5250S power saving

2018-01-17 Thread Bjorn Helgaas
On Wed, Dec 27, 2017 at 05:37:50PM -0600, Bjorn Helgaas wrote:
> On Tue, Dec 19, 2017 at 08:15:24AM +0000, 冯锐 wrote:
> > > On Fri, Dec 15, 2017 at 09:42:45AM +0000, 冯锐 wrote:
> > > > > [+cc Hans, Dave, linux-pci]
> > > > >
> > > > > On Thu, Sep 07, 2017 at 04:26:39PM +0800, rui_f...@realsil.com.cn wrote:
> > > > > > From: Rui Feng 
> > > > >
> > > > > I wish this had been posted to linux-pci before being merged.
> > > > >
> > > > > I'm concerned because some of this appears to overlap and conflict
> > > > > with PCI core management of ASPM.
> > > > >
> > > > > I assume these devices advertise ASPM support in their Link
> > > > > Capabilities registers, right?  If so, why isn't the existing PCI
> > > > > core ASPM support sufficient?
> > > > >
> > > > When L1SS is configured, the device (hardware) can't enter L1SS status
> > > > automatically; it needs the driver (software) to do some work to
> > > > achieve the function.
> > > 
> > > So this is a hardware defect in the device?  As far as I know, ASPM and
> > > L1SS are specified such that they should work without special driver
> > > support.
> > > 
> > Yes, you can say that.
> > 
> > > > > > Enable power saving for RTS5250S with the following steps:
> > > > > > 1.Set 0xFE58 to enable clock power management.
> > > > >
> > > > > Is this clock power management something specific to RTS5250S, or is
> > > > > it standard PCIe architected stuff?
> > > > >
> > > > 0xFE58 is a register specific to RTS5250S, not standard PCIe
> > > > architected stuff.
> > > 
> > > OK.  I asked because devices often mirror architected PCIe config things
> > > in device-specific MMIO space, and if I squint just right, I can sort of
> > > match up the register bits you used with things in the PCIe spec.
> > > 
> > > > > > 2.Check cfg space whether support L1SS or not.
> > > > >
> > > > > This sounds like standard PCIe ASPM L1 Substates, right?
> > > > >
> > > > Yes.
> > > >
> > > > > > 3. If L1SS is supported, set 0xFF03 to free clkreq.
> > > > > > 4. When entering idle status, enable ASPM
> > > > > >   and set parameters for L1SS and LTR.
> > > > > > 5. When entering run status, disable ASPM
> > > > > >   and set parameters for L1SS and LTR.
> > > > >
> > > > > In general, drivers should not configure ASPM, L1SS, and LTR
> > > > > themselves; the PCI core should do that.
> > > > >
> > > > > If a driver needs to tweak ASPM at run-time, it should use
> > > > > interfaces exported by the PCI core to do so.
> > > > >
> > > > Which interface can I use to set ASPM? I use "pci_write_config_byte" now.
> > > 
> > > What do you need to do?  include/linux/pci-aspm.h exports
> > > pci_disable_link_state(), which is mainly used to avoid ASPM
> > > states that have hardware errata.
> > > 
> > I want to enable ASPM (L0 -> L1) and disable ASPM (L1 -> L0); which
> > interface can I use?
> 
> You can use pci_disable_link_state() to disable usage of L1.
> 
> Currently there is no corresponding pci_enable_link_state().  What if
> we added something like the following (untested)?  Would that work for
> you?

Hi Rui,

Any thoughts on the patch below?

> commit 209930d809fa602b8aafdd171b26719cee6c6649
> Author: Bjorn Helgaas 
> Date:   Wed Dec 27 16:56:26 2017 -0600
> 
> PCI/ASPM: Add pci_enable_link_state()
> 
> Some drivers want control over the ASPM states their device is allowed to
> use.  We already have a pci_disable_link_state(), and drivers can use that
> to prevent the device from entering L0s or L1.
> 
> Add a corresponding pci_enable_link_state() so a driver can enable use of
> L0s or L1 again.
> 
> diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
> index 3b9b4d50cd98..ca217195f800 100644
> --- a/drivers/pci/pcie/aspm.c
> +++ b/drivers/pci/pcie/aspm.c
> @@ -1028,6 +1028,67 @@ void pcie_aspm_powersave_config_link(struct pci_dev *pdev)
>   up_read(&pci_bus_sem);
>  }
>  
> +/**
> + * pci_enable_link_state - Enable device's link state, so the link may
> + * enter specific states.  Note that if the BIOS didn't grant ASPM
> + * control to the OS, this does nothing because we can't touch the LNKCTL
> + * register.
> + *
> + * @pdev: PCI device
> + * @state: ASPM link state to enable
> + */
> +void pci_enable_link_state(struct pci_dev *pdev, int state)
> +{
> + struct pci_dev *parent = pdev->bus->self;
> + struct pcie_link_state *link;
> + u32 lnkcap;
> +
> + if (!pci_is_pcie(pdev))
> + return;
> +
> + if (pdev->has_secondary_link)
> + parent = pdev;
> + if (!parent || !parent->link_state)
> + return;
> +
> + /*
> +  * A driver requested that ASPM be enabled on this device, but
> +  * if we don't have permission to manage ASPM (e.g., on ACPI
> +  * systems we have to observe the FADT ACPI_FADT_NO_ASPM bit and
> +  * the _OSC method), we can't honor that request.  Windows has
> +  * a similar mechanism using "PciASPMOptOut", which is also
> +  * ignored in this
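
If the helper above lands, the runtime flow asked about earlier would
presumably look like this (a sketch assuming pci_enable_link_state() is
merged as quoted; PCIE_LINK_STATE_L1 is the existing flag from
include/linux/pci.h, and the function names are hypothetical):

	/* Sketch: gate ASPM L1 around device activity from a driver. */
	static void rts5250s_enter_idle(struct pci_dev *pdev)
	{
		/* allow the link to enter L1 while idle */
		pci_enable_link_state(pdev, PCIE_LINK_STATE_L1);
	}

	static void rts5250s_enter_run(struct pci_dev *pdev)
	{
		/* keep the link out of L1 while busy */
		pci_disable_link_state(pdev, PCIE_LINK_STATE_L1);
	}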
