Re: Tearing down DMA transfer setup after DMA client has finished

2016-12-08 Thread Måns Rullgård
Vinod Koul  writes:

> To make it efficient, disregarding your Sbox HW issue, the solution is
> virtual channels. You can delink physical channels and virtual channels. If
> one has SW controlled MUX then a channel can service any client. For few
> controllers request lines are hard wired so they cant use any channel. But
> if you dont have this restriction then driver can queue up many transactions
> from different controllers.

Have you been paying attention at all?  This is exactly what the driver
ALREADY DOES.

-- 
Måns Rullgård


[PATCH v18 13/15] acpi/arm64: Add memory-mapped timer support in GTDT driver

2016-12-08 Thread fu . wei
From: Fu Wei 

On platforms booting with ACPI, architected memory-mapped timers'
configuration data is provided by firmware through the ACPI GTDT
static table.

The clocksource architected timer kernel driver requires a firmware
interface to collect timer configuration and configure its driver.
This infrastructure is present for device tree systems, but it is
missing on systems booting with ACPI.

Implement the kernel infrastructure required to parse the static
ACPI GTDT table so that the architected timer clocksource driver can
make use of it on systems booting with ACPI, therefore enabling
the corresponding timers configuration.

Signed-off-by: Fu Wei 
Signed-off-by: Hanjun Guo 
---
 drivers/acpi/arm64/gtdt.c | 124 ++
 include/linux/acpi.h  |   1 +
 2 files changed, 125 insertions(+)

diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c
index d93a790..91ea6cb 100644
--- a/drivers/acpi/arm64/gtdt.c
+++ b/drivers/acpi/arm64/gtdt.c
@@ -37,6 +37,28 @@ struct acpi_gtdt_descriptor {
 
 static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata;
 
+static inline void *next_platform_timer(void *platform_timer)
+{
+   struct acpi_gtdt_header *gh = platform_timer;
+
+   platform_timer += gh->length;
+   if (platform_timer < acpi_gtdt_desc.gtdt_end)
+   return platform_timer;
+
+   return NULL;
+}
+
+#define for_each_platform_timer(_g)\
+   for (_g = acpi_gtdt_desc.platform_timer; _g;\
+_g = next_platform_timer(_g))
+
+static inline bool is_timer_block(void *platform_timer)
+{
+   struct acpi_gtdt_header *gh = platform_timer;
+
+   return gh->type == ACPI_GTDT_TYPE_TIMER_BLOCK;
+}
+
 static int __init map_gt_gsi(u32 interrupt, u32 flags)
 {
int trigger, polarity;
@@ -155,3 +177,105 @@ int __init acpi_gtdt_init(struct acpi_table_header *table,
 
return ret;
 }
+
+static int __init gtdt_parse_timer_block(struct acpi_gtdt_timer_block *block,
+struct arch_timer_mem *data)
+{
+   int i;
+   struct acpi_gtdt_timer_entry *frame;
+
+   if (!block->timer_count) {
+   pr_err(FW_BUG "GT block present, but frame count is zero.");
+   return -ENODEV;
+   }
+
+   if (block->timer_count > ARCH_TIMER_MEM_MAX_FRAMES) {
+   pr_err(FW_BUG "GT block lists %d frames, ACPI spec only allows 
8\n",
+  block->timer_count);
+   return -EINVAL;
+   }
+
+   data->cntctlbase = (phys_addr_t)block->block_address;
+   /*
+* According to "Table * CNTCTLBase memory map" of
+*  for ARMv8,
+* The size of the CNTCTLBase frame is 4KB (Offset 0x000 – 0xFFC).
+*/
+   data->size = SZ_4K;
+   data->num_frames = block->timer_count;
+
+   frame = (void *)block + block->timer_offset;
+   if (frame + block->timer_count != (void *)block + block->header.length)
+   return -EINVAL;
+
+   /*
+* Get the GT timer Frame data for every GT Block Timer
+*/
+   for (i = 0; i < block->timer_count; i++, frame++) {
+   if (!frame->base_address || !frame->timer_interrupt)
+   return -EINVAL;
+
+   data->frame[i].phys_irq = map_gt_gsi(frame->timer_interrupt,
+frame->timer_flags);
+   if (data->frame[i].phys_irq <= 0) {
+   pr_warn("failed to map physical timer irq in frame 
%d.\n",
+   i);
+   return -EINVAL;
+   }
+
+   data->frame[i].virt_irq =
+   map_gt_gsi(frame->virtual_timer_interrupt,
+  frame->virtual_timer_flags);
+   if (data->frame[i].virt_irq <= 0) {
+   pr_warn("failed to map virtual timer irq in frame 
%d.\n",
+   i);
+   acpi_unregister_gsi(frame->timer_interrupt);
+   return -EINVAL;
+   }
+
+   data->frame[i].frame_nr = frame->frame_number;
+   data->frame[i].cntbase = frame->base_address;
+   /*
+* According to "Table * CNTBaseN memory map" of
+*  for ARMv8,
+* The size of the CNTBaseN frame is 4KB (Offset 0x000 – 0xFFC).
+*/
+   data->frame[i].size = SZ_4K;
+   }
+
+   return 0;
+}
+
+/**
+ * acpi_arch_timer_mem_init() - Get the info of all GT blocks in GTDT table.
+ * @data:  the pointer to the array of struct arch_timer_mem for returning
+ * the result of parsing. The element number of this array should
+ * be platform_timer_count(the total number of platform timers).
+ * @count: The pointer of int variate 

[PATCH v18 11/15] acpi/arm64: Add GTDT table parse driver

2016-12-08 Thread fu . wei
From: Fu Wei 

This patch adds support for parsing arch timer info in GTDT,
provides some kernel APIs to parse all the PPIs and
always-on info in GTDT and export them.

By this driver, we can simplify arm_arch_timer drivers, and
separate the ACPI GTDT knowledge from it.

Signed-off-by: Fu Wei 
Signed-off-by: Hanjun Guo 
Acked-by: Rafael J. Wysocki 
Tested-by: Xiongfeng Wang 
---
 arch/arm64/Kconfig  |   1 +
 drivers/acpi/arm64/Kconfig  |   3 +
 drivers/acpi/arm64/Makefile |   1 +
 drivers/acpi/arm64/gtdt.c   | 157 
 include/linux/acpi.h|   6 ++
 5 files changed, 168 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 969ef88..4277a21 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2,6 +2,7 @@ config ARM64
def_bool y
select ACPI_CCA_REQUIRED if ACPI
select ACPI_GENERIC_GSI if ACPI
+   select ACPI_GTDT if ACPI
select ACPI_REDUCED_HARDWARE_ONLY if ACPI
select ACPI_MCFG if ACPI
select ACPI_SPCR_TABLE if ACPI
diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig
index 4616da4..5a6f80f 100644
--- a/drivers/acpi/arm64/Kconfig
+++ b/drivers/acpi/arm64/Kconfig
@@ -4,3 +4,6 @@
 
 config ACPI_IORT
bool
+
+config ACPI_GTDT
+   bool
diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile
index 72331f2..1017def 100644
--- a/drivers/acpi/arm64/Makefile
+++ b/drivers/acpi/arm64/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_ACPI_IORT)+= iort.o
+obj-$(CONFIG_ACPI_GTDT)+= gtdt.o
diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c
new file mode 100644
index 000..d93a790
--- /dev/null
+++ b/drivers/acpi/arm64/gtdt.c
@@ -0,0 +1,157 @@
+/*
+ * ARM Specific GTDT table Support
+ *
+ * Copyright (C) 2016, Linaro Ltd.
+ * Author: Daniel Lezcano 
+ * Fu Wei 
+ * Hanjun Guo 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+
+#undef pr_fmt
+#define pr_fmt(fmt) "ACPI GTDT: " fmt
+
+/**
+ * struct acpi_gtdt_descriptor - Store the key info of GTDT for all functions
+ * @gtdt:  The pointer to the struct acpi_table_gtdt of GTDT table.
+ * @gtdt_end:  The pointer to the end of GTDT table.
+ * @platform_timer:The pointer to the start of Platform Timer Structure
+ *
+ * This struct stores the key info of the GTDT table; it should be initialized
+ * by acpi_gtdt_init.
+ */
+struct acpi_gtdt_descriptor {
+   struct acpi_table_gtdt *gtdt;
+   void *gtdt_end;
+   void *platform_timer;
+};
+
+static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata;
+
+static int __init map_gt_gsi(u32 interrupt, u32 flags)
+{
+   int trigger, polarity;
+
+   trigger = (flags & ACPI_GTDT_INTERRUPT_MODE) ? ACPI_EDGE_SENSITIVE
+   : ACPI_LEVEL_SENSITIVE;
+
+   polarity = (flags & ACPI_GTDT_INTERRUPT_POLARITY) ? ACPI_ACTIVE_LOW
+   : ACPI_ACTIVE_HIGH;
+
+   return acpi_register_gsi(NULL, interrupt, trigger, polarity);
+}
+
+/**
+ * acpi_gtdt_map_ppi() - Map the PPIs of per-cpu arch_timer.
+ * @type:  the type of PPI.
+ *
+ * Note: Linux on arm64 isn't supported on the secure side.
+ * So we only handle the non-secure timer PPIs,
+ * ARCH_TIMER_PHYS_SECURE_PPI is treated as invalid type.
+ *
+ * Return: the mapped PPI value, 0 if error.
+ */
+int __init acpi_gtdt_map_ppi(int type)
+{
+   struct acpi_table_gtdt *gtdt = acpi_gtdt_desc.gtdt;
+
+   switch (type) {
+   case ARCH_TIMER_PHYS_NONSECURE_PPI:
+   return map_gt_gsi(gtdt->non_secure_el1_interrupt,
+ gtdt->non_secure_el1_flags);
+   case ARCH_TIMER_VIRT_PPI:
+   return map_gt_gsi(gtdt->virtual_timer_interrupt,
+ gtdt->virtual_timer_flags);
+
+   case ARCH_TIMER_HYP_PPI:
+   return map_gt_gsi(gtdt->non_secure_el2_interrupt,
+ gtdt->non_secure_el2_flags);
+   default:
+   pr_err("Failed to map timer interrupt: invalid type.\n");
+   }
+
+   return 0;
+}
+
+/**
+ * acpi_gtdt_c3stop() - Get c3stop info from GTDT according to the type of PPI.
+ * @type:  the type of PPI.
+ *
+ * Return: 1 if the timer can be in deep idle state, 0 otherwise.
+ */
+bool __init acpi_gtdt_c3stop(int type)
+{
+   struct acpi_table_gtdt *gtdt = acpi_gtdt_desc.gtdt;
+
+   switch (type) {
+   case ARCH_TIMER_PHYS_NONSECURE_PPI:
+   return !(gtdt->non_secure_el1_flags & ACPI_GTDT_ALWAYS_ON);
+
+   case ARCH_TIMER_VIRT_PPI:
+   return 

[PATCH v18 15/15] acpi/arm64: Add SBSA Generic Watchdog support in GTDT driver

2016-12-08 Thread fu . wei
From: Fu Wei 

This driver adds support for parsing SBSA Generic Watchdog timer
in GTDT, parse all info in SBSA Generic Watchdog Structure in GTDT,
and creating a platform device with that information.

This allows the operating system to obtain device data from the
resource of platform device. The platform device named "sbsa-gwdt"
can be used by the ARM SBSA Generic Watchdog driver.

Signed-off-by: Fu Wei 
Signed-off-by: Hanjun Guo 
Tested-by: Xiongfeng Wang 
---
 drivers/acpi/arm64/gtdt.c | 93 +++
 drivers/watchdog/Kconfig  |  1 +
 2 files changed, 94 insertions(+)

diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c
index 91ea6cb..22d3659 100644
--- a/drivers/acpi/arm64/gtdt.c
+++ b/drivers/acpi/arm64/gtdt.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -59,6 +60,13 @@ static inline bool is_timer_block(void *platform_timer)
return gh->type == ACPI_GTDT_TYPE_TIMER_BLOCK;
 }
 
+static inline bool is_watchdog(void *platform_timer)
+{
+   struct acpi_gtdt_header *gh = platform_timer;
+
+   return gh->type == ACPI_GTDT_TYPE_WATCHDOG;
+}
+
 static int __init map_gt_gsi(u32 interrupt, u32 flags)
 {
int trigger, polarity;
@@ -279,3 +287,88 @@ int __init acpi_arch_timer_mem_init(struct arch_timer_mem 
*data,
 
return 0;
 }
+
+/*
+ * Initialize a SBSA generic Watchdog platform device info from GTDT
+ */
+static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
+   int index)
+{
+   struct platform_device *pdev;
+   int irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags);
+   int no_irq = 1;
+
+   /*
+* According to SBSA specification the size of refresh and control
+* frames of SBSA Generic Watchdog is SZ_4K (Offset 0x000 – 0xFFF).
+*/
+   struct resource res[] = {
+   DEFINE_RES_MEM(wd->control_frame_address, SZ_4K),
+   DEFINE_RES_MEM(wd->refresh_frame_address, SZ_4K),
+   DEFINE_RES_IRQ(irq),
+   };
+
+   pr_debug("found a Watchdog (0x%llx/0x%llx gsi:%u flags:0x%x).\n",
+wd->refresh_frame_address, wd->control_frame_address,
+wd->timer_interrupt, wd->timer_flags);
+
+   if (!(wd->refresh_frame_address && wd->control_frame_address)) {
+   pr_err(FW_BUG "failed to get the Watchdog base address.\n");
+   return -EINVAL;
+   }
+
+   if (!wd->timer_interrupt)
+   pr_warn(FW_BUG "failed to get the Watchdog interrupt.\n");
+   else if (irq <= 0)
+   pr_warn("failed to map the Watchdog interrupt.\n");
+   else
+   no_irq = 0;
+
+   /*
+* Add a platform device named "sbsa-gwdt" to match the platform driver.
+* "sbsa-gwdt": SBSA(Server Base System Architecture) Generic Watchdog
+* The platform driver (like drivers/watchdog/sbsa_gwdt.c) can get device
+* info below by matching this name.
+*/
+   pdev = platform_device_register_simple("sbsa-gwdt", index, res,
+  ARRAY_SIZE(res) - no_irq);
+   if (IS_ERR(pdev)) {
+   acpi_unregister_gsi(wd->timer_interrupt);
+   return PTR_ERR(pdev);
+   }
+
+   return 0;
+}
+
+static int __init gtdt_sbsa_gwdt_init(void)
+{
+   int ret, i = 0;
+   void *platform_timer;
+   struct acpi_table_header *table;
+
+   if (acpi_disabled)
+   return 0;
+
+   if (ACPI_FAILURE(acpi_get_table(ACPI_SIG_GTDT, 0, )))
+   return -EINVAL;
+
+   ret = acpi_gtdt_init(table, NULL);
+   if (ret)
+   return ret;
+
+   for_each_platform_timer(platform_timer) {
+   if (is_watchdog(platform_timer)) {
+   ret = gtdt_import_sbsa_gwdt(platform_timer, i);
+   if (ret)
+   break;
+   i++;
+   }
+   }
+
+   if (i)
+   pr_info("found %d SBSA generic Watchdog(s).\n", i);
+
+   return ret;
+}
+
+device_initcall(gtdt_sbsa_gwdt_init);
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 3eb58cb..a95c62d 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -219,6 +219,7 @@ config ARM_SBSA_WATCHDOG
tristate "ARM SBSA Generic Watchdog"
depends on ARM64
depends on ARM_ARCH_TIMER
+   depends on ACPI_GTDT || !ACPI
select WATCHDOG_CORE
help
  ARM SBSA Generic Watchdog has two stage timeouts:
-- 
2.9.3



[PATCH v18 14/15] clocksource/drivers/arm_arch_timer: Add GTDT support for memory-mapped timer

2016-12-08 Thread fu . wei
From: Fu Wei 

This patch adds memory-mapped timer register support by using the
information provided by the new GTDT driver of ACPI.

Signed-off-by: Fu Wei 
---
 drivers/clocksource/arm_arch_timer.c | 35 ---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 7f059f9..1fe1c08 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -1054,10 +1054,36 @@ CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, 
"arm,armv7-timer-mem",
   arch_timer_mem_of_init);
 
 #ifdef CONFIG_ACPI_GTDT
-/* Initialize per-processor generic timer */
+static int __init arch_timer_mem_acpi_init(int platform_timer_count)
+{
+   struct arch_timer_mem *timer_mem;
+   int timer_count, i, ret;
+
+   timer_mem = kcalloc(platform_timer_count, sizeof(*timer_mem),
+   GFP_KERNEL);
+   if (!timer_mem)
+   return -ENOMEM;
+
+   ret = acpi_arch_timer_mem_init(timer_mem, _count);
+   if (ret || !timer_count)
+   goto error;
+
+   for (i = 0; i < timer_count; i++) {
+   ret = arch_timer_mem_init(timer_mem);
+   if (!ret)
+   break;
+   timer_mem++;
+   }
+
+error:
+   kfree(timer_mem);
+   return ret;
+}
+
+/* Initialize per-processor generic timer and memory-mapped timer(if present) 
*/
 static int __init arch_timer_acpi_init(struct acpi_table_header *table)
 {
-   int ret;
+   int ret, platform_timer_count;
 
if (arch_timers_present & ARCH_TIMER_TYPE_CP15) {
pr_warn("already initialized, skipping\n");
@@ -1066,7 +1092,7 @@ static int __init arch_timer_acpi_init(struct 
acpi_table_header *table)
 
arch_timers_present |= ARCH_TIMER_TYPE_CP15;
 
-   ret = acpi_gtdt_init(table, NULL);
+   ret = acpi_gtdt_init(table, _timer_count);
if (ret) {
pr_err("Failed to init GTDT table.\n");
return ret;
@@ -1099,6 +1125,9 @@ static int __init arch_timer_acpi_init(struct 
acpi_table_header *table)
if (ret)
return ret;
 
+   if (arch_timer_mem_acpi_init(platform_timer_count))
+   pr_err("Failed to initialize memory-mapped timer.\n");
+
return arch_timer_common_init();
 }
 CLOCKSOURCE_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init);
-- 
2.9.3



[PATCH v18 12/15] clocksource/drivers/arm_arch_timer: Simplify ACPI support code.

2016-12-08 Thread fu . wei
From: Fu Wei 

This patch updates the arm_arch_timer driver to use the functions
provided by the new GTDT driver of ACPI.
In this way, arm_arch_timer.c can be simplified, and all the
ACPI GTDT knowledge can be separated from this timer driver.

Signed-off-by: Fu Wei 
Signed-off-by: Hanjun Guo 
Tested-by: Xiongfeng Wang 
---
 drivers/clocksource/arm_arch_timer.c | 46 ++--
 1 file changed, 13 insertions(+), 33 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index b02a406..7f059f9 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -1053,59 +1053,36 @@ static int __init arch_timer_mem_of_init(struct 
device_node *np)
 CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem",
   arch_timer_mem_of_init);
 
-#ifdef CONFIG_ACPI
-static int __init map_generic_timer_interrupt(u32 interrupt, u32 flags)
-{
-   int trigger, polarity;
-
-   if (!interrupt)
-   return 0;
-
-   trigger = (flags & ACPI_GTDT_INTERRUPT_MODE) ? ACPI_EDGE_SENSITIVE
-   : ACPI_LEVEL_SENSITIVE;
-
-   polarity = (flags & ACPI_GTDT_INTERRUPT_POLARITY) ? ACPI_ACTIVE_LOW
-   : ACPI_ACTIVE_HIGH;
-
-   return acpi_register_gsi(NULL, interrupt, trigger, polarity);
-}
-
+#ifdef CONFIG_ACPI_GTDT
 /* Initialize per-processor generic timer */
 static int __init arch_timer_acpi_init(struct acpi_table_header *table)
 {
int ret;
-   struct acpi_table_gtdt *gtdt;
 
if (arch_timers_present & ARCH_TIMER_TYPE_CP15) {
pr_warn("already initialized, skipping\n");
return -EINVAL;
}
 
-   gtdt = container_of(table, struct acpi_table_gtdt, header);
-
arch_timers_present |= ARCH_TIMER_TYPE_CP15;
 
-   arch_timer_ppi[ARCH_TIMER_PHYS_SECURE_PPI] =
-   map_generic_timer_interrupt(gtdt->secure_el1_interrupt,
-   gtdt->secure_el1_flags);
+   ret = acpi_gtdt_init(table, NULL);
+   if (ret) {
+   pr_err("Failed to init GTDT table.\n");
+   return ret;
+   }
 
arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI] =
-   map_generic_timer_interrupt(gtdt->non_secure_el1_interrupt,
-   gtdt->non_secure_el1_flags);
+   acpi_gtdt_map_ppi(ARCH_TIMER_PHYS_NONSECURE_PPI);
 
arch_timer_ppi[ARCH_TIMER_VIRT_PPI] =
-   map_generic_timer_interrupt(gtdt->virtual_timer_interrupt,
-   gtdt->virtual_timer_flags);
+   acpi_gtdt_map_ppi(ARCH_TIMER_VIRT_PPI);
 
arch_timer_ppi[ARCH_TIMER_HYP_PPI] =
-   map_generic_timer_interrupt(gtdt->non_secure_el2_interrupt,
-   gtdt->non_secure_el2_flags);
+   acpi_gtdt_map_ppi(ARCH_TIMER_HYP_PPI);
 
arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
 
-   /* Get the frequency from CNTFRQ */
-   arch_timer_detect_rate();
-
arch_timer_uses_ppi = arch_timer_select_ppi();
if (!arch_timer_ppi[arch_timer_uses_ppi]) {
pr_err("No interrupt available, giving up\n");
@@ -1113,7 +1090,10 @@ static int __init arch_timer_acpi_init(struct 
acpi_table_header *table)
}
 
/* Always-on capability */
-   arch_timer_c3stop = !(gtdt->non_secure_el1_flags & ACPI_GTDT_ALWAYS_ON);
+   arch_timer_c3stop = acpi_gtdt_c3stop(arch_timer_uses_ppi);
+
+   /* Get the frequency from CNTFRQ */
+   arch_timer_detect_rate();
 
ret = arch_timer_register();
if (ret)
-- 
2.9.3



Re: [PATCH v2 4/5] arm64: dts: exynos5433: Add bus dt node using VDD_INT for Exynos5433

2016-12-08 Thread Krzysztof Kozlowski
On Thu, Dec 08, 2016 at 01:58:10PM +0900, Chanwoo Choi wrote:
> This patch adds the bus nodes using VDD_INT for Exynos5433 SoC.
> Exynos5433 has the following AMBA AXI buses to translate data
> between DRAM and sub-blocks.
> 
> Following list specify the detailed correlation between sub-block and clock:
> - CLK_ACLK_G2D_{400|266}  : Bus clock for G2D (2D graphic engine)
> - CLK_ACLK_MSCL_400   : Bus clock for MSCL (Memory to memory Scaler)
> - CLK_ACLK_GSCL_333   : Bus clock for GSCL (General Scaler)
> - CLK_SCLK_JPEG_MSCL  : Bus clock for JPEG
> - CLK_ACLK_MFC_400: Bus clock for MFC (Multi Format Codec)
> - CLK_ACLK_HEVC_400   : Bus clock for HEVC (High Efficient Video Codec)
> - CLK_ACLK_BUS0_400   : NoC(Network On Chip)'s bus clock for 
> PERIC/PERIS/FSYS/MSCL
> - CLK_ACLK_BUS1_400   : NoC's bus clock for MFC/HEVC/G3D
> - CLK_ACLK_BUS2_400   : NoC's bus clock for GSCL/DISP/G2D/CAM0/CAM1/ISP
> 
> Signed-off-by: Chanwoo Choi 
> ---
>  arch/arm64/boot/dts/exynos/exynos5433-bus.dtsi | 197 
> +
>  arch/arm64/boot/dts/exynos/exynos5433.dtsi |   1 +
>  2 files changed, 198 insertions(+)
>  create mode 100644 arch/arm64/boot/dts/exynos/exynos5433-bus.dtsi

For the reference:
Reviewed-by: Krzysztof Kozlowski 

I'll queue it for v4.11, after this merge window.

Best regards,
Krzysztof


Re: [PATCH 3/3] powerpc: enable support for GCC plugins

2016-12-08 Thread Kees Cook
On Thu, Dec 8, 2016 at 6:42 AM, PaX Team  wrote:
> On 6 Dec 2016 at 17:28, Andrew Donnellan wrote:
>
>> Enable support for GCC plugins on powerpc.
>>
>> Add an additional version check in gcc-plugins-check to advise users to
>> upgrade to gcc 5.2+ on powerpc to avoid issues with header files (gcc <=
>> 4.6) or missing copies of rs6000-cpus.def (4.8 to 5.1 on 64-bit targets).
>
> i don't think that this is the right approach. there's a general and a special
> issue here, both of which need different handling.
>
> the general problem is to detect problems related to gcc plugin headers and
> notify the users about solutions. emitting various messages from a Makefile
> is certainly not a scalable approach, just imagine how it will look when the
> other 30+ archs begin to add their own special cases... if anything, they
> should be documented in Documentation/gcc-plugins.txt (or a new doc if it
> grows too big) and the Makefile message should just point at it.
>
> as for the solutions, the general advice should enable the use of otherwise
> failing gcc versions instead of forcing updating to new ones (though the
> latter is advisable for other reasons but not everyone's in the position to
> do so easily). in my experience all one needs to do is manually install the
> missing files from the gcc sources (ideally distros would take care of it).
>
> the specific problem addressed here can (and IMHO should) be solved in
> another way: remove the inclusion of the offending headers in gcc-common.h
> as neither tm.h nor c-common.h are needed by existing plugins. for background,
> i created gcc-common.h to simplify plugin development across all supportable
> gcc versions i came across over the years, so it follows the 'everything but
> the kitchen sink' approach. that isn't necessarily what the kernel and other
> projects need so they should just use my version as a basis and fork/simplify
> it (even i maintain private forks of the public version).

If removing those will lower the requirement for PPC, that would be
ideal. Otherwise, I'd like to take the practical approach of making
the plugins available on PPC right now, with an eye towards relaxing
the version requirement as people need it.

> as for the location of c-common.h, upstream gcc moved it under c-family in
> 2010 after the release of 4.5, so it should be where gcc-common.h expects
> it and i'm not sure how it ended up at its old location for you.

That is rather odd. What distro was the PPC test done on? (Or were
these manually built gcc versions?)

-Kees

-- 
Kees Cook
Nexus Security


Re: ATH9 driver issues on ARM64

2016-12-08 Thread Marc Zyngier
On 08/12/16 15:29, Bharat Kumar Gogada wrote:

Two things:

> Here is the cat /proc/interrupts (after we do interface up):
> 
> root@:~# ifconfig wlan0 up
> [ 1548.926601] IPv6: ADDRCONF(NETDEV_UP): wlan0: link is not ready
> root@Xilinx-ZCU102-2016_3:~# cat /proc/interrupts 
>CPU0   CPU1   CPU2   CPU3   
>   1:  0  0  0  0 GICv2  29 Edge  
> arch_timer
>   2:  19873  20058  19089  17435 GICv2  30 Edge  
> arch_timer
>  12:  0  0  0  0 GICv2 156 Level 
> zynqmp-dma
>  13:  0  0  0  0 GICv2 157 Level 
> zynqmp-dma
>  14:  0  0  0  0 GICv2 158 Level 
> zynqmp-dma
>  15:  0  0  0  0 GICv2 159 Level 
> zynqmp-dma
>  16:  0  0  0  0 GICv2 160 Level 
> zynqmp-dma
>  17:  0  0  0  0 GICv2 161 Level 
> zynqmp-dma
>  18:  0  0  0  0 GICv2 162 Level 
> zynqmp-dma
>  19:  0  0  0  0 GICv2 163 Level 
> zynqmp-dma
>  20:  0  0  0  0 GICv2 164 Level 
> Mali_GP_MMU, Mali_GP, Mali_PP0_MMU, Mali_PP0, Mali_PP1_MMU, Mali_PP1

I'm not even going to consider looking at something that is running out
of tree code. So please start things with a fresh kernel that doesn't
contain stuff we can't debug.

>  30:  0  0  0  0 GICv2  95 Level 
> eth0, eth0
> 206:314  0  0  0 GICv2  49 Level 
> cdns-i2c
> 207: 40  0  0  0 GICv2  50 Level 
> cdns-i2c
> 209:  0  0  0  0 GICv2 150 Level 
> nwl_pcie:misc
> 214: 12  0  0  0 GICv2  47 Level 
> ff0f.spi
> 215:  0  0  0  0 GICv2  58 Level 
> ffa6.rtc
> 216:  0  0  0  0 GICv2  59 Level 
> ffa6.rtc
> 217:  0  0  0  0 GICv2 165 Level 
> ahci-ceva[fd0c.ahci]
> 218: 61  0  0  0 GICv2  81 Level mmc0
> 219:  0  0  0  0 GICv2 187 Level 
> arm-smmu global fault
> 220:471  0  0  0 GICv2  53 Level 
> xuartps
> 223:  0  0  0  0 GICv2 154 Level 
> fd4c.dma
> 224:  3  0  0  0 dummy   1 Edge  ath9k

What is this "dummy" controller? And if that's supposed to be a legacy
interrupt from the PCI device, it has the wrong trigger.

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...


[PATCH v3 14/15] livepatch: add /proc//patch_state

2016-12-08 Thread Josh Poimboeuf
Expose the per-task patch state value so users can determine which tasks
are holding up completion of a patching operation.

Signed-off-by: Josh Poimboeuf 
---
 Documentation/filesystems/proc.txt | 18 ++
 fs/proc/base.c | 15 +++
 2 files changed, 33 insertions(+)

diff --git a/Documentation/filesystems/proc.txt 
b/Documentation/filesystems/proc.txt
index 72624a1..85c501b 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -44,6 +44,7 @@ Table of Contents
   3.8   /proc//fdinfo/ - Information about opened file
   3.9   /proc//map_files - Information about memory mapped files
   3.10  /proc//timerslack_ns - Task timerslack value
+  3.11 /proc//patch_state - Livepatch patch operation state
 
   4Configuring procfs
   4.1  Mount options
@@ -1886,6 +1887,23 @@ Valid values are from 0 - ULLONG_MAX
 An application setting the value must have PTRACE_MODE_ATTACH_FSCREDS level
 permissions on the task specified to change its timerslack_ns value.
 
+3.11   /proc//patch_state - Livepatch patch operation state
+-
+When CONFIG_LIVEPATCH is enabled, this file displays the value of the
+patch state for the task.
+
+A value of '-1' indicates that no patch is in transition.
+
+A value of '0' indicates that a patch is in transition and the task is
+unpatched.  If the patch is being enabled, then the task hasn't been
+patched yet.  If the patch is being disabled, then the task has already
+been unpatched.
+
+A value of '1' indicates that a patch is in transition and the task is
+patched.  If the patch is being enabled, then the task has already been
+patched.  If the patch is being disabled, then the task hasn't been
+unpatched yet.
+
 
 --
 Configuring procfs
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5ea8363..2e1e012 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2841,6 +2841,15 @@ static int proc_pid_personality(struct seq_file *m, 
struct pid_namespace *ns,
return err;
 }
 
+#ifdef CONFIG_LIVEPATCH
+static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
+   struct pid *pid, struct task_struct *task)
+{
+   seq_printf(m, "%d\n", task->patch_state);
+   return 0;
+}
+#endif /* CONFIG_LIVEPATCH */
+
 /*
  * Thread groups
  */
@@ -2940,6 +2949,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("timers", S_IRUGO, proc_timers_operations),
 #endif
REG("timerslack_ns", S_IRUGO|S_IWUGO, 
proc_pid_set_timerslack_ns_operations),
+#ifdef CONFIG_LIVEPATCH
+   ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3320,6 +3332,9 @@ static const struct pid_entry tid_base_stuff[] = {
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
 #endif
+#ifdef CONFIG_LIVEPATCH
+   ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
-- 
2.7.4



[PATCH v3 12/15] livepatch: store function sizes

2016-12-08 Thread Josh Poimboeuf
For the consistency model we'll need to know the sizes of the old and
new functions to determine if they're on the stacks of any tasks.

Signed-off-by: Josh Poimboeuf 
---
 include/linux/livepatch.h |  3 +++
 kernel/livepatch/core.c   | 16 
 2 files changed, 19 insertions(+)

diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 1e2eb91..1a5a93c 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -37,6 +37,8 @@
  * @old_addr:  the address of the function being patched
  * @kobj:  kobject for sysfs resources
  * @stack_node:list node for klp_ops func_stack list
+ * @old_size:  size of the old function
+ * @new_size:  size of the new function
  * @patched:   the func has been added to the klp_ops list
  */
 struct klp_func {
@@ -56,6 +58,7 @@ struct klp_func {
unsigned long old_addr;
struct kobject kobj;
struct list_head stack_node;
+   unsigned long old_size, new_size;
bool patched;
 };
 
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 8ca8a0e..fc160c6 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -584,6 +584,22 @@ static int klp_init_object_loaded(struct klp_patch *patch,
 >old_addr);
if (ret)
return ret;
+
+   ret = kallsyms_lookup_size_offset(func->old_addr,
+ >old_size, NULL);
+   if (!ret) {
+   pr_err("kallsyms size lookup failed for '%s'\n",
+  func->old_name);
+   return -ENOENT;
+   }
+
+   ret = kallsyms_lookup_size_offset((unsigned long)func->new_func,
+ >new_size, NULL);
+   if (!ret) {
+   pr_err("kallsyms size lookup failed for '%s' 
replacement\n",
+  func->old_name);
+   return -ENOENT;
+   }
}
 
return 0;
-- 
2.7.4



[PATCH v3 05/15] livepatch/powerpc: add TIF_PATCH_PENDING thread flag

2016-12-08 Thread Josh Poimboeuf
Add the TIF_PATCH_PENDING thread flag to enable the new livepatch
per-task consistency model for powerpc.  The bit getting set indicates
the thread has a pending patch which needs to be applied when the thread
exits the kernel.

The bit is included in the _TIF_USER_WORK_MASK macro so that
do_notify_resume() and klp_update_patch_state() get called when the bit
is set.

Signed-off-by: Josh Poimboeuf 
---
 arch/powerpc/include/asm/thread_info.h | 4 +++-
 arch/powerpc/kernel/signal.c   | 4 
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/thread_info.h 
b/arch/powerpc/include/asm/thread_info.h
index 87e4b2d..6fc6464 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -92,6 +92,7 @@ static inline struct thread_info *current_thread_info(void)
   TIF_NEED_RESCHED */
 #define TIF_32BIT  4   /* 32 bit binary */
 #define TIF_RESTORE_TM 5   /* need to restore TM FP/VEC/VSX */
+#define TIF_PATCH_PENDING  6   /* pending live patching update */
 #define TIF_SYSCALL_AUDIT  7   /* syscall auditing active */
 #define TIF_SINGLESTEP 8   /* singlestepping active */
 #define TIF_NOHZ   9   /* in adaptive nohz mode */
@@ -115,6 +116,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_POLLING_NRFLAG(1<

[PATCH v3 15/15] livepatch: allow removal of a disabled patch

2016-12-08 Thread Josh Poimboeuf
From: Miroslav Benes 

Currently we do not allow patch module to unload since there is no
method to determine if a task is still running in the patched code.

The consistency model gives us the way because when the unpatching
finishes we know that all tasks were marked as safe to call an original
function. Thus every new call to the function calls the original code
and at the same time no task can be somewhere in the patched code,
because it had to leave that code to be marked as safe.

We can safely let the patch module go after that.

Completion is used for synchronization between module removal and sysfs
infrastructure in a similar way to commit 942e443127e9 ("module: Fix
mod->mkobj.kobj potentially freed too early").

Note that we still do not allow the removal for immediate model, that is
no consistency model. The module refcount may increase in this case if
somebody disables and enables the patch several times. This should not
cause any harm.

With this change a call to try_module_get() is moved to
__klp_enable_patch from klp_register_patch to make module reference
counting symmetric (module_put() is in a patch disable path) and to
allow to take a new reference to a disabled module when being enabled.

Also all kobject_put(>kobj) calls are moved outside of klp_mutex
lock protection to prevent a deadlock situation when
klp_unregister_patch is called and sysfs directories are removed. There
is no need to do the same for other kobject_put() callsites as we
currently do not have their sysfs counterparts.

Signed-off-by: Miroslav Benes 
Signed-off-by: Josh Poimboeuf 
---
 Documentation/livepatch/livepatch.txt | 29 -
 include/linux/livepatch.h |  3 ++
 kernel/livepatch/core.c   | 80 ++-
 kernel/livepatch/transition.c | 12 +-
 samples/livepatch/livepatch-sample.c  |  1 -
 5 files changed, 72 insertions(+), 53 deletions(-)

diff --git a/Documentation/livepatch/livepatch.txt 
b/Documentation/livepatch/livepatch.txt
index f87e742..b0eaaf8 100644
--- a/Documentation/livepatch/livepatch.txt
+++ b/Documentation/livepatch/livepatch.txt
@@ -265,8 +265,15 @@ section "Livepatch life-cycle" below for more details 
about these
 two operations.
 
 Module removal is only safe when there are no users of the underlying
-functions.  The immediate consistency model is not able to detect this;
-therefore livepatch modules cannot be removed. See "Limitations" below.
+functions. The immediate consistency model is not able to detect this. The
+code just redirects the functions at the very beginning and it does not
+check if the functions are in use. In other words, it knows when the
+functions get called but it does not know when the functions return.
+Therefore it cannot be decided when the livepatch module can be safely
+removed. This is solved by a hybrid consistency model. When the system is
+transitioned to a new patch state (patched/unpatched) it is guaranteed that
+no task sleeps or runs in the old code.
+
 
 5. Livepatch life-cycle
 ===
@@ -437,24 +444,6 @@ The current Livepatch implementation has several 
limitations:
 There is work in progress to remove this limitation.
 
 
-  + Livepatch modules can not be removed.
-
-The current implementation just redirects the functions at the very
-beginning. It does not check if the functions are in use. In other
-words, it knows when the functions get called but it does not
-know when the functions return. Therefore it can not decide when
-the livepatch module can be safely removed.
-
-This will get most likely solved once a more complex consistency model
-is supported. The idea is that a safe state for patching should also
-mean a safe state for removing the patch.
-
-Note that the patch itself might get disabled by writing zero
-to /sys/kernel/livepatch//enabled. It causes that the new
-code will not longer get called. But it does not guarantee
-that anyone is not sleeping anywhere in the new code.
-
-
   + Livepatch works reliably only when the dynamic ftrace is located at
 the very beginning of the function.
 
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 8e06fe5..1959e52 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -23,6 +23,7 @@
 
 #include 
 #include 
+#include 
 
 #if IS_ENABLED(CONFIG_LIVEPATCH)
 
@@ -114,6 +115,7 @@ struct klp_object {
  * @list:  list node for global list of registered patches
  * @kobj:  kobject for sysfs resources
  * @enabled:   the patch is enabled (but operation may be incomplete)
+ * @finish:for waiting till it is safe to remove the patch module
  */
 struct klp_patch {
/* external */
@@ -125,6 +127,7 @@ struct klp_patch {
struct list_head list;
struct kobject kobj;
bool enabled;
+   struct completion finish;
 };
 
 #define 

[PATCH v3 07/15] livepatch/s390: add TIF_PATCH_PENDING thread flag

2016-12-08 Thread Josh Poimboeuf
From: Miroslav Benes 

Update a task's patch state when returning from a system call or user
space interrupt, or after handling a signal.

This greatly increases the chances of a patch operation succeeding.  If
a task is I/O bound, it can be patched when returning from a system
call.  If a task is CPU bound, it can be patched when returning from an
interrupt.  If a task is sleeping on a to-be-patched function, the user
can send SIGSTOP and SIGCONT to force it to switch.

Since there are two ways the syscall can be restarted on return from a
signal handling process, it is important to clear the flag before
do_signal() is called. Otherwise we could miss the migration if we used
SIGSTOP/SIGCONT procedure or fake signal to migrate patching blocking
tasks. If we place our hook to sysc_work label in entry before
TIF_SIGPENDING is evaluated we kill two birds with one stone. The task
is correctly migrated in all return paths from a syscall.

Signed-off-by: Miroslav Benes 
Signed-off-by: Josh Poimboeuf 
---
 arch/s390/include/asm/thread_info.h |  2 ++
 arch/s390/kernel/entry.S| 31 ++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/thread_info.h 
b/arch/s390/include/asm/thread_info.h
index 4977668..646845e 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -56,6 +56,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct 
task_struct *src);
 #define TIF_SIGPENDING 1   /* signal pending */
 #define TIF_NEED_RESCHED   2   /* rescheduling necessary */
 #define TIF_UPROBE 3   /* breakpointed or single-stepping */
+#define TIF_PATCH_PENDING  4   /* pending live patching update */
 
 #define TIF_31BIT  16  /* 32bit process */
 #define TIF_MEMDIE 17  /* is terminating due to OOM killer */
@@ -74,6 +75,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct 
task_struct *src);
 #define _TIF_SIGPENDING_BITUL(TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED  _BITUL(TIF_NEED_RESCHED)
 #define _TIF_UPROBE_BITUL(TIF_UPROBE)
+#define _TIF_PATCH_PENDING _BITUL(TIF_PATCH_PENDING)
 
 #define _TIF_31BIT _BITUL(TIF_31BIT)
 #define _TIF_SINGLE_STEP   _BITUL(TIF_SINGLE_STEP)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 161f4e6..33848a8 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -47,7 +47,7 @@ STACK_SIZE  = 1 << STACK_SHIFT
 STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE
 
 _TIF_WORK  = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
-  _TIF_UPROBE)
+  _TIF_UPROBE | _TIF_PATCH_PENDING)
 _TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
   _TIF_SYSCALL_TRACEPOINT)
 _CIF_WORK  = (_CIF_MCCK_PENDING | _CIF_ASCE | _CIF_FPU)
@@ -352,6 +352,11 @@ ENTRY(system_call)
 #endif
TSTMSK  __PT_FLAGS(%r11),_PIF_PER_TRAP
jo  .Lsysc_singlestep
+#ifdef CONFIG_LIVEPATCH
+   TSTMSK  __TI_flags(%r12),_TIF_PATCH_PENDING
+   jo  .Lsysc_patch_pending# handle live patching just before
+   # signals and possible syscall restart
+#endif
TSTMSK  __TI_flags(%r12),_TIF_SIGPENDING
jo  .Lsysc_sigpending
TSTMSK  __TI_flags(%r12),_TIF_NOTIFY_RESUME
@@ -426,6 +431,16 @@ ENTRY(system_call)
 #endif
 
 #
+# _TIF_PATCH_PENDING is set, call klp_update_patch_state
+#
+#ifdef CONFIG_LIVEPATCH
+.Lsysc_patch_pending:
+   lg  %r2,__LC_CURRENT# pass pointer to task struct
+   larl%r14,.Lsysc_return
+   jg  klp_update_patch_state
+#endif
+
+#
 # _PIF_PER_TRAP is set, call do_per_trap
 #
 .Lsysc_singlestep:
@@ -674,6 +689,10 @@ ENTRY(io_int_handler)
jo  .Lio_mcck_pending
TSTMSK  __TI_flags(%r12),_TIF_NEED_RESCHED
jo  .Lio_reschedule
+#ifdef CONFIG_LIVEPATCH
+   TSTMSK  __TI_flags(%r12),_TIF_PATCH_PENDING
+   jo  .Lio_patch_pending
+#endif
TSTMSK  __TI_flags(%r12),_TIF_SIGPENDING
jo  .Lio_sigpending
TSTMSK  __TI_flags(%r12),_TIF_NOTIFY_RESUME
@@ -720,6 +739,16 @@ ENTRY(io_int_handler)
j   .Lio_return
 
 #
+# _TIF_PATCH_PENDING is set, call klp_update_patch_state
+#
+#ifdef CONFIG_LIVEPATCH
+.Lio_patch_pending:
+   lg  %r2,__LC_CURRENT# pass pointer to task struct
+   larl%r14,.Lio_return
+   jg  klp_update_patch_state
+#endif
+
+#
 # _TIF_SIGPENDING or is set, call do_signal
 #
 .Lio_sigpending:
-- 
2.7.4



[PATCH v3 00/15] livepatch: hybrid consistency model

2016-12-08 Thread Josh Poimboeuf
Dusting the cobwebs off the consistency model again.  This is based on
linux-next/master.

v1 was posted on 2015-02-09:

  https://lkml.kernel.org/r/cover.1423499826.git.jpoim...@redhat.com

v2 was posted on 2016-04-28:

  https://lkml.kernel.org/r/cover.1461875890.git.jpoim...@redhat.com

The biggest issue from v2 was finding a decent way to detect preemption
and page faults on the stack of a sleeping task.  That problem was
solved by rewriting the x86 stack unwinder.  The new unwinder helps
detect such cases by finding all pt_regs on the stack.  When
preemption/page faults are detected, the stack is considered unreliable
and the patching of the task is deferred.

For more details about the consistency model, see patch 13/15.

---

v3:
- rebase on new x86 unwinder
- force !HAVE_RELIABLE_STACKTRACE arches to use patch->immediate for
  now, because we don't have a way to transition kthreads otherwise
- rebase s390 TIF_PATCH_PENDING patch onto latest entry code
- update barrier comments and move barrier from the end of
  klp_init_transition() to its callers
- "klp_work" -> "klp_transition_work"
- "klp_patch_task()" -> "klp_update_patch_state()"
- explicit _TIF_ALLWORK_MASK
- change klp_reverse_transition() to not try to complete transition.
  instead modify the work queue delay to zero.
- get rid of klp_schedule_work() in favor of calling
  schedule_delayed_work() directly with a KLP_TRANSITION_DELAY
- initialize klp_target_state to KLP_UNDEFINED
- move klp_target_state assignment to before patch->immediate check in
  klp_init_transition()
- rcu_read_lock() in klp_update_patch_state(), test the thread flag in
  patch task, synchronize_rcu() in klp_complete_transition()
- use kstrtobool() in enabled_store()
- change task_rq_lock() argument type to struct rq_flags
- add several WARN_ON_ONCE assertions for klp_target_state and
  task->patch_state

v2:
- "universe" -> "patch state"
- rename klp_update_task_universe() -> klp_patch_task()
- add preempt IRQ tracking (TF_PREEMPT_IRQ)
- fix print_context_stack_reliable() bug
- improve print_context_stack_reliable() comments
- klp_ftrace_handler comment fixes
- add "patch_state" proc file to tid_base_stuff
- schedule work even for !RELIABLE_STACKTRACE
- forked child inherits patch state from parent
- add detailed comment to livepatch.h klp_func definition about the
  klp_func patched/transition state transitions
- update exit_to_usermode_loop() comment
- clear all TIF_KLP_NEED_UPDATE flags in klp_complete_transition()
- remove unnecessary function externs
- add livepatch documentation, sysfs documentation, /proc documentation
- /proc/pid/patch_state: -1 means no patch is currently being applied/reverted
- "TIF_KLP_NEED_UPDATE" -> "TIF_PATCH_PENDING"
- support for s390 and powerpc-le
- don't assume stacks with dynamic ftrace trampolines are reliable
- add _TIF_ALLWORK_MASK info to commit log

v1.9:
- revive from the dead and rebased
- reliable stacks!
- add support for immediate consistency model
- add a ton of comments
- fix up memory barriers
- remove "allow patch modules to be removed" patch for now, it still 
  needs more discussion and thought - it can be done with something
- "proc/pid/universe" -> "proc/pid/patch_status"
- remove WARN_ON_ONCE from !func condition in ftrace handler -- can
  happen because of RCU
- keep klp_mutex private by putting the work_fn in core.c
- convert states from int to boolean
- remove obsolete '@state' comments
- several header file and include improvements suggested by Jiri S
- change kallsyms_lookup_size_offset() errors from EINVAL -> ENOENT
- change proc file permissions S_IRUGO -> USR
- use klp_for_each_object/func helpers

---

Jiri Slaby (1):
  livepatch/s390: reorganize TIF thread flag bits

Josh Poimboeuf (12):
  stacktrace/x86: add function for detecting reliable stack traces
  x86/entry: define _TIF_ALLWORK_MASK flags explicitly
  livepatch: temporary stubs for klp_patch_pending() and
klp_update_patch_state()
  livepatch/x86: add TIF_PATCH_PENDING thread flag
  livepatch/powerpc: add TIF_PATCH_PENDING thread flag
  livepatch: separate enabled and patched states
  livepatch: remove unnecessary object loaded check
  livepatch: move patching functions into patch.c
  livepatch: use kstrtobool() in enabled_store()
  livepatch: store function sizes
  livepatch: change to a per-task consistency model
  livepatch: add /proc//patch_state

Miroslav Benes (2):
  livepatch/s390: add TIF_PATCH_PENDING thread flag
  livepatch: allow removal of a disabled patch

 Documentation/ABI/testing/sysfs-kernel-livepatch |   8 +
 Documentation/filesystems/proc.txt   |  18 +
 Documentation/livepatch/livepatch.txt| 156 ++--
 arch/Kconfig |   6 +
 arch/powerpc/include/asm/thread_info.h   |   4 +-
 arch/powerpc/kernel/signal.c |   4 +
 arch/s390/include/asm/thread_info.h  |  24 +-
 arch/s390/kernel/entry.S |  31 +-
 

[PATCH v3 04/15] livepatch/x86: add TIF_PATCH_PENDING thread flag

2016-12-08 Thread Josh Poimboeuf
Add the TIF_PATCH_PENDING thread flag to enable the new livepatch
per-task consistency model for x86_64.  The bit getting set indicates
the thread has a pending patch which needs to be applied when the thread
exits the kernel.

The bit is placed in the _TIF_ALLWORK_MASK macro, which results in
exit_to_usermode_loop() calling klp_update_patch_state() when it's set.

Signed-off-by: Josh Poimboeuf 
---
 arch/x86/entry/common.c| 9 ++---
 arch/x86/include/asm/thread_info.h | 4 +++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index bdd9cc5..16a51a5 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -129,14 +130,13 @@ static long syscall_trace_enter(struct pt_regs *regs)
 
 #define EXIT_TO_USERMODE_LOOP_FLAGS\
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
-_TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
+_TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
 
 static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
 {
/*
 * In order to return to user mode, we need to have IRQs off with
-* none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
-* _TIF_UPROBE, or _TIF_NEED_RESCHED set.  Several of these flags
+* none of EXIT_TO_USERMODE_LOOP_FLAGS set.  Several of these flags
 * can be set at any time on preemptable kernels if we have IRQs on,
 * so we need to loop.  Disabling preemption wouldn't help: doing the
 * work to clear some of the flags can sleep.
@@ -163,6 +163,9 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 
cached_flags)
if (cached_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
 
+   if (cached_flags & _TIF_PATCH_PENDING)
+   klp_update_patch_state(current);
+
/* Disable IRQs and retry */
local_irq_disable();
 
diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h
index 1fe6043..79f4d6a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -84,6 +84,7 @@ struct thread_info {
 #define TIF_SECCOMP8   /* secure computing */
 #define TIF_USER_RETURN_NOTIFY 11  /* notify kernel of userspace return */
 #define TIF_UPROBE 12  /* breakpointed or singlestepping */
+#define TIF_PATCH_PENDING  13  /* pending live patching update */
 #define TIF_NOTSC  16  /* TSC is not accessible in userland */
 #define TIF_IA32   17  /* IA32 compatibility process */
 #define TIF_NOHZ   19  /* in adaptive nohz mode */
@@ -107,6 +108,7 @@ struct thread_info {
 #define _TIF_SECCOMP   (1 << TIF_SECCOMP)
 #define _TIF_USER_RETURN_NOTIFY(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_UPROBE(1 << TIF_UPROBE)
+#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
 #define _TIF_NOTSC (1 << TIF_NOTSC)
 #define _TIF_IA32  (1 << TIF_IA32)
 #define _TIF_NOHZ  (1 << TIF_NOHZ)
@@ -133,7 +135,7 @@ struct thread_info {
(_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING |\
 _TIF_SINGLESTEP | _TIF_NEED_RESCHED | _TIF_SYSCALL_EMU |   \
 _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE |   \
-_TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ)
+_TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ | _TIF_PATCH_PENDING)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW
\
-- 
2.7.4



[PATCH v3 08/15] livepatch: separate enabled and patched states

2016-12-08 Thread Josh Poimboeuf
Once we have a consistency model, patches and their objects will be
enabled and disabled at different times.  For example, when a patch is
disabled, its loaded objects' funcs can remain registered with ftrace
indefinitely until the unpatching operation is complete and they're no
longer in use.

It's less confusing if we give them different names: patches can be
enabled or disabled; objects (and their funcs) can be patched or
unpatched:

- Enabled means that a patch is logically enabled (but not necessarily
  fully applied).

- Patched means that an object's funcs are registered with ftrace and
  added to the klp_ops func stack.

Also, since these states are binary, represent them with booleans
instead of ints.

Signed-off-by: Josh Poimboeuf 
---
 include/linux/livepatch.h | 17 ---
 kernel/livepatch/core.c   | 72 +++
 2 files changed, 42 insertions(+), 47 deletions(-)

diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 60558d8..1e2eb91 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -28,11 +28,6 @@
 
 #include 
 
-enum klp_state {
-   KLP_DISABLED,
-   KLP_ENABLED
-};
-
 /**
  * struct klp_func - function structure for live patching
  * @old_name:  name of the function to be patched
@@ -41,8 +36,8 @@ enum klp_state {
  * can be found (optional)
  * @old_addr:  the address of the function being patched
  * @kobj:  kobject for sysfs resources
- * @state: tracks function-level patch application state
  * @stack_node:list node for klp_ops func_stack list
+ * @patched:   the func has been added to the klp_ops list
  */
 struct klp_func {
/* external */
@@ -60,8 +55,8 @@ struct klp_func {
/* internal */
unsigned long old_addr;
struct kobject kobj;
-   enum klp_state state;
struct list_head stack_node;
+   bool patched;
 };
 
 /**
@@ -71,7 +66,7 @@ struct klp_func {
  * @kobj:  kobject for sysfs resources
  * @mod:   kernel module associated with the patched object
  * (NULL for vmlinux)
- * @state: tracks object-level patch application state
+ * @patched:   the object's funcs have been added to the klp_ops list
  */
 struct klp_object {
/* external */
@@ -81,7 +76,7 @@ struct klp_object {
/* internal */
struct kobject kobj;
struct module *mod;
-   enum klp_state state;
+   bool patched;
 };
 
 /**
@@ -90,7 +85,7 @@ struct klp_object {
  * @objs:  object entries for kernel objects to be patched
  * @list:  list node for global list of registered patches
  * @kobj:  kobject for sysfs resources
- * @state: tracks patch-level application state
+ * @enabled:   the patch is enabled (but operation may be incomplete)
  */
 struct klp_patch {
/* external */
@@ -100,7 +95,7 @@ struct klp_patch {
/* internal */
struct list_head list;
struct kobject kobj;
-   enum klp_state state;
+   bool enabled;
 };
 
 #define klp_for_each_object(patch, obj) \
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 217b39d..2dbd355 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -348,11 +348,11 @@ static unsigned long klp_get_ftrace_location(unsigned 
long faddr)
 }
 #endif
 
-static void klp_disable_func(struct klp_func *func)
+static void klp_unpatch_func(struct klp_func *func)
 {
struct klp_ops *ops;
 
-   if (WARN_ON(func->state != KLP_ENABLED))
+   if (WARN_ON(!func->patched))
return;
if (WARN_ON(!func->old_addr))
return;
@@ -378,10 +378,10 @@ static void klp_disable_func(struct klp_func *func)
list_del_rcu(>stack_node);
}
 
-   func->state = KLP_DISABLED;
+   func->patched = false;
 }
 
-static int klp_enable_func(struct klp_func *func)
+static int klp_patch_func(struct klp_func *func)
 {
struct klp_ops *ops;
int ret;
@@ -389,7 +389,7 @@ static int klp_enable_func(struct klp_func *func)
if (WARN_ON(!func->old_addr))
return -EINVAL;
 
-   if (WARN_ON(func->state != KLP_DISABLED))
+   if (WARN_ON(func->patched))
return -EINVAL;
 
ops = klp_find_ops(func->old_addr);
@@ -437,7 +437,7 @@ static int klp_enable_func(struct klp_func *func)
list_add_rcu(>stack_node, >func_stack);
}
 
-   func->state = KLP_ENABLED;
+   func->patched = true;
 
return 0;
 
@@ -448,36 +448,36 @@ static int klp_enable_func(struct klp_func *func)
return ret;
 }
 
-static void klp_disable_object(struct klp_object *obj)
+static void klp_unpatch_object(struct klp_object *obj)
 {
struct klp_func *func;
 
klp_for_each_func(obj, func)
-   if (func->state == KLP_ENABLED)
-   klp_disable_func(func);
+   if (func->patched)
+   

[PATCH v3 01/15] stacktrace/x86: add function for detecting reliable stack traces

2016-12-08 Thread Josh Poimboeuf
For live patching and possibly other use cases, a stack trace is only
useful if it can be assured that it's completely reliable.  Add a new
save_stack_trace_tsk_reliable() function to achieve that.

Scenarios which indicate that a stack trace may be unreliable:

- running task
- interrupt stack
- preemption
- corrupted stack data
- stack grows the wrong way
- stack walk doesn't reach the bottom
- user didn't provide a large enough entries array

Also add CONFIG_HAVE_RELIABLE_STACKTRACE so arch-independent code can
determine at build time whether the function is implemented.

Signed-off-by: Josh Poimboeuf 
---
 arch/Kconfig   |  6 +
 arch/x86/Kconfig   |  1 +
 arch/x86/include/asm/unwind.h  |  6 +
 arch/x86/kernel/stacktrace.c   | 59 +-
 arch/x86/kernel/unwind_frame.c |  1 +
 include/linux/stacktrace.h |  8 +++---
 kernel/stacktrace.c| 12 +++--
 7 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 13f27c1..d61a133 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -678,6 +678,12 @@ config HAVE_STACK_VALIDATION
  Architecture supports the 'objtool check' host tool command, which
  performs compile-time stack metadata validation.
 
+config HAVE_RELIABLE_STACKTRACE
+   bool
+   help
+ Architecture has a save_stack_trace_tsk_reliable() function which
+ only returns a stack trace if it can guarantee the trace is reliable.
+
 config HAVE_ARCH_HASH
bool
default n
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 215612c..b4a6663 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -155,6 +155,7 @@ config X86
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
+   select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && 
STACK_VALIDATION
select HAVE_STACK_VALIDATIONif X86_64
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index c5a7f3a..44f86dc 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -11,6 +11,7 @@ struct unwind_state {
unsigned long stack_mask;
struct task_struct *task;
int graph_idx;
+   bool error;
 #ifdef CONFIG_FRAME_POINTER
unsigned long *bp;
struct pt_regs *regs;
@@ -40,6 +41,11 @@ void unwind_start(struct unwind_state *state, struct 
task_struct *task,
__unwind_start(state, task, regs, first_frame);
 }
 
+static inline bool unwind_error(struct unwind_state *state)
+{
+   return state->error;
+}
+
 #ifdef CONFIG_FRAME_POINTER
 
 static inline
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 0653788..3e0cf5e 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -74,6 +74,64 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct 
stack_trace *trace)
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 
+#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
+static int __save_stack_trace_reliable(struct stack_trace *trace,
+  struct task_struct *task)
+{
+   struct unwind_state state;
+   struct pt_regs *regs;
+   unsigned long addr;
+
+   for (unwind_start(, task, NULL, NULL); !unwind_done();
+unwind_next_frame()) {
+
+   regs = unwind_get_entry_regs();
+   if (regs) {
+   /*
+* Preemption and page faults on the stack can make
+* frame pointers unreliable.
+*/
+   if (!user_mode(regs))
+   return -1;
+
+   /*
+* This frame contains the (user mode) pt_regs at the
+* end of the stack.  Finish the unwind.
+*/
+   unwind_next_frame();
+   break;
+   }
+
+   addr = unwind_get_return_address();
+   if (!addr || save_stack_address(trace, addr, false))
+   return -1;
+   }
+
+   if (!unwind_done() || unwind_error())
+   return -1;
+
+   if (trace->nr_entries < trace->max_entries)
+   trace->entries[trace->nr_entries++] = ULONG_MAX;
+
+   return 0;
+}
+
+int save_stack_trace_tsk_reliable(struct task_struct *tsk,
+ struct stack_trace *trace)
+{
+   int ret;
+
+   if (!try_get_task_stack(tsk))
+   return -EINVAL;
+
+   ret = __save_stack_trace_reliable(trace, tsk);
+
+   put_task_stack(tsk);
+
+   return ret;
+}
+#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */
+
 /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
 
 struct 

Re: [PATCH v3 04/15] livepatch/x86: add TIF_PATCH_PENDING thread flag

2016-12-08 Thread Andy Lutomirski
On Thu, Dec 8, 2016 at 10:08 AM, Josh Poimboeuf  wrote:
> Add the TIF_PATCH_PENDING thread flag to enable the new livepatch
> per-task consistency model for x86_64.  The bit getting set indicates
> the thread has a pending patch which needs to be applied when the thread
> exits the kernel.
>
> The bit is placed in the _TIF_ALLWORK_MASK macro, which results in
> exit_to_usermode_loop() calling klp_update_patch_state() when it's set.
>
> Signed-off-by: Josh Poimboeuf 

Acked-by: Andy Lutomirski 


Re: [RFC, PATCHv1 24/28] x86/mm: add sync_global_pgds() for configuration with 5-level paging

2016-12-08 Thread Andy Lutomirski
On Thu, Dec 8, 2016 at 8:21 AM, Kirill A. Shutemov
 wrote:
> This basically restores slightly modified version of original
> sync_global_pgds() which we had before foldedl p4d was introduced.
>
> The only modification is protection against 'address' overflow.
>
> Signed-off-by: Kirill A. Shutemov 
> ---
>  arch/x86/mm/init_64.c | 47 +++
>  1 file changed, 47 insertions(+)
>
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index a991f5c4c2c4..d637893ac8c2 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -92,6 +92,52 @@ __setup("noexec32=", nonx32_setup);
>   * When memory was added/removed make sure all the processes MM have
>   * suitable PGD entries in the local PGD level page.
>   */
> +#ifdef CONFIG_X86_5LEVEL
> +void sync_global_pgds(unsigned long start, unsigned long end, int removed)
> +{
> +unsigned long address;
> +
> +   for (address = start; address <= end && address >= start;
> +   address += PGDIR_SIZE) {
> +const pgd_t *pgd_ref = pgd_offset_k(address);
> +struct page *page;
> +
> +/*
> + * When it is called after memory hot remove, pgd_none()
> + * returns true. In this case (removed == 1), we must clear
> + * the PGD entries in the local PGD level page.
> + */
> +if (pgd_none(*pgd_ref) && !removed)
> +continue;

This isn't quite specific to your patch, but can we assert that, if
removed=1, then we're not operating on the vmalloc range?  Because if
we do, this will be racy is nasty ways.


Re: [RFC v3 00/10] KVM PCIe/MSI passthrough on ARM/ARM64 and IOVA reserved regions

2016-12-08 Thread Robin Murphy
On 08/12/16 17:01, Alex Williamson wrote:
> On Thu, 8 Dec 2016 13:14:04 +
> Robin Murphy  wrote:
>> On 08/12/16 09:36, Auger Eric wrote:
>>> 3) RMRR reporting in the iommu group sysfs? Joerg: yes; Don: no
>>>My current series does not expose them in iommu group sysfs.
>>>I understand we can expose the RMRR regions in the iomm group sysfs
>>>without necessarily supporting RMRR requiring device assignment.
>>>We can also add this support later.  
>>
>> As you say, reporting them doesn't necessitate allowing device
>> assignment, and it's information which can already be easily grovelled
>> out of dmesg (for intel-iommu at least) - there doesn't seem to be any
>> need to hide them, but the x86 folks can have the final word on that.
> 
> Eric and I talked about this and I don't see the value in identifying
> an RMRR as anything other than a reserved range for a device.  It's not
> userspace's job to maintain an identify mapped range for the device,
> and it can't be trusted to do so anyway.  It does throw a kink in the
> machinery though as an RMRR is a reserved memory range unique to a
> device.  It doesn't really fit into a monolithic /sys/class/iommu view
> of global reserved regions as an RMRR is only relevant to the device
> paths affected.

I think we're in violent agreement then - to clarify, I was thinking in
terms of patch 7 of this series, where everything relevant to a
particular group would be exposed as just an opaque "don't use this
address range" regardless of the internal type.

I'm less convinced the kernel has any need to provide its own 'global'
view of reservations which strictly are always at least per-IOMMU, if
not per-root-complex, even when all the instances do share the same
address by design. The group-based interface fits the reality neatly,
and userspace can easily iterate all the groups if it wants to consider
everything. Plus if it doesn't want to, then it needn't bother reserving
anything which doesn't apply to the group(s) it's going to bind to VFIO.

Robin.

> Another kink is that sometimes we know what the RMRR is for, know that
> it's irrelevant for our use case, and ignore it.  This is true for USB
> and Intel graphics use cases of RMRRs.
> 
> Also, aside from the above mentioned cases, devices with RMRRs are
> currently excluded from participating in the IOMMU API by the
> intel-iommu driver and I expect this to continue in the general case
> regardless of whether the ranges are more easily exposed to userspace.
> ARM may have to deal with mangling a guest memory map due to lack of
> any standard layout, de facto or otherwise, but for x86 I don't think
> it's worth the migration and hotplug implications.  Thanks,
> 
> Alex
> 



[PATCH v18 00/15] acpi, clocksource: add GTDT driver and GTDT support in arm_arch_timer

2016-12-08 Thread fu . wei
From: Fu Wei 

This patchset:
(1)Preparation for adding GTDT support in arm_arch_timer:
1. Move some enums and marcos to header file;
2. Add a new enum for spi type;
3. Improve printk relevant code;
4. Rename some enums and defines;
5. Rework PPI determination;
6. Rework counter frequency detection;
7. Refactor arch_timer_needs_probing, move it into DT init call
8. Introduce some new structs and refactor the MMIO timer init code
for reusing some common code.

(2)Introduce ACPI GTDT parser: drivers/acpi/arm64/acpi_gtdt.c
Parse all kinds of timer in GTDT table of ACPI:arch timer,
memory-mapped timer and SBSA Generic Watchdog timer.
This driver can help to simplify all the relevant timer drivers,
and separate all the ACPI GTDT knowledge from them.

(3)Simplify ACPI code for arm_arch_timer

(4)Add GTDT support for ARM memory-mapped timer.

This patchset has been tested on the following platforms with ACPI enabled:
(1)ARM Foundation v8 model

Changelog:
v18: https://lkml.org/lkml/2016/12/8/
 Fix 8/15 patch problem of "int ret;" in arch_timer_acpi_init
 Rebase to 4.9.0-rc8-g9269898

v17: https://lkml.org/lkml/2016/11/25/140
 Take out some cleanups from 4/15.
 Merge 5/15 and 6/15, improve PPI determination code,
 improve commit message.
 Rework counter frequency detection.
 Move arch_timer_needs_of_probing into DT init call.
 Move Platform Timer scan loop back to timer init call to avoid allocating
 and freeing memory.
 Improve all the exported functions' comment.

v16: https://lkml.org/lkml/2016/11/16/268
 Fix patchset problem about static enum ppi_nr of 01/13 in v15.
 Refactor arch_timer_detect_rate.
 Refactor arch_timer_needs_probing.

v15: https://lkml.org/lkml/2016/11/15/366
 Re-order patches
 Add arm_arch_timer refactoring patches to prepare for GTDT:
 1. rename some  enums and defines, and some cleanups
 2. separate out arch_timer_uses_ppi init code and fix a potential bug
 3. Improve some new structs, refactor the timer init code.
 Since the some structs have been changed, GTDT parser for memory-mapped
 timer and SBSA Generic Watchdog timer have been update.

v14: https://lkml.org/lkml/2016/9/28/573
 Separate memory-mapped timer GTDT support into two patches
 1. Refactor the timer init code to prepare for GTDT
 2. Add GTDT support for memory-mapped timer

v13: http://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1231717.html
 Improve arm_arch_timer code for memory-mapped
 timer GTDT support, refactor original memory-mapped timer
 dt support for reusing some common code.

v12: https://lkml.org/lkml/2016/9/13/250
 Rebase to latest Linux 4.8-rc6
 Delete the confusing "skipping" in the error message.

V11: https://lkml.org/lkml/2016/9/6/354
 Rebase to latest Linux 4.8-rc5
 Delete typedef (suggested by checkpatch.pl)

V10: https://lkml.org/lkml/2016/7/26/215
 Drop the "readq" patch.
 Rebase to latest Linux 4.7.

V9: https://lkml.org/lkml/2016/7/25/345
Improve pr_err message in acpi gtdt driver.
Update Commit message for 7/9
shorten the irq mapping function name
Improve GTDT driver for memory-mapped timer

v8: https://lkml.org/lkml/2016/7/19/660
Improve "pr_fmt(fmt)" definition: add "ACPI" in front of "GTDT",
and also improve printk message.
Simplify is_timer_block and is_watchdog.
Merge acpi_gtdt_desc_init and gtdt_arch_timer_init into acpi_gtdt_init();
Delete __init in include/linux/acpi.h for GTDT API
Make ARM64 select GTDT.
Delete "#include " from acpi_gtdt.c
Simplify GT block parse code.

v7: https://lkml.org/lkml/2016/7/13/769
Move the GTDT driver to drivers/acpi/arm64
Add add the ARM64-specific ACPI Support maintainers in MAINTAINERS
Merge 3 patches of GTDT parser driver.
Fix the for_each_platform_timer bug.

v6: https://lkml.org/lkml/2016/6/29/580
split the GTDT driver to 4 parts: basic, arch_timer, memory-mapped timer,
and SBSA Generic Watchdog timer
Improve driver by suggestions and example code from Daniel Lezcano

v5: https://lkml.org/lkml/2016/5/24/356
Sorting out all patches, simplify the API of GTDT driver:
GTDT driver just fills the data struct for arm_arch_timer driver.

v4: https://lists.linaro.org/pipermail/linaro-acpi/2016-March/006667.html
Delete the kvm relevant patches
Separate two patches for sorting out the code for arm_arch_timer.
Improve irq info export code to allow missing irq info in GTDT table.

v3: https://lkml.org/lkml/2016/2/1/658
Improve GTDT driver code:
  (1)improve pr_* by defining pr_fmt(fmt)
  (2)simplify gtdt_sbsa_gwdt_init
  (3)improve gtdt_arch_timer_data_init, if table is NULL, it will try
  to get GTDT table.
Move enum ppi_nr to arm_arch_timer.h, and add enum spi_nr.
Add 

[PATCH v18 01/15] clocksource/drivers/arm_arch_timer: Move enums and defines to header file

2016-12-08 Thread fu . wei
From: Fu Wei 

To support the arm_arch_timer via ACPI we need to share defines and enums
between the driver and the ACPI parser code.

Split out the relevant defines and enums into arm_arch_timer.h, and
change "enum ppi_nr" to "enum arch_timer_ppi_nr" to avoid the potential
name clashes.
Also switch "enum ppi_nr" to "enum arch_timer_ppi_nr" in
arm_arch_timer.c.
No functional change.

Signed-off-by: Fu Wei 
Acked-by: Mark Rutland 
Tested-by: Xiongfeng Wang 
---
 drivers/clocksource/arm_arch_timer.c | 13 +
 include/clocksource/arm_arch_timer.h | 12 
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 73c487d..21068be 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -51,8 +51,6 @@
 #define CNTV_TVAL  0x38
 #define CNTV_CTL   0x3c
 
-#define ARCH_CP15_TIMERBIT(0)
-#define ARCH_MEM_TIMER BIT(1)
 static unsigned arch_timers_present __initdata;
 
 static void __iomem *arch_counter_base;
@@ -65,20 +63,11 @@ struct arch_timer {
 #define to_arch_timer(e) container_of(e, struct arch_timer, evt)
 
 static u32 arch_timer_rate;
-
-enum ppi_nr {
-   PHYS_SECURE_PPI,
-   PHYS_NONSECURE_PPI,
-   VIRT_PPI,
-   HYP_PPI,
-   MAX_TIMER_PPI
-};
-
 static int arch_timer_ppi[MAX_TIMER_PPI];
 
 static struct clock_event_device __percpu *arch_timer_evt;
 
-static enum ppi_nr arch_timer_uses_ppi = VIRT_PPI;
+static enum arch_timer_ppi_nr arch_timer_uses_ppi = VIRT_PPI;
 static bool arch_timer_c3stop;
 static bool arch_timer_mem_use_virtual;
 
diff --git a/include/clocksource/arm_arch_timer.h 
b/include/clocksource/arm_arch_timer.h
index caedb74..557f869 100644
--- a/include/clocksource/arm_arch_timer.h
+++ b/include/clocksource/arm_arch_timer.h
@@ -16,9 +16,13 @@
 #ifndef __CLKSOURCE_ARM_ARCH_TIMER_H
 #define __CLKSOURCE_ARM_ARCH_TIMER_H
 
+#include 
 #include 
 #include 
 
+#define ARCH_CP15_TIMERBIT(0)
+#define ARCH_MEM_TIMER BIT(1)
+
 #define ARCH_TIMER_CTRL_ENABLE (1 << 0)
 #define ARCH_TIMER_CTRL_IT_MASK(1 << 1)
 #define ARCH_TIMER_CTRL_IT_STAT(1 << 2)
@@ -34,6 +38,14 @@ enum arch_timer_reg {
ARCH_TIMER_REG_TVAL,
 };
 
+enum arch_timer_ppi_nr {
+   PHYS_SECURE_PPI,
+   PHYS_NONSECURE_PPI,
+   VIRT_PPI,
+   HYP_PPI,
+   MAX_TIMER_PPI
+};
+
 #define ARCH_TIMER_PHYS_ACCESS 0
 #define ARCH_TIMER_VIRT_ACCESS 1
 #define ARCH_TIMER_MEM_PHYS_ACCESS 2
-- 
2.9.3



Re: ATH9 driver issues on ARM64

2016-12-08 Thread Kalle Valo
Bharat Kumar Gogada  writes:

>  > [+cc Kalle, ath9k list]

Thanks, but please also CC linux-wireless. Full thread below for the
folks there.

>> On Thu, Dec 08, 2016 at 01:49:42PM +, Bharat Kumar Gogada wrote:
>> > Hi,
>> >
>> > Did anyone test Atheros ATH9 driver(drivers/net/wireless/ath/ath9k/)
>> > on ARM64.  The end point is TP link wifi card with which supports
>> > only legacy interrupts.
>> 
>> If it works on other arches and the arm64 PCI enumeration works, my
>> first guess would be an INTx issue, e.g., maybe the driver is waiting
>> for an interrupt that never arrives.
> We are not sure for now.
>> 
>> > We are trying to test it on ARM64 with
>> > (drivers/pci/host/pcie-xilinx-nwl.c) as root port.
>> >
>> > EP is getting enumerated and able to link up.
>> >
>> > But when we start scan system gets hanged.
>> 
>> When you say the system hangs when you start a scan, I assume you mean
>> a wifi scan, not the PCI enumeration.  A problem with a wifi scan
>> might cause a *process* to hang, but it shouldn't hang the entire
>> system.
>> 
> Yes wifi scan.
>> > When we took trace we see that after we start scan assert message is
>> > sent but there is no de assert from end point.
>> 
>> Are you talking about a trace from a PCIe analyzer?  Do you see an
>> Assert_INTx PCIe message on the link?
>> 
> Yes lecroy trace, yes we do see Assert_INTx and Deassert_INTx happening when 
> we do interface link up.
> When we have less debug prints in Atheros driver, and do wifi scan we see 
> Assert_INTx but never Deassert_INTx, 
>> > What might cause end point not sending de assert ?
>> 
>> If the endpoint doesn't send a Deassert_INTx message, I expect that
>> would mean the driver didn't service the interrupt and remove the
>> condition that caused the device to assert the interrupt in the first
>> place.
>> 
>> If the driver didn't receive the interrupt, it couldn't service it, of
>> course.  You could add a printk in the ath9k interrupt service
>> routine to see if you ever get there.
>>
> The interrupt behavior is changing w.r.t amount of debug prints we add. (I 
> kept many prints to aid debug)
> root@Xilinx-ZCU102-2016_3:~# iw dev wlan0 scan
> [   83.064675] ath9k: ath9k_iowrite32 ff800a400024
> [   83.069486] ath9k: ath9k_ioread32 ff800a400024
> [   83.074257] ath9k_hw_kill_interrupts793
> [   83.078260] ath9k: ath9k_iowrite32 ff800a400024
> [   83.083107] ath9k: ath9k_ioread32 ff800a400024
> [   83.087882] ath9k_hw_kill_interrupts793
> [   83.095450] ath9k_hw_enable_interrupts  821
> [   83.099557] ath9k_hw_enable_interrupts  825
> [   83.103721] ath9k_hw_enable_interrupts  832
> [   83.107887] ath9k: ath9k_iowrite32 ff800a400024
> [   83.112748] AR_SREV_9100 0
> [   83.115438] ath9k_hw_enable_interrupts  848
> [   83.119607] ath9k: ath9k_ioread32 ff800a400024
> [   83.124389] ath9k_hw_intrpend   762
> [   83.127761] (AR_SREV_9340(ah) val 0
> [   83.131234] ath9k_hw_intrpend   767
> [   83.134628] ath_isr 603
> [   83.137134] ath9k: ath9k_iowrite32 ff800a400024
> [   83.141995] ath9k: ath9k_ioread32 ff800a400024
> [   83.146771] ath9k_hw_kill_interrupts793
> [   83.150864] ath9k_hw_enable_interrupts  821
> [   83.154971] ath9k_hw_enable_interrupts  825
> [   83.159135] ath9k_hw_enable_interrupts  832
> [   83.163300] ath9k: ath9k_iowrite32 ff800a400024
> [   83.168161] AR_SREV_9100 0
> [   83.170852] ath9k_hw_enable_interrupts  848
> [   83.170855] ath9k_hw_intrpend   762
> [   83.178398] (AR_SREV_9340(ah) val 0
> [   83.181873] ath9k_hw_intrpend   767
> [   83.185265] ath_isr 603
> [   83.187773] ath9k: ath9k_iowrite32 ff800a400024
> [   83.192635] ath9k: ath9k_ioread32 ff800a400024
> [   83.197411] ath9k_hw_kill_interrupts793
> [   83.201414] ath9k: ath9k_ioread32 ff800a400024
> [   83.206258] ath9k_hw_enable_interrupts  821
> [   83.210368] ath9k_hw_enable_interrupts  825
> [   83.214531] ath9k_hw_enable_interrupts  832
> [   83.218698] ath9k: ath9k_iowrite32 ff800a400024
> [   83.223558] AR_SREV_9100 0
> [   83.226243] ath9k_hw_enable_interrupts  848
> [   83.226246] ath9k_hw_intrpend   762
> [   83.233794] (AR_SREV_9340(ah) val 0
> [   83.237268] ath9k_hw_intrpend   767
> [   83.240661] ath_isr 603
> [   83.243169] ath9k: ath9k_iowrite32 ff800a400024
> [   83.248030] ath9k: ath9k_ioread32 ff800a400024
> [   83.252806] ath9k_hw_kill_interrupts793
> [   83.256811] ath9k: ath9k_ioread32 ff800a400024
> [   83.261651] ath9k_hw_enable_interrupts  821
> [   83.265753] ath9k_hw_enable_interrupts  825
> [   83.269919] ath9k_hw_enable_interrupts  832
> [   83.274083] ath9k: ath9k_iowrite32 ff800a400024
> [   83.278945] AR_SREV_9100 0
> [   83.281630] ath9k_hw_enable_interrupts  848
> [   83.281633] ath9k_hw_intrpend   762
> [   83.281634] (AR_SREV_9340(ah) 

Re: [PATCH] x86/vm86: fix compilation warning on a unused variable

2016-12-08 Thread Jérémy Lefaure
On Thu, 8 Dec 2016 09:33:05 +0100
Borislav Petkov  wrote:

> On Wed, Dec 07, 2016 at 11:38:33PM -0500, Jérémy Lefaure wrote:
> > When CONFIG_TRANSPARENT_HUGEPAGE is disabled, split_huge_pmd is a no-op
> > stub. In such case, vma is unused and a compiler raises a warning:
> > 
> > arch/x86/kernel/vm86_32.c: In function ‘mark_screen_rdonly’:
> > arch/x86/kernel/vm86_32.c:180:26: warning: unused variable ‘vma’
> > [-Wunused-variable]
> >struct vm_area_struct *vma = find_vma(mm, 0xA);
> >  ^~~
> > Adding __maybe_unused in the vma declaration fixes this warning.
> > 
> > In addition, checking if CONFIG_TRANSPARENT_HUGEPAGE is enabled avoids
> > calling find_vma function for nothing.
> > 
> > Signed-off-by: Jérémy Lefaure 
> > ---
> >  arch/x86/kernel/vm86_32.c | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
> > index 01f30e5..0813b76 100644
> > --- a/arch/x86/kernel/vm86_32.c
> > +++ b/arch/x86/kernel/vm86_32.c
> > @@ -176,8 +176,9 @@ static void mark_screen_rdonly(struct mm_struct *mm)
> > goto out;
> > pmd = pmd_offset(pud, 0xA);
> >  
> > -   if (pmd_trans_huge(*pmd)) {
> > -   struct vm_area_struct *vma = find_vma(mm, 0xA);
> > +   if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_trans_huge(*pmd)) {
> > +   struct vm_area_struct __maybe_unused *vma = find_vma(mm,
> > +0xA);  
> 
> So wouldn't the __maybe_unused alone without changing the if-condition
> fix the warning too?
> 

Yes it will. I did not see that pmd_trans_huge returns 0 if
CONFIG_TRANSPARENT_HUGEPAGE is disabled. So you're right, the
IS_ENABLED(...) in the condition is useless.

Thanks,
Jérémy


Re: [PATCH 0/2] Determine kernel text mapping size at runtime for x86_64

2016-12-08 Thread Kees Cook
On Wed, Dec 7, 2016 at 11:56 PM, Baoquan He  wrote:
> Dave Anderson ever told in Crash utility he makes judgement whether it's
> a kaslr kernel by size of KERNEL_IMAGE_SIZE. As long as it's 1G, it's
> recognized as kaslr. Then the current upstream kernel has a wrong behaviour,
> it sets KERNEL_IMAGE_SIZE as 1G as long as CONFIG_RANDOMIZE_BASE is enabled,
> though people specify "nokaslr" into cmdline to disable kaslr explicitly.

I'm not sure that's the correct solution to the Crash utility -- the
kaslr-ness of a kernel should be already exposed in the dump with the
kaslr_enabled variable yes?

> So in this patchset, made changes to determine the size of kernel text mapping
> area at runtime. If "nokaslr" specified, kernel mapping size is 512M though
> CONFIG_RANDOMIZE_BASE is enabled.

This seems to make the non-KASLR case more consistent, so I'm fine
with the idea. Once the build-bots are happy with everything, consider
the series:

Acked-by: Kees Cook 

Thanks!

-Kees

>
> Baoquan He (2):
>   x86/64: Make kernel text mapping always take one whole page table in
> early boot code
>   x86/KASLR/64: Determine kernel text mapping size at runtime
>
>  arch/x86/boot/compressed/kaslr.c| 15 ++-
>  arch/x86/include/asm/kaslr.h|  1 +
>  arch/x86/include/asm/page_64_types.h| 20 
>  arch/x86/include/asm/pgtable_64_types.h |  2 +-
>  arch/x86/kernel/head64.c| 11 ++-
>  arch/x86/kernel/head_64.S   | 16 +---
>  arch/x86/mm/dump_pagetables.c   |  3 ++-
>  arch/x86/mm/init_64.c   |  2 +-
>  arch/x86/mm/physaddr.c  |  6 +++---
>  9 files changed, 45 insertions(+), 31 deletions(-)
>
> --
> 2.5.5
>



-- 
Kees Cook
Nexus Security


Re: [PATCH v2 1/3] perf: add PERF_RECORD_NAMESPACES to include namespaces related info

2016-12-08 Thread Hari Bathini

Hi Peter,


Sorry for taking so long to respond...


On Thursday 24 November 2016 08:40 PM, Peter Zijlstra wrote:

On Thu, Nov 24, 2016 at 08:14:29PM +0530, Hari Bathini wrote:

@@ -862,6 +875,19 @@ enum perf_event_type {
 */
PERF_RECORD_SWITCH_CPU_WIDE = 15,
  
+	/*

+* struct {
+*  struct perf_event_headerheader;
+*
+*  u32 pid, tid;
+*  u64 time;

pid,tid and time are already present in sample_id. Many of the 'legacy'
record have redundant information since we added sample_id, but most of
the new ones haven't and rely on sample_all being set.


I tried using pid/tid from sample data, but realized that pid/tid in 
event_id

could be different from the one in sample data, at least for fork/namespaces
events, since __perf_event_header__init_id( ) that updates the sample data
is getting the pid/tid of current task.

I am not sure if it is advisable to change __perf_event_header__init_id( 
) for this..?


Thanks
Hari



Re: [RFC, PATCHv1 16/28] x86/asm: remove __VIRTUAL_MASK_SHIFT==47 assert

2016-12-08 Thread Andy Lutomirski
On Thu, Dec 8, 2016 at 8:21 AM, Kirill A. Shutemov
 wrote:
> We don't need it anymore. 17be0aec74fb ("x86/asm/entry/64: Implement
> better check for canonical addresses") made canonical address check
> generic wrt. address width.

This code existed in part to remind us that this needs very careful
adjustment when the paging size becomes dynamic.  If you want to
remove it, please add test cases to tools/testing/selftests/x86 that
verify:

a. Either mmap(2^47-4096, ..., MAP_FIXED, ...) fails or that, if it
succeeds and you put a syscall instruction at the very end, that
invoking the syscall instruction there works.  The easiest way to do
this may be to have the selftest literally have a page of text that
has 4094 0xcc bytes and a syscall and to map that page or perhaps move
it into place with mremap.  That will avoid annoying W^X userspace
stuff from messing up the test.  You'll need to handle the signal when
you fall off the end of the world after the syscall.

b. Ditto for the new highest possible userspace page.

c. Ditto for one page earlier to make sure that your test actually works.

d. For each possible maximum address, call raise(SIGUSR1) and, in the
signal handler, change RIP to point to the first noncanonical address
and RCX to match RIP.  Return and catch the resulting exception.  This
may be easy to integrate into the sigreturn tests, and I can help with
that.

--Andy


Re: [RFC, PATCHv1 22/28] x86/espfix: support 5-level paging

2016-12-08 Thread Andy Lutomirski
On Thu, Dec 8, 2016 at 8:21 AM, Kirill A. Shutemov
 wrote:
> XXX: how to test this?

tools/testing/selftests/x86/sigreturn_{32,64}


[RFC 00/10] kmod: stress test driver, few fixes and enhancements

2016-12-08 Thread Luis R. Rodriguez
Upon running into an old kmod v19 issue with mount (get_fs_type()) a few of us
hunted for the cause of the issue. Although the issue ended up being a
userspace issue, a stress test driver was written to help reproduce the issue,
and along the way a few other fixes and sanity checks were implemented.

I've taken the time to generalize the stress test driver as a kselftest driver
with a 9 test cases. The last two test cases reveal an existing issue which
is not yet addressed upstream, even if you have kmod v19 present. A fix is
proposed in the last patch. Originally we had discarded this patch as too
complex due to the alias handling, but upon further analysis of test cases
and memory pressure issues, it seems worth considering. Other than the
last patch I don't think much of the other patches are controversial, but
sending as RFC first just in case.

If its not clear, an end goal here is to make module loading a bit more
deterministic with stronger sanity checks and stress tests. Please note,
the stress test driver requires 4 GiB of RAM to run all tests without running
out of memory. A lot of this has to do with the memory requirements needed
for a dynamic test for multiple threads, but note that the final memory
pressure and OOMs actually don't come from this allocation, but instead
from many finit_module() calls, this consumes quite a bit of memory, especially
if you have a lot of dependencies which also need to be loaded prior to
your needed module -- as is the case for filesystem drivers.

These patches are available on my linux-next git-tree on my branch
20161208-kmod-test-driver-try2 [0], which is based on linux-next tag
next-20161208. Patches are also available based on v4.9-rc8 [1] for
those looking for a bit more stable tree given x86_64 on linux-next is
hosed at the moment.

Since kmod.c doesn't seem to get much love, and since I've been digging
quite a bit into it for other users (firmware) I suppose I could volunteer
myself to maintain this code as well, unless there are oppositions to this.

[0] 
https://git.kernel.org/cgit/linux/kernel/git/mcgrof/linux-next.git/log/?h=20161208-kmod-test-driver-try2
[1] 
https://git.kernel.org/cgit/linux/kernel/git/mcgrof/linux.git/log/?h=20161208-kmod-test-driver

Luis R. Rodriguez (10):
  kmod: add test driver to stress test the module loader
  module: fix memory leak on early load_module() failures
  kmod: add dynamic max concurrent thread count
  kmod: provide wrappers for kmod_concurrent inc/dec
  kmod: return -EBUSY if modprobe limit is reached
  kmod: provide sanity check on kmod_concurrent access
  kmod: use simplified rate limit printk
  sysctl: add support for unsigned int properly
  kmod: add helpers for getting kmod count and limit
  kmod: add a sanity check on module loading

 Documentation/admin-guide/kernel-parameters.txt |7 +
 include/linux/kmod.h|9 +
 include/linux/sysctl.h  |3 +
 init/Kconfig|   23 +
 init/main.c |1 +
 kernel/kmod.c   |  244 -
 kernel/module.c |   12 +-
 kernel/sysctl.c |  198 +++-
 lib/Kconfig.debug   |   25 +
 lib/Makefile|1 +
 lib/test_kmod.c | 1248 +++
 tools/testing/selftests/kmod/Makefile   |   11 +
 tools/testing/selftests/kmod/config |7 +
 tools/testing/selftests/kmod/kmod.sh|  448 
 14 files changed, 2199 insertions(+), 38 deletions(-)
 create mode 100644 lib/test_kmod.c
 create mode 100644 tools/testing/selftests/kmod/Makefile
 create mode 100644 tools/testing/selftests/kmod/config
 create mode 100755 tools/testing/selftests/kmod/kmod.sh

-- 
2.10.1



[RFC 01/10] kmod: add test driver to stress test the module loader

2016-12-08 Thread Luis R. Rodriguez
This adds a new stress test driver for kmod: the kernel module loader.
The new stress test driver, test_kmod, is only enabled as a module right
now. It should be possible to load this as built-in and load tests early
(refer to the force_init_test module parameter), however since a lot of
test can get a system out of memory fast we leave this disabled for now.

Using a system with 1024 MiB of RAM can *easily* get your kernel
OOM fast with this test driver.

The test_kmod driver exposes API knobs for us to fine tune simple
request_module() and get_fs_type() calls. Since these API calls
only allow each one parameter a test driver for these is rather
simple. Other factors that can help out test driver though are
the number of calls we issue and knowing current limitations of
each. This exposes configuration as much as possible through
userspace to be able to build tests directly from userspace.

Since it allows multiple misc devices its will eventually (once we
add a knob to let us create new devices at will) also be possible to
perform more tests in parallel, provided you have enough memory.

We only enable tests we know work as of right now.

Demo screenshots:

 # tools/testing/selftests/kmod/kmod.sh
kmod_test_0001_driver: OK! - loading kmod test
kmod_test_0001_driver: OK! - Return value: 256 (MODULE_NOT_FOUND), expected 
MODULE_NOT_FOUND
kmod_test_0001_fs: OK! - loading kmod test
kmod_test_0001_fs: OK! - Return value: -22 (-EINVAL), expected -EINVAL
kmod_test_0002_driver: OK! - loading kmod test
kmod_test_0002_driver: OK! - Return value: 256 (MODULE_NOT_FOUND), expected 
MODULE_NOT_FOUND
kmod_test_0002_fs: OK! - loading kmod test
kmod_test_0002_fs: OK! - Return value: -22 (-EINVAL), expected -EINVAL
kmod_test_0003: OK! - loading kmod test
kmod_test_0003: OK! - Return value: 0 (SUCCESS), expected SUCCESS
kmod_test_0004: OK! - loading kmod test
kmod_test_0004: OK! - Return value: 0 (SUCCESS), expected SUCCESS
kmod_test_0005: OK! - loading kmod test
kmod_test_0005: OK! - Return value: 0 (SUCCESS), expected SUCCESS
kmod_test_0006: OK! - loading kmod test
kmod_test_0006: OK! - Return value: 0 (SUCCESS), expected SUCCESS
kmod_test_0005: OK! - loading kmod test
kmod_test_0005: OK! - Return value: 0 (SUCCESS), expected SUCCESS
kmod_test_0006: OK! - loading kmod test
kmod_test_0006: OK! - Return value: 0 (SUCCESS), expected SUCCESS
Test completed

You can also request for specific tests:

 # tools/testing/selftests/kmod/kmod.sh -t 0001
kmod_test_0001_driver: OK! - loading kmod test
kmod_test_0001_driver: OK! - Return value: 256 (MODULE_NOT_FOUND), expected 
MODULE_NOT_FOUND
kmod_test_0001_fs: OK! - loading kmod test
kmod_test_0001_fs: OK! - Return value: -22 (-EINVAL), expected -EINVAL
Test completed

Lastly, the current available number of tests:

 # tools/testing/selftests/kmod/kmod.sh --help
Usage: tools/testing/selftests/kmod/kmod.sh [ -t <4-number-digit> ]
Valid tests: 0001-0009

0001 - Simple test - 1 thread  for empty string
0002 - Simple test - 1 thread  for modules/filesystems that do not exist
0003 - Simple test - 1 thread  for get_fs_type() only
0004 - Simple test - 2 threads for get_fs_type() only
0005 - multithreaded tests with default setup - request_module() only
0006 - multithreaded tests with default setup - get_fs_type() only
0007 - multithreaded tests with default setup test request_module() and 
get_fs_type()
0008 - multithreaded - push kmod_concurrent over max_modprobes for 
request_module()
0009 - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()

The following test cases currently fail, as such they are not currently
enabled by default:

 # tools/testing/selftests/kmod/kmod.sh -t 0007
 # tools/testing/selftests/kmod/kmod.sh -t 0008
 # tools/testing/selftests/kmod/kmod.sh -t 0009
 # tools/testing/selftests/kmod/kmod.sh -t 0010
 # tools/testing/selftests/kmod/kmod.sh -t 0011

To be sure to run them as intended please unload both of the modules:

  o test_module
  o xfs

And ensure they are not loaded on your system prior to testing them.
If you use these partitions for your rootfs you can change the default
test driver used for get_fs_type() by exporting it into your
environment. For example of other test defaults you can override
refer to kmod.sh allow_user_defaults().

Behind the scenes this is how we fine tune at a test case prior to
hitting a trigger to run it:

cat /sys/devices/virtual/misc/test_kmod0/config
echo -n "2" > /sys/devices/virtual/misc/test_kmod0/config_test_case
echo -n "ext4" > /sys/devices/virtual/misc/test_kmod0/config_test_fs
echo -n "80" > /sys/devices/virtual/misc/test_kmod0/config_num_threads
cat /sys/devices/virtual/misc/test_kmod0/config
echo -n "1" > /sys/devices/virtual/misc/test_kmod0/config_num_threads

Finally to trigger:

echo -n "1" > /sys/devices/virtual/misc/test_kmod0/trigger_config

The kmod.sh script uses the above constructs to build different test cases.

A bit of interpretation of the current failures follows, first two
premises:


[PATCH v2 4/4] ARM: treewide: Replace uses of virt_to_phys with __pa_symbol

2016-12-08 Thread Florian Fainelli
All low-level PM/SMP code using virt_to_phys() should actually use
__pa_symbol() against kernel symbols. Update code where relevant to move
away from virt_to_phys().

Signed-off-by: Florian Fainelli 
---
 arch/arm/common/mcpm_entry.c  | 12 ++--
 arch/arm/mach-alpine/platsmp.c|  2 +-
 arch/arm/mach-axxia/platsmp.c |  2 +-
 arch/arm/mach-bcm/bcm63xx_smp.c   |  2 +-
 arch/arm/mach-bcm/platsmp-brcmstb.c   |  2 +-
 arch/arm/mach-bcm/platsmp.c   |  4 ++--
 arch/arm/mach-berlin/platsmp.c|  2 +-
 arch/arm/mach-exynos/firmware.c   |  4 ++--
 arch/arm/mach-exynos/mcpm-exynos.c|  2 +-
 arch/arm/mach-exynos/platsmp.c|  4 ++--
 arch/arm/mach-exynos/pm.c |  6 +++---
 arch/arm/mach-exynos/suspend.c|  6 +++---
 arch/arm/mach-hisi/platmcpm.c |  2 +-
 arch/arm/mach-hisi/platsmp.c  |  6 +++---
 arch/arm/mach-imx/platsmp.c   |  2 +-
 arch/arm/mach-imx/pm-imx6.c   |  2 +-
 arch/arm/mach-imx/src.c   |  2 +-
 arch/arm/mach-mediatek/platsmp.c  |  2 +-
 arch/arm/mach-mvebu/pm.c  |  2 +-
 arch/arm/mach-mvebu/pmsu.c|  2 +-
 arch/arm/mach-mvebu/system-controller.c   |  2 +-
 arch/arm/mach-omap2/control.c |  8 
 arch/arm/mach-omap2/omap-mpuss-lowpower.c |  8 
 arch/arm/mach-omap2/omap-smp.c|  4 ++--
 arch/arm/mach-prima2/platsmp.c|  2 +-
 arch/arm/mach-prima2/pm.c |  2 +-
 arch/arm/mach-pxa/palmz72.c   |  2 +-
 arch/arm/mach-pxa/pxa25x.c|  2 +-
 arch/arm/mach-pxa/pxa27x.c|  2 +-
 arch/arm/mach-pxa/pxa3xx.c|  2 +-
 arch/arm/mach-realview/platsmp-dt.c   |  2 +-
 arch/arm/mach-rockchip/platsmp.c  |  4 ++--
 arch/arm/mach-rockchip/pm.c   |  2 +-
 arch/arm/mach-s3c24xx/mach-jive.c |  2 +-
 arch/arm/mach-s3c24xx/pm-s3c2410.c|  2 +-
 arch/arm/mach-s3c24xx/pm-s3c2416.c|  2 +-
 arch/arm/mach-s3c64xx/pm.c|  2 +-
 arch/arm/mach-s5pv210/pm.c|  2 +-
 arch/arm/mach-sa1100/pm.c |  2 +-
 arch/arm/mach-shmobile/platsmp-apmu.c |  6 +++---
 arch/arm/mach-shmobile/platsmp-scu.c  |  4 ++--
 arch/arm/mach-socfpga/platsmp.c   |  4 ++--
 arch/arm/mach-spear/platsmp.c |  2 +-
 arch/arm/mach-sti/platsmp.c   |  2 +-
 arch/arm/mach-sunxi/platsmp.c |  4 ++--
 arch/arm/mach-tango/platsmp.c |  2 +-
 arch/arm/mach-tango/pm.c  |  2 +-
 arch/arm/mach-tegra/reset.c   |  4 ++--
 arch/arm/mach-ux500/platsmp.c |  2 +-
 arch/arm/mach-vexpress/dcscb.c|  2 +-
 arch/arm/mach-vexpress/platsmp.c  |  2 +-
 arch/arm/mach-vexpress/tc2_pm.c   |  4 ++--
 arch/arm/mach-zx/platsmp.c|  4 ++--
 arch/arm/mach-zynq/platsmp.c  |  2 +-
 54 files changed, 84 insertions(+), 84 deletions(-)

diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c
index a923524d1040..cf062472e07b 100644
--- a/arch/arm/common/mcpm_entry.c
+++ b/arch/arm/common/mcpm_entry.c
@@ -144,7 +144,7 @@ extern unsigned long 
mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER];
 
 void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
 {
-   unsigned long val = ptr ? virt_to_phys(ptr) : 0;
+   unsigned long val = ptr ? __pa_symbol(ptr) : 0;
mcpm_entry_vectors[cluster][cpu] = val;
sync_cache_w(_entry_vectors[cluster][cpu]);
 }
@@ -299,8 +299,8 @@ void mcpm_cpu_power_down(void)
 * the kernel as if the power_up method just had deasserted reset
 * on the CPU.
 */
-   phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
-   phys_reset(virt_to_phys(mcpm_entry_point));
+   phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset);
+   phys_reset(__pa_symbol(mcpm_entry_point));
 
/* should never get here */
BUG();
@@ -388,8 +388,8 @@ static int __init nocache_trampoline(unsigned long _arg)
__mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
__mcpm_cpu_down(cpu, cluster);
 
-   phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
-   phys_reset(virt_to_phys(mcpm_entry_point));
+   phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset);
+   phys_reset(__pa_symbol(mcpm_entry_point));
BUG();
 }
 
@@ -449,7 +449,7 @@ int __init mcpm_sync_init(
sync_cache_w(_sync);
 
if (power_up_setup) {
-   mcpm_power_up_setup_phys = virt_to_phys(power_up_setup);
+   mcpm_power_up_setup_phys = __pa_symbol(power_up_setup);
sync_cache_w(_power_up_setup_phys);
}
 
diff --git a/arch/arm/mach-alpine/platsmp.c b/arch/arm/mach-alpine/platsmp.c
index 

Re: [PATCH 0/2] Determine kernel text mapping size at runtime for x86_64

2016-12-08 Thread Dave Anderson


- Original Message -
> On Wed, Dec 7, 2016 at 11:56 PM, Baoquan He  wrote:
> > Dave Anderson ever told in Crash utility he makes judgement whether it's
> > a kaslr kernel by size of KERNEL_IMAGE_SIZE. As long as it's 1G, it's
> > recognized as kaslr. Then the current upstream kernel has a wrong behaviour,
> > it sets KERNEL_IMAGE_SIZE as 1G as long as CONFIG_RANDOMIZE_BASE is enabled,
> > though people specify "nokaslr" into cmdline to disable kaslr explicitly.
> 
> I'm not sure that's the correct solution to the Crash utility -- the
> kaslr-ness of a kernel should be already exposed in the dump with the
> kaslr_enabled variable yes?

The crash utility doesn't use KERNEL_IMAGE_SIZE to determine whether
KASLR is in play, but rather to determine the base of the modules virtual
address space (i.e, the same way the kernel does).  And then it uses that
value in a couple other places.

Dave


> 
> > So in this patchset, made changes to determine the size of kernel text
> > mapping
> > area at runtime. If "nokaslr" specified, kernel mapping size is 512M though
> > CONFIG_RANDOMIZE_BASE is enabled.
> 
> This seems to make the non-KASLR case more consistent, so I'm fine
> with the idea. Once the build-bots are happy with everything, consider
> the series:
> 
> Acked-by: Kees Cook 
> 
> Thanks!
> 
> -Kees
> 
> >
> > Baoquan He (2):
> >   x86/64: Make kernel text mapping always take one whole page table in
> > early boot code
> >   x86/KASLR/64: Determine kernel text mapping size at runtime
> >
> >  arch/x86/boot/compressed/kaslr.c| 15 ++-
> >  arch/x86/include/asm/kaslr.h|  1 +
> >  arch/x86/include/asm/page_64_types.h| 20 
> >  arch/x86/include/asm/pgtable_64_types.h |  2 +-
> >  arch/x86/kernel/head64.c| 11 ++-
> >  arch/x86/kernel/head_64.S   | 16 +---
> >  arch/x86/mm/dump_pagetables.c   |  3 ++-
> >  arch/x86/mm/init_64.c   |  2 +-
> >  arch/x86/mm/physaddr.c  |  6 +++---
> >  9 files changed, 45 insertions(+), 31 deletions(-)
> >
> > --
> > 2.5.5
> >
> 
> 
> 
> --
> Kees Cook
> Nexus Security
> 


enabling COMPILE_TEST support for GCC plugins in v4.11

2016-12-08 Thread Kees Cook
Hi,

I'd like to get the GCC plugins building under
allyesconfig/allmodconfig for -next soon (with the intention of
landing the change in v4.11). Specifically, I intend to revert
a519167e753e ("gcc-plugins: disable under COMPILE_TEST").

Right now the plugins are only supported on x86, arm, and arm64,
though powerpc may happen in either v4.10 or v4.11 as well. This means
that the autobuilders for these architectures need to have the "gcc
plugin development" package installed which contains the GCC headers
needed for the plugins. For Debian/Ubuntu, this is gcc-$N-plugin-dev
(and for cross compilers: gcc-$N-plugin-dev-$arch-linux-$abi). For
Fedora, it is gcc-plugin-devel (though I'm not sure the naming for
cross compilers). Manual builds of compilers should already have these
headers installed.

The "noisy" plugin, cyc_complexity, is just an example, and I have
disabled it (which is pending[1] for v4.10). The remaining ones
(sancov and latent_entropy) are what I'm hoping to see tested
tree-wide (with the expectation that more are coming down the road:
initify, randstruct, structleak, constify, ...)

IIUC, the 0day builder already has the headers installed. I tried to
look through linux-next to find all the other folks that do
autobuilding on these architectures; apologies if I've missed anyone.

If you have a moment, applying 215e2aa6c024[1] and reverting
a519167e753e for an allyesconfig/allmodconfig build should let you
know if things are working correctly with headers installed. If anyone
sees any problems, please let me know and I can queue up fixes.

Thanks!

-Kees

[1] 
http://git.kernel.org/cgit/linux/kernel/git/kees/linux.git/commit/?h=for-next/gcc-plugins=215e2aa6c024d27cdbe88e2ea88cb59dcab588eb

-- 
Kees Cook
Nexus Security


Re: [PATCH v2 3/3] kvm: svm: Use the hardware provided GPA instead of page walk

2016-12-08 Thread Brijesh Singh

Hi Paolo,

On 12/08/2016 09:39 AM, Brijesh Singh wrote:

Hi Paolo,

On 12/08/2016 08:52 AM, Paolo Bonzini wrote:



On 23/11/2016 18:02, Brijesh Singh wrote:

From: Tom Lendacky 

When a guest causes a NPF which requires emulation, KVM sometimes walks
the guest page tables to translate the GVA to a GPA. This is unnecessary
most of the time on AMD hardware since the hardware provides the GPA in
EXITINFO2.

The only exception cases involve string operations involving rep or
operations that use two memory locations. With rep, the GPA will only be
the value of the initial NPF and with dual memory locations we won't
know
which memory address was translated into EXITINFO2.

Signed-off-by: Tom Lendacky 
Reviewed-by: Borislav Petkov 
Signed-off-by: Brijesh Singh 


Tom, Brijesh,

I would like to confirm that you have run kvm-unit-tests with this patch?
I haven't yet tried AMD (will do before sending the pull request to
Linus),
but a similar patch for Intel gives me these 4 failures on emulator.flat:

FAIL: push mem
FAIL: pop mem
FAIL: cross-page mmio read
FAIL: cross-page mmio write



I did not ran kvm-unit-tests. However, I was able to boot Linux
guest and run some stress test. I will run kvm-unit-tests and let
you know soon.



I am able to reproduce it on AMD HW using kvm-unit-tests. Looking at 
test, the initial thought is "push mem" has two operands (the memory 
being pushed and the stack pointer). The provided GPA could be either 
one of those.


If we can detect those cases, we should not set the gpa_available on 
them (like what we do with string move).


We probably haven't hit this case in guest booting. Will investigate bit 
further and provide a updated patch to handle it.



-Brijesh

The VMX patch to set gpa_available is just this:

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 25d48380c312..5d7b60d4795b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6393,6 +6393,7 @@ static int handle_ept_violation(struct kvm_vcpu
*vcpu)
 /* ept page table is present? */
 error_code |= (exit_qualification & 0x38) != 0;

+vcpu->arch.gpa_available = true;
 vcpu->arch.exit_qualification = exit_qualification;

 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -6410,6 +6411,7 @@ static int handle_ept_misconfig(struct kvm_vcpu
*vcpu)
 }

 ret = handle_mmio_page_fault(vcpu, gpa, true);
+vcpu->arch.gpa_available = true;
 if (likely(ret == RET_MMIO_PF_EMULATE))
 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
   EMULATE_DONE;
@@ -8524,6 +8526,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 u32 vectoring_info = vmx->idt_vectoring_info;

 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+vcpu->arch.gpa_available = false;

 /*
  * Flush logged GPAs PML buffer, this will make dirty_bitmap more

Thanks,

Paolo


---
 arch/x86/include/asm/kvm_emulate.h |3 +++
 arch/x86/include/asm/kvm_host.h|3 +++
 arch/x86/kvm/svm.c |2 ++
 arch/x86/kvm/x86.c |   17 -
 4 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h
b/arch/x86/include/asm/kvm_emulate.h
index e9cd7be..2d1ac09 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -344,6 +344,9 @@ struct x86_emulate_ctxt {
 struct read_cache mem_read;
 };

+/* String operation identifier (matches the definition in emulate.c) */
+#define CTXT_STRING_OP(1 << 13)
+
 /* Repeat String Operation Prefix */
 #define REPE_PREFIX0xf3
 #define REPNE_PREFIX0xf2
diff --git a/arch/x86/include/asm/kvm_host.h
b/arch/x86/include/asm/kvm_host.h
index 77cb3f9..fd5b1c8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -668,6 +668,9 @@ struct kvm_vcpu_arch {

 int pending_ioapic_eoi;
 int pending_external_vector;
+
+/* GPA available (AMD only) */
+bool gpa_available;
 };

 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5e64e656..1bbd04c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4246,6 +4246,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 return 1;
 }

+vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF);
+
 return svm_exit_handlers[exit_code](svm);
 }

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c30f62dc..5002eea 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4441,7 +4441,19 @@ static int vcpu_mmio_gva_to_gpa(struct
kvm_vcpu *vcpu, unsigned long gva,
 return 1;
 }

-*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access,
exception);
+/*
+ * If the exit was due to a NPF we may already have a GPA.
+ * If the GPA is present, use it to avoid the GVA to GPA table
+ * walk. Note, this cannot be used on string operations since
+ * 

[PATCH v2 1/4] mtd: lart: Rename partition defines to be prefixed with PART_

2016-12-08 Thread Florian Fainelli
In preparation for defining KERNEL_START on ARM, rename KERNEL_START to
PART_KERNEL_START, and to be consistent, do this for all
partition-related constants.

Signed-off-by: Florian Fainelli 
---
 drivers/mtd/devices/lart.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/mtd/devices/lart.c b/drivers/mtd/devices/lart.c
index 82bd00af5cc3..268aae45b514 100644
--- a/drivers/mtd/devices/lart.c
+++ b/drivers/mtd/devices/lart.c
@@ -75,18 +75,18 @@ static char module_name[] = "lart";
 
 /* blob */
 #define NUM_BLOB_BLOCKSFLASH_NUMBLOCKS_16m_PARAM
-#define BLOB_START 0x
-#define BLOB_LEN   (NUM_BLOB_BLOCKS * 
FLASH_BLOCKSIZE_PARAM)
+#define PART_BLOB_START0x
+#define PART_BLOB_LEN  (NUM_BLOB_BLOCKS * FLASH_BLOCKSIZE_PARAM)
 
 /* kernel */
 #define NUM_KERNEL_BLOCKS  7
-#define KERNEL_START   (BLOB_START + BLOB_LEN)
-#define KERNEL_LEN (NUM_KERNEL_BLOCKS * 
FLASH_BLOCKSIZE_MAIN)
+#define PART_KERNEL_START  (PART_BLOB_START + PART_BLOB_LEN)
+#define PART_KERNEL_LEN(NUM_KERNEL_BLOCKS * 
FLASH_BLOCKSIZE_MAIN)
 
 /* initial ramdisk */
 #define NUM_INITRD_BLOCKS  24
-#define INITRD_START   (KERNEL_START + KERNEL_LEN)
-#define INITRD_LEN (NUM_INITRD_BLOCKS * 
FLASH_BLOCKSIZE_MAIN)
+#define PART_INITRD_START  (PART_KERNEL_START + PART_KERNEL_LEN)
+#define PART_INITRD_LEN(NUM_INITRD_BLOCKS * 
FLASH_BLOCKSIZE_MAIN)
 
 /*
  * See section 4.0 in "3 Volt Fast Boot Block Flash Memory" Intel Datasheet
@@ -587,20 +587,20 @@ static struct mtd_partition lart_partitions[] = {
/* blob */
{
.name   = "blob",
-   .offset = BLOB_START,
-   .size   = BLOB_LEN,
+   .offset = PART_BLOB_START,
+   .size   = PART_BLOB_LEN,
},
/* kernel */
{
.name   = "kernel",
-   .offset = KERNEL_START, /* MTDPART_OFS_APPEND */
-   .size   = KERNEL_LEN,
+   .offset = PART_KERNEL_START,/* MTDPART_OFS_APPEND */
+   .size   = PART_KERNEL_LEN,
},
/* initial ramdisk / file system */
{
.name   = "file system",
-   .offset = INITRD_START, /* MTDPART_OFS_APPEND */
-   .size   = INITRD_LEN,   /* MTDPART_SIZ_FULL */
+   .offset = PART_INITRD_START,/* MTDPART_OFS_APPEND */
+   .size   = PART_INITRD_LEN,  /* MTDPART_SIZ_FULL */
}
 };
 #define NUM_PARTITIONS ARRAY_SIZE(lart_partitions)
-- 
2.9.3



[PATCH v2 2/4] ARM: Define KERNEL_START and KERNEL_END

2016-12-08 Thread Florian Fainelli
In preparation for adding CONFIG_DEBUG_VIRTUAL support, define a set of
common constants: KERNEL_START and KERNEL_END which abstract
CONFIG_XIP_KERNEL vs. !CONFIG_XIP_KERNEL. Update the code where
relevant.

Signed-off-by: Florian Fainelli 
---
 arch/arm/include/asm/memory.h | 7 +++
 arch/arm/mm/init.c| 7 ++-
 arch/arm/mm/mmu.c | 6 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index 76cbd9c674df..bee7511c5098 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -111,6 +111,13 @@
 
 #endif /* !CONFIG_MMU */
 
+#ifdef CONFIG_XIP_KERNEL
+#define KERNEL_START   _sdata
+#else
+#define KERNEL_START   _stext
+#endif
+#define KERNEL_END _end
+
 /*
  * We fix the TCM memories max 32 KiB ITCM resp DTCM at these
  * locations
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 370581aeb871..c87d0d5b65f2 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -230,11 +230,8 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, 
phys_addr_t align)
 void __init arm_memblock_init(const struct machine_desc *mdesc)
 {
/* Register the kernel text, kernel data and initrd with memblock. */
-#ifdef CONFIG_XIP_KERNEL
-   memblock_reserve(__pa(_sdata), _end - _sdata);
-#else
-   memblock_reserve(__pa(_stext), _end - _stext);
-#endif
+   memblock_reserve(__pa(KERNEL_START), _end - KERNEL_START);
+
 #ifdef CONFIG_BLK_DEV_INITRD
/* FDT scan will populate initrd_start */
if (initrd_start && !phys_initrd_size) {
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 4001dd15818d..f0fd1a2db036 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1437,11 +1437,7 @@ static void __init kmap_init(void)
 static void __init map_lowmem(void)
 {
struct memblock_region *reg;
-#ifdef CONFIG_XIP_KERNEL
-   phys_addr_t kernel_x_start = round_down(__pa(_sdata), SECTION_SIZE);
-#else
-   phys_addr_t kernel_x_start = round_down(__pa(_stext), SECTION_SIZE);
-#endif
+   phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), 
SECTION_SIZE);
phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);
 
/* Map all the lowmem memory banks. */
-- 
2.9.3



[PATCH v2 3/4] ARM: Add support for CONFIG_DEBUG_VIRTUAL

2016-12-08 Thread Florian Fainelli
x86 has an option: CONFIG_DEBUG_VIRTUAL to do additional checks on
virt_to_phys calls. The goal is to catch users who are calling
virt_to_phys on non-linear addresses immediately. This includes callers
using __virt_to_phys() on image addresses instead of __pa_symbol(). This
is a generally useful debug feature to spot bad code (particularly in
drivers).

Signed-off-by: Florian Fainelli 
---
 arch/arm/Kconfig  |  1 +
 arch/arm/include/asm/memory.h | 16 --
 arch/arm/mm/Makefile  |  1 +
 arch/arm/mm/physaddr.c| 51 +++
 4 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm/mm/physaddr.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b5d529fdffab..5e66173c5787 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2,6 +2,7 @@ config ARM
bool
default y
select ARCH_CLOCKSOURCE_DATA
+   select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index bee7511c5098..d90300193adf 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -213,7 +213,7 @@ extern const void *__pv_table_begin, *__pv_table_end;
: "r" (x), "I" (__PV_BITS_31_24)\
: "cc")
 
-static inline phys_addr_t __virt_to_phys(unsigned long x)
+static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
 {
phys_addr_t t;
 
@@ -245,7 +245,7 @@ static inline unsigned long __phys_to_virt(phys_addr_t x)
 #define PHYS_OFFSETPLAT_PHYS_OFFSET
 #define PHYS_PFN_OFFSET((unsigned long)(PHYS_OFFSET >> PAGE_SHIFT))
 
-static inline phys_addr_t __virt_to_phys(unsigned long x)
+static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
 {
return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
 }
@@ -261,6 +261,16 @@ static inline unsigned long __phys_to_virt(phys_addr_t x)
unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT) + \
 PHYS_PFN_OFFSET)
 
+#define __pa_symbol_nodebug(x) __virt_to_phys_nodebug((x))
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern phys_addr_t __virt_to_phys(unsigned long x);
+extern phys_addr_t __phys_addr_symbol(unsigned long x);
+#else
+#define __virt_to_phys(x)  __virt_to_phys_nodebug(x)
+#define __phys_addr_symbol(x)  __pa_symbol_nodebug(x)
+#endif
+
 /*
  * These are *only* valid on the kernel direct mapped RAM memory.
  * Note: Drivers should NOT use these.  They are the wrong
@@ -283,9 +293,11 @@ static inline void *phys_to_virt(phys_addr_t x)
  * Drivers should NOT use these either.
  */
 #define __pa(x)__virt_to_phys((unsigned long)(x))
+#define __pa_symbol(x) __phys_addr_symbol(RELOC_HIDE((unsigned 
long)(x), 0))
 #define __va(x)((void 
*)__phys_to_virt((phys_addr_t)(x)))
 #define pfn_to_kaddr(pfn)  __va((phys_addr_t)(pfn) << PAGE_SHIFT)
 
+
 extern long long arch_phys_to_idmap_offset;
 
 /*
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index e8698241ece9..b3dea80715b4 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -14,6 +14,7 @@ endif
 
 obj-$(CONFIG_ARM_PTDUMP)   += dump.o
 obj-$(CONFIG_MODULES)  += proc-syms.o
+obj-$(CONFIG_DEBUG_VIRTUAL)+= physaddr.o
 
 obj-$(CONFIG_ALIGNMENT_TRAP)   += alignment.o
 obj-$(CONFIG_HIGHMEM)  += highmem.o
diff --git a/arch/arm/mm/physaddr.c b/arch/arm/mm/physaddr.c
new file mode 100644
index ..0288760306ce
--- /dev/null
+++ b/arch/arm/mm/physaddr.c
@@ -0,0 +1,51 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include "mm.h"
+
+static inline bool __virt_addr_valid(unsigned long x)
+{
+   /* high_memory does not get immediately defined, and there
+* are early callers of __pa() against PAGE_OFFSET, just catch
+* these here, then do normal checks, with the exception of
+* MAX_DMA_ADDRESS.
+*/
+   if ((x >= PAGE_OFFSET && !high_memory) ||
+  (x >= PAGE_OFFSET &&
+   high_memory && x < (unsigned long)high_memory) ||
+   x == MAX_DMA_ADDRESS)
+   return true;
+
+   return false;
+}
+
+phys_addr_t __virt_to_phys(unsigned long x)
+{
+   WARN(!__virt_addr_valid(x),
+"virt_to_phys used for non-linear address: %pK (%pS)\n",
+(void *)x,
+(void *)x);
+
+   return __virt_to_phys_nodebug(x);
+}
+EXPORT_SYMBOL(__virt_to_phys);
+
+phys_addr_t __phys_addr_symbol(unsigned long x)
+{
+   /* This is bounds checking against the kernel image only.
+* __pa_symbol should only be used on kernel symbol addresses.
+*/
+   VIRTUAL_BUG_ON(x < (unsigned long)KERNEL_START ||
+  x > (unsigned long)KERNEL_END);
+
+   return 

[GIT PULL] SCSI fixes for 4.9-rc8

2016-12-08 Thread James Bottomley
One small fix for a regression in a prior fix (again).  This time the
condition in the prior fix BUG_ON proved to be wrong under certain
circumstances causing a BUG to trigger where it shouldn't in the lpfc
driver.

The patch is available here:

git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git scsi-fixes

The short changelog is:

Mauricio Faria de Oliveira (1):
  scsi: lpfc: fix oops/BUG in lpfc_sli_ringtxcmpl_put()

And the diffstat:

 drivers/scsi/lpfc/lpfc_sli.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

With full diff below.

James

---

diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index c532605..f4f77c5 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -1323,18 +1323,20 @@ lpfc_sli_ringtxcmpl_put(struct lpfc_hba *phba, struct 
lpfc_sli_ring *pring,
 {
lockdep_assert_held(>hbalock);
 
-   BUG_ON(!piocb || !piocb->vport);
+   BUG_ON(!piocb);
 
list_add_tail(>list, >txcmplq);
piocb->iocb_flag |= LPFC_IO_ON_TXCMPLQ;
 
if ((unlikely(pring->ringno == LPFC_ELS_RING)) &&
   (piocb->iocb.ulpCommand != CMD_ABORT_XRI_CN) &&
-  (piocb->iocb.ulpCommand != CMD_CLOSE_XRI_CN) &&
-   (!(piocb->vport->load_flag & FC_UNLOADING)))
-   mod_timer(>vport->els_tmofunc,
- jiffies +
- msecs_to_jiffies(1000 * (phba->fc_ratov << 1)));
+  (piocb->iocb.ulpCommand != CMD_CLOSE_XRI_CN)) {
+   BUG_ON(!piocb->vport);
+   if (!(piocb->vport->load_flag & FC_UNLOADING))
+   mod_timer(>vport->els_tmofunc,
+ jiffies +
+ msecs_to_jiffies(1000 * (phba->fc_ratov << 
1)));
+   }
 
return 0;
 }


Re: crashkernel only allowing smaller allocations as kernel version increases

2016-12-08 Thread Robert LeBlanc
OK, after looking through some code, it looks like the behavior was
changed (back?) to only allocating under 896M with commit
55a20ee7804ab64ac90bcdd4e2868a42829e2784. In order to allocate more
RAM than that, you have to specify crashkernel=1024M,high which
reserves some RAM in low memory and the requested amount in high
memory. This is not explained in Documentation/kdump/kdump.txt. I'll
work on a pull request for Documentation.

Robert LeBlanc
PGP Fingerprint 79A2 9CA4 6CC4 45DD A904  C70E E654 3BB2 FA62 B9F1


On Thu, Dec 8, 2016 at 11:36 AM, Robert LeBlanc  wrote:
> I've been trying to capture a core to help in debuging a problem, but
> it seems that as I go up in kernel versions, I can only allocate less
> and less memory to crashkernel and with 4.9-rc8 I can't allocate
> enough to prevent OOM (specifically on CentOS with iSCSI root). I'm
> seeing the same reduction of allocatable memory on both Debian and
> CentOS on the same hardware
>
> As an example:
> Debian
> 3.16 - 1 GB is OK
> 4.9-rc7 - ~512MB is OK, but gets "kernel version is not supported".
>
> CentOS
> 3.10 - 1 GB is OK
> 4.8 - ~800MB is OK
> 4.9-rc8 - ~512MB is OK, but OOMs
>
> CentOS 4.9 kernel was built with default config, Debian 4.9 was built
> by coping the distro 3.16.0-4 config and then running make menuconfig.
>
> Trying to specify memory amount above this causes kdump to report that
> no memory has been reserved. The hardware has 512GB of RAM. Am I
> missing some config option that limiting how much crashkernel can
> reserve?
>
> I also verified on my test desktop with 8GB of RAM that setting
> crashkernel=1024M also fails to allocate memory with 4.9-rc8, but
> works fine with 128M for instance.
>
> Thanks,
>
> 
> Robert LeBlanc
> PGP Fingerprint 79A2 9CA4 6CC4 45DD A904  C70E E654 3BB2 FA62 B9F1


Re: [RFC, PATCHv1 17/28] x86/mm: define virtual memory map for 5-level paging

2016-12-08 Thread Kirill A. Shutemov
On Thu, Dec 08, 2016 at 10:56:04AM -0800, Randy Dunlap wrote:
> > @@ -23,6 +23,27 @@ a000 - ff5f (=1526 MB) module 
> > mapping space
> >  ff60 - ffdf (=8 MB) vsyscalls
> >  ffe0 -  (=2 MB) unused hole
> >  
> > +Virtual memory map with 5 level page tables:
> > +
> > + - 00ff (=56 bits) user space, different per mm
> > +hole caused by [57:63] sign extension
> 
> Can you briefly explain the sign extension?

Sure, I'll update it on respin.

> Should that be [56:63]?

You're right, it should. 

Thanks for all your corrections.

-- 
 Kirill A. Shutemov


[PATCH v18 06/15] clocksource/drivers/arm_arch_timer: Rework counter frequency detection.

2016-12-08 Thread fu . wei
From: Fu Wei 

Currently, the counter frequency detection call (arch_timer_detect_rate)
combines all the ways to get the counter frequency: device-tree property,
system coprocessor register, MMIO timer. But in most use cases,
we don't need to try all the ways:
For example, reading the device-tree property is needed only when the
system boots with device-tree; getting the frequency from the MMIO timer
register is needed only when we initialize the MMIO timer.

This patch separates paths to determine frequency:
Separate out device-tree code, keep them in device-tree init function.
Separate out the MMIO frequency and the sysreg frequency detection call,
and use the appropriate one for the counter.

Signed-off-by: Fu Wei 
Tested-by: Xiongfeng Wang 
---
 drivers/clocksource/arm_arch_timer.c | 49 +++-
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index e43be0a..e554081 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -487,27 +487,31 @@ static int arch_timer_starting_cpu(unsigned int cpu)
return 0;
 }
 
-static void
-arch_timer_detect_rate(void __iomem *cntbase, struct device_node *np)
+static void arch_timer_detect_rate(void)
 {
-   /* Who has more than one independent system counter? */
-   if (arch_timer_rate)
-   return;
+   /*
+* Try to get the timer frequency from
+* cntfrq_el0(system coprocessor register).
+*/
+   if (!arch_timer_rate)
+   arch_timer_rate = arch_timer_get_cntfrq();
+
+   /* Check the timer frequency. */
+   if (!arch_timer_rate)
+   pr_warn("frequency not available\n");
+}
 
+static void arch_timer_mem_detect_rate(void __iomem *cntbase)
+{
/*
-* Try to determine the frequency from the device tree or CNTFRQ,
-* if ACPI is enabled, get the frequency from CNTFRQ ONLY.
+* Try to determine the frequency from
+* CNTFRQ in memory-mapped timer.
 */
-   if (!acpi_disabled ||
-   of_property_read_u32(np, "clock-frequency", _timer_rate)) {
-   if (cntbase)
-   arch_timer_rate = readl_relaxed(cntbase + CNTFRQ);
-   else
-   arch_timer_rate = arch_timer_get_cntfrq();
-   }
+   if (!arch_timer_rate)
+   arch_timer_rate = readl_relaxed(cntbase + CNTFRQ);
 
/* Check the timer frequency. */
-   if (arch_timer_rate == 0)
+   if (!arch_timer_rate)
pr_warn("frequency not available\n");
 }
 
@@ -883,7 +887,9 @@ static int __init arch_timer_of_init(struct device_node *np)
for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++)
arch_timer_ppi[i] = irq_of_parse_and_map(np, i);
 
-   arch_timer_detect_rate(NULL, np);
+   if (!arch_timer_rate &&
+   of_property_read_u32(np, "clock-frequency", _timer_rate))
+   arch_timer_detect_rate();
 
arch_timer_c3stop = !of_property_read_bool(np, "always-on");
 
@@ -986,7 +992,14 @@ static int __init arch_timer_mem_init(struct device_node 
*np)
goto out;
}
 
-   arch_timer_detect_rate(base, np);
+   /*
+* Try to determine the frequency from the device tree,
+* if fail, get the frequency from CNTFRQ.
+*/
+   if (!arch_timer_rate &&
+   of_property_read_u32(np, "clock-frequency", _timer_rate))
+   arch_timer_mem_detect_rate(base);
+
ret = arch_timer_mem_register(base, irq);
if (ret)
goto out;
@@ -1048,7 +1061,7 @@ static int __init arch_timer_acpi_init(struct 
acpi_table_header *table)
gtdt->non_secure_el2_flags);
 
/* Get the frequency from CNTFRQ */
-   arch_timer_detect_rate(NULL, NULL);
+   arch_timer_detect_rate();
 
arch_timer_uses_ppi = arch_timer_select_ppi();
if (!arch_timer_ppi[arch_timer_uses_ppi]) {
-- 
2.9.3



[PATCH v18 08/15] clocksource/drivers/arm_arch_timer: move arch_timer_needs_of_probing into DT init call

2016-12-08 Thread fu . wei
From: Fu Wei 

arch_timer_needs_of_probing is only for booting with device-tree, but
arch_timer_common_init is a generic init call which shouldn't include
FW-specific code. It's better to put arch_timer_needs_of_probing into
the DT init function.

But for the per-cpu timer, arch_timer_common_init is called from
arch_timer_init. To reach the goal above, this patch disassembles
arch_timer_init and uses arch_timer_register and arch_timer_common_init
directly, just like the arch_timer_mem init code does.
This way, all the DT-relevant code is called only from the DT init call.

Signed-off-by: Fu Wei 
---
 drivers/clocksource/arm_arch_timer.c | 46 
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 2c9085a..8c813da 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -821,9 +821,6 @@ static bool __init arch_timer_needs_of_probing(void)
 
 static int __init arch_timer_common_init(void)
 {
-   if (acpi_disabled && arch_timer_needs_of_probing())
-   return 0;
-
arch_timer_banner(arch_timers_present);
arch_counter_register(arch_timers_present);
return arch_timer_arch_init();
@@ -861,26 +858,9 @@ static enum arch_timer_ppi_nr __init 
arch_timer_select_ppi(void)
return ARCH_TIMER_PHYS_SECURE_PPI;
 }
 
-static int __init arch_timer_init(void)
-{
-   int ret;
-
-   ret = arch_timer_register();
-   if (ret)
-   return ret;
-
-   ret = arch_timer_common_init();
-   if (ret)
-   return ret;
-
-   arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
-
-   return 0;
-}
-
 static int __init arch_timer_of_init(struct device_node *np)
 {
-   int i;
+   int i, ret;
 
if (arch_timers_present & ARCH_TIMER_TYPE_CP15) {
pr_warn("multiple nodes in dt, skipping\n");
@@ -891,6 +871,8 @@ static int __init arch_timer_of_init(struct device_node *np)
for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++)
arch_timer_ppi[i] = irq_of_parse_and_map(np, i);
 
+   arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
+
if (!arch_timer_rate &&
of_property_read_u32(np, "clock-frequency", _timer_rate))
arch_timer_detect_rate();
@@ -921,7 +903,14 @@ static int __init arch_timer_of_init(struct device_node 
*np)
return -EINVAL;
}
 
-   return arch_timer_init();
+   ret = arch_timer_register();
+   if (ret)
+   return ret;
+
+   if (arch_timer_needs_of_probing())
+   return 0;
+
+   return arch_timer_common_init();
 }
 CLOCKSOURCE_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", 
arch_timer_of_init);
 CLOCKSOURCE_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", 
arch_timer_of_init);
@@ -1008,7 +997,8 @@ static int __init arch_timer_mem_init(struct device_node 
*np)
if (ret)
goto out;
 
-   return arch_timer_common_init();
+   if (!arch_timer_needs_of_probing())
+   ret = arch_timer_common_init();
 out:
iounmap(cntctlbase);
of_node_put(best_frame);
@@ -1037,6 +1027,7 @@ static int __init map_generic_timer_interrupt(u32 
interrupt, u32 flags)
 /* Initialize per-processor generic timer */
 static int __init arch_timer_acpi_init(struct acpi_table_header *table)
 {
+   int ret;
struct acpi_table_gtdt *gtdt;
 
if (arch_timers_present & ARCH_TIMER_TYPE_CP15) {
@@ -1064,6 +1055,8 @@ static int __init arch_timer_acpi_init(struct 
acpi_table_header *table)
map_generic_timer_interrupt(gtdt->non_secure_el2_interrupt,
gtdt->non_secure_el2_flags);
 
+   arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
+
/* Get the frequency from CNTFRQ */
arch_timer_detect_rate();
 
@@ -1076,8 +1069,11 @@ static int __init arch_timer_acpi_init(struct 
acpi_table_header *table)
/* Always-on capability */
arch_timer_c3stop = !(gtdt->non_secure_el1_flags & ACPI_GTDT_ALWAYS_ON);
 
-   arch_timer_init();
-   return 0;
+   ret = arch_timer_register();
+   if (ret)
+   return ret;
+
+   return arch_timer_common_init();
 }
 CLOCKSOURCE_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init);
 #endif
-- 
2.9.3



[PATCH v18 07/15] clocksource/drivers/arm_arch_timer: Refactor arch_timer_needs_probing

2016-12-08 Thread fu . wei
From: Fu Wei 

When the system initializes with device-tree, we don't know which node
will be initialized first. The code in arch_timer_common_init should wait
until both the per-cpu timer and the MMIO timer are initialized, so we
need arch_timer_needs_probing to detect the init status of the system.

But currently the code is dispersed across arch_timer_needs_probing and
arch_timer_common_init, and the function name doesn't specify that
it's only for device-tree. This is somewhat confusing.

This patch moves all related code from arch_timer_common_init to
arch_timer_needs_probing, refactors it, and renames it to
arch_timer_needs_of_probing, making sure that it will be called
only if ACPI is disabled.

Signed-off-by: Fu Wei 
---
 drivers/clocksource/arm_arch_timer.c | 34 +++---
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index e554081..2c9085a 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -792,15 +792,28 @@ static const struct of_device_id 
arch_timer_mem_of_match[] __initconst = {
{},
 };
 
-static bool __init
-arch_timer_needs_probing(int type, const struct of_device_id *matches)
+static bool __init arch_timer_needs_of_probing(void)
 {
struct device_node *dn;
bool needs_probing = false;
+   unsigned int mask = ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM;
 
-   dn = of_find_matching_node(NULL, matches);
-   if (dn && of_device_is_available(dn) && !(arch_timers_present & type))
+   /* We have two timers, and both device-tree nodes are probed. */
+   if ((arch_timers_present & mask) == mask)
+   return false;
+
+   /*
+* Only one type of timer is probed,
+* check if we have another type of timer node in device-tree.
+*/
+   if (arch_timers_present & ARCH_TIMER_TYPE_CP15)
+   dn = of_find_matching_node(NULL, arch_timer_mem_of_match);
+   else
+   dn = of_find_matching_node(NULL, arch_timer_of_match);
+
+   if (dn && of_device_is_available(dn))
needs_probing = true;
+
of_node_put(dn);
 
return needs_probing;
@@ -808,17 +821,8 @@ arch_timer_needs_probing(int type, const struct 
of_device_id *matches)
 
 static int __init arch_timer_common_init(void)
 {
-   unsigned mask = ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM;
-
-   /* Wait until both nodes are probed if we have two timers */
-   if ((arch_timers_present & mask) != mask) {
-   if (arch_timer_needs_probing(ARCH_TIMER_TYPE_MEM,
-arch_timer_mem_of_match))
-   return 0;
-   if (arch_timer_needs_probing(ARCH_TIMER_TYPE_CP15,
-arch_timer_of_match))
-   return 0;
-   }
+   if (acpi_disabled && arch_timer_needs_of_probing())
+   return 0;
 
arch_timer_banner(arch_timers_present);
arch_counter_register(arch_timers_present);
-- 
2.9.3



[PATCH v18 10/15] clocksource/drivers/arm_arch_timer: Refactor the timer init code to prepare for GTDT

2016-12-08 Thread fu . wei
From: Fu Wei 

This patch refactors the original memory-mapped timer init code:
(1) Refactor "arch_timer_mem_init", making it common code for
memory-mapped timer init.
(2) Add a new function, "arch_timer_mem_of_init", for DT init.

Signed-off-by: Fu Wei 
---
 drivers/clocksource/arm_arch_timer.c | 130 ---
 1 file changed, 88 insertions(+), 42 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 8c813da..b02a406 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -915,17 +915,17 @@ static int __init arch_timer_of_init(struct device_node 
*np)
 CLOCKSOURCE_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", 
arch_timer_of_init);
 CLOCKSOURCE_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", 
arch_timer_of_init);
 
-static int __init arch_timer_mem_init(struct device_node *np)
+static int __init arch_timer_mem_init(struct arch_timer_mem *timer_mem)
 {
-   struct device_node *frame, *best_frame = NULL;
void __iomem *cntctlbase, *base;
-   unsigned int irq, ret = -EINVAL;
+   struct arch_timer_mem_frame *best_frame = NULL;
+   unsigned int irq;
u32 cnttidr;
+   int i, ret;
 
-   arch_timers_present |= ARCH_TIMER_TYPE_MEM;
-   cntctlbase = of_iomap(np, 0);
+   cntctlbase = ioremap(timer_mem->cntctlbase, timer_mem->size);
if (!cntctlbase) {
-   pr_err("Can't find CNTCTLBase\n");
+   pr_err("Can't map CNTCTLBase.\n");
return -ENXIO;
}
 
@@ -935,26 +935,18 @@ static int __init arch_timer_mem_init(struct device_node 
*np)
 * Try to find a virtual capable frame. Otherwise fall back to a
 * physical capable frame.
 */
-   for_each_available_child_of_node(np, frame) {
-   int n;
-   u32 cntacr;
-
-   if (of_property_read_u32(frame, "frame-number", )) {
-   pr_err("Missing frame-number\n");
-   of_node_put(frame);
-   goto out;
-   }
+   for (i = 0; i < timer_mem->num_frames; i++) {
+   u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
+CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
+   int n = timer_mem->frame[i].frame_nr;
 
/* Try enabling everything, and see what sticks */
-   cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
-CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
writel_relaxed(cntacr, cntctlbase + CNTACR(n));
cntacr = readl_relaxed(cntctlbase + CNTACR(n));
 
if ((cnttidr & CNTTIDR_VIRT(n)) &&
!(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) {
-   of_node_put(best_frame);
-   best_frame = frame;
+   best_frame = _mem->frame[i];
arch_timer_mem_use_virtual = true;
break;
}
@@ -962,50 +954,104 @@ static int __init arch_timer_mem_init(struct device_node 
*np)
if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT))
continue;
 
-   of_node_put(best_frame);
-   best_frame = of_node_get(frame);
+   best_frame = _mem->frame[i];
}
+   iounmap(cntctlbase);
 
-   ret= -ENXIO;
-   base = arch_counter_base = of_iomap(best_frame, 0);
-   if (!base) {
-   pr_err("Can't map frame's registers\n");
-   goto out;
+   if (!best_frame) {
+   pr_err("Can't find frame for register\n");
+   return -EINVAL;
}
 
if (arch_timer_mem_use_virtual)
-   irq = irq_of_parse_and_map(best_frame, ARCH_TIMER_VIRT_SPI);
+   irq = best_frame->virt_irq;
else
-   irq = irq_of_parse_and_map(best_frame, ARCH_TIMER_PHYS_SPI);
+   irq = best_frame->phys_irq;
 
-   ret = -EINVAL;
if (!irq) {
pr_err("Frame missing %s irq",
   arch_timer_mem_use_virtual ? "virt" : "phys");
-   goto out;
+   return -EINVAL;
}
 
-   /*
-* Try to determine the frequency from the device tree,
-* if fail, get the frequency from CNTFRQ.
-*/
-   if (!arch_timer_rate &&
-   of_property_read_u32(np, "clock-frequency", _timer_rate))
-   arch_timer_mem_detect_rate(base);
+   base = ioremap(best_frame->cntbase, best_frame->size);
+   if (!base) {
+   pr_err("Can't map frame's registers\n");
+   return -ENXIO;
+   }
+
+   arch_timer_mem_detect_rate(base);
 
ret = arch_timer_mem_register(base, irq);
-   if (ret)
+   if (ret) {
+   iounmap(base);
+   return ret;
+   }
+
+  

[PATCH v18 09/15] clocksource/drivers/arm_arch_timer: Introduce some new structs to prepare for GTDT

2016-12-08 Thread fu . wei
From: Fu Wei 

This patch introduces two new structs, arch_timer_mem and
arch_timer_mem_frame, and also introduces a new define:
ARCH_TIMER_MEM_MAX_FRAMES.

These will be used when refactoring the memory-mapped timer init code to
prepare for GTDT.

Signed-off-by: Fu Wei 
---
 include/clocksource/arm_arch_timer.h | 17 +
 1 file changed, 17 insertions(+)

diff --git a/include/clocksource/arm_arch_timer.h 
b/include/clocksource/arm_arch_timer.h
index 48376a5..3403247 100644
--- a/include/clocksource/arm_arch_timer.h
+++ b/include/clocksource/arm_arch_timer.h
@@ -57,6 +57,8 @@ enum arch_timer_spi_nr {
 #define ARCH_TIMER_MEM_PHYS_ACCESS 2
 #define ARCH_TIMER_MEM_VIRT_ACCESS 3
 
+#define ARCH_TIMER_MEM_MAX_FRAMES  8
+
 #define ARCH_TIMER_USR_PCT_ACCESS_EN   (1 << 0) /* physical counter */
 #define ARCH_TIMER_USR_VCT_ACCESS_EN   (1 << 1) /* virtual counter */
 #define ARCH_TIMER_VIRT_EVT_EN (1 << 2)
@@ -72,6 +74,21 @@ struct arch_timer_kvm_info {
int virtual_irq;
 };
 
+struct arch_timer_mem_frame {
+   int frame_nr;
+   phys_addr_t cntbase;
+   size_t size;
+   int phys_irq;
+   int virt_irq;
+};
+
+struct arch_timer_mem {
+   phys_addr_t cntctlbase;
+   size_t size;
+   int num_frames;
+   struct arch_timer_mem_frame frame[ARCH_TIMER_MEM_MAX_FRAMES];
+};
+
 #ifdef CONFIG_ARM_ARCH_TIMER
 
 extern u32 arch_timer_get_rate(void);
-- 
2.9.3



[PATCH v18 03/15] clocksource/drivers/arm_arch_timer: Improve printk relevant code

2016-12-08 Thread fu . wei
From: Fu Wei 

This patch defines pr_fmt(fmt) for all pr_* functions,
so the pr_* calls don't need to add "arch_timer:" every time.

According to the suggestions from checkpatch.pl:
(1) delete some blank spaces in arch_timer_banner;
(2) delete a redundant tab in a blank line of arch_timer_init(void)

No functional change.

Signed-off-by: Fu Wei 
Acked-by: Mark Rutland 
Tested-by: Xiongfeng Wang 
---
 drivers/clocksource/arm_arch_timer.c | 49 ++--
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 63bb532..15341cf 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -32,6 +32,9 @@
 
 #include 
 
+#undef pr_fmt
+#define pr_fmt(fmt) "arch_timer: " fmt
+
 #define CNTTIDR0x08
 #define CNTTIDR_VIRT(n)(BIT(1) << ((n) * 4))
 
@@ -504,22 +507,22 @@ arch_timer_detect_rate(void __iomem *cntbase, struct 
device_node *np)
 
/* Check the timer frequency. */
if (arch_timer_rate == 0)
-   pr_warn("Architected timer frequency not available\n");
+   pr_warn("frequency not available\n");
 }
 
 static void arch_timer_banner(unsigned type)
 {
-   pr_info("Architected %s%s%s timer(s) running at %lu.%02luMHz 
(%s%s%s).\n",
-type & ARCH_CP15_TIMER ? "cp15" : "",
-type == (ARCH_CP15_TIMER | ARCH_MEM_TIMER) ?  " and " : "",
-type & ARCH_MEM_TIMER ? "mmio" : "",
-(unsigned long)arch_timer_rate / 100,
-(unsigned long)(arch_timer_rate / 1) % 100,
-type & ARCH_CP15_TIMER ?
-(arch_timer_uses_ppi == VIRT_PPI) ? "virt" : "phys" :
+   pr_info("%s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n",
+   type & ARCH_CP15_TIMER ? "cp15" : "",
+   type == (ARCH_CP15_TIMER | ARCH_MEM_TIMER) ?  " and " : "",
+   type & ARCH_MEM_TIMER ? "mmio" : "",
+   (unsigned long)arch_timer_rate / 100,
+   (unsigned long)(arch_timer_rate / 1) % 100,
+   type & ARCH_CP15_TIMER ?
+   (arch_timer_uses_ppi == VIRT_PPI) ? "virt" : "phys" :
"",
-type == (ARCH_CP15_TIMER | ARCH_MEM_TIMER) ?  "/" : "",
-type & ARCH_MEM_TIMER ?
+   type == (ARCH_CP15_TIMER | ARCH_MEM_TIMER) ?  "/" : "",
+   type & ARCH_MEM_TIMER ?
arch_timer_mem_use_virtual ? "virt" : "phys" :
"");
 }
@@ -618,8 +621,7 @@ static void __init arch_counter_register(unsigned type)
 
 static void arch_timer_stop(struct clock_event_device *clk)
 {
-   pr_debug("arch_timer_teardown disable IRQ%d cpu #%d\n",
-clk->irq, smp_processor_id());
+   pr_debug("disable IRQ%d cpu #%d\n", clk->irq, smp_processor_id());
 
disable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi]);
if (arch_timer_has_nonsecure_ppi())
@@ -712,8 +714,7 @@ static int __init arch_timer_register(void)
}
 
if (err) {
-   pr_err("arch_timer: can't register interrupt %d (%d)\n",
-  ppi, err);
+   pr_err("can't register interrupt %d (%d)\n", ppi, err);
goto out_free;
}
 
@@ -766,7 +767,7 @@ static int __init arch_timer_mem_register(void __iomem 
*base, unsigned int irq)
 
ret = request_irq(irq, func, IRQF_TIMER, "arch_mem_timer", >evt);
if (ret) {
-   pr_err("arch_timer: Failed to request mem timer irq\n");
+   pr_err("Failed to request mem timer irq\n");
kfree(t);
}
 
@@ -844,7 +845,7 @@ static int __init arch_timer_init(void)
}
 
if (!has_ppi) {
-   pr_warn("arch_timer: No interrupt available, giving 
up\n");
+   pr_warn("No interrupt available, giving up\n");
return -EINVAL;
}
}
@@ -858,7 +859,7 @@ static int __init arch_timer_init(void)
return ret;
 
arch_timer_kvm_info.virtual_irq = arch_timer_ppi[VIRT_PPI];
-   
+
return 0;
 }
 
@@ -867,7 +868,7 @@ static int __init arch_timer_of_init(struct device_node *np)
int i;
 
if (arch_timers_present & ARCH_CP15_TIMER) {
-   pr_warn("arch_timer: multiple nodes in dt, skipping\n");
+   pr_warn("multiple nodes in dt, skipping\n");
return 0;
}
 
@@ -911,7 +912,7 @@ static int __init arch_timer_mem_init(struct device_node 
*np)
arch_timers_present |= ARCH_MEM_TIMER;
cntctlbase = of_iomap(np, 0);
if (!cntctlbase) {
-   pr_err("arch_timer: Can't find CNTCTLBase\n");
+   

[PATCH v18 05/15] clocksource/drivers/arm_arch_timer: rework PPI determination

2016-12-08 Thread fu . wei
From: Fu Wei 

Currently, the arch timer driver uses ARCH_TIMER_PHYS_SECURE_PPI to
mean the driver will use the secure PPI *and* potentially also use the
non-secure PPI. This is somewhat confusing.

For arm64, where it never makes sense to use the secure PPI, this
means we must always request the useless secure PPI, adding to the
confusion. For ACPI, where we may not even have a valid secure PPI
number, this is additionally problematic. We need the driver to be
able to use *only* the non-secure PPI.

The logic to choose which PPI to use is intertwined with other logic
in arch_timer_init(). This patch factors the PPI determination out
into a new function named arch_timer_select_ppi, and then reworks it
so that we can handle having only a non-secure PPI.

This patch also moves the arch_timer_ppi verification out to the caller,
because this way we can verify the configuration from the device tree
for ARM.

Meanwhile, because we will select ARCH_TIMER_PHYS_NONSECURE_PPI for ARM64,
the logic in arch_timer_register also needs to be updated.

Signed-off-by: Fu Wei 
Tested-by: Xiongfeng Wang 
---
 drivers/clocksource/arm_arch_timer.c | 77 +---
 1 file changed, 46 insertions(+), 31 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 231175b..e43be0a 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -699,7 +699,7 @@ static int __init arch_timer_register(void)
case ARCH_TIMER_PHYS_NONSECURE_PPI:
err = request_percpu_irq(ppi, arch_timer_handler_phys,
 "arch_timer", arch_timer_evt);
-   if (!err && arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]) {
+   if (!err && arch_timer_has_nonsecure_ppi()) {
ppi = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI];
err = request_percpu_irq(ppi, arch_timer_handler_phys,
 "arch_timer", arch_timer_evt);
@@ -821,39 +821,41 @@ static int __init arch_timer_common_init(void)
return arch_timer_arch_init();
 }
 
-static int __init arch_timer_init(void)
+/**
+ * arch_timer_select_ppi() - Select suitable PPI for the current system.
+ *
+ * If HYP mode is available, we know that the physical timer
+ * has been configured to be accessible from PL1. Use it, so
+ * that a guest can use the virtual timer instead.
+ *
+ * On ARMv8.1 with VH extensions, the kernel runs in HYP. VHE
+ * accesses to CNTP_*_EL1 registers are silently redirected to
+ * their CNTHP_*_EL2 counterparts, and use a different PPI
+ * number.
+ *
+ * If no interrupt provided for virtual timer, we'll have to
+ * stick to the physical timer. It'd better be accessible...
+ * For arm64 we never use the secure interrupt.
+ *
+ * Return: a suitable PPI type for the current system.
+ */
+static enum arch_timer_ppi_nr __init arch_timer_select_ppi(void)
 {
-   int ret;
-   /*
-* If HYP mode is available, we know that the physical timer
-* has been configured to be accessible from PL1. Use it, so
-* that a guest can use the virtual timer instead.
-*
-* If no interrupt provided for virtual timer, we'll have to
-* stick to the physical timer. It'd better be accessible...
-*
-* On ARMv8.1 with VH extensions, the kernel runs in HYP. VHE
-* accesses to CNTP_*_EL1 registers are silently redirected to
-* their CNTHP_*_EL2 counterparts, and use a different PPI
-* number.
-*/
-   if (is_hyp_mode_available() || !arch_timer_ppi[ARCH_TIMER_VIRT_PPI]) {
-   bool has_ppi;
+   if (is_hyp_mode_available() && is_kernel_in_hyp_mode())
+   return ARCH_TIMER_HYP_PPI;
 
-   if (is_kernel_in_hyp_mode()) {
-   arch_timer_uses_ppi = ARCH_TIMER_HYP_PPI;
-   has_ppi = !!arch_timer_ppi[ARCH_TIMER_HYP_PPI];
-   } else {
-   arch_timer_uses_ppi = ARCH_TIMER_PHYS_SECURE_PPI;
-   has_ppi = (!!arch_timer_ppi[ARCH_TIMER_PHYS_SECURE_PPI] 
||
-  
!!arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]);
-   }
+   if (arch_timer_ppi[ARCH_TIMER_VIRT_PPI])
+   return ARCH_TIMER_VIRT_PPI;
 
-   if (!has_ppi) {
-   pr_warn("No interrupt available, giving up\n");
-   return -EINVAL;
-   }
-   }
+   if (IS_ENABLED(CONFIG_ARM64))
+   return ARCH_TIMER_PHYS_NONSECURE_PPI;
+
+   return ARCH_TIMER_PHYS_SECURE_PPI;
+}
+
+static int __init arch_timer_init(void)
+{
+   int ret;
 
ret = arch_timer_register();
if (ret)
@@ -901,6 +903,13 @@ static int __init arch_timer_of_init(struct device_node 
*np)
if 

[PATCH v18 02/15] clocksource/drivers/arm_arch_timer: Add a new enum for spi type

2016-12-08 Thread fu . wei
From: Fu Wei 

This patch adds a new enum, "arch_timer_spi_nr", and uses it in the driver.
Just for code readability; no functional change.

Signed-off-by: Fu Wei 
Acked-by: Mark Rutland 
---
 drivers/clocksource/arm_arch_timer.c | 4 ++--
 include/clocksource/arm_arch_timer.h | 6 ++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 21068be..63bb532 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -960,9 +960,9 @@ static int __init arch_timer_mem_init(struct device_node 
*np)
}
 
if (arch_timer_mem_use_virtual)
-   irq = irq_of_parse_and_map(best_frame, 1);
+   irq = irq_of_parse_and_map(best_frame, ARCH_TIMER_VIRT_SPI);
else
-   irq = irq_of_parse_and_map(best_frame, 0);
+   irq = irq_of_parse_and_map(best_frame, ARCH_TIMER_PHYS_SPI);
 
ret = -EINVAL;
if (!irq) {
diff --git a/include/clocksource/arm_arch_timer.h 
b/include/clocksource/arm_arch_timer.h
index 557f869..d23c381 100644
--- a/include/clocksource/arm_arch_timer.h
+++ b/include/clocksource/arm_arch_timer.h
@@ -46,6 +46,12 @@ enum arch_timer_ppi_nr {
MAX_TIMER_PPI
 };
 
+enum arch_timer_spi_nr {
+   ARCH_TIMER_PHYS_SPI,
+   ARCH_TIMER_VIRT_SPI,
+   ARCH_TIMER_MAX_TIMER_SPI
+};
+
 #define ARCH_TIMER_PHYS_ACCESS 0
 #define ARCH_TIMER_VIRT_ACCESS 1
 #define ARCH_TIMER_MEM_PHYS_ACCESS 2
-- 
2.9.3



Re: net: deadlock on genl_mutex

2016-12-08 Thread Dmitry Vyukov
On Thu, Dec 8, 2016 at 6:16 PM, Dmitry Vyukov  wrote:
> On Thu, Dec 8, 2016 at 5:16 PM, Dmitry Vyukov  wrote:
>> On Tue, Nov 29, 2016 at 6:59 AM,   wrote:

 Issue was reported yesterday and is under investigation.


 http://marc.info/?l=linux-netdev=148014004331663=2


 Thanks !
>>>
>>>
>>> Hi Dmitry
>>>
>>> Can you try the patch below with your reproducer? I haven't seen similar
>>> crashes reported after this (or even with Eric's patch).
>>
>> I've synced to 318c8932ddec5c1c26a4af0f3c053784841c598e (Dec 7) and do
>> _not_ see this report happening anymore.
>> Thanks.
>
>
> But now I am seeing "possible deadlock" warnings involving genl_lock:
>
> [ INFO: possible circular locking dependency detected ]
> 4.9.0-rc8+ #77 Not tainted
> ---
> syz-executor7/18794 is trying to acquire lock:
>  (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x1c/0x20
> net/core/rtnetlink.c:70
> but task is already holding lock:
>  (genl_mutex){+.+.+.}, at: [< inline >] genl_lock
> net/netlink/genetlink.c:31
>  (genl_mutex){+.+.+.}, at: []
> genl_rcv_msg+0x209/0x260 net/netlink/genetlink.c:658
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
>[  315.403815] [< inline >] validate_chain
> kernel/locking/lockdep.c:2265
>[  315.403815] []
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>[  315.403815] [] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
>[  315.403815] [< inline >] __mutex_lock_common
> kernel/locking/mutex.c:521
>[  315.403815] []
> mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
>[  315.403815] [< inline >] genl_lock 
> net/netlink/genetlink.c:31
>[  315.403815] [] genl_lock_dumpit+0x46/0xa0
> net/netlink/genetlink.c:518
>[  315.403815] [] netlink_dump+0x57c/0xd70
> net/netlink/af_netlink.c:2127
>[  315.403815] []
> __netlink_dump_start+0x4ea/0x760 net/netlink/af_netlink.c:2217
>[  315.403815] []
> genl_family_rcv_msg+0xdc9/0x1070 net/netlink/genetlink.c:586
>[  315.403815] [] genl_rcv_msg+0x1b0/0x260
> net/netlink/genetlink.c:660
>[  315.403815] [] netlink_rcv_skb+0x2bc/0x3a0
> net/netlink/af_netlink.c:2298
>[  315.403815] [] genl_rcv+0x2d/0x40
> net/netlink/genetlink.c:671
>[  315.403815] [< inline >] netlink_unicast_kernel
> net/netlink/af_netlink.c:1231
>[  315.403815] [] netlink_unicast+0x51a/0x740
> net/netlink/af_netlink.c:1257
>[  315.403815] [] netlink_sendmsg+0xaa4/0xe50
> net/netlink/af_netlink.c:1803
>[  315.403815] [< inline >] sock_sendmsg_nosec net/socket.c:621
>[  315.403815] [] sock_sendmsg+0xcf/0x110
> net/socket.c:631
>[  315.403815] [] sock_write_iter+0x32b/0x620
> net/socket.c:829
>[  315.403815] [< inline >] new_sync_write fs/read_write.c:499
>[  315.403815] [] __vfs_write+0x4fe/0x830
> fs/read_write.c:512
>[  315.403815] [] vfs_write+0x175/0x4e0
> fs/read_write.c:560
>[  315.403815] [< inline >] SYSC_write fs/read_write.c:607
>[  315.403815] [] SyS_write+0x100/0x240
> fs/read_write.c:599
>[  315.403815] [] entry_SYSCALL_64_fastpath+0x23/0xc6
>
>[  315.403815] [< inline >] validate_chain
> kernel/locking/lockdep.c:2265
>[  315.403815] []
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>[  315.403815] [] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
>[  315.403815] [< inline >] __mutex_lock_common
> kernel/locking/mutex.c:521
>[  315.403815] []
> mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
>[  315.403815] []
> __netlink_dump_start+0xf9/0x760 net/netlink/af_netlink.c:2187
>[  315.403815] [< inline >] netlink_dump_start
> include/linux/netlink.h:165
>[  315.403815] []
> ctnetlink_stat_ct_cpu+0x198/0x1e0
> net/netfilter/nf_conntrack_netlink.c:2045
>[  315.403815] []
> nfnetlink_rcv_msg+0x9be/0xd60 net/netfilter/nfnetlink.c:212
>[  315.403815] [] netlink_rcv_skb+0x2bc/0x3a0
> net/netlink/af_netlink.c:2298
>[  315.403815] [] nfnetlink_rcv+0x7e1/0x10d0
> net/netfilter/nfnetlink.c:474
>[  315.403815] [< inline >] netlink_unicast_kernel
> net/netlink/af_netlink.c:1231
>[  315.403815] [] netlink_unicast+0x51a/0x740
> net/netlink/af_netlink.c:1257
>[  315.403815] [] netlink_sendmsg+0xaa4/0xe50
> net/netlink/af_netlink.c:1803
>[  315.403815] [< inline >] sock_sendmsg_nosec net/socket.c:621
>[  315.403815] [] sock_sendmsg+0xcf/0x110
> net/socket.c:631
>[  315.403815] [] sock_write_iter+0x32b/0x620
> net/socket.c:829
>[  315.403815] [< inline >] new_sync_write fs/read_write.c:499
>[  

crashkernel only allowing smaller allocations as kernel version increases

2016-12-08 Thread Robert LeBlanc
I've been trying to capture a core to help in debuging a problem, but
it seems that as I go up in kernel versions, I can only allocate less
and less memory to crashkernel and with 4.9-rc8 I can't allocate
enough to prevent OOM (specifically on CentOS with iSCSI root). I'm
seeing the same reduction of allocatable memory on both Debian and
CentOS on the same hardware

As an example:
Debian
3.16 - 1 GB is OK
4.9-rc7 - ~512MB is OK, but gets "kernel version is not supported".

CentOS
3.10 - 1 GB is OK
4.8 - ~800MB is OK
4.9-rc8 - ~512MB is OK, but OOMs

The CentOS 4.9 kernel was built with the default config; the Debian 4.9 one
was built by copying the distro 3.16.0-4 config and then running make menuconfig.

Trying to specify a memory amount above this causes kdump to report that
no memory has been reserved. The hardware has 512GB of RAM. Am I
missing some config option that limits how much memory crashkernel can
reserve?

I also verified on my test desktop with 8GB of RAM that setting
crashkernel=1024M also fails to allocate memory with 4.9-rc8, but
works fine with 128M for instance.

Thanks,


Robert LeBlanc
PGP Fingerprint 79A2 9CA4 6CC4 45DD A904  C70E E654 3BB2 FA62 B9F1


Re: [PATCH] xen/pci: Bubble up error and fix description.

2016-12-08 Thread Konrad Rzeszutek Wilk
On Thu, Dec 08, 2016 at 07:58:29AM +0100, Juergen Gross wrote:
> On 06/12/16 15:28, Konrad Rzeszutek Wilk wrote:
> > The function is never called under PV guests, and only shows up
> > when MSI (or MSI-X) cannot be allocated. Convert the message
> > to include the error value.
> > 
> > Signed-off-by: Konrad Rzeszutek Wilk 
> 
> Commited to xen/tip.git for-linus-4.10

Thank you!
> 
> 
> Juergen
> 


Re: [PATCH 3/6] net: ethernet: ti: cpts: add support of cpts HW_TS_PUSH

2016-12-08 Thread Grygorii Strashko


On 12/03/2016 05:21 PM, Richard Cochran wrote:
> On Mon, Nov 28, 2016 at 05:04:25PM -0600, Grygorii Strashko wrote:
>> This also change overflow polling period when HW_TS_PUSH feature is
>> enabled - overflow check work will be scheduled more often (every
>> 200ms) for proper HW_TS_PUSH events reporting.
> 
> For proper reporting, you should make use of the interrupt.  The small
> fifo (16 iirc) could very well overflow in 200 ms.  The interrupt
> handler should read out the entire fifo at each interrupt.
> 

huh. It seems this is not really a good idea, because the MISC IRQ will be
triggered for *any* CPTS event and there is no way to enable it just for
HW_TS_PUSH. So this doesn't work well with the current code for RX/TX
timestamping (which uses polling mode), plus there is runtime overhead in
net RX/TX caused by triggering more interrupts.

Maybe the overflow check/polling timeout can be made configurable (module
parameter).

-- 
regards,
-grygorii


Re: [RFC, PATCHv1 16/28] x86/asm: remove __VIRTUAL_MASK_SHIFT==47 assert

2016-12-08 Thread Kirill A. Shutemov
On Thu, Dec 08, 2016 at 10:39:57AM -0800, Andy Lutomirski wrote:
> On Thu, Dec 8, 2016 at 8:21 AM, Kirill A. Shutemov
>  wrote:
> > We don't need it anymore. 17be0aec74fb ("x86/asm/entry/64: Implement
> > better check for canonical addresses") made canonical address check
> > generic wrt. address width.
> 
> This code existed in part to remind us that this needs very careful
> adjustment when the paging size becomes dynamic.  If you want to
> remove it, please add test cases to tools/testing/selftests/x86 that
> verify:
> 
> a. Either mmap(2^47-4096, ..., MAP_FIXED, ...) fails or that, if it
> succeeds and you put a syscall instruction at the very end, that
> invoking the syscall instruction there works.  The easiest way to do
> this may be to have the selftest literally have a page of text that
> has 4094 0xcc bytes and a syscall and to map that page or perhaps move
> it into place with mremap.  That will avoid annoying W^X userspace
> stuff from messing up the test.  You'll need to handle the signal when
> you fall off the end of the world after the syscall.
> 
> b. Ditto for the new highest possible userspace page.
> 
> c. Ditto for one page earlier to make sure that your test actually works.
> 
> d. For each possible maximum address, call raise(SIGUSR1) and, in the
> signal handler, change RIP to point to the first noncanonical address
> and RCX to match RIP.  Return and catch the resulting exception.  This
> may be easy to integrate into the sigreturn tests, and I can help with
> that.

Thanks, for hints.

I'll come back to you with testcases to verify that they are you wanted
to see.

-- 
 Kirill A. Shutemov


Re: [PATCH 0/3] Fix improper handling of pcie hotplug events.

2016-12-08 Thread Bjorn Helgaas
On Sat, Nov 19, 2016 at 12:32:44AM -0800, Ashok Raj wrote:
> This patch series fixes pciehp for certain special conditions observed during
> testing.
> 
> Ashok Raj (3):
>   pciehp: Prioritize data-link event over presence detect
>   pciehp: Fix led status when enabling already enabled slot.

I applied the above on pci/hotplug for v4.10, without the stable
annotation.  If we get reports that would justify stable backports, we
can always request that later.  The ideal thing would be to get those
reports soon enough that we can reference them in the changelog and
add the stable annotations back before these get merged for v4.10.

>   pciehp: Fix race condition handling surprise link-down

I'm holding off on this one, pending resolution of the locking
question.

>  drivers/pci/hotplug/pciehp_ctrl.c |  6 +++---
>  drivers/pci/hotplug/pciehp_hpc.c  | 21 -
>  2 files changed, 15 insertions(+), 12 deletions(-)
> 
> -- 
> 2.7.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 09/15] livepatch: remove unnecessary object loaded check

2016-12-08 Thread Josh Poimboeuf
klp_patch_object()'s callers already ensure that the object is loaded,
so its call to klp_is_object_loaded() is unnecessary.

This will also make it possible to move the patching code into a
separate file.

Signed-off-by: Josh Poimboeuf 
---
 kernel/livepatch/core.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 2dbd355..47ed643 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -467,9 +467,6 @@ static int klp_patch_object(struct klp_object *obj)
if (WARN_ON(obj->patched))
return -EINVAL;
 
-   if (WARN_ON(!klp_is_object_loaded(obj)))
-   return -EINVAL;
-
klp_for_each_func(obj, func) {
ret = klp_patch_func(func);
if (ret) {
-- 
2.7.4



[PATCH v3 13/15] livepatch: change to a per-task consistency model

2016-12-08 Thread Josh Poimboeuf
Change livepatch to use a basic per-task consistency model.  This is the
foundation which will eventually enable us to patch those ~10% of
security patches which change function or data semantics.  This is the
biggest remaining piece needed to make livepatch more generally useful.

This code stems from the design proposal made by Vojtech [1] in November
2014.  It's a hybrid of kGraft and kpatch: it uses kGraft's per-task
consistency and syscall barrier switching combined with kpatch's stack
trace switching.  There are also a number of fallback options which make
it quite flexible.

Patches are applied on a per-task basis, when the task is deemed safe to
switch over.  When a patch is enabled, livepatch enters into a
transition state where tasks are converging to the patched state.
Usually this transition state can complete in a few seconds.  The same
sequence occurs when a patch is disabled, except the tasks converge from
the patched state to the unpatched state.

An interrupt handler inherits the patched state of the task it
interrupts.  The same is true for forked tasks: the child inherits the
patched state of the parent.

Livepatch uses several complementary approaches to determine when it's
safe to patch tasks:

1. The first and most effective approach is stack checking of sleeping
   tasks.  If no affected functions are on the stack of a given task,
   the task is patched.  In most cases this will patch most or all of
   the tasks on the first try.  Otherwise it'll keep trying
   periodically.  This option is only available if the architecture has
   reliable stacks (HAVE_RELIABLE_STACKTRACE).

2. The second approach, if needed, is kernel exit switching.  A
   task is switched when it returns to user space from a system call, a
   user space IRQ, or a signal.  It's useful in the following cases:

   a) Patching I/O-bound user tasks which are sleeping on an affected
  function.  In this case you have to send SIGSTOP and SIGCONT to
  force it to exit the kernel and be patched.
   b) Patching CPU-bound user tasks.  If the task is highly CPU-bound
  then it will get patched the next time it gets interrupted by an
  IRQ.
   c) In the future it could be useful for applying patches for
  architectures which don't yet have HAVE_RELIABLE_STACKTRACE.  In
  this case you would have to signal most of the tasks on the
  system.  However this isn't supported yet because there's
  currently no way to patch kthreads without
  HAVE_RELIABLE_STACKTRACE.

3. For idle "swapper" tasks, since they don't ever exit the kernel, they
   instead have a klp_update_patch_state() call in the idle loop which
   allows them to be patched before the CPU enters the idle state.

   (Note there's not yet such an approach for kthreads.)

All the above approaches may be skipped by setting the 'immediate' flag
in the 'klp_patch' struct, which will disable per-task consistency and
patch all tasks immediately.  This can be useful if the patch doesn't
change any function or data semantics.  Note that, even with this flag
set, it's possible that some tasks may still be running with an old
version of the function, until that function returns.

There's also an 'immediate' flag in the 'klp_func' struct which allows
you to specify that certain functions in the patch can be applied
without per-task consistency.  This might be useful if you want to patch
a common function like schedule(), and the function change doesn't need
consistency but the rest of the patch does.

For architectures which don't have HAVE_RELIABLE_STACKTRACE, the user
must set patch->immediate which causes all tasks to be patched
immediately.  This option should be used with care, only when the patch
doesn't change any function or data semantics.

In the future, architectures which don't have HAVE_RELIABLE_STACKTRACE
may be allowed to use per-task consistency if we can come up with
another way to patch kthreads.

The /sys/kernel/livepatch//transition file shows whether a patch
is in transition.  Only a single patch (the topmost patch on the stack)
can be in transition at a given time.  A patch can remain in transition
indefinitely, if any of the tasks are stuck in the initial patch state.

A transition can be reversed and effectively canceled by writing the
opposite value to the /sys/kernel/livepatch//enabled file while
the transition is in progress.  Then all the tasks will attempt to
converge back to the original patch state.

[1] https://lkml.kernel.org/r/20141107140458.ga21...@suse.cz

Signed-off-by: Josh Poimboeuf 
---
 Documentation/ABI/testing/sysfs-kernel-livepatch |   8 +
 Documentation/livepatch/livepatch.txt| 127 +-
 include/linux/init_task.h|   9 +
 include/linux/livepatch.h|  40 +-
 include/linux/sched.h|   3 +
 kernel/fork.c|   3 +
 kernel/livepatch/Makefile|   

[PATCH v3 10/15] livepatch: move patching functions into patch.c

2016-12-08 Thread Josh Poimboeuf
Move functions related to the actual patching of functions and objects
into a new patch.c file.

Signed-off-by: Josh Poimboeuf 
---
 kernel/livepatch/Makefile |   2 +-
 kernel/livepatch/core.c   | 202 +--
 kernel/livepatch/patch.c  | 213 ++
 kernel/livepatch/patch.h  |  32 +++
 4 files changed, 247 insertions(+), 202 deletions(-)
 create mode 100644 kernel/livepatch/patch.c
 create mode 100644 kernel/livepatch/patch.h

diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index e8780c0..e136dad 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_LIVEPATCH) += livepatch.o
 
-livepatch-objs := core.o
+livepatch-objs := core.o patch.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 47ed643..6a137e1 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -24,32 +24,13 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
-
-/**
- * struct klp_ops - structure for tracking registered ftrace ops structs
- *
- * A single ftrace_ops is shared between all enabled replacement functions
- * (klp_func structs) which have the same old_addr.  This allows the switch
- * between function versions to happen instantaneously by updating the klp_ops
- * struct's func_stack list.  The winner is the klp_func at the top of the
- * func_stack (front of the list).
- *
- * @node:  node for the global klp_ops list
- * @func_stack:list head for the stack of klp_func's (active func is 
on top)
- * @fops:  registered ftrace ops struct
- */
-struct klp_ops {
-   struct list_head node;
-   struct list_head func_stack;
-   struct ftrace_ops fops;
-};
+#include "patch.h"
 
 /*
  * The klp_mutex protects the global lists and state transitions of any
@@ -60,28 +41,12 @@ struct klp_ops {
 static DEFINE_MUTEX(klp_mutex);
 
 static LIST_HEAD(klp_patches);
-static LIST_HEAD(klp_ops);
 
 static struct kobject *klp_root_kobj;
 
 /* TODO: temporary stub */
 void klp_update_patch_state(struct task_struct *task) {}
 
-static struct klp_ops *klp_find_ops(unsigned long old_addr)
-{
-   struct klp_ops *ops;
-   struct klp_func *func;
-
-   list_for_each_entry(ops, _ops, node) {
-   func = list_first_entry(>func_stack, struct klp_func,
-   stack_node);
-   if (func->old_addr == old_addr)
-   return ops;
-   }
-
-   return NULL;
-}
-
 static bool klp_is_module(struct klp_object *obj)
 {
return obj->name;
@@ -314,171 +279,6 @@ static int klp_write_object_relocations(struct module 
*pmod,
return ret;
 }
 
-static void notrace klp_ftrace_handler(unsigned long ip,
-  unsigned long parent_ip,
-  struct ftrace_ops *fops,
-  struct pt_regs *regs)
-{
-   struct klp_ops *ops;
-   struct klp_func *func;
-
-   ops = container_of(fops, struct klp_ops, fops);
-
-   rcu_read_lock();
-   func = list_first_or_null_rcu(>func_stack, struct klp_func,
- stack_node);
-   if (WARN_ON_ONCE(!func))
-   goto unlock;
-
-   klp_arch_set_pc(regs, (unsigned long)func->new_func);
-unlock:
-   rcu_read_unlock();
-}
-
-/*
- * Convert a function address into the appropriate ftrace location.
- *
- * Usually this is just the address of the function, but on some architectures
- * it's more complicated so allow them to provide a custom behaviour.
- */
-#ifndef klp_get_ftrace_location
-static unsigned long klp_get_ftrace_location(unsigned long faddr)
-{
-   return faddr;
-}
-#endif
-
-static void klp_unpatch_func(struct klp_func *func)
-{
-   struct klp_ops *ops;
-
-   if (WARN_ON(!func->patched))
-   return;
-   if (WARN_ON(!func->old_addr))
-   return;
-
-   ops = klp_find_ops(func->old_addr);
-   if (WARN_ON(!ops))
-   return;
-
-   if (list_is_singular(>func_stack)) {
-   unsigned long ftrace_loc;
-
-   ftrace_loc = klp_get_ftrace_location(func->old_addr);
-   if (WARN_ON(!ftrace_loc))
-   return;
-
-   WARN_ON(unregister_ftrace_function(>fops));
-   WARN_ON(ftrace_set_filter_ip(>fops, ftrace_loc, 1, 0));
-
-   list_del_rcu(>stack_node);
-   list_del(>node);
-   kfree(ops);
-   } else {
-   list_del_rcu(>stack_node);
-   }
-
-   func->patched = false;
-}
-
-static int klp_patch_func(struct klp_func *func)
-{
-   struct klp_ops *ops;
-   int ret;
-
-   if (WARN_ON(!func->old_addr))
-   return -EINVAL;
-
-   if (WARN_ON(func->patched))
-   return -EINVAL;
-
-   

Re: [RFC, PATCHv1 00/28] 5-level paging

2016-12-08 Thread Linus Torvalds
On Thu, Dec 8, 2016 at 8:21 AM, Kirill A. Shutemov
 wrote:
>
> This patchset is still very early. There are a number of things missing
> that we have to do before asking anyone to merge it (listed below).
> It would be great if folks can start testing applications now (in QEMU) to
> look for breakage.
> Any early comments on the design or the patches would be appreciated as
> well.

Looks ok to me. Starting off with a compile-time config option seems fine.

I do think that the x86 cpuid part should (patch 15) should be the
first patch, so that we see "la57" as a capability in /proc/cpuinfo
whether it's being enabled or not? We should merge that part
regardless of any mm patches, I think.

   Linus


[PATCH v3 02/15] x86/entry: define _TIF_ALLWORK_MASK flags explicitly

2016-12-08 Thread Josh Poimboeuf
The _TIF_ALLWORK_MASK macro automatically includes the least-significant
16 bits of the thread_info flags, which is less than obvious and tends
to create confusion and surprises when reading or modifying the code.

Define the flags explicitly.

Signed-off-by: Josh Poimboeuf 
---
 arch/x86/include/asm/thread_info.h | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h
index ad6f5eb0..1fe6043 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -73,9 +73,6 @@ struct thread_info {
  * thread information flags
  * - these are process state flags that various assembly files
  *   may need to access
- * - pending work-to-be-done flags are in LSW
- * - other flags in MSW
- * Warning: layout of LSW is hardcoded in entry.S
  */
 #define TIF_SYSCALL_TRACE  0   /* syscall trace active */
 #define TIF_NOTIFY_RESUME  1   /* callback before returning to user */
@@ -133,8 +130,10 @@ struct thread_info {
 
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK  \
-   ((0x & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT |   \
-   _TIF_NOHZ)
+   (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING |\
+_TIF_SINGLESTEP | _TIF_NEED_RESCHED | _TIF_SYSCALL_EMU |   \
+_TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE |   \
+_TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW
\
-- 
2.7.4



[PATCH] coredump: Ensure proper size of sparse core files

2016-12-08 Thread Dave Kleikamp
If the last section of a core file ends with an unmapped or zero page,
the size of the file does not correspond with the last dump_skip() call.
gdb complains that the file is truncated and can be confusing to users.

After all of the vma sections are written, make sure that the file size
is no smaller than the current file position.

This problem can be demonstrated with gdb's bigcore testcase on the
sparc architecture.

Signed-off-by: Dave Kleikamp 
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 fs/binfmt_elf.c  |  1 +
 fs/coredump.c| 18 ++
 include/linux/coredump.h |  1 +
 3 files changed, 20 insertions(+)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 2472af2..cfd724f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2296,6 +2296,7 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
}
}
+   dump_truncate(cprm);
 
if (!elf_core_write_extra_data(cprm))
goto end_coredump;
diff --git a/fs/coredump.c b/fs/coredump.c
index eb9c92c..ecdb1e3 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -833,3 +833,21 @@ int dump_align(struct coredump_params *cprm, int align)
return mod ? dump_skip(cprm, align - mod) : 1;
 }
 EXPORT_SYMBOL(dump_align);
+
+/*
+ * Insures that file size is big enough to contain the current file
+ * postion. This prevents gdb from complaining about a truncated file
+ * if the last "write" to the file was dump_skip.
+ */
+void dump_truncate(struct coredump_params *cprm)
+{
+   struct file *file = cprm->file;
+   loff_t offset;
+
+   if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+   offset = file->f_op->llseek(file, 0, SEEK_CUR);
+   if (i_size_read(file->f_mapping->host) < offset)
+   do_truncate(file->f_path.dentry, offset, 0, file);
+   }
+}
+EXPORT_SYMBOL(dump_truncate);
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index d016a12..28ffa94 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -14,6 +14,7 @@ struct coredump_params;
 extern int dump_skip(struct coredump_params *cprm, size_t nr);
 extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
 extern int dump_align(struct coredump_params *cprm, int align);
+extern void dump_truncate(struct coredump_params *cprm);
 #ifdef CONFIG_COREDUMP
 extern void do_coredump(const siginfo_t *siginfo);
 #else
-- 
2.10.2



Re: [GIT PULL 1/3] ARM: exynos: Soc/mach for v4.10

2016-12-08 Thread Olof Johansson
On Thu, Dec 8, 2016 at 7:28 AM, Pankaj Dubey  wrote:
> On 3 December 2016 at 22:33, Krzysztof Kozlowski  wrote:
>> On Fri, Dec 02, 2016 at 10:52:57PM +0100, Arnd Bergmann wrote:
>>> On Thursday, December 1, 2016 8:34:04 PM CET Krzysztof Kozlowski wrote:
>>> > On Thu, Nov 24, 2016 at 08:08:27AM +0200, Krzysztof Kozlowski wrote:
>>> > > Hi,
>>> > >
>>> > > This contains previous dts branch because SCU node in dts is needed
>>> > > prior to removing it from mach code.
>>> > >
>>> > > Below you will find full pull request and one stripped from dependency.
>>> > >
>>> >
>>> > Hi Arnd, Kevin and Olof,
>>> >
>>> > What about this pull from the set?
>>> >
>>>
>>> Sorry, I initially deferred it and then didn't get back to it.
>>>
>>> The dependency on the .dts changes made me a bit nervous about
>>> taking it, mostly because the changelog fails to explain the
>>> exact dependencies.
>>>
>>> This breaks compatibility with existing .dtb files, right?
>>
>> No, strictly speaking not. There was no dt-bindings change here, no DT
>> properties for SCU before. We are converting our drivers to DTB so this
>> is the same as before when switching for pinctrl, clocks or all other
>> drivers to DT.
>>
>> We are not breaking DTB ABI because there was no ABI around it before.
>> Otherwise, one would say that lack of SCU DT node was an ABI. That is
>> wrong, because DT should describe the hardware and SCU is in hardware.
>>
>>> What I'd like to see here is an explanation about:
>>>
>>> - what the upside of breaking compatibility is
>>
>> DTBs which do not have SCU are not proper because they skip that part of
>> hardware. However we are breaking them in the way the SMP won't work
>> there. It is not an ABI break, as I mentioned above.
>>
>>> - what exactly stops working with an old dtb
>>> - why we don't keep a fallback for handling old dtb files
>>
>> What is the point for it? This is not an ABI break. Even if it was,
>> Samsung guys don't care for ABI breaks at all (and in fact we wanted to
>> mark the platform experimental...).
>>
>>> It would also be helpful to have a separate pull request for
>>> the commits require the new dtb, and the stuff that is unrelated.
>>
>> I can do that but the pull will be small.
>>
>
> Arnd,
>
> Any update on this? Intention of this change is to improve
> Exynos SoC's DT support in mainline kernel. This will help in removing static
> mapping from exynos machine files and simplify mach-exynos code-base.

Adding the SCU nodes now makes sense. So does using them if they're available.

Given the prevalence of exynos systems with DTS already out there, it
would make sense to give an overlap of making the kernel work without
the SCU in DT for a period of time.

This isn't like the old days of when we were mass-converting things
and breakage was expected. We're well into a steady state here, so
being nicer to downstream users is likely the right thing to do here.


-Olof


Re: [PATCH] x86/vm86: fix compilation warning on a unused variable

2016-12-08 Thread Jérémy Lefaure
On Thu, 8 Dec 2016 13:50:11 +0300
"Kirill A. Shutemov"  wrote:

> On Wed, Dec 07, 2016 at 11:38:33PM -0500, Jérémy Lefaure wrote:
> > When CONFIG_TRANSPARENT_HUGEPAGE is disabled, split_huge_pmd is a no-op
> > stub. In such case, vma is unused and a compiler raises a warning:
> > 
> > arch/x86/kernel/vm86_32.c: In function ‘mark_screen_rdonly’:
> > arch/x86/kernel/vm86_32.c:180:26: warning: unused variable ‘vma’
> > [-Wunused-variable]
> >struct vm_area_struct *vma = find_vma(mm, 0xA);
> >  ^~~
> > Adding __maybe_unused in the vma declaration fixes this warning.  
> 
> Hm. pmd_trans_huge() is zero if CONFIG_TRANSPARENT_HUGEPAGE is not set.
> Compiler should get rid of whole block of code under the 'if'.
> 
> Could you share your kernel config which triggers the warning?
> And what compiler do you use?
> 

After a `make allnoconfig`, I enable "Legacy VM86 support" and nothing
else. I tested with 2 compilers, gcc 4.9.2 (on debian jessie) and gcc
6.2.1 (on archlinux).

Actually, the compiler does not raise warnings on complete build (`make
mrproper`, configuration and `make`) but only on partial build (`make
arch/x86/kernel/vm86_32.o` or `touch arch/x86/kernel/vm86_32.c &&
make`). So maybe it is a compiler issue ?

The solution you propose in your other email (adding "(void)__vma;" in
the no-op split_huge_pmd) seems to fix the warnings on partial build.

Thanks,
Jérémy


[PATCH v2] tracing: Replace kmap with copy_from_user() in trace_marker writing

2016-12-08 Thread Steven Rostedt

Instead of using get_user_pages_fast() and kmap_atomic() when writing
to the trace_marker file, just allocate enough space on the ring buffer
directly, and write into it via copy_from_user().

Writing into the trace_marker file used to allocate a temporary buffer
to perform the copy_from_user(), as we didn't want to write into the
ring buffer if the copy failed. But as a trace_marker write is supposed
to be extremely fast, and allocating memory causes other tracepoints to
trigger, Peter Zijlstra suggested using get_user_pages_fast() and
kmap_atomic() to keep the user space pages in memory and reading it
directly. But Henrik Austad had issues with this because that caused
other tracepoints to trigger as well.

Instead, just allocate the space in the ring buffer and use
copy_from_user() directly. If it faults, return -EFAULT and write
"<faulted>" into the ring buffer.

Link: http://lkml.kernel.org/r/20161208124018.72dd0...@gandalf.local.home

Cc: Ingo Molnar 
Cc: Henrik Austad 
Cc: Peter Zijlstra 
Updates: d696b58ca2c3ca "tracing: Do not allocate buffer for trace_marker"
Suggested-by: Thomas Gleixner 
Signed-off-by: Steven Rostedt 
---

Changes from v1: Removed unused variables.

 kernel/trace/trace.c | 139 ++-
 1 file changed, 37 insertions(+), 102 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 60416bf7c591..6f420d7b703b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5738,61 +5738,6 @@ tracing_free_buffer_release(struct inode *inode, struct 
file *filp)
return 0;
 }
 
-static inline int lock_user_pages(const char __user *ubuf, size_t cnt,
- struct page **pages, void **map_page,
- int *offset)
-{
-   unsigned long addr = (unsigned long)ubuf;
-   int nr_pages = 1;
-   int ret;
-   int i;
-
-   /*
-* Userspace is injecting traces into the kernel trace buffer.
-* We want to be as non intrusive as possible.
-* To do so, we do not want to allocate any special buffers
-* or take any locks, but instead write the userspace data
-* straight into the ring buffer.
-*
-* First we need to pin the userspace buffer into memory,
-* which, most likely it is, because it just referenced it.
-* But there's no guarantee that it is. By using get_user_pages_fast()
-* and kmap_atomic/kunmap_atomic() we can get access to the
-* pages directly. We then write the data directly into the
-* ring buffer.
-*/
-
-   /* check if we cross pages */
-   if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
-   nr_pages = 2;
-
-   *offset = addr & (PAGE_SIZE - 1);
-   addr &= PAGE_MASK;
-
-   ret = get_user_pages_fast(addr, nr_pages, 0, pages);
-   if (ret < nr_pages) {
-   while (--ret >= 0)
-   put_page(pages[ret]);
-   return -EFAULT;
-   }
-
-   for (i = 0; i < nr_pages; i++)
-   map_page[i] = kmap_atomic(pages[i]);
-
-   return nr_pages;
-}
-
-static inline void unlock_user_pages(struct page **pages,
-void **map_page, int nr_pages)
-{
-   int i;
-
-   for (i = nr_pages - 1; i >= 0; i--) {
-   kunmap_atomic(map_page[i]);
-   put_page(pages[i]);
-   }
-}
-
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -5802,14 +5747,14 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
struct ring_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
-   struct page *pages[2];
-   void *map_page[2];
-   int nr_pages = 1;
+   const char faulted[] = "";
ssize_t written;
-   int offset;
int size;
int len;
 
+/* Used in tracing_mark_raw_write() as well */
+#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */
+
if (tracing_disabled)
return -EINVAL;
 
@@ -5821,30 +5766,31 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
 
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
 
-   nr_pages = lock_user_pages(ubuf, cnt, pages, map_page, );
-   if (nr_pages < 0)
-   return nr_pages;
-
local_save_flags(irq_flags);
-   size = sizeof(*entry) + cnt + 2; /* possible \n added */
+   size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+
+   /* If less than "", then make sure we can still add that */
+   if (cnt < FAULTED_SIZE)
+   size += FAULTED_SIZE - cnt;
+
buffer = tr->trace_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
   

Re: [PATCH] drivers: net: xgene: initialize slots

2016-12-08 Thread Iyappan Subramanian
On Thu, Dec 8, 2016 at 3:17 AM, Colin King  wrote:
> From: Colin Ian King 
>
> static analysis using cppcheck detected that slots was uninitialized.
> Fix this by initializing it to buf_pool->slots - 1
>
> Found using static analysis with CoverityScan, CID #1387620
>
> Fixes: a9380b0f7be818 ("drivers: net: xgene: Add support for Jumbo frame")
> Signed-off-by: Colin Ian King 
> ---
>  drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c 
> b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
> index 6c7eea8..899163c 100644
> --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
> +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
> @@ -636,6 +636,7 @@ static void xgene_enet_free_pagepool(struct 
> xgene_enet_desc_ring *buf_pool,
>
> dev = ndev_to_dev(buf_pool->ndev);
> head = buf_pool->head;
> +   slots = buf_pool->slots - 1;
>
> for (i = 0; i < 4; i++) {
> frag_size = xgene_enet_get_data_len(le64_to_cpu(desc[i ^ 1]));

Thanks, Colin.

Dan Carpenter  posted the fix already and
got accepted.
http://marc.info/?l=linux-netdev=148110980224343=2

> --
> 2.10.2
>


Re: [PATCH 1/1] arm64: mm: add config options for page table configuration

2016-12-08 Thread Catalin Marinas
On Thu, Dec 08, 2016 at 08:30:36AM -0800, Scott Branden wrote:
> On 16-12-08 02:00 AM, Catalin Marinas wrote:
> >On Wed, Dec 07, 2016 at 11:40:00AM -0800, Scott Branden wrote:
> >>Make MAX_PHYSMEM_BITS and SECTIONS_SIZE_BITS configurable by adding
> >>config options.
> >>Default to current settings currently defined in sparesmem.h.
> >>For systems wishing to save memory the config options can be overridden.
> >>Example, changing MAX_PHYSMEM_BITS from 48 to 36 at the same time as
> >>changing SECTION_SIZE_BITS from 30 to 26 frees 13MB of memory.
[...]
> > I would rather reduce SECTION_SIZE_BITS permanently where
> >feasible, like in this patch:
> >
> >http://lkml.kernel.org/r/1465821119-3384-1-git-send-email-jszh...@marvell.com
>
> This patch does not meet my requirements as I need SECTION_SIZE_BITS to be
> set to 28 to reduce memory

So with this patch, we reduce it to 27, it should be fine-grained enough
for 128MB sections. Alternatively, there were other suggestions here:

http://lkml.iu.edu/hypermail/linux/kernel/1604.1/03036.html

> and to allow memory hotplug to allocate a 256 MB section.

Can memory hotplug not work with 2*128MB sections in this case?

> My patch future proofs the tuning of the parameters by allowing
> any section size to be made. 

While MAX_PHYSMEM_BITS makes sense to users in general,
SECTION_SIZE_BITS is not always clear to the average user what it means
and its min/max boundaries. That's another reason (apart from single/few
Image case) why I prefer to not expose it as configuration option.

> I could combine the patch you list such that
> SECTION_SIZE_BITS defaults to 30 when CONFIG_ARM64_64_PAGES is selected and
> 27 otherwise.  Should it default to something else for 16K and 4K pages?

I haven't done any calculations for 16K yet but we could probably come
up with some formula based on PAGE_SHIFT to cover all cases.

> In terms of MAX_PHYSMEM_BITS, if our SoCs only use 40 (or less) bits I would
> also like the configuration functionality.  This allows us to make the
> SECTION_SIZE_BITS smaller.

So how small do you want SECTION_SIZE_BITS to be? As I said above, 128MB
sections should be sufficient in most cases and without the need to
reduce MAX_PHYSMEM_BITS.

-- 
Catalin


Re: [PATCH 1/2] mailbox: mailbox-test: add support for fasync/poll

2016-12-08 Thread Sudeep Holla


On 29/11/16 14:37, Sudeep Holla wrote:
> Currently the read operation on the message debug file returns error if
> there's no data ready to be read. It expects the userspace to retry if
> it fails. Since the mailbox response could be asynchronous, it would be
> good to add support to block the read until the data is available.
> 
> We can also implement poll file operations so that the userspace can
> wait to become ready to perform any I/O.
> 
> This patch implements the poll and fasync file operation callback for
> the test mailbox device.
> 
> Cc: Lee Jones 
> Cc: Jassi Brar 
> Signed-off-by: Sudeep Holla 

Gentle Ping!

-- 
Regards,
Sudeep


Re: [PATCH 08/11] ACPICA: Tables: Back port acpi_get_table_with_size() and early_acpi_os_unmap_memory() from Linux kernel

2016-12-08 Thread Dan Williams
On Thu, Dec 8, 2016 at 5:18 AM, Rafael J. Wysocki  wrote:
> On Thu, Dec 8, 2016 at 2:11 AM, Dan Williams  wrote:
>> On Tue, Nov 29, 2016 at 11:21 PM, Lv Zheng  wrote:
>>> ACPICA commit cac6790954d4d752a083e610b8a22febcd07
>>>
>>> This patch back ports Linux acpi_get_table_with_size() and
>>> early_acpi_os_unmap_memory() into ACPICA upstream to reduce divergences.
>>>
>>> The 2 APIs are used by Linux as table management APIs for long time, it
>>> contains a hidden logic that during the early stage, the mapped tables
>>> should be unmapped before the early stage ends.
>>>
>>> During the early stage, tables are handled by the following sequence:
>>>  acpi_get_table_with_size();
>>>  parse the table
>>>  early_acpi_os_unmap_memory();
>>> During the late stage, tables are handled by the following sequence:
>>>  acpi_get_table();
>>>  parse the table
>>> Linux uses acpi_gbl_permanent_mmap to distinguish the early stage and the
>>> late stage.
>>>
>>> The reasoning of introducing acpi_get_table_with_size() is: ACPICA will
>>> remember the early mapped pointer in acpi_get_table() and Linux isn't able 
>>> to
>>> prevent ACPICA from using the wrong early mapped pointer during the late
>>> stage as there is no API provided from ACPICA to be an inverse of
>>> acpi_get_table() to forget the early mapped pointer.
>>>
>>> But how ACPICA can work with the early/late stage requirement? Inside of
>>> ACPICA, tables are ensured to be remained in "INSTALLED" state during the
>>> early stage, and they are carefully not transitioned to "VALIDATED" state
>>> until the late stage. So the same logic is in fact implemented inside of
>>> ACPICA in a different way. The gap is only that the feature is not provided
>>> to the OSPMs in an accessible external API style.
>>>
>>> It then is possible to fix the gap by providing an inverse of
>>> acpi_get_table() from ACPICA, so that the two Linux sequences can be
>>> combined:
>>>  acpi_get_table();
>>>  parse the table
>>>  acpi_put_table();
>>> In order to work easier with the current Linux code, acpi_get_table() and
>>> acpi_put_table() is implemented in a usage counting based style:
>>>  1. When the usage count of the table is increased from 0 to 1, table is
>>> mapped and .Pointer is set with the mapping address (VALIDATED);
>>>  2. When the usage count of the table is decreased from 1 to 0, .Pointer
>>> is unset and the mapping address is unmapped (INVALIDATED).
>>> So that we can deploy the new APIs to Linux with minimal effort by just
>>> invoking acpi_get_table() in acpi_get_table_with_size() and invoking
>>> acpi_put_table() in early_acpi_os_unmap_memory(). Lv Zheng.
>>>
>>> Link: https://github.com/acpica/acpica/commit/cac67909
>>> Signed-off-by: Lv Zheng 
>>> Signed-off-by: Bob Moore 
>>
>> This commit in -next (071b39575679 ACPICA: Tables: Back port
>> acpi_get_table_with_size() and early_acpi_os_unmap_memory() from Linux
>> kernel) causes a regression in my nfit/nvdimm test environment. The
>> nfit produced by QEMU no longer results in a nvdimm bus being created.
>>
>> I have not root caused it, but I'm using the following command line
>> options to create an nfit in qemu-2.6.  Reverting the commit leads
>> compile failures.
>
> Would the build problems go away if you reverted "ACPICA: Tables:
> Allow FADT to be customized with virtual address" (linux-next commit
> cf334d3174f9) in addition to it?

Yes, reverting those two commits gets me back to a functional environment:

Revert "ACPICA: Tables: Allow FADT to be customized with virtual address"
Revert "ACPICA: Tables: Back port acpi_get_table_with_size() and
early_acpi_os_un


Re: [PATCH] mm: page_alloc: High-order per-cpu page allocator v7

2016-12-08 Thread Mel Gorman
On Thu, Dec 08, 2016 at 06:19:51PM +0100, Jesper Dangaard Brouer wrote:
> > > See patch below signature.
> > > 
> > > Besides I think you misunderstood me, you can adjust:
> > >  sysctl net.core.rmem_max
> > >  sysctl net.core.wmem_max
> > > 
> > > And you should if you plan to use/set 851968 as socket size for UDP
> > > remote tests, else you will be limited to the "max" values (212992 well
> > > actually 425984 2x default value, for reasons I cannot remember)
> > >   
> > 
> > The intent is to use the larger values to avoid packet loss on
> > UDP_STREAM.
> 
> We do seem to misunderstand each-other.
> I was just pointing out two things:
> 
> 1. Notice the difference between "max" and "default" proc setting.
>Only adjust the "max" setting.
> 
> 2. There was simple BASH-shell script error in your commit.
>Patch below fix it.
> 

Understood now.

> [PATCH] mmtests: actually use variable SOCKETSIZE_OPT
> 
> From: Jesper Dangaard Brouer 
> 

Applied, thanks!

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 3/3] pciehp: Fix race condition handling surprise link-down

2016-12-08 Thread Bjorn Helgaas
On Thu, Dec 08, 2016 at 12:20:58PM -0500, Keith Busch wrote:
> On Thu, Dec 08, 2016 at 09:11:58AM -0600, Bjorn Helgaas wrote:
> > On Wed, Dec 07, 2016 at 07:04:33PM -0500, Keith Busch wrote:
> > > 
> > > It currently looks safe to nest the p_slot->lock under the
> > > p_slot->hotplug_lock if that is you recommendation.
> > 
> > I'm not making a recommendation; that would take a lot more thought
> > than I've given this.
> > 
> > There are at least three locks in pciehp:
> > 
> >   struct controller.ctrl_lock
> >   struct slot.lock
> >   struct slot.hotplug_lock
> > 
> > There shouldn't really be any performance paths in pciehp, so I'm
> > pretty doubtful that we need such complicated locking.
> 
> They are protecting different things, but I agree it looks like room
> for simplification exists.

If we can't simplify this immediately, can we add a comment about what
the different locks protect so people have a hint about which one to
use?  For example, it looks like this patch might benefit from that
knowledge.

> > > Alternatively we could fix this if we used an ordered work queue for
> > > the slot's work, but that is a significantly more complex change.
> > 
> > You mean we can currently execute things from the work queue in a
> > different order than they were enqueued?  That sounds ... difficult to
> > analyze, to say the least.
> 
> The events are dequeued in order, but they don't wait for the previous
> to complete, so pciehp's current work queue can have multiple events
> executing in parallel. That's part of why rapid pciehp slot events are
> a little more difficult to follow, and I think we may even be unsafely
> relying on the order the mutexes are obtained from these work events.

Hmm.  I certainly did not expect multiple events executing in
parallel.  That sounds like a pretty serious issue to me.

> Partly unrelated, we could process surprise removals significantly
> faster (microseconds vs. seconds), with the limited pci access series
> here, giving fewer simultaneously executing events to consider:
> 
>  https://www.spinics.net/lists/linux-pci/msg55585.html
> 
> Do you have any concerns with that series?

I'm dragging my feet because I want the removal process to become
simpler to understand, not more complicated, and we're exposing more
issues that I didn't even know about.

> > I don't know much about work queues, and Documentation/workqueue.txt
> > doesn't mention alloc_ordered_workqueue().  Is that what you are
> > referring to?
> 
> Yes, the alloc_ordered_workqueue is what I had in mind, though switching
> to that is not as simple as calling the different API. I am looking into
> that for longer term, but for the incremental fix, do you think we can
> go forward with Raj's proposal?

I'd like to at least see a consistent locking strategy for protecting
p_slot->state.  All the existing updates are protected by
p_slot->lock, but the one Raj is adding is protected by
p_slot->hotplug_lock.

Bjorn


Re: [PATCH v4 2/2] usb: dwc3: core: Support the dwc3 host suspend/resume

2016-12-08 Thread Felipe Balbi

Hi,

Baolin Wang  writes:
>> Baolin Wang  writes:
> On 28 November 2016 at 14:43, Baolin Wang  wrote:
>> For some mobile devices with strict power management, we also want to 
>> suspend
>> the host when the slave is detached for power saving. Thus we add the 
>> host
>> suspend/resume functions to support this requirement.
>>
>> Signed-off-by: Baolin Wang 
>> ---
>> Changes since v3:
>>  - No updates.
>>
>> Changes since v2:
>>  - Remove pm_children_suspended() and other unused macros.
>>
>>  Changes since v1:
>>- Add pm_runtime.h head file to avoid kbuild error.
>> ---
>>  drivers/usb/dwc3/Kconfig |7 +++
>>  drivers/usb/dwc3/core.c  |   26 +-
>>  drivers/usb/dwc3/core.h  |   15 +++
>>  drivers/usb/dwc3/host.c  |   37 +
>>  4 files changed, 84 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/usb/dwc3/Kconfig b/drivers/usb/dwc3/Kconfig
>> index a45b4f1..47bb2f3 100644
>> --- a/drivers/usb/dwc3/Kconfig
>> +++ b/drivers/usb/dwc3/Kconfig
>> @@ -47,6 +47,13 @@ config USB_DWC3_DUAL_ROLE
>>
>>  endchoice
>>
>> +config USB_DWC3_HOST_SUSPEND
>> +   bool "Choose if the DWC3 host (xhci) can be suspend/resume"
>> +   depends on USB_DWC3_HOST=y || USB_DWC3_DUAL_ROLE=y

 why do you need another Kconfig for this? Just enable it already :-p
>>>
>>> I assume some platforms may do not need this feature. But okay, I can
>>> remove this kconfig and enable it.
>>
>> thanks :-)
>>
>> diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
>> index 9a4a5e4..7ad4bc3 100644
>> --- a/drivers/usb/dwc3/core.c
>> +++ b/drivers/usb/dwc3/core.c
>> @@ -1091,6 +1091,7 @@ static int dwc3_probe(struct platform_device *pdev)
>> pm_runtime_use_autosuspend(dev);
>> pm_runtime_set_autosuspend_delay(dev, 
>> DWC3_DEFAULT_AUTOSUSPEND_DELAY);
>> pm_runtime_enable(dev);
>> +   pm_suspend_ignore_children(dev, true);

 why do you need this?
>>>
>>> Since the dwc3 device is the parent deive of xhci device, if we want
>>> to suspend dwc3 device, we need to suspend child device (xhci device)
>>> manually by issuing pm_runtime_put_sync() in dwc3_host_suspend(). In
>>> pm_runtime_put_sync(), it will check if the children (xhci device) of
>>> dwc3 device have been in suspend state, if not we will suspend dwc3
>>> device failed.
>>>
>>> We get/put the child device manually in parent device's runtime
>>> callback, we need to ignore the child device's runtime state, or it
>>> will failed due to the dependency.
>>
>> I see. Good explanation :-)
>>
>> There's one thing though, if you want to runtime suspend the gadget and
>> dwc3 is working on peripheral mode, host side (XHCI) should already be
>> runtime suspended because there's nothing attached to it. Why isn't it
>> runtime suspended?
>
> Since we have get the runtime count for xHCI device in
> xhci_plat_probe(), in case it will suspend automatically if some
> controllers do not want xHCI enters runtime suspend automatically. So
> the parent device (dwc3 device) need to get/put the xHCI device's
> runtime count  to resume/suspend xHCI.

IMHO, that's a bug in xhci-plat, not something dwc3 should work around.

-- 
balbi


Re: staging: comedi: usbduxsigma: Split a condition check in usbduxsigma_alloc_usb_buffers()

2016-12-08 Thread Ian Abbott

On 08/12/16 15:46, SF Markus Elfring wrote:

* Reduce memory allocation sizes for two function calls.


Is this implementation detail worth for further considerations?


I assume you are referring to the allocation of devpriv->ai_urbs and 
devpriv->ao_urbs?  Your patch does not reduce their sizes; `urb` and 
`*devpriv->ai_urbs` have the same type `struct urb *`.  Having said 
that, `sizeof(*devpriv->ai_urbs)` is cleaner code than `sizeof(urb)` in 
this case.


--
-=( Ian Abbott @ MEV Ltd.E-mail:  )=-
-=(  Web: http://www.mev.co.uk/  )=-


Re: [PATCH v2 1/2] devicetree: i2c-hid: Add Wacom digitizer + regulator support

2016-12-08 Thread Dmitry Torokhov
On Thu, Dec 08, 2016 at 10:26:41AM -0600, Rob Herring wrote:
> On Thu, Dec 8, 2016 at 10:13 AM, Dmitry Torokhov
>  wrote:
> > On December 8, 2016 8:03:06 AM PST, Rob Herring  wrote:
> >>On Thu, Dec 8, 2016 at 9:41 AM, Benjamin Tissoires
> >> wrote:
> >>> On Dec 06 2016 or thereabouts, Doug Anderson wrote:
>  Hi,
> 
>  On Tue, Dec 6, 2016 at 6:56 AM, Rob Herring  wrote:
>  > On Tue, Dec 6, 2016 at 2:48 AM, Benjamin Tissoires
>  >  wrote:
>  >> On Dec 05 2016 or thereabouts, Rob Herring wrote:
>  >>> On Thu, Dec 01, 2016 at 09:24:50AM -0800, Brian Norris wrote:
>  >>> > Hi Benjamin and Rob,
>  >>> >
>  >>> > On Thu, Dec 01, 2016 at 03:34:34PM +0100, Benjamin Tissoires
> >>wrote:
>  >>> > > On Nov 30 2016 or thereabouts, Brian Norris wrote:
>  >>> > > > From: Caesar Wang 
>  >>> > > >
>  >>> > > > Add a compatible string and regulator property for Wacom
> >>W9103
>  >>> > > > digitizer. Its VDD supply may need to be enabled before
> >>using it.
>  >>> > > >
>  >>> > > > Signed-off-by: Caesar Wang 
>  >>> > > > Cc: Rob Herring 
>  >>> > > > Cc: Jiri Kosina 
>  >>> > > > Cc: linux-in...@vger.kernel.org
>  >>> > > > Signed-off-by: Brian Norris 
>  >>> > > > ---
>  >>> > > > v1 was a few months back. I finally got around to
> >>rewriting it based on
>  >>> > > > DT binding feedback.
>  >>> > > >
>  >>> > > > v2:
>  >>> > > >  * add compatible property for wacom
>  >>> > > >  * name the regulator property specifically (VDD)
>  >>> > > >
>  >>> > > >  Documentation/devicetree/bindings/input/hid-over-i2c.txt
> >>| 6 +-
>  >>> > > >  1 file changed, 5 insertions(+), 1 deletion(-)
>  >>> > > >
>  >>> > > > diff --git
> >>a/Documentation/devicetree/bindings/input/hid-over-i2c.txt
> >>b/Documentation/devicetree/bindings/input/hid-over-i2c.txt
>  >>> > > > index 488edcb264c4..eb98054e60c9 100644
>  >>> > > > ---
> >>a/Documentation/devicetree/bindings/input/hid-over-i2c.txt
>  >>> > > > +++
> >>b/Documentation/devicetree/bindings/input/hid-over-i2c.txt
>  >>> > > > @@ -11,12 +11,16 @@ If this binding is used, the kernel
> >>module i2c-hid will handle the communication
>  >>> > > >  with the device and the generic hid core layer will
> >>handle the protocol.
>  >>> > > >
>  >>> > > >  Required properties:
>  >>> > > > -- compatible: must be "hid-over-i2c"
>  >>> > > > +- compatible: must be "hid-over-i2c", or a
> >>device-specific string like:
>  >>> > > > +* "wacom,w9013"
>  >>> > >
>  >>> > > NACK on this one.
>  >>> > >
>  >>> > > After re-reading the v1 submission I realized Rob asked for
> >>this change,
>  >>> > > but I strongly disagree.
>  >>> > >
>  >>> > > HID over I2C is a generic protocol, in the same way HID over
> >>USB is. We
>  >>> > > can not start adding device specifics here, this is opening
> >>the can of
>  >>> > > worms. If the device is a HID one, nothing else should
> >>matter. The rest
>  >>> > > (description of the device, name, etc...) is all provided by
> >>the
>  >>> > > protocol.
>  >>> >
>  >>> > I should have spoken up when Rob made the suggestion, because
> >>I more or
>  >>> > less agree with Benjamin here. I don't really see why this
> >>needs to have
>  >>> > a specialized compatible string, as the property is still
> >>fairly
>  >>> > generic, and the entire device handling is via a generic
> >>protocol. The
>  >>> > fact that we manage its power via a regulator is not very
>  >>> > device-specific.
>  >>>
>  >>> It doesn't matter that the protocol is generic. The device
> >>attached and
>  >>> the implementation is not. Implementations have been known to
> >>have
>  >>> bugs/quirks (generally speaking, not HID over I2C in
> >>particular). There
>  >>> are also things outside the scope of what is 'hid-over-i2c' like
> >>what's
>  >>> needed to power-on the device which this patch clearly show.
>  >>
>  >> Yes, there are bugs, quirks, even with HID. But the HID declares
> >>within
>  >> the protocol the Vendor ID and the Product ID, which means once
> >>we pass
>  >> the initial "device is ready" step and can do a single i2c
> >>write/read,
>  >> we don't give a crap about device tree anymore.
>  >>
>  >> This is just about setting the device in shape so that it can
> >>answer a
>  >> single write/read.
>  >>
>  >>>
>  >>> This is no different than a panel attached via LVDS, eDP, etc.,
> >>or
>  >>> USB/PCIe device hard-wired on a board. They all use standard
> >>protocols
>  >>> and all need additional data to describe them. Of course, 

[PATCH] ARM: Ignore compressed kernel build products

2016-12-08 Thread Florian Fainelli
When we select a kernel compression scheme, we will end-up with e.g:
piggy.xzkern under arch/arm/boot/compressed/, let's ignore these files.

Signed-off-by: Florian Fainelli 
---
 arch/arm/boot/compressed/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/compressed/.gitignore 
b/arch/arm/boot/compressed/.gitignore
index 86b2f5d28240..06686768610f 100644
--- a/arch/arm/boot/compressed/.gitignore
+++ b/arch/arm/boot/compressed/.gitignore
@@ -4,6 +4,7 @@ font.c
 lib1funcs.S
 hyp-stub.S
 piggy_data
+piggy.*
 vmlinux
 vmlinux.lds
 
-- 
2.9.3



[PATCH] ARM: Ignore compressed kernel build products

2016-12-08 Thread Florian Fainelli
When we select a kernel compression scheme, we will end-up with e.g:
piggy.xzkern under arch/arm/boot/compressed/, let's ignore these files.

Signed-off-by: Florian Fainelli 
---
 arch/arm/boot/compressed/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/compressed/.gitignore 
b/arch/arm/boot/compressed/.gitignore
index 86b2f5d28240..06686768610f 100644
--- a/arch/arm/boot/compressed/.gitignore
+++ b/arch/arm/boot/compressed/.gitignore
@@ -4,6 +4,7 @@ font.c
 lib1funcs.S
 hyp-stub.S
 piggy_data
+piggy.*
 vmlinux
 vmlinux.lds
 
-- 
2.9.3



Re: [PATCH v2] rpmsg: qcom_smd: Add support for "label" property

2016-12-08 Thread Bjorn Andersson
On Thu 08 Dec 04:14 PST 2016, Jeremy McNicoll wrote:

> On 2016-12-02 2:06 PM, Bjorn Andersson wrote:
> >Add support for the "label" property, used to give the edge a name other
> >than the one of the DT node. This allows the implementor to provide
> >consistently named edges when using the rpmsg character device.
> >
> >Signed-off-by: Bjorn Andersson 
> >---
> >
> >Changes since v1:
> >- Moved sysfs attribute to the correct (this) patch
> >
> > drivers/rpmsg/qcom_smd.c | 22 ++
> > 1 file changed, 22 insertions(+)
> >
> 
> Could you please add something to Documentation/devicetree/bindings/
> showing this "label" property.
> 
> Or add a "label" to an existing DTS so people/implementors  have a reference
> if they so choose to use this property.
> 

The DT binding update was already in flight and has been Acked by Rob
and Andy. I just didn't get the implementation out until now.

You can find it here:
https://patchwork.kernel.org/patch/9385753/

Thanks for having a look.

Regards,
Bjorn


[PATCH v2 0/4] ARM: Add support for CONFIG_DEBUG_VIRTUAL

2016-12-08 Thread Florian Fainelli
This patch series builds on top of Laura's [PATCHv5 00/10] CONFIG_DEBUG_VIRTUAL
for arm64 to add support for CONFIG_DEBUG_VIRTUAL for ARM.

This was tested on a Brahma B15 platform (ARMv7 + HIGHMEM + LPAE).

Note that the treewide changes would involve a huge CC list, which
is why it has been purposely trimmed to just focusing on the DEBUG_VIRTUAL
aspect.

Changes in v2:

- Modified MTD LART driver not to create symbol conflicts with
  KERNEL_START
- Fixed patch that defines and uses KERNEL_START/END
- Fixed __pa_symbol()'s definition
- Inline __pa_symbol() check wihtin the VIRTUAL_BUG_ON statement
- Simplified check for virtual addresses
- Added a tree-wide patch changing SMP/PM implementations to use
  __pa_symbol(), build tested against multi_v{5,7}_defconfig

Thanks!

Florian Fainelli (4):
  mtd: lart: Rename partition defines to be prefixed with PART_
  ARM: Define KERNEL_START and KERNEL_END
  ARM: Add support for CONFIG_DEBUG_VIRTUAL
  ARM: treewide: Replace uses of virt_to_phys with __pa_symbol

 arch/arm/Kconfig  |   1 +
 arch/arm/boot/compressed/piggy.xzkern | Bin 0 -> 2998584 bytes
 arch/arm/common/mcpm_entry.c  |  12 +++
 arch/arm/include/asm/memory.h |  23 --
 arch/arm/mach-alpine/platsmp.c|   2 +-
 arch/arm/mach-axxia/platsmp.c |   2 +-
 arch/arm/mach-bcm/bcm63xx_smp.c   |   2 +-
 arch/arm/mach-bcm/platsmp-brcmstb.c   |   2 +-
 arch/arm/mach-bcm/platsmp.c   |   4 +--
 arch/arm/mach-berlin/platsmp.c|   2 +-
 arch/arm/mach-exynos/firmware.c   |   4 +--
 arch/arm/mach-exynos/mcpm-exynos.c|   2 +-
 arch/arm/mach-exynos/platsmp.c|   4 +--
 arch/arm/mach-exynos/pm.c |   6 ++--
 arch/arm/mach-exynos/suspend.c|   6 ++--
 arch/arm/mach-hisi/platmcpm.c |   2 +-
 arch/arm/mach-hisi/platsmp.c  |   6 ++--
 arch/arm/mach-imx/platsmp.c   |   2 +-
 arch/arm/mach-imx/pm-imx6.c   |   2 +-
 arch/arm/mach-imx/src.c   |   2 +-
 arch/arm/mach-mediatek/platsmp.c  |   2 +-
 arch/arm/mach-mvebu/pm.c  |   2 +-
 arch/arm/mach-mvebu/pmsu.c|   2 +-
 arch/arm/mach-mvebu/system-controller.c   |   2 +-
 arch/arm/mach-omap2/control.c |   8 ++---
 arch/arm/mach-omap2/omap-mpuss-lowpower.c |   8 ++---
 arch/arm/mach-omap2/omap-smp.c|   4 +--
 arch/arm/mach-prima2/platsmp.c|   2 +-
 arch/arm/mach-prima2/pm.c |   2 +-
 arch/arm/mach-pxa/palmz72.c   |   2 +-
 arch/arm/mach-pxa/pxa25x.c|   2 +-
 arch/arm/mach-pxa/pxa27x.c|   2 +-
 arch/arm/mach-pxa/pxa3xx.c|   2 +-
 arch/arm/mach-realview/platsmp-dt.c   |   2 +-
 arch/arm/mach-rockchip/platsmp.c  |   4 +--
 arch/arm/mach-rockchip/pm.c   |   2 +-
 arch/arm/mach-s3c24xx/mach-jive.c |   2 +-
 arch/arm/mach-s3c24xx/pm-s3c2410.c|   2 +-
 arch/arm/mach-s3c24xx/pm-s3c2416.c|   2 +-
 arch/arm/mach-s3c64xx/pm.c|   2 +-
 arch/arm/mach-s5pv210/pm.c|   2 +-
 arch/arm/mach-sa1100/pm.c |   2 +-
 arch/arm/mach-shmobile/platsmp-apmu.c |   6 ++--
 arch/arm/mach-shmobile/platsmp-scu.c  |   4 +--
 arch/arm/mach-socfpga/platsmp.c   |   4 +--
 arch/arm/mach-spear/platsmp.c |   2 +-
 arch/arm/mach-sti/platsmp.c   |   2 +-
 arch/arm/mach-sunxi/platsmp.c |   4 +--
 arch/arm/mach-tango/platsmp.c |   2 +-
 arch/arm/mach-tango/pm.c  |   2 +-
 arch/arm/mach-tegra/reset.c   |   4 +--
 arch/arm/mach-ux500/platsmp.c |   2 +-
 arch/arm/mach-vexpress/dcscb.c|   2 +-
 arch/arm/mach-vexpress/platsmp.c  |   2 +-
 arch/arm/mach-vexpress/tc2_pm.c   |   4 +--
 arch/arm/mach-zx/platsmp.c|   4 +--
 arch/arm/mach-zynq/platsmp.c  |   2 +-
 arch/arm/mm/Makefile  |   1 +
 arch/arm/mm/init.c|   7 ++--
 arch/arm/mm/mmu.c |   6 +---
 arch/arm/mm/physaddr.c|  51 ++
 drivers/mtd/devices/lart.c|  24 +++---
 62 files changed, 173 insertions(+), 108 deletions(-)
 create mode 100644 arch/arm/boot/compressed/piggy.xzkern
 create mode 100644 arch/arm/mm/physaddr.c

-- 
2.9.3



Re: [PATCH 5/8] efi: Get the secure boot status [ver #5]

2016-12-08 Thread David Howells
Lukas Wunner  wrote:

> > +out_efi_err:
> > +   pr_efi_err(sys_table_arg, "Could not determine UEFI Secure Boot 
> > status.\n");
> > +   if (status == EFI_NOT_FOUND)
> > +   return efi_secureboot_mode_disabled;
> > +   return efi_secureboot_mode_unknown;
> > +}
> 
> In the out_efi_err path, the if-statement needs to come before the
> pr_efi_err() call.  Otherwise it would be a change of behaviour for
> ARM to what we have now.

As I understand it, if the BIOS is an EFI BIOS, these variables must exist -
in which case I would argue that the pr_efi_err-statement should be before the
if-statement.

David


[PATCH v18 04/15] clocksource/drivers/arm_arch_timer: rename some enums and defines.

2016-12-08 Thread fu . wei
From: Fu Wei 

Rename some enums and defines, to unify the format of enums and defines
in arm_arch_timer.h, also update all the users of these enums and defines:
drivers/clocksource/arm_arch_timer.c
virt/kvm/arm/hyp/timer-sr.c

No functional change.

Signed-off-by: Fu Wei 
Tested-by: Xiongfeng Wang 
---
 drivers/clocksource/arm_arch_timer.c | 111 ++-
 include/clocksource/arm_arch_timer.h |  24 
 virt/kvm/arm/hyp/timer-sr.c  |   6 +-
 3 files changed, 73 insertions(+), 68 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 15341cf..231175b 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -66,11 +66,11 @@ struct arch_timer {
 #define to_arch_timer(e) container_of(e, struct arch_timer, evt)
 
 static u32 arch_timer_rate;
-static int arch_timer_ppi[MAX_TIMER_PPI];
+static int arch_timer_ppi[ARCH_TIMER_MAX_TIMER_PPI];
 
 static struct clock_event_device __percpu *arch_timer_evt;
 
-static enum arch_timer_ppi_nr arch_timer_uses_ppi = VIRT_PPI;
+static enum arch_timer_ppi_nr arch_timer_uses_ppi = ARCH_TIMER_VIRT_PPI;
 static bool arch_timer_c3stop;
 static bool arch_timer_mem_use_virtual;
 
@@ -340,7 +340,7 @@ static void fsl_a008585_set_sne(struct clock_event_device 
*clk)
if (!static_branch_unlikely(&arch_timer_read_ool_enabled))
return;
 
-   if (arch_timer_uses_ppi == VIRT_PPI)
+   if (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI)
clk->set_next_event = fsl_a008585_set_next_event_virt;
else
clk->set_next_event = fsl_a008585_set_next_event_phys;
@@ -352,7 +352,7 @@ static void __arch_timer_setup(unsigned type,
 {
clk->features = CLOCK_EVT_FEAT_ONESHOT;
 
-   if (type == ARCH_CP15_TIMER) {
+   if (type == ARCH_TIMER_TYPE_CP15) {
if (arch_timer_c3stop)
clk->features |= CLOCK_EVT_FEAT_C3STOP;
clk->name = "arch_sys_timer";
@@ -360,14 +360,14 @@ static void __arch_timer_setup(unsigned type,
clk->cpumask = cpumask_of(smp_processor_id());
clk->irq = arch_timer_ppi[arch_timer_uses_ppi];
switch (arch_timer_uses_ppi) {
-   case VIRT_PPI:
+   case ARCH_TIMER_VIRT_PPI:
clk->set_state_shutdown = arch_timer_shutdown_virt;
clk->set_state_oneshot_stopped = 
arch_timer_shutdown_virt;
clk->set_next_event = arch_timer_set_next_event_virt;
break;
-   case PHYS_SECURE_PPI:
-   case PHYS_NONSECURE_PPI:
-   case HYP_PPI:
+   case ARCH_TIMER_PHYS_SECURE_PPI:
+   case ARCH_TIMER_PHYS_NONSECURE_PPI:
+   case ARCH_TIMER_HYP_PPI:
clk->set_state_shutdown = arch_timer_shutdown_phys;
clk->set_state_oneshot_stopped = 
arch_timer_shutdown_phys;
clk->set_next_event = arch_timer_set_next_event_phys;
@@ -447,8 +447,8 @@ static void arch_counter_set_user_access(void)
 
 static bool arch_timer_has_nonsecure_ppi(void)
 {
-   return (arch_timer_uses_ppi == PHYS_SECURE_PPI &&
-   arch_timer_ppi[PHYS_NONSECURE_PPI]);
+   return (arch_timer_uses_ppi == ARCH_TIMER_PHYS_SECURE_PPI &&
+   arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]);
 }
 
 static u32 check_ppi_trigger(int irq)
@@ -469,14 +469,15 @@ static int arch_timer_starting_cpu(unsigned int cpu)
struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt);
u32 flags;
 
-   __arch_timer_setup(ARCH_CP15_TIMER, clk);
+   __arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk);
 
flags = check_ppi_trigger(arch_timer_ppi[arch_timer_uses_ppi]);
enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], flags);
 
if (arch_timer_has_nonsecure_ppi()) {
-   flags = check_ppi_trigger(arch_timer_ppi[PHYS_NONSECURE_PPI]);
-   enable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI], flags);
+   flags = 
check_ppi_trigger(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]);
+   enable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI],
+ flags);
}
 
arch_counter_set_user_access();
@@ -513,16 +514,17 @@ arch_timer_detect_rate(void __iomem *cntbase, struct 
device_node *np)
 static void arch_timer_banner(unsigned type)
 {
pr_info("%s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n",
-   type & ARCH_CP15_TIMER ? "cp15" : "",
-   type == (ARCH_CP15_TIMER | ARCH_MEM_TIMER) ?  " and " : "",
-   type & ARCH_MEM_TIMER ? "mmio" : "",
+   type & ARCH_TIMER_TYPE_CP15 ? "cp15" : "",
+   type == (ARCH_TIMER_TYPE_CP15 | 

Re: [PATCH v3 0/6] net: stmmac: make DMA programmable burst length more configurable

2016-12-08 Thread David Miller
From: Niklas Cassel 
Date: Wed, 7 Dec 2016 15:20:02 +0100

> Make DMA programmable burst length more configurable in the stmmac driver.
> 
> This is done by adding support for independent pbl for tx/rx through DT.
> More fine grained tuning of pbl is possible thanks to a DT property saying
> that we should NOT multiply pbl values by x8/x4 in hardware.
> 
> All new DT properties are optional, and created in a way that it will not
> affect any existing DT configurations.

Series applied to net-next, thanks.


[PATCH v3 11/15] livepatch: use kstrtobool() in enabled_store()

2016-12-08 Thread Josh Poimboeuf
The sysfs enabled value is a boolean, so kstrtobool() is a better fit
for parsing the input string since it does the range checking for us.

Suggested-by: Petr Mladek 
Signed-off-by: Josh Poimboeuf 
---
 kernel/livepatch/core.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 6a137e1..8ca8a0e 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -408,26 +408,23 @@ static ssize_t enabled_store(struct kobject *kobj, struct 
kobj_attribute *attr,
 {
struct klp_patch *patch;
int ret;
-   unsigned long val;
+   bool enabled;
 
-   ret = kstrtoul(buf, 10, &val);
+   ret = kstrtobool(buf, &enabled);
if (ret)
return -EINVAL;
 
-   if (val > 1)
-   return -EINVAL;
-
patch = container_of(kobj, struct klp_patch, kobj);
 
mutex_lock(&klp_mutex);
 
-   if (patch->enabled == val) {
+   if (patch->enabled == enabled) {
/* already in requested state */
ret = -EINVAL;
goto err;
}
 
-   if (val) {
+   if (enabled) {
ret = __klp_enable_patch(patch);
if (ret)
goto err;
-- 
2.7.4



Re: [RFC, PATCHv1 00/28] 5-level paging

2016-12-08 Thread hpa
On December 8, 2016 10:16:07 AM PST, Linus Torvalds 
 wrote:
>On Thu, Dec 8, 2016 at 8:21 AM, Kirill A. Shutemov
> wrote:
>>
>> This patchset is still very early. There are a number of things
>missing
>> that we have to do before asking anyone to merge it (listed below).
>> It would be great if folks can start testing applications now (in
>QEMU) to
>> look for breakage.
>> Any early comments on the design or the patches would be appreciated
>as
>> well.
>
>Looks ok to me. Starting off with a compile-time config option seems
>fine.
>
>I do think that the x86 cpuid part should (patch 15) should be the
>first patch, so that we see "la57" as a capability in /proc/cpuinfo
>whether it's being enabled or not? We should merge that part
>regardless of any mm patches, I think.
>
>   Linus

Definitely.
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.


RE: ATH9 driver issues on ARM64

2016-12-08 Thread Bharat Kumar Gogada
> On 08/12/16 15:29, Bharat Kumar Gogada wrote:
> 
> Two things:
> 
> > Here is the cat /proc/interrupts (after we do interface up):
> >
> > root@:~# ifconfig wlan0 up
> > [ 1548.926601] IPv6: ADDRCONF(NETDEV_UP): wlan0: link is not ready
> > root@Xilinx-ZCU102-2016_3:~# cat /proc/interrupts
> >CPU0   CPU1   CPU2   CPU3
> >   1:  0  0  0  0 GICv2  29 Edge  
> > arch_timer
> >   2:  19873  20058  19089  17435 GICv2  30 Edge  
> > arch_timer
> >  12:  0  0  0  0 GICv2 156 Level 
> > zynqmp-dma
> >  13:  0  0  0  0 GICv2 157 Level 
> > zynqmp-dma
> >  14:  0  0  0  0 GICv2 158 Level 
> > zynqmp-dma
> >  15:  0  0  0  0 GICv2 159 Level 
> > zynqmp-dma
> >  16:  0  0  0  0 GICv2 160 Level 
> > zynqmp-dma
> >  17:  0  0  0  0 GICv2 161 Level 
> > zynqmp-dma
> >  18:  0  0  0  0 GICv2 162 Level 
> > zynqmp-dma
> >  19:  0  0  0  0 GICv2 163 Level 
> > zynqmp-dma
> >  20:  0  0  0  0 GICv2 164 Level 
> > Mali_GP_MMU, Mali_GP,
> Mali_PP0_MMU, Mali_PP0, Mali_PP1_MMU, Mali_PP1
> 
> I'm not even going to consider looking at something that is running out of 
> tree
> code. So please start things with a fresh kernel that doesn't contain stuff we
> can't debug.
> 
Ok will test with fresh kernel.

> >  30:  0  0  0  0 GICv2  95 Level 
> > eth0, eth0
> > 206:314  0  0  0 GICv2  49 Level 
> > cdns-i2c
> > 207: 40  0  0  0 GICv2  50 Level 
> > cdns-i2c
> > 209:  0  0  0  0 GICv2 150 Level 
> > nwl_pcie:misc
> > 214: 12  0  0  0 GICv2  47 Level 
> > ff0f.spi
> > 215:  0  0  0  0 GICv2  58 Level 
> > ffa6.rtc
> > 216:  0  0  0  0 GICv2  59 Level 
> > ffa6.rtc
> > 217:  0  0  0  0 GICv2 165 Level 
> > ahci-ceva[fd0c.ahci]
> > 218: 61  0  0  0 GICv2  81 Level 
> > mmc0
> > 219:  0  0  0  0 GICv2 187 Level 
> > arm-smmu global fault
> > 220:471  0  0  0 GICv2  53 Level 
> > xuartps
> > 223:  0  0  0  0 GICv2 154 Level 
> > fd4c.dma
> > 224:  3  0  0  0 dummy   1 Edge  
> > ath9k
> 
> What is this "dummy" controller? And if that's supposed to be a legacy 
> interrupt
> from the PCI device, it has the wrong trigger.

Yes it is for legacy interrupt, wrong trigger means ? 

Thanks & Regards,
Bharat


Re: ATH9 driver issues on ARM64

2016-12-08 Thread Marc Zyngier
On 08/12/16 18:33, Bharat Kumar Gogada wrote:
>> On 08/12/16 15:29, Bharat Kumar Gogada wrote:
>>
>> Two things:
>>
>>> Here is the cat /proc/interrupts (after we do interface up):
>>>
>>> root@:~# ifconfig wlan0 up
>>> [ 1548.926601] IPv6: ADDRCONF(NETDEV_UP): wlan0: link is not ready
>>> root@Xilinx-ZCU102-2016_3:~# cat /proc/interrupts
>>>CPU0   CPU1   CPU2   CPU3
>>>   1:  0  0  0  0 GICv2  29 Edge  
>>> arch_timer
>>>   2:  19873  20058  19089  17435 GICv2  30 Edge  
>>> arch_timer

By the way, please use a recent kernel. Seeing edge here means you're
running with something that is a bit old (and broken). And since you
haven't even said what revision of the kernel you're using, helping you
is not an easy task. tglx told you something similar about a week ago.

>>>  12:  0  0  0  0 GICv2 156 Level 
>>> zynqmp-dma
>>>  13:  0  0  0  0 GICv2 157 Level 
>>> zynqmp-dma
>>>  14:  0  0  0  0 GICv2 158 Level 
>>> zynqmp-dma
>>>  15:  0  0  0  0 GICv2 159 Level 
>>> zynqmp-dma
>>>  16:  0  0  0  0 GICv2 160 Level 
>>> zynqmp-dma
>>>  17:  0  0  0  0 GICv2 161 Level 
>>> zynqmp-dma
>>>  18:  0  0  0  0 GICv2 162 Level 
>>> zynqmp-dma
>>>  19:  0  0  0  0 GICv2 163 Level 
>>> zynqmp-dma
>>>  20:  0  0  0  0 GICv2 164 Level 
>>> Mali_GP_MMU, Mali_GP,
>> Mali_PP0_MMU, Mali_PP0, Mali_PP1_MMU, Mali_PP1
>>
>> I'm not even going to consider looking at something that is running out of 
>> tree
>> code. So please start things with a fresh kernel that doesn't contain stuff 
>> we
>> can't debug.
>>
> Ok will test with fresh kernel.
> 
>>>  30:  0  0  0  0 GICv2  95 Level 
>>> eth0, eth0
>>> 206:314  0  0  0 GICv2  49 Level 
>>> cdns-i2c
>>> 207: 40  0  0  0 GICv2  50 Level 
>>> cdns-i2c
>>> 209:  0  0  0  0 GICv2 150 Level 
>>> nwl_pcie:misc
>>> 214: 12  0  0  0 GICv2  47 Level 
>>> ff0f.spi
>>> 215:  0  0  0  0 GICv2  58 Level 
>>> ffa6.rtc
>>> 216:  0  0  0  0 GICv2  59 Level 
>>> ffa6.rtc
>>> 217:  0  0  0  0 GICv2 165 Level 
>>> ahci-ceva[fd0c.ahci]
>>> 218: 61  0  0  0 GICv2  81 Level 
>>> mmc0
>>> 219:  0  0  0  0 GICv2 187 Level 
>>> arm-smmu global fault
>>> 220:471  0  0  0 GICv2  53 Level 
>>> xuartps
>>> 223:  0  0  0  0 GICv2 154 Level 
>>> fd4c.dma
>>> 224:  3  0  0  0 dummy   1 Edge  
>>> ath9k
>>
>> What is this "dummy" controller? And if that's supposed to be a legacy 
>> interrupt
>> from the PCI device, it has the wrong trigger.
> 
> Yes it is for legacy interrupt, wrong trigger means ? 

Aren't legacy interrupts supposed to be *level* triggered, and not edge?

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...


Re: [PATCH 1/1 linux-next] ASoC: samsung: include gpio consumer.h

2016-12-08 Thread Krzysztof Kozlowski
On Wed, Dec 07, 2016 at 09:31:02PM +0100, Fabian Frederick wrote:
> 
> 
> > On 07 December 2016 at 18:55 Krzysztof Kozlowski  wrote:
> >
> >
> > On Wed, Dec 07, 2016 at 08:39:02AM +0100, Fabian Frederick wrote:
> > > Fix the following build errors
> >
> > I couldn't reproduce it on default config. Can you mention the necessary
> > environment/defconfig/arch etc.?
> >
> > Patch itself looks needed. However shouldn't the driver depend also on
> > GPIOLIB?
> >
> > Best regards,
> > Krzysztof
> 
> Hi Krzysztof,
> 
>         It's based on a randconfig with compile_test without gpiolib.
> I reproduced it with make defconfig followed by selecting
> x86_32

So could you mention it? Something like:
"...build errors on x86_32 with !GPIOLIB."
would be sufficient IMHO.

> SPI
> SND_SOC_SAMSUNG
> MFD_ARIZONA_I2C
> MFD_ARIZONA
> SND_SOC
> COMPILE_TEST
> X86_INTEL_QUARK
> COMMON_CLK
> SND_SOC_SAMSUNG
> SND_SOC_SAMSUNG_TM2_WM5110
> 
> Here's another patch solving the same problem with complete explanation:
> 
> Commit 638f958baeaf
> ("extcon: Allow compile test of GPIO consumers if !GPIOLIB")
> 
> as you're suggesting, we could add depend on GPIOLIB.
> extcon has the following: depends on GPIOLIB || COMPILE_TEST
> 
> smartq_wm8987.c also has gpio_consumer.h for SND_SOC_SMARTQ
> which would give the following update on samsung/Kconfig besides initial 
> patch:

Makes sense. The GPIOLIB is indeed runtime dependency for the driver.

Feel free to send a follow up patch for GPIOLIB (reported-by credits
would be appreciated ;) ).

Best regards,
Krzysztof

> 
> diff --git a/sound/soc/samsung/Kconfig b/sound/soc/samsung/Kconfig
> index 7c42315..f1f1d79 100644
> --- a/sound/soc/samsung/Kconfig
> +++ b/sound/soc/samsung/Kconfig
> @@ -111,6 +111,7 @@ config SND_SOC_SAMSUNG_RX1950_UDA1380
>  config SND_SOC_SMARTQ
>         tristate "SoC I2S Audio support for SmartQ board"
>         depends on MACH_SMARTQ || COMPILE_TEST
> +       depends on GPIOLIB || COMPILE_TEST
>         depends on I2C
>         select SND_SAMSUNG_I2S
>         select SND_SOC_WM8750
> @@ -193,6 +194,7 @@ config SND_SOC_ARNDALE_RT5631_ALC5631
>  config SND_SOC_SAMSUNG_TM2_WM5110
>         tristate "SoC I2S Audio support for WM5110 on TM2 board"
>         depends on SND_SOC_SAMSUNG && MFD_ARIZONA && I2C && SPI_MASTER
> +       depends on GPIOLIB || COMPILE_TEST
>         select SND_SOC_MAX98504
>         select SND_SOC_WM5110
>         select SND_SAMSUNG_I2S
> 
> 
> Regards,
> Fabian
> >
> > > sound/soc/samsung/tm2_wm5110.c:220:3: error: implicit declaration
> > > of function 'gpiod_set_value_cansleep'
> > > [-Werror=implicit-function-declaration]
> > > sound/soc/samsung/tm2_wm5110.c:438:24: error: implicit declaration
> > > of function 'devm_gpiod_get' [-Werror=implicit-function-declaration]
> > >
> > > Signed-off-by: Fabian Frederick 
> > > ---
> > >  sound/soc/samsung/tm2_wm5110.c | 1 +
> > >  1 file changed, 1 insertion(+)
> > >
> > > diff --git a/sound/soc/samsung/tm2_wm5110.c 
> > > b/sound/soc/samsung/tm2_wm5110.c
> > > index 5cdf7d1..24cc9d6 100644
> > > --- a/sound/soc/samsung/tm2_wm5110.c
> > > +++ b/sound/soc/samsung/tm2_wm5110.c
> > > @@ -12,6 +12,7 @@
> > > 
> > >  #include 
> > >  #include 
> > > +#include 
> > >  #include 
> > >  #include 
> > >  #include 
> > > --
> > > 2.7.4
> > >


[PATCH] tracing: Replace kmap with copy_from_user() in trace_marker writing

2016-12-08 Thread Steven Rostedt
Instead of using get_user_pages_fast() and kmap_atomic() when writing
to the trace_marker file, just allocate enough space on the ring buffer
directly, and write into it via copy_from_user().

Writing into the trace_marker file use to allocate a temporary buffer
to perform the copy_from_user(), as we didn't want to write into the
ring buffer if the copy failed. But as a trace_marker write is suppose
to be extremely fast, and allocating memory causes other tracepoints to
trigger, Peter Zijlstra suggested using get_user_pages_fast() and
kmap_atomic() to keep the user space pages in memory and reading it
directly. But Henrik Austad had issues with this because that caused
other tracepoints to trigger as well.

Instead, just allocate the space in the ring buffer and use
copy_from_user() directly. If it faults, return -EFAULT and write
"" into the ring buffer.

Cc: Henrik Austad 
Cc: Peter Zijlstra 
Updates: d696b58ca2c3ca "tracing: Do not allocate buffer for trace_marker"
Suggested-by: Thomas Gleixner 
Signed-off-by: Steven Rostedt 
---
 trace.c |  135 +---
 1 file changed, 37 insertions(+), 98 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 60416bf7c591..e0f8d814cec6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5738,61 +5738,6 @@ tracing_free_buffer_release(struct inode *inode, struct 
file *filp)
return 0;
 }
 
-static inline int lock_user_pages(const char __user *ubuf, size_t cnt,
- struct page **pages, void **map_page,
- int *offset)
-{
-   unsigned long addr = (unsigned long)ubuf;
-   int nr_pages = 1;
-   int ret;
-   int i;
-
-   /*
-* Userspace is injecting traces into the kernel trace buffer.
-* We want to be as non intrusive as possible.
-* To do so, we do not want to allocate any special buffers
-* or take any locks, but instead write the userspace data
-* straight into the ring buffer.
-*
-* First we need to pin the userspace buffer into memory,
-* which, most likely it is, because it just referenced it.
-* But there's no guarantee that it is. By using get_user_pages_fast()
-* and kmap_atomic/kunmap_atomic() we can get access to the
-* pages directly. We then write the data directly into the
-* ring buffer.
-*/
-
-   /* check if we cross pages */
-   if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
-   nr_pages = 2;
-
-   *offset = addr & (PAGE_SIZE - 1);
-   addr &= PAGE_MASK;
-
-   ret = get_user_pages_fast(addr, nr_pages, 0, pages);
-   if (ret < nr_pages) {
-   while (--ret >= 0)
-   put_page(pages[ret]);
-   return -EFAULT;
-   }
-
-   for (i = 0; i < nr_pages; i++)
-   map_page[i] = kmap_atomic(pages[i]);
-
-   return nr_pages;
-}
-
-static inline void unlock_user_pages(struct page **pages,
-void **map_page, int nr_pages)
-{
-   int i;
-
-   for (i = nr_pages - 1; i >= 0; i--) {
-   kunmap_atomic(map_page[i]);
-   put_page(pages[i]);
-   }
-}
-
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -5803,13 +5748,15 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
struct print_entry *entry;
unsigned long irq_flags;
struct page *pages[2];
-   void *map_page[2];
-   int nr_pages = 1;
+   const char faulted[] = "";
ssize_t written;
int offset;
int size;
int len;
 
+/* Used in tracing_mark_raw_write() as well */
+#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */
+
if (tracing_disabled)
return -EINVAL;
 
@@ -5821,30 +5768,31 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
 
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
 
-   nr_pages = lock_user_pages(ubuf, cnt, pages, map_page, );
-   if (nr_pages < 0)
-   return nr_pages;
-
local_save_flags(irq_flags);
-   size = sizeof(*entry) + cnt + 2; /* possible \n added */
+   size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+
+   /* If less than "", then make sure we can still add that */
+   if (cnt < FAULTED_SIZE)
+   size += FAULTED_SIZE - cnt;
+
buffer = tr->trace_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
irq_flags, preempt_count());
-   if (!event) {
+   if (unlikely(!event))
/* Ring buffer disabled, return as if not open for write */
-   

Re: [RFC PATCH 00/19] ide: remove deprecated host drivers (part 1)

2016-12-08 Thread David Miller
From: Bartlomiej Zolnierkiewicz 
Date: Thu, 08 Dec 2016 18:14:33 +0100

> I asked you about this in private mail in August 2015, you told me to
> bring this on the list.  I did it (with these patches) in February
> 2016.  After two pings and months of waiting for a reply all I get is
> a quick NAK?

Yep, that's how much effort, consideration, and time a deprecated
subsystem deserves.


[PATCH v3 03/15] livepatch: temporary stubs for klp_patch_pending() and klp_update_patch_state()

2016-12-08 Thread Josh Poimboeuf
Create temporary stubs for klp_patch_pending() and
klp_update_patch_state() so we can add TIF_PATCH_PENDING to different
architectures in separate patches without breaking build bisectability.

Signed-off-by: Josh Poimboeuf 
---
 include/linux/livepatch.h | 7 ++-
 kernel/livepatch/core.c   | 3 +++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 9072f04..60558d8 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -123,10 +123,15 @@ void arch_klp_init_object_loaded(struct klp_patch *patch,
 int klp_module_coming(struct module *mod);
 void klp_module_going(struct module *mod);
 
+static inline bool klp_patch_pending(struct task_struct *task) { return false; 
}
+void klp_update_patch_state(struct task_struct *task);
+
 #else /* !CONFIG_LIVEPATCH */
 
 static inline int klp_module_coming(struct module *mod) { return 0; }
-static inline void klp_module_going(struct module *mod) { }
+static inline void klp_module_going(struct module *mod) {}
+static inline bool klp_patch_pending(struct task_struct *task) { return false; 
}
+static inline void klp_update_patch_state(struct task_struct *task) {}
 
 #endif /* CONFIG_LIVEPATCH */
 
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index af46438..217b39d 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -64,6 +64,9 @@ static LIST_HEAD(klp_ops);
 
 static struct kobject *klp_root_kobj;
 
+/* TODO: temporary stub */
+void klp_update_patch_state(struct task_struct *task) {}
+
 static struct klp_ops *klp_find_ops(unsigned long old_addr)
 {
struct klp_ops *ops;
-- 
2.7.4



[PATCH v3 06/15] livepatch/s390: reorganize TIF thread flag bits

2016-12-08 Thread Josh Poimboeuf
From: Jiri Slaby 

Group the TIF thread flag bits by their inclusion in the _TIF_WORK and
_TIF_TRACE macros.

Signed-off-by: Jiri Slaby 
Signed-off-by: Josh Poimboeuf 
---
 arch/s390/include/asm/thread_info.h | 22 ++
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/s390/include/asm/thread_info.h 
b/arch/s390/include/asm/thread_info.h
index a5b54a4..4977668 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -51,14 +51,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct 
task_struct *src);
 /*
  * thread information flags bit numbers
  */
+/* _TIF_WORK bits */
 #define TIF_NOTIFY_RESUME  0   /* callback before returning to user */
 #define TIF_SIGPENDING 1   /* signal pending */
 #define TIF_NEED_RESCHED   2   /* rescheduling necessary */
-#define TIF_SYSCALL_TRACE  3   /* syscall trace active */
-#define TIF_SYSCALL_AUDIT  4   /* syscall auditing active */
-#define TIF_SECCOMP5   /* secure computing */
-#define TIF_SYSCALL_TRACEPOINT 6   /* syscall tracepoint instrumentation */
-#define TIF_UPROBE 7   /* breakpointed or single-stepping */
+#define TIF_UPROBE 3   /* breakpointed or single-stepping */
+
 #define TIF_31BIT  16  /* 32bit process */
 #define TIF_MEMDIE 17  /* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK18  /* restore signal mask in do_signal() */
@@ -66,15 +64,23 @@ int arch_dup_task_struct(struct task_struct *dst, struct 
task_struct *src);
 #define TIF_BLOCK_STEP 20  /* This task is block stepped */
 #define TIF_UPROBE_SINGLESTEP  21  /* This task is uprobe single stepped */
 
+/* _TIF_TRACE bits */
+#define TIF_SYSCALL_TRACE  24  /* syscall trace active */
+#define TIF_SYSCALL_AUDIT  25  /* syscall auditing active */
+#define TIF_SECCOMP26  /* secure computing */
+#define TIF_SYSCALL_TRACEPOINT 27  /* syscall tracepoint instrumentation */
+
 #define _TIF_NOTIFY_RESUME _BITUL(TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING_BITUL(TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED  _BITUL(TIF_NEED_RESCHED)
+#define _TIF_UPROBE_BITUL(TIF_UPROBE)
+
+#define _TIF_31BIT _BITUL(TIF_31BIT)
+#define _TIF_SINGLE_STEP   _BITUL(TIF_SINGLE_STEP)
+
 #define _TIF_SYSCALL_TRACE _BITUL(TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT _BITUL(TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP   _BITUL(TIF_SECCOMP)
 #define _TIF_SYSCALL_TRACEPOINT_BITUL(TIF_SYSCALL_TRACEPOINT)
-#define _TIF_UPROBE_BITUL(TIF_UPROBE)
-#define _TIF_31BIT _BITUL(TIF_31BIT)
-#define _TIF_SINGLE_STEP   _BITUL(TIF_SINGLE_STEP)
 
 #endif /* _ASM_THREAD_INFO_H */
-- 
2.7.4



Re: [GIT PULL 1/3] ARM: exynos: Soc/mach for v4.10

2016-12-08 Thread Krzysztof Kozlowski
On Thu, Dec 08, 2016 at 10:25:35AM -0800, Olof Johansson wrote:
> On Thu, Dec 8, 2016 at 7:28 AM, Pankaj Dubey  wrote:
> > On 3 December 2016 at 22:33, Krzysztof Kozlowski  wrote:
> >> On Fri, Dec 02, 2016 at 10:52:57PM +0100, Arnd Bergmann wrote:
> >>>
> >>> Sorry, I initially deferred it and then didn't get back to it.
> >>>
> >>> The dependency on the .dts changes made me a bit nervous about
> >>> taking it, mostly because the changelog fails to explain the
> >>> exact dependencies.
> >>>
> >>> This breaks compatibility with existing .dtb files, right?
> >>
> >> No, strictly speaking not. There was no dt-bindings change here, no DT
> >> properties for SCU before. We are converting our drivers to DTB so this
> >> is the same as before when switching for pinctrl, clocks or all other
> >> drivers to DT.
> >>
> >> We are not breaking DTB ABI because there was no ABI around it before.
> >> Otherwise, one would say that lack of SCU DT node was an ABI. That is
> >> wrong, because DT should describe the hardware and SCU is in hardware.
> >>
> >>> What I'd like to see here is an explanation about:
> >>>
> >>> - what the upside of breaking compatibility is
> >>
> >> DTBs which do not have SCU are not proper because they skip that part of
> >> hardware. However we are breaking them in the way the SMP won't work
> >> there. It is not an ABI break, as I mentioned above.
> >>
> >>> - what exactly stops working with an old dtb
> >>> - why we don't keep a fallback for handling old dtb files
> >>
> >> What is the point for it? This is not an ABI break. Even if it was,
> >> Samsung guys don't care for ABI breaks at all (and in fact we wanted to
> >> mark the platform experimental...).
> >>
> >>> It would also be helpful to have a separate pull request for
> >>> the commits require the new dtb, and the stuff that is unrelated.
> >>
> >> I can do that but the pull will be small.
> >>
> >
> > Arnd,
> >
> > Any update on this? Intention of this change is to improve
> > Exynos SoC's DT support in mainline kernel. This will help in removing 
> > static
> > mapping from exynos machine files and simplify mach-exynos code-base.
> 
> Adding the SCU nodes now makes sense. So does using them if they're available.
> 
> Given the prevalence of exynos systems with DTS already out there, it
> would make sense to give an overlap of making the kernel work without
> the SCU in DT for a period of time.
> 
> This isn't like the old days of when we were mass-converting things
> and breakage was expected. We're well into a steady state here, so
> being nicer to downstream users is likely the right thing to do here.

I think that either we treat this as an ABI break, or just "being
nice to downstream".

In the first case, not breaking things would be a valid reason. But I
believe this is not an ABI break. ABI is about an interface, not about
being nice. The SCU should be defined by downstream users because this
is the description of the hardware. At the same time, the kernel did not
document as an interface something like "there should be no SCU node
defined" so creating a requirement of SCU is not an breakage of existing
interface.

The second case then, being nice to downstream. Do we want to be nice or
do we want to push them to mainline? Okay, it is never black or white...
yet we should rather encourage downstream to mainline and creating
compatibility periods is rather the opposite of that.

Best regards,
Krzysztof


Re: [PATCH 0/4] g_NCR5380: Bug fix and some enhancements

2016-12-08 Thread Ondrej Zary
On Monday 05 December 2016 07:07:19 Finn Thain wrote:
> This patch series is based on the one submitted recently by Ondrej Zary.
>
> This version has a different irq probing fix for HP C2502 boards and
> a more comprehensive patch to change the default irq parameter.
>
> It needs testing on actual ISA hardware.

Tested on HP C2502 (53C400A chip), Canon FG2-5202 (53C400 chip), DTC-3181L 
(DTCT-436P chip) and MS-PNR (53C400A chip) ISA cards - everything works fine!

Thanks.

Tested-by: Ondrej Zary 

BTW. The release-region fix (my previous patch 6/6) has disappeared somehow. 
Should I resubmit?

-- 
Ondrej Zary


Re: [RFC, PATCHv1 17/28] x86/mm: define virtual memory map for 5-level paging

2016-12-08 Thread Randy Dunlap
On 12/08/16 08:21, Kirill A. Shutemov wrote:
> The first part of memory map (up to %esp fixup) simply scales existing
> map for 4-level paging by factor of 9 -- number of bits addressed by
> additional page table level.
> 
> The rest of the map is uncahnged.

 unchanged.

(more fixes below)


> Signed-off-by: Kirill A. Shutemov 
> ---
>  Documentation/x86/x86_64/mm.txt | 23 ++-
>  arch/x86/Kconfig|  1 +
>  arch/x86/include/asm/kasan.h|  9 ++---
>  arch/x86/include/asm/page_64_types.h| 10 ++
>  arch/x86/include/asm/pgtable_64_types.h |  6 ++
>  arch/x86/include/asm/sparsemem.h|  9 +++--
>  6 files changed, 52 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
> index 8c7dd5957ae1..d33fb0799b3d 100644
> --- a/Documentation/x86/x86_64/mm.txt
> +++ b/Documentation/x86/x86_64/mm.txt
> @@ -12,7 +12,7 @@ c900 - e8ff (=45 bits) 
> vmalloc/ioremap space
>  e900 - e9ff (=40 bits) hole
>  ea00 - eaff (=40 bits) virtual memory map (1TB)
>  ... unused hole ...
> -ec00 - fc00 (=44 bits) kasan shadow memory (16TB)
> +ec00 - fbff (=44 bits) kasan shadow memory (16TB)
>  ... unused hole ...
>  ff00 - ff7f (=39 bits) %esp fixup stacks
>  ... unused hole ...
> @@ -23,6 +23,27 @@ a000 - ff5f (=1526 MB) module 
> mapping space
>  ff60 - ffdf (=8 MB) vsyscalls
>  ffe0 -  (=2 MB) unused hole
>  
> +Virtual memory map with 5 level page tables:
> +
> + - 00ff (=56 bits) user space, different per mm
> +hole caused by [57:63] sign extension

Can you briefly explain the sign extension?
Should that be [56:63]?

> +ff00 - ff0f (=52 bits) guard hole, reserved for 
> hypervisor
> +ff10 - ff8f (=55 bits) direct mapping of all phys. 
> memory
> +ff90 - ff91 (=49 bits) hole
> +ff92 - ffd1 (=54 bits) vmalloc/ioremap space
> +ffd2 - ff93 (=49 bits) virtual memory map (512TB)
> +... unused hole ...
> +ff96 - ffb5 (=53 bits) kasan shadow memory (8PB)
> +... unused hole ...
> +fffe - fffe (=49 bits) %esp fixup stacks
> +... unused hole ...
> +ffef -  (=64 GB) EFI region mapping space

- fffe

> +... unused hole ...
> +8000 - a000 (=512 MB)  kernel text mapping, from 
> phys 0

- 9fff

> +a000 - ff5f (=1526 MB) module mapping space
> +ff60 - ffdf (=8 MB) vsyscalls
> +ffe0 -  (=2 MB) unused hole
> +
>  The direct mapping covers all memory in the system up to the highest
>  memory address (this means in some cases it can also include PCI memory
>  holes).

> diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
> index 1410b567ecde..2587c6bd89be 100644
> --- a/arch/x86/include/asm/kasan.h
> +++ b/arch/x86/include/asm/kasan.h
> @@ -11,9 +11,12 @@
>   * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
>   */
>  #define KASAN_SHADOW_START  (KASAN_SHADOW_OFFSET + \
> - (0x8000ULL >> 3))
> -/* 47 bits for kernel address -> (47 - 3) bits for shadow */
> -#define KASAN_SHADOW_END(KASAN_SHADOW_START + (1ULL << (47 - 3)))
> + ((-1UL << __VIRTUAL_MASK_SHIFT) >> 3))
> +/*
> + * 47 bits for kernel address -> (47 - 3) bits for shadow
> + * 56 bits for kernel address -> (56 - 3) bits fro shadow

typo: s/fro/for/

> + */
> +#define KASAN_SHADOW_END(KASAN_SHADOW_START + (1ULL << 
> (__VIRTUAL_MASK_SHIFT - 3)))
>  
>  #ifndef __ASSEMBLY__
>  


-- 
~Randy


Re: [RFC, PATCHv1 00/28] 5-level paging

2016-12-08 Thread Kirill A. Shutemov
On Thu, Dec 08, 2016 at 10:16:07AM -0800, Linus Torvalds wrote:
> On Thu, Dec 8, 2016 at 8:21 AM, Kirill A. Shutemov
>  wrote:
> >
> > This patchset is still very early. There are a number of things missing
> > that we have to do before asking anyone to merge it (listed below).
> > It would be great if folks can start testing applications now (in QEMU) to
> > look for breakage.
> > Any early comments on the design or the patches would be appreciated as
> > well.
> 
> Looks ok to me. Starting off with a compile-time config option seems fine.
> 
> I do think that the x86 cpuid part should (patch 15) should be the
> first patch, so that we see "la57" as a capability in /proc/cpuinfo
> whether it's being enabled or not? We should merge that part
> regardless of any mm patches, I think.

Okay, I'll split the CPUID part into a separate patch and move it to the
beginning of the patchset.

REQUIRED_MASK portion will stay where it is.

-- 
 Kirill A. Shutemov


Re: [RFC, PATCHv1 24/28] x86/mm: add sync_global_pgds() for configuration with 5-level paging

2016-12-08 Thread Kirill A. Shutemov
On Thu, Dec 08, 2016 at 10:42:19AM -0800, Andy Lutomirski wrote:
> On Thu, Dec 8, 2016 at 8:21 AM, Kirill A. Shutemov
>  wrote:
> > This basically restores slightly modified version of original
> > sync_global_pgds() which we had before foldedl p4d was introduced.
> >
> > The only modification is protection against 'address' overflow.
> >
> > Signed-off-by: Kirill A. Shutemov 
> > ---
> >  arch/x86/mm/init_64.c | 47 +++
> >  1 file changed, 47 insertions(+)
> >
> > diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> > index a991f5c4c2c4..d637893ac8c2 100644
> > --- a/arch/x86/mm/init_64.c
> > +++ b/arch/x86/mm/init_64.c
> > @@ -92,6 +92,52 @@ __setup("noexec32=", nonx32_setup);
> >   * When memory was added/removed make sure all the processes MM have
> >   * suitable PGD entries in the local PGD level page.
> >   */
> > +#ifdef CONFIG_X86_5LEVEL
> > +void sync_global_pgds(unsigned long start, unsigned long end, int removed)
> > +{
> > +unsigned long address;
> > +
> > +   for (address = start; address <= end && address >= start;
> > +   address += PGDIR_SIZE) {
> > +const pgd_t *pgd_ref = pgd_offset_k(address);
> > +struct page *page;
> > +
> > +/*
> > + * When it is called after memory hot remove, pgd_none()
> > + * returns true. In this case (removed == 1), we must clear
> > + * the PGD entries in the local PGD level page.
> > + */
> > +if (pgd_none(*pgd_ref) && !removed)
> > +continue;
> 
> This isn't quite specific to your patch, but can we assert that, if
> removed=1, then we're not operating on the vmalloc range?  Because if
> we do, this will be racy in nasty ways.

Looks like there's no users of removed=1. The last user is gone with
af2cf278ef4f ("x86/mm/hotplug: Don't remove PGD entries in
remove_pagetable()")

I'll just drop it (with separate patch).

-- 
 Kirill A. Shutemov


Re: [PATCH 1/1] arm64: mm: add config options for page table configuration

2016-12-08 Thread Scott Branden

On 16-12-08 10:57 AM, Catalin Marinas wrote:

On Thu, Dec 08, 2016 at 08:30:36AM -0800, Scott Branden wrote:

On 16-12-08 02:00 AM, Catalin Marinas wrote:

On Wed, Dec 07, 2016 at 11:40:00AM -0800, Scott Branden wrote:

Make MAX_PHYSMEM_BITS and SECTIONS_SIZE_BITS configurable by adding
config options.
Default to current settings currently defined in sparesmem.h.
For systems wishing to save memory the config options can be overridden.
Example, changing MAX_PHYSMEM_BITS from 48 to 36 at the same time as
changing SECTION_SIZE_BITS from 30 to 26 frees 13MB of memory.

[...]

I would rather reduce SECTION_SIZE_BITS permanently where
feasible, like in this patch:

http://lkml.kernel.org/r/1465821119-3384-1-git-send-email-jszh...@marvell.com


This patch does not meet my requirements as I need SECTION_SIZE_BITS to be
set to 28 to reduce memory


So with this patch, we reduce it to 27, it should be fine-grained enough
for 128MB sections. Alternatively, there were other suggestions here:

http://lkml.iu.edu/hypermail/linux/kernel/1604.1/03036.html


and to allow memory hotplug to allocate a 256 MB section.


Can memory hotplug not work with 2*128MB sections in this case?
Yes, I then need to hotplug the memory at 2 locations for 1 memory 
addition but that will work in my current use case.  I'm one step away 
from hotplug working on ARM64.  Once that works I hope to break the 
dependencies between hotplug memory size created based on 
SECTION_SIZE_BITS in the future.


Since I currently have your attention:  I do think there is fundamental 
bug in the ARM64 mm implementation.  If you look at 
/sys/devices/system/memory it only shows the last memoryX section 
created after init.  It should be showing up multiple sections.  As a 
quick test change SECTION_SIZE_BITS and you look at 
/sys/devices/system/memory to see what changes.  Look at a standard x64 
machine you will see all the memoryX entries present.





My patch future proofs the tuning of the parameters by allowing
any section size to be made.


While MAX_PHYSMEM_BITS makes sense to users in general,
SECTION_SIZE_BITS is not always clear to the average user what it means
and its min/max boundaries. That's another reason (apart from single/few
Image case) why I prefer to not expose it as configuration option.
I agree SECTION_SIZE_BITS is confusing.  If you could provide more 
documentation on what it means and how it is used that would help others 
for sure.  I just stumbled upon it while working on a tight memory 
system and found it saves me significant memory.



I could combine the patch you list such that
SECTION_SIZE_BITS defaults to 30 when CONFIG_ARM64_64_PAGES is selected and
27 otherwise.  Should it default to something else for 16K and 4K pages?


I haven't done any calculations for 16K yet but we could probably come
up with some formula based on PAGE_SHIFT to cover all cases.
Yes, a calculation based on pages that modified SECTION_SIZE_BITS would 
probably be a better solution.



In terms of MAX_PHYSMEM_BITS, if our SoCs only use 40 (or less) bits I would
also like the configuration functionality.  This allows us to make the
SECTION_SIZE_BITS smaller.


So how small do you want SECTION_SIZE_BITS to be? As I said above, 128MB
sections should be sufficient in most cases and without the need to
reduce MAX_PHYSMEM_BITS.

256MB works for my current use case.  But it appears somebody else was 
looking for 64MB previously.  So that is why adding support for 
modifying MAX_PHYSMEM_BITS makes sense as it needs to be modified to 
support the 64MB case:

https://lkml.org/lkml/2016/8/11/209



[RFC 10/10] kmod: add a sanity check on module loading

2016-12-08 Thread Luis R. Rodriguez
kmod has an optimization in place whereby if some kernel code
uses request_module() on a module already loaded we never bother
userspace as the module already is loaded. This is not true for
get_fs_type() though as it uses aliases.

Additionally kmod <= v19 was broken -- it returns 0 to modprobe calls,
assuming the kernel module is built-in, where really we have a race as
the module starts forming. kmod <= v19 has incorrect userspace heuristics,
a userspace kmod fix is available for it:

http://git.kernel.org/cgit/utils/kernel/kmod/kmod.git/commit/libkmod/libkmod-module.c?id=fd44a98ae2eb5eb32161088954ab21e58e19dfc4

This changes kmod to address both:

 o Provides the alias optimization for get_fs_type() so modules already
   loaded do not get re-requested.

 o Provides a sanity test to verify modprobe's work

This is important given how any get_fs_type() users assert success
means we're ready to go, and tests with the new test_kmod stress driver
reveal that request_module() and get_fs_type() might fail for a few
other reasons. You don't need old kmod to fail on request_module() or
get_fs_type(), with the right system setup, these calls *can* fail
today.

Although this does get us in the business of keeping alias maps in
kernel, the work to support and maintain this is trivial.
Additionally, since it may be important that get_fs_type() not fail on
certain systems, this tightens things up a bit more.

The TL;DR:

kmod <= v19 will return 0 on modprobe calls if you are built-in,
however its heuristics for checking if you are built-in were broken.

It assumed that having the directory /sys/module/module-name
but not having the file /sys/module/module-name/initstate
is sufficient to assume a module is built-in.

The kernel loads the initstate attribute *after* it creates the
directory. This is an issue when modprobe returns 0 for kernel calls
which assumes a return of 0 on request_module() can give you the
right to assert the module is loaded and live.

We cannot trust returns of modprobe as 0 in the kernel, we need to
verify that modules are live if modprobe return 0 but only if modules
*are* modules. The kernel heuristic we use to determine if a module is
built-in is that if modprobe returns 0 we know we must be built-in or
a module, but if we are a module clearly we must have a lingering kmod
dangling on our linked list. If there are no modules there we are *somewhat*
certain the module must be built in.

This is not enough though... we cannot easily work around this since the
kernel can use aliases to userspace for modules calls. For instance
fs/namespace.c uses fs-modulename for filesystesms on get_fs_type(), so
these need to be taken into consideration as well.

Using kmod <= 19 will give you a NULL get_fs_type() return even though
the module was loaded... That is a corner case, there are other failures
for request_module() though -- the other failures are not easy to
reproduce though but fortunately we have a stress test driver to help
with that now. Use the following tests:

 # tools/testing/selftests/kmod/kmod.sh -t 0008
 # tools/testing/selftests/kmod/kmod.sh -t 0009

You can more easily see this error if you have kmod <= v19 installed.

You will need to install kmod <= v19, be sure to install its modprobe
into /sbin/ as by default the 'make install' target does not replace
your own.

This test helps cure test_kmod cases 0008 0009 so enable them.

Reported-by: Martin Wilck 
Reported-by: Randy Wright 
Signed-off-by: Luis R. Rodriguez 
---
 kernel/kmod.c| 73 
 kernel/module.c  | 11 --
 tools/testing/selftests/kmod/kmod.sh |  9 ++---
 3 files changed, 85 insertions(+), 8 deletions(-)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index a0f449f77ed7..6bf0feab41d1 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -61,6 +61,11 @@ static DECLARE_RWSEM(umhelper_sem);
 
 #ifdef CONFIG_MODULES
 
+bool finished_loading(const char *name);
+int module_wait_until_finished(const char *name);
+struct module *find_module_all(const char *name, size_t len,
+  bool even_unformed);
+
 /*
modprobe_path is set via /proc/sys.
 */
@@ -158,6 +163,72 @@ int get_kmod_umh_count(void)
return atomic_read(_concurrent);
 }
 
+static bool kmod_exists(char *name)
+{
+   struct module *mod;
+
+   mutex_lock(_mutex);
+   mod = find_module_all(name, strlen(name), true);
+   mutex_unlock(_mutex);
+
+   if (mod)
+   return true;
+
+   return false;
+}
+
+/*
+ * The assumption is this must be a module, it could still not be live though
+ * since kmod <= 19 returns 0 even if it was not ready yet.  Allow for force
+ * wait check in case you are stuck on old userspace.
+ */
+static int wait_for_kmod(char *name)
+{
+   int ret = 0;
+
+   if (!finished_loading(name))
+   ret = 

Re: [RFC, PATCHv1 15/28] x86: detect 5-level paging support

2016-12-08 Thread Linus Torvalds
On Thu, Dec 8, 2016 at 12:05 PM, Borislav Petkov  wrote:
>
> The cpuid() in cpuflags.c doesn't zero ecx which, if we have to be
> pedantic, it should do. It calls CPUID now with the ptr value of its 4th
> on 64-bit and 3rd arg on 32-bit, respectively, IINM.

In fact, just do a single cpuid_count(), and then implement the
traditional cpuid() as just

   #define cpuid(x, a,b,c,d) cpuid_count(x, 0, a, b, c, d)

or something.

Especially since that's some of the ugliest inline asm ever due to the
nasty BX handling.

  Linus


[PATCH 1/7] blk-mq: add blk_mq_start_stopped_hw_queue()

2016-12-08 Thread Jens Axboe
We have a variant for all hardware queues, but not one for a single
hardware queue.

Signed-off-by: Jens Axboe 
---
 block/blk-mq.c | 18 +++---
 include/linux/blk-mq.h |  1 +
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 90db5b490df9..b216746be9d3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1064,18 +1064,22 @@ void blk_mq_start_hw_queues(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queues);
 
+void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+{
+   if (!blk_mq_hctx_stopped(hctx))
+   return;
+
+   clear_bit(BLK_MQ_S_STOPPED, >state);
+   blk_mq_run_hw_queue(hctx, async);
+}
+
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 {
struct blk_mq_hw_ctx *hctx;
int i;
 
-   queue_for_each_hw_ctx(q, hctx, i) {
-   if (!blk_mq_hctx_stopped(hctx))
-   continue;
-
-   clear_bit(BLK_MQ_S_STOPPED, >state);
-   blk_mq_run_hw_queue(hctx, async);
-   }
+   queue_for_each_hw_ctx(q, hctx, i)
+   blk_mq_start_stopped_hw_queue(hctx, async);
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 35a0af5ede6d..87e404aae267 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -231,6 +231,7 @@ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
 void blk_mq_start_hw_queues(struct request_queue *q);
+void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
-- 
2.7.4



[PATCH 5/7] blk-mq-sched: add framework for MQ capable IO schedulers

2016-12-08 Thread Jens Axboe
Signed-off-by: Jens Axboe 
---
 block/Makefile   |   2 +-
 block/blk-core.c |   9 +-
 block/blk-exec.c |   3 +-
 block/blk-flush.c|   7 +-
 block/blk-merge.c|   3 +
 block/blk-mq-sched.c | 246 +++
 block/blk-mq-sched.h | 187 +++
 block/blk-mq-tag.c   |   1 +
 block/blk-mq.c   | 150 +++--
 block/blk-mq.h   |  34 +++
 block/elevator.c | 181 ++
 include/linux/blk-mq.h   |   2 +-
 include/linux/elevator.h |  29 +-
 13 files changed, 713 insertions(+), 141 deletions(-)
 create mode 100644 block/blk-mq-sched.c
 create mode 100644 block/blk-mq-sched.h

diff --git a/block/Makefile b/block/Makefile
index a827f988c4e6..2eee9e1bb6db 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o 
blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
-   blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
+   blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 4b7ec5958055..3f83414d6986 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-mq-sched.h"
 #include "blk-wbt.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -1428,7 +1429,7 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
return;
 
if (q->mq_ops) {
-   blk_mq_free_request(req);
+   blk_mq_sched_put_request(req);
return;
}
 
@@ -1464,7 +1465,7 @@ void blk_put_request(struct request *req)
struct request_queue *q = req->q;
 
if (q->mq_ops)
-   blk_mq_free_request(req);
+   blk_mq_sched_put_request(req);
else {
unsigned long flags;
 
@@ -1528,6 +1529,7 @@ bool bio_attempt_back_merge(struct request_queue *q, 
struct request *req,
blk_account_io_start(req, false);
return true;
 }
+EXPORT_SYMBOL_GPL(bio_attempt_back_merge);
 
 bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 struct bio *bio)
@@ -1552,6 +1554,7 @@ bool bio_attempt_front_merge(struct request_queue *q, 
struct request *req,
blk_account_io_start(req, false);
return true;
 }
+EXPORT_SYMBOL_GPL(bio_attempt_front_merge);
 
 /**
  * blk_attempt_plug_merge - try to merge with %current's plugged list
@@ -2173,7 +2176,7 @@ int blk_insert_cloned_request(struct request_queue *q, 
struct request *rq)
if (q->mq_ops) {
if (blk_queue_io_stat(q))
blk_account_io_start(rq, true);
-   blk_mq_insert_request(rq, false, true, false);
+   blk_mq_sched_insert_request(rq, false, true, false);
return 0;
}
 
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 3ecb00a6cf45..86656fdfa637 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -9,6 +9,7 @@
 #include 
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 /*
  * for max sense size
@@ -65,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct 
gendisk *bd_disk,
 * be reused after dying flag is set
 */
if (q->mq_ops) {
-   blk_mq_insert_request(rq, at_head, true, false);
+   blk_mq_sched_insert_request(rq, at_head, true, false);
return;
}
 
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 27a42dab5a36..63b91697d167 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -74,6 +74,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
 
 /* FLUSH/FUA sequences */
 enum {
@@ -425,9 +426,9 @@ void blk_insert_flush(struct request *rq)
 */
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-   if (q->mq_ops) {
-   blk_mq_insert_request(rq, false, true, false);
-   } else
+   if (q->mq_ops)
+   blk_mq_sched_insert_request(rq, false, true, false);
+   else
list_add_tail(>queuelist, >queue_head);
return;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1002afdfee99..01247812e13f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -766,6 +766,7 @@ int attempt_back_merge(struct request_queue *q, struct 
request *rq)
 
return 0;
 }

[PATCH 3/7] elevator: make the rqhash helpers exported

2016-12-08 Thread Jens Axboe
Signed-off-by: Jens Axboe 
---
 block/elevator.c | 8 
 include/linux/elevator.h | 5 +
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index a18a5db274e4..40f0c04e5ad3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -248,13 +248,13 @@ static inline void __elv_rqhash_del(struct request *rq)
rq->rq_flags &= ~RQF_HASHED;
 }
 
-static void elv_rqhash_del(struct request_queue *q, struct request *rq)
+void elv_rqhash_del(struct request_queue *q, struct request *rq)
 {
if (ELV_ON_HASH(rq))
__elv_rqhash_del(rq);
 }
 
-static void elv_rqhash_add(struct request_queue *q, struct request *rq)
+void elv_rqhash_add(struct request_queue *q, struct request *rq)
 {
struct elevator_queue *e = q->elevator;
 
@@ -263,13 +263,13 @@ static void elv_rqhash_add(struct request_queue *q, 
struct request *rq)
rq->rq_flags |= RQF_HASHED;
 }
 
-static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
+void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 {
__elv_rqhash_del(rq);
elv_rqhash_add(q, rq);
 }
 
-static struct request *elv_rqhash_find(struct request_queue *q, sector_t 
offset)
+struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
 {
struct elevator_queue *e = q->elevator;
struct hlist_node *next;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index f219c9aed360..b276e9ef0e0b 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -108,6 +108,11 @@ struct elevator_type
 
 #define ELV_HASH_BITS 6
 
+void elv_rqhash_del(struct request_queue *q, struct request *rq);
+void elv_rqhash_add(struct request_queue *q, struct request *rq);
+void elv_rqhash_reposition(struct request_queue *q, struct request *rq);
+struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
+
 /*
  * each queue has an elevator_queue associated with it
  */
-- 
2.7.4



<    1   2   3   4   5   6   7   8   9   10   >