[PATCH v4 5/5] clk: aspeed: Add reset controller

2017-10-02 Thread Joel Stanley
There are some resets that are not associated with gates. These are
represented by a reset controller.
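
As a rough illustration of how a peripheral driver would consume one of
these resets through the standard reset API (a hedged sketch, not part
of this patch; the consumer device and the delay are assumptions):

  #include <linux/delay.h>
  #include <linux/device.h>
  #include <linux/err.h>
  #include <linux/reset.h>

  /* Sketch: cycle the reset line named by an (assumed) consumer's
   * "resets" DT property, e.g. <&syscon ASPEED_RESET_ADC>. */
  static int example_cycle_reset(struct device *dev)
  {
          struct reset_control *rst;

          rst = devm_reset_control_get_exclusive(dev, NULL);
          if (IS_ERR(rst))
                  return PTR_ERR(rst);

          reset_control_assert(rst);
          udelay(100);            /* illustrative settle time */
          return reset_control_deassert(rst);
  }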

Signed-off-by: Joel Stanley 

---
v3:
  - Add named initialisers for the reset defines
  - Add define for ADC
---
 drivers/clk/clk-aspeed.c | 82 +++-
 include/dt-bindings/clock/aspeed-clock.h | 10 
 2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/drivers/clk/clk-aspeed.c b/drivers/clk/clk-aspeed.c
index a424b056e767..de491dc7f955 100644
--- a/drivers/clk/clk-aspeed.c
+++ b/drivers/clk/clk-aspeed.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include <linux/reset-controller.h>
 #include 
 #include 
 
@@ -292,6 +293,68 @@ static const struct clk_ops aspeed_clk_gate_ops = {
.is_enabled = aspeed_clk_is_enabled,
 };
 
+/**
+ * struct aspeed_reset - Aspeed reset controller
+ * @map: regmap to access the containing system controller
+ * @rcdev: reset controller device
+ */
+struct aspeed_reset {
+   struct regmap   *map;
+   struct reset_controller_dev rcdev;
+};
+
+#define to_aspeed_reset(p) container_of((p), struct aspeed_reset, rcdev)
+
+static const u8 aspeed_resets[] = {
+   [ASPEED_RESET_XDMA] = 25,
+   [ASPEED_RESET_MCTP] = 24,
+   [ASPEED_RESET_ADC]  = 23,
+   [ASPEED_RESET_JTAG_MASTER] = 22,
+   [ASPEED_RESET_MIC]  = 18,
+   [ASPEED_RESET_PWM]  =  9,
+   [ASPEED_RESET_PCIVGA]   =  8,
+   [ASPEED_RESET_I2C]  =  2,
+   [ASPEED_RESET_AHB]  =  1,
+};
+
+static int aspeed_reset_deassert(struct reset_controller_dev *rcdev,
+unsigned long id)
+{
+   struct aspeed_reset *ar = to_aspeed_reset(rcdev);
+   u32 rst = BIT(aspeed_resets[id]);
+
+   return regmap_update_bits(ar->map, ASPEED_RESET_CTRL, rst, 0);
+}
+
+static int aspeed_reset_assert(struct reset_controller_dev *rcdev,
+  unsigned long id)
+{
+   struct aspeed_reset *ar = to_aspeed_reset(rcdev);
+   u32 rst = BIT(aspeed_resets[id]);
+
+   return regmap_update_bits(ar->map, ASPEED_RESET_CTRL, rst, rst);
+}
+
+static int aspeed_reset_status(struct reset_controller_dev *rcdev,
+  unsigned long id)
+{
+   struct aspeed_reset *ar = to_aspeed_reset(rcdev);
+   u32 val, rst = BIT(aspeed_resets[id]);
+   int ret;
+
+   ret = regmap_read(ar->map, ASPEED_RESET_CTRL, &val);
+   if (ret)
+   return ret;
+
+   return !!(val & rst);
+}
+
+static const struct reset_control_ops aspeed_reset_ops = {
+   .assert = aspeed_reset_assert,
+   .deassert = aspeed_reset_deassert,
+   .status = aspeed_reset_status,
+};
+
 static struct clk_hw *aspeed_clk_hw_register_gate(struct device *dev,
const char *name, const char *parent_name, unsigned long flags,
struct regmap *map, u8 clock_idx, u8 reset_idx,
@@ -333,10 +396,11 @@ static int aspeed_clk_probe(struct platform_device *pdev)
 {
const struct aspeed_clk_soc_data *soc_data;
struct device *dev = &pdev->dev;
+   struct aspeed_reset *ar;
struct regmap *map;
struct clk_hw *hw;
u32 val, rate;
-   int i;
+   int i, ret;
 
map = syscon_node_to_regmap(dev->of_node);
if (IS_ERR(map)) {
@@ -344,6 +408,22 @@ static int aspeed_clk_probe(struct platform_device *pdev)
return PTR_ERR(map);
}
 
+   ar = devm_kzalloc(dev, sizeof(*ar), GFP_KERNEL);
+   if (!ar)
+   return -ENOMEM;
+
+   ar->map = map;
+   ar->rcdev.owner = THIS_MODULE;
+   ar->rcdev.nr_resets = ARRAY_SIZE(aspeed_resets);
+   ar->rcdev.ops = &aspeed_reset_ops;
+   ar->rcdev.of_node = dev->of_node;
+
+   ret = devm_reset_controller_register(dev, &ar->rcdev);
+   if (ret) {
+   dev_err(dev, "could not register reset controller\n");
+   return ret;
+   }
+
/* SoC generations share common layouts but have different divisors */
soc_data = of_device_get_match_data(dev);
if (!soc_data) {
diff --git a/include/dt-bindings/clock/aspeed-clock.h b/include/dt-bindings/clock/aspeed-clock.h
index 4a99421d77c8..8e19646d8025 100644
--- a/include/dt-bindings/clock/aspeed-clock.h
+++ b/include/dt-bindings/clock/aspeed-clock.h
@@ -39,4 +39,14 @@
 
 #define ASPEED_NUM_CLKS   35
 
+#define ASPEED_RESET_XDMA  0
+#define ASPEED_RESET_MCTP  1
+#define ASPEED_RESET_ADC   2
+#define ASPEED_RESET_JTAG_MASTER   3
+#define ASPEED_RESET_MIC   4
+#define ASPEED_RESET_PWM   5
+#define ASPEED_RESET_PCIVGA6
+#define ASPEED_RESET_I2C   7
+#define ASPEED_RESET_AHB   8
+
 #endif
-- 
2.14.1



[PATCH v4 3/5] clk: aspeed: Add platform driver and register PLLs

2017-10-02 Thread Joel Stanley
This registers a platform driver to set up all of the non-core clocks.

The clocks that have configurable rates are now registered.
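
To make the divider tables below concrete, here is a standalone sketch
(userspace C; divisor values copied from the ast2500 divisor table in
this series, while the register field value is an assumption) of how a
3-bit register field maps to a divisor:

  #include <stdio.h>

  struct div_entry { unsigned int val, div; };

  static const struct div_entry ast2500_div_table[] = {
          { 0x0, 4 },  { 0x1, 8 },  { 0x2, 12 }, { 0x3, 16 },
          { 0x4, 20 }, { 0x5, 24 }, { 0x6, 28 }, { 0x7, 32 },
  };

  int main(void)
  {
          unsigned int field = 0x3;       /* assumed SCU field value */

          /* In the kernel, clk_hw_register_divider_table() performs
           * this lookup; e.g. "sdio" = "hpll" / 16 here. */
          printf("divisor = %u\n", ast2500_div_table[field].div);
          return 0;
  }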

Signed-off-by: Joel Stanley 

--
v4:
 - Add eclk div table to fix ast2500 calculation
 - Add defines to document the BIT() macros
 - Pass dev where we can when registering clocks
 - Check for errors when registering clk_hws
v3:
 - Fix bclk and eclk calculation
 - Separate out ast2400 and ast2500 for pll calculation

Signed-off-by: Joel Stanley 
---
 drivers/clk/clk-aspeed.c | 163 +++
 1 file changed, 163 insertions(+)

diff --git a/drivers/clk/clk-aspeed.c b/drivers/clk/clk-aspeed.c
index d39cf51a5114..adb295292189 100644
--- a/drivers/clk/clk-aspeed.c
+++ b/drivers/clk/clk-aspeed.c
@@ -14,6 +14,8 @@
 #include 
 #include 
 #include 
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
 #include 
 #include 
 #include 
@@ -114,6 +116,32 @@ static const struct aspeed_gate_data aspeed_gates[] __initconst = {
 	[ASPEED_CLK_GATE_LHCCLK] =  { 28, -1, "lhclk-gate", "lhclk", 0 }, /* LPC master/LPC+ */
 };
 
+static const char * const eclk_parents[] = {"d1pll", "hpll", "mpll"};
+
+static const struct clk_div_table ast2500_eclk_div_table[] = {
+   { 0x0, 2 },
+   { 0x1, 2 },
+   { 0x2, 3 },
+   { 0x3, 4 },
+   { 0x4, 5 },
+   { 0x5, 6 },
+   { 0x6, 7 },
+   { 0x7, 8 },
+   { 0 }
+};
+
+static const struct clk_div_table ast2500_mac_div_table[] = {
+   { 0x0, 4 }, /* Yep, really. Aspeed confirmed this is correct */
+   { 0x1, 4 },
+   { 0x2, 6 },
+   { 0x3, 8 },
+   { 0x4, 10 },
+   { 0x5, 12 },
+   { 0x6, 14 },
+   { 0x7, 16 },
+   { 0 }
+};
+
 static const struct clk_div_table ast2400_div_table[] = {
{ 0x0, 2 },
{ 0x1, 4 },
@@ -179,6 +207,141 @@ static struct clk_hw *aspeed_ast2500_calc_pll(const char *name, u32 val)
mult, div);
 }
 
+struct aspeed_clk_soc_data {
+   const struct clk_div_table *div_table;
+   const struct clk_div_table *mac_div_table;
+   const struct clk_div_table *eclk_div_table;
+   struct clk_hw *(*calc_pll)(const char *name, u32 val);
+};
+
+static const struct aspeed_clk_soc_data ast2500_data = {
+   .div_table = ast2500_div_table,
+   .mac_div_table = ast2500_mac_div_table,
+   .eclk_div_table = ast2500_eclk_div_table,
+   .calc_pll = aspeed_ast2500_calc_pll,
+};
+
+static const struct aspeed_clk_soc_data ast2400_data = {
+   .div_table = ast2400_div_table,
+   .mac_div_table = ast2400_div_table,
+   .eclk_div_table = ast2400_div_table,
+   .calc_pll = aspeed_ast2400_calc_pll,
+};
+
+static int aspeed_clk_probe(struct platform_device *pdev)
+{
+   const struct aspeed_clk_soc_data *soc_data;
+   struct device *dev = &pdev->dev;
+   struct regmap *map;
+   struct clk_hw *hw;
+   u32 val, rate;
+
+   map = syscon_node_to_regmap(dev->of_node);
+   if (IS_ERR(map)) {
+   dev_err(dev, "no syscon regmap\n");
+   return PTR_ERR(map);
+   }
+
+   /* SoC generations share common layouts but have different divisors */
+   soc_data = of_device_get_match_data(dev);
+   if (!soc_data) {
+   dev_err(dev, "no match data for platform\n");
+   return -EINVAL;
+   }
+
+   /* UART clock div13 setting */
+   regmap_read(map, ASPEED_MISC_CTRL, &val);
+   if (val & UART_DIV13_EN)
+   rate = 24000000 / 13;
+   else
+   rate = 24000000;
+   /* TODO: Find the parent data for the uart clock */
+   hw = clk_hw_register_fixed_rate(dev, "uart", NULL, 0, rate);
+   if (IS_ERR(hw))
+   return PTR_ERR(hw);
+   aspeed_clk_data->hws[ASPEED_CLK_UART] = hw;
+
+   /*
+* Memory controller (M-PLL) PLL. This clock is configured by the
+* bootloader, and is exposed to Linux as a read-only clock rate.
+*/
+   regmap_read(map, ASPEED_MPLL_PARAM, &val);
+   hw = soc_data->calc_pll("mpll", val);
+   if (IS_ERR(hw))
+   return PTR_ERR(hw);
+   aspeed_clk_data->hws[ASPEED_CLK_MPLL] = hw;
+
+   /* SD/SDIO clock divider (TODO: There's a gate too) */
+   hw = clk_hw_register_divider_table(dev, "sdio", "hpll", 0,
+   scu_base + ASPEED_CLK_SELECTION, 12, 3, 0,
+   soc_data->div_table,
+   &aspeed_clk_lock);
+   if (IS_ERR(hw))
+   return PTR_ERR(hw);
+   aspeed_clk_data->hws[ASPEED_CLK_SDIO] = hw;
+
+   /* MAC AHB bus clock divider */
+   hw = clk_hw_register_divider_table(dev, "mac", "hpll", 0,
+   scu_base + ASPEED_CLK_SELECTION, 16, 3, 0,
+   soc_data->mac_div_table,
+   &aspeed_clk_lock);
+   if (IS_ERR(hw))
+   return PTR_ERR(hw);
+   aspeed_clk_data->hws[ASPEED_CLK_MAC] = hw;
+
+   /* LPC Host (LHCLK) clock divider */
+  

[PATCH v4 4/5] clk: aspeed: Register gated clocks

2017-10-02 Thread Joel Stanley
The majority of the clocks in the system are gates paired with a reset
controller that holds the IP in reset.

This borrows from clk_hw_register_gate, but registers two 'gates', one
to control the clock enable register and the other to control the reset
IP. This allows us to enforce the ordering:

 1. Place IP in reset
 2. Enable clock
 3. Delay
 4. Release reset

There are some gates that do not have an associated reset; these are
handled by using -1 as the index for the reset.
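
From a consumer's point of view the whole sequence hides behind the
ordinary clk API; a hedged sketch (the device and the NULL clock lookup
are assumptions, not from this patch):

  #include <linux/clk.h>
  #include <linux/device.h>
  #include <linux/err.h>

  static int example_enable(struct device *dev)
  {
          struct clk *clk = devm_clk_get(dev, NULL);

          if (IS_ERR(clk))
                  return PTR_ERR(clk);

          /* This ends up in aspeed_clk_enable(): put the IP in reset,
           * enable the clock, delay, then release the reset. */
          return clk_prepare_enable(clk);
  }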

Signed-off-by: Joel Stanley 

---
v4:
 - Drop useless 'disable clock' comment
 - Drop CLK_IS_BASIC flag
 - Fix 'there are a number of clocks...' comment
 - Pass device to clk registration functions
 - Check for errors when registering clk_hws
v3:
 - Remove gates offset as gates are now at the start of the list
---
 drivers/clk/clk-aspeed.c | 130 +++
 1 file changed, 130 insertions(+)

diff --git a/drivers/clk/clk-aspeed.c b/drivers/clk/clk-aspeed.c
index adb295292189..a424b056e767 100644
--- a/drivers/clk/clk-aspeed.c
+++ b/drivers/clk/clk-aspeed.c
@@ -228,6 +228,107 @@ static const struct aspeed_clk_soc_data ast2400_data = {
.calc_pll = aspeed_ast2400_calc_pll,
 };
 
+static int aspeed_clk_enable(struct clk_hw *hw)
+{
+   struct aspeed_clk_gate *gate = to_aspeed_clk_gate(hw);
+   unsigned long flags;
+   u32 clk = BIT(gate->clock_idx);
+   u32 rst = BIT(gate->reset_idx);
+
+   spin_lock_irqsave(gate->lock, flags);
+
+   if (gate->reset_idx >= 0) {
+   /* Put IP in reset */
+   regmap_update_bits(gate->map, ASPEED_RESET_CTRL, rst, rst);
+
+   /* Delay 100us */
+   udelay(100);
+   }
+
+   /* Enable clock */
+   regmap_update_bits(gate->map, ASPEED_CLK_STOP_CTRL, clk, 0);
+
+   if (gate->reset_idx >= 0) {
+   /* Delay 10ms */
+   /* TODO: can we sleep here? */
+   msleep(10);
+
+   /* Take IP out of reset */
+   regmap_update_bits(gate->map, ASPEED_RESET_CTRL, rst, 0);
+   }
+
+   spin_unlock_irqrestore(gate->lock, flags);
+
+   return 0;
+}
+
+static void aspeed_clk_disable(struct clk_hw *hw)
+{
+   struct aspeed_clk_gate *gate = to_aspeed_clk_gate(hw);
+   unsigned long flags;
+   u32 clk = BIT(gate->clock_idx);
+
+   spin_lock_irqsave(gate->lock, flags);
+
+   regmap_update_bits(gate->map, ASPEED_CLK_STOP_CTRL, clk, clk);
+
+   spin_unlock_irqrestore(gate->lock, flags);
+}
+
+static int aspeed_clk_is_enabled(struct clk_hw *hw)
+{
+   struct aspeed_clk_gate *gate = to_aspeed_clk_gate(hw);
+   u32 clk = BIT(gate->clock_idx);
+   u32 reg;
+
+   regmap_read(gate->map, ASPEED_CLK_STOP_CTRL, ®);
+
+   return (reg & clk) ? 0 : 1;
+}
+
+static const struct clk_ops aspeed_clk_gate_ops = {
+   .enable = aspeed_clk_enable,
+   .disable = aspeed_clk_disable,
+   .is_enabled = aspeed_clk_is_enabled,
+};
+
+static struct clk_hw *aspeed_clk_hw_register_gate(struct device *dev,
+   const char *name, const char *parent_name, unsigned long flags,
+   struct regmap *map, u8 clock_idx, u8 reset_idx,
+   u8 clk_gate_flags, spinlock_t *lock)
+{
+   struct aspeed_clk_gate *gate;
+   struct clk_init_data init;
+   struct clk_hw *hw;
+   int ret;
+
+   gate = kzalloc(sizeof(*gate), GFP_KERNEL);
+   if (!gate)
+   return ERR_PTR(-ENOMEM);
+
+   init.name = name;
+   init.ops = &aspeed_clk_gate_ops;
+   init.flags = flags;
+   init.parent_names = parent_name ? &parent_name : NULL;
+   init.num_parents = parent_name ? 1 : 0;
+
+   gate->map = map;
+   gate->clock_idx = clock_idx;
+   gate->reset_idx = reset_idx;
+   gate->flags = clk_gate_flags;
+   gate->lock = lock;
+   gate->hw.init = &init;
+
+   hw = &gate->hw;
+   ret = clk_hw_register(dev, hw);
+   if (ret) {
+   kfree(gate);
+   hw = ERR_PTR(ret);
+   }
+
+   return hw;
+}
+
 static int aspeed_clk_probe(struct platform_device *pdev)
 {
const struct aspeed_clk_soc_data *soc_data;
@@ -235,6 +336,7 @@ static int aspeed_clk_probe(struct platform_device *pdev)
struct regmap *map;
struct clk_hw *hw;
u32 val, rate;
+   int i;
 
map = syscon_node_to_regmap(dev->of_node);
if (IS_ERR(map)) {
@@ -323,6 +425,34 @@ static int aspeed_clk_probe(struct platform_device *pdev)
return PTR_ERR(hw);
aspeed_clk_data->hws[ASPEED_CLK_BCLK] = hw;
 
+   /*
+* TODO: There are a number of clocks that are not included in this driver
+* as more information is required:
+*   D2-PLL
+*   D-PLL
+*   YCLK
+*   RGMII
+*   RMII
+*   UART[1..5] clock source mux
+*/
+
+   for (i = 0; i < ARRAY_SIZE(aspeed_gates); i++) {
+   const struct aspeed_gate_data *

[PATCH v4 1/5] clk: Add clock driver for ASPEED BMC SoCs

2017-10-02 Thread Joel Stanley
This adds the stub of a driver for the ASPEED SoCs. The clocks are
defined and the static registration is set up.
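
A minimal sketch of that static registration pattern (hedged: the
function name is an assumption and error handling is elided; the
clk_hw_onecell_data API is the standard common-clock one):

  #include <linux/clk-provider.h>
  #include <linux/of.h>
  #include <linux/slab.h>

  static struct clk_hw_onecell_data *aspeed_clk_data;

  static void __init example_cc_init(struct device_node *np)
  {
          aspeed_clk_data = kzalloc(sizeof(*aspeed_clk_data) +
                          sizeof(*aspeed_clk_data->hws) * ASPEED_NUM_CLKS,
                          GFP_KERNEL);
          if (!aspeed_clk_data)
                  return;
          aspeed_clk_data->num = ASPEED_NUM_CLKS;

          /* ... register fixed/gate clocks into aspeed_clk_data->hws ... */

          of_clk_add_hw_provider(np, of_clk_hw_onecell_get, aspeed_clk_data);
  }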

Signed-off-by: Joel Stanley 
---
v3:
 - use named initialisers for aspeed_gates table
 - fix clocks typo
 - Move ASPEED_NUM_CLKS to the bottom of the list
 - Put gates at the start of the list, so we can use them to initialise
   the aspeed_gates table
 - Add ASPEED_CLK_SELECTION_2
 - Set parent of network MAC gates
---
 drivers/clk/Kconfig  |  12 +++
 drivers/clk/Makefile |   1 +
 drivers/clk/clk-aspeed.c | 148 +++
 include/dt-bindings/clock/aspeed-clock.h |  42 +
 4 files changed, 203 insertions(+)
 create mode 100644 drivers/clk/clk-aspeed.c
 create mode 100644 include/dt-bindings/clock/aspeed-clock.h

diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig
index 1c4e1aa6767e..9abe063ef8d2 100644
--- a/drivers/clk/Kconfig
+++ b/drivers/clk/Kconfig
@@ -142,6 +142,18 @@ config COMMON_CLK_GEMINI
  This driver supports the SoC clocks on the Cortina Systems Gemini
  platform, also known as SL3516 or CS3516.
 
+config COMMON_CLK_ASPEED
+   bool "Clock driver for Aspeed BMC SoCs"
+   depends on ARCH_ASPEED || COMPILE_TEST
+   default ARCH_ASPEED
+   select MFD_SYSCON
+   select RESET_CONTROLLER
+   ---help---
+ This driver supports the SoC clocks on the Aspeed BMC platforms.
+
+ The G4 and G5 series, including the ast2400 and ast2500, are supported
+ by this driver.
+
 config COMMON_CLK_S2MPS11
tristate "Clock driver for S2MPS1X/S5M8767 MFD"
depends on MFD_SEC_CORE || COMPILE_TEST
diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile
index c99f363826f0..575c68919d9b 100644
--- a/drivers/clk/Makefile
+++ b/drivers/clk/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_ARCH_CLPS711X)   += clk-clps711x.o
 obj-$(CONFIG_COMMON_CLK_CS2000_CP) += clk-cs2000-cp.o
 obj-$(CONFIG_ARCH_EFM32)   += clk-efm32gg.o
 obj-$(CONFIG_COMMON_CLK_GEMINI)   += clk-gemini.o
+obj-$(CONFIG_COMMON_CLK_ASPEED)   += clk-aspeed.o
 obj-$(CONFIG_ARCH_HIGHBANK)+= clk-highbank.o
 obj-$(CONFIG_CLK_HSDK) += clk-hsdk-pll.o
 obj-$(CONFIG_COMMON_CLK_MAX77686)  += clk-max77686.o
diff --git a/drivers/clk/clk-aspeed.c b/drivers/clk/clk-aspeed.c
new file mode 100644
index ..a45eb351bb05
--- /dev/null
+++ b/drivers/clk/clk-aspeed.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2017 IBM Corporation
+ *
+ * Joel Stanley 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "clk-aspeed: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include <dt-bindings/clock/aspeed-clock.h>
+
+#define ASPEED_STRAP   0x70
+
+/* Keeps track of all clocks */
+static struct clk_hw_onecell_data *aspeed_clk_data;
+
+static void __iomem *scu_base;
+
+/**
+ * struct aspeed_gate_data - Aspeed gated clocks
+ * @clock_idx: bit used to gate this clock in the clock register
+ * @reset_idx: bit used to reset this IP in the reset register. -1 if no
+ * reset is required when enabling the clock
+ * @name: the clock name
+ * @parent_name: the name of the parent clock
+ * @flags: standard clock framework flags
+ */
+struct aspeed_gate_data {
+   u8  clock_idx;
+   s8  reset_idx;
+   const char  *name;
+   const char  *parent_name;
+   unsigned long   flags;
+};
+
+/**
+ * struct aspeed_clk_gate - Aspeed specific clk_gate structure
+ * @hw:    handle between common and hardware-specific interfaces
+ * @reg:   register controlling gate
+ * @clock_idx: bit used to gate this clock in the clock register
+ * @reset_idx: bit used to reset this IP in the reset register. -1 if no
+ * reset is required when enabling the clock
+ * @flags: hardware-specific flags
+ * @lock:  register lock
+ *
+ * Some of the clocks in the Aspeed SoC must be put in reset before enabling.
+ * This modified version of clk_gate allows an optional reset bit to be
+ * specified.
+ */
+struct aspeed_clk_gate {
+   struct clk_hw   hw;
+   struct regmap   *map;
+   u8  clock_idx;
+   s8  reset_idx;
+   u8  flags;
+   spinlock_t  *lock;
+};
+
+#define to_aspeed_clk_gate(_hw) container_of(_hw, struct aspeed_clk_gate, hw)
+
+/* TODO: ask Aspeed about the actual parent data */
+static const struct aspeed_gate_data aspeed_gates[] __initconst = {
+   /*   clk rst   name parent   flags */
+   [ASPEED_CLK_GATE_ECLK] = {  0, -1, "eclk-gate",  "eclk", 0 }, /* Video Engine */
+   [ASPEED_CLK_GATE_GCLK] ={  1,  7

[PATCH v4 2/5] clk: aspeed: Register core clocks

2017-10-02 Thread Joel Stanley
This registers the core clocks; those which are required to calculate
the rate of the timer peripheral so the system can load a clocksource
driver.
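
As a worked example of the AST2400 HPLL formula used in this patch (a
standalone userspace sketch; the register field values are assumptions,
not taken from a datasheet):

  #include <stdio.h>

  int main(void)
  {
          /* F = 24MHz * (2 - OD) * (N + 2) / (D + 1), as in
           * aspeed_ast2400_calc_pll(); N=14, OD=0, D=1 assumed. */
          unsigned int val = (14 << 5) | (0 << 4) | 1;
          unsigned int n  = (val >> 5) & 0x3f;
          unsigned int od = (val >> 4) & 0x1;
          unsigned int d  = val & 0xf;
          unsigned long rate = 24000000UL * (2 - od) * (n + 2) / (d + 1);

          printf("hpll = %lu Hz\n", rate);        /* 384000000 */
          return 0;
  }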

Signed-off-by: Joel Stanley 

---
v4:
  - Add defines to document the BIT() macros
v3:
  - Fix ast2400 ahb calculation
  - Remove incorrect 'this is wrong' comment
  - Separate out clkin calc to be per platform
  - Support 48MHz clkin on ast2400
---
 drivers/clk/clk-aspeed.c | 177 +++
 1 file changed, 177 insertions(+)

diff --git a/drivers/clk/clk-aspeed.c b/drivers/clk/clk-aspeed.c
index a45eb351bb05..d39cf51a5114 100644
--- a/drivers/clk/clk-aspeed.c
+++ b/drivers/clk/clk-aspeed.c
@@ -20,7 +20,23 @@
 
 #include 
 
+#define ASPEED_RESET_CTRL  0x04
+#define ASPEED_CLK_SELECTION   0x08
+#define ASPEED_CLK_STOP_CTRL   0x0c
+#define ASPEED_MPLL_PARAM  0x20
+#define ASPEED_HPLL_PARAM  0x24
+#define  AST2500_HPLL_BYPASS_EN   BIT(20)
+#define  AST2400_HPLL_STRAPPED    BIT(18)
+#define  AST2400_HPLL_BYPASS_EN   BIT(17)
+#define ASPEED_MISC_CTRL   0x2c
+#define  UART_DIV13_EN BIT(12)
 #define ASPEED_STRAP   0x70
+#define  CLKIN_25MHZ_EN           BIT(23)
+#define  AST2400_CLK_SOURCE_SEL   BIT(18)
+#define ASPEED_CLK_SELECTION_2 0xd8
+
+/* Globally visible clocks */
+static DEFINE_SPINLOCK(aspeed_clk_lock);
 
 /* Keeps track of all clocks */
 static struct clk_hw_onecell_data *aspeed_clk_data;
@@ -98,6 +114,160 @@ static const struct aspeed_gate_data aspeed_gates[] __initconst = {
 	[ASPEED_CLK_GATE_LHCCLK] =  { 28, -1, "lhclk-gate", "lhclk", 0 }, /* LPC master/LPC+ */
 };
 
+static const struct clk_div_table ast2400_div_table[] = {
+   { 0x0, 2 },
+   { 0x1, 4 },
+   { 0x2, 6 },
+   { 0x3, 8 },
+   { 0x4, 10 },
+   { 0x5, 12 },
+   { 0x6, 14 },
+   { 0x7, 16 },
+   { 0 }
+};
+
+static const struct clk_div_table ast2500_div_table[] = {
+   { 0x0, 4 },
+   { 0x1, 8 },
+   { 0x2, 12 },
+   { 0x3, 16 },
+   { 0x4, 20 },
+   { 0x5, 24 },
+   { 0x6, 28 },
+   { 0x7, 32 },
+   { 0 }
+};
+
+static struct clk_hw *aspeed_ast2400_calc_pll(const char *name, u32 val)
+{
+   unsigned int mult, div;
+
+   if (val & AST2400_HPLL_BYPASS_EN) {
+   /* Pass through mode */
+   mult = div = 1;
+   } else {
+   /* F = 24Mhz * (2-OD) * [(N + 2) / (D + 1)] */
+   u32 n = (val >> 5) & 0x3f;
+   u32 od = (val >> 4) & 0x1;
+   u32 d = val & 0xf;
+
+   mult = (2 - od) * (n + 2);
+   div = d + 1;
+   }
+   return clk_hw_register_fixed_factor(NULL, name, "clkin", 0,
+   mult, div);
+};
+
+static struct clk_hw *aspeed_ast2500_calc_pll(const char *name, u32 val)
+{
+   unsigned int mult, div;
+
+   if (val & AST2500_HPLL_BYPASS_EN) {
+   /* Pass through mode */
+   mult = div = 1;
+   } else {
+   /* F = clkin * [(M+1) / (N+1)] / (P + 1) */
+   u32 p = (val >> 13) & 0x3f;
+   u32 m = (val >> 5) & 0xff;
+   u32 n = val & 0x1f;
+
+   mult = (m + 1) / (n + 1);
+   div = p + 1;
+   }
+
+   return clk_hw_register_fixed_factor(NULL, name, "clkin", 0,
+   mult, div);
+}
+
+static void __init aspeed_ast2400_cc(struct regmap *map)
+{
+   struct clk_hw *hw;
+   u32 val, freq, div;
+
+   /*
+* CLKIN is the crystal oscillator, 24, 48 or 25MHz selected by
+* strapping
+*/
+   regmap_read(map, ASPEED_STRAP, &val);
+   if (val & CLKIN_25MHZ_EN)
+   freq = 25000000;
+   else if (val & AST2400_CLK_SOURCE_SEL)
+   freq = 48000000;
+   else
+   freq = 24000000;
+   hw = clk_hw_register_fixed_rate(NULL, "clkin", NULL, 0, freq);
+   pr_debug("clkin @%u MHz\n", freq / 1000000);
+
+   /*
+* High-speed PLL clock derived from the crystal. This is the CPU clock,
+* and we assume that it is enabled
+*/
+   regmap_read(map, ASPEED_HPLL_PARAM, &val);
+   WARN(val & AST2400_HPLL_STRAPPED, "hpll is strapped not configured");
+   aspeed_clk_data->hws[ASPEED_CLK_HPLL] = aspeed_ast2400_calc_pll("hpll", val);
+
+   /*
+* Strap bits 11:10 define the CPU/AHB clock frequency ratio (aka HCLK)
+*   00: Select CPU:AHB = 1:1
+*   01: Select CPU:AHB = 2:1
+*   10: Select CPU:AHB = 4:1
+*   11: Select CPU:AHB = 3:1
+*/
+   regmap_read(map, ASPEED_STRAP, &val);
+   val = (val >> 10) & 0x3;
+   div = val + 1;
+   if (div == 3)
+   div = 4;
+   else if (div == 4)
+   div = 3;
+   hw = clk_hw_register_fixed_factor(NULL, "ahb", "hpll", 0, 1, div);
+   aspeed_clk_data->hws[ASPEED_CLK_AHB] = hw;
+
+   /* APB clock: clock selection register SCU08 (aka PCLK) */
+   

[PATCH v4 0/5] clk: Add Aspeed clock driver

2017-10-02 Thread Joel Stanley
This driver supports the ast2500, ast2400 (and derivative) BMC SoCs from
Aspeed.

This is v4. See patches for detailed changelogs.

v4: Address review from Andrew and Stephen. 
v3: Address review from Andrew and has seen more testing on hardware
v2: split the driver out into a series of patches to make them easier to
review.

All of the important clocks are supported, with most non-essential ones
also implemented where information is available. I am working with
Aspeed to clear up some of the missing information, including the
missing parent-sibling relationships.

We need to know the rate of the apb clock in order to correctly program
the clocksource driver, so the apb and its parents are created in the
CLK_OF_DECLARE_DRIVER callback.
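
For reference, that early hook is wired up with a one-liner of this
shape (a sketch: the init function name is an assumption, while the
compatible string is the ast2500 SCU one):

  CLK_OF_DECLARE_DRIVER(aspeed_cc_g5, "aspeed,ast2500-scu", aspeed_cc_g5_init);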

The rest of the clocks are created at normal driver probe time. I
followed the Gemini driver's lead with using the regmap where I could,
but also having a pointer to the base address for use with the common
clock callbacks.

The driver borrows from the clk_gate common clock infrastructure, but modifies
it in order to support the clock gate and reset pair that most of the clocks
have. This pair must be reset, ungated, then released, with appropriate delays,
according to the datasheet.

The first patch introduces the core clock registration parts, and describes
the clocks. The second creates the core clocks, giving the system enough to
boot (but without uart). Next come the non-core clocks, and finally the reset
controller that is used for the few clocks that don't have a gate to go
with their reset pair.

Please review!

Cheers,

Joel

Joel Stanley (5):
  clk: Add clock driver for ASPEED BMC SoCs
  clk: aspeed: Register core clocks
  clk: aspeed: Add platform driver and register PLLs
  clk: aspeed: Register gated clocks
  clk: aspeed: Add reset controller

 drivers/clk/Kconfig  |  12 +
 drivers/clk/Makefile |   1 +
 drivers/clk/clk-aspeed.c | 698 +++
 include/dt-bindings/clock/aspeed-clock.h |  52 +++
 4 files changed, 763 insertions(+)
 create mode 100644 drivers/clk/clk-aspeed.c
 create mode 100644 include/dt-bindings/clock/aspeed-clock.h

-- 
2.14.1



Re: [patch V2 22/29] lockup_detector: Make watchdog_nmi_reconfigure() two stage

2017-10-02 Thread Thomas Gleixner
On Tue, 3 Oct 2017, Michael Ellerman wrote:
> Hi Thomas,
> Unfortunately this is hitting the WARN_ON in start_wd_cpu() on powerpc
> because we're calling it multiple times for the boot CPU.
> 
> The first call is via:
> 
>   start_wd_on_cpu+0x80/0x2f0
>   watchdog_nmi_reconfigure+0x124/0x170
>   softlockup_reconfigure_threads+0x110/0x130
>   lockup_detector_init+0xbc/0xe0
>   kernel_init_freeable+0x18c/0x37c
>   kernel_init+0x2c/0x160
>   ret_from_kernel_thread+0x5c/0xbc
> 
> And then again via the CPU hotplug registration:
> 
>   start_wd_on_cpu+0x80/0x2f0
>   cpuhp_invoke_callback+0x194/0x620
>   cpuhp_thread_fun+0x7c/0x1b0
>   smpboot_thread_fn+0x290/0x2a0
>   kthread+0x168/0x1b0
>   ret_from_kernel_thread+0x5c/0xbc
> 
> 
> The first call is new because previously watchdog_nmi_reconfigure()
> wasn't called from softlockup_reconfigure_threads().

Hmm, don't you have the same problem with CPU hotplug or do you just get
lucky because the hotplug callback in your code is ordered vs. the
softlockup thread hotplug callback in a way that this does not hit?

> I'm not sure what the easiest fix is. One option would be to just drop
> the WARN_ON, it's just there for paranoia AFAICS.

The straight forward way is to make use of the new probe function. Patch
below.

Thanks,

tglx

8<--
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -375,20 +375,18 @@ void watchdog_nmi_start(void)
 /*
  * This runs after lockup_detector_init() which sets up watchdog_cpumask.
  */
-static int __init powerpc_watchdog_init(void)
+int __init watchdog_nmi_probe(void)
 {
int err;
 
-   watchdog_calc_timeouts();
-
-   err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online",
-   start_wd_on_cpu, stop_wd_on_cpu);
+   err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+   "powerpc/watchdog:online",
+   start_wd_on_cpu, stop_wd_on_cpu);
if (err < 0)
pr_warn("Watchdog could not be initialized");
 
return 0;
 }
-arch_initcall(powerpc_watchdog_init);
 
 static void handle_backtrace_ipi(struct pt_regs *regs)
 {
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -608,7 +608,6 @@ static inline int watchdog_park_threads(
 static inline void watchdog_unpark_threads(void) { }
 static inline int watchdog_enable_all_cpus(void) { return 0; }
 static inline void watchdog_disable_all_cpus(void) { }
-static inline void softlockup_init_threads(void) { }
 static void softlockup_reconfigure_threads(void)
 {
cpus_read_lock();
@@ -617,6 +616,10 @@ static void softlockup_reconfigure_threa
watchdog_nmi_start();
cpus_read_unlock();
 }
+static inline void softlockup_init_threads(void)
+{
+   softlockup_reconfigure_threads();
+}
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
 static void __lockup_detector_cleanup(void)


Re: regression in 4.14-rc2 caused by apparmor: add base infastructure for socket mediation

2017-10-02 Thread Vlastimil Babka
On 10/03/2017 07:15 AM, James Bottomley wrote:
> On Mon, 2017-10-02 at 21:11 -0700, John Johansen wrote:
>> On 10/02/2017 09:02 PM, James Bottomley wrote:
>>>
>>> The specific problem is that dnsmasq refuses to start on openSUSE
>>> Leap 42.2.  The specific cause is that an attempt to open a
>>> PF_LOCAL socket gets EACCES.  This means that networking doesn't
>>> function on a system with a 4.14-rc2 system.
>>>
>>> Reverting commit 651e28c5537abb39076d3949fb7618536f1d242e
>>> (apparmor: add base infastructure for socket mediation) causes the
>>> system to function again.
>>>
>>
>> This is not a kernel regression,
> 
> Regression means something that worked in a previous version of the
> kernel which is broken now. This problem falls within that definition.

Hm, but if this was because opensuse kernel and apparmor rules relied on
an out-of-tree patch, then it's not an upstream regression?

>> it is because opensuse dnsmasq is starting with policy that
>> doesn't allow access to PF_LOCAL socket
> 
> Because there was no co-ordination between their version of the patch
> and yours.  If you're sending in patches that you know might break
> systems because they need a co-ordinated rollout of something in
> userspace then it would be nice if you could co-ordinate it ...
> 
> Doing it in the merge window and not in -rc2 would also be helpful
> because I have more expectation of a userspace mismatch from stuff in
> the merge window.

Agree, but with rc2 there's still plenty of time, and running rcX means
some issues can be expected...

>> Christian Boltz the opensuse apparmor maintainer has been working
>> on a policy update for opensuse see bug
>>
>> https://bugzilla.opensuse.org/show_bug.cgi?id=1061195
> 
> Well, that looks really encouraging: The line about "To give you an
> impression what "lots of" means - I had to adjust 40 profiles on my
> laptop".  The upshot being apart from a bandaid, openSUSE still has no
> co-ordinated fix for this.

Note that the openSUSE Leap 42.2 kernel is 4.4, so by running 4.14 means
you are unsupported from the distro POV and you can't expect that the
42.2 apparmor profiles will ever be updated. I reported the bug above
for the Tumbleweed rolling distro, which gets new kernels after the
final version is released and passes QA. rcX kernels are packaged for
testing, but you have to add the repo explicitly. So there's still
enough time to co-ordinate fix of profiles and final 4.14 even for
Tumbleweed.

> James
> 



Re: [RESEND PATCH] prctl: add PR_[GS]ET_PDEATHSIG_PROC

2017-10-02 Thread Jürg Billeter
On Mon, 2017-10-02 at 22:25 -0500, Eric W. Biederman wrote:
> The code where it calls group_send_sig_info is buggy for pdeath_signal.
> And it no less buggy for this new case.  There is no point to check
> permissions when sending a signal to yourself.  Especially this signal
> gets cleared during exec with a change of permissions.
> 
> 
> I would recommend using:
>  do_send_sig_info(p->signal->pdeath_signal_proc, SEND_SIG_NOINFO, p, true);
> 
> Perhaps with a comment saying that no permission check is needed when
> sending a signal to yourself.

Depending on how you look at it, one could also argue that the dying
parent sends the signal. However, I'm fine with dropping the permission
check in v2. I'll also send a patch to change this for the existing
pdeath_signal.

> I don't know what I think about inherit over fork, and the whole tree
> killing thing.  Except when the signal is SIGKILL I don't know if that
> code does what is intended.  So I am a little leary of it.

I agree that inheritance across fork is mainly useful for SIGKILL.
While non-SIGKILL users could clear the setting after fork(), another
option would be to allow the caller to specify whether the setting
should be inherited using prctl arg3.

This would allow both: the exact process-based equivalent to
pdeath_signal (no inheritance), as well as the interesting SIGKILL case
for killing a process tree. Does this sound sensible? I'd be happy to
add this to v2.

Jürg


Re: [PATCH 5/6] lightnvm: pblk: free up mempool allocation for erases correctly

2017-10-02 Thread Javier González
> On 2 Oct 2017, at 19.18, Rakesh Pandit  wrote:
> 
> On Mon, Oct 02, 2017 at 03:25:10PM +0300, Rakesh Pandit wrote:
>> On Mon, Oct 02, 2017 at 02:09:35PM +0200, Javier González wrote:
>>> On 1 Oct 2017, at 15.25, Rakesh Pandit  wrote:
>>>> 
>>>> While separating read and erase mempools in 22da65a1b pblk_g_rq_cache
>>>> was used two times to set aside memory both for erase and read
>>>> requests.  Because the same kmem cache is used repeatedly, a single
>>>> call to kmem_cache_destroy wouldn't deallocate everything.  Repeatedly
>>>> loading and unloading the pblk module would eventually result in some
>>>> leak.
>>>> 
>>>> The fix is to really use a separate kmem cache and track it
>>>> appropriately.
>>>> 
>>>> Fixes: 22da65a1b ("lightnvm: pblk: decouple read/erase mempools")
>>>> Signed-off-by: Rakesh Pandit 
>>> 
>>> I'm not sure I follow this logic. I assume that you're thinking of the
>>> refcount on kmem_cache. During cache creation, all is good; if a
>>> different cache creation fails, destruction is guaranteed, since the
>>> refcount is 0. On tear down (pblk_core_free), we destroy the mempools
>>> associated to the caches. In this case, the refcount goes to 0 too, as
>>> we destroy the 2 mempools. So I don't see where the leak can happen. Am
>>> I missing something?
>>> 
>>> In any case, Jens reported some bugs on the mempools, where we did not
>>> guarantee forward progress. Here you can find the original discussion
>>> and the mempool audit [1]. Would be good if you reviewed these.
>>> 
>>> [1] https://www.spinics.net/lists/kernel/msg2602274.html
>> 
>> Thanks, yes makes sense to follow up in patch thread.  I will respond
>> to above questions there later today.
> 
> I wasn't thinking about it right, in addition to looking at test results
> from an incorrectly instrumented debug version.
> 
> I went through the series you pointed and all seem okay to me now.
> 
> Please drop this patch.
> 

Cool.

Javier




Re: 4.14-rc2 on thinkpad x220: out of memory when inserting mmc card

2017-10-02 Thread Adrian Hunter
On 02/10/17 17:09, Linus Walleij wrote:
> On Sun, Oct 1, 2017 at 12:57 PM, Tetsuo Handa
>  wrote:
> 
>>>> I inserted u-SD card, only to realize that it is not detected as it
>>>> should be. And dmesg indeed reveals:
>>>
>>> Tetsuo asked me to report this to linux-mm.
>>>
>>> But 2^4 is 16 pages, IIRC that can't be expected to work reliably, and
>>> thus this sounds like MMC bug, not mm bug.
> 
> 
> I'm not sure I fully understand this error message:
> "worker/2:1: page allocation failure: order:4"
> 
> What I guess from context is that the mmc_init_request()
> call is failing to allocate 16 pages, meaning for 4K pages
> 64KB which is the typical bounce buffer.
> 
> This is what the code has always allocated as bounce buffer,
> but it used to happen upfront, when probing the MMC block layer,
> rather than when allocating the requests.

That is not exactly right.  As I already wrote, the memory allocation used
to be optional but became mandatory with:

  commit 304419d8a7e9204c5d19b704467b814df8c8f5b1
  Author: Linus Walleij 
  Date:   Thu May 18 11:29:32 2017 +0200

  mmc: core: Allocate per-request data using the block layer core



Re: [RFC PATCH 0/2] Missing READ_ONCE in core and arch-specific pgtable code leading to crashes

2017-10-02 Thread Jon Masters
On 09/29/2017 04:56 AM, Will Deacon wrote:

> The full fix isn't just cosmetic; it's also addressing the wider problem
> of unannotated racing page table accesses outside of the specific failure
> case we've run into.

Let us know if there are additional tests we should be running on the
Red Hat end. We've got high hundreds of ARM server systems at this
point, including pretty much everything out there.

Jon.

-- 
Computer Architect | Sent from my Fedora powered laptop



Re: 4.14-rc2 on thinkpad x220: out of memory when inserting mmc card

2017-10-02 Thread Adrian Hunter
On 02/10/17 16:03, Pavel Machek wrote:
> On Mon 2017-10-02 14:06:03, Linus Walleij wrote:
>> On Mon, Oct 2, 2017 at 10:41 AM, Pavel Machek  wrote:
>>
>>>> Bounce buffers are being removed from v4.15
>>
>> As Adrian states, this would make any last bugs go away. I would
>> even consider putting this patch this into fixes if it solves the problem.
>>
>>>> although you may experience
>>>> performance regression with that:
>>>>
>>>>   https://marc.info/?l=linux-mmc&m=150589778700551
>>>
>>> Hmm. The performance of this is already pretty bad, I really hope it
>>> does not get any worse.
>>
>> Did you use bounce buffers? Those were improving performance on
>> some laptops with TI or Ricoh host controllers and nothing else was
>> ever really using it (as can be seen from the commit).
> 
> Thinkpad X220... how do I tell if I was using them? I believe so,
> because I uncovered bug in them before.

You are certainly using bounce buffers.  What does lspci -knn show?


Re: regression in 4.14-rc2 caused by apparmor: add base infastructure for socket mediation

2017-10-02 Thread John Johansen
On 10/02/2017 10:15 PM, James Bottomley wrote:
> On Mon, 2017-10-02 at 21:11 -0700, John Johansen wrote:
>> On 10/02/2017 09:02 PM, James Bottomley wrote:
>>>
>>> The specific problem is that dnsmasq refuses to start on openSUSE
>>> Leap 42.2.  The specific cause is that an attempt to open a
>>> PF_LOCAL socket gets EACCES.  This means that networking doesn't
>>> function on a system with a 4.14-rc2 system.
>>>
>>> Reverting commit 651e28c5537abb39076d3949fb7618536f1d242e
>>> (apparmor: add base infastructure for socket mediation) causes the
>>> system to function again.
>>>
>>
>> This is not a kernel regression,
> 
> Regression means something that worked in a previous version of the
> kernel which is broken now. This problem falls within that definition.
> 

sure, it's a regression for a suse-based system. It isn't, however, a
regression in the kernel code or interface. It makes the information
available, its a matter of how the user space and policy are
configured.

It is entirely possible to use the 4.14 kernel on suse without having
to modify policy if the policy version/feature set is pinned. However
this is not a feature that suse seems to be using. Instead suse policy
is tracking and enforcing all kernel supported features when they
become available, regardless of whether the policy has been updated.

This makes sense for a policy developers machine, not so much for a
general user. I will have to discuss this with Christian and Goldwyn.


>> it is because opensuse dnsmasq is starting with policy that
>> doesn't allow access to PF_LOCAL socket
> 
> Because there was no co-ordination between their version of the patch
> and yours.  If you're sending in patches that you know might break
> systems because they need a co-ordinated rollout of something in
> userspace then it would be nice if you could co-ordinate it ...
>

This information was communicated more than once. That is not to say
there were not issues with the landing or else you would not have seen
this. In fact I would say this particular sync was handled poorly and
we as an upstream certainly have to take some of the blame for it.

The userspace that supported the 4.14 kernel changes landed long
ago. It was specific policy updates that were missing.

Ideally your policy would have been pinned to a specific kernel
feature set, so that kernel changes would not have resulted in policy
issues.

> Doing it in the merge window and not in -rc2 would also be helpful
> because I have more expectation of a userspace mismatch from stuff in
> the merge window.
> 

Certainly and this would have landed during the merge window except
for an issue with the security tree. This particular series lived in
-next for several weeks before landing and I would have never asked
for it to have been pulled as late as it was except for the issue
around the security tree this last cycle.

>> Christian Boltz the opensuse apparmor maintainer has been working
>> on a policy update for opensuse see bug
>>
>> https://bugzilla.opensuse.org/show_bug.cgi?id=1061195
> 
> Well, that looks really encouraging: The line about "To give you an
> impression what "lots of" means - I had to adjust 40 profiles on my
> laptop".  The upshot being apart from a bandaid, openSUSE still has no
> co-ordinated fix for this.
> 

yes, it is a change that affects policy, the same can be said for any
other MAC system when new mediation is added. It can be fixed by either
configuring the feature set/version that policy is targeting or updating
policy.

For policy changes this particular change it can mostly be fixed by an
adjustment to the abstractions. The bandaid referenced has to do with
Christian choosing to use only what is supported in 4.14 instead of
the upstream solution which contains rules for work targeted beyond
4.14, even though userspace supports those rules already and will
compile them to a policy that works in 4.14.

However Christian wants to update the suse policy using the 4.14
kernel because he does not feel that he can properly verify the
upstream policy changes on suse with 4.14. This is an understandable
stance for him to take, but it does mean there is some disconnect
between what is in the upstream apparmor project and what is in suse.

Regardless this is a change that you shouldn't have noticed, so its
obvious the coordination was off and needs to be improved.


[PATCH v4 2/6] perf record: Get the first sample time and last sample time

2017-10-02 Thread Jin Yao
In perf record, all samples are walked anyway, so it's easy to pick up
the first/last samples and save their times to the perf file header via
the function write_sample_time().

Later, perf report/script will fetch the times from the perf file header.

Change log:
---
v3: Remove the definitions of first_sample_time and last_sample_time
from struct record and directly save them in perf_evlist.

Signed-off-by: Jin Yao 
---
 tools/perf/builtin-record.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 9b379f3..d5b78449 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -488,6 +488,11 @@ static int process_sample_event(struct perf_tool *tool,
 
rec->samples++;
 
+   if (rec->evlist->first_sample_time == 0)
+   rec->evlist->first_sample_time = sample->time;
+
+   rec->evlist->last_sample_time = sample->time;
+
return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
 }
 
-- 
2.7.4



[PATCH v4 1/6] perf header: Record first sample time and last sample time in perf file header

2017-10-02 Thread Jin Yao
perf report/script/... have a --time option to limit the time range
of output. That's very useful to slice large traces, e.g. when processing
the output of perf script for some analysis.

But right now --time only supports absolute time. Also there is no fast
way to get the start/end times of a given trace except for looking at it.
This makes it hard to e.g. only decode the first half of the trace, which
is useful for parallelization of scripts.

Another problem is that perf records are variable size and there is no
synchronization mechanism. So the only way to find the last sample reliably
would be to walk all samples. But we want to avoid that in perf report/...
because it is already quite expensive. That is why storing the first sample
time and last sample time in perf record is better.

This patch creates a new header feature type HEADER_SAMPLE_TIME and related
ops. Save the first sample time and the last sample time to the feature
section in perf file header.
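
For illustration, with assumed first/last sample times of 1.0 s and
2.5 s, the print_sample_time() helper added below would print:

  # time of first sample : 1.000000
  # time of last sample : 2.500000
  # sample duration :   1500.000 ms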

Change log:
---
v4: Use perf script time style for timestamp printing. Also add with
the printing of sample duration.

v3: Remove the definitions of first_sample_time/last_sample_time from
perf_session. Just define them in perf_evlist

Signed-off-by: Jin Yao 
---
 tools/perf/Documentation/perf.data-file-format.txt |  4 ++
 tools/perf/util/evlist.h   |  2 +
 tools/perf/util/header.c   | 60 ++
 tools/perf/util/header.h   |  1 +
 4 files changed, 67 insertions(+)

diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt
index e90c59c..d5e3043 100644
--- a/tools/perf/Documentation/perf.data-file-format.txt
+++ b/tools/perf/Documentation/perf.data-file-format.txt
@@ -238,6 +238,10 @@ struct auxtrace_index {
struct auxtrace_index_entry entries[PERF_AUXTRACE_INDEX_ENTRY_COUNT];
 };
 
+   HEADER_SAMPLE_TIME = 21,
+
+Two uint64_t for the time of first sample and the time of last sample.
+
other bits are reserved and should ignored for now
HEADER_FEAT_BITS= 256,
 
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index b1c14f1..9ccc2fb 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -99,6 +99,8 @@ struct perf_evlist {
struct perf_evsel *selected;
struct events_stats stats;
struct perf_env *env;
+   u64 first_sample_time;
+   u64 last_sample_time;
 };
 
 struct perf_evsel_str_handler {
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 605bbd5..61b28d89 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "evlist.h"
 #include "evsel.h"
@@ -36,6 +37,7 @@
 #include 
 #include "asm/bug.h"
 #include "tool.h"
+#include "time-utils.h"
 
 #include "sane_ctype.h"
 
@@ -1181,6 +1183,20 @@ static int write_stat(struct feat_fd *ff __maybe_unused,
return 0;
 }
 
+static int write_sample_time(struct feat_fd *ff,
+struct perf_evlist *evlist)
+{
+   int ret;
+
+   ret = do_write(ff, &evlist->first_sample_time,
+  sizeof(evlist->first_sample_time));
+   if (ret < 0)
+   return ret;
+
+   return do_write(ff, &evlist->last_sample_time,
+   sizeof(evlist->last_sample_time));
+}
+
 static void print_hostname(struct feat_fd *ff, FILE *fp)
 {
fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname);
@@ -1506,6 +1522,28 @@ static void print_group_desc(struct feat_fd *ff, FILE *fp)
}
 }
 
+static void print_sample_time(struct feat_fd *ff, FILE *fp)
+{
+   struct perf_session *session;
+   char time_buf[32];
+   double d;
+
+   session = container_of(ff->ph, struct perf_session, header);
+
+   timestamp__scnprintf_usec(session->evlist->first_sample_time,
+ time_buf, sizeof(time_buf));
+   fprintf(fp, "# time of first sample : %s\n", time_buf);
+
+   timestamp__scnprintf_usec(session->evlist->last_sample_time,
+ time_buf, sizeof(time_buf));
+   fprintf(fp, "# time of last sample : %s\n", time_buf);
+
+   d = (double)(session->evlist->last_sample_time -
+   session->evlist->first_sample_time) / NSEC_PER_MSEC;
+
+   fprintf(fp, "# sample duration : %10.3f ms\n", d);
+}
+
 static int __event_process_build_id(struct build_id_event *bev,
char *filename,
struct perf_session *session)
@@ -2147,6 +2185,27 @@ static int process_cache(struct feat_fd *ff, void *data __maybe_unused)
return -1;
 }
 
+static int process_sample_time(struct feat_fd *ff, void *data __maybe_unused)
+{
+   struct perf_session *session;
+   u64 first_sample_time, last_sample_time;
+   int ret;
+

[PATCH v4 4/6] perf util: Create function to perform multiple time range checking

2017-10-02 Thread Jin Yao
The previous patch added support for multiple time ranges.

For example, select the first and second 10% time slices.
perf report --time 10%/1,10%/2

We need a function to check if a timestamp is in the ranges of
[0, 10%) and [10%, 20%].

Note that it includes the last element in [10%, 20%] but it
doesn't include the last element in [0, 10%). This is to avoid
overlap between adjacent ranges.

This patch implements a new function, perf_time__ranges_skip_sample,
for this check.
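
To illustrate the boundary rule, here is a standalone sketch (userspace
C; the timestamps are made-up illustrative values) that mirrors the
check implemented below:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  struct range { uint64_t start, end; };

  /* Same rule as perf_time__ranges_skip_sample(): half-open for every
   * range except the last, which is closed. */
  static bool skip_sample(const struct range *r, int num, uint64_t t)
  {
          int i;

          for (i = 0; i < num; i++) {
                  bool last = (i == num - 1);

                  if (t >= r[i].start &&
                      (last ? t <= r[i].end : t < r[i].end))
                          return false;   /* inside a range: keep it */
          }
          return true;                    /* outside all ranges: skip */
  }

  int main(void)
  {
          struct range r[2] = { { 0, 100 }, { 100, 200 } };

          /* 100 matches only the second range; 200 is kept because the
           * last range is closed; 201 falls outside and is skipped. */
          printf("%d %d %d\n", skip_sample(r, 2, 100),
                               skip_sample(r, 2, 200),
                               skip_sample(r, 2, 201));   /* 0 0 1 */
          return 0;
  }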

Change log:
---
v4: Let perf_time__ranges_skip_sample be compatible with
perf_time__skip_sample when only one time range.

Signed-off-by: Jin Yao 
---
 tools/perf/util/time-utils.c | 28 
 tools/perf/util/time-utils.h |  3 +++
 2 files changed, 31 insertions(+)

diff --git a/tools/perf/util/time-utils.c b/tools/perf/util/time-utils.c
index 79e4281..b380356 100644
--- a/tools/perf/util/time-utils.c
+++ b/tools/perf/util/time-utils.c
@@ -299,6 +299,34 @@ bool perf_time__skip_sample(struct perf_time_interval *ptime, u64 timestamp)
return false;
 }
 
+bool perf_time__ranges_skip_sample(struct perf_time_interval *ptime_buf,
+  int num, u64 timestamp)
+{
+   struct perf_time_interval *ptime;
+   int i;
+
+   if ((timestamp == 0) || (num == 0))
+   return false;
+
+   if (num == 1)
+   return perf_time__skip_sample(&ptime_buf[0], timestamp);
+
+   /*
+* start/end of multiple time ranges must be valid.
+*/
+   for (i = 0; i < num; i++) {
+   ptime = &ptime_buf[i];
+
+   if (timestamp >= ptime->start &&
+   ((timestamp < ptime->end && i < num - 1) ||
+(timestamp <= ptime->end && i == num - 1))) {
+   break;
+   }
+   }
+
+   return (i == num) ? true : false;
+}
+
 int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz)
 {
u64  sec = timestamp / NSEC_PER_SEC;
diff --git a/tools/perf/util/time-utils.h b/tools/perf/util/time-utils.h
index fd018e2..de279ea 100644
--- a/tools/perf/util/time-utils.h
+++ b/tools/perf/util/time-utils.h
@@ -17,6 +17,9 @@ int perf_time__percent_parse_str(struct perf_time_interval *ptime_buf, int num,
 
 bool perf_time__skip_sample(struct perf_time_interval *ptime, u64 timestamp);
 
+bool perf_time__ranges_skip_sample(struct perf_time_interval *ptime_buf,
+  int num, u64 timestamp);
+
 int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz);
 
 int fetch_current_timestamp(char *buf, size_t sz);
-- 
2.7.4



[PATCH v4 6/6] perf script: support time percent and multiple time ranges

2017-10-02 Thread Jin Yao
perf script has a --time option to limit the time range of output.
It only supports absolute time.

Now this option is extended to support multiple time ranges and
support the percent of time.

For example:

1. Select the first and second 10% time slices
   perf script --time 10%/1,10%/2

2. Select from 0% to 10% and 30% to 40% slices
   perf script --time 0%-10%,30%-40%

Change log:
---
v4: Remove perf_time__skip_sample, only uses perf_time__ranges_skip_sample

v3: Since the definitions of first_sample_time/last_sample_time
are moved from perf_session to perf_evlist so change the
related code.

Signed-off-by: Jin Yao 
---
 tools/perf/Documentation/perf-script.txt | 16 
 tools/perf/builtin-script.c  | 25 -
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index bcc1ba3..2c1f2b9 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -321,6 +321,22 @@ include::itrace.txt[]
stop time is not given (i.e, time string is 'x.y,') then analysis goes
to end of file.
 
+   Also support time percent with multiple time ranges. Time string is
+   'a%/n,b%/m,...' or 'a%-b%,c%-d%,...'. The maximum number of slices is 10.
+
+   For example:
+   Select the second 10% time slice
+   perf script --time 10%/2
+
+   Select from 0% to 10% time slice
+   perf script --time 0%-10%
+
+   Select the first and second 10% time slices
+   perf script --time 10%/1,10%/2
+
+   Select from 0% to 10% and 30% to 40% slices
+   perf script --time 0%-10%,30%-40%
+
 --max-blocks::
Set the maximum number of program blocks to print with brstackasm for
each sample.
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 9092de0..a8d5f02 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -1357,6 +1357,8 @@ static void print_sample_synth(struct perf_sample *sample,
}
 }
 
+#define PTIME_RANGE_MAX   10
+
 struct perf_script {
struct perf_tool   tool;
struct perf_session *session;
@@ -1369,7 +1371,8 @@ struct perf_script {
struct thread_map   *threads;
int name_width;
const char  *time_str;
-   struct perf_time_interval ptime;
+   struct perf_time_interval ptime_range[PTIME_RANGE_MAX];
+   int range_num;
 };
 
 static int perf_evlist__max_name_len(struct perf_evlist *evlist)
@@ -1565,8 +1568,10 @@ static int process_sample_event(struct perf_tool *tool,
struct perf_script *scr = container_of(tool, struct perf_script, tool);
struct addr_location al;
 
-   if (perf_time__skip_sample(&scr->ptime, sample->time))
+   if (perf_time__ranges_skip_sample(scr->ptime_range, scr->range_num,
+ sample->time)) {
return 0;
+   }
 
if (debug_mode) {
if (sample->time < last_timestamp) {
@@ -3103,9 +3108,19 @@ int cmd_script(int argc, const char **argv)
goto out_delete;
 
/* needs to be parsed after looking up reference time */
-   if (perf_time__parse_str(&script.ptime, script.time_str) != 0) {
-   pr_err("Invalid time string\n");
-   return -EINVAL;
+   if (perf_time__parse_str(script.ptime_range, script.time_str) != 0) {
+   script.range_num = perf_time__percent_parse_str(
+   script.ptime_range, PTIME_RANGE_MAX,
+   script.time_str,
+   session->evlist->first_sample_time,
+   session->evlist->last_sample_time);
+
+   if (script.range_num < 0) {
+   pr_err("Invalid time string\n");
+   return -EINVAL;
+   }
+   } else {
+   script.range_num = 1;
}
 
err = __cmd_script(&script);
-- 
2.7.4



[PATCH v4 5/6] perf report: support time percent and multiple time ranges

2017-10-02 Thread Jin Yao
perf report has a --time option to limit the time range of output.
It only supports absolute time.

Now this option is extended to support multiple time ranges and
support the percent of time.

For example:

1. Select the first and second 10% time slices
perf report --time 10%/1,10%/2

2. Select from 0% to 10% and 30% to 40% slices
perf report --time 0%-10%,30%-40%

Change log:
---
v4: Remove perf_time__skip_sample, only uses perf_time__ranges_skip_sample

v3: Since the definitions of first_sample_time/last_sample_time
are moved from perf_session to perf_evlist so change the
related code.

Signed-off-by: Jin Yao 
---
 tools/perf/Documentation/perf-report.txt | 16 
 tools/perf/builtin-report.c  | 28 
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 383a98d..3a0975c 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -402,6 +402,22 @@ OPTIONS
stop time is not given (i.e, time string is 'x.y,') then analysis goes
to end of file.
 
+   Also support time percent with multiple time ranges. Time string is
+   'a%/n,b%/m,...' or 'a%-b%,c%-d%,...'. The maximum number of slices is 10.
+
+   For example:
+   Select the second 10% time slice
+   perf report --time 10%/2
+
+   Select from 0% to 10% time slice
+   perf report --time 0%-10%
+
+   Select the first and second 10% time slices
+   perf report --time 10%/1,10%/2
+
+   Select from 0% to 10% and 30% to 40% slices
+   perf report --time 0%-10%,30%-40%
+
 --itrace::
Options for decoding instruction tracing data. The options are:
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f9dff65..ab7ab25 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -51,6 +51,8 @@
 #include 
 #include 
 
+#define PTIME_RANGE_MAX   10
+
 struct report {
struct perf_tool   tool;
struct perf_session *session;
@@ -68,7 +70,8 @@ struct report {
const char  *cpu_list;
const char  *symbol_filter_str;
const char  *time_str;
-   struct perf_time_interval ptime;
+   struct perf_time_interval ptime_range[PTIME_RANGE_MAX];
+   int range_num;
float   min_percent;
u64 nr_entries;
u64 queue_size;
@@ -185,8 +188,10 @@ static int process_sample_event(struct perf_tool *tool,
};
int ret = 0;
 
-   if (perf_time__skip_sample(&rep->ptime, sample->time))
+   if (perf_time__ranges_skip_sample(rep->ptime_range, rep->range_num,
+ sample->time)) {
return 0;
+   }
 
if (machine__resolve(machine, &al, sample) < 0) {
pr_debug("problem processing %d event, skipping it.\n",
@@ -1073,8 +1078,23 @@ int cmd_report(int argc, const char **argv)
if (symbol__init(&session->header.env) < 0)
goto error;
 
-   if (perf_time__parse_str(&report.ptime, report.time_str) != 0) {
-   pr_err("Invalid time string\n");
+   if (perf_time__parse_str(report.ptime_range, report.time_str) != 0) {
+   report.range_num = perf_time__percent_parse_str(
+   report.ptime_range, PTIME_RANGE_MAX,
+   report.time_str,
+   session->evlist->first_sample_time,
+   session->evlist->last_sample_time);
+
+   if (report.range_num < 0) {
+   pr_err("Invalid time string\n");
+   return -EINVAL;
+   }
+   } else {
+   report.range_num = 1;
+   }
+
+   if (report.range_num > 0 && perf_data_file__is_pipe(session->file)) {
+   pr_err("Time percent range is not supported in pipe\n");
return -EINVAL;
}
 
-- 
2.7.4



[PATCH v4 3/6] perf util: Create function to parse time percent

2017-10-02 Thread Jin Yao
Current perf report/script/... have a --time option to limit the time
range of output. But right now it only supports absolute time.

For ease of use, it can now also accept a percentage of the trace time.

For example:

1. Select the second 10% time slice
   perf report --time 10%/2

2. Select from 0% to 10% time slice
   perf report --time 0%-10%

It also supports multiple time ranges.

3. Select the first and second 10% time slices
   perf report --time 10%/1,10%/2

4. Select from 0% to 10% and 30% to 40% slices
   perf report --time 0%-10%,30%-40%
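
As a concrete illustration (the timestamps are assumptions): for a
trace with first_sample_time = 1000 and last_sample_time = 2000,
'10%/2' selects [1000 + 0.1 * 1000, 1000 + 0.2 * 1000] = [1100, 1200],
and '0%-10%' selects [1000, 1100].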

Change log:
---
v4: An issue was found; the following invalid string passed validation:
perf script --time 10%/10x12321xsdfdasfdsafdsafdsa

Now it uses strtol in place of atoi, so trailing garbage is rejected.

Signed-off-by: Jin Yao 
---
 tools/perf/util/time-utils.c | 205 ---
 tools/perf/util/time-utils.h |   3 +
 2 files changed, 196 insertions(+), 12 deletions(-)

diff --git a/tools/perf/util/time-utils.c b/tools/perf/util/time-utils.c
index 5b5d021..79e4281 100644
--- a/tools/perf/util/time-utils.c
+++ b/tools/perf/util/time-utils.c
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include <math.h>
 
 #include "perf.h"
 #include "debug.h"
@@ -59,11 +60,10 @@ static int parse_timestr_sec_nsec(struct perf_time_interval *ptime,
return 0;
 }
 
-int perf_time__parse_str(struct perf_time_interval *ptime, const char *ostr)
+static int split_start_end(char **start, char **end, const char *ostr, char ch)
 {
char *start_str, *end_str;
char *d, *str;
-   int rc = 0;
 
if (ostr == NULL || *ostr == '\0')
return 0;
@@ -73,25 +73,35 @@ int perf_time__parse_str(struct perf_time_interval *ptime, const char *ostr)
if (str == NULL)
return -ENOMEM;
 
-   ptime->start = 0;
-   ptime->end = 0;
-
-   /* str has the format: <start>,<stop>
-    * variations: <start>,
-    *             ,<stop>
-    *             <start>
-    */
start_str = str;
-   d = strchr(start_str, ',');
+   d = strchr(start_str, ch);
if (d) {
*d = '\0';
++d;
}
end_str = d;
 
+   *start = start_str;
+   *end = end_str;
+
+   return 0;
+}
+
+int perf_time__parse_str(struct perf_time_interval *ptime, const char *ostr)
+{
+   char *start_str = NULL, *end_str;
+   int rc;
+
+   rc = split_start_end(&start_str, &end_str, ostr, ',');
+   if (rc || !start_str)
+   return rc;
+
+   ptime->start = 0;
+   ptime->end = 0;
+
rc = parse_timestr_sec_nsec(ptime, start_str, end_str);
 
-   free(str);
+   free(start_str);
 
/* make sure end time is after start time if it was given */
if (rc == 0 && ptime->end && ptime->end < ptime->start)
@@ -103,6 +113,177 @@ int perf_time__parse_str(struct perf_time_interval *ptime, const char *ostr)
return rc;
 }
 
+static int parse_percent(double *pcnt, char *str)
+{
+   char *c;
+
+   c = strchr(str, '%');
+   if (c)
+   *c = '\0';
+   else
+   return -1;
+
+   *pcnt = atof(str) / 100.0;
+
+   return 0;
+}
+
+static int percent_slash_split(char *str, struct perf_time_interval *ptime,
+  u64 start, u64 end)
+{
+   char *p, *end_str;
+   double pcnt, start_pcnt, end_pcnt;
+   u64 total = end - start;
+   int i;
+
+   /*
+* Example:
+* 10%/2: select the second 10% slice
+*/
+
+   /* We can modify this string since the original one is copied */
+   p = strchr(str, '/');
+   if (!p)
+   return -1;
+
+   *p = '\0';
+   if (parse_percent(&pcnt, str) < 0)
+   return -1;
+
+   p++;
+   i = (int)strtol(p, &end_str, 10);
+   if (*end_str)
+   return -1;
+
+   if (pcnt <= 0.0)
+   return -1;
+
+   start_pcnt = pcnt * (i - 1);
+   end_pcnt = pcnt * i;
+
+   if (start_pcnt < 0.0 || start_pcnt > 1.0 ||
+   end_pcnt < 0.0 || end_pcnt > 1.0) {
+   return -1;
+   }
+
+   ptime->start = start + round(start_pcnt * total);
+   ptime->end = start + round(end_pcnt * total);
+
+   return 0;
+}
+
+static int percent_dash_split(char *str, struct perf_time_interval *ptime,
+ u64 start, u64 end)
+{
+   char *start_str = NULL, *end_str;
+   double start_pcnt, end_pcnt;
+   u64 total = end - start;
+   int ret;
+
+   /*
+* Example: 0%-10%
+*/
+
+   ret = split_start_end(&start_str, &end_str, str, '-');
+   if (ret || !start_str)
+   return ret;
+
+   if ((parse_percent(&start_pcnt, start_str) != 0) ||
+   (parse_percent(&end_pcnt, end_str) != 0)) {
+   free(start_str);
+   return -1;
+   }
+
+   free(start_str);
+
+   if (start_pcnt < 0.0 || start_pcnt > 1.0 ||
+   end_pcnt < 0.0 || end_pcnt > 1.0 ||

[PATCH v4 0/6] perf report/script: Support percent and multiple range in --time option

2017-10-02 Thread Jin Yao
v4:
---
1. Use perf script time style for timestamp printing. Also add
   the printing of the sample duration. For example:

   perf report --header

   time of first sample : 5276531.323099
   time of last sample : 5276555.345625
   sample duration :  24022.526 ms

2. Fix an invalid time string issue. For example,

   perf script --time 10%/10x12321xsdfdasfdsafdsafdsa

   Now in code, it uses strtol to replace atoi.

3. Remove perf_time__skip_sample, only uses perf_time__ranges_skip_sample
   in perf report/perf script.

v3:
---
1. Move the definitions of first_sample_time/last_sample_time from
   perf_session and struct record to perf_evlist and update the
   related code.

v2:
---
1. This patch creates a new header feature type HEADER_SAMPLE_TIME and related
   ops. Save the first sample time and the last sample time to the feature
   section in perf file header.

2. Add checking for last element in time range.

   For example, select the first and second 10% time slices.
   perf report --time 10%/1,10%/2

   Note that now it includes the last element in [10%, 20%] but it
   doesn't include the last element in [0%, 10%). This avoids
   overlap between adjacent slices.
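
   As a rough sketch of that boundary rule (a hypothetical helper, not
   the series' exact perf_time__ranges_skip_sample()): every slice is
   treated as half-open [start, end) except the last one, which is
   closed, so adjacent slices never claim the same sample twice:

   #include <stdio.h>

   struct time_range { unsigned long long start, end; };

   /* Return 1 if timestamp t falls outside every selected range. */
   static int ranges_skip_sample(const struct time_range *r, int num,
                                 unsigned long long t)
   {
           int i;

           for (i = 0; i < num; i++) {
                   int last = (i == num - 1);

                   /* last range is [start, end]; earlier ones [start, end) */
                   if (t >= r[i].start &&
                       (last ? t <= r[i].end : t < r[i].end))
                           return 0;       /* keep the sample */
           }
           return 1;                       /* skip it */
   }

   int main(void)
   {
           struct time_range r[] = { { 0, 100 }, { 100, 200 } };

           /* 100 kept by 2nd slice only, 200 kept (closed end), 201 skipped */
           printf("%d %d %d\n", ranges_skip_sample(r, 2, 100),
                  ranges_skip_sample(r, 2, 200),
                  ranges_skip_sample(r, 2, 201));
           return 0;
   }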

Following patches are changed:

   perf header: Record first sample time and last sample time in perf file header
   perf record: Get the first sample time and last sample time
   perf util: Create function to perform multiple time range checking

v1: initial post


Current perf report/script/... have a --time option to limit the time
range of output. But it only supports absolute time.

This patch series extends the option to support a percentage of time
and multiple time ranges.

For example:

1. Select the second 10% time slice
   perf report --time 10%/2

2. Select from 0% to 10% time slice
   perf report --time 0%-10%

It also supports multiple time ranges.

3. Select the first and second 10% time slices
   perf report --time 10%/1,10%/2

4. Select from 0% to 10% and 30% to 40% slices
   perf report --time 0%-10%,30%-40%

Jin Yao (6):
  perf header: Record first sample time and last sample time in perf
file header
  perf record: Get the first sample time and last sample time
  perf util: Create function to parse time percent
  perf util: Create function to perform multiple time range checking
  perf report: support time percent and multiple time ranges
  perf script: support time percent and multiple time ranges

 tools/perf/Documentation/perf-report.txt   |  16 ++
 tools/perf/Documentation/perf-script.txt   |  16 ++
 tools/perf/Documentation/perf.data-file-format.txt |   4 +
 tools/perf/builtin-record.c|   5 +
 tools/perf/builtin-report.c|  28 ++-
 tools/perf/builtin-script.c|  25 ++-
 tools/perf/util/evlist.h   |   2 +
 tools/perf/util/header.c   |  60 ++
 tools/perf/util/header.h   |   1 +
 tools/perf/util/time-utils.c   | 233 +++--
 tools/perf/util/time-utils.h   |   6 +
 11 files changed, 375 insertions(+), 21 deletions(-)

-- 
2.7.4



Re: [PATCH] tee: ACPI support for optee driver

2017-10-02 Thread Jon Masters
On 09/22/2017 05:37 AM, Lorenzo Pieralisi wrote:
> On Thu, Sep 21, 2017 at 03:45:28PM +0800, Hanjun Guo wrote:
>> On 2017/9/21 15:12, Mayuresh Chitale wrote:
>>> This patch modifies the optee driver to add support for parsing
>>> the conduit method from an ACPI node.
>>
>> Sorry I didn't involve this earlier, but I think this is a wrong
>> approach, in ACPI 5.1+ spec, there is a bit in FADT table which
>> indicates PSCI using SMC or HVC, please see ACPI 6.2 [1],
>> Table 5-37 Fixed ACPI Description Table ARM Boot Architecture Flags.
>>
>> Can we just use that to get the conduit method for optee driver too?
>>
>> [1]: http://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf
> 
> It is just not a matter of conduit method but also to define how OPTEE
> should be detected. It is up to Linaro (who owns OP-TEE) to put forward
> a binding at ACPI (ARM) spec level.
> 
> We do not define ACPI bindings on a kernel mailing list.
> 
> NAK on this patch.

Is this actively being tracked by Linaro? If not, can Applied folks ping
me off-list and I will proxy your request into Linaro.

Jon.


-- 
Computer Architect | Sent from my Fedora powered laptop


[PATCH V2] firmware: tegra: add BPMP debugfs support

2017-10-02 Thread Timo Alho
Tegra power management firmware running on co-processor (BPMP)
implements a simple pseudo file system akin to debugfs. The file
system can be used for debugging purposes to examine and change the
status of selected resources controlled by the firmware (such as
clocks, resets, voltages, powergates, ...).

Add support to "mirror" the firmware's file system to debugfs. At
boot, query firmware for a list of all possible files and create
corresponding debugfs entries. Read/write of individual files is
implemented by sending a Message ReQuest (MRQ) that passes the full
file path name and data to firmware via DRAM.
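
As a rough sketch of the read path (hypothetical names:
bpmp_debug_mrq_read() stands in for the MRQ transport, and the file is
assumed to be registered with simple_open so private_data carries the
bpmp handle; the real code follows in the patch below):

/* Sketch only: forward a debugfs read to the firmware's file. */
static ssize_t bpmp_debug_read(struct file *file, char __user *ubuf,
                               size_t count, loff_t *ppos)
{
        struct tegra_bpmp *bpmp = file->private_data;
        char pathbuf[256], data[256];
        const char *path;
        ssize_t nbytes;

        /* resolve this debugfs dentry back to the firmware path name */
        path = dentry_path(file->f_path.dentry, pathbuf, sizeof(pathbuf));
        if (IS_ERR(path))
                return PTR_ERR(path);

        /* hypothetical helper: one MRQ carrying path + DRAM data buffer */
        nbytes = bpmp_debug_mrq_read(bpmp, path, data, sizeof(data));
        if (nbytes < 0)
                return nbytes;

        return simple_read_from_buffer(ubuf, count, ppos, data, nbytes);
}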

Signed-off-by: Timo Alho 
---
Changes in v2:
 - Address Jonathan's review feedback
  * restructure error printing and which error codes are passed to
    higher layers
  * don't use IS_ERR_OR_NULL()
  * avoid overwriting the last character of the filename in one corner case
(name length = 255)
  
 drivers/firmware/tegra/Makefile   |   4 +-
 drivers/firmware/tegra/bpmp.c |   4 +
 drivers/firmware/tegra/bpmp_debugfs.c | 444 ++
 include/soc/tegra/bpmp.h  |  14 ++
 4 files changed, 465 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/tegra/bpmp_debugfs.c

diff --git a/drivers/firmware/tegra/Makefile b/drivers/firmware/tegra/Makefile
index e34a2f7..0314568 100644
--- a/drivers/firmware/tegra/Makefile
+++ b/drivers/firmware/tegra/Makefile
@@ -1,2 +1,4 @@
-obj-$(CONFIG_TEGRA_BPMP)   += bpmp.o
+tegra-bpmp-y   = bpmp.o
+tegra-bpmp-$(CONFIG_DEBUG_FS)  += bpmp_debugfs.o
+obj-$(CONFIG_TEGRA_BPMP)   += tegra-bpmp.o
 obj-$(CONFIG_TEGRA_IVC) += ivc.o
diff --git a/drivers/firmware/tegra/bpmp.c b/drivers/firmware/tegra/bpmp.c
index 73ca55b..d29c593 100644
--- a/drivers/firmware/tegra/bpmp.c
+++ b/drivers/firmware/tegra/bpmp.c
@@ -824,6 +824,10 @@ static int tegra_bpmp_probe(struct platform_device *pdev)
if (err < 0)
goto free_mrq;
 
+   err = tegra_bpmp_init_debugfs(bpmp);
+   if (err < 0)
+   dev_err(&pdev->dev, "debugfs initialization failed: %d\n", err);
+
return 0;
 
 free_mrq:
diff --git a/drivers/firmware/tegra/bpmp_debugfs.c b/drivers/firmware/tegra/bpmp_debugfs.c
new file mode 100644
index 000..f7f6a0a
--- /dev/null
+++ b/drivers/firmware/tegra/bpmp_debugfs.c
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+struct seqbuf {
+   char *buf;
+   size_t pos;
+   size_t size;
+};
+
+static void seqbuf_init(struct seqbuf *seqbuf, void *buf, size_t size)
+{
+   seqbuf->buf = buf;
+   seqbuf->size = size;
+   seqbuf->pos = 0;
+}
+
+static size_t seqbuf_avail(struct seqbuf *seqbuf)
+{
+   return seqbuf->pos < seqbuf->size ? seqbuf->size - seqbuf->pos : 0;
+}
+
+static int seqbuf_status(struct seqbuf *seqbuf)
+{
+   return seqbuf->pos <= seqbuf->size ? 0 : -EOVERFLOW;
+}
+
+static int seqbuf_eof(struct seqbuf *seqbuf)
+{
+   return seqbuf->pos >= seqbuf->size;
+}
+
+static int seqbuf_read(struct seqbuf *seqbuf, void *buf, size_t nbyte)
+{
+   nbyte = min(nbyte, seqbuf_avail(seqbuf));
+   memcpy(buf, seqbuf->buf + seqbuf->pos, nbyte);
+   seqbuf->pos += nbyte;
+   return seqbuf_status(seqbuf);
+}
+
+static int seqbuf_read_u32(struct seqbuf *seqbuf, uint32_t *v)
+{
+   int err;
+
+   err = seqbuf_read(seqbuf, v, 4);
+   *v = le32_to_cpu(*v);
+   return err;
+}
+
+static int seqbuf_read_str(struct seqbuf *seqbuf, const char **str)
+{
+   *str = seqbuf->buf + seqbuf->pos;
+   seqbuf->pos += strnlen(*str, seqbuf_avail(seqbuf));
+   seqbuf->pos++;
+   return seqbuf_status(seqbuf);
+}
+
+static void seqbuf_seek(struct seqbuf *seqbuf, ssize_t offset)
+{
+   seqbuf->pos += offset;
+}
+
+/* map filename in Linux debugfs to corresponding entry in BPMP */
+static const char *get_filename(struct tegra_bpmp *bpmp,
+   const struct file *file, char *buf, int size)
+{
+   char root_path_buf[512];
+   const char *root_path;
+   const char *filename;
+   size_t root_len;
+
+   root_path = dentry_path(bpmp->debugfs_mirror, root_path_buf,
+   sizeof(root_path_buf));
+   if (IS_ERR(root_path))
+   return NULL;
+
+   root_len = strlen(root_path);
+
+   filename = dentry_path(file->f_path.dentry, buf, size);
+   if (IS_ERR(filename))
+   

Re: [PATCH v2 3/5] clk: aspeed: Add platform driver and register PLLs

2017-10-02 Thread Joel Stanley
On Tue, Oct 3, 2017 at 6:54 AM, Stephen Boyd  wrote:
> On 09/21, Joel Stanley wrote:
>> @@ -160,6 +191,104 @@ static struct clk_hw *aspeed_calc_pll(const char 
>> *name, u32 val)
>> + /*
>> +  * Memory controller (M-PLL) PLL. This clock is configured by the
>> +  * bootloader, and is exposed to Linux as a read-only clock rate.
>> +  */
>> + regmap_read(map, ASPEED_MPLL_PARAM, &val);
>> + aspeed_clk_data->hws[ASPEED_CLK_MPLL] = aspeed_calc_pll("mpll", val);
>> +
>> + /* SD/SDIO clock divider (TODO: There's a gate too) */
>> + hw = clk_hw_register_divider_table(NULL, "sdio", "hpll", 0,
>
> Please pass your dev pointer here from the platform device.
>
>> + scu_base + ASPEED_CLK_SELECTION, 12, 3, 0,
>> + div_table,
>> + &aspeed_clk_lock);
>
> And check for errors? Perhaps use devm_clk_hw_regsiter() APIs and
> construct the dividers and muxes directly instead of using the
> basic type registration APIs.

Do you think that devm_ is overkill, given we will never unload this driver?

Can you explain why you suggest to construct the structures directly
instead of using the APIs?

I had a read of the basic type registration functions, and the
relevant failure paths are memory allocation failures. If we're out of
memory that early in boot then things have gone pretty bad.

I can add checks for null and bail out; I don't think there's value in
freeing the allocated memory: if a system can't load its clock driver
then it's super hosed.

Thanks for the review. I fixed all of the other things you mentioned.

Cheers,

Joel


Re: [PATCH v2 4/5] clk: aspeed: Register gated clocks

2017-10-02 Thread Joel Stanley
On Tue, Oct 3, 2017 at 7:07 AM, Stephen Boyd  wrote:
> On 09/21, Joel Stanley wrote:
>> The majority of the clocks in the system are gates paired with a reset
>> controller that holds the IP in reset.
>>
>> This borrows from clk_hw_register_gate, but registers two 'gates', one
>> to control the clock enable register and the other to control the reset
>> IP. This allows us to enforce the ordering:
>>
>>  1. Place IP in reset
>>  2. Enable clock
>>  3. Delay
>>  4. Release reset
>>
>> There are some gates that do not have an associated reset; these are
>> handled by using -1 as the index for the reset.
>
> Half of these aren't clks, but reset bits? Perhaps you should
> register power domains for these things and then have the power
> domain assert the reset, enable the clock, delay, and then
> release the reset in the poweron callback. Then device drivers
> don't have to be aware of the ordering, and you can keep
> enforcing the ordering in one place, but we don't have to make
> fake gates and shoehorn the sequence into the clk framework.

I had a look. We've got 24 gates being registered, and half are just
gate+reset pairs, while the other half are both. Some of them have
clocks with divisors upstream of the gate, so in those cases we do
care what the rate is. For the others they are simply bringing the IP
online.

Note that we don't have drivers for all of the peripherals, so things
may change once those drivers are written.

I hadn't looked into the power domain stuff - when I brought this up
on the list recently Philip suggested that this should be contained in
a clk driver, so that's the direction we went:

 http://www.spinics.net/lists/linux-clk/msg18733.html

Cheers,

Joel


Re: [PATCH] mm,hugetlb,migration: don't migrate kernelcore hugepages

2017-10-02 Thread Alexandru Moise
On Mon, Oct 02, 2017 at 06:15:00PM +0200, Michal Hocko wrote:
> On Mon 02-10-17 17:06:38, Alexandru Moise wrote:
> > On Mon, Oct 02, 2017 at 04:27:17PM +0200, Michal Hocko wrote:
> > > On Mon 02-10-17 16:06:33, Alexandru Moise wrote:
> > > > On Mon, Oct 02, 2017 at 02:54:32PM +0200, Michal Hocko wrote:
> > > > > On Mon 02-10-17 00:51:11, Alexandru Moise wrote:
> > > > > > This attempts to bring more flexibility to how hugepages are 
> > > > > > allocated
> > > > > > by making it possible to decide whether we want the hugepages to be
> > > > > > allocated from ZONE_MOVABLE or to the zone allocated by the 
> > > > > > "kernelcore="
> > > > > > boot parameter for non-movable allocations.
> > > > > > 
> > > > > > A new boot parameter is introduced, "hugepages_movable=", this sets 
> > > > > > the
> > > > > > default value for the "hugepages_treat_as_movable" sysctl. This 
> > > > > > allows
> > > > > > us to determine the zone for hugepages allocated at boot time. It 
> > > > > > only
> > > > > > affects 2M hugepages allocated at boot time for now because 1G
> > > > > > hugepages are allocated much earlier in the boot process and ignore
> > > > > > this sysctl completely.
> > > > > > 
> > > > > > The "hugepages_treat_as_movable" sysctl is also turned into a 
> > > > > > mandatory
> > > > > > setting that all hugepage allocations at runtime must respect (both
> > > > > > 2M and 1G sized hugepages). The default value is changed to "1" to
> > > > > > preserve the existing behavior that if hugepage migration is 
> > > > > > supported,
> > > > > > then the pages will be allocated from ZONE_MOVABLE.
> > > > > > 
> > > > > > Note however if not enough contiguous memory is present in 
> > > > > > ZONE_MOVABLE
> > > > > > then the allocation will fallback to the non-movable zone and those
> > > > > > pages will not be migratable.
> > > > > 
> > > > > This changelog doesn't explain _why_ we would need something like 
> > > > > that.
> > > > > 
> > > > 
> > > > So people shouldn't be able to choose whether their hugepages should be
> > > > migratable or not?
> > > 
> > > How are hugetlb pages any different from THP wrt. migrateability POV? Or
> > > any other mapped memory to the userspace in general?
> > 
> > THP shares more with regular userspace mapped memory than with hugetlbfs 
> > pages.
> > They have separate codepaths in migrate_pages().
> 
> That is a mere implementation detail. You are right that THP shares more
> with regular userspace memory because it is transparent from the
> configuration POV but that has nothing to do with page migration AFAICS.
> 
> > And no one ever sets the movable
> > flag on a hugetlbfs mapping, so even though __PageMovable(hpage) on a 
> > hugetlbfs
> > page returns false, it will still move.
> 
> __PageMovable is a completely unrelated thing. It is for pages which are
> !LRU but still movable.
> 
> > 
> > > 
> > > > Maybe they consider some of their applications more important than
> > > > others.
> > > 
> > > I do not understand this part.
> > > 
> > > > Say:
> > > > You have a large number of correctable errors on a subpage of a compound
> > > > page. So you copy the contents of the page to another hugepage, break 
> > > > the
> > > > original page and offline the subpage. 
> > > 
> > > I suspect you have HWPoisoning in mind right?
> > 
> > No, rather soft offlining. 
> 
> I thought this is the same thing.
> 
> > > > But maybe you'd rather that some of
> > > > your hugepages not be broken and moved because you're not that worried 
> > > > about
> > > > memory corruption, but more about availability.
> > > 
> > > Could you be more specific please?
> > 
> > You can have a platform with reliable DIMM modules and a platform with less 
> > reliable
> > DIMM modules. So you would prefer to inhibit hugepage migration on the 
> > platform with
> > reliable DIMM modules that you know will behave ok even under a high number 
> > of 
> > correctable memory errors. tools like mcelog however are not hugepage aware 
> > and
> > cannot be told "if this PFN is part of a hugepage, don't try to soft 
> > offline it",
> > rather deciding which PFNs should be unmovable should be done in the kernel,
> > but it should still be controllable by the administrator.
> 
> This sounds like a userspace policy that should be handled outside of
> the kernel.
> 
> > For hugetlbfs pages in particular, this behavior is not present, without 
> > this patch.
> > 
> > > 
> > > > Without this patch even if hugepages are in the non-movable zone, they 
> > > > move.
> > > 
> > > which is ok. This is very same with any other movable allocations.
> > 
> > So you can have movable pages in the non-movable kernel zone?
> 
> yes. Most configuration even do not have any movable zone unless
> explicitly configured.
> 
> > > > > > The implementation is a bit dirty so obviously I'm open to 
> > > > > > suggestions
> > > > > > for a better way to implement this behavior, or comments whether 
> > > > > > the whole
> > > > > > idea i

[PATCH v10] vfio: platform: reset: Add Broadcom FlexRM reset module

2017-10-02 Thread Anup Patel
This patch adds Broadcom FlexRM low-level reset for
VFIO platform.

It will do the following:
1. Disable/Deactivate each FlexRM ring
2. Flush each FlexRM ring

The cleanup sequence for FlexRM rings is adapted from
Broadcom FlexRM mailbox driver.

Signed-off-by: Anup Patel 
Reviewed-by: Oza Oza 
Reviewed-by: Scott Branden 
Reviewed-by: Eric Auger 
---
 drivers/vfio/platform/reset/Kconfig|   9 ++
 drivers/vfio/platform/reset/Makefile   |   1 +
 .../vfio/platform/reset/vfio_platform_bcmflexrm.c  | 113 +
 3 files changed, 123 insertions(+)
 create mode 100644 drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c

diff --git a/drivers/vfio/platform/reset/Kconfig b/drivers/vfio/platform/reset/Kconfig
index 705..392e3c0 100644
--- a/drivers/vfio/platform/reset/Kconfig
+++ b/drivers/vfio/platform/reset/Kconfig
@@ -13,3 +13,12 @@ config VFIO_PLATFORM_AMDXGBE_RESET
  Enables the VFIO platform driver to handle reset for AMD XGBE
 
  If you don't know what to do here, say N.
+
+config VFIO_PLATFORM_BCMFLEXRM_RESET
+   tristate "VFIO support for Broadcom FlexRM reset"
+   depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST)
+   default ARCH_BCM_IPROC
+   help
+ Enables the VFIO platform driver to handle reset for Broadcom FlexRM
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/platform/reset/Makefile b/drivers/vfio/platform/reset/Makefile
index 93f4e23..8d9874b 100644
--- a/drivers/vfio/platform/reset/Makefile
+++ b/drivers/vfio/platform/reset/Makefile
@@ -5,3 +5,4 @@ ccflags-y += -Idrivers/vfio/platform
 
 obj-$(CONFIG_VFIO_PLATFORM_CALXEDAXGMAC_RESET) += vfio-platform-calxedaxgmac.o
 obj-$(CONFIG_VFIO_PLATFORM_AMDXGBE_RESET) += vfio-platform-amdxgbe.o
+obj-$(CONFIG_VFIO_PLATFORM_BCMFLEXRM_RESET) += vfio_platform_bcmflexrm.o
diff --git a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
new file mode 100644
index 000..d45c3be
--- /dev/null
+++ b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2017 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * This driver provides reset support for Broadcom FlexRM ring manager
+ * to VFIO platform.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_platform_private.h"
+
+/* FlexRM configuration */
+#define RING_REGS_SIZE 0x10000
+#define RING_VER_MAGIC 0x76303031
+
+/* Per-Ring register offsets */
+#define RING_VER   0x000
+#define RING_CONTROL   0x034
+#define RING_FLUSH_DONE 0x038
+
+/* Register RING_CONTROL fields */
+#define CONTROL_FLUSH_SHIFT 5
+
+/* Register RING_FLUSH_DONE fields */
+#define FLUSH_DONE_MASK 0x1
+
+static int vfio_platform_bcmflexrm_shutdown(void __iomem *ring)
+{
+   unsigned int timeout;
+
+   /* Disable/inactivate ring */
+   writel_relaxed(0x0, ring + RING_CONTROL);
+
+   /* Set ring flush state */
+   timeout = 1000; /* timeout of 1s */
+   writel_relaxed(BIT(CONTROL_FLUSH_SHIFT), ring + RING_CONTROL);
+   do {
+   if (readl_relaxed(ring + RING_FLUSH_DONE) &
+   FLUSH_DONE_MASK)
+   break;
+   mdelay(1);
+   } while (--timeout);
+   if (!timeout)
+   return -ETIMEDOUT;
+
+   /* Clear ring flush state */
+   timeout = 1000; /* timeout of 1s */
+   writel_relaxed(0x0, ring + RING_CONTROL);
+   do {
+   if (!(readl_relaxed(ring + RING_FLUSH_DONE) &
+ FLUSH_DONE_MASK))
+   break;
+   mdelay(1);
+   } while (--timeout);
+   if (!timeout)
+   return -ETIMEDOUT;
+
+   return 0;
+}
+
+static int vfio_platform_bcmflexrm_reset(struct vfio_platform_device *vdev)
+{
+   void __iomem *ring;
+   int rc = 0, ret = 0, ring_num = 0;
+   struct vfio_platform_region *reg = &vdev->regions[0];
+
+   /* Map FlexRM ring registers if not mapped */
+   if (!reg->ioaddr) {
+   reg->ioaddr = ioremap_nocache(reg->addr, reg->size);
+   if (!reg->ioaddr)
+   return -ENOMEM;
+   }
+
+   /* Discover and shutdown each FlexRM ring */
+   for (ring = reg->ioaddr;
+

[PATCH v10] FlexRM support in VFIO platform

2017-10-02 Thread Anup Patel
This patchset primarily adds Broadcom FlexRM reset module for
VFIO platform driver.

The patches are based on Linux-4.14-rc3 and can also be
found at flexrm-vfio-v10 branch of
https://github.com/Broadcom/arm64-linux.git

Changes since v9:
 - Make GPL comment header similar to other Broadcom drivers

Changes since v8:
 - Add missing "ring_num++" in vfio_platform_bcmflexrm_reset()

Changes since v7:
 - Use "ret |= rc" instead of "ret = rc" in
   vfio_platform_bcmflexrm_reset()

Changes since v6:
 - Update the FlexRM ring flush sequence as suggested
   by HW folks
 - Shut down all FlexRM rings anyway, even if the flush fails on
   any of them
 - Use dev_warn() instead of pr_warn()

Changes since v5:
 - Make kconfig option VFIO_PLATFORM_BCMFLEXRM_RESET
   default to ARCH_BCM_IPROC

Changes since v4:
 - Use "--timeout" instead of "timeout--" in
   vfio_platform_bcmflexrm_shutdown()

Changes since v3:
 - Improve "depends on" for Kconfig option
   VFIO_PLATFORM_BCMFLEXRM_RESET
 - Fix typo in pr_warn() called by
   vfio_platform_bcmflexrm_shutdown()
 - Return error from vfio_platform_bcmflexrm_shutdown()
   when FlexRM ring flush timeout happens

Changes since v2:
 - Remove PATCH1 because fixing VFIO no-IOMMU mode is
   a separate topic

Changes since v1:
 - Remove iommu_present() check in vfio_iommu_group_get()
 - Drop PATCH1-to-PATCH3 because IOMMU_CAP_BYPASS is not
   required
 - Move additional comments out of license header in
   vfio_platform_bcmflexrm.c

Anup Patel (1):
  vfio: platform: reset: Add Broadcom FlexRM reset module

 drivers/vfio/platform/reset/Kconfig|   9 ++
 drivers/vfio/platform/reset/Makefile   |   1 +
 .../vfio/platform/reset/vfio_platform_bcmflexrm.c  | 113 +
 3 files changed, 123 insertions(+)
 create mode 100644 drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c

-- 
2.7.4



[PATCH v4 3/6] mtd: spi-nor: cadence-quadspi: Add new binding to enable loop-back circuit

2017-10-02 Thread Vignesh R
Cadence QSPI IP has an adapted loop-back circuit which can be enabled by
setting BYPASS field to 0 in READCAPTURE register. It enables use of
QSPI return clock to latch the data rather than the internal QSPI
reference clock. For high speed operations, adapted loop-back circuit
using QSPI return clock helps to increase data valid window.
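
In driver terms, enabling the circuit amounts to clearing the BYPASS
field; a sketch (the register offset and bit position are assumptions
based on the Cadence READCAPTURE layout, not quoted from this patch):

/* Sketch only: latch read data off the returned clock. */
static void enable_rclk_loopback(void __iomem *reg_base)
{
        unsigned int reg;

        reg = readl(reg_base + CQSPI_REG_READCAPTURE);   /* assumed 0x10 */
        reg &= ~(1 << CQSPI_REG_READCAPTURE_BYPASS_LSB); /* assumed bit 0 */
        writel(reg, reg_base + CQSPI_REG_READCAPTURE);
}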

Add the DT parameter cdns,rclk-en to enable the adapted loop-back circuit
for boards which do provide the QSPI return clock. Update the binding
documentation for the same.

Signed-off-by: Vignesh R 
Acked-by: Rob Herring 
---
 Documentation/devicetree/bindings/mtd/cadence-quadspi.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
index 7dbe3bd9ac56..bb2075df9b38 100644
--- a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
+++ b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
@@ -16,6 +16,9 @@ Required properties:
 
 Optional properties:
 - cdns,is-decoded-cs : Flag to indicate whether decoder is used or not.
+- cdns,rclk-en : Flag to indicate that QSPI return clock is used to latch
+  the read data rather than the QSPI clock. Make sure that QSPI return
+  clock is populated on the board before using this property.
 
 Optional subnodes:
 Subnodes of the Cadence Quad SPI controller are spi slave nodes with additional
-- 
2.14.1



RE: [PATCH v2 3/6] staging: fsl-dpaa2/ethsw: Add ethtool support

2017-10-02 Thread Razvan Stefanescu


> -Original Message-
> From: Andrew Lunn [mailto:and...@lunn.ch]
> Sent: Monday, October 02, 2017 18:37
> To: Razvan Stefanescu 
> Cc: gre...@linuxfoundation.org; de...@driverdev.osuosl.org; linux-
> ker...@vger.kernel.org; net...@vger.kernel.org; ag...@suse.de;
> a...@arndb.de; Alexandru Marginean ;
> Bogdan Purcareata ; Ruxandra Ioana Radulescu
> ; Laurentiu Tudor ;
> stuyo...@gmail.com
> Subject: Re: [PATCH v2 3/6] staging: fsl-dpaa2/ethsw: Add ethtool support
> 
> Hi Razvan
> 
> > +static void ethsw_get_drvinfo(struct net_device *netdev,
> > + struct ethtool_drvinfo *drvinfo)
> > +{
> > +   struct ethsw_port_priv *port_priv = netdev_priv(netdev);
> > +   u16 version_major, version_minor;
> > +   int err;
> > +
> > +   strlcpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver));
> > +   strlcpy(drvinfo->version, ethsw_drv_version, sizeof(drvinfo->version));
> 
> Software driver versions are mostly useless. I would suggest you
> remove this.
> 
>Andrew
Thank you. I'll remove it in v3.

Best regards,
Razvan S.


[PATCH 0/5] Few more FlexRM driver improvements

2017-10-02 Thread Anup Patel
This patchset makes a few more improvements to the Broadcom FlexRM mailbox
driver.

The patches are based on Linux-4.14-rc3 and can also be found at
flexrm-imp2-v1 branch of:
https://github.com/Broadcom/arm64-linux.git

Anup Patel (4):
  mailbox: bcm-flexrm-mailbox: Fix FlexRM ring flush sequence
  mailbox: bcm-flexrm-mailbox: Print ring number in errors and warnings
  mailbox: bcm-flexrm-mailbox: Use common GPL comment header
  mailbox: Build Broadcom FlexRM driver as loadable module for iProc
SOCs

Scott Branden (1):
  mailbox: bcm-flexrm-mailbox: add depends on ARCH_BCM_IPROC

 drivers/mailbox/Kconfig  |  3 +-
 drivers/mailbox/bcm-flexrm-mailbox.c | 66 ++--
 2 files changed, 51 insertions(+), 18 deletions(-)

-- 
2.7.4



[PATCH 1/5] mailbox: bcm-flexrm-mailbox: Fix FlexRM ring flush sequence

2017-10-02 Thread Anup Patel
As per the suggestion from the FlexRM HW folks, we have to first set
FlexRM ring flush state and then clear it for FlexRM ring flush
to work properly.

Currently, the FlexRM driver has an incomplete FlexRM ring flush
sequence which causes repeated insmod+rmmod of mailbox client
drivers to fail.

This patch fixes FlexRM ring flush sequence in flexrm_shutdown()
as described above.

Fixes: dbc049eee730 ("mailbox: Add driver for Broadcom FlexRM
ring manager")

Signed-off-by: Anup Patel 
Reviewed-by: Scott Branden 
Cc: sta...@vger.kernel.org
---
 drivers/mailbox/bcm-flexrm-mailbox.c | 22 +++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/mailbox/bcm-flexrm-mailbox.c b/drivers/mailbox/bcm-flexrm-mailbox.c
index ae61463..f052a3e 100644
--- a/drivers/mailbox/bcm-flexrm-mailbox.c
+++ b/drivers/mailbox/bcm-flexrm-mailbox.c
@@ -1365,8 +1365,8 @@ static void flexrm_shutdown(struct mbox_chan *chan)
/* Disable/inactivate ring */
writel_relaxed(0x0, ring->regs + RING_CONTROL);
 
-   /* Flush ring with timeout of 1s */
-   timeout = 1000;
+   /* Set ring flush state */
+   timeout = 1000; /* timeout of 1s */
writel_relaxed(BIT(CONTROL_FLUSH_SHIFT),
ring->regs + RING_CONTROL);
do {
@@ -1374,7 +1374,23 @@ static void flexrm_shutdown(struct mbox_chan *chan)
FLUSH_DONE_MASK)
break;
mdelay(1);
-   } while (timeout--);
+   } while (--timeout);
+   if (!timeout)
+   dev_err(ring->mbox->dev,
+   "setting ring%d flush state timedout\n", ring->num);
+
+   /* Clear ring flush state */
+   timeout = 1000; /* timeout of 1s */
+   writel_relaxed(0x0, ring->regs + RING_CONTROL);
+   do {
+   if (!(readl_relaxed(ring->regs + RING_FLUSH_DONE) &
+ FLUSH_DONE_MASK))
+   break;
+   mdelay(1);
+   } while (--timeout);
+   if (!timeout)
+   dev_err(ring->mbox->dev,
+   "clearing ring%d flush state timedout\n", ring->num);
 
/* Abort all in-flight requests */
for (reqid = 0; reqid < RING_MAX_REQ_COUNT; reqid++) {
-- 
2.7.4



[PATCH 1/4] dmaengine: bcm-sba-raid: serialize dma_cookie_complete() using reqs_lock

2017-10-02 Thread Anup Patel
As per the documentation in drivers/dma/dmaengine.h, the
dma_cookie_complete() API should be called with the lock
held.

This patch ensures that Broadcom SBA RAID driver calls
the dma_cookie_complete() API with reqs_lock held.

Signed-off-by: Anup Patel 
Reviewed-by: Ray Jui 
Reviewed-by: Scott Branden 
---
 drivers/dma/bcm-sba-raid.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/dma/bcm-sba-raid.c b/drivers/dma/bcm-sba-raid.c
index 6c2c447..15c5585 100644
--- a/drivers/dma/bcm-sba-raid.c
+++ b/drivers/dma/bcm-sba-raid.c
@@ -442,7 +442,9 @@ static void sba_process_received_request(struct sba_device *sba,
 
WARN_ON(tx->cookie < 0);
if (tx->cookie > 0) {
+   spin_lock_irqsave(&sba->reqs_lock, flags);
dma_cookie_complete(tx);
+   spin_unlock_irqrestore(&sba->reqs_lock, flags);
dmaengine_desc_get_callback_invoke(tx, NULL);
dma_descriptor_unmap(tx);
tx->callback = NULL;
-- 
2.7.4



[PATCH 0/4] Few more SBA RAID driver improvements

2017-10-02 Thread Anup Patel
This patchset makes a few more improvements to the Broadcom SBA RAID
driver.

The patches are based on Linux-4.14-rc3 and can also be found
at sba-raid-imp2-v1 branch of:
https://github.com/Broadcom/arm64-linux.git

Anup Patel (4):
  dmaengine: bcm-sba-raid: serialize dma_cookie_complete() using
reqs_lock
  dmaengine: bcm-sba-raid: Use only single mailbox channel
  dmaengine: bcm-sba-raid: Use common GPL comment header
  dmaengine: Build bcm-sba-raid driver as loadable module for iProc SoCs

 drivers/dma/Kconfig|   2 +-
 drivers/dma/bcm-sba-raid.c | 117 ++---
 2 files changed, 38 insertions(+), 81 deletions(-)

-- 
2.7.4



[PATCH 3/4] dmaengine: bcm-sba-raid: Use common GPL comment header

2017-10-02 Thread Anup Patel
This patch makes the comment header of Broadcom SBA RAID driver
similar to the GPL comment header used across Broadcom driver
sources.

Signed-off-by: Anup Patel 
---
 drivers/dma/bcm-sba-raid.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/bcm-sba-raid.c b/drivers/dma/bcm-sba-raid.c
index 409da59..3956a01 100644
--- a/drivers/dma/bcm-sba-raid.c
+++ b/drivers/dma/bcm-sba-raid.c
@@ -1,9 +1,14 @@
 /*
  * Copyright (C) 2017 Broadcom
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  */
 
 /*
-- 
2.7.4



[PATCH 2/4] dmaengine: bcm-sba-raid: Use only single mailbox channel

2017-10-02 Thread Anup Patel
Each mailbox channel used by Broadcom SBA RAID driver is
a separate HW ring.

Currently, Broadcom SBA RAID driver creates one DMA channel
using one or more mailbox channels. When we are using more
than one mailbox channel for a DMA channel, the sba_request
are distributed evenly among multiple mailbox channels which
results in sba_request being completed out-of-order.

The above described out-of-order completion of sba_request
breaks the dma_async_is_complete() API because it assumes
DMA cookies are completed in an orderly fashion.

To ensure correct behaviour of dma_async_is_complete() API,
this patch updates Broadcom SBA RAID driver to use only
single mailbox channel. If additional mailbox channels are
specified in DT then those will be ignored.
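
Roughly, the ordering assumption (paraphrasing the dmaengine helper,
with cookie-counter wraparound handling omitted) looks like:

/* Completing cookie N implicitly declares every cookie <= N done. */
static int cookie_is_complete(int cookie, int last_complete)
{
        /*
         * If ring A retires cookie 5 while ring B still owns cookie 4,
         * cookie 4 is wrongly reported complete -- hence the move to a
         * single mailbox channel per DMA channel.
         */
        return cookie <= last_complete;
}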

Signed-off-by: Anup Patel 
Reviewed-by: Ray Jui 
Reviewed-by: Scott Branden 
---
 drivers/dma/bcm-sba-raid.c | 104 -
 1 file changed, 27 insertions(+), 77 deletions(-)

diff --git a/drivers/dma/bcm-sba-raid.c b/drivers/dma/bcm-sba-raid.c
index 15c5585..409da59 100644
--- a/drivers/dma/bcm-sba-raid.c
+++ b/drivers/dma/bcm-sba-raid.c
@@ -25,11 +25,8 @@
  *
  * The Broadcom SBA RAID driver does not require any register programming
  * except submitting request to SBA hardware device via mailbox channels.
- * This driver implements a DMA device with one DMA channel using a set
- * of mailbox channels provided by Broadcom SoC specific ring manager
- * driver. To exploit parallelism (as described above), all DMA request
- * coming to SBA RAID DMA channel are broken down to smaller requests
- * and submitted to multiple mailbox channels in round-robin fashion.
+ * This driver implements a DMA device with one DMA channel using a single
+ * mailbox channel provided by Broadcom SoC specific ring manager driver.
  * For having more SBA DMA channels, we can create more SBA device nodes
  * in Broadcom SoC specific DTS based on number of hardware rings supported
  * by Broadcom SoC ring manager.
@@ -85,6 +82,7 @@
 #define SBA_CMD_GALOIS 0xe
 
 #define SBA_MAX_REQ_PER_MBOX_CHANNEL   8192
+#define SBA_MAX_MSG_SEND_PER_MBOX_CHANNEL  8
 
 /* Driver helper macros */
 #define to_sba_request(tx) \
@@ -142,9 +140,7 @@ struct sba_device {
u32 max_cmds_pool_size;
/* Maibox client and Mailbox channels */
struct mbox_client client;
-   int mchans_count;
-   atomic_t mchans_current;
-   struct mbox_chan **mchans;
+   struct mbox_chan *mchan;
struct device *mbox_dev;
/* DMA device and DMA channel */
struct dma_device dma_dev;
@@ -200,14 +196,6 @@ static inline u32 __pure sba_cmd_pq_c_mdata(u32 d, u32 b1, u32 b0)
 
 /* == General helper routines = */
 
-static void sba_peek_mchans(struct sba_device *sba)
-{
-   int mchan_idx;
-
-   for (mchan_idx = 0; mchan_idx < sba->mchans_count; mchan_idx++)
-   mbox_client_peek_data(sba->mchans[mchan_idx]);
-}
-
 static struct sba_request *sba_alloc_request(struct sba_device *sba)
 {
bool found = false;
@@ -231,7 +219,7 @@ static struct sba_request *sba_alloc_request(struct sba_device *sba)
 * would have completed which will create more
 * room for new requests.
 */
-   sba_peek_mchans(sba);
+   mbox_client_peek_data(sba->mchan);
return NULL;
}
 
@@ -369,15 +357,11 @@ static void sba_cleanup_pending_requests(struct sba_device *sba)
 static int sba_send_mbox_request(struct sba_device *sba,
 struct sba_request *req)
 {
-   int mchans_idx, ret = 0;
-
-   /* Select mailbox channel in round-robin fashion */
-   mchans_idx = atomic_inc_return(&sba->mchans_current);
-   mchans_idx = mchans_idx % sba->mchans_count;
+   int ret = 0;
 
/* Send message for the request */
req->msg.error = 0;
-   ret = mbox_send_message(sba->mchans[mchans_idx], &req->msg);
+   ret = mbox_send_message(sba->mchan, &req->msg);
if (ret < 0) {
dev_err(sba->dev, "send message failed with error %d", ret);
return ret;
@@ -390,7 +374,7 @@ static int sba_send_mbox_request(struct sba_device *sba,
}
 
/* Signal txdone for mailbox channel */
-   mbox_client_txdone(sba->mchans[mchans_idx], ret);
+   mbox_client_txdone(sba->mchan, ret);
 
return ret;
 }
@@ -402,13 +386,8 @@ static void _sba_process_pending_requests(struct sba_device *sba)
u32 count;
struct sba_request *req;
 
-   /*
-* Process few pending requests
-*
-* For now, we process (<number of mailbox channels> * 8)
-* number of requests at a time.
-*/
-   count = sba->mchans_count * 8;
+   /* Process few pending requests */
+   count = SBA_MAX_MSG_SEND_PER_MBOX_CHANNEL;
while (!list_empty(&sba->reqs_pending_list) && count) {

[PATCH 4/4] dmaengine: Build bcm-sba-raid driver as loadable module for iProc SoCs

2017-10-02 Thread Anup Patel
By default, we build the Broadcom SBA RAID driver as a loadable module for
iProc SoCs so that the kernel image is a little smaller and we load the
SBA RAID driver only when required.

Signed-off-by: Anup Patel 
Reviewed-by: Scott Branden 
---
 drivers/dma/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index fadc4d8..48cf8df 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -115,7 +115,7 @@ config BCM_SBA_RAID
select DMA_ENGINE_RAID
select ASYNC_TX_DISABLE_XOR_VAL_DMA
select ASYNC_TX_DISABLE_PQ_VAL_DMA
-   default ARCH_BCM_IPROC
+   default m if ARCH_BCM_IPROC
help
  Enable support for Broadcom SBA RAID Engine. The SBA RAID
  engine is available on most of the Broadcom iProc SoCs. It
-- 
2.7.4



[PATCH 2/5] mailbox: bcm-flexrm-mailbox: Print ring number in errors and warnings

2017-10-02 Thread Anup Patel
This patch updates all dev_err() and dev_warn() calls to print the
ring number so that we have more information for debugging.

Signed-off-by: Anup Patel 
Reviewed-by: Scott Branden 
---
 drivers/mailbox/bcm-flexrm-mailbox.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/drivers/mailbox/bcm-flexrm-mailbox.c b/drivers/mailbox/bcm-flexrm-mailbox.c
index f052a3e..e0443ae 100644
--- a/drivers/mailbox/bcm-flexrm-mailbox.c
+++ b/drivers/mailbox/bcm-flexrm-mailbox.c
@@ -1116,8 +1116,8 @@ static int flexrm_process_completions(struct flexrm_ring *ring)
err = flexrm_cmpl_desc_to_error(desc);
if (err < 0) {
dev_warn(ring->mbox->dev,
-"got completion desc=0x%lx with error %d",
-(unsigned long)desc, err);
+   "ring%d got completion desc=0x%lx with error %d\n",
+   ring->num, (unsigned long)desc, err);
}
 
/* Determine request id from completion descriptor */
@@ -1127,8 +1127,8 @@ static int flexrm_process_completions(struct flexrm_ring *ring)
msg = ring->requests[reqid];
if (!msg) {
dev_warn(ring->mbox->dev,
-"null msg pointer for completion desc=0x%lx",
-(unsigned long)desc);
+   "ring%d null msg pointer for completion desc=0x%lx\n",
+   ring->num, (unsigned long)desc);
continue;
}
 
@@ -1238,7 +1238,9 @@ static int flexrm_startup(struct mbox_chan *chan)
ring->bd_base = dma_pool_alloc(ring->mbox->bd_pool,
   GFP_KERNEL, &ring->bd_dma_base);
if (!ring->bd_base) {
-   dev_err(ring->mbox->dev, "can't allocate BD memory\n");
+   dev_err(ring->mbox->dev,
+   "can't allocate BD memory for ring%d\n",
+   ring->num);
ret = -ENOMEM;
goto fail;
}
@@ -1261,7 +1263,9 @@ static int flexrm_startup(struct mbox_chan *chan)
ring->cmpl_base = dma_pool_alloc(ring->mbox->cmpl_pool,
 GFP_KERNEL, &ring->cmpl_dma_base);
if (!ring->cmpl_base) {
-   dev_err(ring->mbox->dev, "can't allocate completion memory\n");
+   dev_err(ring->mbox->dev,
+   "can't allocate completion memory for ring%d\n",
+   ring->num);
ret = -ENOMEM;
goto fail_free_bd_memory;
}
@@ -1269,7 +1273,8 @@ static int flexrm_startup(struct mbox_chan *chan)
 
/* Request IRQ */
if (ring->irq == UINT_MAX) {
-   dev_err(ring->mbox->dev, "ring IRQ not available\n");
+   dev_err(ring->mbox->dev,
+   "ring%d IRQ not available\n", ring->num);
ret = -ENODEV;
goto fail_free_cmpl_memory;
}
@@ -1278,7 +1283,8 @@ static int flexrm_startup(struct mbox_chan *chan)
   flexrm_irq_thread,
   0, dev_name(ring->mbox->dev), ring);
if (ret) {
-   dev_err(ring->mbox->dev, "failed to request ring IRQ\n");
+   dev_err(ring->mbox->dev,
+   "failed to request ring%d IRQ\n", ring->num);
goto fail_free_cmpl_memory;
}
ring->irq_requested = true;
@@ -1291,7 +1297,9 @@ static int flexrm_startup(struct mbox_chan *chan)
&ring->irq_aff_hint);
ret = irq_set_affinity_hint(ring->irq, &ring->irq_aff_hint);
if (ret) {
-   dev_err(ring->mbox->dev, "failed to set IRQ affinity hint\n");
+   dev_err(ring->mbox->dev,
+   "failed to set IRQ affinity hint for ring%d\n",
+   ring->num);
goto fail_free_irq;
}
 
-- 
2.7.4



[PATCH v4 5/6] mtd: spi-nor: cadence-quadspi: Fix error path in probe

2017-10-02 Thread Vignesh R
Fix the reversed goto labels, so that we disable the cqspi controller only
if it was enabled previously. This is a minor cleanup.

Signed-off-by: Vignesh R 
---
 drivers/mtd/spi-nor/cadence-quadspi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c
index d9629e8f4798..60b557e00cfb 100644
--- a/drivers/mtd/spi-nor/cadence-quadspi.c
+++ b/drivers/mtd/spi-nor/cadence-quadspi.c
@@ -1255,9 +1255,9 @@ static int cqspi_probe(struct platform_device *pdev)
}
 
return ret;
-probe_irq_failed:
-   cqspi_controller_enable(cqspi, 0);
 probe_setup_failed:
+   cqspi_controller_enable(cqspi, 0);
+probe_irq_failed:
clk_disable_unprepare(cqspi->clk);
return ret;
 }
-- 
2.14.1



[PATCH 3/5] mailbox: bcm-flexrm-mailbox: add depends on ARCH_BCM_IPROC

2017-10-02 Thread Anup Patel
From: Scott Branden 

The Broadcom FlexRM Mailbox is only present in Broadcom iProc SoCs.
Add depends on ARCH_BCM_IPROC to BCM_FLEXRM_MBOX.

Signed-off-by: Scott Branden 
Reviewed-by: Ray Jui 
---
 drivers/mailbox/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index c5731e5..3161f23 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -163,6 +163,7 @@ config BCM_PDC_MBOX
 config BCM_FLEXRM_MBOX
tristate "Broadcom FlexRM Mailbox"
depends on ARM64
+   depends on ARCH_BCM_IPROC || COMPILE_TEST
depends on HAS_DMA
select GENERIC_MSI_IRQ_DOMAIN
default ARCH_BCM_IPROC
-- 
2.7.4



[PATCH 5/5] mailbox: Build Broadcom FlexRM driver as loadable module for iProc SOCs

2017-10-02 Thread Anup Patel
By default, we build the Broadcom FlexRM driver as a loadable module for
iProc SoCs so that the kernel image is a little smaller and we load the
FlexRM driver only when required.

Signed-off-by: Anup Patel 
Reviewed-by: Scott Branden 
---
 drivers/mailbox/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index 3161f23..ba2f152 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -166,7 +166,7 @@ config BCM_FLEXRM_MBOX
depends on ARCH_BCM_IPROC || COMPILE_TEST
depends on HAS_DMA
select GENERIC_MSI_IRQ_DOMAIN
-   default ARCH_BCM_IPROC
+   default m if ARCH_BCM_IPROC
help
  Mailbox implementation of the Broadcom FlexRM ring manager,
  which provides access to various offload engines on Broadcom
-- 
2.7.4



[PATCH 4/5] mailbox: bcm-flexrm-mailbox: Use common GPL comment header

2017-10-02 Thread Anup Patel
This patch makes the comment header of Broadcom FlexRM driver
similar to the GPL comment header used across Broadcom driver
sources.

Signed-off-by: Anup Patel 
---
 drivers/mailbox/bcm-flexrm-mailbox.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/mailbox/bcm-flexrm-mailbox.c b/drivers/mailbox/bcm-flexrm-mailbox.c
index e0443ae..a8cf433 100644
--- a/drivers/mailbox/bcm-flexrm-mailbox.c
+++ b/drivers/mailbox/bcm-flexrm-mailbox.c
@@ -1,10 +1,18 @@
-/* Broadcom FlexRM Mailbox Driver
- *
+/*
  * Copyright (C) 2017 Broadcom
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * Broadcom FlexRM Mailbox Driver
  *
  * Each Broadcom FlexSparx4 offload engine is implemented as an
  * extension to Broadcom FlexRM ring manager. The FlexRM ring
-- 
2.7.4



[PATCH v4 6/6] mtd: spi-nor: cadence-quadspi: Add runtime PM support

2017-10-02 Thread Vignesh R
Add pm_runtime* calls to cadence-quadspi driver. This is required to
switch on QSPI power domain on TI 66AK2G SoC during probe.

Signed-off-by: Vignesh R 
---
v4: Disable pm in error path.

 drivers/mtd/spi-nor/cadence-quadspi.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c
index 60b557e00cfb..75a2bc447a99 100644
--- a/drivers/mtd/spi-nor/cadence-quadspi.c
+++ b/drivers/mtd/spi-nor/cadence-quadspi.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include <linux/pm_runtime.h>
 #include 
 #include 
 #include 
@@ -1224,10 +1225,17 @@ static int cqspi_probe(struct platform_device *pdev)
return -ENXIO;
}
 
+   pm_runtime_enable(dev);
+   ret = pm_runtime_get_sync(dev);
+   if (ret < 0) {
+   pm_runtime_put_noidle(dev);
+   return ret;
+   }
+
ret = clk_prepare_enable(cqspi->clk);
if (ret) {
dev_err(dev, "Cannot enable QSPI clock.\n");
-   return ret;
+   goto probe_clk_failed;
}
 
cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk);
@@ -1259,6 +1267,9 @@ static int cqspi_probe(struct platform_device *pdev)
cqspi_controller_enable(cqspi, 0);
 probe_irq_failed:
clk_disable_unprepare(cqspi->clk);
+probe_clk_failed:
+   pm_runtime_put_sync(dev);
+   pm_runtime_disable(dev);
return ret;
 }
 
@@ -1275,6 +1286,9 @@ static int cqspi_remove(struct platform_device *pdev)
 
clk_disable_unprepare(cqspi->clk);
 
+   pm_runtime_put_sync(&pdev->dev);
+   pm_runtime_disable(&pdev->dev);
+
return 0;
 }
 
-- 
2.14.1



[PATCH v4 2/6] mtd: spi-nor: cadence-quadspi: add a delay in write sequence

2017-10-02 Thread Vignesh R
As per 66AK2G02 TRM[1] SPRUHY8F section 11.15.5.3 Indirect Access
Controller programming sequence, a delay equal to a couple of QSPI master
clock cycles (~5 ns) is required after setting the CQSPI_REG_INDIRECTWR_START
bit and before writing data to the flash. Introduce a quirk flag
CQSPI_NEEDS_WR_DELAY to handle this and set it for the TI 66AK2G SoC.

[1]http://www.ti.com/lit/ug/spruhy8f/spruhy8f.pdf
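
(As a worked example under an assumed 384 MHz reference clock, the
computation used later in the driver gives wr_delay =
5 * DIV_ROUND_UP(1000000000, 384000000) ns = 5 * 3 ns = 15 ns of
ndelay() after starting each indirect write.)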

Signed-off-by: Vignesh R 
---
 drivers/mtd/spi-nor/cadence-quadspi.c | 27 ++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c
index 53c7d8e0327a..5cd5d6f7303f 100644
--- a/drivers/mtd/spi-nor/cadence-quadspi.c
+++ b/drivers/mtd/spi-nor/cadence-quadspi.c
@@ -38,6 +38,9 @@
 #define CQSPI_NAME "cadence-qspi"
 #define CQSPI_MAX_CHIPSELECT   16
 
+/* Quirks */
+#define CQSPI_NEEDS_WR_DELAY   BIT(0)
+
 struct cqspi_st;
 
 struct cqspi_flash_pdata {
@@ -76,6 +79,7 @@ struct cqspi_st {
u32 fifo_depth;
u32 fifo_width;
u32 trigger_address;
+   u32 wr_delay;
struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT];
 };
 
@@ -608,6 +612,15 @@ static int cqspi_indirect_write_execute(struct spi_nor *nor,
reinit_completion(&cqspi->transfer_complete);
writel(CQSPI_REG_INDIRECTWR_START_MASK,
   reg_base + CQSPI_REG_INDIRECTWR);
+   /*
+* As per 66AK2G02 TRM SPRUHY8F section 11.15.5.3 Indirect Access
+* Controller programming sequence, couple of cycles of
+* QSPI_REF_CLK delay is required for the above bit to
+* be internally synchronized by the QSPI module. Provide 5
+* cycles of delay.
+*/
+   if (cqspi->wr_delay)
+   ndelay(cqspi->wr_delay);
 
while (remaining > 0) {
write_bytes = remaining > page_size ? page_size : remaining;
@@ -1156,6 +1169,7 @@ static int cqspi_probe(struct platform_device *pdev)
struct cqspi_st *cqspi;
struct resource *res;
struct resource *res_ahb;
+   unsigned long data;
int ret;
int irq;
 
@@ -1213,6 +1227,10 @@ static int cqspi_probe(struct platform_device *pdev)
}
 
cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk);
+   data  = (unsigned long)of_device_get_match_data(dev);
+   if (data & CQSPI_NEEDS_WR_DELAY)
+   cqspi->wr_delay = 5 * DIV_ROUND_UP(NSEC_PER_SEC,
+  cqspi->master_ref_clk_hz);
 
ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0,
   pdev->name, cqspi);
@@ -1284,7 +1302,14 @@ static const struct dev_pm_ops cqspi__dev_pm_ops = {
 #endif
 
 static const struct of_device_id cqspi_dt_ids[] = {
-   {.compatible = "cdns,qspi-nor",},
+   {
+   .compatible = "cdns,qspi-nor",
+   .data = (void *)0,
+   },
+   {
+   .compatible = "ti,k2g-qspi",
+   .data = (void *)CQSPI_NEEDS_WR_DELAY,
+   },
{ /* end of table */ }
 };
 
-- 
2.14.1



[PATCH v4 4/6] mtd: spi-nor: cadence-quadspi: Add support to enable loop-back clock circuit

2017-10-02 Thread Vignesh R
Cadence QSPI IP has an adapted loop-back circuit which can be enabled by
setting BYPASS field to 0 in READCAPTURE register. It enables use of
QSPI return clock to latch the data rather than the internal QSPI
reference clock. For high speed operations, adapted loop-back circuit
using QSPI return clock helps to increase data valid window.

Based on the DT parameter cdns,rclk-en, enable the adapted loop-back
circuit for boards which do provide the QSPI return clock.
This patch also modifies cqspi_readdata_capture() function's bypass
parameter to bool to match how its used in the function.

Signed-off-by: Vignesh R 
---
 drivers/mtd/spi-nor/cadence-quadspi.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c
index 5cd5d6f7303f..d9629e8f4798 100644
--- a/drivers/mtd/spi-nor/cadence-quadspi.c
+++ b/drivers/mtd/spi-nor/cadence-quadspi.c
@@ -78,6 +78,7 @@ struct cqspi_st {
boolis_decoded_cs;
u32 fifo_depth;
u32 fifo_width;
+   boolrclk_en;
u32 trigger_address;
u32 wr_delay;
struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT];
@@ -788,7 +789,7 @@ static void cqspi_config_baudrate_div(struct cqspi_st *cqspi)
 }
 
 static void cqspi_readdata_capture(struct cqspi_st *cqspi,
-  const unsigned int bypass,
+  const bool bypass,
   const unsigned int delay)
 {
void __iomem *reg_base = cqspi->iobase;
@@ -852,7 +853,8 @@ static void cqspi_configure(struct spi_nor *nor)
cqspi->sclk = sclk;
cqspi_config_baudrate_div(cqspi);
cqspi_delay(nor);
-   cqspi_readdata_capture(cqspi, 1, f_pdata->read_delay);
+   cqspi_readdata_capture(cqspi, !cqspi->rclk_en,
+  f_pdata->read_delay);
}
 
if (switch_cs || switch_ck)
@@ -1049,6 +1051,8 @@ static int cqspi_of_get_pdata(struct platform_device *pdev)
return -ENXIO;
}
 
+   cqspi->rclk_en = of_property_read_bool(np, "cdns,rclk-en");
+
return 0;
 }
 
-- 
2.14.1



[PATCH v4 1/6] mtd: spi-nor: cadence-quadspi: Add TI 66AK2G SoC specific compatible

2017-10-02 Thread Vignesh R
Update binding documentation to add a new compatible for TI 66AK2G SoC,
to handle TI SoC specific quirks in the driver.

Signed-off-by: Vignesh R 
Acked-by: Rob Herring 
---
 Documentation/devicetree/bindings/mtd/cadence-quadspi.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
index f248056da24c..7dbe3bd9ac56 100644
--- a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
+++ b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
@@ -1,7 +1,9 @@
 * Cadence Quad SPI controller
 
 Required properties:
-- compatible : Should be "cdns,qspi-nor".
+- compatible : should be one of the following:
+   Generic default - "cdns,qspi-nor".
+   For TI 66AK2G SoC - "ti,k2g-qspi", "cdns,qspi-nor".
 - reg : Contains two entries, each of which is a tuple consisting of a
physical address and length. The first entry is the address and
length of the controller register set. The second entry is the
-- 
2.14.1



[PATCH v4 0/6] K2G: Add QSPI support

2017-10-02 Thread Vignesh R


This series adds support for Cadence QSPI IP present in TI's 66AK2G SoC.
The patches enhance the existing cadence-quadspi driver to support the
loop-back clock circuit, add pm_runtime support, and add tweaks for the
66AK2G SoC.

Change log:

v4:
* New patch to fix error handling sequence in probe.

v3:
* Fix build warnings reported by kbuild test bot.

Resend:
* Rebase to latest linux-next.
* Collect Acked-bys

v2:
* Drop DT patches. Will be sent as separate series as requested by
 maintainer.
* Split binding docs into separate patches.
* Address comments by Rob Herring.

Vignesh R (6):
  mtd: spi-nor: cadence-quadspi: Add TI 66AK2G SoC specific compatible
  mtd: spi-nor: cadence-quadspi: add a delay in write sequence
  mtd: spi-nor: cadence-quadspi: Add new binding to enable loop-back
circuit
  mtd: spi-nor: cadence-quadspi: Add support to enable loop-back clock
circuit
  mtd: spi-nor: cadence-quadspi: Fix error path in probe
  mtd: spi-nor: cadence-quadspi: Add runtime PM support

 .../devicetree/bindings/mtd/cadence-quadspi.txt|  7 ++-
 drivers/mtd/spi-nor/cadence-quadspi.c  | 55 +++---
 2 files changed, 55 insertions(+), 7 deletions(-)

-- 
2.14.1



Re: regression in 4.14-rc2 caused by apparmor: add base infastructure for socket mediation

2017-10-02 Thread James Bottomley
On Mon, 2017-10-02 at 21:11 -0700, John Johansen wrote:
> On 10/02/2017 09:02 PM, James Bottomley wrote:
> > 
> > The specific problem is that dnsmasq refuses to start on openSUSE
> > Leap 42.2.  The specific cause is that and attempt to open a
> > PF_LOCAL socket gets EACCES.  This means that networking doesn't
> > function on a system with a 4.14-rc2 system.
> > 
> > Reverting commit 651e28c5537abb39076d3949fb7618536f1d242e
> > (apparmor: add base infastructure for socket mediation) causes the
> > system to function again.
> > 
> 
> This is not a kernel regression,

Regression means something that worked in a previous version of the
kernel and is broken now.  This problem falls within that definition.

>  it is because the openSUSE dnsmasq is starting with a policy that
> doesn't allow access to PF_LOCAL sockets

Because there was no co-ordination between their version of the patch
and yours.  If you're sending in patches that you know might break
systems because they need a co-ordinated rollout of something in
userspace then it would be nice if you could co-ordinate it ...

Doing it in the merge window and not in -rc2 would also be helpful,
because I have more of an expectation of a userspace mismatch from
stuff that lands in the merge window.

> Christian Boltz the opensuse apparmor maintainer has been working
> on a policy update for opensuse see bug
> 
> https://bugzilla.opensuse.org/show_bug.cgi?id=1061195

Well, that looks really encouraging: the line about "To give you an
impression what "lots of" means - I had to adjust 40 profiles on my
laptop".  The upshot is that, apart from a bandaid, openSUSE still has
no co-ordinated fix for this.

James



Re: [PATCH 2/3] sched/fair: Introduce scaled capacity awareness in select_idle_sibling code path

2017-10-02 Thread Joel Fernandes
Hi Rohit,

On Thu, Sep 28, 2017 at 8:09 AM, Rohit Jain  wrote:
[..]
>>>
>>> With this case, because we know from the past avg, one of the strands is
>>> running low on capacity, I am trying to return a better strand for the
>>> thread to start on.
>>>
>> I know what you're trying to do but the way you've retrofitted it into
>> the
>> core looks weird (to me) and makes the code unreadable and ugly IMO.
>>
>> Why not do something simpler like skip the core if any SMT thread has
>> been running at lesser capacity? I'm not sure if this works great or
>> if the maintainers will prefer your or my below approach, but I find
>> the below diff much cleaner for the select_idle_core bit. It also
>> makes more sense since resources are shared at SMT level, so it makes
>> sense to me to skip the core altogether for this:
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 6ee7242dbe0a..f324a84e29f1 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5738,14 +5738,17 @@ static int select_idle_core(struct task_struct *p,
>> struct sched_domain *sd, int
>> for_each_cpu_wrap(core, cpus, target) {
>> bool idle = true;
>> +   bool full_cap = true;
>> for_each_cpu(cpu, cpu_smt_mask(core)) {
>> cpumask_clear_cpu(cpu, cpus);
>> if (!idle_cpu(cpu))
>> idle = false;
>> +   if (!full_capacity(cpu))
>> +   full_cap = false;
>> }
>> -   if (idle)
>> +   if (idle && full_cap)
>> return core;
>> }
>>
>
>
>
> Well, with your changes you will skip over fully idle cores, which is
> not ideal either. I see that you were advocating for selecting an idle
> plus lowest-capacity core, whereas I was stopping at the first idle core.
>
> Since the whole philosophy so far in this patch is "Don't spare an
> idle CPU", I think the following diff might look better to you. Please
> note this is only for discussion's sake; I haven't fully tested it yet.
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ec15e5f..c2933eb 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6040,7 +6040,9 @@ void __update_idle_core(struct rq *rq)
>  static int select_idle_core(struct task_struct *p, struct sched_domain *sd,
> int target)
>  {
>  struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> -int core, cpu;
> +int core, cpu, rcpu, backup_core;
> +
> +rcpu = backup_core = -1;
>
>  if (!static_branch_likely(&sched_smt_present))
>  return -1;
> @@ -6052,15 +6054,34 @@ static int select_idle_core(struct task_struct *p,
> struct sched_domain *sd, int
>
>  for_each_cpu_wrap(core, cpus, target) {
>  bool idle = true;
> +bool full_cap = true;
>
>  for_each_cpu(cpu, cpu_smt_mask(core)) {
>  cpumask_clear_cpu(cpu, cpus);
>  if (!idle_cpu(cpu))
>  idle = false;
> +
> +if (!full_capacity(cpu)) {
> +full_cap = false;
> +}
>  }
>
> -if (idle)
> +if (idle && full_cap)
>  return core;
> +else if (idle && backup_core == -1)
> +backup_core = core;
> +}
> +
> +if (backup_core != -1) {
> +for_each_cpu(cpu, cpu_smt_mask(backup_core)) {
> +if (full_capacity(cpu))
> +return cpu;
> +else if ((rcpu == -1) ||
> + (capacity_of(cpu) > capacity_of(rcpu)))
> +rcpu = cpu;
> +}
> +
> +return rcpu;
>  }
>
>
> Do let me know what you think.

I think that if there isn't a benefit in your tests from doing the above
vs the simpler approach, then I prefer the simpler approach, especially
since there's no point in complicating the code for select_idle_core.

thanks,

- Joel
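
Both diffs in this thread rely on full_capacity(), which is defined
elsewhere in Rohit's series and not quoted here. A plausible sketch,
assuming it compares the CPU's current capacity against its original
capacity with a margin (the ~80% threshold below is an assumption):

/* Sketch only: the exact threshold comes from the patch under review. */
static inline bool full_capacity(int cpu)
{
	/* true unless a sizeable share of this CPU's capacity is
	 * stolen by RT/IRQ pressure or thermal capping */
	return capacity_of(cpu) >= (capacity_orig_of(cpu) * 819 >> 10);
}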


Re: regression in 4.14-rc2 caused by apparmor: add base infastructure for socket mediation

2017-10-02 Thread John Johansen
On 10/02/2017 09:02 PM, James Bottomley wrote:
> The specific problem is that dnsmasq refuses to start on openSUSE Leap
> 42.2.  The specific cause is that an attempt to open a PF_LOCAL socket
> gets EACCES.  This means that networking doesn't function on a system
> with a 4.14-rc2 system.
> 
> Reverting commit 651e28c5537abb39076d3949fb7618536f1d242e (apparmor:
> add base infastructure for socket mediation) causes the system to
> function again.
> 

This is not a kernel regression, it is because the openSUSE dnsmasq is
starting with a policy that doesn't allow access to PF_LOCAL sockets.

Christian Boltz the opensuse apparmor maintainer has been working
on a policy update for opensuse see bug

https://bugzilla.opensuse.org/show_bug.cgi?id=1061195




regression in 4.14-rc2 caused by apparmor: add base infastructure for socket mediation

2017-10-02 Thread James Bottomley
The specific problem is that dnsmasq refuses to start on openSUSE Leap
42.2.  The specific cause is that an attempt to open a PF_LOCAL socket
gets EACCES.  This means that networking doesn't function on a system
with a 4.14-rc2 system.

Reverting commit 651e28c5537abb39076d3949fb7618536f1d242e (apparmor:
add base infastructure for socket mediation) causes the system to
function again.

James



[PATCH 01/12] of: overlay.c: Remove comments that state the obvious, to reduce clutter

2017-10-02 Thread frowand . list
From: Frank Rowand 

Follows recommendations in Documentation/process/coding-style.rst,
section 8, Commenting.

Some in-function comments are promoted to function header comments.

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 53 
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 8ecfee31ab6d..26f63f10f4b0 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -143,7 +143,6 @@ static struct property *dup_and_fixup_symbol_prop(struct 
of_overlay *ov,
strcpy(new->value, target_path);
strcpy(new->value + target_path_len, label_path);
 
-   /* mark the property as dynamic */
of_property_set_flag(new, OF_DYNAMIC);
 
return new;
@@ -157,23 +156,24 @@ static struct property *dup_and_fixup_symbol_prop(struct 
of_overlay *ov,
 
 }
 
+/*
+ * Some special properties are not updated (no error returned).
+ * Update of property in symbols node is not allowed.
+ */
 static int of_overlay_apply_single_property(struct of_overlay *ov,
struct device_node *target, struct property *prop,
bool is_symbols_node)
 {
struct property *propn = NULL, *tprop;
 
-   /* NOTE: Multiple changes of single properties not supported */
tprop = of_find_property(target, prop->name, NULL);
 
-   /* special properties are not meant to be updated (silent NOP) */
if (of_prop_cmp(prop->name, "name") == 0 ||
of_prop_cmp(prop->name, "phandle") == 0 ||
of_prop_cmp(prop->name, "linux,phandle") == 0)
return 0;
 
if (is_symbols_node) {
-   /* changing a property in __symbols__ node not allowed */
if (tprop)
return -EINVAL;
propn = dup_and_fixup_symbol_prop(ov, prop);
@@ -184,14 +184,19 @@ static int of_overlay_apply_single_property(struct 
of_overlay *ov,
if (propn == NULL)
return -ENOMEM;
 
-   /* not found? add */
if (tprop == NULL)
return of_changeset_add_property(&ov->cset, target, propn);
 
-   /* found? update */
return of_changeset_update_property(&ov->cset, target, propn);
 }
 
+/*
+ * NOTE: Multiple mods of created nodes not supported.
+ *
+ * Return
+ *  -ENOMEM if memory allocation fails
+ *  -EINVAL if existing node has a phandle and overlay node has a phandle
+ */
 static int of_overlay_apply_single_device_node(struct of_overlay *ov,
struct device_node *target, struct device_node *child)
 {
@@ -203,13 +208,11 @@ static int of_overlay_apply_single_device_node(struct 
of_overlay *ov,
if (cname == NULL)
return -ENOMEM;
 
-   /* NOTE: Multiple mods of created nodes not supported */
for_each_child_of_node(target, tchild)
if (!of_node_cmp(cname, kbasename(tchild->full_name)))
break;
 
if (tchild != NULL) {
-   /* new overlay phandle value conflicts with existing value */
if (child->phandle)
return -EINVAL;
 
@@ -217,12 +220,10 @@ static int of_overlay_apply_single_device_node(struct 
of_overlay *ov,
ret = of_overlay_apply_one(ov, tchild, child, 0);
of_node_put(tchild);
} else {
-   /* create empty tree as a target */
tchild = __of_node_dup(child, "%pOF/%s", target, cname);
if (!tchild)
return -ENOMEM;
 
-   /* point to parent */
tchild->parent = target;
 
ret = of_changeset_attach_node(&ov->cset, tchild);
@@ -243,6 +244,8 @@ static int of_overlay_apply_single_device_node(struct 
of_overlay *ov,
  * Note that the in case of an error the target node is left
  * in a inconsistent state. Error recovery should be performed
  * by using the changeset.
+ *
+ * Do not allow symbols node to have any children.
  */
 static int of_overlay_apply_one(struct of_overlay *ov,
struct device_node *target, const struct device_node *overlay,
@@ -262,7 +265,6 @@ static int of_overlay_apply_one(struct of_overlay *ov,
}
}
 
-   /* do not allow symbols node to have any children */
if (is_symbols_node)
return 0;
 
@@ -292,7 +294,6 @@ static int of_overlay_apply(struct of_overlay *ov)
 {
int i, err;
 
-   /* first we apply the overlays atomically */
for (i = 0; i < ov->count; i++) {
struct of_overlay_info *ovinfo = &ov->ovinfo_tab[i];
 
@@ -309,10 +310,10 @@ static int of_overlay_apply(struct of_overlay *ov)
 
 /*
  * Find the target node using a number of different strategies
- * in order of preference
+ * in order of preference:
  *
- * "target" property containing the phandle of the target
- * "target-path" property containing the path of the target
+ * 1) "targe

[PATCH 03/12] of: overlay: rename identifiers to more reflect what they do

2017-10-02 Thread frowand . list
From: Frank Rowand 

This patch is aimed primarily at drivers/of/overlay.c, but those
changes also have a small impact in a few other files.

overlay.c is difficult to read and maintain.  Improve readability:
  - Rename functions, types and variables to better reflect what
they do and to be consistent with names in other places,
such as the device tree overlay FDT (flattened device tree),
and make the algorithms clearer
  - Use the same names consistently throughout the file
  - Update comments for name changes
  - Fix incorrect comments

This patch is intended to not introduce any functional change.

Signed-off-by: Frank Rowand 
---
 Documentation/devicetree/overlay-notes.txt   |  12 +-
 drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c |   5 +-
 drivers/of/dynamic.c |   2 +-
 drivers/of/overlay.c | 506 ++-
 drivers/of/unittest.c|  20 +-
 include/linux/of.h   |  12 +-
 6 files changed, 294 insertions(+), 263 deletions(-)

diff --git a/Documentation/devicetree/overlay-notes.txt 
b/Documentation/devicetree/overlay-notes.txt
index eb7f2685fda1..c4aa0adf13ec 100644
--- a/Documentation/devicetree/overlay-notes.txt
+++ b/Documentation/devicetree/overlay-notes.txt
@@ -87,15 +87,15 @@ Overlay in-kernel API
 
 The API is quite easy to use.
 
-1. Call of_overlay_create() to create and apply an overlay. The return value
-is a cookie identifying this overlay.
+1. Call of_overlay_apply() to create and apply an overlay changeset. The return
+value is an error or a cookie identifying this overlay.
 
-2. Call of_overlay_destroy() to remove and cleanup the overlay previously
-created via the call to of_overlay_create(). Removal of an overlay that
-is stacked by another will not be permitted.
+2. Call of_overlay_remove() to remove and cleanup the overlay changeset
+previously created via the call to of_overlay_apply(). Removal of an overlay
+changeset that is stacked by another will not be permitted.
 
 Finally, if you need to remove all overlays in one-go, just call
-of_overlay_destroy_all() which will remove every single one in the correct
+of_overlay_remove_all() which will remove every single one in the correct
 order.
 
 Overlay DTS Format
diff --git a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c 
b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
index 623a9140493c..5f5b7ba35f1d 100644
--- a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
+++ b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
@@ -247,9 +247,10 @@ static void __init tilcdc_convert_slave_node(void)
 
tilcdc_node_disable(slave);
 
-   ret = of_overlay_create(overlay);
+   ret = of_overlay_apply(overlay);
if (ret)
-   pr_err("%s: Creating overlay failed: %d\n", __func__, ret);
+   pr_err("%s: Applying overlay changeset failed: %d\n",
+   __func__, ret);
else
pr_info("%s: ti,tilcdc,slave node successfully converted\n",
__func__);
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index 301b6db2b48d..124510d56421 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -775,7 +775,7 @@ int of_changeset_revert(struct of_changeset *ocs)
 EXPORT_SYMBOL_GPL(of_changeset_revert);
 
 /**
- * of_changeset_action - Perform a changeset action
+ * of_changeset_action - Add an action to the tail of the changeset list
  *
  * @ocs:   changeset pointer
  * @action:action to perform
diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 8e0c7eb4858c..397ef10d1f26 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -25,67 +25,63 @@
 #include "of_private.h"
 
 /**
- * struct of_overlay_info - Holds a single overlay info
+ * struct fragment - info about fragment nodes in overlay expanded device tree
  * @target:target of the overlay operation
- * @overlay:   pointer to the overlay contents node
- *
- * Holds a single overlay state, including all the overlay logs &
- * records.
+ * @overlay:   pointer to the __overlay__ node
  */
-struct of_overlay_info {
+struct fragment {
struct device_node *target;
struct device_node *overlay;
bool is_symbols_node;
 };
 
 /**
- * struct of_overlay - Holds a complete overlay transaction
- * @node:  List on which we are located
- * @count: Count of ovinfo structures
- * @ovinfo_tab:Overlay info table (count sized)
- * @cset:  Changeset to be used
- *
- * Holds a complete overlay transaction
+ * struct overlay_changeset
+ * @ovcs_list: list on which we are located
+ * @count: count of @fragments structures
+ * @fragments: info about fragment nodes in overlay expanded device tree
+ * @cset:  changeset to apply fragments to live device tree
  */
-struct of_overlay {
+struct overlay_changeset {
int id;
-   struct list_head node;
+   struct list_head ovcs_list;
int count;
-   struct of_overl

[PATCH 00/12] of: overlay: clean up device tree overlay code

2017-10-02 Thread frowand . list
From: Frank Rowand 

I have found the device tree overlay code to be difficult to read and
maintain.  This patch series attempts to improve that situation.

The cleanup includes some changes visible to users of overlays.  The
only in kernel user of overlays is fixed up for those changes.  The
in kernel user is:

   drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c

Following the cleanup patches are a set of patches to fix various
issues.

The first five patches are intended not to make any functional
changes, and are segregated to ease review.

Frank Rowand (12):
  of: overlay.c: Remove comments that state the obvious, to reduce
clutter
  of: overlay.c: Convert comparisons to zero or NULL to logical
expressions
  of: overlay: rename identifiers to more reflect what they do
  of: overlay: rename identifiers in dup_and_fixup_symbol_prop()
  of: overlay: minor restructuring
  of: overlay: detect cases where device tree may become corrupt
  of: overlay: expand check of whether overlay changeset can be removed
  of: overlay: loosen overly strict phandle clash check
  of: overlay: avoid race condition between applying multiple overlays
  of: overlay: simplify applying symbols from an overlay
  of: overlay: remove a dependency on device node full_name
  of: overlay: remove unneeded check for NULL kbasename()

 Documentation/devicetree/overlay-notes.txt   |   12 +-
 drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c |   15 +-
 drivers/of/base.c|2 +-
 drivers/of/dynamic.c |  137 +++-
 drivers/of/of_private.h  |   10 +-
 drivers/of/overlay.c | 1024 --
 drivers/of/unittest.c|   80 +-
 include/linux/of.h   |   33 +-
 8 files changed, 871 insertions(+), 442 deletions(-)

-- 
Frank Rowand 
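
For orientation, the reworked in-kernel API described in the
overlay-notes.txt hunk of patch 03 boils down to an apply/remove pair
keyed by a changeset id. A minimal sketch of a caller follows, using the
of_overlay_apply(overlay, &ovcs_id) signature that patch 06 introduces;
of_overlay_remove() is shown by analogy with the documentation, so its
exact signature is an assumption here.

/* Sketch of an in-kernel overlay user after this series. */
static int example_use_overlay(struct device_node *overlay)
{
	int ovcs_id = 0;
	int ret;

	ret = of_overlay_apply(overlay, &ovcs_id);
	if (ret)
		return ret;

	/* ... use the devices the overlay created ... */

	/* assumed signature, per the overlay-notes.txt wording */
	return of_overlay_remove(&ovcs_id);
}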



[PATCH 04/12] of: overlay: rename identifiers in dup_and_fixup_symbol_prop()

2017-10-02 Thread frowand . list
From: Frank Rowand 

More renaming of identifiers to better reflect what they do.

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 24 ++--
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 397ef10d1f26..c350742ed2a2 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -90,17 +90,29 @@ static int overlay_notify(struct overlay_changeset *ovcs,
return 0;
 }
 
+/*
+ * The properties in the "/__symbols__" node are "symbols".
+ *
+ * The value of properties in the "/__symbols__" node is the path of a
+ * node in the subtree of a fragment node's "__overlay__" node, for
+ * example "/fragment@0/__overlay__/symbol_path_tail".  Symbol_path_tail
+ * can be a single node or it may be a multi-node path.
+ *
+ * The duplicated property value will be modified by replacing the
+ * "/fragment_name/__overlay/" portion of the value  with the target
+ * path from the fragment node.
+ */
 static struct property *dup_and_fixup_symbol_prop(
struct overlay_changeset *ovcs, const struct property *prop)
 {
struct fragment *fragment;
struct property *new;
const char *overlay_name;
-   char *label_path;
+   char *symbol_path_tail;
char *symbol_path;
const char *target_path;
int k;
-   int label_path_len;
+   int symbol_path_tail_len;
int overlay_name_len;
int target_path_len;
 
@@ -126,18 +138,18 @@ static struct property *dup_and_fixup_symbol_prop(
target_path = fragment->target->full_name;
target_path_len = strlen(target_path);
 
-   label_path = symbol_path + overlay_name_len;
-   label_path_len = strlen(label_path);
+   symbol_path_tail = symbol_path + overlay_name_len;
+   symbol_path_tail_len = strlen(symbol_path_tail);
 
new->name = kstrdup(prop->name, GFP_KERNEL);
-   new->length = target_path_len + label_path_len + 1;
+   new->length = target_path_len + symbol_path_tail_len + 1;
new->value = kzalloc(new->length, GFP_KERNEL);
 
if (!new->name || !new->value)
goto err_free;
 
strcpy(new->value, target_path);
-   strcpy(new->value + target_path_len, label_path);
+   strcpy(new->value + target_path_len, symbol_path_tail);
 
of_property_set_flag(new, OF_DYNAMIC);
 
-- 
Frank Rowand 
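
A worked example of the splice dup_and_fixup_symbol_prop() performs,
modelled in user space with invented paths:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* all values invented for illustration */
	const char *symbol_path  = "/fragment@0/__overlay__/ledctl";
	const char *overlay_name = "/fragment@0/__overlay__";
	const char *target_path  = "/ocp/gpio@48000000";
	char fixed[128];

	/* symbol_path_tail: the part below the __overlay__ node */
	const char *symbol_path_tail = symbol_path + strlen(overlay_name);

	/* the kernel code does this with two strcpy() calls */
	snprintf(fixed, sizeof(fixed), "%s%s", target_path, symbol_path_tail);

	printf("%s\n", fixed);	/* prints /ocp/gpio@48000000/ledctl */
	return 0;
}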



[PATCH 07/12] of: overlay: expand check of whether overlay changeset can be removed

2017-10-02 Thread frowand . list
From: Frank Rowand 

The test of whether it is safe to remove an overlay changeset
looked at whether any node in the overlay changeset was in a
subtree rooted at any more recently applied overlay changeset
node.

The test failed to determine whether any node in the overlay
changeset was the root of a subtree that contained a more
recently applied overlay changeset node.  Add this additional
check to the test.

The test still lacks any check for phandle dependencies.

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 20 +---
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index c7526186b1c8..015d8b112f60 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -698,13 +698,13 @@ static int find_node(struct device_node *tree, struct 
device_node *np)
 }
 
 /*
- * Is @remove_ce_np a child of or the same as any
+ * Is @remove_ce_node a child of, a parent of, or the same as any
  * node in an overlay changeset more topmost than @remove_ovcs?
  *
  * Returns 1 if found, else 0
  */
-static int node_in_later_cs(struct overlay_changeset *remove_ovcs,
-   struct device_node *remove_ce_np)
+static int node_overlaps_later_cs(struct overlay_changeset *remove_ovcs,
+   struct device_node *remove_ce_node)
 {
struct overlay_changeset *ovcs;
struct of_changeset_entry *ce;
@@ -714,10 +714,16 @@ static int node_in_later_cs(struct overlay_changeset 
*remove_ovcs,
break;
 
list_for_each_entry(ce, &ovcs->cset.entries, node) {
-   if (find_node(ce->np, remove_ce_np)) {
-   pr_err("%s: #%d clashes #%d @%pOF\n",
+   if (find_node(ce->np, remove_ce_node)) {
+   pr_err("%s: #%d overlaps with #%d @%pOF\n",
__func__, remove_ovcs->id, ovcs->id,
-   remove_ce_np);
+   remove_ce_node);
+   return 1;
+   }
+   if (find_node(remove_ce_node, ce->np)) {
+   pr_err("%s: #%d overlaps with #%d @%pOF\n",
+   __func__, remove_ovcs->id, ovcs->id,
+   remove_ce_node);
return 1;
}
}
@@ -741,7 +747,7 @@ static int overlay_removal_is_ok(struct overlay_changeset 
*remove_ovcs)
struct of_changeset_entry *remove_ce;
 
list_for_each_entry(remove_ce, &remove_ovcs->cset.entries, node) {
-   if (node_in_later_cs(remove_ovcs, remove_ce->np)) {
+   if (node_overlaps_later_cs(remove_ovcs, remove_ce->np)) {
pr_err("overlay #%d is not topmost\n", remove_ovcs->id);
return 0;
}
-- 
Frank Rowand 
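
The diff relies on find_node(), which is not quoted above. A hedged
sketch of such a helper, assuming a simple recursive subtree search (the
actual implementation in overlay.c may differ in detail):

/* Sketch: returns 1 if @np is @tree or lies within its subtree. */
static int find_node(struct device_node *tree, struct device_node *np)
{
	struct device_node *child;

	if (tree == np)
		return 1;

	for_each_child_of_node(tree, child) {
		if (find_node(child, np)) {
			of_node_put(child);
			return 1;
		}
	}

	return 0;
}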



[PATCH 06/12] of: overlay: detect cases where device tree may become corrupt

2017-10-02 Thread frowand . list
From: Frank Rowand 

When an attempt to apply an overlay changeset fails, an effort
is made to revert any partial application of the changeset.
When an attempt to remove an overlay changeset fails, an effort
is made to re-apply any partial reversion of the changeset.

The existing code does not check for failure to recover a failed
overlay changeset application or overlay changeset revert.

Add the missing checks and flag the devicetree as corrupt if the
state of the devicetree can not be determined.

Improve and expand the returned errors to more fully reflect the
result of the effort to undo the partial effects of a failed attempt
to apply or remove an overlay changeset.

If the device tree might be corrupt, do not allow further attempts
to apply or remove an overlay changeset.

When creating an overlay changeset from an overlay device tree,
add some additional warnings if the state of the overlay device
tree is not as expected.

Signed-off-by: Frank Rowand 
---
 drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c |   5 +-
 drivers/of/dynamic.c | 135 +++---
 drivers/of/of_private.h  |   8 +-
 drivers/of/overlay.c | 253 ++-
 drivers/of/unittest.c|  57 +++---
 include/linux/of.h   |  10 +-
 6 files changed, 372 insertions(+), 96 deletions(-)

diff --git a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c 
b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
index 5f5b7ba35f1d..7a7be0515bfd 100644
--- a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
+++ b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
@@ -204,7 +204,7 @@ static void __init tilcdc_convert_slave_node(void)
/* For all memory needed for the overlay tree. This memory can
   be freed after the overlay has been applied. */
struct kfree_table kft;
-   int ret;
+   int ovcs_id, ret;
 
if (kfree_table_init(&kft))
return;
@@ -247,7 +247,8 @@ static void __init tilcdc_convert_slave_node(void)
 
tilcdc_node_disable(slave);
 
-   ret = of_overlay_apply(overlay);
+   ovcs_id = 0;
+   ret = of_overlay_apply(overlay, &ovcs_id);
if (ret)
pr_err("%s: Applying overlay changeset failed: %d\n",
__func__, ret);
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index 124510d56421..c1026efd6f9e 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -508,11 +508,12 @@ static void __of_changeset_entry_invert(struct 
of_changeset_entry *ce,
}
 }
 
-static void __of_changeset_entry_notify(struct of_changeset_entry *ce, bool 
revert)
+static int __of_changeset_entry_notify(struct of_changeset_entry *ce,
+   bool revert)
 {
struct of_reconfig_data rd;
struct of_changeset_entry ce_inverted;
-   int ret;
+   int ret = 0;
 
if (revert) {
__of_changeset_entry_invert(ce, &ce_inverted);
@@ -534,11 +535,12 @@ static void __of_changeset_entry_notify(struct 
of_changeset_entry *ce, bool reve
default:
pr_err("invalid devicetree changeset action: %i\n",
(int)ce->action);
-   return;
+   ret = -EINVAL;
}
 
if (ret)
pr_err("changeset notifier error @%pOF\n", ce->np);
+   return ret;
 }
 
 static int __of_changeset_entry_apply(struct of_changeset_entry *ce)
@@ -672,32 +674,82 @@ void of_changeset_destroy(struct of_changeset *ocs)
 }
 EXPORT_SYMBOL_GPL(of_changeset_destroy);
 
-int __of_changeset_apply(struct of_changeset *ocs)
+/*
+ * Apply the changeset entries in @ocs.
+ * If apply fails, an attempt is made to revert the entries that were
+ * successfully applied.
+ *
+ * If multiple revert errors occur then only the final revert error is 
reported.
+ *
+ * Returns 0 on success, a negative error value in case of an error.
+ * If a revert error occurs, it is returned in *ret_revert.
+ */
+int __of_changeset_apply_entries(struct of_changeset *ocs, int *ret_revert)
 {
struct of_changeset_entry *ce;
-   int ret;
+   int ret, ret_tmp;
 
-   /* perform the rest of the work */
pr_debug("changeset: applying...\n");
list_for_each_entry(ce, &ocs->entries, node) {
ret = __of_changeset_entry_apply(ce);
if (ret) {
pr_err("Error applying changeset (%d)\n", ret);
-   list_for_each_entry_continue_reverse(ce, &ocs->entries, 
node)
-   __of_changeset_entry_revert(ce);
+   list_for_each_entry_continue_reverse(ce, &ocs->entries,
+node) {
+   ret_tmp = __of_changeset_entry_revert(ce);
+   if (ret_tmp)
+   *ret_revert = ret_tmp;
+   }
  

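The hunk above is truncated by the digest. The caller pattern the new
interface implies is sketched below; devicetree_corrupt() is visible
later in this series, but the exact control flow here is an assumption,
not the literal patch text.

/* Sketch of a caller of __of_changeset_apply_entries(). */
static int example_apply(struct overlay_changeset *ovcs)
{
	int ret, ret_revert = 0;

	ret = __of_changeset_apply_entries(&ovcs->cset, &ret_revert);
	if (ret && ret_revert) {
		/* apply failed AND the recovery revert failed: the
		 * live tree state can no longer be determined */
		return devicetree_corrupt();
	}

	return ret;
}
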
[PATCH 08/12] of: overlay: loosen overly strict phandle clash check

2017-10-02 Thread frowand . list
From: Frank Rowand 

When an overlay contains a node that already exists in
the live device tree, the overlay node is not allowed
to change the phandle of the existing node.

The existing check refused to allow an overlay node to
set the node phandle even when the existing node did
not have a phandle.  Relax the check to allow an
overlay node to set the phandle value if the existing
node does not have a phandle.

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 015d8b112f60..a0d3222febdc 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -302,10 +302,10 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
return build_changeset_next_level(ovcs, tchild, node, 0);
}
 
-   if (node->phandle)
-   return -EINVAL;
-
-   ret = build_changeset_next_level(ovcs, tchild, node, 0);
+   if (node->phandle && tchild->phandle)
+   ret = -EINVAL;
+   else
+   ret = build_changeset_next_level(ovcs, tchild, node, 0);
of_node_put(tchild);
 
return ret;
-- 
Frank Rowand 
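
Spelled out, the relaxed check accepts three of the four phandle
combinations; only a true clash is rejected (decision table matching the
diff above):

/*
 * overlay node phandle | existing node phandle | result
 * ---------------------+-----------------------+--------------------------
 * unset                | unset                 | merge, no phandle set
 * unset                | set                   | merge, existing kept
 * set                  | unset                 | merge, overlay value set
 * set                  | set                   | -EINVAL (clash)
 */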



[PATCH 02/12] of: overlay.c: Convert comparisons to zero or NULL to logical expressions

2017-10-02 Thread frowand . list
From: Frank Rowand 

Use normal shorthand for comparing a variable to zero.
For variable "XXX":
   convert (XXX == 0) to (!XXX)
   convert (XXX != 0) to (XXX)

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 36 ++--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 26f63f10f4b0..8e0c7eb4858c 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -168,9 +168,9 @@ static int of_overlay_apply_single_property(struct 
of_overlay *ov,
 
tprop = of_find_property(target, prop->name, NULL);
 
-   if (of_prop_cmp(prop->name, "name") == 0 ||
-   of_prop_cmp(prop->name, "phandle") == 0 ||
-   of_prop_cmp(prop->name, "linux,phandle") == 0)
+   if (!of_prop_cmp(prop->name, "name") ||
+   !of_prop_cmp(prop->name, "phandle") ||
+   !of_prop_cmp(prop->name, "linux,phandle"))
return 0;
 
if (is_symbols_node) {
@@ -181,10 +181,10 @@ static int of_overlay_apply_single_property(struct 
of_overlay *ov,
propn = __of_prop_dup(prop, GFP_KERNEL);
}
 
-   if (propn == NULL)
+   if (!propn)
return -ENOMEM;
 
-   if (tprop == NULL)
+   if (!tprop)
return of_changeset_add_property(&ov->cset, target, propn);
 
return of_changeset_update_property(&ov->cset, target, propn);
@@ -205,14 +205,14 @@ static int of_overlay_apply_single_device_node(struct 
of_overlay *ov,
int ret = 0;
 
cname = kbasename(child->full_name);
-   if (cname == NULL)
+   if (!cname)
return -ENOMEM;
 
for_each_child_of_node(target, tchild)
if (!of_node_cmp(cname, kbasename(tchild->full_name)))
break;
 
-   if (tchild != NULL) {
+   if (tchild) {
if (child->phandle)
return -EINVAL;
 
@@ -270,7 +270,7 @@ static int of_overlay_apply_one(struct of_overlay *ov,
 
for_each_child_of_node(overlay, child) {
ret = of_overlay_apply_single_device_node(ov, target, child);
-   if (ret != 0) {
+   if (ret) {
pr_err("Failed to apply single node @%pOF/%s\n",
   target, child->name);
of_node_put(child);
@@ -299,7 +299,7 @@ static int of_overlay_apply(struct of_overlay *ov)
 
err = of_overlay_apply_one(ov, ovinfo->target, ovinfo->overlay,
   ovinfo->is_symbols_node);
-   if (err != 0) {
+   if (err) {
pr_err("apply failed '%pOF'\n", ovinfo->target);
return err;
}
@@ -322,11 +322,11 @@ static struct device_node *find_target_node(struct 
device_node *info_node)
int ret;
 
ret = of_property_read_u32(info_node, "target", &val);
-   if (ret == 0)
+   if (!ret)
return of_find_node_by_phandle(val);
 
ret = of_property_read_string(info_node, "target-path", &path);
-   if (ret == 0)
+   if (!ret)
return of_find_node_by_path(path);
 
pr_err("Failed to find target for node %p (%s)\n",
@@ -353,11 +353,11 @@ static int of_fill_overlay_info(struct of_overlay *ov,
struct device_node *info_node, struct of_overlay_info *ovinfo)
 {
ovinfo->overlay = of_get_child_by_name(info_node, "__overlay__");
-   if (ovinfo->overlay == NULL)
+   if (!ovinfo->overlay)
goto err_fail;
 
ovinfo->target = find_target_node(info_node);
-   if (ovinfo->target == NULL)
+   if (!ovinfo->target)
goto err_fail;
 
return 0;
@@ -397,13 +397,13 @@ static int of_build_overlay_info(struct of_overlay *ov,
cnt++;
 
ovinfo = kcalloc(cnt, sizeof(*ovinfo), GFP_KERNEL);
-   if (ovinfo == NULL)
+   if (!ovinfo)
return -ENOMEM;
 
cnt = 0;
for_each_child_of_node(tree, node) {
err = of_fill_overlay_info(ov, node, &ovinfo[cnt]);
-   if (err == 0)
+   if (!err)
cnt++;
}
 
@@ -421,7 +421,7 @@ static int of_build_overlay_info(struct of_overlay *ov,
cnt++;
}
 
-   if (cnt == 0) {
+   if (!cnt) {
kfree(ovinfo);
return -ENODEV;
}
@@ -477,7 +477,7 @@ int of_overlay_create(struct device_node *tree)
int err, id;
 
ov = kzalloc(sizeof(*ov), GFP_KERNEL);
-   if (ov == NULL)
+   if (!ov)
return -ENOMEM;
ov->id = -1;
 
@@ -628,7 +628,7 @@ int of_overlay_destroy(int id)
mutex_lock(&of_mutex);
 
ov = idr_find(&ov_idr, id);
-   if (ov == NULL) {
+   if (!ov) {
err = -ENODEV;
pr_err("destroy: Could not find overlay #%d\n", id);

[PATCH 11/12] of: overlay: remove a dependency on device node full_name

2017-10-02 Thread frowand . list
From: Frank Rowand 

The "%pOF" printf format was recently added to print the
full name of a device tree node, with the intent of changing
the node full_name field to contain only the node name instead
of the full path of the node.

dup_and_fixup_symbol_prop() duplicates a property from the
"/__symbols__" node of an overlay device tree.  The value
of each duplicated property must be fixed up to include
the full path of a node in the live device tree.  The
current code uses the node's full_name for that purpose.
Update the code to use the "%pOF" printf format to
determine the node's full path.

Signed-off-by: Frank Rowand 
---
 drivers/of/base.c   |  2 +-
 drivers/of/of_private.h |  2 ++
 drivers/of/overlay.c| 90 ++---
 3 files changed, 59 insertions(+), 35 deletions(-)

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 260d33c0f26c..6f91fa67e5bb 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -760,7 +760,7 @@ struct device_node *of_get_child_by_name(const struct 
device_node *node,
 }
 EXPORT_SYMBOL(of_get_child_by_name);
 
-static struct device_node *__of_find_node_by_path(struct device_node *parent,
+struct device_node *__of_find_node_by_path(struct device_node *parent,
const char *path)
 {
struct device_node *child;
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index b66e8a812147..0c9e473801f2 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -81,6 +81,8 @@ extern void *__unflatten_device_tree(const void *blob,
 struct property *__of_prop_dup(const struct property *prop, gfp_t allocflags);
 __printf(2, 3) struct device_node *__of_node_dup(const struct device_node *np, 
const char *fmt, ...);
 
+struct device_node *__of_find_node_by_path(struct device_node *parent,
+   const char *path);
 struct device_node *__of_find_node_by_full_path(struct device_node *node,
const char *path);
 
diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 172807d3f375..81881e45f273 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -37,6 +37,7 @@ struct fragment {
 /**
  * struct overlay_changeset
  * @ovcs_list: list on which we are located
+ * @overlay_tree:  expanded device tree that contains the fragment nodes
  * @count: count of fragment structures
  * @fragments: fragment nodes in the overlay expanded device tree
  * @symbols_fragment:  last element of @fragments[] is the  __symbols__ node
@@ -45,6 +46,7 @@ struct fragment {
 struct overlay_changeset {
int id;
struct list_head ovcs_list;
+   struct device_node *overlay_tree;
int count;
struct fragment *fragments;
bool symbols_fragment;
@@ -145,12 +147,13 @@ static int overlay_notify(struct overlay_changeset *ovcs,
 }
 
 /*
- * The properties in the "/__symbols__" node are "symbols".
+ * The values of properties in the "/__symbols__" node are paths in
+ * the ovcs->overlay_tree.  When duplicating the properties, the paths
+ * need to be adjusted to be the correct path for the live device tree.
  *
- * The value of properties in the "/__symbols__" node is the path of a
- * node in the subtree of a fragment node's "__overlay__" node, for
- * example "/fragment@0/__overlay__/symbol_path_tail".  Symbol_path_tail
- * can be a single node or it may be a multi-node path.
+ * The paths refer to a node in the subtree of a fragment node's "__overlay__"
+ * node, for example "/fragment@0/__overlay__/symbol_path_tail",
+ * where symbol_path_tail can be a single node or it may be a multi-node path.
  *
  * The duplicated property value will be modified by replacing the
  * "/fragment_name/__overlay/" portion of the value  with the target
@@ -160,59 +163,76 @@ static struct property *dup_and_fixup_symbol_prop(
struct overlay_changeset *ovcs, const struct property *prop)
 {
struct fragment *fragment;
-   struct property *new;
-   const char *overlay_name;
-   char *symbol_path_tail;
-   char *symbol_path;
+   struct property *new_prop;
+   struct device_node *fragment_node;
+   struct device_node *overlay_node;
+   const char *path;
+   const char *path_tail;
const char *target_path;
int k;
-   int symbol_path_tail_len;
int overlay_name_len;
+   int path_len;
+   int path_tail_len;
int target_path_len;
 
if (!prop->value)
return NULL;
-   symbol_path = prop->value;
+   if (strnlen(prop->value, prop->length) >= prop->length)
+   return NULL;
+   path = prop->value;
+   path_len = strlen(path);
 
-   new = kzalloc(sizeof(*new), GFP_KERNEL);
-   if (!new)
+   if (path_len < 1)
return NULL;
+   fragment_node = __of_find_node_by_path(ovcs->overlay_tree, path 

[PATCH 10/12] of: overlay: simplify applying symbols from an overlay

2017-10-02 Thread frowand . list
From: Frank Rowand 

The code to apply symbols from an overlay to the live device tree
was implemented with the intent to be minimally intrusive on the
existing code.  After recent restructuring of the overlay apply
code, it is easier to disentangle the code that applies the
symbols, and to make the overlay changeset creation code more
straightforward and understandable.

Remove the extra complexity, and make the code more obvious.

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 91 +---
 1 file changed, 65 insertions(+), 26 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 4ed372af6ce7..172807d3f375 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -32,21 +32,22 @@
 struct fragment {
struct device_node *target;
struct device_node *overlay;
-   bool is_symbols_node;
 };
 
 /**
  * struct overlay_changeset
- * @ovcs_list: list on which we are located
- * @count: count of @fragments structures
- * @fragments: info about fragment nodes in overlay expanded device tree
- * @cset:  changeset to apply fragments to live device tree
+ * @ovcs_list: list on which we are located
+ * @count: count of fragment structures
+ * @fragments: fragment nodes in the overlay expanded device tree
+ * @symbols_fragment:  last element of @fragments[] is the  __symbols__ node
+ * @cset:  changeset to apply fragments to live device tree
  */
 struct overlay_changeset {
int id;
struct list_head ovcs_list;
int count;
struct fragment *fragments;
+   bool symbols_fragment;
struct of_changeset cset;
 };
 
@@ -68,8 +69,7 @@ static int devicetree_corrupt(void)
 
 static int build_changeset_next_level(struct overlay_changeset *ovcs,
struct device_node *target_node,
-   const struct device_node *overlay_node,
-   bool is_symbols_node);
+   const struct device_node *overlay_node);
 
 /*
  * of_resolve_phandles() finds the largest phandle in the live tree.
@@ -221,7 +221,7 @@ static struct property *dup_and_fixup_symbol_prop(
  * @ovcs:  overlay changeset
  * @target_node:   where to place @overlay_prop in live tree
  * @overlay_prop:  property to add or update, from overlay tree
- * is_symbols_node:1 if @target_node is "/__symbols__"
+ * @is_symbols_prop:   1 if @overlay_prop is from node "/__symbols__"
  *
  * If @overlay_prop does not already exist in @target_node, add changeset entry
  * to add @overlay_prop in @target_node, else add changeset entry to update
@@ -237,7 +237,7 @@ static struct property *dup_and_fixup_symbol_prop(
 static int add_changeset_property(struct overlay_changeset *ovcs,
struct device_node *target_node,
struct property *overlay_prop,
-   bool is_symbols_node)
+   bool is_symbols_prop)
 {
struct property *new_prop = NULL, *prop;
 
@@ -248,7 +248,7 @@ static int add_changeset_property(struct overlay_changeset 
*ovcs,
!of_prop_cmp(overlay_prop->name, "linux,phandle"))
return 0;
 
-   if (is_symbols_node) {
+   if (is_symbols_prop) {
if (prop)
return -EINVAL;
new_prop = dup_and_fixup_symbol_prop(ovcs, overlay_prop);
@@ -321,13 +321,13 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
if (ret)
return ret;
 
-   return build_changeset_next_level(ovcs, tchild, node, 0);
+   return build_changeset_next_level(ovcs, tchild, node);
}
 
if (node->phandle && tchild->phandle)
ret = -EINVAL;
else
-   ret = build_changeset_next_level(ovcs, tchild, node, 0);
+   ret = build_changeset_next_level(ovcs, tchild, node);
of_node_put(tchild);
 
return ret;
@@ -338,7 +338,6 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
  * @ovcs:  overlay changeset
  * @target_node:   where to place @overlay_node in live tree
  * @overlay_node:  node from within an overlay device tree fragment
- * @is_symbols_node:   @overlay_node is node "/__symbols__"
  *
  * Add the properties (if any) and nodes (if any) from @overlay_node to the
  * @ovcs->cset changeset.  If an added node has child nodes, they will
@@ -351,16 +350,14 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
  */
 static int build_changeset_next_level(struct overlay_changeset *ovcs,
struct device_node *target_node,
-   const struct device_node *overlay_node,
-   bool is_symbols_node)
+   const struct device_node *overlay_node)
 {
struct device_node *child;
struct property *prop;
int ret;
 
for_each_property_of_node(overlay_node, prop) {
-   ret = add_cha

[PATCH 05/12] of: overlay: minor restructuring

2017-10-02 Thread frowand . list
From: Frank Rowand 

Continue improving the readability of overlay.c.  The previous patches
renamed identifiers.  This patch is split out from them to make those
renames easier to review.

Changes are:
  - minor code restructuring
  - some initialization of an overlay changeset occurred outside of
init_overlay_changeset(), move that into init_overlay_changeset()
  - consolidate freeing an overlay changeset into free_overlay_changeset()

This patch is intended to not introduce any functional change.

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 205 +++
 1 file changed, 92 insertions(+), 113 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index c350742ed2a2..0f92a5c26748 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -55,6 +55,9 @@ static int build_changeset_next_level(struct 
overlay_changeset *ovcs,
const struct device_node *overlay_node,
bool is_symbols_node);
 
+static LIST_HEAD(ovcs_list);
+static DEFINE_IDR(ovcs_idr);
+
 static BLOCKING_NOTIFIER_HEAD(overlay_notify_chain);
 
 int of_overlay_notifier_register(struct notifier_block *nb)
@@ -160,8 +163,6 @@ static struct property *dup_and_fixup_symbol_prop(
kfree(new->value);
kfree(new);
return NULL;
-
-
 }
 
 /**
@@ -249,13 +250,7 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
if (!of_node_cmp(node_kbasename, kbasename(tchild->full_name)))
break;
 
-   if (tchild) {
-   if (node->phandle)
-   return -EINVAL;
-
-   ret = build_changeset_next_level(ovcs, tchild, node, 0);
-   of_node_put(tchild);
-   } else {
+   if (!tchild) {
tchild = __of_node_dup(node, "%pOF/%s",
   target_node, node_kbasename);
if (!tchild)
@@ -267,11 +262,15 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
if (ret)
return ret;
 
-   ret = build_changeset_next_level(ovcs, tchild, node, 0);
-   if (ret)
-   return ret;
+   return build_changeset_next_level(ovcs, tchild, node, 0);
}
 
+   if (node->phandle)
+   return -EINVAL;
+
+   ret = build_changeset_next_level(ovcs, tchild, node, 0);
+   of_node_put(tchild);
+
return ret;
 }
 
@@ -385,41 +384,6 @@ static struct device_node *find_target_node(struct 
device_node *info_node)
 }
 
 /**
- * of_fill_overlay_info() - Fill an overlay info structure
- * @ov Overlay to fill
- * @info_node: Device node containing the overlay
- * @ovinfo:Pointer to the overlay info structure to fill
- *
- * Fills an overlay info structure with the overlay information
- * from a device node. This device node must have a target property
- * which contains a phandle of the overlay target node, and an
- * __overlay__ child node which has the overlay contents.
- * Both ovinfo->target & ovinfo->overlay have their references taken.
- *
- * Returns 0 on success, or a negative error value.
- */
-static int of_fill_overlay_info(struct of_overlay *ov,
-   struct device_node *info_node, struct of_overlay_info *ovinfo)
-{
-   ovinfo->overlay = of_get_child_by_name(info_node, "__overlay__");
-   if (!ovinfo->overlay)
-   goto err_fail;
-
-   ovinfo->target = find_target_node(info_node);
-   if (!ovinfo->target)
-   goto err_fail;
-
-   return 0;
-
-err_fail:
-   of_node_put(ovinfo->target);
-   of_node_put(ovinfo->overlay);
-
-   memset(ovinfo, 0, sizeof(*ovinfo));
-   return -EINVAL;
-}
-
-/**
  * init_overlay_changeset() - initialize overlay changeset from overlay tree
  * @ovcs   Overlay changeset to build
  * @tree:  Contains all the overlay fragments and overlay fixup nodes
@@ -429,32 +393,61 @@ static int of_fill_overlay_info(struct of_overlay *ov,
  * nodes and the __symbols__ node.  Any other top level node will be ignored.
  *
  * Returns 0 on success, -ENOMEM if memory allocation failure, -EINVAL if error
- * detected in @tree, or -ENODEV if no valid nodes found.
+ * detected in @tree, or -ENOSPC if idr_alloc() error.
  */
 static int init_overlay_changeset(struct overlay_changeset *ovcs,
struct device_node *tree)
 {
-   struct device_node *node;
+   struct device_node *node, *overlay_node;
struct fragment *fragment;
struct fragment *fragments;
int cnt, ret;
 
+   INIT_LIST_HEAD(&ovcs->ovcs_list);
+
+   of_changeset_init(&ovcs->cset);
+
+   ovcs->id = idr_alloc(&ovcs_idr, ovcs, 1, 0, GFP_KERNEL);
+   if (ovcs->id <= 0)
+   return ovcs->id;
+
cnt = 0;
-   for_each_child_of_node(tree, node)
-   cnt++;
 
-   if (of_get_child_by_name(tree, "__symbols__"))

[PATCH 09/12] of: overlay: avoid race condition between applying multiple overlays

2017-10-02 Thread frowand . list
From: Frank Rowand 

The process of applying an overlay consists of:
  - unflatten an overlay FDT (flattened device tree) into an
EDT (expanded device tree)
  - fixup the phandle values in the overlay EDT to fit in a
range above the phandle values in the live device tree
  - create the overlay changeset to reflect the contents of
the overlay EDT
  - apply the overlay changeset, to modify the live device tree,
potentially changing the maximum phandle value in the live
device tree

There is currently no protection against two overlay applies
concurrently determining what range of phandle values are in use
in the live device tree, and subsequently changing that range.
Add a mutex to prevent multiple overlay applies from occurring
simultaneously.

Ignoring 2 checkpatch warnings: Prefer using '"%s...", __func__'
so that the WARN() string will be more easily grepped.

Signed-off-by: Frank Rowand 
---
 drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c |  7 +++
 drivers/of/overlay.c | 22 ++
 drivers/of/unittest.c| 21 +
 include/linux/of.h   | 19 +++
 4 files changed, 69 insertions(+)

diff --git a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c 
b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
index 7a7be0515bfd..c99f7924b1c6 100644
--- a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
+++ b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
@@ -221,6 +221,11 @@ static void __init tilcdc_convert_slave_node(void)
goto out;
}
 
+   /*
+* protect from of_resolve_phandles() through of_overlay_apply()
+*/
+   of_overlay_mutex_lock();
+
overlay = tilcdc_get_overlay(&kft);
if (!overlay)
goto out;
@@ -256,6 +261,8 @@ static void __init tilcdc_convert_slave_node(void)
pr_info("%s: ti,tilcdc,slave node successfully converted\n",
__func__);
 out:
+   of_overlay_mutex_unlock();
+
kfree_table_free(&kft);
of_node_put(i2c);
of_node_put(slave);
diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index a0d3222febdc..4ed372af6ce7 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -71,6 +71,28 @@ static int build_changeset_next_level(struct 
overlay_changeset *ovcs,
const struct device_node *overlay_node,
bool is_symbols_node);
 
+/*
+ * of_resolve_phandles() finds the largest phandle in the live tree.
+ * of_overlay_apply() may add a larger phandle to the live tree.
+ * Do not allow race between two overlays being applied simultaneously:
+ *mutex_lock(&of_overlay_phandle_mutex)
+ *of_resolve_phandles()
+ *of_overlay_apply()
+ *mutex_unlock(&of_overlay_phandle_mutex)
+ */
+static DEFINE_MUTEX(of_overlay_phandle_mutex);
+
+void of_overlay_mutex_lock(void)
+{
+   mutex_lock(&of_overlay_phandle_mutex);
+}
+
+void of_overlay_mutex_unlock(void)
+{
+   mutex_unlock(&of_overlay_phandle_mutex);
+}
+
+
 static LIST_HEAD(ovcs_list);
 static DEFINE_IDR(ovcs_idr);
 
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index db2f170186de..f4c8aff21320 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -994,9 +994,17 @@ static int __init unittest_data_add(void)
return -ENODATA;
}
of_node_set_flag(unittest_data_node, OF_DETACHED);
+
+   /*
+* This lock normally encloses of_overlay_apply() as well as
+* of_resolve_phandles().
+*/
+   of_overlay_mutex_lock();
+
rc = of_resolve_phandles(unittest_data_node);
if (rc) {
pr_err("%s: Failed to resolve phandles (rc=%i)\n", __func__, 
rc);
+   of_overlay_mutex_unlock();
return -EINVAL;
}
 
@@ -1006,6 +1014,7 @@ static int __init unittest_data_add(void)
__of_attach_node_sysfs(np);
of_aliases = of_find_node_by_path("/aliases");
of_chosen = of_find_node_by_path("/chosen");
+   of_overlay_mutex_unlock();
return 0;
}
 
@@ -1018,6 +1027,9 @@ static int __init unittest_data_add(void)
attach_node_and_children(np);
np = next;
}
+
+   of_overlay_mutex_unlock();
+
return 0;
 }
 
@@ -2150,9 +2162,12 @@ static int __init overlay_data_add(int onum)
}
of_node_set_flag(info->np_overlay, OF_DETACHED);
 
+   of_overlay_mutex_lock();
+
ret = of_resolve_phandles(info->np_overlay);
if (ret) {
pr_err("resolve ot phandles (ret=%d), %d\n", ret, onum);
+   of_overlay_mutex_unlock();
goto out_free_np_overlay;
}
 
@@ -2160,9 +2175,12 @@ static int __init overlay_data_add(int onum)
ret = of_overlay_apply(info->np_overlay, &info->overlay_id);
if (ret < 0) {
pr_err("of_overlay_ap

[PATCH 12/12] of: overlay: remove unneeded check for NULL kbasename()

2017-10-02 Thread frowand . list
From: Frank Rowand 

kbasename() will not return NULL if passed a valid string.  If
the parameter passed to kbasename() in this case is already NULL
then the devicetree has been corrupted.

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 81881e45f273..88df2986b03f 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -322,8 +322,6 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
int ret = 0;
 
node_kbasename = kbasename(node->full_name);
-   if (!node_kbasename)
-   return -ENOMEM;
 
for_each_child_of_node(target_node, tchild)
if (!of_node_cmp(node_kbasename, kbasename(tchild->full_name)))
-- 
Frank Rowand 
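
For reference, kbasename() as defined in include/linux/string.h; it can
return NULL only if handed a NULL path, which is what makes the removed
check dead code for a sane tree:

static inline const char *kbasename(const char *path)
{
	const char *tail = strrchr(path, '/');
	return tail ? tail + 1 : path;
}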



Re: 4879b7ae05 ("Merge tag 'dmaengine-4.12-rc1' of .."): WARNING: kernel stack regs at bd92bc2e in 01-cpu-hotplug:3811 has bad 'bp' value 000001be

2017-10-02 Thread Fengguang Wu

Hi Josh,

On Mon, Oct 02, 2017 at 04:31:09PM -0500, Josh Poimboeuf wrote:

On Mon, Oct 02, 2017 at 04:26:54PM -0500, Josh Poimboeuf wrote:

Fengguang, assuming it's reliably recreatable, any chance you could
recreate with the following patch?


Sure, I'll try your patch on v4.14-rc3 since it looks like the most
reproducible kernel. For the bisected 4879b7ae05, the warning only
shows up once out of 909 boots according to the below stats. So I'm
not sure whether it's the _first_ bad commit. To double confirm, I
just queued 5000 more boot tests for each of its parent commits.


ecc721a72c  Merge tag 'pwm/for-4.12-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/thierry.reding/linux-pwm
be13ec668d  Merge branch 'topic/pl330' into for-linus
4879b7ae05  Merge tag 'dmaengine-4.12-rc1' of git://git.infradead.org/users/vkoul/slave-dma
9e66317d3c  Linux 4.14-rc3
1418b85217  Add linux-next specific files for 20170929

+-------------------------------------------------------+------------+------------+------------+-----------+---------------+
|                                                       | ecc721a72c | be13ec668d | 4879b7ae05 | v4.14-rc3 | next-20170929 |
+-------------------------------------------------------+------------+------------+------------+-----------+---------------+
| boot_successes                                        | 1009       | 1009       | 909        | 5         | 510           |
| boot_failures                                         | 0          | 0          | 1          | 4         | 153           |
| WARNING:kernel_stack                                  | 0          | 0          | 1          | 3         | 111           |
| BUG:unable_to_handle_kernel                           | 0          | 0          | 0          | 3         | 48            |
| Oops:#[##]                                            | 0          | 0          | 0          | 3         | 48            |
| EIP:update_stack_state                                | 0          | 0          | 0          | 3         | 48            |
| Kernel_panic-not_syncing:Fatal_exception_in_interrupt | 0          | 0          | 0          | 3         | 48            |
| invoked_oom-killer:gfp_mask=0x                        | 0          | 0          | 0          | 1         | 16            |
| Mem-Info                                              | 0          | 0          | 0          | 1         | 16            |
| EIP:clear_user                                        | 0          | 0          | 0          | 0         | 2             |
| EIP:copy_page_to_iter                                 | 0          | 0          | 0          | 0         | 1             |
+-------------------------------------------------------+------------+------------+------------+-----------+---------------+



Sorry, here's a version which actually compiles.


OK.

Thanks,
Fengguang


diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d145a0b1f529..00234fa5a33a 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state)
state->stack_info.type, state->stack_info.next_sp,
state->stack_mask, state->graph_idx);

-   for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+   for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+        sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
break;



Re: [RESEND PATCH] prctl: add PR_[GS]ET_PDEATHSIG_PROC

2017-10-02 Thread Eric W. Biederman
Andrew Morton  writes:

> On Fri, 29 Sep 2017 14:30:58 +0200 Jürg Billeter  wrote:
>
>> PR_SET_PDEATHSIG sets a parent death signal that the calling process
>> will get when its parent thread dies, even when the result of getppid()
>> doesn't change because the calling process is reparented to a different
>> thread in the same parent process. When managing multiple processes, a
>> process-based parent death signal is much more useful. E.g., to avoid
>> stray child processes.
>> 
>> PR_SET_PDEATHSIG_PROC sets a process-based death signal. Unlike
>> PR_SET_PDEATHSIG, this is inherited across fork to allow killing a whole
>> subtree without race conditions.
>> 
>> This can be used for sandboxing when combined with a seccomp filter.
>> 
>> There have been previous attempts to support this by changing the
>> behavior of PR_SET_PDEATHSIG. However, that would break existing
>> applications. See https://marc.info/?l=linux-kernel&m=117621804801689
>> and https://bugzilla.kernel.org/show_bug.cgi?id=43300
>
> Are Eric and Oleg OK with this?
>
> A prctl manpage update will be needed, please (cc linux-api).

It makes for an interesting way of killing a process tree.  The domino
effect.

I believe the rationale for adding a new prctl is sound.

The code where it calls group_send_sig_info is buggy for pdeath_signal,
and it is no less buggy for this new case.  There is no point in checking
permissions when sending a signal to yourself, especially as this signal
gets cleared during exec with a change of permissions.


I would recommend using:
 do_send_sig_info(p->signal->pdeath_signal_proc, SEND_SIG_NOINFO, p, true);

Perhaps with a comment saying that no permission check is needed when
sending a signal to yourself.


I don't know what I think about inheritance over fork, and the whole
tree-killing thing.  Except when the signal is SIGKILL, I don't know if
that code does what is intended.  So I am a little leery of it.

Eric
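
For comparison, the existing thread-based mechanism as used from user
space; the proposed PR_SET_PDEATHSIG_PROC would be used the same way
(its constant is defined by the patch and not shown here), the
difference being that it survives fork() and tracks the parent process
rather than the parent thread.

#include <signal.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

int main(void)
{
	/* ask for SIGTERM when the parent *thread* dies */
	if (prctl(PR_SET_PDEATHSIG, SIGTERM) == -1) {
		perror("prctl");
		return 1;
	}

	/* close the race: the parent may have died before prctl() ran */
	if (getppid() == 1)
		raise(SIGTERM);

	/* child work here */
	pause();
	return 0;
}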




Re: [RFC v2 PATCH] x86/boot: Add the secdata section to the setup header

2017-10-02 Thread Gary Lin
On Fri, Sep 08, 2017 at 01:59:00PM -0700, h...@zytor.com wrote:
> On September 8, 2017 2:45:10 AM PDT, Gary Lin  wrote:
> >On Thu, Sep 07, 2017 at 02:16:21PM -0700, h...@zytor.com wrote:
> >> On September 7, 2017 2:44:51 AM PDT, Gary Lin  wrote:
> >> >On Thu, Jun 01, 2017 at 08:46:26AM +, Ard Biesheuvel wrote:
> >> >> On 1 June 2017 at 08:11, Gary Lin  wrote:
> >> >> > On Fri, May 12, 2017 at 04:05:34PM +0800, Gary Lin wrote:
> >> >> >> A new section, secdata, in the setup header is introduced to
> >> >> >> store the distro-specific security version which is designed to
> >> >> >> help the bootloader to warn the user when loading a less secure
> >> >> >> or vulnerable kernel. The secdata section can be presented as
> >> >> >> the following:
> >> >> >>
> >> >> >> struct sec_hdr {
> >> >> >>   __u16 header_length;
> >> >> >>   __u32 distro_version;
> >> >> >>   __u16 security_version;
> >> >> >> } __attribute__((packed));
> >> >> >> char *signer;
> >> >> >>
> >> >> >> It consists of a fixed size structure and a null-terminated
> >> >> >> string. "header_length" is the size of "struct sec_hdr" and can
> >> >> >> be used as the offset to "signer". It also can be a kind of the
> >> >> >> "header version" to detect if any new member is introduced.
> >> >> >>
> >> >> >> The kernel packager of the distribution can put the distro name
> >> >> >> in "signer" and the distro version in "distro_version". When a
> >> >> >> severe vulnerability is fixed, the packager increases
> >> >> >> "security_version" in the kernel build afterward. The bootloader
> >> >> >> can maintain a list of the security versions of the current
> >> >> >> kernels and only allows the kernel with a higher or equal
> >> >> >> security version to boot. If the user is going to boot a kernel
> >> >> >> with a lower security version, a warning should show to prevent
> >> >> >> the user from loading a vulnerable kernel accidentally.
> >> >> >>
> >> >> >> Enabling UEFI Secure Boot is recommended when using the security
> >> >> >> version or the attacker may alter the security version
> >> >> >> stealthily.
> >> >> >>
> >> >> > Any comment?
> >> >> >
> >> >> 
> >> >> This is now entirely x86-specific. My preference would be to have
> >> >> a generic solution instead.
> >> >> 
> >> >After checking the headers again, another idea came to my mind: the
> >> >MS-DOS stub. It's designed to show a warning while the image is
> >> >loaded in DOS(*), but I wonder if it still matters. In the x86 linux
> >> >efi header, the stub is just a 3-line message, while arm64 completely
> >> >ignores the stub.
> >> >
> >> >Since there is an offset to the PE header at 0x3c, we can
> >> >theoretically put anything between 0x40 and the PE header without
> >> >affecting the current settings.
> >> >
> >> >HPA,
> >> >
> >> >Does the MS-DOS stub raise any concern to you?
> >> >
> >> >Thanks,
> >> >
> >> >Gary Lin
> >> >
> >> >(*)
> >> >https://msdn.microsoft.com/zh-tw/library/windows/desktop/ms680547(v=vs.85).aspx#ms-dos_stub__image_only_
> >> >
> >> >> -- 
> >> >> Ard.
> >> >> 
> >> >> 
> >> >> >> v2:
> >> >> >> - Decrease the size of secdata_offset to 2 bytes since the setup
> >> >> >>   header is limited to around 32KB.
> >> >> >> - Restructure the secdata section. The signer is now a
> >> >> >>   null-terminated string. The type of distro_version changes to
> >> >> >>   u32 in case the distro uses a long version.
> >> >> >> - Modify the Kconfig names and add help.
> >> >> >> - Remove the signer name hack in build.c.
> >> >> >>
> >> >> >> Cc: Ard Biesheuvel 
> >> >> >> Cc: "H. Peter Anvin" 
> >> >> >> Cc: Thomas Gleixner 
> >> >> >> Cc: Ingo Molnar 
> >> >> >> Cc: Joey Lee 
> >> >> >> Signed-off-by: Gary Lin 
> >> >> >> ---
> >[snip]
> >> >> >> --
> >> >> >> 2.12.2
> >> >> >>
> >> >> 
> >> 
> >> I really don't think that is a good idea.  I would much rather keep
> >> this in a space we fully own.
> >Fine. I'll find another place for ARM64 (probably append the structure
> >right after the PE-header and denote the 2-byte offset in the reserved
> >fields in the first 64-byte header).
> >
> >Thanks,
> >
> >Gary Lin
> 
> Another "safe" option would be to put it in a COFF segment; then it
> would be system-independent.
Hi HPA,

Sorry for the late reply, I was travelling last two weeks.

In the beginning, I thought a new coff section was feasible. However, it
doesn't seem possible for x86-64.

Although the section itself can be anywhere, we have to register an
entry in the section table in the coff optional header. For the x86-64
kernel image, the section table starts at 0x13b while every entry takes
40 bytes. Currently, there are 4 sections: .setup, .reloc, .text, and
.bss, so the new entry would be added at (0x13b + 0x28 * 4) = 0x1db and
would end at 0x202. Unfortunately, the x86 boot protocol requires that
the setup header start at 0x1f1, so there is no room for a new entry in
the current layout.
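
Spelled out (illustrative values only, matching the layout above):

	unsigned int table_start = 0x13b; /* start of the section table */
	unsigned int entry_size  = 40;    /* bytes per COFF section entry */
	unsigned int new_entry   = table_start + 4 * entry_size; /* 0x1db */
	unsigned int new_end     = new_entry + entry_size;       /* 0x203 */
	/* 0x1db..0x202 overlaps the setup header, which starts at 0x1f1 */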

Re: [PATCH] vfs: hard-ban creating files with control characters in the name

2017-10-02 Thread Adam Borowski
On Tue, Oct 03, 2017 at 03:07:24AM +0100, Al Viro wrote:
> On Tue, Oct 03, 2017 at 02:50:42AM +0200, Adam Borowski wrote:
> > Anything with bytes 1-31,127 will get -EACCES.
> > 
> > Especially \n is bad: instead of natural file-per-line, you need a
> > user-unfriendly feature of -print0 added to every producer and consumer;
> > a good part of users either don't know or don't feel the need to bother
> > with escaping this snowflake, thus introducing security holes.
> > 
> > The rest of control characters, while not as harmful, don't have a
> > legitimate use nor have any real chance of coming from a tarball (malice
> > and fooling around excluded).  No character set ever supported as a system
> > locale by glibc, and, TTBMK, by other popular Unices, includes them, thus
> > it can be assumed no foreign files have such names other than artificially.
> > 
> > This goes in stark contrast with other characters proposed to be banned:
> > non-UTF8 is common, and even on my desktop's disk I found examples of all
> > of: [ ], < >, initial -, initial and final space, ?, *, .., ', ", |, &.
> > Somehow no \ anywhere.  I think I have an idea why no / .
> > 
> > Another debatable point is whether to -EACCES or to silently rename to an
> > escaped form such as %0A.  I believe the former is better because:
> > * programs can be confused if a directory has files they didn't just write
> > * many filesystems already disallow certain characters (like invalid
> >   Unicode), thus returning an error is consistent
> > 
> > An example of a write-up of this issue can be found at:
> > https://www.dwheeler.com/essays/fixing-unix-linux-filenames.html
> 
> That essay is full of shit, and you've even mentioned parts of that just
> above...

I used it as a list of problems, not solutions.

> NAK; you'd _still_ need proper quoting (or a shell with something
> resembling an actual syntax, rather than the "more or less what srb had
> ended up implementing"), so it doesn't really buy you anything.

Well, what about just \n then?  Unlike all the others, which are relatively
straightforward, \n requires -print0, which not all programs implement and
which way too many people consider too burdensome to use.

> Badly written script will still be exploitable.

Yeah, but we'd kill a major exploit avenue.

> And since older kernels and other Unices are not going away, you would've
> created an inconsistently vulnerable set of scripts, on top of the false
> sense of security.

That shouldn't stop us from improving new kernels -- scripts that have
-print0 won't lose it, those that don't will have a vulnerability fixed. 
Same as with any other kind of hardening.  As for other Unices: Theo de
Raadt is not someone to object to a trivial security patch, FreeBSD would
follow, OSX is too hostile to developers for me to care.  Thus, the only
concern is new userland on old kernels.  But distributions don't support
such combinations for long, unlike the other way around.  As for people
writing their own scripts: they already tend to be vulnerable.

I, for example, when writing an ad-hoc pipeline, tend to first make it
display the files that'd be processed; switching that to -print0 and back
would be really tedious, thus I usually remain vulnerable to \n (unless the
script is meant for external use -- but it's too easy to forget).  And how
do you propose to process a list of files with grep or sed if there are
newlines involved?

Basic quotes make it trivial to handle everything but two snowflakes: \n and
initial -; the latter you need to remember about but ./* or -- aren't hard.
This leaves \n.
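
Such a ban would be a very small check.  A minimal sketch (the helper and
its hook placement in the VFS are mine, not from any posted patch):

static bool name_has_newline(const char *name, unsigned int len)
{
	/* reject only '\n'; every other byte stays legal */
	return memchr(name, '\n', len) != NULL;
}

which the create path would call before hashing the name, returning
-EACCES on a hit.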

Thus, would you consider banning just newlines?


Meow!
-- 
⢀⣴⠾⠻⢶⣦⠀ We domesticated dogs 36000 years ago; together we chased
⣾⠁⢰⠒⠀⣿⡁ animals, hung out and licked or scratched our private parts.
⢿⡄⠘⠷⠚⠋⠀ Cats domesticated us 9500 years ago, and immediately we got
⠈⠳⣄ agriculture, towns then cities. -- whitroth on /.


Re: [BUG] fs/dcache: might_sleep is called under a spinlock

2017-10-02 Thread Al Viro
On Tue, Oct 03, 2017 at 10:38:25AM +0800, Jia-Ju Bai wrote:
> According to fs/dcache.c, might_sleep is called under a spinlock,
> and the function call path is:
> d_prune_aliases (acquire the spinlock)
>   dput
> might_sleep
> 
> This bug is found by my static analysis tool and my code review.
> A possible fix is to remove might_sleep in dput.

... or to fix your static analysis tool.  First of all, that call
of dput() really *can* block and if we had inode->i_lock or dentry->d_lock
still held at that point we'd have a real bug.  However, __dentry_kill()
there is called with dentry->d_inode == inode and inode->i_lock held,
so dentry->d_inode is stable until inode->i_lock is dropped.  Said
__dentry_kill() contains
if (dentry->d_inode)
dentry_unlink_inode(dentry);
with inode->i_lock held until that point.  dentry_unlink_inode() starts
with
struct inode *inode = dentry->d_inode;
bool hashed = !d_unhashed(dentry);

if (hashed)
raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
if (hashed)
raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
so
1) inode in there is guaranteed to be equal to the argument of
d_prune_aliases() and
2) both dentry->d_lock and inode->i_lock are dropped before
dentry_unlink_inode() returns.  inode->i_lock is not regained in the
rest of __dentry_kill(); dentry->d_lock is regained and dropped before
__dentry_kill() returns.  IOW, we are fine - dput() in d_prune_aliases()
is called without any spinlocks held.

That, BTW, is the reason for
goto restart;
in there, instead of just continuing the loop - if we get to that point,
the list of aliases might have changed.

Removing might_sleep() in dput() would've been wrong - it really might
sleep when called from that point.  Here's how: we used to have two
links to the same file - foo/bar and baz/barf.  baz/barf used to be
opened, then rm -rf baz happened and later we'd called d_prune_aliases()
on the inode of foo/bar.  And as the loop had been executed on one CPU,
on another the opened file got closed, dropping the last reference to
dentry that used to be baz/barf.  Note that its parent (the thing that
used to be dentry of baz) is unhashed and the only contributor to its
refcount is our dentry, so dput(parent) *does* drop the last remaining
reference, triggering the final iput() on inode of baz, along with
freeing on-disk inode, doing disk IO, etc.

Again, it's not that we can't block in that dput() - it's that __dentry_kill()
drops all spinlocks.
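
To summarise the ordering described above (a sketch of the flow, not the
actual code):

	/*
	 * d_prune_aliases(inode):
	 * restart:
	 *	spin_lock(&inode->i_lock);
	 *	... pick an alias, grab dentry->d_lock ...
	 *	__dentry_kill(dentry)
	 *	  -> dentry_unlink_inode()   drops d_lock, then i_lock
	 *	dput(parent);                <- no spinlocks held here
	 *	goto restart;                the alias list may have changed
	 */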


[RFC PATCH v2 07/31] KVM: arm64: Setup vttbr_el2 on each VM entry

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

Now that the vttbr value will be different depending on the VM's
exception level, we set it on each VM entry.

We only have one mmu instance at this point, but there will be
multiple of them if we come to run nested VMs.

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---
 arch/arm64/kvm/context.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/arm64/kvm/context.c b/arch/arm64/kvm/context.c
index a7811e1..afd1702 100644
--- a/arch/arm64/kvm/context.c
+++ b/arch/arm64/kvm/context.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct el1_el2_map {
enum vcpu_sysreg el1;
@@ -174,6 +175,15 @@ static void flush_shadow_el1_sysregs(struct kvm_vcpu *vcpu)
flush_shadow_el1_sysregs_nvhe(vcpu);
 }
 
+static void setup_s2_mmu(struct kvm_vcpu *vcpu)
+{
+   struct kvm_s2_mmu *mmu = &vcpu->kvm->arch.mmu;
+   struct kvm_s2_vmid *vmid = vcpu_get_active_vmid(vcpu);
+
+   vcpu->arch.hw_vttbr = kvm_get_vttbr(vmid, mmu);
+   vcpu->arch.hw_mmu = mmu;
+}
+
 /*
  * List of EL0 and EL1 registers which we allow the virtual EL2 mode to access
  * directly without trapping. This is possible because the impact of
@@ -323,6 +333,8 @@ void kvm_arm_setup_shadow_state(struct kvm_vcpu *vcpu)
setup_mpidr_el1(vcpu);
ctxt->hw_sys_regs = ctxt->sys_regs;
}
+
+   setup_s2_mmu(vcpu);
 }
 
 /**
-- 
1.9.1
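
For reference, kvm_get_vttbr() composes the value roughly along these
lines (a sketch; VTTBR_VMID_SHIFT is 48 per the define later in this
series, and baddr is the physical address of the stage-2 pgd):

	/* sketch: VTTBR = stage-2 pgd base address | (VMID << 48) */
	baddr = virt_to_phys(mmu->pgd);
	vttbr = baddr | ((u64)vmid->vmid << VTTBR_VMID_SHIFT);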



[RFC PATCH v2 08/31] KVM: arm/arm64: Make mmu functions non-static

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

Make the mmu functions non-static so that we can reuse them to support
the mmu for nested VMs.

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---
 arch/arm64/include/asm/kvm_mmu.h |  9 
 virt/kvm/arm/mmu.c   | 94 +++-
 2 files changed, 64 insertions(+), 39 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 21c0299..bceaec1 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -145,9 +145,18 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
 
 void stage2_unmap_vm(struct kvm *kvm);
 int kvm_alloc_stage2_pgd(struct kvm *kvm);
+int __kvm_alloc_stage2_pgd(struct kvm_s2_mmu *mmu);
 void kvm_free_stage2_pgd(struct kvm *kvm);
+void __kvm_free_stage2_pgd(struct kvm *kvm, struct kvm_s2_mmu *mmu);
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
  phys_addr_t pa, unsigned long size, bool writable);
+void kvm_unmap_stage2_range(struct kvm *kvm, struct kvm_s2_mmu *mmu,
+   phys_addr_t start, u64 size);
+void kvm_stage2_wp_range(struct kvm *kvm, struct kvm_s2_mmu *mmu,
+phys_addr_t addr, phys_addr_t end);
+void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu,
+   phys_addr_t start, phys_addr_t end);
+
 
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 184cdc9..ca10799 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -307,7 +307,7 @@ static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
 }
 
 /**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * kvm_unmap_stage2_range -- Clear stage2 page table entries to unmap a range
  * @kvm:   The VM pointer
  * @start: The intermediate physical base address of the range to unmap
  * @size:  The size of the area to unmap
@@ -317,12 +317,12 @@ static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
  * destroying the VM), otherwise another faulting VCPU may come in and mess
  * with things behind our backs.
  */
-static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
+void kvm_unmap_stage2_range(struct kvm *kvm, struct kvm_s2_mmu *mmu,
+   phys_addr_t start, u64 size)
 {
pgd_t *pgd;
phys_addr_t addr = start, end = start + size;
phys_addr_t next;
-   struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
 
assert_spin_locked(&kvm->mmu_lock);
pgd = mmu->pgd + stage2_pgd_index(addr);
@@ -391,11 +391,10 @@ static void stage2_flush_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
} while (pud++, addr = next, addr != end);
 }
 
-static void stage2_flush_memslot(struct kvm_s2_mmu *mmu,
-struct kvm_memory_slot *memslot)
+void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu,
+   phys_addr_t start, phys_addr_t end)
 {
-   phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
-   phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
+   phys_addr_t addr = start;
phys_addr_t next;
pgd_t *pgd;
 
@@ -406,6 +405,15 @@ static void stage2_flush_memslot(struct kvm_s2_mmu *mmu,
} while (pgd++, addr = next, addr != end);
 }
 
+static void stage2_flush_memslot(struct kvm_s2_mmu *mmu,
+   struct kvm_memory_slot *memslot)
+{
+   phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
+   phys_addr_t end = start + PAGE_SIZE * memslot->npages;
+
+   kvm_stage2_flush_range(mmu, start, end);
+}
+
 /**
  * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
  * @kvm: The struct kvm pointer
@@ -762,21 +770,9 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
 }
 
-/**
- * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
- * @kvm:   The KVM struct pointer for the VM.
- *
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
- *
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_alloc_stage2_pgd(struct kvm *kvm)
+int __kvm_alloc_stage2_pgd(struct kvm_s2_mmu *mmu)
 {
pgd_t *pgd;
-   struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
 
if (mmu->pgd != NULL) {
kvm_err("kvm_arch already initialized?\n");
@@ -793,6 +789,22 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
return 0;
 }
 
+/**
+ * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
+ * @kvm:   The KVM struct pointer for the VM.
+ *
+ * Allocates only the stage-2 HW PGD level table(s) (can support either full
+ * 40-bit input addresses or limited to 32-bit input addresses). Clears the
+ * allocated pages.
+ *
+ * Note we don't need locking here as this is only called when the VM is
+ * created, which can only be done once.
+ */

[RFC PATCH v2 04/31] KVM: arm/arm64: Abstract stage-2 MMU state into a separate structure

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

Abstract stage-2 MMU state into a separate structure and change all
callers referring to page tables, VMIDs, and the VTTBR to use this new
indirection.

This is about to become very handy when using shadow stage-2 page
tables.

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---
 arch/arm/include/asm/kvm_asm.h|   7 +-
 arch/arm/include/asm/kvm_host.h   |  26 +---
 arch/arm/kvm/hyp/switch.c |   5 +-
 arch/arm/kvm/hyp/tlb.c|  18 ++---
 arch/arm64/include/asm/kvm_asm.h  |   7 +-
 arch/arm64/include/asm/kvm_host.h |  10 ++-
 arch/arm64/kvm/hyp/switch.c   |   5 +-
 arch/arm64/kvm/hyp/tlb.c  |  38 +--
 virt/kvm/arm/arm.c|  34 +-
 virt/kvm/arm/mmu.c| 137 +-
 10 files changed, 163 insertions(+), 124 deletions(-)

diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 14d68a4..71b7255 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -57,6 +57,7 @@
 #ifndef __ASSEMBLY__
 struct kvm;
 struct kvm_vcpu;
+struct kvm_s2_mmu;
 
 extern char __kvm_hyp_init[];
 extern char __kvm_hyp_init_end[];
@@ -64,9 +65,9 @@
 extern char __kvm_hyp_vector[];
 
 extern void __kvm_flush_vm_context(void);
-extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
-extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
-extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
+extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa);
+extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
+extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu);
 
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 7e9e6c8..78d826e 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -53,9 +53,21 @@
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 void kvm_reset_coprocs(struct kvm_vcpu *vcpu);
 
-struct kvm_arch {
-   /* VTTBR value associated with below pgd and vmid */
+struct kvm_s2_mmu {
+   /* The VMID generation used for the virt. memory system */
+   u64 vmid_gen;
+   u32 vmid;
+
+   /* Stage-2 page table */
+   pgd_t *pgd;
+
+   /* VTTBR value associated with above pgd and vmid */
u64 vttbr;
+};
+
+struct kvm_arch {
+   /* Stage 2 paging state for the VM */
+   struct kvm_s2_mmu mmu;
 
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
@@ -65,13 +77,6 @@ struct kvm_arch {
 * here.
 */
 
-   /* The VMID generation used for the virt. memory system */
-   u64 vmid_gen;
-   u32 vmid;
-
-   /* Stage-2 page table */
-   pgd_t *pgd;
-
/* Interrupt controller */
struct vgic_dist vgic;
int max_vcpus;
@@ -185,6 +190,9 @@ struct kvm_vcpu_arch {
 
/* Detect first run of a vcpu */
bool has_run_once;
+
+   /* Stage 2 paging state used by the hardware on next switch */
+   struct kvm_s2_mmu *hw_mmu;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index ebd2dd4..4814671 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c
@@ -75,8 +75,9 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
 
 static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu)
 {
-   struct kvm *kvm = kern_hyp_va(vcpu->kvm);
-   write_sysreg(kvm->arch.vttbr, VTTBR);
+   struct kvm_s2_mmu *mmu = kern_hyp_va(vcpu->arch.hw_mmu);
+
+   write_sysreg(mmu->vttbr, VTTBR);
write_sysreg(vcpu->arch.midr, VPIDR);
 }
 
diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c
index 6d810af..56f0a49 100644
--- a/arch/arm/kvm/hyp/tlb.c
+++ b/arch/arm/kvm/hyp/tlb.c
@@ -34,13 +34,13 @@
  * As v7 does not support flushing per IPA, just nuke the whole TLB
  * instead, ignoring the ipa value.
  */
-void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
+void __hyp_text __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
 {
dsb(ishst);
 
/* Switch to requested VMID */
-   kvm = kern_hyp_va(kvm);
-   write_sysreg(kvm->arch.vttbr, VTTBR);
+   mmu = kern_hyp_va(mmu);
+   write_sysreg(mmu->vttbr, VTTBR);
isb();
 
write_sysreg(0, TLBIALLIS);
@@ -50,17 +50,17 @@ void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
write_sysreg(0, VTTBR);
 }
 
-void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
+phys_addr_t ipa)
 {
-   __kvm_tlb_flush_vmid(kvm);
+   __kvm_tlb_flush_vmid(mmu);
 }
 
-void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu)
+void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
 {
-   struct kvm *kvm = kern_hyp_va(kern_

[RFC PATCH v2 06/31] KVM: arm64: Invalidate virtual EL2 TLB entries when needed

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

Sometimes when we are invalidating the TLB for a certain S2 MMU
context, this context can also have an EL2 context associated with it,
and we have to invalidate that too.

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---
 virt/kvm/arm/arm.c |  5 +
 virt/kvm/arm/mmu.c | 23 ++-
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 41e0654..63dd897 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -362,6 +362,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
u64 vttbr = kvm_get_vttbr(&mmu->vmid, mmu);
 
kvm_call_hyp(__kvm_tlb_flush_local_vmid, vttbr);
+
+   if (mmu->el2_vmid.vmid) {
+   vttbr = kvm_get_vttbr(&mmu->el2_vmid, mmu);
+   kvm_call_hyp(__kvm_tlb_flush_local_vmid, vttbr);
+   }
*last_ran = vcpu->vcpu_id;
}
 
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 0edcf23..184cdc9 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -64,7 +64,21 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
u64 vttbr = kvm_get_vttbr(&mmu->vmid, mmu);
 
-   kvm_call_hyp(__kvm_tlb_flush_vmid, vttbr);
+   if (!mmu->el2_vmid.vmid) {
+   /*
+* For a normal (i.e. non-nested) guest, flush entries for the
+* given VMID.
+*/
+   kvm_call_hyp(__kvm_tlb_flush_vmid, vttbr);
+   } else {
+   /*
+* When supporting nested virtualization, we can have multiple
+* VMIDs in play for each VCPU in the VM, so it's really not
+* worth it to try to quiesce the system and flush all the
+* VMIDs that may be in use, instead just nuke the whole thing.
+*/
+   kvm_call_hyp(__kvm_flush_vm_context);
+   }
 }
 
 static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa)
@@ -72,6 +86,13 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa)
u64 vttbr = kvm_get_vttbr(&mmu->vmid, mmu);
 
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, vttbr, ipa);
+
+   if (!mmu->el2_vmid.vmid) {
+   /* Nothing to do more for a non-nested guest */
+   return;
+   }
+   vttbr = kvm_get_vttbr(&mmu->el2_vmid, mmu);
+   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, vttbr, ipa);
 }
 
 /*
-- 
1.9.1



[RFC PATCH v2 12/31] KVM: arm/arm64: Handle shadow stage 2 page faults

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

If we are faulting on a shadow stage 2 translation, we first walk the
guest hypervisor's stage 2 page table to see if it has a mapping. If
not, we inject a stage 2 page fault to the virtual EL2. Otherwise, we
create a mapping in the shadow stage 2 page table.

Note that we have to deal with two IPAs when we get a shadow stage 2
page fault. One is the address we faulted on, which is in the L2 guest
phys space. The other comes from the guest stage-2 page table walk, and
is in the L1 guest phys space.  To differentiate them, we rename the
variables so that fault_ipa is used for the former and ipa is used for
the latter.

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---

Notes:
v1-->v2:
- Added a common function to inject s2 faults.
- Align L1 IPA as well as L2 IPA in transparent_hugepage_adjust(). This will
  come in handy when creating an rmap entry with both IPAs.

 arch/arm/include/asm/kvm_emulate.h   |  7 
 arch/arm/include/asm/kvm_mmu.h   |  4 ++
 arch/arm64/include/asm/kvm_emulate.h |  5 +++
 arch/arm64/include/asm/kvm_mmu.h |  1 +
 arch/arm64/kvm/mmu-nested.c  |  8 
 virt/kvm/arm/mmio.c  | 12 +++---
 virt/kvm/arm/mmu.c   | 75 +---
 7 files changed, 92 insertions(+), 20 deletions(-)

diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 24a3fbf..8136464 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -297,4 +297,11 @@ static inline struct kvm_s2_vmid *vcpu_get_active_vmid(struct kvm_vcpu *vcpu)
 {
return &vcpu->kvm->arch.mmu.vmid;
 }
+
+/* arm architecture doesn't support the nesting */
+static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
+{
+   return false;
+}
+
 #endif /* __ARM_KVM_EMULATE_H__ */
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 5fab21a..6a22846 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -242,6 +242,10 @@ static inline void kvm_nested_s2_free(struct kvm *kvm) { }
 static inline void kvm_nested_s2_wp(struct kvm *kvm) { }
 static inline void kvm_nested_s2_clear(struct kvm *kvm) { }
 static inline void kvm_nested_s2_flush(struct kvm *kvm) { }
+static inline int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+   return 0;
+}
 
 static inline u64 kvm_get_vttbr(struct kvm_s2_vmid *vmid,
struct kvm_s2_mmu *mmu)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index f476576..c66554b 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -390,4 +390,9 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
return data; /* Leave LE untouched */
 }
 
+static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
+{
+   return vcpu_nested_stage2_enabled(vcpu) && !is_hyp_ctxt(vcpu);
+}
+
 #endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index c4efcd5..425e4a2 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -342,6 +342,7 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t 
gipa,
 void kvm_nested_s2_wp(struct kvm *kvm);
 void kvm_nested_s2_clear(struct kvm *kvm);
 void kvm_nested_s2_flush(struct kvm *kvm);
+int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
 
 static inline u64 kvm_get_vttbr(struct kvm_s2_vmid *vmid,
struct kvm_s2_mmu *mmu)
diff --git a/arch/arm64/kvm/mmu-nested.c b/arch/arm64/kvm/mmu-nested.c
index fb694b7..75570cc 100644
--- a/arch/arm64/kvm/mmu-nested.c
+++ b/arch/arm64/kvm/mmu-nested.c
@@ -60,6 +60,14 @@ static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
return esr;
 }
 
+int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+   vcpu->arch.ctxt.sys_regs[FAR_EL2] = vcpu->arch.fault.far_el2;
+   vcpu->arch.ctxt.sys_regs[HPFAR_EL2] = vcpu->arch.fault.hpfar_el2;
+
+   return kvm_inject_nested_sync(vcpu, esr_el2);
+}
+
 static int check_base_s2_limits(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
int level, int input_size, int stride)
 {
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
index b6e715f..a1009c2 100644
--- a/virt/kvm/arm/mmio.c
+++ b/virt/kvm/arm/mmio.c
@@ -153,7 +153,7 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
 }
 
 int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
-phys_addr_t fault_ipa)
+phys_addr_t ipa)
 {
unsigned long data;
unsigned long rt;
@@ -182,22 +182,22 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
   
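
The shape of the fault handling described above is roughly the following
(a sketch; the identifiers come from this series, but the exact error
plumbing is elided):

	if (kvm_is_shadow_s2_fault(vcpu)) {
		/* walk the guest hypervisor's stage 2: L2 IPA -> L1 IPA */
		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
		if (ret) {
			/* no valid mapping: reflect the fault to virtual EL2 */
			kvm_inject_s2_fault(vcpu, nested_trans.esr);
			goto out_unlock;
		}
		/* otherwise back fault_ipa with the L1 IPA in the shadow S2 */
		ipa = nested_trans.output;
	}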

[RFC PATCH v2 13/31] KVM: arm/arm64: Move kvm_is_write_fault to header file

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

Move this little function to the header files for arm/arm64 so other
code can make use of it directly.

Signed-off-by: Christoffer Dall 
---
 arch/arm/include/asm/kvm_emulate.h   | 8 
 arch/arm64/include/asm/kvm_emulate.h | 8 
 virt/kvm/arm/mmu.c   | 8 
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 8136464..9b745d9 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -223,6 +223,14 @@ static inline u8 kvm_vcpu_trap_get_fault_type(struct kvm_vcpu *vcpu)
return kvm_vcpu_get_hsr(vcpu) & HSR_FSC_TYPE;
 }
 
+static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
+{
+   if (kvm_vcpu_trap_is_iabt(vcpu))
+   return false;
+
+   return kvm_vcpu_dabt_iswrite(vcpu);
+}
+
 static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu)
 {
return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK;
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index c66554b..4c47bc7 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -307,6 +307,14 @@ static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
return (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
 }
 
+static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
+{
+   if (kvm_vcpu_trap_is_iabt(vcpu))
+   return false;
+
+   return kvm_vcpu_dabt_iswrite(vcpu);
+}
+
 static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
 {
return vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 25d3d73..74941ad 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1141,14 +1141,6 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap,
return false;
 }
 
-static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
-{
-   if (kvm_vcpu_trap_is_iabt(vcpu))
-   return false;
-
-   return kvm_vcpu_dabt_iswrite(vcpu);
-}
-
 /**
  * stage2_wp_ptes - write protect PMD range
  * @pmd:   pointer to pmd entry
-- 
1.9.1



[RFC PATCH v2 11/31] KVM: arm64: Implement nested Stage-2 page table walk logic

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

Based on the pseudo-code in the ARM ARM, implement a stage 2 software
page table walker.

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---

Notes:
v1-->v2:
- Handled different endianness between the host and the guest hypervisor
- Decoupled the stage-2 PTW from injecting exceptions. This will come in
  handy when we just want to walk the page table.
- Added esr and upper_attr fields in kvm_s2_trans struct
- Reworked pa_max() to have KVM_PHYS_SHIFT
- Updated comment about the continuous bits

 arch/arm/include/asm/kvm_mmu.h   |  16 +++
 arch/arm64/include/asm/esr.h |   1 +
 arch/arm64/include/asm/kvm_arm.h |   3 +
 arch/arm64/include/asm/kvm_mmu.h |  12 ++
 arch/arm64/kvm/mmu-nested.c  | 241 +++
 5 files changed, 273 insertions(+)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index d3eafc5..5fab21a 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -221,6 +221,22 @@ static inline unsigned int kvm_get_vmid_bits(void)
return 8;
 }
 
+struct kvm_s2_trans {
+   phys_addr_t output;
+   phys_addr_t block_size;
+   bool writable;
+   bool readable;
+   int level;
+   u32 esr;
+   u64 upper_attr;
+};
+
+static inline int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
+struct kvm_s2_trans *result)
+{
+   return 0;
+}
+
 static inline void kvm_nested_s2_unmap(struct kvm_vcpu *vcpu) { }
 static inline void kvm_nested_s2_free(struct kvm *kvm) { }
 static inline void kvm_nested_s2_wp(struct kvm *kvm) { }
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 210fde6..bc6610b 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -108,6 +108,7 @@
 #define ESR_ELx_CM (UL(1) << 8)
 
 /* ISS field definitions for exceptions taken in to Hyp */
+#define ESR_ELx_FSC_ADDRSZ (0x00)
 #define ESR_ELx_CV (UL(1) << 24)
 #define ESR_ELx_COND_SHIFT (20)
 #define ESR_ELx_COND_MASK  (UL(0xF) << ESR_ELx_COND_SHIFT)
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index a1274b7..3993703 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -104,6 +104,7 @@
 #define VTCR_EL2_RES1  (1 << 31)
 #define VTCR_EL2_HD(1 << 22)
 #define VTCR_EL2_HA(1 << 21)
+#define VTCR_EL2_PS_SHIFT  TCR_EL2_PS_SHIFT
 #define VTCR_EL2_PS_MASK   TCR_EL2_PS_MASK
 #define VTCR_EL2_TG0_MASK  TCR_TG0_MASK
 #define VTCR_EL2_TG0_4KTCR_TG0_4K
@@ -177,6 +178,8 @@
 #define VTTBR_VMID_SHIFT  (UL(48))
 #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
 
+#define SCTLR_EE   (UL(1) << 25)
+
 /* Hyp System Trap Register */
 #define HSTR_EL2_T(x)  (1 << x)
 
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 7fc7a83..c4efcd5 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -322,9 +322,21 @@ static inline unsigned int kvm_get_vmid_bits(void)
return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
 }
 
+struct kvm_s2_trans {
+   phys_addr_t output;
+   phys_addr_t block_size;
+   bool writable;
+   bool readable;
+   int level;
+   u32 esr;
+   u64 upper_attr;
+};
+
 struct kvm_nested_s2_mmu *get_nested_mmu(struct kvm_vcpu *vcpu, u64 vttbr);
 struct kvm_s2_mmu *vcpu_get_active_s2_mmu(struct kvm_vcpu *vcpu);
 void update_nested_s2_mmu(struct kvm_vcpu *vcpu);
+int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
+  struct kvm_s2_trans *result);
 void kvm_nested_s2_unmap(struct kvm_vcpu *vcpu);
 void kvm_nested_s2_free(struct kvm *kvm);
 void kvm_nested_s2_wp(struct kvm *kvm);
diff --git a/arch/arm64/kvm/mmu-nested.c b/arch/arm64/kvm/mmu-nested.c
index 3ee20f2..fb694b7 100644
--- a/arch/arm64/kvm/mmu-nested.c
+++ b/arch/arm64/kvm/mmu-nested.c
@@ -22,6 +22,247 @@
 #include 
 #include 
 
+struct s2_walk_info {
+   unsigned int pgshift;
+   unsigned int pgsize;
+   unsigned int ps;
+   unsigned int sl;
+   unsigned int t0sz;
+};
+
+static unsigned int ps_to_output_size(unsigned int ps)
+{
+   switch (ps) {
+   case 0: return 32;
+   case 1: return 36;
+   case 2: return 40;
+   case 3: return 42;
+   case 4: return 44;
+   case 5:
+   default:
+   return 48;
+   }
+}
+
+static unsigned int pa_max(void)
+{
+/* We always emulate a VM with maximum PA size of KVM_PHYS_SIZE. */
+   return KVM_PHYS_SHIFT;
+}
+
+static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
+{
+   u32 esr;
+
+   esr = kvm_vcpu_get_hsr(vcpu) & ~ESR_ELx_FSC;
+   esr |= fsc;
+   esr |= level & 0x3;
+   return esr;
+}
+
+static int 
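
One detail from the notes above deserves a concrete example: the walker
must honour the guest's data endianness when it reads descriptors.  A
sketch of that step (pte_pa, the address of the descriptor, is
illustrative; SCTLR_EE is the define added above, shown here for the EL1
regime):

	u64 desc;
	int ret;

	ret = kvm_read_guest(vcpu->kvm, pte_pa, &desc, sizeof(desc));
	if (ret)
		return ret;
	/* descriptors are stored in the guest's data endianness */
	if (vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_EE)
		desc = be64_to_cpu((__force __be64)desc);
	else
		desc = le64_to_cpu((__force __le64)desc);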

[RFC PATCH v2 14/31] KVM: arm/arm64: Forward the guest hypervisor's stage 2 permission faults

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

We can have discrepancies between the nested stage 2 page table and the
shadow one in a couple of cases.  For example, the guest hypervisor can
mark a page writable but the host hypervisor maps the page read-only in
the shadow page table, if using something like KSM on the host level.
In this case, a write fault is handled directly by the host hypervisor.
But we could also simply have a read-only page mapped read-only in both
tables, in which case the host hypervisor cannot do anything other than
tell the guest hypervisor about the fault.

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---
 arch/arm/include/asm/kvm_mmu.h   |  7 +++
 arch/arm64/include/asm/kvm_mmu.h |  2 ++
 arch/arm64/kvm/mmu-nested.c  | 22 ++
 virt/kvm/arm/mmu.c   |  7 +++
 4 files changed, 38 insertions(+)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 6a22846..1c5b652 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -237,6 +237,13 @@ static inline int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
return 0;
 }
 
+static inline int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
+  phys_addr_t fault_ipa,
+  struct kvm_s2_trans *trans)
+{
+   return 0;
+}
+
 static inline void kvm_nested_s2_unmap(struct kvm_vcpu *vcpu) { }
 static inline void kvm_nested_s2_free(struct kvm *kvm) { }
 static inline void kvm_nested_s2_wp(struct kvm *kvm) { }
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 425e4a2..239bb89 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -337,6 +337,8 @@ struct kvm_s2_trans {
 void update_nested_s2_mmu(struct kvm_vcpu *vcpu);
 int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
   struct kvm_s2_trans *result);
+int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+struct kvm_s2_trans *trans);
 void kvm_nested_s2_unmap(struct kvm_vcpu *vcpu);
 void kvm_nested_s2_free(struct kvm *kvm);
 void kvm_nested_s2_wp(struct kvm *kvm);
diff --git a/arch/arm64/kvm/mmu-nested.c b/arch/arm64/kvm/mmu-nested.c
index 75570cc..a440d7b 100644
--- a/arch/arm64/kvm/mmu-nested.c
+++ b/arch/arm64/kvm/mmu-nested.c
@@ -271,6 +271,28 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
return walk_nested_s2_pgd(vcpu, gipa, &wi, result);
 }
 
+/*
+ * Returns non-zero if permission fault is handled by injecting it to the next
+ * level hypervisor.
+ */
+int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+struct kvm_s2_trans *trans)
+{
+   unsigned long fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
+   bool write_fault = kvm_is_write_fault(vcpu);
+
+   if (fault_status != FSC_PERM)
+   return 0;
+
+   if ((write_fault && !trans->writable) ||
+   (!write_fault && !trans->readable)) {
+   trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
+   return 1;
+   }
+
+   return 0;
+}
+
 /* expects kvm->mmu_lock to be held */
 void kvm_nested_s2_wp(struct kvm *kvm)
 {
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 74941ad..4fb7b3b 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1591,6 +1591,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
if (ret)
goto out_unlock;
 
+   nested_trans.esr = 0;
+   ret = kvm_s2_handle_perm_fault(vcpu, fault_ipa, &nested_trans);
+   if (nested_trans.esr)
+   kvm_inject_s2_fault(vcpu, nested_trans.esr);
+   if (ret)
+   goto out_unlock;
+
ipa = nested_trans.output;
}
 
-- 
1.9.1



[RFC PATCH v2 10/31] KVM: arm/arm64: Unmap/flush shadow stage 2 page tables

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

Unmap/flush shadow stage 2 page tables for the nested VMs as well as the
stage 2 page table for the guest hypervisor.

Note: A bunch of the code in mmu.c relating to MMU notifiers is
currently dealt with in an extremely abrupt way, for example by clearing
out an entire shadow stage-2 table. This will be handled in a more
efficient way using the reverse mapping feature in a later version of
the patch series.

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---

Notes:
v1-->v2:
- Removed an unnecessary iteration for each vcpu in the
  kvm_nested_s2_all_vcpus_*() functions and removed all_vcpus from the
  function names; the list of nested mmus is per VM, not per vcpu.
- Renamed kvm_nested_s2_unmap() to kvm_nested_s2_clear()
- Renamed kvm_nested_s2_teardown() to kvm_nested_s2_free()
- Removed the unused kvm_nested_s2_init() function.

 arch/arm/include/asm/kvm_mmu.h   |  6 ++
 arch/arm64/include/asm/kvm_mmu.h |  5 +
 arch/arm64/kvm/mmu-nested.c  | 40 
 virt/kvm/arm/arm.c   |  6 +-
 virt/kvm/arm/mmu.c   | 17 +
 5 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 86fdc70..d3eafc5 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -221,6 +221,12 @@ static inline unsigned int kvm_get_vmid_bits(void)
return 8;
 }
 
+static inline void kvm_nested_s2_unmap(struct kvm_vcpu *vcpu) { }
+static inline void kvm_nested_s2_free(struct kvm *kvm) { }
+static inline void kvm_nested_s2_wp(struct kvm *kvm) { }
+static inline void kvm_nested_s2_clear(struct kvm *kvm) { }
+static inline void kvm_nested_s2_flush(struct kvm *kvm) { }
+
 static inline u64 kvm_get_vttbr(struct kvm_s2_vmid *vmid,
struct kvm_s2_mmu *mmu)
 {
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 452912f..7fc7a83 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -325,6 +325,11 @@ static inline unsigned int kvm_get_vmid_bits(void)
 struct kvm_nested_s2_mmu *get_nested_mmu(struct kvm_vcpu *vcpu, u64 vttbr);
 struct kvm_s2_mmu *vcpu_get_active_s2_mmu(struct kvm_vcpu *vcpu);
 void update_nested_s2_mmu(struct kvm_vcpu *vcpu);
+void kvm_nested_s2_unmap(struct kvm_vcpu *vcpu);
+void kvm_nested_s2_free(struct kvm *kvm);
+void kvm_nested_s2_wp(struct kvm *kvm);
+void kvm_nested_s2_clear(struct kvm *kvm);
+void kvm_nested_s2_flush(struct kvm *kvm);
 
 static inline u64 kvm_get_vttbr(struct kvm_s2_vmid *vmid,
struct kvm_s2_mmu *mmu)
diff --git a/arch/arm64/kvm/mmu-nested.c b/arch/arm64/kvm/mmu-nested.c
index c436daf..3ee20f2 100644
--- a/arch/arm64/kvm/mmu-nested.c
+++ b/arch/arm64/kvm/mmu-nested.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (C) 2017 - Columbia University and Linaro Ltd.
  * Author: Jintack Lim 
+ * Author: Christoffer Dall 
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -21,6 +22,45 @@
 #include 
 #include 
 
+/* expects kvm->mmu_lock to be held */
+void kvm_nested_s2_wp(struct kvm *kvm)
+{
+   struct kvm_nested_s2_mmu *nested_mmu;
+   struct list_head *nested_mmu_list = &kvm->arch.nested_mmu_list;
+
+   list_for_each_entry_rcu(nested_mmu, nested_mmu_list, list)
+   kvm_stage2_wp_range(kvm, &nested_mmu->mmu, 0, KVM_PHYS_SIZE);
+}
+
+/* expects kvm->mmu_lock to be held */
+void kvm_nested_s2_clear(struct kvm *kvm)
+{
+   struct kvm_nested_s2_mmu *nested_mmu;
+   struct list_head *nested_mmu_list = &kvm->arch.nested_mmu_list;
+
+   list_for_each_entry_rcu(nested_mmu, nested_mmu_list, list)
+   kvm_unmap_stage2_range(kvm, &nested_mmu->mmu, 0, KVM_PHYS_SIZE);
+}
+
+/* expects kvm->mmu_lock to be held */
+void kvm_nested_s2_flush(struct kvm *kvm)
+{
+   struct kvm_nested_s2_mmu *nested_mmu;
+   struct list_head *nested_mmu_list = &kvm->arch.nested_mmu_list;
+
+   list_for_each_entry_rcu(nested_mmu, nested_mmu_list, list)
+   kvm_stage2_flush_range(&nested_mmu->mmu, 0, KVM_PHYS_SIZE);
+}
+
+void kvm_nested_s2_free(struct kvm *kvm)
+{
+   struct kvm_nested_s2_mmu *nested_mmu;
+   struct list_head *nested_mmu_list = &kvm->arch.nested_mmu_list;
+
+   list_for_each_entry_rcu(nested_mmu, nested_mmu_list, list)
+   __kvm_free_stage2_pgd(kvm, &nested_mmu->mmu);
+}
+
 static struct kvm_nested_s2_mmu *lookup_nested_mmu(struct kvm_vcpu *vcpu,
   u64 vttbr)
 {
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 4548d77..08706f8 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -187,6 +187,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
free_percpu(kvm->arch.last_vcpu_ran);
kvm->arch.last_vcpu_ran = NULL;

[RFC PATCH v2 15/31] KVM: arm64: Move system register helper functions around

2017-10-02 Thread Jintack Lim
From: Jintack Lim 

We are about to add a framework to handle system instruction traps. To
reuse existing helper functions, let's move them around.

No functional change.

Signed-off-by: Jintack Lim 
---
 arch/arm64/kvm/sys_regs.c | 89 ---
 1 file changed, 45 insertions(+), 44 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 395b964..541bb97 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1621,6 +1621,51 @@ static bool access_id_aa64mmfr0_el1(struct kvm_vcpu *v,
{ SYS_DESC(SYS_SP_EL2), NULL, reset_special, SP_EL2, 0},
 };
 
+#define reg_to_match_value(x)  \
+   ({  \
+   unsigned long val;  \
+   val  = (x)->Op0 << 14;  \
+   val |= (x)->Op1 << 11;  \
+   val |= (x)->CRn << 7;   \
+   val |= (x)->CRm << 3;   \
+   val |= (x)->Op2;\
+   val;\
+})
+
+static int match_sys_reg(const void *key, const void *elt)
+{
+   const unsigned long pval = (unsigned long)key;
+   const struct sys_reg_desc *r = elt;
+
+   return pval - reg_to_match_value(r);
+}
+
+static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params,
+const struct sys_reg_desc table[],
+unsigned int num)
+{
+   unsigned long pval = reg_to_match_value(params);
+
+   return bsearch((void *)pval, table, num, sizeof(table[0]),
+  match_sys_reg);
+}
+
+static void perform_access(struct kvm_vcpu *vcpu,
+  struct sys_reg_params *params,
+  const struct sys_reg_desc *r)
+{
+   /*
+* Not having an accessor means that we have configured a trap
+* that we don't know how to handle. This certainly qualifies
+* as a gross bug that should be fixed right away.
+*/
+   BUG_ON(!r->access);
+
+   /* Skip instruction if instructed so */
+   if (likely(r->access(vcpu, params, r)))
+   kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+}
+
 static bool trap_dbgidr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -1968,56 +2013,12 @@ static const struct sys_reg_desc *get_target_table(unsigned target,
}
 }
 
-#define reg_to_match_value(x)  \
-   ({  \
-   unsigned long val;  \
-   val  = (x)->Op0 << 14;  \
-   val |= (x)->Op1 << 11;  \
-   val |= (x)->CRn << 7;   \
-   val |= (x)->CRm << 3;   \
-   val |= (x)->Op2;\
-   val;\
-})
-
-static int match_sys_reg(const void *key, const void *elt)
-{
-   const unsigned long pval = (unsigned long)key;
-   const struct sys_reg_desc *r = elt;
-
-   return pval - reg_to_match_value(r);
-}
-
-static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params,
-const struct sys_reg_desc table[],
-unsigned int num)
-{
-   unsigned long pval = reg_to_match_value(params);
-
-   return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg);
-}
-
 int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
kvm_inject_undefined(vcpu);
return 1;
 }
 
-static void perform_access(struct kvm_vcpu *vcpu,
-  struct sys_reg_params *params,
-  const struct sys_reg_desc *r)
-{
-   /*
-* Not having an accessor means that we have configured a trap
-* that we don't know how to handle. This certainly qualifies
-* as a gross bug that should be fixed right away.
-*/
-   BUG_ON(!r->access);
-
-   /* Skip instruction if instructed so */
-   if (likely(r->access(vcpu, params, r)))
-   kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-}
-
 /*
  * emulate_cp --  tries to match a sys_reg access in a handling table, and
  *call the corresponding trap handler.
-- 
1.9.1



[RFC PATCH v2 16/31] KVM: arm64: Introduce sys_reg_desc.forward_trap

2017-10-02 Thread Jintack Lim
This introduces a function prototype to determine if we need to forward
system instruction traps to the virtual EL2. The implementation of
forward_trap functions for each system instruction will be added in
later patches.

Signed-off-by: Jintack Lim 
---
 arch/arm64/kvm/sys_regs.c | 8 
 arch/arm64/kvm/sys_regs.h | 6 ++
 2 files changed, 14 insertions(+)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 541bb97..88ce172 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1661,6 +1661,14 @@ static void perform_access(struct kvm_vcpu *vcpu,
 */
BUG_ON(!r->access);
 
+   /*
+* Forward this trap to the virtual EL2 if the guest hypervisor has
+* configured to trap the current instruction.
+*/
+   if (nested_virt_in_use(vcpu) && r->forward_trap
+   && unlikely(r->forward_trap(vcpu)))
+   return;
+
/* Skip instruction if instructed so */
if (likely(r->access(vcpu, params, r)))
kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 827717b..6dd4008 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -61,6 +61,12 @@ struct sys_reg_desc {
const struct kvm_one_reg *reg, void __user *uaddr);
int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr);
+
+   /*
+* Forward the trap to the virtual EL2 if the guest hypervisor has
+* configured to trap the current instruction.
+*/
+   bool (*forward_trap)(struct kvm_vcpu *vcpu);
 };
 
 static inline void print_sys_reg_instr(const struct sys_reg_params *p)
-- 
1.9.1
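
To make the hook concrete, a hypothetical forward_trap implementation
could look like the following (HCR_TTLB is used purely as an example trap
control bit; this function is not part of the posted series):

static bool forward_ttlb_trap(struct kvm_vcpu *vcpu)
{
	/* traps taken from virtual EL2 itself are handled by the host */
	if (is_hyp_ctxt(vcpu))
		return false;

	/* forward only if the guest hypervisor asked for TLB traps */
	if (!(vcpu_sys_reg(vcpu, HCR_EL2) & HCR_TTLB))
		return false;

	kvm_inject_nested_sync(vcpu, kvm_vcpu_get_hsr(vcpu));
	return true;
}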



[RFC PATCH v2 21/31] KVM: arm64: Emulate AT S1E[01] instructions

2017-10-02 Thread Jintack Lim
Emulate AT S1E[01] instructions by issuing the same instructions in EL2.
We set the physical EL1 registers and the NV and NV1 bits as described in
the AT instruction emulation overview.

Signed-off-by: Jintack Lim 
---
 arch/arm64/include/asm/kvm_emulate.h | 11 +++
 arch/arm64/kvm/sys_regs.c| 32 ++--
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 4c47bc7..a494db2 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -185,6 +185,17 @@ static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu)
return (vcpu_sys_reg(vcpu, HCR_EL2) & HCR_TGE);
 }
 
+
+/*
+ * When the NV and NV1 bits are set, the EL2 page table format is used for the
+ * EL1 translation regime.
+ */
+static inline bool vcpu_el2_format_used(const struct kvm_vcpu *vcpu)
+{
+   return ((vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) &&
+   (vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV1));
+}
+
 static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu)
 {
/*
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index d8728cc..a82274f 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1621,6 +1621,26 @@ static bool access_id_aa64mmfr0_el1(struct kvm_vcpu *v,
{ SYS_DESC(SYS_SP_EL2), NULL, reset_special, SP_EL2, 0},
 };
 
+static bool handle_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+const struct sys_reg_desc *r)
+{
+   struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
+   bool el2_format;
+   int sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+   /* See '2. EL0/EL1 AT instructions: S1E[01]x, S12E1x' table. */
+   if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))
+   ctxt->hw_sys_regs = ctxt->shadow_sys_regs;
+   else
+   ctxt->hw_sys_regs = ctxt->sys_regs;
+
+   el2_format = vcpu_el2_format_used(vcpu);
+
+   kvm_call_hyp(__kvm_at_insn, vcpu, p->regval, el2_format, sys_encoding);
+
+   return true;
+}
+
 /*
  * AT instruction emulation
  *
@@ -1690,12 +1710,12 @@ static bool access_id_aa64mmfr0_el1(struct kvm_vcpu *v,
 #define SYS_INSN_TO_DESC(insn, access_fn, forward_fn)  \
{ SYS_DESC((insn)), (access_fn), NULL, 0, 0, NULL, NULL, (forward_fn) }
 static struct sys_reg_desc sys_insn_descs[] = {
-   SYS_INSN_TO_DESC(AT_S1E1R, NULL, NULL),
-   SYS_INSN_TO_DESC(AT_S1E1W, NULL, NULL),
-   SYS_INSN_TO_DESC(AT_S1E0R, NULL, NULL),
-   SYS_INSN_TO_DESC(AT_S1E0W, NULL, NULL),
-   SYS_INSN_TO_DESC(AT_S1E1RP, NULL, NULL),
-   SYS_INSN_TO_DESC(AT_S1E1WP, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S1E1R, handle_s1e01, NULL),
+   SYS_INSN_TO_DESC(AT_S1E1W, handle_s1e01, NULL),
+   SYS_INSN_TO_DESC(AT_S1E0R, handle_s1e01, NULL),
+   SYS_INSN_TO_DESC(AT_S1E0W, handle_s1e01, NULL),
+   SYS_INSN_TO_DESC(AT_S1E1RP, handle_s1e01, NULL),
+   SYS_INSN_TO_DESC(AT_S1E1WP, handle_s1e01, NULL),
SYS_INSN_TO_DESC(AT_S1E2R, NULL, NULL),
SYS_INSN_TO_DESC(AT_S1E2W, NULL, NULL),
SYS_INSN_TO_DESC(AT_S12E1R, NULL, NULL),
-- 
1.9.1



[RFC PATCH v2 26/31] KVM: arm64: Emulate TLBI ALLE1(IS)

2017-10-02 Thread Jintack Lim
The TLBI ALLE1(IS) instruction invalidates all EL1&0 regime stage 1 and 2
TLB entries (on all PEs in the same Inner Shareable domain). To emulate
these instructions, we first need to clear all the mappings in the
shadow page tables, since executing those instructions implies a change
of mappings in the stage 2 page tables maintained by the guest
hypervisor.  We then need to invalidate all EL1&0 regime stage 1 and 2
TLB entries of all VMIDs, which are assigned by the host hypervisor, for
this VM.

Signed-off-by: Jintack Lim 
---
 arch/arm64/kvm/sys_regs.c | 29 +++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index a1ae8fb..5a82de9 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1795,6 +1795,31 @@ static bool handle_vae2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return true;
 }
 
+static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+  const struct sys_reg_desc *r)
+{
+   struct kvm_s2_mmu *mmu = &vcpu->kvm->arch.mmu;
+   u64 vttbr = kvm_get_vttbr(&mmu->vmid, mmu);
+
+   if (vcpu->kvm->arch.mmu.vmid.vmid_gen) {
+   /*
+* Invalidate the stage 1 and 2 TLB entries for the host OS
+* in a VM only if there is one.
+*/
+   kvm_call_hyp(__kvm_tlb_flush_vmid, vttbr);
+   }
+
+   spin_lock(&vcpu->kvm->mmu_lock);
+   /*
+* Clear all mappings in the shadow page tables and invalidate the stage
+* 1 and 2 TLB entries via kvm_tlb_flush_vmid_ipa().
+*/
+   kvm_nested_s2_clear(vcpu->kvm);
+   spin_unlock(&vcpu->kvm->mmu_lock);
+
+   return true;
+}
+
 /*
  * AT instruction emulation
  *
@@ -1880,14 +1905,14 @@ static bool handle_vae2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
SYS_INSN_TO_DESC(TLBI_IPAS2LE1IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE2IS, handle_alle2is, NULL),
SYS_INSN_TO_DESC(TLBI_VAE2IS, handle_vae2, NULL),
-   SYS_INSN_TO_DESC(TLBI_ALLE1IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_ALLE1IS, handle_alle1is, NULL),
SYS_INSN_TO_DESC(TLBI_VALE2IS, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_VMALLS12E1IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2E1, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2LE1, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE2, handle_alle2, NULL),
SYS_INSN_TO_DESC(TLBI_VAE2, handle_vae2, NULL),
-   SYS_INSN_TO_DESC(TLBI_ALLE1, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_ALLE1, handle_alle1is, NULL),
SYS_INSN_TO_DESC(TLBI_VALE2, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_VMALLS12E1, NULL, NULL),
 };
-- 
1.9.1



[RFC PATCH v2 20/31] KVM: arm64: Implement AT instruction handling

2017-10-02 Thread Jintack Lim
Implement AT instruction handling logic in EL2. This will be used to
emulate AT instructions executed in the virtual EL2.

AT instruction emulation works by loading the proper processor context,
which depends on the trapped instruction and the virtual HCR_EL2, into the
EL1 virtual memory control registers and then executing the AT instruction.
Note that ctxt->hw_sys_regs is expected to hold the proper processor
context before calling the handling function (__kvm_at_insn) implemented
in this patch.

Signed-off-by: Jintack Lim 
---
 arch/arm64/include/asm/kvm_asm.h |   3 +
 arch/arm64/kvm/hyp/Makefile  |   1 +
 arch/arm64/kvm/hyp/at.c  | 131 +++
 3 files changed, 135 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/at.c

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index e492749..4bded9d 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -56,6 +56,9 @@
 extern void __kvm_tlb_flush_vmid(u64 vttbr);
 extern void __kvm_tlb_flush_local_vmid(u64 vttbr);
 
+extern void __kvm_at_insn(struct kvm_vcpu *vcpu, unsigned long vaddr,
+ bool el2_regime, int sys_encoding);
+
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 14c4e3b..1b03adb 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_KVM_ARM_HOST) += entry.o
 obj-$(CONFIG_KVM_ARM_HOST) += switch.o
 obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
 obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
+obj-$(CONFIG_KVM_ARM_HOST) += at.o
 obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
 obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o
 
diff --git a/arch/arm64/kvm/hyp/at.c b/arch/arm64/kvm/hyp/at.c
new file mode 100644
index 000..d491d94
--- /dev/null
+++ b/arch/arm64/kvm/hyp/at.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2017 - Linaro Ltd
+ * Author: Jintack Lim 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <asm/kvm_hyp.h>
+
+static void __hyp_text __save_vmregs(struct kvm_cpu_context *ctxt)
+{
+   u64 *sys_regs = kern_hyp_va(ctxt->hw_sys_regs);
+
+   sys_regs[TTBR0_EL1] = read_sysreg_el1(ttbr0);
+   sys_regs[TTBR1_EL1] = read_sysreg_el1(ttbr1);
+   sys_regs[TCR_EL1]   = read_sysreg_el1(tcr);
+   sys_regs[SCTLR_EL1] = read_sysreg_el1(sctlr);
+}
+
+static void __hyp_text __restore_vmregs(struct kvm_cpu_context *ctxt)
+{
+   u64 *sys_regs = kern_hyp_va(ctxt->hw_sys_regs);
+
+   write_sysreg_el1(sys_regs[TTBR0_EL1],   ttbr0);
+   write_sysreg_el1(sys_regs[TTBR1_EL1],   ttbr1);
+   write_sysreg_el1(sys_regs[TCR_EL1], tcr);
+   write_sysreg_el1(sys_regs[SCTLR_EL1],   sctlr);
+}
+
+void __hyp_text __at_switch_to_guest_nvhe(struct kvm_vcpu *vcpu,
+ bool el2_regime)
+{
+   struct kvm_cpu_context *host_ctxt;
+   struct kvm_cpu_context *guest_ctxt;
+   u64 val;
+
+   host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+   guest_ctxt = &vcpu->arch.ctxt;
+
+   __save_vmregs(host_ctxt);
+   __restore_vmregs(guest_ctxt);
+
+   val = read_sysreg(hcr_el2);
+   if (el2_regime)
+   val |= (HCR_NV | HCR_NV1);
+   write_sysreg(val, hcr_el2);
+}
+
void __hyp_text __at_switch_to_guest_vhe(struct kvm_vcpu *vcpu, bool el2_regime)
+{
+   struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
+   u64 val;
+
+   __restore_vmregs(guest_ctxt);
+
+   val = read_sysreg(hcr_el2);
+   val &= ~HCR_TGE;
+   if (el2_regime)
+   val |= (HCR_NV | HCR_NV1);
+   write_sysreg(val, hcr_el2);
+}
+
+/*
+ * Switching to guest.
+ *
+ * 1. [nvhe] Save host vm regs
+ * 2. [both] Restore guest vm regs
+ * 3. [both] Set HCR_EL2.NV/NV1 bit if necessary
+ * 4. [vhe]  Clear HCR_EL2.TGE
+ */
+static hyp_alternate_select(__at_switch_to_guest,
+   __at_switch_to_guest_nvhe, __at_switch_to_guest_vhe,
+   ARM64_HAS_VIRT_HOST_EXTN);
+
+void __hyp_text __kvm_at_insn(struct kvm_vcpu *vcpu, unsigned long vaddr,
+ bool el2_regime, int sys_encoding)
+{
+   struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
+   struct kvm_cpu_context *host_ctxt;
+
+   host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+
+   __at_switch_to_guest()(vcpu, el2_regime);

[RFC PATCH v2 25/31] KVM: arm64: Emulate TLBI VAE2* instructions

2017-10-02 Thread Jintack Lim
Emulate TLBI VAE2* instructions executed in the virtual EL2. Based on
the same principle as the TLBI ALLE2 instruction, we can simply emulate
these instructions by executing the corresponding TLBI VAE1*
instructions with the virtual EL2's VMID assigned by the host
hypervisor.

Signed-off-by: Jintack Lim 
---
 arch/arm64/include/asm/kvm_asm.h |  1 +
 arch/arm64/kvm/hyp/tlb.c | 28 
 arch/arm64/kvm/sys_regs.c| 25 +
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 4bded9d..cd7fb85 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -55,6 +55,7 @@
 extern void __kvm_tlb_flush_vmid_ipa(u64 vttbr, phys_addr_t ipa);
 extern void __kvm_tlb_flush_vmid(u64 vttbr);
 extern void __kvm_tlb_flush_local_vmid(u64 vttbr);
+extern void __kvm_tlb_vae2(u64 vttbr, u64 va, u64 sys_encoding);
 
 extern void __kvm_at_insn(struct kvm_vcpu *vcpu, unsigned long vaddr,
  bool el2_regime, int sys_encoding);
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index 680b960..bd8b92c 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -151,3 +151,31 @@ void __hyp_text __kvm_flush_vm_context(void)
asm volatile("ic ialluis" : : );
dsb(ish);
 }
+
+void __hyp_text __kvm_tlb_vae2(u64 vttbr, u64 va, u64 sys_encoding)
+{
+   /* Switch to requested VMID */
+   __tlb_switch_to_guest()(vttbr);
+
+   /* Execute the EL1 version of TLBI VAE2* instruction */
+   switch (sys_encoding) {
+   case TLBI_VAE2IS:
+   __tlbi(vae1is, va);
+   break;
+   case TLBI_VALE2IS:
+   __tlbi(vale1is, va);
+   break;
+   case TLBI_VAE2:
+   __tlbi(vae1, va);
+   break;
+   case TLBI_VALE2:
+   __tlbi(vale1, va);
+   break;
+   default:
+   break;
+   }
+   dsb(nsh);
+   isb();
+
+   __tlb_switch_to_host()();
+}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 90329b7..a1ae8fb 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1778,6 +1778,23 @@ static bool handle_alle2is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return true;
 }
 
+static bool handle_vae2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+  const struct sys_reg_desc *r)
+{
+   struct kvm_s2_mmu *mmu = &vcpu->kvm->arch.mmu;
+   u64 vttbr = kvm_get_vttbr(&mmu->el2_vmid, mmu);
+   int sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+   /*
+* Based on the same principle as TLBI ALLE2 instruction emulation, we
+* emulate TLBI VAE2* instructions by executing corresponding TLBI VAE1*
+* instructions with the virtual EL2's VMID assigned by the host
+* hypervisor.
+*/
+   kvm_call_hyp(__kvm_tlb_vae2, vttbr, p->regval, sys_encoding);
+   return true;
+}
+
 /*
  * AT instruction emulation
  *
@@ -1862,16 +1879,16 @@ static bool handle_alle2is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
SYS_INSN_TO_DESC(TLBI_IPAS2E1IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2LE1IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE2IS, handle_alle2is, NULL),
-   SYS_INSN_TO_DESC(TLBI_VAE2IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VAE2IS, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE1IS, NULL, NULL),
-   SYS_INSN_TO_DESC(TLBI_VALE2IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VALE2IS, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_VMALLS12E1IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2E1, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2LE1, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE2, handle_alle2, NULL),
-   SYS_INSN_TO_DESC(TLBI_VAE2, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VAE2, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE1, NULL, NULL),
-   SYS_INSN_TO_DESC(TLBI_VALE2, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VALE2, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_VMALLS12E1, NULL, NULL),
 };
 
-- 
1.9.1



[RFC PATCH v2 24/31] KVM: arm64: Emulate TLBI ALLE2(IS) instruction

2017-10-02 Thread Jintack Lim
Emulate the TLBI ALLE2(IS) instruction executed in the virtual EL2.
Since we emulate the virtual EL2 in EL1, we invalidate EL1&0 regime
stage 1 TLB entries with vttbr_el2 set to the VMID of the virtual EL2.

Note that we could emulate TLBI ALLE2IS precisely by invalidating only
stage 1 TLB entries via the TLBI VMALLE1IS instruction, but to keep it
simple we reuse the existing function, __kvm_tlb_flush_vmid(), which
invalidates both stage 1 and stage 2 TLB entries.
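
For reference, the precise variant would need a hyp helper that issues
only the stage 1 invalidation; a minimal sketch following the pattern of
the existing TLB helpers (__kvm_tlb_vmalle1is is a hypothetical name,
not part of this series):

    void __hyp_text __kvm_tlb_vmalle1is(u64 vttbr)
    {
            /* Switch to the virtual EL2's VMID */
            __tlb_switch_to_guest()(vttbr);

            /* Invalidate only EL1&0 stage 1 entries, Inner Shareable */
            __tlbi(vmalle1is);
            dsb(ish);
            isb();

            __tlb_switch_to_host()();
    }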

Signed-off-by: Jintack Lim 
---
 arch/arm64/kvm/sys_regs.c | 35 +--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 7950ee0..90329b7 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1747,6 +1747,37 @@ static bool handle_s12w(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return handle_s12(vcpu, p, r, true);
 }
 
+static bool handle_alle2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+const struct sys_reg_desc *r)
+{
+   struct kvm_s2_mmu *mmu = &vcpu->kvm->arch.mmu;
+   u64 vttbr = kvm_get_vttbr(&mmu->el2_vmid, mmu);
+
+   /*
+* To emulate invalidating all EL2 regime stage 1 TLB entries,
+* invalidate EL1&0 regime stage 1 TLB entries with the virtual EL2's
+* VMID.
+*/
+   kvm_call_hyp(__kvm_tlb_flush_local_vmid, vttbr);
+   return true;
+}
+
+static bool handle_alle2is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+  const struct sys_reg_desc *r)
+{
+   struct kvm_s2_mmu *mmu = &vcpu->kvm->arch.mmu;
+   u64 vttbr = kvm_get_vttbr(&mmu->el2_vmid, mmu);
+
+   /*
+* To emulate invalidating all EL2 regime stage 1 TLB entries for all
+* PEs, executing TLBI VMALLE1IS is enough. But reuse the existing
+* interface for the simplicity; invalidating stage 2 entries doesn't
+* affect the correctness.
+*/
+   kvm_call_hyp(__kvm_tlb_flush_vmid, vttbr);
+   return true;
+}
+
 /*
  * AT instruction emulation
  *
@@ -1830,14 +1861,14 @@ static bool handle_s12w(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
SYS_INSN_TO_DESC(AT_S12E0W, handle_s12w, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2E1IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2LE1IS, NULL, NULL),
-   SYS_INSN_TO_DESC(TLBI_ALLE2IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_ALLE2IS, handle_alle2is, NULL),
SYS_INSN_TO_DESC(TLBI_VAE2IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE1IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_VALE2IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_VMALLS12E1IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2E1, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2LE1, NULL, NULL),
-   SYS_INSN_TO_DESC(TLBI_ALLE2, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_ALLE2, handle_alle2, NULL),
SYS_INSN_TO_DESC(TLBI_VAE2, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE1, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_VALE2, NULL, NULL),
-- 
1.9.1



[RFC PATCH v2 28/31] KVM: arm64: Emulate TLBI IPAS2E1* instructions

2017-10-02 Thread Jintack Lim
Based on the same principle as the TLBI ALLE1(IS) and TLBI
VMALLS12E1(IS) emulation, we clear the mappings in the shadow stage-2
page tables and invalidate TLB entries. This time we do it only for the
single mapping of the given IPA, for the current VMID from the guest
hypervisor's view.

Signed-off-by: Jintack Lim 
---
 arch/arm64/kvm/sys_regs.c | 38 ++
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 5fd47ad..eb91f00 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1849,6 +1849,36 @@ static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return true;
 }
 
+static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+const struct sys_reg_desc *r)
+{
+   u64 vttbr;
+   struct kvm_s2_mmu *mmu;
+   bool ret;
+
+   spin_lock(&vcpu->kvm->mmu_lock);
+   /*
+* Clear a mapping in the shadow page tables and invalidate the stage
+* 2 TLB entries via kvm_tlb_flush_vmid_ipa() for the current
+* VMID and the given ipa.
+*/
+   ret = kvm_nested_s2_clear_curr_vmid(vcpu, p->regval, PAGE_SIZE);
+   spin_unlock(&vcpu->kvm->mmu_lock);
+
+   if (!ret) {
+   /*
+* Invalidate TLB entries explicitly for the case that the
+* current VMID is for the host OS in the VM; we don't manage
+* shadow stage 2 page tables for it.
+*/
+   mmu = &vcpu->kvm->arch.mmu;
+   vttbr = kvm_get_vttbr(&mmu->vmid, mmu);
+   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, vttbr, p->regval);
+   }
+
+   return true;
+}
+
 /*
  * AT instruction emulation
  *
@@ -1930,15 +1960,15 @@ static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
SYS_INSN_TO_DESC(AT_S12E1W, handle_s12w, NULL),
SYS_INSN_TO_DESC(AT_S12E0R, handle_s12r, NULL),
SYS_INSN_TO_DESC(AT_S12E0W, handle_s12w, NULL),
-   SYS_INSN_TO_DESC(TLBI_IPAS2E1IS, NULL, NULL),
-   SYS_INSN_TO_DESC(TLBI_IPAS2LE1IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_IPAS2E1IS, handle_ipas2e1is, NULL),
+   SYS_INSN_TO_DESC(TLBI_IPAS2LE1IS, handle_ipas2e1is, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE2IS, handle_alle2is, NULL),
SYS_INSN_TO_DESC(TLBI_VAE2IS, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE1IS, handle_alle1is, NULL),
SYS_INSN_TO_DESC(TLBI_VALE2IS, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_VMALLS12E1IS, handle_vmalls12e1is, NULL),
-   SYS_INSN_TO_DESC(TLBI_IPAS2E1, NULL, NULL),
-   SYS_INSN_TO_DESC(TLBI_IPAS2LE1, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_IPAS2E1, handle_ipas2e1is, NULL),
+   SYS_INSN_TO_DESC(TLBI_IPAS2LE1, handle_ipas2e1is, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE2, handle_alle2, NULL),
SYS_INSN_TO_DESC(TLBI_VAE2, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE1, handle_alle1is, NULL),
-- 
1.9.1



[RFC PATCH v2 22/31] KVM: arm64: Emulate AT S1E2 instructions

2017-10-02 Thread Jintack Lim
Emulate AT S1E2 instructions by issuing the corresponding S1E1
instructions in EL2. We set the physical EL1 registers and the HCR_EL2
register as described in the AT instruction emulation overview.

Signed-off-by: Jintack Lim 
---
 arch/arm64/kvm/sys_regs.c | 19 +--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index a82274f..cb46db5 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1641,6 +1641,21 @@ static bool handle_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return true;
 }
 
+static bool handle_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+   const struct sys_reg_desc *r)
+{
+   struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
+   bool el2_format;
+   int sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+   /* See the '1. EL2 AT instructions: S1E2x' table */
+   ctxt->hw_sys_regs = ctxt->shadow_sys_regs;
+   el2_format = !vcpu_el2_e2h_is_set(vcpu);
+
+   kvm_call_hyp(__kvm_at_insn, vcpu, p->regval, el2_format, sys_encoding);
+   return true;
+}
+
 /*
  * AT instruction emulation
  *
@@ -1716,8 +1731,8 @@ static bool handle_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
SYS_INSN_TO_DESC(AT_S1E0W, handle_s1e01, NULL),
SYS_INSN_TO_DESC(AT_S1E1RP, handle_s1e01, NULL),
SYS_INSN_TO_DESC(AT_S1E1WP, handle_s1e01, NULL),
-   SYS_INSN_TO_DESC(AT_S1E2R, NULL, NULL),
-   SYS_INSN_TO_DESC(AT_S1E2W, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S1E2R, handle_s1e2, NULL),
+   SYS_INSN_TO_DESC(AT_S1E2W, handle_s1e2, NULL),
SYS_INSN_TO_DESC(AT_S12E1R, NULL, NULL),
SYS_INSN_TO_DESC(AT_S12E1W, NULL, NULL),
SYS_INSN_TO_DESC(AT_S12E0R, NULL, NULL),
-- 
1.9.1



[RFC PATCH v2 23/31] KVM: arm64: Emulate AT S12E[01] instructions

2017-10-02 Thread Jintack Lim
Emulating AT S12E[01] instructions involves two steps. First, do the
stage-1 translation by reusing the existing AT emulation functions. Then
do the stage-2 translation by walking the guest hypervisor's stage-2
page table in software. Record the translation result in PAR_EL1.
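
To make the two-step flow concrete, an illustrative walk-through with
made-up addresses:

    VA 0x7f1234 --(stage 1, via the S1E[01] handlers)--> PAR_EL1 with
        PA field 0x84000000
    IPA = (PAR & GENMASK(47, 12)) | (VA & GENMASK(11, 0)) = 0x84000234
    IPA --(software walk of the guest hypervisor's stage 2)--> PA, attrs
    PAR_EL1 is then rebuilt via setup_par_completed() on success, or via
    setup_par_aborted() if the stage-2 walk or permission check fails.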

Signed-off-by: Jintack Lim 
---
 arch/arm64/include/asm/kvm_arm.h |  1 +
 arch/arm64/kvm/sys_regs.c| 99 ++--
 2 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 3993703..e160895 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -111,6 +111,7 @@
 #define VTCR_EL2_TG0_16K   TCR_TG0_16K
 #define VTCR_EL2_TG0_64K   TCR_TG0_64K
 #define VTCR_EL2_SH0_MASK  TCR_SH0_MASK
+#define VTCR_EL2_SH0_SHIFT TCR_SH0_SHIFT
 #define VTCR_EL2_SH0_INNER TCR_SH0_INNER
 #define VTCR_EL2_ORGN0_MASKTCR_ORGN0_MASK
 #define VTCR_EL2_ORGN0_WBWATCR_ORGN0_WBWA
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index cb46db5..7950ee0 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1656,6 +1656,97 @@ static bool handle_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return true;
 }
 
+static u64 setup_par_aborted(u32 esr)
+{
+   u64 par = 0;
+
+   /* S [9]: fault in the stage 2 translation */
+   par |= (1 << 9);
+   /* FST [6:1]: Fault status code  */
+   par |= (esr << 1);
+   /* F [0]: translation is aborted */
+   par |= 1;
+
+   return par;
+}
+
+static u64 setup_par_completed(struct kvm_vcpu *vcpu, struct kvm_s2_trans *out)
+{
+   u64 par, vtcr_sh0;
+
+   /* F [0]: Translation is completed successfully */
+   par = 0;
+   /* ATTR [63:56] */
+   par |= out->upper_attr;
+   /* PA [47:12] */
+   par |= out->output & GENMASK_ULL(47, 12);
+   /* RES1 [11] */
+   par |= (1UL << 11);
+   /* SH [8:7]: Shareability attribute */
+   vtcr_sh0 = vcpu_sys_reg(vcpu, VTCR_EL2) & VTCR_EL2_SH0_MASK;
+   par |= (vtcr_sh0 >> VTCR_EL2_SH0_SHIFT) << 7;
+
+   return par;
+}
+
+static bool handle_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+  const struct sys_reg_desc *r, bool write)
+{
+   u64 par, va;
+   u32 esr;
+   phys_addr_t ipa;
+   struct kvm_s2_trans out;
+   int ret;
+
+   /* Do the stage-1 translation */
+   handle_s1e01(vcpu, p, r);
+   par = vcpu_sys_reg(vcpu, PAR_EL1);
+   if (par & 1) {
+   /* The stage-1 translation aborted */
+   return true;
+   }
+
+   /* Do the stage-2 translation */
+   va = p->regval;
+   ipa = (par & GENMASK_ULL(47, 12)) | (va & GENMASK_ULL(11, 0));
+   out.esr = 0;
+   ret = kvm_walk_nested_s2(vcpu, ipa, &out);
+   if (ret < 0)
+   return false;
+
+   /* Check if the stage-2 PTW is aborted */
+   if (out.esr) {
+   esr = out.esr;
+   goto s2_trans_abort;
+   }
+
+   /* Check the access permission */
+   if ((!write && !out.readable) || (write && !out.writable)) {
+   esr = ESR_ELx_FSC_PERM;
+   esr |= out.level & 0x3;
+   goto s2_trans_abort;
+   }
+
+   vcpu_sys_reg(vcpu, PAR_EL1) = setup_par_completed(vcpu, &out);
+   return true;
+
+s2_trans_abort:
+   vcpu_sys_reg(vcpu, PAR_EL1) = setup_par_aborted(esr);
+   return true;
+}
+
+static bool handle_s12r(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+   const struct sys_reg_desc *r)
+{
+   return handle_s12(vcpu, p, r, false);
+}
+
+static bool handle_s12w(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+   const struct sys_reg_desc *r)
+{
+   return handle_s12(vcpu, p, r, true);
+}
+
 /*
  * AT instruction emulation
  *
@@ -1733,10 +1824,10 @@ static bool handle_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
SYS_INSN_TO_DESC(AT_S1E1WP, handle_s1e01, NULL),
SYS_INSN_TO_DESC(AT_S1E2R, handle_s1e2, NULL),
SYS_INSN_TO_DESC(AT_S1E2W, handle_s1e2, NULL),
-   SYS_INSN_TO_DESC(AT_S12E1R, NULL, NULL),
-   SYS_INSN_TO_DESC(AT_S12E1W, NULL, NULL),
-   SYS_INSN_TO_DESC(AT_S12E0R, NULL, NULL),
-   SYS_INSN_TO_DESC(AT_S12E0W, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S12E1R, handle_s12r, NULL),
+   SYS_INSN_TO_DESC(AT_S12E1W, handle_s12w, NULL),
+   SYS_INSN_TO_DESC(AT_S12E0R, handle_s12r, NULL),
+   SYS_INSN_TO_DESC(AT_S12E0W, handle_s12w, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2E1IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2LE1IS, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE2IS, NULL, NULL),
-- 
1.9.1



[RFC PATCH v2 29/31] KVM: arm64: Respect the virtual HCR_EL2.AT and NV setting

2017-10-02 Thread Jintack Lim
Forward system instruction traps to the virtual EL2 if a corresponding
bit in the virtual HCR_EL2 is set.

Signed-off-by: Jintack Lim 
---

Notes:
v1-->v2:
This is a new commit.  We can rework the existing forward_nv_traps() and
forward_nv1_traps() defined in the rfc-v2 cpu patches to reuse the
forward_traps() function.

 arch/arm64/include/asm/kvm_arm.h |  1 +
 arch/arm64/kvm/sys_regs.c| 69 +---
 2 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index e160895..925edfd 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -23,6 +23,7 @@
 #include 
 
 /* Hyp Configuration Register (HCR) bits */
+#define HCR_AT (UL(1) << 44)
 #define HCR_NV1(UL(1) << 43)
 #define HCR_NV (UL(1) << 42)
 #define HCR_E2H(UL(1) << 34)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index eb91f00..89e73af 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -966,6 +966,23 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu,
return true;
 }
 
+static bool forward_traps(struct kvm_vcpu *vcpu, u64 control_bit)
+{
+   bool control_bit_set;
+
+   control_bit_set = vcpu_sys_reg(vcpu, HCR_EL2) & control_bit;
+   if (!vcpu_mode_el2(vcpu) && control_bit_set) {
+   kvm_inject_nested_sync(vcpu, kvm_vcpu_get_hsr(vcpu));
+   return true;
+   }
+   return false;
+}
+
+static bool forward_at_traps(struct kvm_vcpu *vcpu)
+{
+   return forward_traps(vcpu, HCR_AT);
+}
+
 /* This function is to support the recursive nested virtualization */
 bool forward_nv_traps(struct kvm_vcpu *vcpu)
 {
@@ -1948,32 +1965,32 @@ static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 #define SYS_INSN_TO_DESC(insn, access_fn, forward_fn)  \
{ SYS_DESC((insn)), (access_fn), NULL, 0, 0, NULL, NULL, (forward_fn) }
 static struct sys_reg_desc sys_insn_descs[] = {
-   SYS_INSN_TO_DESC(AT_S1E1R, handle_s1e01, NULL),
-   SYS_INSN_TO_DESC(AT_S1E1W, handle_s1e01, NULL),
-   SYS_INSN_TO_DESC(AT_S1E0R, handle_s1e01, NULL),
-   SYS_INSN_TO_DESC(AT_S1E0W, handle_s1e01, NULL),
-   SYS_INSN_TO_DESC(AT_S1E1RP, handle_s1e01, NULL),
-   SYS_INSN_TO_DESC(AT_S1E1WP, handle_s1e01, NULL),
-   SYS_INSN_TO_DESC(AT_S1E2R, handle_s1e2, NULL),
-   SYS_INSN_TO_DESC(AT_S1E2W, handle_s1e2, NULL),
-   SYS_INSN_TO_DESC(AT_S12E1R, handle_s12r, NULL),
-   SYS_INSN_TO_DESC(AT_S12E1W, handle_s12w, NULL),
-   SYS_INSN_TO_DESC(AT_S12E0R, handle_s12r, NULL),
-   SYS_INSN_TO_DESC(AT_S12E0W, handle_s12w, NULL),
-   SYS_INSN_TO_DESC(TLBI_IPAS2E1IS, handle_ipas2e1is, NULL),
-   SYS_INSN_TO_DESC(TLBI_IPAS2LE1IS, handle_ipas2e1is, NULL),
-   SYS_INSN_TO_DESC(TLBI_ALLE2IS, handle_alle2is, NULL),
-   SYS_INSN_TO_DESC(TLBI_VAE2IS, handle_vae2, NULL),
-   SYS_INSN_TO_DESC(TLBI_ALLE1IS, handle_alle1is, NULL),
-   SYS_INSN_TO_DESC(TLBI_VALE2IS, handle_vae2, NULL),
-   SYS_INSN_TO_DESC(TLBI_VMALLS12E1IS, handle_vmalls12e1is, NULL),
-   SYS_INSN_TO_DESC(TLBI_IPAS2E1, handle_ipas2e1is, NULL),
-   SYS_INSN_TO_DESC(TLBI_IPAS2LE1, handle_ipas2e1is, NULL),
-   SYS_INSN_TO_DESC(TLBI_ALLE2, handle_alle2, NULL),
-   SYS_INSN_TO_DESC(TLBI_VAE2, handle_vae2, NULL),
-   SYS_INSN_TO_DESC(TLBI_ALLE1, handle_alle1is, NULL),
-   SYS_INSN_TO_DESC(TLBI_VALE2, handle_vae2, NULL),
-   SYS_INSN_TO_DESC(TLBI_VMALLS12E1, handle_vmalls12e1is, NULL),
+   SYS_INSN_TO_DESC(AT_S1E1R, handle_s1e01, forward_at_traps),
+   SYS_INSN_TO_DESC(AT_S1E1W, handle_s1e01, forward_at_traps),
+   SYS_INSN_TO_DESC(AT_S1E0R, handle_s1e01, forward_at_traps),
+   SYS_INSN_TO_DESC(AT_S1E0W, handle_s1e01, forward_at_traps),
+   SYS_INSN_TO_DESC(AT_S1E1RP, handle_s1e01, forward_at_traps),
+   SYS_INSN_TO_DESC(AT_S1E1WP, handle_s1e01, forward_at_traps),
+   SYS_INSN_TO_DESC(AT_S1E2R, handle_s1e2, forward_nv_traps),
+   SYS_INSN_TO_DESC(AT_S1E2W, handle_s1e2, forward_nv_traps),
+   SYS_INSN_TO_DESC(AT_S12E1R, handle_s12r, forward_nv_traps),
+   SYS_INSN_TO_DESC(AT_S12E1W, handle_s12w, forward_nv_traps),
+   SYS_INSN_TO_DESC(AT_S12E0R, handle_s12r, forward_nv_traps),
+   SYS_INSN_TO_DESC(AT_S12E0W, handle_s12w, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_IPAS2E1IS, handle_ipas2e1is, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_IPAS2LE1IS, handle_ipas2e1is, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_ALLE2IS, handle_alle2is, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_VAE2IS, handle_vae2, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_ALLE1IS, handle_alle1is, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_VALE2IS, handle_vae2, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_VMALLS12E1IS, handle_vmalls12e1is, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_IPAS2E1, handle_ipas2e1is, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_IPAS2LE1, handle_ipas2e1is, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_ALLE2, handle_alle2, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_VAE2, handle_vae2, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_ALLE1, handle_alle1is, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_VALE2, handle_vae2, forward_nv_traps),
+   SYS_INSN_TO_DESC(TLBI_VMALLS12E1, handle_vmalls12e1is, forward_nv_traps),
 };

[RFC PATCH v2 27/31] KVM: arm64: Emulate TLBI VMALLS12E1(IS) instruction

2017-10-02 Thread Jintack Lim
Based on the same principle as TLBI ALLE1(IS) emulation, we clear the
mappings in the shadow stage-2 page tables and invalidate TLB entries.
But this time we do it only for the current VMID from the guest
hypervisor's perspective, not for all VMIDs.

Signed-off-by: Jintack Lim 
---
 arch/arm64/include/asm/kvm_mmu.h |  2 ++
 arch/arm64/kvm/mmu-nested.c  | 23 +++
 arch/arm64/kvm/sys_regs.c| 33 +++--
 3 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 239bb89..6681be1 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -345,6 +345,8 @@ int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 void kvm_nested_s2_clear(struct kvm *kvm);
 void kvm_nested_s2_flush(struct kvm *kvm);
 int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
+bool kvm_nested_s2_clear_curr_vmid(struct kvm_vcpu *vcpu, phys_addr_t start,
+  u64 size);
 
 static inline u64 kvm_get_vttbr(struct kvm_s2_vmid *vmid,
struct kvm_s2_mmu *mmu)
diff --git a/arch/arm64/kvm/mmu-nested.c b/arch/arm64/kvm/mmu-nested.c
index a440d7b..2189f2b 100644
--- a/arch/arm64/kvm/mmu-nested.c
+++ b/arch/arm64/kvm/mmu-nested.c
@@ -349,6 +349,29 @@ static struct kvm_nested_s2_mmu *lookup_nested_mmu(struct kvm_vcpu *vcpu,
return NULL;
 }
 
+/*
+ * Clear mappings in the shadow stage 2 page tables for the current VMID from
+ * the perspective of the guest hypervisor.
+ * This function expects kvm->mmu_lock to be held.
+ */
+bool kvm_nested_s2_clear_curr_vmid(struct kvm_vcpu *vcpu, phys_addr_t start,
+  u64 size)
+{
+   struct kvm_nested_s2_mmu *nested_mmu;
+   u64 vttbr = vcpu_sys_reg(vcpu, VTTBR_EL2);
+
+   /*
+* Look up a mmu that is used for the current VMID from the guest
+* hypervisor's view.
+*/
+   nested_mmu = lookup_nested_mmu(vcpu, vttbr);
+   if (!nested_mmu)
+   return false;
+
+   kvm_unmap_stage2_range(vcpu->kvm, &nested_mmu->mmu, start, size);
+   return true;
+}
+
 /**
  * create_nested_mmu - create mmu for the given virtual VMID
  *
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 5a82de9..5fd47ad 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1820,6 +1820,35 @@ static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return true;
 }
 
+static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+   const struct sys_reg_desc *r)
+{
+   u64 vttbr;
+   struct kvm_s2_mmu *mmu;
+   bool ret;
+
+   spin_lock(&vcpu->kvm->mmu_lock);
+   /*
+* Clear mappings in the shadow page tables and invalidate the stage
+* 1 and 2 TLB entries via kvm_tlb_flush_vmid_ipa() for the current
+* VMID.
+*/
+   ret = kvm_nested_s2_clear_curr_vmid(vcpu, 0, KVM_PHYS_SIZE);
+   spin_unlock(&vcpu->kvm->mmu_lock);
+
+   if (!ret) {
+   /*
+* Invalidate TLB entries explicitly for the case that the
+* current VMID is for the host OS in the VM; we don't manage
+* shadow stage 2 page tables for it.
+*/
+   mmu = &vcpu->kvm->arch.mmu;
+   vttbr = kvm_get_vttbr(&mmu->vmid, mmu);
+   kvm_call_hyp(__kvm_tlb_flush_vmid, vttbr);
+   }
+   return true;
+}
+
 /*
  * AT instruction emulation
  *
@@ -1907,14 +1936,14 @@ static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
SYS_INSN_TO_DESC(TLBI_VAE2IS, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE1IS, handle_alle1is, NULL),
SYS_INSN_TO_DESC(TLBI_VALE2IS, handle_vae2, NULL),
-   SYS_INSN_TO_DESC(TLBI_VMALLS12E1IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VMALLS12E1IS, handle_vmalls12e1is, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2E1, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_IPAS2LE1, NULL, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE2, handle_alle2, NULL),
SYS_INSN_TO_DESC(TLBI_VAE2, handle_vae2, NULL),
SYS_INSN_TO_DESC(TLBI_ALLE1, handle_alle1is, NULL),
SYS_INSN_TO_DESC(TLBI_VALE2, handle_vae2, NULL),
-   SYS_INSN_TO_DESC(TLBI_VMALLS12E1, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VMALLS12E1, handle_vmalls12e1is, NULL),
 };
 
 #define reg_to_match_value(x)  \
-- 
1.9.1



[RFC PATCH v2 19/31] KVM: arm64: Describe AT instruction emulation design

2017-10-02 Thread Jintack Lim
This design overview will help readers digest the subsequent patches
that implement AT instruction emulation.

Signed-off-by: Jintack Lim 
---
 arch/arm64/kvm/sys_regs.c | 66 +++
 1 file changed, 66 insertions(+)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 8d04926..d8728cc 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1621,6 +1621,72 @@ static bool access_id_aa64mmfr0_el1(struct kvm_vcpu *v,
{ SYS_DESC(SYS_SP_EL2), NULL, reset_special, SP_EL2, 0},
 };
 
+/*
+ * AT instruction emulation
+ *
+ * We emulate AT instructions executed in the virtual EL2.
+ * Basic strategy for the stage-1 translation emulation is to load proper
+ * context, which depends on the trapped instruction and the virtual HCR_EL2,
+ * to the EL1 virtual memory control registers and execute S1E[01] instructions
+ * in EL2. See below for more detail.
+ *
+ * For the stage-2 translation, which is necessary for S12E[01] emulation,
+ * we walk the guest hypervisor's stage-2 page table in software.
+ *
+ * The stage-1 translation emulations can be divided into two groups depending
+ * on the translation regime.
+ *
+ * 1. EL2 AT instructions: S1E2x
+ * +---+
+ * | | Setting for the emulation   |
+ * | Virtual HCR_EL2.E2H on trap |-+
+ * | | Phys EL1 regs | Phys NV, NV1 | Phys TGE |
+ * |---|
+ * | 0   | vEL2  |(1, 1)|0 |
+ * | 1   | vEL2  |(0, 0)|0 |
+ * +---+
+ *
+ * We emulate the EL2 AT instructions by loading virtual EL2 context
+ * to the EL1 virtual memory control registers and executing corresponding
+ * EL1 AT instructions.
+ *
+ * We set physical NV and NV1 bits to use EL2 page table format for non-VHE
+ * guest hypervisor (i.e. HCR_EL2.E2H == 0). As a VHE guest hypervisor uses the
+ * EL1 page table format, we don't set those bits.
+ *
+ * We should clear physical TGE bit not to use the EL2 translation regime when
+ * the host uses the VHE feature.
+ *
+ *
+ * 2. EL0/EL1 AT instructions: S1E[01]x, S12E1x
+ * +--+
+ * |   Virtual HCR_EL2 on trap  |Setting for the emulation|
+ * |--+
+ * | (vE2H, vTGE) | (vNV, vNV1) | Phys EL1 regs | Phys NV, NV1 | Phys TGE |
+ * |--|
+ * |(0, 0)*   |   (0, 0)|  vEL1 |(0, 0)|0 |
+ * |(0, 0)|   (1, 1)|  vEL1 |(1, 1)|0 |
+ * |(1, 1)|   (0, 0)|  vEL2 |(0, 0)|0 |
+ * |(1, 1)|   (1, 1)|  vEL2 |(1, 1)|0 |
+ * +--+
+ *
+ * *For (0, 0) in the 'Virtual HCR_EL2 on trap' column, it actually means
+ *  (1, 1). Keep them (0, 0) just for the readability.
+ *
+ * We set physical EL1 virtual memory control registers depending on
+ * (vE2H, vTGE) pair. When the pair is (0, 0) where AT instructions are
+ * supposed to use EL0/EL1 translation regime, we load the EL1 registers with
+ * the virtual EL1 registers (i.e. EL1 registers from the guest hypervisor's
+ * point of view). When the pair is (1, 1), however, AT instructions are defined
+ * to apply EL2 translation regime. To emulate this behavior, we load the EL1
+ * registers with the virtual EL2 context. (i.e the shadow registers)
+ *
+ * We respect the virtual NV and NV1 bit for the emulation. When those bits are
+ * set, it means that a guest hypervisor would like to use EL2 page table format
+ * for the EL1 translation regime. We emulate this by setting the physical
+ * NV and NV1 bits.
+ */
+
 #define SYS_INSN_TO_DESC(insn, access_fn, forward_fn)  \
{ SYS_DESC((insn)), (access_fn), NULL, 0, 0, NULL, NULL, (forward_fn) }
 static struct sys_reg_desc sys_insn_descs[] = {
-- 
1.9.1



[RFC PATCH v2 30/31] KVM: arm64: Emulate TLBI instructions accessible from EL1

2017-10-02 Thread Jintack Lim
Even though a guest hypervisor can execute TLBI instructions that are
accessible at EL1 without trapping, doing so is wrong; all those TLBI
instructions operate on the current VMID, and when a guest hypervisor
is running, the current VMID is its own, not the one from the virtual
vttbr_el2. So letting a guest hypervisor execute those TLBI
instructions results in invalidating its own TLB entries and leaving
the TLB entries it meant to invalidate untouched.

Therefore we trap and emulate those TLBI instructions. The emulation is
simple; we find a shadow VMID mapped to the virtual vttbr_el2, set it in
the physical vttbr_el2, then execute the same instruction in EL2.

We don't set the HCR_EL2.TTLB bit yet.
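
On the sys_regs.c side, a handler following the same shape as the other
TLBI handlers would look roughly like this minimal sketch
(handle_tlbi_el1 is a hypothetical name):

    static bool handle_tlbi_el1(struct kvm_vcpu *vcpu,
                                struct sys_reg_params *p,
                                const struct sys_reg_desc *r)
    {
            struct kvm_nested_s2_mmu *nested_mmu;
            u64 virtual_vttbr = vcpu_sys_reg(vcpu, VTTBR_EL2);
            int sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn,
                                        p->CRm, p->Op2);
            u64 vttbr;

            /* Find the shadow VMID mapped to the virtual vttbr_el2 */
            nested_mmu = lookup_nested_mmu(vcpu, virtual_vttbr);
            if (!nested_mmu)
                    return true;    /* no shadow context yet */

            vttbr = kvm_get_vttbr(&nested_mmu->mmu.vmid,
                                  &nested_mmu->mmu);
            kvm_call_hyp(__kvm_tlb_el1_instr, vttbr, p->regval,
                         sys_encoding);
            return true;
    }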

Signed-off-by: Jintack Lim 
---
 arch/arm64/include/asm/kvm_asm.h |  1 +
 arch/arm64/include/asm/kvm_mmu.h |  1 +
 arch/arm64/include/asm/sysreg.h  | 15 
 arch/arm64/kvm/hyp/tlb.c | 52 
 arch/arm64/kvm/mmu-nested.c  |  3 +--
 arch/arm64/kvm/sys_regs.c| 50 ++
 6 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index cd7fb85..ce331d7 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -56,6 +56,7 @@
 extern void __kvm_tlb_flush_vmid(u64 vttbr);
 extern void __kvm_tlb_flush_local_vmid(u64 vttbr);
 extern void __kvm_tlb_vae2(u64 vttbr, u64 va, u64 sys_encoding);
+extern void __kvm_tlb_el1_instr(u64 vttbr, u64 val, u64 sys_encoding);
 
 extern void __kvm_at_insn(struct kvm_vcpu *vcpu, unsigned long vaddr,
  bool el2_regime, int sys_encoding);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 6681be1..601f431 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -347,6 +347,7 @@ int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
 bool kvm_nested_s2_clear_curr_vmid(struct kvm_vcpu *vcpu, phys_addr_t start,
   u64 size);
+struct kvm_nested_s2_mmu *lookup_nested_mmu(struct kvm_vcpu *vcpu, u64 vttbr);
 
 static inline u64 kvm_get_vttbr(struct kvm_s2_vmid *vmid,
struct kvm_s2_mmu *mmu)
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 53df733..fd6b98a 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -386,10 +386,25 @@
 
 /* TLBI instructions */
 #define TLBI_Op0   1
+#define TLBI_Op1_EL1   0   /* Accessible from EL1 or higher */
 #define TLBI_Op1_EL2   4   /* Accessible from EL2 or higher */
 #define TLBI_CRn   8
+#define tlbi_insn_el1(CRm, Op2)    sys_insn(TLBI_Op0, TLBI_Op1_EL1, TLBI_CRn, (CRm), (Op2))
 #define tlbi_insn_el2(CRm, Op2)    sys_insn(TLBI_Op0, TLBI_Op1_EL2, TLBI_CRn, (CRm), (Op2))
 
+#define TLBI_VMALLE1IS tlbi_insn_el1(3, 0)
+#define TLBI_VAE1IStlbi_insn_el1(3, 1)
+#define TLBI_ASIDE1IS  tlbi_insn_el1(3, 2)
+#define TLBI_VAAE1IS   tlbi_insn_el1(3, 3)
+#define TLBI_VALE1IS   tlbi_insn_el1(3, 5)
+#define TLBI_VAALE1IS  tlbi_insn_el1(3, 7)
+#define TLBI_VMALLE1   tlbi_insn_el1(7, 0)
+#define TLBI_VAE1  tlbi_insn_el1(7, 1)
+#define TLBI_ASIDE1tlbi_insn_el1(7, 2)
+#define TLBI_VAAE1 tlbi_insn_el1(7, 3)
+#define TLBI_VALE1 tlbi_insn_el1(7, 5)
+#define TLBI_VAALE1tlbi_insn_el1(7, 7)
+
 #define TLBI_IPAS2E1IS tlbi_insn_el2(0, 1)
 #define TLBI_IPAS2LE1IStlbi_insn_el2(0, 5)
 #define TLBI_ALLE2IS   tlbi_insn_el2(3, 0)
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index bd8b92c..096c234 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -179,3 +179,55 @@ void __hyp_text __kvm_tlb_vae2(u64 vttbr, u64 va, u64 sys_encoding)
 
__tlb_switch_to_host()();
 }
+
+void __hyp_text __kvm_tlb_el1_instr(u64 vttbr, u64 val, u64 sys_encoding)
+{
+   /* Switch to requested VMID */
+   __tlb_switch_to_guest()(vttbr);
+
+   /* Execute the same instruction as the guest hypervisor did */
+   switch (sys_encoding) {
+   case TLBI_VMALLE1IS:
+   __tlbi(vmalle1is);
+   break;
+   case TLBI_VAE1IS:
+   __tlbi(vae1is, val);
+   break;
+   case TLBI_ASIDE1IS:
+   __tlbi(aside1is, val);
+   break;
+   case TLBI_VAAE1IS:
+   __tlbi(vaae1is, val);
+   break;
+   case TLBI_VALE1IS:
+   __tlbi(vale1is, val);
+   break;
+   case TLBI_VAALE1IS:
+   __tlbi(vaale1is, val);
+   break;
+   case TLBI_VMALLE1:
+   __tlbi(vmalle1);
+   break;
+   case TLBI_VAE1:
+   __tlbi(vae1, val);
+   break;
+   case TLBI_ASIDE1:
+   __tlbi(aside1, val);
+   break;
+   case TLBI_VAAE1:
+   __tlbi(vaae1, val);
+   break;
+   case TLBI_VALE1:
+   __tlbi(vale1, val);
+   break;
+   case TLBI_VAALE1:
+   __tlbi(vaale1, val);
+   break;
+   default:
+   break;
+   }
+   dsb(nsh);
+   isb();
+
+   __tlb_switch_to_host()();
+}

[RFC PATCH v2 18/31] KVM: arm64: Enumerate AT and TLBI instructions to emulate

2017-10-02 Thread Jintack Lim
List all system instructions to emulate. This patch only introduces the
definitions; emulation handlers will be added in subsequent patches.
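
As a sanity check of the encoding macros, TLBI VAE2 expands as follows,
matching the Op0=1, Op1=4, CRn=8, CRm=7, Op2=1 encoding of TLBI VAE2 in
the Arm ARM:

    TLBI_VAE2 = tlbi_insn_el2(7, 1)
              = sys_insn(TLBI_Op0, TLBI_Op1_EL2, TLBI_CRn, 7, 1)
              = sys_insn(1, 4, 8, 7, 1)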

Signed-off-by: Jintack Lim 
---
 arch/arm64/include/asm/sysreg.h | 38 ++
 arch/arm64/kvm/sys_regs.c   | 26 ++
 2 files changed, 64 insertions(+)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index a051d42..53df733 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -367,6 +367,44 @@
 
 #define SYS_SP_EL2 sys_reg(3, 6, 4, 1, 0)
 
+/* AT instructions */
+#define AT_Op0 1
+#define AT_CRn 7
+
+#define AT_S1E1R   sys_insn(AT_Op0, 0, AT_CRn, 8, 0)
+#define AT_S1E1W   sys_insn(AT_Op0, 0, AT_CRn, 8, 1)
+#define AT_S1E0R   sys_insn(AT_Op0, 0, AT_CRn, 8, 2)
+#define AT_S1E0W   sys_insn(AT_Op0, 0, AT_CRn, 8, 3)
+#define AT_S1E1RP  sys_insn(AT_Op0, 0, AT_CRn, 9, 0)
+#define AT_S1E1WP  sys_insn(AT_Op0, 0, AT_CRn, 9, 1)
+#define AT_S1E2R   sys_insn(AT_Op0, 4, AT_CRn, 8, 0)
+#define AT_S1E2W   sys_insn(AT_Op0, 4, AT_CRn, 8, 1)
+#define AT_S12E1R  sys_insn(AT_Op0, 4, AT_CRn, 8, 4)
+#define AT_S12E1W  sys_insn(AT_Op0, 4, AT_CRn, 8, 5)
+#define AT_S12E0R  sys_insn(AT_Op0, 4, AT_CRn, 8, 6)
+#define AT_S12E0W  sys_insn(AT_Op0, 4, AT_CRn, 8, 7)
+
+/* TLBI instructions */
+#define TLBI_Op0   1
+#define TLBI_Op1_EL2   4   /* Accessible from EL2 or higher */
+#define TLBI_CRn   8
+#define tlbi_insn_el2(CRm, Op2)    sys_insn(TLBI_Op0, TLBI_Op1_EL2, TLBI_CRn, (CRm), (Op2))
+
+#define TLBI_IPAS2E1IS tlbi_insn_el2(0, 1)
+#define TLBI_IPAS2LE1IStlbi_insn_el2(0, 5)
+#define TLBI_ALLE2IS   tlbi_insn_el2(3, 0)
+#define TLBI_VAE2IStlbi_insn_el2(3, 1)
+#define TLBI_ALLE1IS   tlbi_insn_el2(3, 4)
+#define TLBI_VALE2IS   tlbi_insn_el2(3, 5)
+#define TLBI_VMALLS12E1IS  tlbi_insn_el2(3, 6)
+#define TLBI_IPAS2E1   tlbi_insn_el2(4, 1)
+#define TLBI_IPAS2LE1  tlbi_insn_el2(4, 5)
+#define TLBI_ALLE2 tlbi_insn_el2(7, 0)
+#define TLBI_VAE2  tlbi_insn_el2(7, 1)
+#define TLBI_ALLE1 tlbi_insn_el2(7, 4)
+#define TLBI_VALE2 tlbi_insn_el2(7, 5)
+#define TLBI_VMALLS12E1tlbi_insn_el2(7, 6)
+
 /* Common SCTLR_ELx flags. */
 #define SCTLR_ELx_EE(1 << 25)
 #define SCTLR_ELx_I(1 << 12)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 481bea64..8d04926 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1624,6 +1624,32 @@ static bool access_id_aa64mmfr0_el1(struct kvm_vcpu *v,
 #define SYS_INSN_TO_DESC(insn, access_fn, forward_fn)  \
{ SYS_DESC((insn)), (access_fn), NULL, 0, 0, NULL, NULL, (forward_fn) }
 static struct sys_reg_desc sys_insn_descs[] = {
+   SYS_INSN_TO_DESC(AT_S1E1R, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S1E1W, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S1E0R, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S1E0W, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S1E1RP, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S1E1WP, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S1E2R, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S1E2W, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S12E1R, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S12E1W, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S12E0R, NULL, NULL),
+   SYS_INSN_TO_DESC(AT_S12E0W, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_IPAS2E1IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_IPAS2LE1IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_ALLE2IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VAE2IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_ALLE1IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VALE2IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VMALLS12E1IS, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_IPAS2E1, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_IPAS2LE1, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_ALLE2, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VAE2, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_ALLE1, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VALE2, NULL, NULL),
+   SYS_INSN_TO_DESC(TLBI_VMALLS12E1, NULL, NULL),
 };
 
 #define reg_to_match_value(x)  \
-- 
1.9.1



[RFC PATCH v2 17/31] KVM: arm64: Rework the system instruction emulation framework

2017-10-02 Thread Jintack Lim
Rework the system instruction emulation framework to handle potentially
all system instruction traps other than MSR/MRS instructions. Those
system instructions are the AT and TLBI instructions controlled by the
HCR_EL2.NV, AT, and TTLB bits.

Signed-off-by: Jintack Lim 
---
 arch/arm64/kvm/sys_regs.c | 66 ++-
 1 file changed, 25 insertions(+), 41 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 88ce172..481bea64 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1621,6 +1621,11 @@ static bool access_id_aa64mmfr0_el1(struct kvm_vcpu *v,
{ SYS_DESC(SYS_SP_EL2), NULL, reset_special, SP_EL2, 0},
 };
 
+#define SYS_INSN_TO_DESC(insn, access_fn, forward_fn)  \
+   { SYS_DESC((insn)), (access_fn), NULL, 0, 0, NULL, NULL, (forward_fn) }
+static struct sys_reg_desc sys_insn_descs[] = {
+};
+
 #define reg_to_match_value(x)  \
({  \
unsigned long val;  \
@@ -1674,6 +1679,25 @@ static void perform_access(struct kvm_vcpu *vcpu,
kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
 }
 
+static int emulate_sys_instr(struct kvm_vcpu *vcpu, struct sys_reg_params *p)
+{
+
+   const struct sys_reg_desc *r;
+
+   /* Search from the system instruction table. */
+   r = find_reg(p, sys_insn_descs, ARRAY_SIZE(sys_insn_descs));
+
+   if (likely(r)) {
+   perform_access(vcpu, p, r);
+   } else {
+   kvm_err("Unsupported guest sys instruction at: %lx\n",
+   *vcpu_pc(vcpu));
+   print_sys_reg_instr(p);
+   kvm_inject_undefined(vcpu);
+   }
+   return 1;
+}
+
 static bool trap_dbgidr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -2236,47 +2260,6 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu,
return 1;
 }
 
-static int emulate_tlbi(struct kvm_vcpu *vcpu,
-struct sys_reg_params *params)
-{
-   /* TODO: support tlbi instruction emulation*/
-   kvm_inject_undefined(vcpu);
-   return 1;
-}
-
-static int emulate_at(struct kvm_vcpu *vcpu,
-struct sys_reg_params *params)
-{
-   /* TODO: support address translation instruction emulation */
-   kvm_inject_undefined(vcpu);
-   return 1;
-}
-
-static int emulate_sys_instr(struct kvm_vcpu *vcpu,
-struct sys_reg_params *params)
-{
-   int ret = 0;
-
-   /*
-* Forward this trap to the virtual EL2 if the virtual HCR_EL2.NV
-* bit is set.
-*/
-   if (forward_nv_traps(vcpu))
-   return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_hsr(vcpu));
-
-   /* TLB maintenance instructions*/
-   if (params->CRn == 0b1000)
-   ret = emulate_tlbi(vcpu, params);
-   /* Address Translation instructions */
-   else if (params->CRn == 0b0111 && params->CRm == 0b1000)
-   ret = emulate_at(vcpu, params);
-
-   if (ret)
-   kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-
-   return ret;
-}
-
 static void reset_sys_reg_descs(struct kvm_vcpu *vcpu,
  const struct sys_reg_desc *table, size_t num)
 {
@@ -2754,6 +2737,7 @@ void kvm_sys_reg_table_init(void)
BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs)));
BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs)));
BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)));
+   BUG_ON(check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs)));
 
/* We abuse the reset function to overwrite the table itself. */
for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
-- 
1.9.1



[RFC PATCH v2 31/31] KVM: arm64: Fixes to toggle_cache for nesting

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

So far we were flushing almost the entire universe whenever a VM would
load/unload the SCTLR_EL1 and the two versions of that register had
different MMU enabled settings.  This turned out to be so slow that it
prevented forward progress for a nested VM, because a scheduler timer
tick interrupt would always be pending when we reached the nested VM.

To avoid this problem, we consider the SCTLR_EL2 when evaluating if
caches are on or off when entering virtual EL2 (because this is the
value that we end up shadowing onto the hardware EL1 register).

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---
 arch/arm64/include/asm/kvm_mmu.h | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 601f431..7a1c581 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -240,7 +240,10 @@ static inline bool kvm_page_empty(void *ptr)
 
 static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 {
-   return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
+   if (vcpu_mode_el2(vcpu))
+   return (vcpu_sys_reg(vcpu, SCTLR_EL2) & 0b101) == 0b101;
+   else
+   return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
 static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
-- 
1.9.1



[RFC PATCH v2 09/31] KVM: arm/arm64: Manage mmus for nested VMs

2017-10-02 Thread Jintack Lim
Now that a hypervisor can run in the virtual EL2, the guest hypervisor
can assign any VMID to its own VMs. To avoid conflicts between VMIDs
among the host and guest(s), the host hypervisor maps each VMID from a
guest hypervisor's view (i.e. virtual VMID) to a unique shadow VMID.
It also manages a set of shadow stage-2 page tables for each shadow
VMID. All this information is stored in the kvm_nested_s2_mmu struct.

The host hypervisor manages a list of kvm_nested_s2_mmu objects per VM.
On a VM entry it searches for an object in the list using the virtual
VMID as a key.
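
The VM-entry lookup described above is a list walk keyed on the VMID
field of the virtual VTTBR; a minimal sketch of the idea (the actual
implementation lives in mmu-nested.c and may differ in detail):

    struct kvm_nested_s2_mmu *lookup_nested_mmu(struct kvm_vcpu *vcpu,
                                                u64 vttbr)
    {
            struct kvm_nested_s2_mmu *mmu;
            u64 vmid_mask = VTTBR_VMID_MASK(get_kvm_vmid_bits());

            /* Callers are expected to hold kvm->mmu_lock */
            list_for_each_entry(mmu, &vcpu->kvm->arch.nested_mmu_list,
                                list) {
                    if ((mmu->virtual_vttbr & vmid_mask) ==
                        (vttbr & vmid_mask))
                            return mmu;
            }
            return NULL;
    }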

Signed-off-by: Jintack Lim 
---

Notes:
v1-->v2:
- This is a merged commit of [RFC 39/55] and [RFC 40/55].
- Updated the commit message and comments.
- Defer creating a new nested mmu structure until we enter the VM with
  stage 2 paging enabled, which was previously done on vttbr_el2 write
  operations.
- Use the existing kvm->mmu_lock when iterating nested mmus instead of
  creating one.

 arch/arm/include/asm/kvm_host.h  |  12 
 arch/arm64/include/asm/kvm_emulate.h |  13 ++---
 arch/arm64/include/asm/kvm_host.h|  25 
 arch/arm64/include/asm/kvm_mmu.h |  21 +++
 arch/arm64/kvm/Makefile  |   1 +
 arch/arm64/kvm/context.c |   2 +-
 arch/arm64/kvm/mmu-nested.c  | 108 +++
 virt/kvm/arm/arm.c   |   1 +
 8 files changed, 174 insertions(+), 9 deletions(-)
 create mode 100644 arch/arm64/kvm/mmu-nested.c

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 33ccdbe..d84c1c1 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -67,6 +67,15 @@ struct kvm_s2_mmu {
pgd_t *pgd;
 };
 
+/* Per shadow VMID mmu structure. This is only for nested virtualization */
+struct kvm_nested_s2_mmu {
+   struct kvm_s2_mmu mmu;
+
+   u64 virtual_vttbr;
+
+   struct list_head list;
+};
+
 struct kvm_arch {
/* Stage 2 paging state for the VM */
struct kvm_s2_mmu mmu;
@@ -79,6 +88,9 @@ struct kvm_arch {
 * here.
 */
 
+   /* Never used on arm but added to be compatible with arm64 */
+   struct list_head nested_mmu_list;
+
/* Interrupt controller */
struct vgic_distvgic;
int max_vcpus;
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 71a3a04..f476576 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -199,6 +199,11 @@ static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu)
return false;
 }
 
+static inline bool vcpu_nested_stage2_enabled(const struct kvm_vcpu *vcpu)
+{
+   return (vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
+}
+
 static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu)
 {
return vcpu->arch.fault.esr_el2;
@@ -385,12 +390,4 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
return data;/* Leave LE untouched */
 }
 
-static inline struct kvm_s2_vmid *vcpu_get_active_vmid(struct kvm_vcpu *vcpu)
-{
-   if (unlikely(is_hyp_ctxt(vcpu)))
-   return &vcpu->kvm->arch.mmu.el2_vmid;
-
-   return &vcpu->kvm->arch.mmu.vmid;
-}
-
 #endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index a7edf0e..0c37e49 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -65,6 +65,28 @@ struct kvm_s2_mmu {
pgd_t *pgd;
 };
 
+/* Per shadow VMID mmu structure */
+struct kvm_nested_s2_mmu {
+   struct kvm_s2_mmu mmu;
+
+   /*
+* virtual_vttbr contains the vttbr_el2 value from the guest hypervisor.
+* We use the vmid field as a key to search for this mmu object in the
+* list, and ignore the baddr field.
+*
+* Note that we could use the vmid and baddr fields separately to find
+* a shadow VMID and a pointer to the shadow stage-2 page table, then
+* combine them to set up hw_vttbr. The only benefit of doing so would
+* be reusing shadow stage-2 page tables across different VMIDs, which
+* is unusual. So we choose the current design for simplicity.
+*/
+   u64 virtual_vttbr;
+
+   struct list_head list;
+};
+
 struct kvm_arch {
/* Stage 2 paging state for the VM */
struct kvm_s2_mmu mmu;
@@ -77,6 +99,9 @@ struct kvm_arch {
 
/* Interrupt controller */
struct vgic_distvgic;
+
+   /* Stage 2 shadow paging contexts for nested L2 VM */
+   struct list_head nested_mmu_list;
 };
 
 #define KVM_NR_MEM_OBJS 40
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index bceaec1..452912f 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -112,6 +112,7 @@
 #include 
 #include 
 #include 
+#include 
 
 sta

[RFC PATCH v2 05/31] KVM: arm/arm64: Support mmu for the virtual EL2 execution

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

When running a guest hypervisor in virtual EL2, the translation context
has to be separate from the rest of the system, including the guest
EL1/0 translation regime, so we allocate a separate VMID for this mode.

Considering that we have two different vttbr values due to separate
VMIDs, it's racy to keep a vttbr value in a struct (kvm_s2_mmu) and
share it between multiple vcpus. So, remove the shared vttbr field and
set up a per-vcpu hw_vttbr field.

Hypercalls to flush the TLB now take vttbr as a parameter instead of
mmu, since the mmu structure no longer holds a vttbr.
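
On each entry, the per-vcpu value is then derived from whichever VMID is
active; roughly (a sketch of the idea, the series' actual entry-path
code may differ in detail):

    /* Before entering the guest */
    vcpu->arch.hw_vttbr = kvm_get_vttbr(vcpu_get_active_vmid(vcpu),
                                        vcpu->arch.hw_mmu);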

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---

Notes:
v1-->v2:
Fixed a bug where hw_vttbr was not initialized correctly in
kvm_arch_vcpu_init(), where the vmid is not allocated yet. This
prevented the guest from booting on 32-bit arm; hw_vttbr is set on each
entry on aarch64, so it was fine there.

 arch/arm/include/asm/kvm_asm.h   |  6 ++--
 arch/arm/include/asm/kvm_emulate.h   |  4 +++
 arch/arm/include/asm/kvm_host.h  | 14 +---
 arch/arm/include/asm/kvm_mmu.h   | 11 ++
 arch/arm/kvm/hyp/switch.c|  4 +--
 arch/arm/kvm/hyp/tlb.c   | 15 -
 arch/arm64/include/asm/kvm_asm.h |  6 ++--
 arch/arm64/include/asm/kvm_emulate.h |  8 +
 arch/arm64/include/asm/kvm_host.h| 14 +---
 arch/arm64/include/asm/kvm_mmu.h | 11 ++
 arch/arm64/kvm/hyp/switch.c  |  4 +--
 arch/arm64/kvm/hyp/tlb.c | 34 +--
 virt/kvm/arm/arm.c   | 65 +---
 virt/kvm/arm/mmu.c   |  9 +++--
 14 files changed, 128 insertions(+), 77 deletions(-)

diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 71b7255..23a79bd 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -65,9 +65,9 @@
 extern char __kvm_hyp_vector[];
 
 extern void __kvm_flush_vm_context(void);
-extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa);
-extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
-extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu);
+extern void __kvm_tlb_flush_vmid_ipa(u64 vttbr, phys_addr_t ipa);
+extern void __kvm_tlb_flush_vmid(u64 vttbr);
+extern void __kvm_tlb_flush_local_vmid(u64 vttbr);
 
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 29a4dec..24a3fbf 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -293,4 +293,8 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
}
 }
 
+static inline struct kvm_s2_vmid *vcpu_get_active_vmid(struct kvm_vcpu *vcpu)
+{
+   return &vcpu->kvm->arch.mmu.vmid;
+}
 #endif /* __ARM_KVM_EMULATE_H__ */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 78d826e..33ccdbe 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -53,16 +53,18 @@
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 void kvm_reset_coprocs(struct kvm_vcpu *vcpu);
 
-struct kvm_s2_mmu {
+struct kvm_s2_vmid {
/* The VMID generation used for the virt. memory system */
u64vmid_gen;
u32vmid;
+};
+
+struct kvm_s2_mmu {
+   struct kvm_s2_vmid vmid;
+   struct kvm_s2_vmid el2_vmid;
 
/* Stage-2 page table */
pgd_t *pgd;
-
-   /* VTTBR value associated with above pgd and vmid */
-   u64vttbr;
 };
 
 struct kvm_arch {
@@ -193,6 +195,9 @@ struct kvm_vcpu_arch {
 
/* Stage 2 paging state used by the hardware on next switch */
struct kvm_s2_mmu *hw_mmu;
+
+   /* VTTBR value used by the hardware on next switch */
+   u64 hw_vttbr;
 };
 
 struct kvm_vm_stat {
@@ -239,6 +244,7 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 {
 }
 
+unsigned int get_kvm_vmid_bits(void);
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
 void kvm_arm_halt_guest(struct kvm *kvm);
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index fa6f217..86fdc70 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -221,6 +221,17 @@ static inline unsigned int kvm_get_vmid_bits(void)
return 8;
 }
 
+static inline u64 kvm_get_vttbr(struct kvm_s2_vmid *vmid,
+   struct kvm_s2_mmu *mmu)
+{
+   u64 vmid_field, baddr;
+
+   baddr = virt_to_phys(mmu->pgd);
+   vmid_field = ((u64)vmid->vmid << VTTBR_VMID_SHIFT) &
+   VTTBR_VMID_MASK(get_kvm_vmid_bits());
+   return baddr | vmid_field;
+}
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index 4814671..4798e39 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c

[RFC PATCH v2 03/31] KVM: arm/arm64: Remove unused params in mmu functions

2017-10-02 Thread Jintack Lim
From: Christoffer Dall 

The stage2_flush_xxx functions take a pointer to the kvm struct as their
first parameter, but it is never used. Clean this up before modifying
the mmu code for nested virtualization support.

Signed-off-by: Christoffer Dall 
Signed-off-by: Jintack Lim 
---
 virt/kvm/arm/mmu.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index f2d5b6c..0a5f5ca 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -315,8 +315,7 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
} while (pgd++, addr = next, addr != end);
 }
 
-static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
- phys_addr_t addr, phys_addr_t end)
+static void stage2_flush_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
 {
pte_t *pte;
 
@@ -327,8 +326,7 @@ static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
} while (pte++, addr += PAGE_SIZE, addr != end);
 }
 
-static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
+static void stage2_flush_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
 {
pmd_t *pmd;
phys_addr_t next;
@@ -340,13 +338,12 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
if (pmd_thp_or_huge(*pmd))
kvm_flush_dcache_pmd(*pmd);
else
-   stage2_flush_ptes(kvm, pmd, addr, next);
+   stage2_flush_ptes(pmd, addr, next);
}
} while (pmd++, addr = next, addr != end);
 }
 
-static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
+static void stage2_flush_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
 {
pud_t *pud;
phys_addr_t next;
@@ -358,7 +355,7 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
if (stage2_pud_huge(*pud))
kvm_flush_dcache_pud(*pud);
else
-   stage2_flush_pmds(kvm, pud, addr, next);
+   stage2_flush_pmds(pud, addr, next);
}
} while (pud++, addr = next, addr != end);
 }
@@ -374,7 +371,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
pgd = kvm->arch.pgd + stage2_pgd_index(addr);
do {
next = stage2_pgd_addr_end(addr, end);
-   stage2_flush_puds(kvm, pgd, addr, next);
+   stage2_flush_puds(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
 }
 
-- 
1.9.1



[PATCH V1 0/3] High baud rate supports of F81866/F81216H

2017-10-02 Thread Ji-Ze Hong (Peter Hong)
The Fintek F81866/F81216H support high baud rates, up to 1.5Mbps with a
24MHz clock source. They also support 500Kbps via the same 24MHz clock.

We implement clock source checking in fintek_8250_set_termios().
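
The check is divisor arithmetic: a clock source can serve a requested
baud rate when one of the supported base rates divides it evenly and
uartclk equals that base rate * 16. For illustration:

    115200  * 16 =  1843200 -> 1.8432MHz clock
    921600  * 16 = 14745600 -> 14.769MHz clock source
    1152000 * 16 = 18432000 -> 18.432MHz clock
    1500000 * 16 = 24000000 -> 24MHz clock
    500Kbps is served by the 24MHz entry, since 1500000 % 500000 == 0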

Ji-Ze Hong (Peter Hong) (3):
  serial: 8250_fintek: UART dynamic clocksource on Fintek F81866
  serial: 8250_fintek: UART dynamic clocksource on Fintek F81216H
  serial: 8250_fintek: fix warning reported from smatch

 drivers/tty/serial/8250/8250_fintek.c | 84 ++-
 1 file changed, 83 insertions(+), 1 deletion(-)

-- 
1.9.1



[PATCH V1 1/3] serial: 8250_fintek: UART dynamic clocksource on Fintek F81866

2017-10-02 Thread Ji-Ze Hong (Peter Hong)
The F81866 has 4 clock sources (1.8432/18.432/14.769/24MHz), and baud
rates can be up to 1.5Mbps with the 24MHz clock. We implement dynamic
clock source selection in fintek_8250_set_termios().

Signed-off-by: Ji-Ze Hong (Peter Hong) 
---
 drivers/tty/serial/8250/8250_fintek.c | 54 +++
 1 file changed, 54 insertions(+)

diff --git a/drivers/tty/serial/8250/8250_fintek.c b/drivers/tty/serial/8250/8250_fintek.c
index e500f7d..53ea353 100644
--- a/drivers/tty/serial/8250/8250_fintek.c
+++ b/drivers/tty/serial/8250/8250_fintek.c
@@ -287,6 +287,59 @@ static void fintek_8250_goto_highspeed(struct uart_8250_port *uart,
}
 }
 
+void fintek_8250_set_termios(struct uart_port *port, struct ktermios *termios,
+   struct ktermios *old)
+{
+   struct fintek_8250 *pdata = port->private_data;
+   unsigned int baud = tty_termios_baud_rate(termios);
+   int i;
+   static u32 baudrate_table[] = {115200, 921600, 1152000, 1500000};
+   static u8 clock_table[] = { F81866_UART_CLK_1_8432MHZ,
+   F81866_UART_CLK_14_769MHZ, F81866_UART_CLK_18_432MHZ,
+   F81866_UART_CLK_24MHZ };
+
+   for (i = 0; i < ARRAY_SIZE(baudrate_table); ++i) {
+   if (baud > baudrate_table[i] || baudrate_table[i] % baud != 0)
+   continue;
+
+   if (port->uartclk == baudrate_table[i] * 16)
+   break;
+
+   if (fintek_8250_enter_key(pdata->base_port, pdata->key))
+   continue;
+
+   port->uartclk = baudrate_table[i] * 16;
+
+   sio_write_reg(pdata, LDN, pdata->index);
+   sio_write_mask_reg(pdata, F81866_UART_CLK,
+   F81866_UART_CLK_MASK, clock_table[i]);
+
+   fintek_8250_exit_key(pdata->base_port);
+   break;
+   }
+
+   if (i == ARRAY_SIZE(baudrate_table)) {
+   baud = tty_termios_baud_rate(old);
+   tty_termios_encode_baud_rate(termios, baud, baud);
+   }
+
+   serial8250_do_set_termios(port, termios, old);
+}
+
+static void fintek_8250_set_termios_handler(struct uart_8250_port *uart)
+{
+   struct fintek_8250 *pdata = uart->port.private_data;
+
+   switch (pdata->pid) {
+   case CHIP_ID_F81866:
+   uart->port.set_termios = fintek_8250_set_termios;
+   break;
+
+   default:
+   break;
+   }
+}
+
 static int probe_setup_port(struct fintek_8250 *pdata,
struct uart_8250_port *uart)
 {
@@ -373,6 +426,7 @@ int fintek_8250_probe(struct uart_8250_port *uart)
memcpy(pdata, &probe_data, sizeof(probe_data));
uart->port.private_data = pdata;
fintek_8250_set_rs485_handler(uart);
+   fintek_8250_set_termios_handler(uart);
 
return 0;
 }
-- 
1.9.1


