date:20190912

This patch adds UDC driver for tegra XUSB 3.0 device mode controller.
XUSB device mode controller supports SS, HS and FS modes

Based on work by:
  Mark Kuo 
  Hui Fu 
  Andrew Bresticker 

Signed-off-by: Nagarjuna Kristam 
Acked-by: Thierry Reding 
---
 drivers/usb/gadget/udc/Kconfig  |   11 +
 drivers/usb/gadget/udc/Makefile |1 +
 drivers/usb/gadget/udc/tegra-xudc.c | 3787 +++
 3 files changed, 3799 insertions(+)
 create mode 100644 drivers/usb/gadget/udc/tegra-xudc.c

diff --git a/drivers/usb/gadget/udc/Kconfig b/drivers/usb/gadget/udc/Kconfig
index d354036..c1f9364 100644
--- a/drivers/usb/gadget/udc/Kconfig
+++ b/drivers/usb/gadget/udc/Kconfig
@@ -441,6 +441,17 @@ config USB_GADGET_XILINX
  dynamically linked module called "udc-xilinx" and force all
  gadget drivers to also be dynamically linked.
 
+config USB_TEGRA_XUDC
+   tristate "NVIDIA Tegra Superspeed USB 3.0 Device Controller"
+   depends on ARCH_TEGRA || COMPILE_TEST
+   depends on PHY_TEGRA_XUSB
+   help
+Enables NVIDIA Tegra USB 3.0 device mode controller driver.
+
+Say "y" to link the driver statically, or "m" to build a
+dynamically linked module called "tegra_xudc" and force all
+gadget drivers to also be dynamically linked.
+
 source "drivers/usb/gadget/udc/aspeed-vhub/Kconfig"
 
 #
diff --git a/drivers/usb/gadget/udc/Makefile b/drivers/usb/gadget/udc/Makefile
index 897f648..f6777e6 100644
--- a/drivers/usb/gadget/udc/Makefile
+++ b/drivers/usb/gadget/udc/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_USB_BCM63XX_UDC) += bcm63xx_udc.o
 obj-$(CONFIG_USB_FSL_USB2) += fsl_usb2_udc.o
 fsl_usb2_udc-y := fsl_udc_core.o
 fsl_usb2_udc-$(CONFIG_ARCH_MXC)+= fsl_mxc_udc.o
+obj-$(CONFIG_USB_TEGRA_XUDC)   += tegra-xudc.o
 obj-$(CONFIG_USB_M66592)   += m66592-udc.o
 obj-$(CONFIG_USB_R8A66597) += r8a66597-udc.o
 obj-$(CONFIG_USB_RENESAS_USB3) += renesas_usb3.o
diff --git a/drivers/usb/gadget/udc/tegra-xudc.c 
b/drivers/usb/gadget/udc/tegra-xudc.c
new file mode 100644
index 000..5124f61
--- /dev/null
+++ b/drivers/usb/gadget/udc/tegra-xudc.c
@@ -0,0 +1,3787 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * NVIDIA Tegra XUSB device mode controller
+ *
+ * Copyright (c) 2013-2019, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2015, Google Inc.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* XUSB_DEV registers */
+#define SPARAM 0x000
+#define  SPARAM_ERSTMAX_MASK GENMASK(20, 16)
+#define  SPARAM_ERSTMAX(x) (((x) << 16) & SPARAM_ERSTMAX_MASK)
+#define DB 0x004
+#define  DB_TARGET_MASK GENMASK(15, 8)
+#define  DB_TARGET(x) (((x) << 8) & DB_TARGET_MASK)
+#define  DB_STREAMID_MASK GENMASK(31, 16)
+#define  DB_STREAMID(x) (((x) << 16) & DB_STREAMID_MASK)
+#define ERSTSZ 0x008
+#define  ERSTSZ_ERSTXSZ_SHIFT(x) ((x) * 16)
+#define  ERSTSZ_ERSTXSZ_MASK GENMASK(15, 0)
+#define ERSTXBALO(x) (0x010 + 8 * (x))
+#define ERSTXBAHI(x) (0x014 + 8 * (x))
+#define ERDPLO 0x020
+#define  ERDPLO_EHB BIT(3)
+#define ERDPHI 0x024
+#define EREPLO 0x028
+#define  EREPLO_ECS BIT(0)
+#define  EREPLO_SEGI BIT(1)
+#define EREPHI 0x02c
+#define CTRL 0x030
+#define  CTRL_RUN BIT(0)
+#define  CTRL_LSE BIT(1)
+#define  CTRL_IE BIT(4)
+#define  CTRL_SMI_EVT BIT(5)
+#define  CTRL_SMI_DSE BIT(6)
+#define  CTRL_EWE BIT(7)
+#define  CTRL_DEVADDR_MASK GENMASK(30, 24)
+#define  CTRL_DEVADDR(x) (((x) << 24) & CTRL_DEVADDR_MASK)
+#define  CTRL_ENABLE BIT(31)
+#define ST 0x034
+#define  ST_RC BIT(0)
+#define  ST_IP BIT(4)
+#define RT_IMOD0x038
+#define  RT_IMOD_IMODI_MASK GENMASK(15, 0)
+#define  RT_IMOD_IMODI(x) ((x) & RT_IMOD_IMODI_MASK)
+#define  RT_IMOD_IMODC_MASK GENMASK(31, 16)
+#define  RT_IMOD_IMODC(x) (((x) << 16) & RT_IMOD_IMODC_MASK)
+#define PORTSC 0x03c
+#define  PORTSC_CCS BIT(0)
+#define  PORTSC_PED BIT(1)
+#define  PORTSC_PR BIT(4)
+#define  PORTSC_PLS_SHIFT 5
+#define  PORTSC_PLS_MASK GENMASK(8, 5)
+#define  PORTSC_PLS_U0 0x0
+#define  PORTSC_PLS_U2 0x2
+#define  PORTSC_PLS_U3 0x3
+#define  PORTSC_PLS_DISABLED 0x4
+#define  PORTSC_PLS_RXDETECT 0x5
+#define  PORTSC_PLS_INACTIVE 0x6
+#define  PORTSC_PLS_RESUME 0xf
+#define  PORTSC_PLS(x) (((x) << PORTSC_PLS_SHIFT) & PORTSC_PLS_MASK)
+#define  PORTSC_PS_SHIFT 10
+#define  PORTSC_PS_MASK GENMASK(13, 10)
+#define  PORTSC_PS_UNDEFINED 0x0
+#define  PORTSC_PS_FS 0x1
+#define  PORTSC_PS_LS 0x2
+#define  PORTSC_PS_HS 0x3
+#define  PORTSC_PS_SS 0x4
+#define  PORTSC_LWS BIT(16)
+#define  PORTSC_CSC BIT(17)
+#define  PORTSC_WRC BIT(19)
+#define  PORTSC_PRC BIT(21)
+#define  PORTSC_PLC BIT(22)
+#define  PORTSC_CEC BIT(23)
+#define  PORTSC_WPR BIT(30)
+#define  PORTSC_CHANGE_MASK (PORTSC_CSC | PORTSC_WRC | PORTSC_PRC | \
+PORTSC_PLC | PORTSC_CEC)
+#defi

Re: [PATCH V2 0/2] mm/debug: Add tests for architecture exported page table helpers

2019-09-12 Thread Anshuman Khandual

On 09/12/2019 08:12 PM, Christophe Leroy wrote:
> Hi,
> 
> I didn't get patch 1 of this series, and it is not on linuxppc-dev patchwork 
> either. Can you resend ?

Its there on linux-mm patchwork and copied on linux-kernel@vger.kernel.org
as well. The CC list for the first patch was different than the second one.

https://patchwork.kernel.org/patch/11142317/

Let me know if you can not find it either on MM or LKML list.

- Anshuman

[PATCH] mm/pgtable/debug: Fix test validating architecture page table helpers

2019-09-12 Thread Christophe Leroy

Fix build failure on powerpc.

Fix preemption imbalance.

Signed-off-by: Christophe Leroy 
---
 mm/arch_pgtable_test.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/arch_pgtable_test.c b/mm/arch_pgtable_test.c
index 8b4a92756ad8..f2b3c9ec35fa 100644
--- a/mm/arch_pgtable_test.c
+++ b/mm/arch_pgtable_test.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -400,6 +401,8 @@ static int __init arch_pgtable_tests_init(void)
p4d_clear_tests(p4dp);
pgd_clear_tests(mm, pgdp);
 
+   pte_unmap(ptep);
+
pmd_populate_tests(mm, pmdp, saved_ptep);
pud_populate_tests(mm, pudp, saved_pmdp);
p4d_populate_tests(mm, p4dp, saved_pudp);
-- 
2.13.3

Re: [PATCH v3] arch/microblaze: add support for get_user() of size 8 bytes

2019-09-12 Thread Michal Simek

Hi Randy,

On 02. 09. 19 18:52, Linus Torvalds wrote:
> On Mon, Sep 2, 2019 at 6:17 AM Michal Simek  wrote:
>>
>> Randy/Linus: Are you going create regular patch from this?
> 
> Since I can't even test it, and Randy did most of the work (and that
> last patch worked for him too), I'd suggest he just send it in as his.
> 
> You can add my acked-by or sign-of depending on how you want to do it
> (but I really don't need authorship credit, it might as well go to
> Randy).

Can you please send v4 on this one?

Thanks,
Michal



signature.asc
Description: OpenPGP digital signature

Re: [PATCH v3 0/6] make use of gcc 9's "asm inline()"

On 13/09/2019 00.30, Miguel Ojeda wrote:
> On Fri, Sep 13, 2019 at 12:19 AM Rasmus Villemoes
>  wrote:
>>
>> Patch 1 has already been picked up by Greg in staging-next, it's
>> included here for completeness. I don't know how to route the rest, or
>> if they should simply wait for 5.5 given how close we are to the merge
>> window for 5.4.
> 
> If you want I can pick this up in compiler-attributes and submit it as
> a whole if we get Acks from rtl8723bs/x86/...maintainers.

Ingo has now acked the x86 parts, and Greg has already picked up the
rtl8723bs patch, which is at least an implicit ack. I'm just unsure of
how and if it will work if you also pick up that one - but, if you
don't, your tree would be (somewhat) dependent on Greg's staging-next :(

Rasmus

[Patch V9 6/8] arm64: tegra: Enable xudc on Jetson TX1

Enable XUSB device mode driver for USB0 slot on Jetson TX1.

Signed-off-by: Nagarjuna Kristam 
Reviewed-by: JC Kuo 
---
 arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi | 31 +-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi 
b/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi
index a7dc319..c1e106e 100644
--- a/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi
@@ -1362,7 +1362,7 @@
status = "okay";
 
lanes {
-   usb2-0 {
+   micro_b: usb2-0 {
nvidia,function = "xusb";
status = "okay";
};
@@ -1483,6 +1483,21 @@
vmmc-supply = <&vdd_3v3_sd>;
};
 
+   usb@700d {
+   status = "okay";
+   phys = <µ_b>;
+   phy-names = "usb2";
+   avddio-usb-supply = <&vdd_3v3_sys>;
+   hvdd-usb-supply = <&vdd_1v8>;
+   usb-role-switch;
+
+   port {
+   usb_role_switch: endpoint {
+   remote-endpoint = <&usb_b_conn_ep>;
+   };
+   };
+   };
+
regulators {
compatible = "simple-bus";
#address-cells = <1>;
@@ -1641,4 +1656,18 @@
linux,code = ;
};
};
+
+   usb_type_b: connector {
+   compatible = "linux,usb-b-connector", "gpio-usb-b-connector";
+   label = "micro-USB";
+   type = "micro";
+   vbus-gpio = <&gpio TEGRA_GPIO(Z, 0) GPIO_ACTIVE_LOW>;
+
+   port {
+   usb_b_conn_ep: endpoint {
+   remote-endpoint = <&usb_role_switch>;
+   };
+   };
+   };
+
 };
-- 
2.7.4

[Patch V9 4/8] dt-bindings: usb: Add NVIDIA Tegra XUSB device mode controller binding

Add device-tree binding documentation for the XUSB device mode controller
present on Tegra210 SoC. This controller supports the USB 3.0
specification.

Signed-off-by: Nagarjuna Kristam 
Reviewed-by: JC Kuo 
Reviewed-by: Rob Herring 
Acked-by: Thierry Reding 
---
 .../devicetree/bindings/usb/nvidia,tegra-xudc.txt  | 110 +
 1 file changed, 110 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/usb/nvidia,tegra-xudc.txt

diff --git a/Documentation/devicetree/bindings/usb/nvidia,tegra-xudc.txt 
b/Documentation/devicetree/bindings/usb/nvidia,tegra-xudc.txt
new file mode 100644
index 000..ce15e26
--- /dev/null
+++ b/Documentation/devicetree/bindings/usb/nvidia,tegra-xudc.txt
@@ -0,0 +1,110 @@
+Device tree binding for NVIDIA Tegra XUSB device mode controller (XUDC)
+===
+
+The Tegra XUDC controller supports both USB 2.0 HighSpeed/FullSpeed and
+USB 3.0 SuperSpeed protocols.
+
+Required properties:
+
+- compatible: For Tegra210, must contain "nvidia,tegra210-xudc".
+- reg: Must contain the base and length of all registers used.
+- interrupts: Must contain the XUSB device interrupt.
+- clocks: Must contain an entry for all clocks used.
+  See ../clock/clock-bindings.txt for details.
+- clock-names: Must include the following entries:
+   - dev: Clock to enable core XUSB dev clock.
+   - ss: Clock to enable XUSB super speed clock.
+   - ss_src: Clock to enable XUSB super speed dev clock.
+   - hs_src: Clock to enable XUSB high speed dev clock.
+   - fs_src: Clock to enable XUSB full speed dev clock.
+- power-domains: A list of PM domain specifiers that reference each 
power-domain
+  used by the XUSB device mode controller. This list must comprise of a 
specifier
+  for the XUSBA and XUSBB power-domains. See ../power/power_domain.txt and
+  ../arm/tegra/nvidia,tegra20-pmc.txt for details.
+- power-domain-names: A list of names that represent each of the specifiers in
+  the 'power-domains' property. Must include 'ss' and 'dev'.
+- nvidia,xusb-padctl: phandle to the XUSB pad controller that is used to
+  configure the USB pads used by the XUDC controller.
+- phys: Must contain an entry for each entry in phy-names.
+  See ../phy/phy-bindings.txt for details.
+- phy-names: Should include an entry for each PHY used by the controller.
+  Names must be "usb2", and "usb3" if support SuperSpeed device mode.
+  - "usb3" phy, SuperSpeed (SSTX+/SSTX-/SSRX+/SSRX-) data lines.
+  - "usb2" phy, USB 2.0 (D+/D-) data lines.
+
+For Tegra210:
+- reg-names: Must include the following entries:
+   - base: XUSB device controller registers.
+   - fpci: XUSB device PCI Config registers.
+   - ipfs: XUSB device registers.
+- avddio-usb-supply: PCIe/USB3 analog logic power supply. Must supply 1.05 V.
+- hvdd-usb-supply: USB controller power supply. Must supply 3.3 V.
+
+
+Optional properties:
+
+- usb-role-switch: boolean property to indicate use of USB Role Switch driver.
+
+Sub-nodes:
+--
+- The port would be added as subnode if use "usb-role-switch" property.
+  see graph.txt.
+
+Example:
+
+   pmc: pmc@7000e400 {
+   compatible = "nvidia,tegra210-pmc";
+   reg = <0x0 0x7000e400 0x0 0x400>;
+   clocks = <&tegra_car TEGRA210_CLK_PCLK>, <&clk32k_in>;
+   clock-names = "pclk", "clk32k_in";
+
+   powergates {
+   pd_xusbss: xusba {
+   clocks = <&tegra_car TEGRA210_CLK_XUSB_SS>;
+   resets = <&tegra_car 156>;
+   #power-domain-cells = <0>;
+   };
+
+   pd_xusbdev: xusbb {
+   clocks = <&tegra_car TEGRA210_CLK_XUSB_DEV>;
+   resets = <&tegra_car 95>;
+   #power-domain-cells = <0>;
+   };
+   };
+   };
+
+   usb@700d {
+   compatible = "nvidia,tegra210-xudc";
+   reg = <0x0 0x700d 0x0 0x8000>,
+ <0x0 0x700d8000 0x0 0x1000>,
+ <0x0 0x700d9000 0x0 0x1000>;
+   reg-names = "base", "fpci", "ipfs";
+
+   interrupts = ;
+
+   clocks = <&tegra_car TEGRA210_CLK_XUSB_DEV>,
+<&tegra_car TEGRA210_CLK_XUSB_SS>,
+<&tegra_car TEGRA210_CLK_XUSB_SSP_SRC>,
+<&tegra_car TEGRA210_CLK_XUSB_HS_SRC>,
+<&tegra_car TEGRA210_CLK_XUSB_FS_SRC>;
+   clock-names = "dev", "ss", "ss_src", "hs_src", "fs_src";
+
+   power-domains = <&pd_xusbdev>, <&pd_xusbss>;
+   power-domain-names = "dev", "ss";
+
+   nvidia,xusb-padctl = <&padctl>;
+
+   phys = <µ_b>;
+   phy-names = "usb2";
+
+   avddio-usb-supply = <&

[Patch V9 2/8] phy: tegra: xusb: Add usb3 port fake support on Tegra210

On Tegra210, usb2 only otg/peripheral ports dont work in device mode.
They need an assosciated usb3 port to work in device mode. Identify
an unused usb3 port and assign it as a fake USB3 port to USB2 only
port whose mode is otg/peripheral.

Based on work by BH Hsieh .

Signed-off-by: Nagarjuna Kristam 
Acked-by: Thierry Reding 
---
 drivers/phy/tegra/xusb-tegra210.c | 56 +
 drivers/phy/tegra/xusb.c  | 65 +++
 drivers/phy/tegra/xusb.h  |  2 ++
 3 files changed, 123 insertions(+)

diff --git a/drivers/phy/tegra/xusb-tegra210.c 
b/drivers/phy/tegra/xusb-tegra210.c
index 0351c4a..8c31f03 100644
--- a/drivers/phy/tegra/xusb-tegra210.c
+++ b/drivers/phy/tegra/xusb-tegra210.c
@@ -50,6 +50,7 @@
 #define XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP_SHIFT(x) ((x) * 5)
 #define XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP_MASK(x) (0x7 << ((x) * 5))
 #define XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP(x, v) (((v) & 0x7) << ((x) * 5))
+#define XUSB_PADCTL_SS_PORT_MAP_PORT_DISABLED 0x7
 
 #define XUSB_PADCTL_ELPG_PROGRAM1 0x024
 #define XUSB_PADCTL_ELPG_PROGRAM1_AUX_MUX_LP0_VCORE_DOWN (1 << 31)
@@ -944,6 +945,34 @@ static int tegra210_usb2_phy_power_on(struct phy *phy)
 
priv = to_tegra210_xusb_padctl(padctl);
 
+   if (port->usb3_port_fake != -1) {
+   value = padctl_readl(padctl, XUSB_PADCTL_SS_PORT_MAP);
+   value &= ~XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP_MASK(
+   port->usb3_port_fake);
+   value |= XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP(
+   port->usb3_port_fake, index);
+   padctl_writel(padctl, value, XUSB_PADCTL_SS_PORT_MAP);
+
+   value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM1);
+   value &= ~XUSB_PADCTL_ELPG_PROGRAM1_SSPX_ELPG_VCORE_DOWN(
+   port->usb3_port_fake);
+   padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM1);
+
+   usleep_range(100, 200);
+
+   value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM1);
+   value &= ~XUSB_PADCTL_ELPG_PROGRAM1_SSPX_ELPG_CLAMP_EN_EARLY(
+   port->usb3_port_fake);
+   padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM1);
+
+   usleep_range(100, 200);
+
+   value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM1);
+   value &= ~XUSB_PADCTL_ELPG_PROGRAM1_SSPX_ELPG_CLAMP_EN(
+   port->usb3_port_fake);
+   padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM1);
+   }
+
value = padctl_readl(padctl, XUSB_PADCTL_USB2_BIAS_PAD_CTL0);
value &= ~((XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_SQUELCH_LEVEL_MASK <<
XUSB_PADCTL_USB2_BIAS_PAD_CTL0_HS_SQUELCH_LEVEL_SHIFT) |
@@ -1078,6 +1107,32 @@ static int tegra210_usb2_phy_power_off(struct phy *phy)
 
mutex_lock(&padctl->lock);
 
+   if (port->usb3_port_fake != -1) {
+   value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM1);
+   value |= XUSB_PADCTL_ELPG_PROGRAM1_SSPX_ELPG_CLAMP_EN_EARLY(
+   port->usb3_port_fake);
+   padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM1);
+
+   usleep_range(100, 200);
+
+   value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM1);
+   value |= XUSB_PADCTL_ELPG_PROGRAM1_SSPX_ELPG_CLAMP_EN(
+   port->usb3_port_fake);
+   padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM1);
+
+   usleep_range(250, 350);
+
+   value = padctl_readl(padctl, XUSB_PADCTL_ELPG_PROGRAM1);
+   value |= XUSB_PADCTL_ELPG_PROGRAM1_SSPX_ELPG_VCORE_DOWN(
+   port->usb3_port_fake);
+   padctl_writel(padctl, value, XUSB_PADCTL_ELPG_PROGRAM1);
+
+   value = padctl_readl(padctl, XUSB_PADCTL_SS_PORT_MAP);
+   value |= XUSB_PADCTL_SS_PORT_MAP_PORTX_MAP(port->usb3_port_fake,
+   XUSB_PADCTL_SS_PORT_MAP_PORT_DISABLED);
+   padctl_writel(padctl, value, XUSB_PADCTL_SS_PORT_MAP);
+   }
+
if (WARN_ON(pad->enable == 0))
goto out;
 
@@ -2052,6 +2107,7 @@ const struct tegra_xusb_padctl_soc 
tegra210_xusb_padctl_soc = {
.ops = &tegra210_xusb_padctl_ops,
.supply_names = tegra210_xusb_padctl_supply_names,
.num_supplies = ARRAY_SIZE(tegra210_xusb_padctl_supply_names),
+   .need_fake_usb3_port = true,
 };
 EXPORT_SYMBOL_GPL(tegra210_xusb_padctl_soc);
 
diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
index 2ea8497..b4b217e 100644
--- a/drivers/phy/tegra/xusb.c
+++ b/drivers/phy/tegra/xusb.c
@@ -800,9 +800,62 @@ static void __tegra_xusb_remove_ports(struct 
tegra_xusb_padctl *padctl)
}
 }
 
+static

[Patch V9 3/8] phy: tegra: xusb: Add vbus override support on Tegra210

Tegra XUSB device control driver needs to control vbus override
during its operations, add API for the support.

Signed-off-by: Nagarjuna Kristam 
Acked-by: Thierry Reding 
---
 drivers/phy/tegra/xusb-tegra210.c | 57 +++
 drivers/phy/tegra/xusb.c  | 22 +++
 drivers/phy/tegra/xusb.h  |  2 ++
 include/linux/phy/tegra/xusb.h|  4 ++-
 4 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/drivers/phy/tegra/xusb-tegra210.c 
b/drivers/phy/tegra/xusb-tegra210.c
index 8c31f03..9e6f14b 100644
--- a/drivers/phy/tegra/xusb-tegra210.c
+++ b/drivers/phy/tegra/xusb-tegra210.c
@@ -65,6 +65,10 @@
 #define XUSB_PADCTL_USB3_PAD_MUX_PCIE_IDDQ_DISABLE(x) (1 << (1 + (x)))
 #define XUSB_PADCTL_USB3_PAD_MUX_SATA_IDDQ_DISABLE(x) (1 << (8 + (x)))
 
+#define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPADX_CTL0(x) (0x080 + (x) * 0x40)
+#define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL0_ZIP (1 << 18)
+#define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL0_ZIN (1 << 22)
+
 #define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPADX_CTL1(x) (0x084 + (x) * 0x40)
 #define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_LEV_SHIFT 7
 #define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_LEV_MASK 0x3
@@ -227,6 +231,12 @@
 #define XUSB_PADCTL_UPHY_USB3_PADX_ECTL6(x) (0xa74 + (x) * 0x40)
 #define XUSB_PADCTL_UPHY_USB3_PAD_ECTL6_RX_EQ_CTRL_H_VAL 0xfcf01368
 
+#define XUSB_PADCTL_USB2_VBUS_ID 0xc60
+#define XUSB_PADCTL_USB2_VBUS_ID_OVERRIDE_VBUS_ON (1 << 14)
+#define XUSB_PADCTL_USB2_VBUS_ID_OVERRIDE_SHIFT 18
+#define XUSB_PADCTL_USB2_VBUS_ID_OVERRIDE_MASK 0xf
+#define XUSB_PADCTL_USB2_VBUS_ID_OVERRIDE_FLOATING 8
+
 struct tegra210_xusb_fuse_calibration {
u32 hs_curr_level[4];
u32 hs_term_range_adj;
@@ -2016,6 +2026,51 @@ static const struct tegra_xusb_port_ops 
tegra210_usb3_port_ops = {
.map = tegra210_usb3_port_map,
 };
 
+static int tegra210_xusb_padctl_vbus_override(struct tegra_xusb_padctl *padctl,
+ bool status)
+{
+   u32 value;
+
+   dev_dbg(padctl->dev, "%s vbus override\n", status ? "set" : "clear");
+
+   value = padctl_readl(padctl, XUSB_PADCTL_USB2_VBUS_ID);
+
+   if (status) {
+   value |= XUSB_PADCTL_USB2_VBUS_ID_OVERRIDE_VBUS_ON;
+   value &= ~(XUSB_PADCTL_USB2_VBUS_ID_OVERRIDE_MASK <<
+  XUSB_PADCTL_USB2_VBUS_ID_OVERRIDE_SHIFT);
+   value |= XUSB_PADCTL_USB2_VBUS_ID_OVERRIDE_FLOATING <<
+XUSB_PADCTL_USB2_VBUS_ID_OVERRIDE_SHIFT;
+   } else
+   value &= ~XUSB_PADCTL_USB2_VBUS_ID_OVERRIDE_VBUS_ON;
+
+   padctl_writel(padctl, value, XUSB_PADCTL_USB2_VBUS_ID);
+
+   return 0;
+}
+
+static int tegra210_utmi_port_reset(struct phy *phy)
+{
+   struct tegra_xusb_padctl *padctl;
+   struct tegra_xusb_lane *lane;
+   u32 value;
+
+   lane = phy_get_drvdata(phy);
+   padctl = lane->pad->padctl;
+
+   value = padctl_readl(padctl,
+XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPADX_CTL0(lane->index));
+
+   if ((value & XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL0_ZIP) ||
+   (value & XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL0_ZIN)) {
+   tegra210_xusb_padctl_vbus_override(padctl, false);
+   tegra210_xusb_padctl_vbus_override(padctl, true);
+   return 1;
+   }
+
+   return 0;
+}
+
 static int
 tegra210_xusb_read_fuse_calibration(struct tegra210_xusb_fuse_calibration 
*fuse)
 {
@@ -2078,6 +2133,8 @@ static const struct tegra_xusb_padctl_ops 
tegra210_xusb_padctl_ops = {
.remove = tegra210_xusb_padctl_remove,
.usb3_set_lfps_detect = tegra210_usb3_set_lfps_detect,
.hsic_set_idle = tegra210_hsic_set_idle,
+   .vbus_override = tegra210_xusb_padctl_vbus_override,
+   .utmi_port_reset = tegra210_utmi_port_reset,
 };
 
 static const char * const tegra210_xusb_padctl_supply_names[] = {
diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
index b4b217e..bf4b008 100644
--- a/drivers/phy/tegra/xusb.c
+++ b/drivers/phy/tegra/xusb.c
@@ -1121,6 +1121,28 @@ int tegra_xusb_padctl_usb3_set_lfps_detect(struct 
tegra_xusb_padctl *padctl,
 }
 EXPORT_SYMBOL_GPL(tegra_xusb_padctl_usb3_set_lfps_detect);
 
+int tegra_xusb_padctl_set_vbus_override(struct tegra_xusb_padctl *padctl,
+   bool val)
+{
+   if (padctl->soc->ops->vbus_override)
+   return padctl->soc->ops->vbus_override(padctl, val);
+
+   return -ENOTSUPP;
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_set_vbus_override);
+
+int tegra_phy_xusb_utmi_port_reset(struct phy *phy)
+{
+   struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+   struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+
+   if (padctl->soc->ops->utmi_port_reset)
+   return padctl->soc->ops->utmi_port_reset(phy);
+
+   return -ENOTSUPP;
+}
+EXPORT_SYMBOL_GPL(tegra_phy_xusb_utmi_port

[Patch V9 8/8] arm64: defconfig: Enable tegra XUDC support

Enable Nvidia XUSB device mode controller driver and USB GPIO Based
Connection Detection Driver as module.

Signed-off-by: Nagarjuna Kristam 
---
 arch/arm64/configs/defconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 358b163..de2826c 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -575,6 +575,7 @@ CONFIG_SND_SIMPLE_CARD=m
 CONFIG_SND_AUDIO_GRAPH_CARD=m
 CONFIG_I2C_HID=m
 CONFIG_USB=y
+CONFIG_USB_CONN_GPIO=m
 CONFIG_USB_OTG=y
 CONFIG_USB_XHCI_HCD=y
 CONFIG_USB_XHCI_TEGRA=y
@@ -600,6 +601,7 @@ CONFIG_USB_ULPI=y
 CONFIG_USB_GADGET=y
 CONFIG_USB_RENESAS_USBHS_UDC=m
 CONFIG_USB_RENESAS_USB3=m
+CONFIG_USB_TEGRA_XUDC=m
 CONFIG_TYPEC=m
 CONFIG_TYPEC_HD3SS3220=m
 CONFIG_MMC=y
-- 
2.7.4

[Patch V9 1/8] phy: tegra: xusb: Add XUSB dual mode support on Tegra210

Configure the port capabilities based on usb_dr_mode settings.

Based on work by JC Kuo .

Signed-off-by: Nagarjuna Kristam 
Reviewed-by: JC Kuo 
Acked-by: Thierry Reding 
---
 drivers/phy/tegra/xusb-tegra210.c | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/phy/tegra/xusb-tegra210.c 
b/drivers/phy/tegra/xusb-tegra210.c
index 0c0df68..0351c4a 100644
--- a/drivers/phy/tegra/xusb-tegra210.c
+++ b/drivers/phy/tegra/xusb-tegra210.c
@@ -39,7 +39,10 @@
 #define XUSB_PADCTL_USB2_PAD_MUX_USB2_BIAS_PAD_XUSB 0x1
 
 #define XUSB_PADCTL_USB2_PORT_CAP 0x008
+#define XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_DISABLED(x) (0x0 << ((x) * 4))
 #define XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_HOST(x) (0x1 << ((x) * 4))
+#define XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_DEVICE(x) (0x2 << ((x) * 4))
+#define XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_OTG(x) (0x3 << ((x) * 4))
 #define XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_MASK(x) (0x3 << ((x) * 4))
 
 #define XUSB_PADCTL_SS_PORT_MAP 0x014
@@ -64,6 +67,7 @@
 #define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPADX_CTL1(x) (0x084 + (x) * 0x40)
 #define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_LEV_SHIFT 7
 #define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_LEV_MASK 0x3
+#define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_LEV_VAL 0x1
 #define XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_FIX18 (1 << 6)
 
 #define XUSB_PADCTL_USB2_OTG_PADX_CTL0(x) (0x088 + (x) * 0x40)
@@ -957,7 +961,14 @@ static int tegra210_usb2_phy_power_on(struct phy *phy)
 
value = padctl_readl(padctl, XUSB_PADCTL_USB2_PORT_CAP);
value &= ~XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_MASK(index);
-   value |= XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_HOST(index);
+   if (port->mode == USB_DR_MODE_UNKNOWN)
+   value |= XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_DISABLED(index);
+   else if (port->mode == USB_DR_MODE_PERIPHERAL)
+   value |= XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_DEVICE(index);
+   else if (port->mode == USB_DR_MODE_HOST)
+   value |= XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_HOST(index);
+   else if (port->mode == USB_DR_MODE_OTG)
+   value |= XUSB_PADCTL_USB2_PORT_CAP_PORTX_CAP_OTG(index);
padctl_writel(padctl, value, XUSB_PADCTL_USB2_PORT_CAP);
 
value = padctl_readl(padctl, XUSB_PADCTL_USB2_OTG_PADX_CTL0(index));
@@ -989,7 +1000,12 @@ static int tegra210_usb2_phy_power_on(struct phy *phy)
 XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPADX_CTL1(index));
value &= ~(XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_LEV_MASK <<
   XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_LEV_SHIFT);
-   value |= XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_FIX18;
+   if (port->mode == USB_DR_MODE_HOST)
+   value |= XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_FIX18;
+   else
+   value |=
+ XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_LEV_VAL <<
+ XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPAD_CTL1_VREG_LEV_SHIFT;
padctl_writel(padctl, value,
  XUSB_PADCTL_USB2_BATTERY_CHRG_OTGPADX_CTL1(index));
 
-- 
2.7.4

[Patch V9 5/8] arm64: tegra: Add xudc node for Tegra210

Tegra210 has one XUSB device mode controller, which can be operated
HS and SS modes. Add DT support for XUSB device mode controller.

Signed-off-by: Nagarjuna Kristam 
Reviewed-by: JC Kuo 
---
 arch/arm64/boot/dts/nvidia/tegra210.dtsi | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/arch/arm64/boot/dts/nvidia/tegra210.dtsi 
b/arch/arm64/boot/dts/nvidia/tegra210.dtsi
index 6597531..5de0b36 100644
--- a/arch/arm64/boot/dts/nvidia/tegra210.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra210.dtsi
@@ -1203,6 +1203,25 @@
status = "disabled";
};
 
+   usb@700d {
+   compatible = "nvidia,tegra210-xudc";
+   reg = <0x0 0x700d 0x0 0x8000>,
+ <0x0 0x700d8000 0x0 0x1000>,
+ <0x0 0x700d9000 0x0 0x1000>;
+   reg-names = "base", "fpci", "ipfs";
+   interrupts = ;
+   clocks = <&tegra_car TEGRA210_CLK_XUSB_DEV>,
+<&tegra_car TEGRA210_CLK_XUSB_SS>,
+<&tegra_car TEGRA210_CLK_XUSB_SSP_SRC>,
+<&tegra_car TEGRA210_CLK_XUSB_HS_SRC>,
+<&tegra_car TEGRA210_CLK_XUSB_FS_SRC>;
+   clock-names = "dev", "ss", "ss_src", "hs_src", "fs_src";
+   power-domains = <&pd_xusbdev>, <&pd_xusbss>;
+   power-domain-names = "dev", "ss";
+   nvidia,xusb-padctl = <&padctl>;
+   status = "disabled";
+   };
+
mipi: mipi@700e3000 {
compatible = "nvidia,tegra210-mipi";
reg = <0x0 0x700e3000 0x0 0x100>;
-- 
2.7.4

[Patch V9 0/8] Tegra XUSB gadget driver support

Patches 1-3 are phy driver changes to add support for device
mode.
Patches 4-7 are changes related to XUSB device mode
controller driver.
Patch 8 is to enable drivers for XUDC support in defconfig

Test Steps(USB 2.0):
- Enable "USB Gadget precomposed configurations" in defconfig
- Build, flash and boot Jetson TX1
- Connect Jetson TX1 and Ubuntu device using USB A to Micro B
  cable
- After boot on Jetson TX1 terminal usb0 network device should be
  enumerated
- Assign static ip to usb0 on Jetson TX1 and corresponding net
  device on ubuntu
- Run ping test and transfer test(used scp) to check data transfer
  communication

SS mode is verified by enabling Type A port as peripheral
---
v9:
* Patches 1,2,3,4,5 - No changes.
* Patch 6 has update on compatible string as per suggestion from Chunfeng.
* Patch 7 has comment fixes as suggested by Chunfeng.
* Patch 8 has CONFIG_USB_GPIO enabled as module additionally.
---
v8:
* Patches 1,2,3,4,5,8 - No changes.
* Patch 6 has update on compatible string as per change done in [1].
* Patch 7 has issue fix, where device mode didnot got enabled after resume
  from suspend.
---
v7:
* Patches 1,2,3,4,5,6,8 - No changes.
* Patch 7 - Comments from Balbi and Chunfun adrresed.
  Added COMPILE_TEST in Kconfig and updated dependencies.
---
v6:
* Patches 1,2,3,7,8 - No changes.
* Patch 4,5,6 - Comments from Rob addressed, updated usb connector driver
  compatibility string.
---
v5:
* Patches 1-3 - Commit subject updated as per inputs from Thierry.
* Patch 4 - Added reg-names used on Tegra210 in the bindings doc
* Enabled xudc driver as module instead of part of kernel in patch 8.
* Patched 5-8 - No changes.
---
v4:
* patch 1 - no changes.
* corrected companion device search based on inputs from Thierry in patch 2.
* removed unneeded dev variable and corrected value read in
  tegra210_utmi_port_reset function in patch 3.
* dt binding doc and dtb files are corrected for alignments.
  Replaced extcon-usb-gpio with usb role switch.
* Added support for USB role switch instead of extcon-usb-gpio and other minor
  comments as suggested by Chunfeng.
* Enabled xudc driver as module instead of part of kernel in patch 8.
---
V3:
* Rebased patch 1 to top of tree.
* Fixed bug in patch 2, where xudc interrupts dont get generated if USB host
  mode fails to probe. Moved fake port detection logic to generic xusb.c. fake
  usb port data is updated based on soc flag need_fake_usb3_port.
* Added extra lines whereever necessary to make code more readable in patch 3
  and 7.
* dt binding doc is corrected for typos and extcon references. Also added
  details for clocks and removed xusb_ references to clock and power-domain
  names and accordingly patch 5 is updated.
* removed avdd-pll-utmip-supply in patch 6, as its now part of padctl driver.
* Patch 8 has no changes.
---
V2:
* Patches 1-3 are new patches in this series, which splits unified features
  patch to speprated features and removes need of port-fake entry in DT.
* Patch 4 is re-arragend dt-bindings patch which incorporates previous
  patch comments to sort DT entries alphabetically, addresses name changes
  and PM domain details added.
* Patch 5-6 are re-arranged DT patches with major changes - sort entries
  alphabetically, and adds clock names.
* Patch 7 is UDC driver tegra XUSB device mode controller with major
  changes - remove un-used module params, lockinng for device_mode flag,
  moving un-needed info logs to debug level, making changes feature flag
  dependent rather than SOC based macros and other error handling in probe.
* Patch 8 has no changes.

Nagarjuna Kristam (8):
  phy: tegra: xusb: Add XUSB dual mode support on Tegra210
  phy: tegra: xusb: Add usb3 port fake support on Tegra210
  phy: tegra: xusb: Add vbus override support on Tegra210
  dt-bindings: usb: Add NVIDIA Tegra XUSB device mode controller binding
  arm64: tegra: Add xudc node for Tegra210
  arm64: tegra: Enable xudc on Jetson TX1
  usb: gadget: Add UDC driver for tegra XUSB device mode controller
  arm64: defconfig: Enable tegra XUDC support

 .../devicetree/bindings/usb/nvidia,tegra-xudc.txt  |  110 +
 arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi |   31 +-
 arch/arm64/boot/dts/nvidia/tegra210.dtsi   |   19 +
 arch/arm64/configs/defconfig   |2 +
 drivers/phy/tegra/xusb-tegra210.c  |  133 +-
 drivers/phy/tegra/xusb.c   |   87 +
 drivers/phy/tegra/xusb.h   |4 +
 drivers/usb/gadget/udc/Kconfig |   12 +
 drivers/usb/gadget/udc/Makefile|1 +
 drivers/usb/gadget/udc/tegra-xudc.c| 3787 
 include/linux/phy/tegra/xusb.h |4 +-
 11 files changed, 4186 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/usb/nvidia,tegra-xudc.txt
 create mode 100644 drivers/usb/gadget/udc/tegra-xudc.c

-- 
2.7.4

Re: Regression: commit c9712e333809 breaks xilinx_uartps

2019-09-12 Thread Michal Simek

On 12. 09. 19 18:38, Paul Thomas wrote:
>>> ---
>>>  drivers/tty/serial/xilinx_uartps.c | 2 ++
>>>  1 file changed, 2 insertions(+)
>>>
>>> diff --git a/drivers/tty/serial/xilinx_uartps.c
>>> b/drivers/tty/serial/xilinx_uartps.c
>>> index 9dcc4d855ddd..ece7f6caa994 100644
>>> --- a/drivers/tty/serial/xilinx_uartps.c
>>> +++ b/drivers/tty/serial/xilinx_uartps.c
>>> @@ -1565,6 +1565,8 @@ static int cdns_uart_probe(struct platform_device 
>>> *pdev)
>>>
>>> cdns_uart_data->pclk = devm_clk_get(&pdev->dev, "pclk");
>>> if (PTR_ERR(cdns_uart_data->pclk) == -EPROBE_DEFER) {
>>> +   /* If we end up defering then set uartps_major back to 0 */
>>> +   uartps_major = 0;
>>> rc = PTR_ERR(cdns_uart_data->pclk);
>>> goto err_out_unregister_driver;
>>> }
>>>
>>
>> I expect that this can be problematic for all failures in probe.
>> What about this?
> Makes sense, this worked for me. Although, I think the patch is
> malformed by the email line lengths.

I just c&p. Let me send proper patch with description.

Thanks,
Michal

Re: [PATCH V7 3/3] arm64/mm: Enable memory hot remove

2019-09-12 Thread Anshuman Khandual

On 09/13/2019 01:45 AM, Catalin Marinas wrote:
> Hi Anshuman,
> 
> Thanks for the details on the need for removing the page tables and
> vmemmap backing. Some comments on the code below.
> 
> On Tue, Sep 03, 2019 at 03:15:58PM +0530, Anshuman Khandual wrote:
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -60,6 +60,14 @@ static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss 
>> __maybe_unused;
>>  
>>  static DEFINE_SPINLOCK(swapper_pgdir_lock);
>>  
>> +/*
>> + * This represents if vmalloc and vmemmap address range overlap with
>> + * each other on an intermediate level kernel page table entry which
>> + * in turn helps in deciding whether empty kernel page table pages
>> + * if any can be freed during memory hotplug operation.
>> + */
>> +static bool vmalloc_vmemmap_overlap;
> 
> I'd say just move the static find_vmalloc_vmemmap_overlap() function
> here, the compiler should be sufficiently smart enough to figure out
> that it's just a build-time constant.

Sure, will do.

> 
>> @@ -770,6 +1022,28 @@ int __meminit vmemmap_populate(unsigned long start, 
>> unsigned long end, int node,
>>  void vmemmap_free(unsigned long start, unsigned long end,
>>  struct vmem_altmap *altmap)
>>  {
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +/*
>> + * FIXME: We should have called remove_pagetable(start, end, true).
>> + * vmemmap and vmalloc virtual range might share intermediate kernel
>> + * page table entries. Removing vmemmap range page table pages here
>> + * can potentially conflict with a concurrent vmalloc() allocation.
>> + *
>> + * This is primarily because vmalloc() does not take init_mm ptl for
>> + * the entire page table walk and it's modification. Instead it just
>> + * takes the lock while allocating and installing page table pages
>> + * via [p4d|pud|pmd|pte]_alloc(). A concurrently vanishing page table
>> + * entry via memory hot remove can cause vmalloc() kernel page table
>> + * walk pointers to be invalid on the fly which can cause corruption
>> + * or worst, a crash.
>> + *
>> + * So free_empty_tables() gets called where vmalloc and vmemmap range
>> + * do not overlap at any intermediate level kernel page table entry.
>> + */
>> +unmap_hotplug_range(start, end, true);
>> +if (!vmalloc_vmemmap_overlap)
>> +free_empty_tables(start, end);
>> +#endif
>>  }
> 
> So, I see the risk with overlapping and I guess for some kernel
> configurations (PAGE_SIZE == 64K) we may not be able to avoid it. If we

Did not see 64K config options to have overlap, do you suspect they might ?
After the 52 bit KVA series has been merged, following configurations have
the vmalloc-vmemmap range overlap problem.

- 4K  page size with 48 bit VA space
- 16K page size with 48 bit VA space

> can, that's great, otherwise could we rewrite the above functions to
> handle floor and ceiling similar to free_pgd_range()? (I wonder how this
> function works if you called it on init_mm and kernel address range). By

Hmm, never tried that. Are you wondering if this can be used directly ?
There are two distinct elements which make it very specific to user page
tables, mmu_gather based TLB tracking and mm->pgtable_bytes accounting
with mm_dec_nr_pxx().

> having the vmemmap start/end information it avoids freeing partially
> filled page table pages.

Did you mean page table pages which can partially overlap with vmalloc ?

The problem (race) is not because of the inability to deal with partially
filled table. We can handle that correctly as explained below [1]. The
problem is with inadequate kernel page table locking during vmalloc()
which might be accessing intermediate kernel page table pointers which is
being freed with free_empty_tables() concurrently. Hence we cannot free
any page table page which can ever have entries from vmalloc() range.

Though not completely sure, whether I really understood the suggestion above
with respect to the floor-ceiling mechanism as in free_pgd_range(). Are you
suggesting that we should only attempt to free up those vmemmap range page
table pages which *definitely* could never overlap with vmalloc by working
on a modified (i.e cut down with floor-ceiling while avoiding vmalloc range
at each level) vmemmap range instead ? This can be one restrictive version of
the function free_empty_tables() called in case there is an overlap. So we
will maintain two versions for free_empty_tables(). Please correct me if any
the above assumptions or understanding is wrong.

But yes, with this we should be able to free up some possible empty page
table pages which were being left out in the current proposal when overlap
happens.

[1] Skipping partially filled page tables

All free_pXX_table() functions take care in avoiding freeing partially filled
page table pages whether they represent or ever represented linear or vmemmap
or vmalloc mapping in init_mm. They go over each individual entry in a given
page table making

RE: [EXT] Re: [PATCH 2/3] ASoC: fsl_asrc: update supported sample format

2019-09-12 Thread S.j. Wang

Hi

> 
> On Tue, Sep 10, 2019 at 02:07:25AM +, S.j. Wang wrote:
> > > On Mon, Sep 09, 2019 at 06:33:20PM -0400, Shengjiu Wang wrote:
> > > > The ASRC support 24bit/16bit/8bit input width, so S20_3LE format
> > > > should not be supported, it is word width is 20bit.
> > >
> > > I thought 3LE used 24-bit physical width. And the driver assigns
> > > ASRC_WIDTH_24_BIT to "width" for all non-16bit cases, so 20-bit
> > > would go for that 24-bit slot also. I don't clearly recall if I had
> > > explicitly tested S20_3LE, but I feel it should work since I put there...
> >
> > For S20_3LE, the width is 20bit,  but the ASRC only support 24bit, if
> > set the ASRMCR1n.IWD= 24bit, because the actual width is 20 bit, the
> > volume is Lower than expected,  it likes 24bit data right shift 4 bit.
> > So it is not supported.
> 
> Hmm..S20_3LE right-aligns 20 bits in a 24-bit slot? I thought they're left
> aligned...
> 
> If this is the case...shouldn't we have the same lower-volume problem for
> all hardwares that support S20_3LE now?

Actually some hardware/module when they do transmission from FIFO
to shift register, they can select the start bit, for example from the 20th
bit. but not all module have this capability.

For ASRC, it haven't.  IWD can only cover the data width,  there is no
Other bit for slot width.

Best regards
Wang shengjiu

Re: [PATCH v3 5/6] x86: alternative.h: use asm_inline for all alternative variants

2019-09-12 Thread Ingo Molnar



* Rasmus Villemoes  wrote:

> Most, if not all, uses of the alternative* family just provide one or
> two instructions in .text, but the string literal can be quite large,
> causing gcc to overestimate the size of the generated code. That in
> turn affects its decisions about inlining of the function containing
> the alternative() asm statement.
> 
> New enough versions of gcc allow one to overrule the estimated size by
> using "asm inline" instead of just "asm". So replace asm by the helper
> asm_inline, which for older gccs just expands to asm.
> 
> Signed-off-by: Rasmus Villemoes 

Acked-by: Ingo Molnar 

Thanks,

Ingo

Re: [PATCH v3 6/6] x86: bug.h: use asm_inline in _BUG_FLAGS definitions

2019-09-12 Thread Ingo Molnar



* Rasmus Villemoes  wrote:

> This helps preventing a BUG* or WARN* in some static inline from
> preventing that (or one of its callers) being inlined, so should allow
> gcc to make better informed inlining decisions.
> 
> For example, with gcc 9.2, tcp_fastopen_no_cookie() vanishes from
> net/ipv4/tcp_fastopen.o. It does not itself have any BUG or WARN, but
> it calls dst_metric() which has a WARN_ON_ONCE - and despite that
> WARN_ON_ONCE vanishing since the condition is compile-time false,
> dst_metric() is apparently sufficiently "large" that when it gets
> inlined into tcp_fastopen_no_cookie(), the latter becomes too large
> for inlining.
> 
> Overall, if one asks size(1), .text decreases a little and .data
> increases by about the same amount (x86-64 defconfig)
> 
> $ size vmlinux.{before,after}
>textdata bss dec hex filename
> 197097265202600 1630280 26542606195020e vmlinux.before
> 197093305203068 1630280 265426781950256 vmlinux.after
> 
> while bloat-o-meter says
> 
> add/remove: 10/28 grow/shrink: 103/51 up/down: 3669/-2854 (815)
> ...
> Total: Before=14783683, After=14784498, chg +0.01%
> 
> Signed-off-by: Rasmus Villemoes 

Acked-by: Ingo Molnar 

Thanks,

Ingo

[PATCH] tty: 8250_of: Use software emulated RS485 direction control

2019-09-12 Thread Heiko Schocher

Use software emulated RS485 direction control to provide RS485 API

Currently it is not possible to use rs485 as pointer to
rs485_config struct in struct uart_port is NULL in case we
configure the port through device tree.

Signed-off-by: Heiko Schocher 

---
Patch is based on:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
commit:
505a8ec7e11a - Revert "drm/i915/userptr: Acquire the page lock around 
set_page_dirty()"

checkpatch output:
$ ./scripts/checkpatch.pl 
0001-tty-8250_of-Use-software-emulated-RS485-direction-co.patch
total: 0 errors, 0 warnings, 43 lines checked

0001-tty-8250_of-Use-software-emulated-RS485-direction-co.patch has no obvious 
style problems and is ready for submission.

 drivers/tty/serial/8250/8250_of.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/drivers/tty/serial/8250/8250_of.c 
b/drivers/tty/serial/8250/8250_of.c
index 0826cfdbd4063..92fbf46ce3bd9 100644
--- a/drivers/tty/serial/8250/8250_of.c
+++ b/drivers/tty/serial/8250/8250_of.c
@@ -48,6 +48,36 @@ static inline void tegra_serial_handle_break(struct 
uart_port *port)
 }
 #endif
 
+static int of_8250_rs485_config(struct uart_port *port,
+ struct serial_rs485 *rs485)
+{
+   struct uart_8250_port *up = up_to_u8250p(port);
+
+   /* Clamp the delays to [0, 100ms] */
+   rs485->delay_rts_before_send = min(rs485->delay_rts_before_send, 100U);
+   rs485->delay_rts_after_send  = min(rs485->delay_rts_after_send, 100U);
+
+   port->rs485 = *rs485;
+
+   /*
+* Both serial8250_em485_init and serial8250_em485_destroy
+* are idempotent
+*/
+   if (rs485->flags & SER_RS485_ENABLED) {
+   int ret = serial8250_em485_init(up);
+
+   if (ret) {
+   rs485->flags &= ~SER_RS485_ENABLED;
+   port->rs485.flags &= ~SER_RS485_ENABLED;
+   }
+   return ret;
+   }
+
+   serial8250_em485_destroy(up);
+
+   return 0;
+}
+
 /*
  * Fill a struct uart_port for a given device node
  */
@@ -178,6 +208,7 @@ static int of_platform_serial_setup(struct platform_device 
*ofdev,
port->flags |= UPF_SKIP_TEST;
 
port->dev = &ofdev->dev;
+   port->rs485_config = of_8250_rs485_config;
 
switch (type) {
case PORT_TEGRA:
-- 
2.21.0

Re: [PATCH] leds: remove PAGE_SIZE limit of /sys/class/leds//trigger

2019-09-12 Thread Greg Kroah-Hartman

On Fri, Sep 13, 2019 at 09:34:49AM +0900, Akinobu Mita wrote:
> 2019年9月13日(金) 2:15 Jacek Anaszewski :
> >
> > Hi Akinobu,
> >
> > Please bump patch version each time you send an update
> > of the patch with the same subject.
> 
> Oops, should I resend with the correct subject?

Yes please.

Re: [Ksummit-discuss] [PATCH v2 3/3] libnvdimm, MAINTAINERS: Maintainer Entry Profile

2019-09-12 Thread Greg KH

On Fri, Sep 13, 2019 at 07:41:55AM +0530, Aneesh Kumar K.V wrote:
> On 9/12/19 12:13 AM, Dan Carpenter wrote:
> > On Wed, Sep 11, 2019 at 08:48:59AM -0700, Dan Williams wrote:
> > > +Coding Style Addendum
> > > +-
> > > +libnvdimm expects multi-line statements to be double indented. I.e.
> > > +
> > > +if (x...
> > > +&& ...y) {
> > 
> > That looks horrible and it causes a checkpatch warning.  :(  Why not
> > do it the same way that everyone else does it.
> > 
> > if (blah_blah_x && <-- && has to be on the first line for checkpatch
> > blah_blah_y) { <-- [tab][space][space][space][space]blah
> > 
> > Now all the conditions are aligned visually which makes it readable.
> > They aren't aligned with the indent block so it's easy to tell the
> > inside from the if condition.
> 
> 
> I came across this while sending patches to libnvdimm subsystem. W.r.t
> coding Style can we have consistent styles across the kernel? Otherwise, one
> would have to change the editor settings as they work across different
> subsystems in the kernel. In this specific case both clang-format and emacs
> customization tip in the kernel documentation directory suggest the later
> style.

We _should_ have a consistent coding style across the whole kernel,
that's the whole reason for having a coding style in the first place!

The problem is, we all have agreed on the "basics" a long time ago, but
are now down in the tiny nits as to what some minor things should, or
should not, look like.

It might be time to just bite the bullet and do something like
"clang-format" to stop arguing about stuff like this for new
submissions, if for no other reason to keep us from wasting mental
energy on trivial things like this.

thanks,

greg k-h

Re: KASAN: slab-out-of-bounds Read in handle_vmptrld

2019-09-12 Thread Greg Kroah-Hartman

On Thu, Sep 12, 2019 at 06:49:26PM +0200, Paolo Bonzini wrote:
> [tl;dr: there could be a /dev/usb bug only affecting KASAN
> configurations, jump to the end to skip the analysis and get to the bug
> details]
> 
> On 12/09/19 15:54, Vitaly Kuznetsov wrote:
> > Hm, the bisection seems bogus but the stack points us to the following
> > piece of code:
> > 
> >  4776)  if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
> > 
> >  4783)  return nested_vmx_failValid(vcpu,
> >  4784)  
> > VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
> >  4785)  }
> >  4786) 
> >  4787)  new_vmcs12 = map.hva;
> >  4788) 
> > *4789)  if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
> >  4790)  (new_vmcs12->hdr.shadow_vmcs &&
> >  4791)   !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
> > 
> > the reported problem seems to be on VMCS12 region access but it's part
> > of guest memory and we successfuly managed to map it. We're definitely
> > within 1-page range. Maybe KASAN is just wrong here?
> 
> Here is the relevant part of the syzkaller repro:
> 
> syz_kvm_setup_cpu$x86(r1, 0x,
> &(0x7f00/0x18000)=nil, 0x0, 0x133, 0x0, 0x0, 0xff7d)
> r3 = syz_open_dev$usb(&(0x7f80)='/dev/bus/usb/00#/00#\x00',
> 0x4fd, 0x2008042)
> mmap$IORING_OFF_SQES(&(0x7f007000/0x2000)=nil, 0x2000, 0x4, 0x13,
> r3, 0x1000)
> syz_kvm_setup_cpu$x86(0x, r2,
> &(0x7f00/0x18000)=nil, 0x0, 0xfefd, 0x40, 0x0, 0xfdd4)
> ioctl$KVM_RUN(r2, 0xae80, 0x0)
> 
> The mmap$IORING_OFF_SQES is just a normal mmap from a device, which
> replaces the previous mapping for guest memory and in particular
> 0x7f007000 which is the VMCS (from the C reproducer: "#define
> ADDR_VAR_VMCS 0x7000").
> 
> The previous mapping is freed with do_munmap and then repopulated in
> usbdev_mmap with remap_pfn_range.  In KVM this means that kvm_vcpu_map
> goes through hva_to_pfn_remapped, which correctly calls get_page via
> kvm_get_pfn.  (Note that although drivers/usb/core/devio.c's usbdev_mmap
> sets VM_IO *after* calling remap_pfn_range, remap_pfn_range itself
> helpfully sets it before calling remap_p4d_range.  And anyway KVM is
> looking at vma->vm_flags under mmap_sem, which is held during mmap).
> 
> So, KVM should be doing the right thing.  Now, the error is:
> 
> > Read of size 4 at addr 888091e1 by task syz-executor758/10006
> > The buggy address belongs to the object at 888091e109c0 
> > The buggy address is located 2496 bytes to the left of
> >  8192-byte region [888091e109c0, 888091e129c0) 
> 
> And given the use of remap_pfn_range in devusb_mmap, the simplest
> explanation could be that USB expects kmalloc-8k to return 8k-aligned
> values, but this is not true anymore with KASAN.  CCing Dmitry, Greg and
> linux-usb.

USB drivers expect kmalloc to return DMA-able memory.  I don't know
about specific alignment issues, that should only an issue for the host
controller being used here, which you do not say in the above list.

We have had some reports that usbdev_mmap() does not do the "correct
thing" for all host controllers, but a lot of the DMA work that is in
linux-next for 5.4-rc1 should have helped resolve those issues.  What
tree are you seeing these bug reports happening from?

thanks,

greg k-h

Re: [RFC V1 1/7] genirq/msi: Differentiate between various MSI based interrupts

2019-09-12 Thread Greg KH

On Thu, Sep 12, 2019 at 06:32:02PM -0700, Megha Dey wrote:
> +enum msi_desc_tags {
> + IRQ_MSI_TAG_MSI,
> + IRQ_MSI_TAG_MSIX,
> + IRQ_MSI_TAG_IMS,
> + IRQ_MSI_TAG_PLAT,
> + IRQ_MSI_TAG_FSL,
> + IRQ_MSI_TAG_SCI,
> +};

What does any of these mean?  Can you please provide comments at the
very least saying what FSL, SCI, IMS and everything else is?

thanks,

greg k-h

Re: [RFC V1 2/7] drivers/base: Introduce callbacks for IMS interrupt domain

2019-09-12 Thread Greg KH

On Thu, Sep 12, 2019 at 06:32:03PM -0700, Megha Dey wrote:
> This patch serves as a preparatory patch to introduce a new IMS
> (Interrupt Message Store) domain. It consists of APIs which would
> be used as callbacks to the IRQ chip associated with the IMS domain.
> 
> The APIs introduced in this patch are:
> dev_ims_mask_irq - Generic irq chip callback to mask IMS interrupts
> dev_ims_unmask_irq - Generic irq chip callback to unmask IMS interrupts
> dev_ims_domain_write_msg - Helper to write MSI message to Device IMS
> 
> It also introduces IMS specific structures namely:
> dev_ims_ops - Callbacks for IMS domain ops
> dev_ims_desc - Device specific IMS msi descriptor data
> dev_ims_priv_data - Internal data structure containing a unique devid
> and a pointer to the IMS domain ops
> 
> Lastly, it adds a new config option MSI_IMS which must be enabled by
> any driver who would want to use the IMS infrastructure.
> 
> Since IMS is not PCI compliant (like platform-msi), most of the code is
> similar to platform-msi.c.
> 
> TODO: Conclude if ims-msi.c and platform-msi.c can be merged.
> 
> Cc: Jacob Pan 
> Signed-off-by: Sanjay Kumar 
> Signed-off-by: Megha Dey 
> ---
>  drivers/base/Kconfig   |  7 
>  drivers/base/Makefile  |  1 +
>  drivers/base/ims-msi.c | 94 
> ++
>  include/linux/msi.h| 35 ++-
>  4 files changed, 136 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/base/ims-msi.c
> 
> diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
> index dc40449..038fabd 100644
> --- a/drivers/base/Kconfig
> +++ b/drivers/base/Kconfig
> @@ -206,3 +206,10 @@ config GENERIC_ARCH_TOPOLOGY
> runtime.
>  
>  endmenu
> +
> +config MSI_IMS
> + bool "Device Specific Interrupt Message Storage (IMS)"
> + select GENERIC_MSI_IRQ
> + help
> +   This allows device drivers to enable device specific
> +   interrupt message storage (IMS) besides standard MSI-X interrupts.

This text tells me nothing about if I want to enable this or not.  How
is a user (or even a developer) supposed to know if their hardware
requires this?

And I _really_ dont want to see this in drivers/base/ if at all possible
because suddenly I am responsible for this code that I know nothing
about.

greg k-h

[PATCH] iwlwifi: dbg_ini: fix memory leak in alloc_sgtable

2019-09-12 Thread Navid Emamdoost

In alloc_sgtable if alloc_page fails, the alocated table should be
released.

Signed-off-by: Navid Emamdoost 
---
 drivers/net/wireless/intel/iwlwifi/fw/dbg.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c 
b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
index 4d81776f576d..db41abb3361d 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
@@ -643,6 +643,7 @@ static struct scatterlist *alloc_sgtable(int size)
if (new_page)
__free_page(new_page);
}
+   kfree(table);
return NULL;
}
alloc_size = min_t(int, size, PAGE_SIZE);
-- 
2.17.1

Re: [PATCH v8 00/17] Enable FSGSBASE instructions

2019-09-12 Thread Andy Lutomirski


On 9/12/19 1:06 PM, Chang S. Bae wrote:


Updates from v7 [7]:
(1) Consider FSGSBASE when determining which Spectre SWAPGS mitigations are
 required.
(2) Fixed save_fsgs() to be aware of interrupt conditions
(3) Made selftest changes based on Andy's previous fixes and cleanups
(4) Included Andy's paranoid exit cleanup
(5) Included documentation rewritten by Thomas
(6) Carried on Thomas' edits on multiple changelogs and comments
(7) Used '[FS|GS] base' consistently, except for selftest where GSBASE has
 been already used in its test messages
(8) Dropped the READ_MSR_GSBASE macro



This looks unpleasant to review.  I wonder if it would be better to 
unrevert the reversion, merge up to Linus' tree or -tip, and then base 
the changes on top of that.


I also think that, before this series can have my ack, it needs an 
actual gdb maintainer to chime in, publicly, and state that they have 
thought about and tested the ABI changes and that gdb still works on 
patched kernels with and without FSGSBASE enabled.  I realize that there 
were all kinds of discussions, but they were all quite theoretical, and 
I think that the actual patches need to be considered by people who 
understand the concerns.  Specific test cases would be nice, too.


Finally, I wrote up some notes here:

https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/fixes&id=70a7d284989e3539ee84f9d709d6450099f773fb

I want to make sure that they're accounted for, and that patch should 
possibly be applied.  The parent (broken link, but should fix itself soon):


https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/fixes&id=166324e907f8a71c823b41bbc2e1b5bc711532d8

may also help understand the relevant code.

--Andy

Re: [PATCH] clk: imx: lpcg: write twice when writing lpcg regs

2019-09-12 Thread Shawn Guo

On Tue, Sep 10, 2019 at 02:47:59AM +, Anson Huang wrote:
> 
> 
> > On Sat, Sep 7, 2019 at 9:47 PM Stephen Boyd  wrote:
> > >
> > > Quoting Peng Fan (2019-08-27 01:17:50)
> > > > From: Peng Fan 
> > > >
> > > > There is hardware issue that:
> > > > The output clock the LPCG cell will not turn back on as expected,
> > > > even though a read of the IPG registers in the LPCG indicates that
> > > > the clock should be enabled.
> > > >
> > > > The software workaround is to write twice to enable the LPCG clock
> > > > output.
> > > >
> > > > Signed-off-by: Peng Fan 
> > >
> > > Does this need a Fixes tag?
> > 
> > Not sure as it's not code logic issue but a hardware bug.
> > And 4.19 LTS still have not this driver support.
> 
> Looks like there is an errata for this issue, and Ranjani just sent a patch 
> for review internally,

Having errata number in both commit log and code comment is generally
helpful.

Shawn

[PATCH net-next] net: dsa: b53: Add support for port_egress_floods callback

2019-09-12 Thread Florian Fainelli

Add support for configuring the per-port egress flooding control for
both Unicast and Multicast traffic.

Signed-off-by: Florian Fainelli 
---
Beneditk,

Do you mind re-testing, or confirming that this patch that I sent much
earlier does work correctly for you? Thanks!

 drivers/net/dsa/b53/b53_common.c | 33 
 drivers/net/dsa/b53/b53_priv.h   |  2 ++
 2 files changed, 35 insertions(+)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 7d328a5f0161..ac2ec08a652b 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -342,6 +342,13 @@ static void b53_set_forwarding(struct b53_device *dev, int 
enable)
b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, &mgmt);
mgmt |= B53_MII_DUMB_FWDG_EN;
b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, mgmt);
+
+   /* Look at B53_UC_FWD_EN and B53_MC_FWD_EN to decide whether
+* frames should be flooed or not.
+*/
+   b53_read8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, &mgmt);
+   mgmt |= B53_UC_FWD_EN | B53_MC_FWD_EN;
+   b53_write8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, mgmt);
 }
 
 static void b53_enable_vlan(struct b53_device *dev, bool enable,
@@ -1753,6 +1760,31 @@ void b53_br_fast_age(struct dsa_switch *ds, int port)
 }
 EXPORT_SYMBOL(b53_br_fast_age);
 
+int b53_br_egress_floods(struct dsa_switch *ds, int port,
+bool unicast, bool multicast)
+{
+   struct b53_device *dev = ds->priv;
+   u16 uc, mc;
+
+   b53_read16(dev, B53_CTRL_PAGE, B53_UC_FWD_EN, &uc);
+   if (unicast)
+   uc |= BIT(port);
+   else
+   uc &= ~BIT(port);
+   b53_write16(dev, B53_CTRL_PAGE, B53_UC_FWD_EN, uc);
+
+   b53_read16(dev, B53_CTRL_PAGE, B53_MC_FWD_EN, &mc);
+   if (multicast)
+   mc |= BIT(port);
+   else
+   mc &= ~BIT(port);
+   b53_write16(dev, B53_CTRL_PAGE, B53_MC_FWD_EN, mc);
+
+   return 0;
+
+}
+EXPORT_SYMBOL(b53_br_egress_floods);
+
 static bool b53_possible_cpu_port(struct dsa_switch *ds, int port)
 {
/* Broadcom switches will accept enabling Broadcom tags on the
@@ -1953,6 +1985,7 @@ static const struct dsa_switch_ops b53_switch_ops = {
.port_bridge_leave  = b53_br_leave,
.port_stp_state_set = b53_br_set_stp_state,
.port_fast_age  = b53_br_fast_age,
+   .port_egress_floods = b53_br_egress_floods,
.port_vlan_filtering= b53_vlan_filtering,
.port_vlan_prepare  = b53_vlan_prepare,
.port_vlan_add  = b53_vlan_add,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index f25bc80c4ffc..a7dd8acc281b 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -319,6 +319,8 @@ int b53_br_join(struct dsa_switch *ds, int port, struct 
net_device *bridge);
 void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge);
 void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state);
 void b53_br_fast_age(struct dsa_switch *ds, int port);
+int b53_br_egress_floods(struct dsa_switch *ds, int port,
+bool unicast, bool multicast);
 void b53_port_event(struct dsa_switch *ds, int port);
 void b53_phylink_validate(struct dsa_switch *ds, int port,
  unsigned long *supported,
-- 
2.17.1

[PATCH v2 2/2] gpiolib: introduce fwnode_gpiod_get_index()

This introduces fwnode_gpiod_get_index() that iterates through common gpio
suffixes when trying to locate a GPIO within a given firmware node.

We also switch devm_fwnode_gpiod_get_index() to call
fwnode_gpiod_get_index() instead of iterating through GPIO suffixes on
its own.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Dmitry Torokhov 

---

Changes in v2:
- rebased on top of Linus W devel branch
- added Andy's Reviewed-by

 drivers/gpio/gpiolib-devres.c | 16 +---
 drivers/gpio/gpiolib.c| 48 +++
 include/linux/gpio/consumer.h | 13 ++
 3 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/drivers/gpio/gpiolib-devres.c b/drivers/gpio/gpiolib-devres.c
index 9a0475c87f95..4421be09b960 100644
--- a/drivers/gpio/gpiolib-devres.c
+++ b/drivers/gpio/gpiolib-devres.c
@@ -205,29 +205,15 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct 
device *dev,
  enum gpiod_flags flags,
  const char *label)
 {
-   char prop_name[32]; /* 32 is max size of property name */
struct gpio_desc **dr;
struct gpio_desc *desc;
-   unsigned int i;
 
dr = devres_alloc(devm_gpiod_release, sizeof(struct gpio_desc *),
  GFP_KERNEL);
if (!dr)
return ERR_PTR(-ENOMEM);
 
-   for (i = 0; i < ARRAY_SIZE(gpio_suffixes); i++) {
-   if (con_id)
-   snprintf(prop_name, sizeof(prop_name), "%s-%s",
-   con_id, gpio_suffixes[i]);
-   else
-   snprintf(prop_name, sizeof(prop_name), "%s",
-   gpio_suffixes[i]);
-
-   desc = fwnode_get_named_gpiod(fwnode, prop_name, index, flags,
- label);
-   if (!IS_ERR(desc) || (PTR_ERR(desc) != -ENOENT))
-   break;
-   }
+   desc = fwnode_gpiod_get_index(fwnode, con_id, index, flags, label);
if (IS_ERR(desc)) {
devres_free(dr);
return desc;
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 158e327a1285..11a6f4777436 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -4317,6 +4317,54 @@ static int platform_gpio_count(struct device *dev, const 
char *con_id)
return count;
 }
 
+/**
+ * fwnode_gpiod_get_index - obtain a GPIO from firmware node
+ * @fwnode:handle of the firmware node
+ * @con_id:function within the GPIO consumer
+ * @index: index of the GPIO to obtain for the consumer
+ * @flags: GPIO initialization flags
+ * @label: label to attach to the requested GPIO
+ *
+ * This function can be used for drivers that get their configuration
+ * from opaque firmware.
+ *
+ * The function properly finds the corresponding GPIO using whatever is the
+ * underlying firmware interface and then makes sure that the GPIO
+ * descriptor is requested before it is returned to the caller.
+ *
+ * Returns:
+ * On successful request the GPIO pin is configured in accordance with
+ * provided @flags.
+ *
+ * In case of error an ERR_PTR() is returned.
+ */
+struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode,
+const char *con_id, int index,
+enum gpiod_flags flags,
+const char *label)
+{
+   struct gpio_desc *desc;
+   char prop_name[32]; /* 32 is max size of property name */
+   unsigned int i;
+
+   for (i = 0; i < ARRAY_SIZE(gpio_suffixes); i++) {
+   if (con_id)
+   snprintf(prop_name, sizeof(prop_name), "%s-%s",
+   con_id, gpio_suffixes[i]);
+   else
+   snprintf(prop_name, sizeof(prop_name), "%s",
+   gpio_suffixes[i]);
+
+   desc = fwnode_get_named_gpiod(fwnode, prop_name, index, flags,
+ label);
+   if (!IS_ERR(desc) || (PTR_ERR(desc) != -ENOENT))
+   break;
+   }
+
+   return desc;
+}
+EXPORT_SYMBOL_GPL(fwnode_gpiod_get_index);
+
 /**
  * gpiod_count - return the number of GPIOs associated with a device / function
  * or -ENOENT if no GPIO has been assigned to the requested 
function
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index dc0ddcd30515..5215fdba6b9a 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -176,6 +176,10 @@ struct gpio_desc *fwnode_get_named_gpiod(struct 
fwnode_handle *fwnode,
 const char *propname, int index,
 enum gpiod_flags dflags,
 const char *labe

[PATCH v2 0/2] Add support for software nodes to gpiolib

This is a part of the larger series previously posted at

https://lore.kernel.org/linux-gpio/20190911075215.78047-1-dmitry.torok...@gmail.com

that was rebased on top of linux-gpio devel branch.

Changes in v2:
- switched export to be EXPORT_SYMBOL_GPL to match the new export
  markings for the rest of GPIO devres functions
- rebased on top of Linus W devel branch
- added Andy's Reviewed-by

Dmitry Torokhov (2):
  gpiolib: introduce devm_fwnode_gpiod_get_index()
  gpiolib: introduce fwnode_gpiod_get_index()

 drivers/gpio/gpiolib-devres.c | 33 ++---
 drivers/gpio/gpiolib.c| 48 +++
 include/linux/gpio/consumer.h | 54 ---
 3 files changed, 101 insertions(+), 34 deletions(-)

-- 
2.23.0.237.gc6a4ce50a0-goog

[PATCH v2 1/2] gpiolib: introduce devm_fwnode_gpiod_get_index()

devm_fwnode_get_index_gpiod_from_child() is too long, besides the fwnode
in question does not have to be a child of device node. Let's rename it
to devm_fwnode_gpiod_get_index() and keep the old name for compatibility
for now.

Also let's add a devm_fwnode_gpiod_get() wrapper as majority of the
callers need a single GPIO.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Dmitry Torokhov 

---

Changes in v2:
- switched export to be EXPORT_SYMBOL_GPL to match the new export
  markings for the rest of GPIO devres functions
- rebased on top of Linus W devel branch
- added Andy's Reviewed-by

 drivers/gpio/gpiolib-devres.c | 19 
 include/linux/gpio/consumer.h | 41 ++-
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/drivers/gpio/gpiolib-devres.c b/drivers/gpio/gpiolib-devres.c
index 98e3c20d9730..9a0475c87f95 100644
--- a/drivers/gpio/gpiolib-devres.c
+++ b/drivers/gpio/gpiolib-devres.c
@@ -185,12 +185,11 @@ struct gpio_desc *devm_gpiod_get_from_of_node(struct 
device *dev,
 EXPORT_SYMBOL_GPL(devm_gpiod_get_from_of_node);
 
 /**
- * devm_fwnode_get_index_gpiod_from_child - get a GPIO descriptor from a
- * device's child node
+ * devm_fwnode_gpiod_get_index - get a GPIO descriptor from a given node
  * @dev:   GPIO consumer
+ * @fwnode:firmware node containing GPIO reference
  * @con_id:function within the GPIO consumer
  * @index: index of the GPIO to obtain in the consumer
- * @child: firmware node (child of @dev)
  * @flags: GPIO initialization flags
  * @label: label to attach to the requested GPIO
  *
@@ -200,11 +199,11 @@ EXPORT_SYMBOL_GPL(devm_gpiod_get_from_of_node);
  * On successful request the GPIO pin is configured in accordance with
  * provided @flags.
  */
-struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
-   const char *con_id, int index,
-   struct fwnode_handle *child,
-   enum gpiod_flags flags,
-   const char *label)
+struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev,
+ struct fwnode_handle *fwnode,
+ const char *con_id, int index,
+ enum gpiod_flags flags,
+ const char *label)
 {
char prop_name[32]; /* 32 is max size of property name */
struct gpio_desc **dr;
@@ -224,7 +223,7 @@ struct gpio_desc 
*devm_fwnode_get_index_gpiod_from_child(struct device *dev,
snprintf(prop_name, sizeof(prop_name), "%s",
gpio_suffixes[i]);
 
-   desc = fwnode_get_named_gpiod(child, prop_name, index, flags,
+   desc = fwnode_get_named_gpiod(fwnode, prop_name, index, flags,
  label);
if (!IS_ERR(desc) || (PTR_ERR(desc) != -ENOENT))
break;
@@ -239,7 +238,7 @@ struct gpio_desc 
*devm_fwnode_get_index_gpiod_from_child(struct device *dev,
 
return desc;
 }
-EXPORT_SYMBOL_GPL(devm_fwnode_get_index_gpiod_from_child);
+EXPORT_SYMBOL_GPL(devm_fwnode_gpiod_get_index);
 
 /**
  * devm_gpiod_get_index_optional - Resource-managed gpiod_get_index_optional()
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index b70af921c614..dc0ddcd30515 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -176,11 +176,11 @@ struct gpio_desc *fwnode_get_named_gpiod(struct 
fwnode_handle *fwnode,
 const char *propname, int index,
 enum gpiod_flags dflags,
 const char *label);
-struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
-   const char *con_id, int index,
-   struct fwnode_handle *child,
-   enum gpiod_flags flags,
-   const char *label);
+struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev,
+ struct fwnode_handle *child,
+ const char *con_id, int index,
+ enum gpiod_flags flags,
+ const char *label);
 
 #else /* CONFIG_GPIOLIB */
 
@@ -531,6 +531,29 @@ struct gpio_desc *fwnode_get_named_gpiod(struct 
fwnode_handle *fwnode,
return ERR_PTR(-ENOSYS);
 }
 
+static inline
+struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev,
+

[PATCH 03/11] KVM: x86/mmu: Use fast invalidate mechanism to zap MMIO sptes

Use the fast invalidate mechasim to zap MMIO sptes on a MMIO generation
wrap.  The fast invalidate flow was reintroduced to fix a livelock bug
in kvm_mmu_zap_all() that can occur if kvm_mmu_zap_all() is invoked when
the guest has live vCPUs.  I.e. using kvm_mmu_zap_all() to handle the
MMIO generation wrap is theoretically susceptible to the livelock bug.

This effectively reverts commit 4771450c345dc ("Revert "KVM: MMU: drop
kvm_mmu_zap_mmio_sptes""), i.e. restores the behavior of commit
a8eca9dcc656a ("KVM: MMU: drop kvm_mmu_zap_mmio_sptes").

Note, this actually fixes commit 571c5af06e303 ("KVM: x86/mmu:
Voluntarily reschedule as needed when zapping MMIO sptes"), but there
is no need to incrementally revert back to using fast invalidate, e.g.
doing so doesn't provide any bisection or stability benefits.

Fixes: 571c5af06e303 ("KVM: x86/mmu: Voluntarily reschedule as needed when 
zapping MMIO sptes")
Cc: sta...@vger.kernel.org
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/mmu.c  | 17 +++--
 2 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc279b513446..ef378abac00f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -320,7 +320,6 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
bool unsync;
-   bool mmio_cached;
 
/*
 * The following two entries are used to key the shadow page in the
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 373e6f052f9f..8d3fbc48d1be 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -403,8 +403,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 
*sptep, u64 gfn,
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< shadow_nonpresent_or_rsvd_mask_len;
 
-   page_header(__pa(sptep))->mmio_cached = true;
-
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
 }
@@ -5947,7 +5945,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
-static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
+void kvm_mmu_zap_all(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
@@ -5956,14 +5954,10 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool 
mmio_only)
spin_lock(&kvm->mmu_lock);
 restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-   if (mmio_only && !sp->mmio_cached)
-   continue;
if (sp->role.invalid && sp->root_count)
continue;
-   if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
-   WARN_ON_ONCE(mmio_only);
+   if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
-   }
if (cond_resched_lock(&kvm->mmu_lock))
goto restart;
}
@@ -5972,11 +5966,6 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool 
mmio_only)
spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
-   return __kvm_mmu_zap_all(kvm, false);
-}
-
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
@@ -5998,7 +5987,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 
gen)
 */
if (unlikely(gen == 0)) {
kvm_debug_ratelimited("kvm: zapping shadow pages for mmio 
generation wraparound\n");
-   __kvm_mmu_zap_all(kvm, true);
+   kvm_mmu_zap_all_fast(kvm);
}
 }
 
-- 
2.22.0

[PATCH 00/11] KVM: x86/mmu: Restore fast invalidate/zap flow

Restore the fast invalidate flow for zapping shadow pages and use it
whenever vCPUs can be active in the VM.  This fixes (in theory, not yet
confirmed) a regression reported by James Harvey where KVM can livelock
in kvm_mmu_zap_all() when it's invoked in response to a memslot update.

The fast invalidate flow was removed as it was deemed to be unnecessary
after its primary user, memslot flushing, was reworked to zap only the
memslot in question instead of all shadow pages.  Unfortunately, zapping
only the memslot being (re)moved during a memslot update introduced a
regression for VMs with assigned devices.  Because we could not discern
why zapping only the relevant memslot broke device assignment, or if the
regression extended beyond device assignment, we reverted to zapping all
shadow pages when a memslot is (re)moved.

The revert to "zap all" failed to account for subsequent changes that
have been made to kvm_mmu_zap_all() between then and now.  Specifically,
kvm_mmu_zap_all() now conditionally drops reschedules and drops mmu_lock
if a reschedule is needed or if the lock is contended.  Dropping the lock
allows other vCPUs to add shadow pages, and, with enough vCPUs, can cause
kvm_mmu_zap_all() to get stuck in an infinite loop as it can never zap all
pages before observing lock contention or the need to reschedule.

The reasoning behind having kvm_mmu_zap_all() conditionally reschedule was
that it would only be used when the VM is inaccesible, e.g. when its
mm_struct is dying or when the VM itself is being destroyed.  In that case,
playing nice with the rest of the kernel instead of hogging cycles to free
unused shadow pages made sense.

Since it's unlikely we'll root cause the device assignment regression any
time soon, and that simply removing the conditional rescheduling isn't
guaranteed to return us to a known good state, restore the fast invalidate
flow for zapping on memslot updates, including mmio generation wraparound.
Opportunisticaly tack on a bug fix and a couple enhancements.

Alex and James, it probably goes without saying... please test, especially
patch 01/11 as a standalone patch as that'll likely need to be applied to
stable branches, assuming it works.  Thanks!

Sean Christopherson (11):
  KVM: x86/mmu: Reintroduce fast invalidate/zap for flushing memslot
  KVM: x86/mmu: Treat invalid shadow pages as obsolete
  KVM: x86/mmu: Use fast invalidate mechanism to zap MMIO sptes
  KVM: x86/mmu: Revert "Revert "KVM: MMU: show mmu_valid_gen in shadow
page related tracepoints""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: add tracepoint for
kvm_mmu_invalidate_all_pages""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: zap pages in batch""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: collapse TLB flushes when zap
all pages""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: reclaim the zapped-obsolete
page first""
  KVM: x86/mmu: Revert "KVM: x86/mmu: Remove is_obsolete() call"
  KVM: x86/mmu: Explicitly track only a single invalid mmu generation
  KVM: x86/mmu: Skip invalid pages during zapping iff root_count is zero

 arch/x86/include/asm/kvm_host.h |   4 +-
 arch/x86/kvm/mmu.c  | 154 
 arch/x86/kvm/mmutrace.h |  42 +++--
 arch/x86/kvm/x86.c  |   1 +
 4 files changed, 173 insertions(+), 28 deletions(-)

-- 
2.22.0

[PATCH 02/11] KVM: x86/mmu: Treat invalid shadow pages as obsolete

Treat invalid shadow pages as obsolete to fix a bug where an obsolete
and invalid page with a non-zero root count could become non-obsolete
due to mmu_valid_gen wrapping.  The bug is largely theoretical with the
current code base, as an unsigned long will effectively never wrap on
64-bit KVM, and userspace would have to deliberately stall a vCPU in
order to keep an obsolete invalid page on the active list while
simultaneously modifying memslots billions of times to trigger a wrap.

The obvious alternative is to use a 64-bit value for mmu_valid_gen,
but it's actually desirable to go in the opposite direction, i.e. using
a smaller 8-bit value to reduce KVM's memory footprint by 8 bytes per
shadow page, and relying on proper treatment of invalid pages instead of
preventing the generation from wrapping.

Note, "Fixes" points at a commit that was at one point reverted, but has
since been restored.

Fixes: 5304b8d37c2a5 ("KVM: MMU: fast invalidate all pages")
Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5ac5e3f50f92..373e6f052f9f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2252,7 +2252,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 #define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp,   \
  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-   if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) {\
+   if (is_obsolete_sp((_kvm), (_sp))) {\
} else
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)
\
@@ -2311,7 +2311,8 @@ static void mmu_audit_disable(void) { }
 
 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-   return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+   return sp->role.invalid ||
+  unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
 }
 
 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-- 
2.22.0

[PATCH 04/11] KVM: x86/mmu: Revert "Revert "KVM: MMU: show mmu_valid_gen in shadow page related tracepoints""

Now that the fast invalidate mechanism has been reintroduced, restore
tracing of the generation number in shadow page tracepoints.

This reverts commit b59c4830ca185ba0e9f9e046fb1cd10a4a92627a.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmutrace.h | 21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index d8001b4bca05..e9832b5ec53c 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -8,16 +8,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
-#define KVM_MMU_PAGE_FIELDS \
-   __field(__u64, gfn) \
-   __field(__u32, role) \
-   __field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS\
+   __field(unsigned long, mmu_valid_gen)   \
+   __field(__u64, gfn) \
+   __field(__u32, role)\
+   __field(__u32, root_count)  \
__field(bool, unsync)
 
-#define KVM_MMU_PAGE_ASSIGN(sp) \
-   __entry->gfn = sp->gfn;  \
-   __entry->role = sp->role.word;   \
-   __entry->root_count = sp->root_count;\
+#define KVM_MMU_PAGE_ASSIGN(sp)\
+   __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+   __entry->gfn = sp->gfn; \
+   __entry->role = sp->role.word;  \
+   __entry->root_count = sp->root_count;   \
__entry->unsync = sp->unsync;
 
 #define KVM_MMU_PAGE_PRINTK() ({   \
@@ -29,8 +31,9 @@
\
role.word = __entry->role;  \
\
-   trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s"\
+   trace_seq_printf(p, "sp gen %lx gfn %llx l%u %u-byte q%u%s %s%s"\
 " %snxe %sad root %u %s%c",\
+__entry->mmu_valid_gen,\
 __entry->gfn, role.level,  \
 role.gpte_is_8_bytes ? 8 : 4,  \
 role.quadrant, \
-- 
2.22.0

[PATCH 11/11] KVM: x86/mmu: Skip invalid pages during zapping iff root_count is zero

Do not skip invalid shadow pages when zapping obsolete pages if the
pages' root_count has reached zero, in which case the page can be
immediately zapped and freed.

Update the comment accordingly.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a7b14750cde9..5e41b1f77a6d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5692,11 +5692,12 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
break;
 
/*
-* Since we are reversely walking the list and the invalid
-* list will be moved to the head, skip the invalid page
-* can help us to avoid the infinity list walking.
+* Skip invalid pages with a non-zero root count, zapping pages
+* with a non-zero root count will never succeed, i.e. the page
+* will get thrown back on active_mmu_pages and we'll get stuck
+* in an infinite loop.
 */
-   if (sp->role.invalid)
+   if (sp->role.invalid && sp->root_count)
continue;
 
/*
-- 
2.22.0

[PATCH 06/11] KVM: x86/mmu: Revert "Revert "KVM: MMU: zap pages in batch""

Now that the fast invalidate mechanism has been reintroduced, restore
the performance tweaks for fast invalidation that existed prior to its
removal.

Paraphrashing the original changelog:

  Zap at least 10 shadow pages before releasing mmu_lock to reduce the
  overhead associated with re-acquiring the lock.

  Note: "10" is an arbitrary number, speculated to be high enough so
  that a vCPU isn't stuck zapping obsolete pages for an extended period,
  but small enough so that other vCPUs aren't starved waiting for
  mmu_lock.

This reverts commit 43d2b14b105fb00b8864c7b0ee7043cc1cc4a969.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c | 35 +--
 1 file changed, 9 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bf20afc3e73..827414b12dbd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5670,12 +5670,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
return alloc_mmu_pages(vcpu);
 }
 
-
+#define BATCH_ZAP_PAGES10
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
-   int ign;
+   int nr_zapped, batch = 0;
 
 restart:
list_for_each_entry_safe_reverse(sp, node,
@@ -5688,28 +5688,6 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
break;
 
/*
-* Do not repeatedly zap a root page to avoid unnecessary
-* KVM_REQ_MMU_RELOAD, otherwise we may not be able to
-* progress:
-*vcpu 0vcpu 1
-* call vcpu_enter_guest():
-*1): handle KVM_REQ_MMU_RELOAD
-*and require mmu-lock to
-*load mmu
-* repeat:
-*1): zap root page and
-*send KVM_REQ_MMU_RELOAD
-*
-*2): if (cond_resched_lock(mmu-lock))
-*
-*2): hold mmu-lock and load mmu
-*
-*3): see KVM_REQ_MMU_RELOAD bit
-*on vcpu->requests is set
-*then return 1 to call
-*vcpu_enter_guest() again.
-*goto repeat;
-*
 * Since we are reversely walking the list and the invalid
 * list will be moved to the head, skip the invalid page
 * can help us to avoid the infinity list walking.
@@ -5717,14 +5695,19 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
if (sp->role.invalid)
continue;
 
-   if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+   if (batch >= BATCH_ZAP_PAGES &&
+   (need_resched() || spin_needbreak(&kvm->mmu_lock))) {
+   batch = 0;
kvm_mmu_commit_zap_page(kvm, &invalid_list);
cond_resched_lock(&kvm->mmu_lock);
goto restart;
}
 
-   if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+   if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
+  &nr_zapped)) {
+   batch += nr_zapped;
goto restart;
+   }
}
 
kvm_mmu_commit_zap_page(kvm, &invalid_list);
-- 
2.22.0

[PATCH 08/11] KVM: x86/mmu: Revert "Revert "KVM: MMU: reclaim the zapped-obsolete page first""

Now that the fast invalidate mechanism has been reintroduced, restore
the performance tweaks for fast invalidation that existed prior to its
removal.

Paraphrashing the original changelog:

  Introduce a per-VM list to track obsolete shadow pages, i.e. pages
  which have been deleted from the mmu cache but haven't yet been freed.
  When page reclaiming is needed, zap/free the deleted pages first.

This reverts commit 52d5dedc79bdcbac2976159a172069618cf31be5.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c  | 22 +-
 arch/x86/kvm/x86.c  |  1 +
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ef378abac00f..6e4fa75351fd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -862,6 +862,7 @@ struct kvm_arch {
 * Hash table of struct kvm_mmu_page.
 */
struct list_head active_mmu_pages;
+   struct list_head zapped_obsolete_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8c0648bbc7c1..84d916674529 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5674,7 +5674,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
-   LIST_HEAD(invalid_list);
int nr_zapped, batch = 0;
 
 restart:
@@ -5707,8 +5706,8 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
goto restart;
}
 
-   if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
-  &nr_zapped)) {
+   if (__kvm_mmu_prepare_zap_page(kvm, sp,
+   &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
batch += nr_zapped;
goto restart;
}
@@ -5719,7 +5718,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 * KVM is not in the middle of a lockless shadow page table walk, which
 * may reference the pages.
 */
-   kvm_mmu_commit_zap_page(kvm, &invalid_list);
+   kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
 }
 
 /*
@@ -5751,6 +5750,11 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
spin_unlock(&kvm->mmu_lock);
 }
 
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+   return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_page_track_notifier_node *node)
@@ -6021,16 +6025,24 @@ mmu_shrink_scan(struct shrinker *shrink, struct 
shrink_control *sc)
 * want to shrink a VM that only started to populate its MMU
 * anyway.
 */
-   if (!kvm->arch.n_used_mmu_pages)
+   if (!kvm->arch.n_used_mmu_pages &&
+   !kvm_has_zapped_obsolete_pages(kvm))
continue;
 
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
 
+   if (kvm_has_zapped_obsolete_pages(kvm)) {
+   kvm_mmu_commit_zap_page(kvm,
+ &kvm->arch.zapped_obsolete_pages);
+   goto unlock;
+   }
+
if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
freed++;
kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
+unlock:
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4cfd786d0b6..3d092b0f6bcb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9306,6 +9306,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+   INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
-- 
2.22.0

[PATCH 05/11] KVM: x86/mmu: Revert "Revert "KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages""

Now that the fast invalidate mechanism has been reintroduced, restore
the tracepoint associated with said mechanism.

Note, the name of the tracepoint deviates from the original tracepoint
so as to match KVM's current nomenclature.

This reverts commit 42560fb1f3c6c7f730897b7fa7a478bc37e0be50.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c  |  1 +
 arch/x86/kvm/mmutrace.h | 21 +
 2 files changed, 22 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8d3fbc48d1be..0bf20afc3e73 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5742,6 +5742,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
spin_lock(&kvm->mmu_lock);
+   trace_kvm_mmu_zap_all_fast(kvm);
kvm->arch.mmu_valid_gen++;
 
kvm_zap_obsolete_pages(kvm);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index e9832b5ec53c..1a063ba76281 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -282,6 +282,27 @@ TRACE_EVENT(
)
 );
 
+TRACE_EVENT(
+   kvm_mmu_zap_all_fast,
+   TP_PROTO(struct kvm *kvm),
+   TP_ARGS(kvm),
+
+   TP_STRUCT__entry(
+   __field(unsigned long, mmu_valid_gen)
+   __field(unsigned int, mmu_used_pages)
+   ),
+
+   TP_fast_assign(
+   __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+   __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+   ),
+
+   TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+   )
+);
+
+
 TRACE_EVENT(
check_mmio_spte,
TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
-- 
2.22.0

[PATCH 01/11] KVM: x86/mmu: Reintroduce fast invalidate/zap for flushing memslot

Reintroduce the fast invalidate mechanism and use it when zapping shadow
pages in response to a memslot being deleted/moved.  Using the fast
mechanism fixes a livelock reported by James Harvey that was introduced
by commit d012a06ab1d23 ("Revert "KVM: x86/mmu: Zap only the relevant
pages when removing a memslot"").

The livelock occurs because kvm_mmu_zap_all() as it exists today will
voluntarily reschedule and drop KVM's mmu_lock, which allows other vCPUs
to add shadow pages.  With enough vCPUs, kvm_mmu_zap_all() can get stuck
in an infinite loop as it can never zap all pages before observing lock
contention or the need to reschedule.

The equivalent of kvm_mmu_zap_all() that was in use at the time of
the reverted commit (4e103134b8623, "KVM: x86/mmu: Zap only the relevant
pages when removing a memslot") employed a fast invalidate mechanism and
was not susceptible to the above livelock.  Restore the fast invalidate
code and use it when flushing a memslot.

Reverting the revert (commit d012a06ab1d23) is not a viable option as
the revert is needed to fix a regression that occurs when the guest has
one or more assigned devices.

Alternatively, the livelock could be eliminated by removing the
conditional reschedule from kvm_mmu_zap_all().  However, although
removing the reschedule would be a smaller code change, it's less safe
in the sense that the resulting kvm_mmu_zap_all() hasn't been used in
the wild for flushing memslots since the fast invalidate mechanism was
introduced by commit 6ca18b6950f8d ("KVM: x86: use the fast way to
invalidate all pages"), back in 2013.

For all intents and purposes, this is a revert of commit ea145aacf4ae8
("Revert "KVM: MMU: fast invalidate all pages"") and a partial revert of
commit 7390de1e99a70 ("Revert "KVM: x86: use the fast way to invalidate
all pages""), i.e. restores the behavior of commit 5304b8d37c2a5 ("KVM:
MMU: fast invalidate all pages") and commit 6ca18b6950f8d ("KVM: x86:
use the fast way to invalidate all pages") respectively.

Fixes: d012a06ab1d23 ("Revert "KVM: x86/mmu: Zap only the relevant pages when 
removing a memslot"")
Reported-by: James Harvey 
Cc: Alex Willamson 
Cc: Paolo Bonzini 
Cc: sta...@vger.kernel.org
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/kvm/mmu.c  | 101 +++-
 2 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 44a5ce57a905..fc279b513446 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -335,6 +335,7 @@ struct kvm_mmu_page {
int root_count;  /* Currently serving as active root */
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+   unsigned long mmu_valid_gen;
DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -856,6 +857,7 @@ struct kvm_arch {
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
+   unsigned long mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4c45ff0cfbd0..5ac5e3f50f92 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2097,6 +2097,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct 
kvm_vcpu *vcpu, int direct
if (!direct)
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+   /*
+* active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+* depends on valid pages being added to the head of the list.  See
+* comments in kvm_zap_obsolete_pages().
+*/
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
@@ -2246,7 +2252,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 #define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp,   \
  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-   if ((_sp)->role.invalid) {\
+   if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) {\
} else
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)
\
@@ -2303,6 +2309,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int 
point) { }
 static void mmu_audit_disable(void) { }
 #endif
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+   return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 struct list_head *invalid_list)

[PATCH 09/11] KVM: x86/mmu: Revert "KVM: x86/mmu: Remove is_obsolete() call"

Now that the fast invalidate mechanism has been reintroduced, restore
the performance tweaks for fast invalidation that existed prior to its
removal.

Paraphrasing the original changelog (commit 5ff0568374ed2 was itself a
partial revert):

  Don't force reloading the remote mmu when zapping an obsolete page, as
  a MMU_RELOAD request has already been issued by kvm_mmu_zap_all_fast()
  immediately after incrementing mmu_valid_gen, i.e. after marking pages
  obsolete.

This reverts commit 5ff0568374ed2e585376a3832857ade5daccd381.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 84d916674529..bce19918ca5a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2752,7 +2752,12 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
 
-   if (!sp->role.invalid)
+   /*
+* Obsolete pages cannot be used on any vCPUs, see the comment
+* in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
+* treats invalid shadow pages as being obsolete.
+*/
+   if (!is_obsolete_sp(kvm, sp))
kvm_reload_remote_mmus(kvm);
}
 
-- 
2.22.0

Re: [PATCH] memcg, kmem: do not fail __GFP_NOFAIL charges

2019-09-12 Thread Shakeel Butt

On Wed, Sep 11, 2019 at 8:16 AM Michal Hocko  wrote:
>
> On Wed 11-09-19 07:37:40, Andrew Morton wrote:
> > On Wed, 11 Sep 2019 14:00:02 +0200 Michal Hocko  wrote:
> >
> > > On Mon 09-09-19 13:22:45, Michal Hocko wrote:
> > > > On Fri 06-09-19 11:24:55, Shakeel Butt wrote:
> > > [...]
> > > > > I wonder what has changed since
> > > > > .
> > > >
> > > > I have completely forgot about that one. It seems that we have just
> > > > repeated the same discussion again. This time we have a poor user who
> > > > actually enabled the kmem limit.
> > > >
> > > > I guess there was no real objection to the change back then. The primary
> > > > discussion revolved around the fact that the accounting will stay broken
> > > > even when this particular part was fixed. Considering this leads to easy
> > > > to trigger crash (with the limit enabled) then I guess we should just
> > > > make it less broken and backport to stable trees and have a serious
> > > > discussion about discontinuing of the limit. Start by simply failing to
> > > > set any limit in the current upstream kernels.
> > >
> > > Any more concerns/objections to the patch? I can add a reference to your
> > > earlier post Shakeel if you want or to credit you the way you prefer.
> > >
> > > Also are there any objections to start deprecating process of kmem
> > > limit? I would see it in two stages
> > > - 1st warn in the kernel log
> > > pr_warn("kmem.limit_in_bytes is deprecated and will be removed.
> > > "Please report your usecase to linux...@kvack.org if you "
> > > "depend on this functionality."
> >
> > pr_warn_once() :)
> >
> > > - 2nd fail any write to kmem.limit_in_bytes
> > > - 3rd remove the control file completely
> >
> > Sounds good to me.
>
> Here we go
>
> From 512822e551fe2960040c23b12c7b27a5fdab9013 Mon Sep 17 00:00:00 2001
> From: Michal Hocko 
> Date: Wed, 11 Sep 2019 17:02:33 +0200
> Subject: [PATCH] memcg, kmem: deprecate kmem.limit_in_bytes
>
> Cgroup v1 memcg controller has exposed a dedicated kmem limit to users
> which turned out to be really a bad idea because there are paths which
> cannot shrink the kernel memory usage enough to get below the limit
> (e.g. because the accounted memory is not reclaimable). There are cases
> when the failure is even not allowed (e.g. __GFP_NOFAIL). This means
> that the kmem limit is in excess to the hard limit without any way to
> shrink and thus completely useless. OOM killer cannot be invoked to
> handle the situation because that would lead to a premature oom killing.
>
> As a result many places might see ENOMEM returning from kmalloc and
> result in unexpected errors. E.g. a global OOM killer when there is a
> lot of free memory because ENOMEM is translated into VM_FAULT_OOM in #PF
> path and therefore pagefault_out_of_memory would result in OOM killer.
>
> Please note that the kernel memory is still accounted to the overall
> limit along with the user memory so removing the kmem specific limit
> should still allow to contain kernel memory consumption. Unlike the kmem
> one, though, it invokes memory reclaim and targeted memcg oom killing if
> necessary.
>
> Start the deprecation process by crying to the kernel log. Let's see
> whether there are relevant usecases and simply return to EINVAL in the
> second stage if nobody complains in few releases.
>
> Signed-off-by: Michal Hocko 

Reviewed-by: Shakeel Butt 

> ---
>  Documentation/admin-guide/cgroup-v1/memory.rst | 3 +++
>  mm/memcontrol.c| 3 +++
>  2 files changed, 6 insertions(+)
>
> diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst 
> b/Documentation/admin-guide/cgroup-v1/memory.rst
> index 41bdc038dad9..e53fc2f31549 100644
> --- a/Documentation/admin-guide/cgroup-v1/memory.rst
> +++ b/Documentation/admin-guide/cgroup-v1/memory.rst
> @@ -87,6 +87,9 @@ Brief summary of control files.
>  node
>
>   memory.kmem.limit_in_bytes  set/show hard limit for kernel memory
> + This knob is deprecated it shouldn't be
> + used. It is planned to be removed in
> + a foreseeable future.
>   memory.kmem.usage_in_bytes  show current kernel memory allocation
>   memory.kmem.failcnt show the number of kernel memory usage
>  hits limits
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e18108b2b786..113969bc57e8 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3518,6 +3518,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file 
> *of,
> ret = mem_cgroup_resize_max(memcg, nr_pages, true);
> break;
> case _KMEM:
> +   pr_warn_once("kmem.limit_in_bytes is deprecated and 
> will be removed. "
> +

[PATCH 07/11] KVM: x86/mmu: Revert "Revert "KVM: MMU: collapse TLB flushes when zap all pages""

Now that the fast invalidate mechanism has been reintroduced, restore
the performance tweaks for fast invalidation that existed prior to its
removal.

Paraphrashing the original changelog:

  Reload the mmu on all vCPUs after updating the generation number so
  that obsolete pages are not used by any vCPUs.  This allows collapsing
  all TLB flushes during obsolete page zapping into a single flush, as
  there is no need to flush when dropping mmu_lock (to reschedule).

  Note: a remote TLB flush is still needed before freeing the pages as
  other vCPUs may be doing a lockless shadow page walk.

Opportunstically improve the comments restored by the revert (the
code itself is a true revert).

This reverts commit f34d251d66ba263c077ed9d2bbd1874339a4c887.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 827414b12dbd..8c0648bbc7c1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5695,11 +5695,15 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
if (sp->role.invalid)
continue;
 
+   /*
+* No need to flush the TLB since we're only zapping shadow
+* pages with an obsolete generation number and all vCPUS have
+* loaded a new root, i.e. the shadow pages being zapped cannot
+* be in active use by the guest.
+*/
if (batch >= BATCH_ZAP_PAGES &&
-   (need_resched() || spin_needbreak(&kvm->mmu_lock))) {
+   cond_resched_lock(&kvm->mmu_lock)) {
batch = 0;
-   kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   cond_resched_lock(&kvm->mmu_lock);
goto restart;
}
 
@@ -5710,6 +5714,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
}
}
 
+   /*
+* Trigger a remote TLB flush before freeing the page tables to ensure
+* KVM is not in the middle of a lockless shadow page table walk, which
+* may reference the pages.
+*/
kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
@@ -5728,6 +5737,16 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
trace_kvm_mmu_zap_all_fast(kvm);
kvm->arch.mmu_valid_gen++;
 
+   /*
+* Notify all vcpus to reload its shadow page table and flush TLB.
+* Then all vcpus will switch to new shadow page table with the new
+* mmu_valid_gen.
+*
+* Note: we need to do this under the protection of mmu_lock,
+* otherwise, vcpu would purge shadow page but miss tlb flush.
+*/
+   kvm_reload_remote_mmus(kvm);
+
kvm_zap_obsolete_pages(kvm);
spin_unlock(&kvm->mmu_lock);
 }
-- 
2.22.0

[PATCH 10/11] KVM: x86/mmu: Explicitly track only a single invalid mmu generation

Toggle mmu_valid_gen between '0' and '1' instead of blindly incrementing
the generation.  Because slots_lock is held for the entire duration of
zapping obsolete pages, it's impossible for there to be multiple invalid
generations associated with shadow pages at any given time.

Toggling between the two generations (valid vs. invalid) allows changing
mmu_valid_gen from an unsigned long to a u8, which reduces the size of
struct kvm_mmu_page from 160 to 152 bytes on 64-bit KVM, i.e. reduces
KVM's memory footprint by 8 bytes per shadow page.

Set sp->mmu_valid_gen before it is added to active_mmu_pages.
Functionally this has no effect as kvm_mmu_alloc_page() has a single
caller that sets sp->mmu_valid_gen soon thereafter, but visually it is
jarring to see a shadow page being added to the list without its
mmu_valid_gen first being set.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  4 ++--
 arch/x86/kvm/mmu.c  | 14 --
 arch/x86/kvm/mmutrace.h | 16 
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6e4fa75351fd..8912b04d4ae1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -320,6 +320,7 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
bool unsync;
+   u8 mmu_valid_gen;
 
/*
 * The following two entries are used to key the shadow page in the
@@ -334,7 +335,6 @@ struct kvm_mmu_page {
int root_count;  /* Currently serving as active root */
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
-   unsigned long mmu_valid_gen;
DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -856,7 +856,7 @@ struct kvm_arch {
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
-   unsigned long mmu_valid_gen;
+   u8 mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bce19918ca5a..a7b14750cde9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2101,6 +2101,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct 
kvm_vcpu *vcpu, int direct
 * depends on valid pages being added to the head of the list.  See
 * comments in kvm_zap_obsolete_pages().
 */
+   sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
@@ -2537,7 +2538,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
if (level > PT_PAGE_TABLE_LEVEL && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
-   sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
 
@@ -5737,9 +5737,19 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
  */
 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
+   lockdep_assert_held(&kvm->slots_lock);
+
spin_lock(&kvm->mmu_lock);
trace_kvm_mmu_zap_all_fast(kvm);
-   kvm->arch.mmu_valid_gen++;
+
+   /*
+* Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
+* held for the entire duration of zapping obsolete pages, it's
+* impossible for there to be multiple invalid generations associated
+* with *valid* shadow pages at any given time, i.e. there is exactly
+* one valid generation and (at most) one invalid generation.
+*/
+   kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
 
/*
 * Notify all vcpus to reload its shadow page table and flush TLB.
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 1a063ba76281..7ca8831c7d1a 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -8,11 +8,11 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
-#define KVM_MMU_PAGE_FIELDS\
-   __field(unsigned long, mmu_valid_gen)   \
-   __field(__u64, gfn) \
-   __field(__u32, role)\
-   __field(__u32, root_count)  \
+#define KVM_MMU_PAGE_FIELDS\
+   __field(__u8, mmu_valid_gen)\
+   __field(__u64, gfn) \
+   __field(__u32, role)\
+   __field(__u32, root_count)  \
__field(bool, unsync)
 
 #define KVM_MMU_PAGE_ASSIGN(sp)\
@@ -31,7 +31,7 @@
\
role.word = __entry->role;  \

Re: [PATCH V3 2/5] input: keyboard: imx_sc: Add i.MX system controller key support

Hi Anson,

On Tue, Sep 03, 2019 at 05:36:37PM -0400, Anson Huang wrote:
> i.MX8QXP is an ARMv8 SoC which has a Cortex-M4 system controller
> inside, the system controller is in charge of controlling power,
> clock and scu key etc..
> 
> Adds i.MX system controller key driver support, Linux kernel has
> to communicate with system controller via MU (message unit) IPC
> to get scu key's status.
> 
> Signed-off-by: Anson Huang 
> ---
> Changes since V2:
>   - use private platform data instead of global data;
>   - use "key" instead of "pwrkey";
>   - fix some data format.
> ---
>  drivers/input/keyboard/Kconfig  |   7 ++
>  drivers/input/keyboard/Makefile |   1 +
>  drivers/input/keyboard/imx_sc_key.c | 178 
> 
>  3 files changed, 186 insertions(+)
>  create mode 100644 drivers/input/keyboard/imx_sc_key.c
> 
> diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
> index 2e6d288..607acf2 100644
> --- a/drivers/input/keyboard/Kconfig
> +++ b/drivers/input/keyboard/Kconfig
> @@ -469,6 +469,13 @@ config KEYBOARD_IMX
> To compile this driver as a module, choose M here: the
> module will be called imx_keypad.
>  
> +config KEYBOARD_IMX_SC_KEY
> + tristate "IMX SCU Key Driver"
> + depends on IMX_SCU
> + help
> +   This is the system controller key driver for NXP i.MX SoCs with
> +   system controller inside.
> +
>  config KEYBOARD_NEWTON
>   tristate "Newton keyboard"
>   select SERIO
> diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile
> index 9510325..f5b1752 100644
> --- a/drivers/input/keyboard/Makefile
> +++ b/drivers/input/keyboard/Makefile
> @@ -29,6 +29,7 @@ obj-$(CONFIG_KEYBOARD_HIL)  += hil_kbd.o
>  obj-$(CONFIG_KEYBOARD_HIL_OLD)   += hilkbd.o
>  obj-$(CONFIG_KEYBOARD_IPAQ_MICRO)+= ipaq-micro-keys.o
>  obj-$(CONFIG_KEYBOARD_IMX)   += imx_keypad.o
> +obj-$(CONFIG_KEYBOARD_IMX_SC_KEY)+= imx_sc_key.o
>  obj-$(CONFIG_KEYBOARD_HP6XX) += jornada680_kbd.o
>  obj-$(CONFIG_KEYBOARD_HP7XX) += jornada720_kbd.o
>  obj-$(CONFIG_KEYBOARD_LKKBD) += lkkbd.o
> diff --git a/drivers/input/keyboard/imx_sc_key.c 
> b/drivers/input/keyboard/imx_sc_key.c
> new file mode 100644
> index 000..e69479b
> --- /dev/null
> +++ b/drivers/input/keyboard/imx_sc_key.c
> @@ -0,0 +1,178 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2019 NXP.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define DEBOUNCE_TIME100
> +#define REPEAT_INTERVAL  60
> +
> +#define SC_IRQ_BUTTON1
> +#define SC_IRQ_GROUP_WAKE3
> +#define IMX_SC_MISC_FUNC_GET_BUTTON_STATUS   18
> +
> +struct imx_key_drv_data {
> + int keycode;
> + bool keystate;  /* 1: pressed, 0: release */
> + bool delay_check;
> + struct delayed_work check_work;
> + struct input_dev *input;
> + struct imx_sc_ipc *key_ipc_handle;
> + struct notifier_block key_notifier;
> +};
> +
> +struct imx_sc_msg_key {
> + struct imx_sc_rpc_msg hdr;
> + u8 state;
> +};
> +
> +static int imx_sc_key_notify(struct notifier_block *nb,
> +  unsigned long event, void *group)
> +{
> + struct imx_key_drv_data *priv =
> +  container_of(nb,
> +   struct imx_key_drv_data,
> +   key_notifier);
> +
> + if ((event & SC_IRQ_BUTTON) && (*(u8 *)group == SC_IRQ_GROUP_WAKE)
> + && !priv->delay_check) {
> + priv->delay_check = 1;
> + schedule_delayed_work(&priv->check_work,
> +   msecs_to_jiffies(REPEAT_INTERVAL));
> + }
> +
> + return 0;
> +}
> +
> +static void imx_sc_check_for_events(struct work_struct *work)
> +{
> + struct imx_key_drv_data *priv =
> +  container_of(work,
> +   struct imx_key_drv_data,
> +   check_work.work);
> + struct input_dev *input = priv->input;
> + struct imx_sc_msg_key msg;
> + struct imx_sc_rpc_msg *hdr = &msg.hdr;
> + bool state;
> + int ret;
> +
> + hdr->ver = IMX_SC_RPC_VERSION;
> + hdr->svc = IMX_SC_RPC_SVC_MISC;
> + hdr->func = IMX_SC_MISC_FUNC_GET_BUTTON_STATUS;
> + hdr->size = 1;
> +
> + ret = imx_scu_call_rpc(priv->key_ipc_handle, &msg, true);
> + if (ret) {
> + dev_err(&input->dev, "read imx sc key failed, ret %d\n", ret);
> + return;
> + }
> +
> + state = (bool)msg.state;
> +
> + if (!state && !priv->keystate)
> + state = true;
> +
> + if (state ^ priv->keystate) {
> + pm_wakeup_event(input->dev.parent, 0);
> + priv->keystate = state;
> +

Re: [PATCH] HID: hidraw: replace printk() with corresponding pr_xx() variant

Hi Rishi,

On Thu, Aug 22, 2019 at 10:13:52PM +0530, Rishi Gupta wrote:
> This commit replaces direct invocations of printk with
> their appropriate pr_info/warn() variant.
> 
> Signed-off-by: Rishi Gupta 
> ---
>  drivers/hid/hidraw.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c
> index 006bd6f..67b652b 100644
> --- a/drivers/hid/hidraw.c
> +++ b/drivers/hid/hidraw.c
> @@ -197,14 +197,14 @@ static ssize_t hidraw_get_report(struct file *file, 
> char __user *buffer, size_t
>   }
>  
>   if (count > HID_MAX_BUFFER_SIZE) {
> - printk(KERN_WARNING "hidraw: pid %d passed too large report\n",
> + pr_warn("hidraw: pid %d passed too large report\n",
>   task_pid_nr(current));

If you are doing this, you should also look into pr_fmt() so that we do
not need to manually add "hidraw: " prefix to the messages.

>   ret = -EINVAL;
>   goto out;
>   }
>  
>   if (count < 2) {
> - printk(KERN_WARNING "hidraw: pid %d passed too short report\n",
> + pr_warn("hidraw: pid %d passed too short report\n",
>   task_pid_nr(current));
>   ret = -EINVAL;
>   goto out;
> @@ -597,7 +597,7 @@ int __init hidraw_init(void)
>   if (result < 0)
>   goto error_class;
>  
> - printk(KERN_INFO "hidraw: raw HID events driver (C) Jiri Kosina\n");
> + pr_info("hidraw: raw HID events driver (C) Jiri Kosina\n");
>  out:
>   return result;
>  
> -- 
> 2.7.4
> 

Thanks.

-- 
Dmitry

[PATCH] clk: Make clk_bulk_get_all() return a valid "id"

2019-09-12 Thread Bjorn Andersson

The adreno driver expects the "id" field of the returned clk_bulk_data
to be filled in with strings from the clock-names property.

But due to the use of kmalloc_array() in of_clk_bulk_get_all() it
receives a list of bogus pointers instead.

Zero-initialize the "id" field and attempt to populate with strings from
the clock-names property to resolve both these issues.

Fixes: 616e45df7c4a ("clk: add new APIs to operate on all available clocks")
Fixes: 8e3e791d20d2 ("drm/msm: Use generic bulk clock function")
Cc: Dong Aisheng 
Cc: Jordan Crouse 
Signed-off-by: Bjorn Andersson 
---
 drivers/clk/clk-bulk.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/clk/clk-bulk.c b/drivers/clk/clk-bulk.c
index 524bf9a53098..e9e16425c739 100644
--- a/drivers/clk/clk-bulk.c
+++ b/drivers/clk/clk-bulk.c
@@ -18,10 +18,13 @@ static int __must_check of_clk_bulk_get(struct device_node 
*np, int num_clks,
int ret;
int i;
 
-   for (i = 0; i < num_clks; i++)
+   for (i = 0; i < num_clks; i++) {
+   clks[i].id = NULL;
clks[i].clk = NULL;
+   }
 
for (i = 0; i < num_clks; i++) {
+   of_property_read_string_index(np, "clock-names", i, 
&clks[i].id);
clks[i].clk = of_clk_get(np, i);
if (IS_ERR(clks[i].clk)) {
ret = PTR_ERR(clks[i].clk);
-- 
2.18.0

Re: [PATCH v3] input: keyboard: snvs_pwrkey: Send key events for i.MX6 S, DL and Q

Hi Robin,

On Wed, Sep 04, 2019 at 06:23:29AM +, Robin van der Gracht wrote:
> The first generation i.MX6 processors does not send an interrupt when the
> power key is pressed. It sends a power down request interrupt if the key is
> released before a hard shutdown (5 second press). This should allow
> software to bring down the SoC safely.
> 
> For this driver to work as a regular power key with the older SoCs, we need
> to send a keypress AND release when we get the power down request irq.
> 
> Signed-off-by: Robin van der Gracht 
> ---
> 
> Changes v2 -> v3:
>  - Drop alt compatible string for identifying first revision snvs hardware,
>read minor revision from register instead.
>  - Drop imx6qdl.dtsi modification and device-tree binding documentation.
>  - Add an additional input_sync() to create 2 seperate input reports for press
>and release.
> 
>  drivers/input/keyboard/Kconfig   |  2 +-
>  drivers/input/keyboard/snvs_pwrkey.c | 28 ++--
>  2 files changed, 27 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
> index 7c4f19dab34f..937e58da5ce1 100644
> --- a/drivers/input/keyboard/Kconfig
> +++ b/drivers/input/keyboard/Kconfig
> @@ -436,7 +436,7 @@ config KEYBOARD_SNVS_PWRKEY
>   depends on OF
>   help
> This is the snvs powerkey driver for the Freescale i.MX application
> -   processors that are newer than i.MX6 SX.
> +   processors.
>  
> To compile this driver as a module, choose M here; the
> module will be called snvs_pwrkey.
> diff --git a/drivers/input/keyboard/snvs_pwrkey.c 
> b/drivers/input/keyboard/snvs_pwrkey.c
> index 5342d8d45f81..828580eee0d2 100644
> --- a/drivers/input/keyboard/snvs_pwrkey.c
> +++ b/drivers/input/keyboard/snvs_pwrkey.c
> @@ -19,6 +19,7 @@
>  #include 
>  #include 
>  
> +#define SNVS_HPVIDR1_REG 0xF8
>  #define SNVS_LPSR_REG0x4C/* LP Status Register */
>  #define SNVS_LPCR_REG0x38/* LP Control Register */
>  #define SNVS_HPSR_REG0x14
> @@ -37,6 +38,7 @@ struct pwrkey_drv_data {
>   int wakeup;
>   struct timer_list check_timer;
>   struct input_dev *input;
> + u8 minor_rev;
>  };
>  
>  static void imx_imx_snvs_check_for_events(struct timer_list *t)
> @@ -45,6 +47,20 @@ static void imx_imx_snvs_check_for_events(struct 
> timer_list *t)
>   struct input_dev *input = pdata->input;
>   u32 state;
>  
> + if (pdata->minor_rev == 0) {
> + /*
> +  * The first generation i.MX6 SoCs only sends an interrupt on
> +  * button release. To mimic power-key usage, we'll prepend a
> +  * press event.
> +  */
> + input_report_key(input, pdata->keycode, 1);
> + input_sync(input);
> + input_report_key(input, pdata->keycode, 0);
> + input_sync(input);
> + pm_relax(input->dev.parent);
> + return;
> + }
> +
>   regmap_read(pdata->snvs, SNVS_HPSR_REG, &state);
>   state = state & SNVS_HPSR_BTN ? 1 : 0;
>  
> @@ -67,13 +83,17 @@ static irqreturn_t imx_snvs_pwrkey_interrupt(int irq, 
> void *dev_id)
>  {
>   struct platform_device *pdev = dev_id;
>   struct pwrkey_drv_data *pdata = platform_get_drvdata(pdev);
> + unsigned long expire = jiffies;
>   u32 lp_status;
>  
>   pm_wakeup_event(pdata->input->dev.parent, 0);
>  
>   regmap_read(pdata->snvs, SNVS_LPSR_REG, &lp_status);
> - if (lp_status & SNVS_LPSR_SPO)
> - mod_timer(&pdata->check_timer, jiffies + 
> msecs_to_jiffies(DEBOUNCE_TIME));
> + if (lp_status & SNVS_LPSR_SPO) {
> + if (pdata->minor_rev > 0)
> + expire = jiffies + msecs_to_jiffies(DEBOUNCE_TIME);
> + mod_timer(&pdata->check_timer, expire);

Why do we even need to fire the timer in case of the first generation
hardware? Just send press and release events directly from the ISR.

Thanks.

-- 
Dmitry

[PATCH v5 1/2] drivers: hv: vmbus: Introduce latency testing

2019-09-12 Thread Branden Bonaby

Introduce user specified latency in the packet reception path
By exposing the test parameters as part of the debugfs channel
attributes. We will control the testing state via these attributes.

Signed-off-by: Branden Bonaby 
---
changes in v5:
 - As per Stephen's suggestion, Moved CONFIG_HYPERV_TESTING
   to lib/Kconfig.debug.

 - Fixed build issue reported by Kbuild, with Michael's
   suggestion to make hv_debugfs part of the hv_vmbus
   module.

 - updated debugfs-hyperv to show kernel version 5.4

changes in v4:
 - Combined v3 patch 2 into this patch, and changed the
   commit description to reflect this.

 - Moved debugfs code from "vmbus_drv.c" that was in
   previous v3 patch 2, into a new file "debugfs.c" in
   drivers/hv.

 - Updated the Makefile to compile "debugfs.c" if
   CONFIG_HYPERV_TESTING is enabled

 - As per Michael's comments, added empty implementations
   of the new functions, so the compiler will not generate
   code when CONFIG_HYPERV_TESTING is not enabled.

 - Added microseconds into description for files in
   Documentation/ABI/testing/debugfs-hyperv.

Changes in v2:
 - Add #ifdef in Kconfig file so test code will not interfere
   with non-test code.
 - Move test code functions for delay to hyperv_vmbus header
   file.
 - Wrap test code under #ifdef statement.
 
Documentation/ABI/testing/debugfs-hyperv |  23 +++
 MAINTAINERS  |   1 +
 drivers/hv/Makefile  |   1 +
 drivers/hv/connection.c  |   1 +
 drivers/hv/hv_debugfs.c  | 185 +++
 drivers/hv/hyperv_vmbus.h|  31 
 drivers/hv/ring_buffer.c |   2 +
 drivers/hv/vmbus_drv.c   |   6 +
 include/linux/hyperv.h   |  19 +++
 lib/Kconfig.debug|   7 +
 10 files changed, 276 insertions(+)
 create mode 100644 Documentation/ABI/testing/debugfs-hyperv
 create mode 100644 drivers/hv/hv_debugfs.c

diff --git a/Documentation/ABI/testing/debugfs-hyperv 
b/Documentation/ABI/testing/debugfs-hyperv
new file mode 100644
index ..4427503ec762
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-hyperv
@@ -0,0 +1,23 @@
+What:   /sys/kernel/debug/hyperv//fuzz_test_state
+Date:   August 2019
+KernelVersion:  5.4
+Contact:Branden Bonaby 
+Description:Fuzz testing status of a vmbus device, whether its in an ON
+state or a OFF state
+Users:  Debugging tools
+
+What:   
/sys/kernel/debug/hyperv//delay/fuzz_test_buffer_interrupt_delay
+Date:   August 2019
+KernelVersion:  5.4
+Contact:Branden Bonaby 
+Description:Fuzz testing buffer interrupt delay value between 0 - 1000
+microseconds (inclusive).
+Users:  Debugging tools
+
+What:   /sys/kernel/debug/hyperv//delay/fuzz_test_message_delay
+Date:   August 2019
+KernelVersion:  5.4
+Contact:Branden Bonaby 
+Description:Fuzz testing message delay value between 0 - 1000 microseconds
+(inclusive).
+Users:  Debugging tools
diff --git a/MAINTAINERS b/MAINTAINERS
index e7a47b5210fd..00831931eb22 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7468,6 +7468,7 @@ F:include/uapi/linux/hyperv.h
 F: include/asm-generic/mshyperv.h
 F: tools/hv/
 F: Documentation/ABI/stable/sysfs-bus-vmbus
+F: Documentation/ABI/testing/debugfs-hyperv
 
 HYPERBUS SUPPORT
 M: Vignesh Raghavendra 
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index a1eec7177c2d..94daf8240c95 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -9,4 +9,5 @@ CFLAGS_hv_balloon.o = -I$(src)
 hv_vmbus-y := vmbus_drv.o \
 hv.o connection.o channel.o \
 channel_mgmt.o ring_buffer.o hv_trace.o
+hv_vmbus-$(CONFIG_HYPERV_TESTING)  += hv_debugfs.o
 hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 09829e15d4a0..4d4d40832846 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -357,6 +357,7 @@ void vmbus_on_event(unsigned long data)
 
trace_vmbus_on_event(channel);
 
+   hv_debug_delay_test(channel, INTERRUPT_DELAY);
do {
void (*callback_fn)(void *);
 
diff --git a/drivers/hv/hv_debugfs.c b/drivers/hv/hv_debugfs.c
new file mode 100644
index ..933080b51410
--- /dev/null
+++ b/drivers/hv/hv_debugfs.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ *   Branden Bonaby 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "hyperv_vmbus.h"
+
+struct dentry *hv_debug_root;
+
+static int hv_debugfs_delay_get(void *data, u64 *val)
+{
+   *val = *(u32 *)data;
+   return 0;
+}
+
+static int hv_debugfs_delay_set(void *data, u64 val)
+{
+   int ret = 0;
+
+   if (val >= 0 && val <= 1000)
+   *(u32 *)data = val;
+   else
+

[PATCH v5 2/2] tools: hv: add vmbus testing tool

2019-09-12 Thread Branden Bonaby

This is a userspace tool to drive the testing. Currently it supports
introducing user specified delay in the host to guest communication
path on a per-channel basis.

Signed-off-by: Branden Bonaby 
---
Changes in v4:
- Based on Harrys comments, made the tool more
  user friendly and added more error checking.

Changes in v3:
- Align python tool to match Linux coding style.

Changes in v2:
 - Move testing location to new location in debugfs.

 tools/hv/vmbus_testing | 376 +
 1 file changed, 376 insertions(+)
 create mode 100644 tools/hv/vmbus_testing

diff --git a/tools/hv/vmbus_testing b/tools/hv/vmbus_testing
new file mode 100644
index ..e7212903dd1d
--- /dev/null
+++ b/tools/hv/vmbus_testing
@@ -0,0 +1,376 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+#
+# Program to allow users to fuzz test Hyper-V drivers
+# by interfacing with Hyper-V debugfs attributes.
+# Current test methods available:
+#   1. delay testing
+#
+# Current file/directory structure of hyper-V debugfs:
+#   /sys/kernel/debug/hyperv/UUID
+#   /sys/kernel/debug/hyperv/UUID/
+#   /sys/kernel/debug/hyperv/UUID/
+#
+# author: Branden Bonaby 
+
+import os
+import cmd
+import argparse
+import glob
+from argparse import RawDescriptionHelpFormatter
+from argparse import RawTextHelpFormatter
+from enum import Enum
+
+# Do not change unless, you change the debugfs attributes
+# in /drivers/hv/debugfs.c. All fuzz testing
+# attributes will start with "fuzz_test".
+
+# debugfs path for hyperv must exist before proceeding
+debugfs_hyperv_path = "/sys/kernel/debug/hyperv"
+if not os.path.isdir(debugfs_hyperv_path):
+print("{} doesn't exist/check permissions".format(debugfs_hyperv_path))
+exit(-1)
+
+class dev_state(Enum):
+off = 0
+on = 1
+
+# File names, that correspond to the files created in
+# /drivers/hv/debugfs.c
+class f_names(Enum):
+state_f = "fuzz_test_state"
+buff_f =  "fuzz_test_buffer_interrupt_delay"
+mess_f =  "fuzz_test_message_delay"
+
+# Both single_actions and all_actions are used
+# for error checking and to allow for some subparser
+# names to be abbreviated. Do not abbreviate the
+# test method names, as it will become less intuitive
+# as to what the user can do. If you do decide to
+# abbreviate the test method name, make sure the main
+# function reflects this change.
+
+all_actions = [
+"disable_all",
+"D",
+"enable_all",
+"view_all",
+"V"
+]
+
+single_actions = [
+"disable_single",
+"d",
+"enable_single",
+"view_single",
+"v"
+]
+
+def main():
+
+file_map = recursive_file_lookup(debugfs_hyperv_path, dict())
+args = parse_args()
+if (not args.action):
+print ("Error, no options selected...exiting")
+exit(-1)
+arg_set = { k for (k,v) in vars(args).items() if v and k != "action" }
+arg_set.add(args.action)
+path = args.path if "path" in arg_set else None
+if (path and path[-1] == "/"):
+path = path[:-1]
+validate_args_path(path, arg_set, file_map)
+if (path and "enable_single" in arg_set):
+state_path = locate_state(path, file_map)
+set_test_state(state_path, dev_state.on.value, args.quiet)
+
+# Use subparsers as the key for different actions
+if ("delay" in arg_set):
+validate_delay_values(args.delay_time)
+if (args.enable_all):
+set_delay_all_devices(file_map, args.delay_time,
+  args.quiet)
+else:
+set_delay_values(path, file_map, args.delay_time,
+ args.quiet)
+elif ("disable_all" in arg_set or "D" in arg_set):
+disable_all_testing(file_map)
+elif ("disable_single" in arg_set or "d" in arg_set):
+disable_testing_single_device(path, file_map)
+elif ("view_all" in arg_set or "V" in arg_set):
+get_all_devices_test_status(file_map)
+elif ("view_single" in arg_set or  "v" in arg_set):
+get_device_test_values(path, file_map)
+
+# Get the state location
+def locate_state(device, file_map):
+return file_map[device][f_names.state_f.value]
+
+# Validate delay values to make sure they are acceptable to
+# enable delays on a device
+def validate_delay_values(delay):
+
+if (delay[0]  == -1 and delay[1] == -1):
+print("\nError, At least 1 value must be greater than 0")
+exit(-1)
+for i in delay:
+if (i < -1 or i == 0 or i > 1000):
+print("\nError, Values must be  equal to -1 "
+  "or be > 0 and <= 1000")
+exit(-1)
+
+# Validate argument path
+def

[PATCH v5 0/2] hv: vmbus: add fuzz testing to hv device

2019-09-12 Thread Branden Bonaby

This patchset introduces a testing framework for Hyper-V drivers.
This framework allows us to introduce delays in the packet receive
path on a per-device basis. While the current code only supports
introducing arbitrary delays in the host/guest communication path,
we intend to expand this to support error injection in the future.

changes in v5:
  patch 1:
As per Stephen's suggestion, Moved CONFIG_HYPERV_TESTING
to lib/Kconfig.debug.

Fixed build issue reported by Kbuild, with Michael's
suggestion to make hv_debugfs part of the hv_vmbus
module.

changes in v4:
  patch 1:
Combined previous v3 patches 1 and 2, into a single patch
which is now patch 1. This was done so that calls to
the new debugfs functions are in the same patch as
the definitions for these functions.

Moved debugfs code from "vmbus_drv.c" that was in
previous v3 patch 2, into a new file "debugfs.c" in
drivers/hv.

Updated the Makefile to compile "debugfs.c" if
CONFIG_HYPERV_TESTING is enabled

As per Michael's comments, added empty implementations
of the new functions, so the compiler will not generate
code when CONFIG_HYPERV_TESTING is not enabled.

  patch 2 (was previously v3 patch 3):
Based on Harrys comments, made the tool more
user friendly and added more error checking.

changes in v3:
  patch 2: change call to IS_ERR_OR_NULL, to IS_ERR.

  patch 3: Align python tool to match Linux coding style.

Changes in v2:
  Patch 1: As per Vitaly's suggestion, wrapped the test code under an
   #ifdef and updated the Kconfig file, so that the test code
   will only be used when the config option is set to true.
   (default is false).

   Updated hyperv_vmbus header to contain new #ifdef with new
   new functions for the test code.

  Patch 2: Moved code from under sysfs to debugfs and wrapped it under
   the new ifdef.

   Updated MAINTAINERS file with new debugfs-hyperv file under
   the section for hyperv.

  Patch 3: Updated testing tool with new debugfs location.

Branden Bonaby (2):
  drivers: hv: vmbus: Introduce latency testing
  tools: hv: add vmbus testing tool

 Documentation/ABI/testing/debugfs-hyperv |  23 ++
 MAINTAINERS  |   1 +
 drivers/hv/Makefile  |   1 +
 drivers/hv/connection.c  |   1 +
 drivers/hv/hv_debugfs.c  | 185 +++
 drivers/hv/hyperv_vmbus.h|  31 ++
 drivers/hv/ring_buffer.c |   2 +
 drivers/hv/vmbus_drv.c   |   6 +
 include/linux/hyperv.h   |  19 ++
 lib/Kconfig.debug|   7 +
 tools/hv/vmbus_testing   | 376 +++
 11 files changed, 652 insertions(+)
 create mode 100644 Documentation/ABI/testing/debugfs-hyperv
 create mode 100644 drivers/hv/hv_debugfs.c
 create mode 100644 tools/hv/vmbus_testing

-- 
2.17.1

Re: [Ksummit-discuss] [PATCH v2 3/3] libnvdimm, MAINTAINERS: Maintainer Entry Profile

2019-09-12 Thread Aneesh Kumar K.V


On 9/12/19 12:13 AM, Dan Carpenter wrote:

On Wed, Sep 11, 2019 at 08:48:59AM -0700, Dan Williams wrote:

+Coding Style Addendum
+-
+libnvdimm expects multi-line statements to be double indented. I.e.
+
+if (x...
+&& ...y) {


That looks horrible and it causes a checkpatch warning.  :(  Why not
do it the same way that everyone else does it.

if (blah_blah_x && <-- && has to be on the first line for checkpatch
blah_blah_y) { <-- [tab][space][space][space][space]blah

Now all the conditions are aligned visually which makes it readable.
They aren't aligned with the indent block so it's easy to tell the
inside from the if condition.



I came across this while sending patches to libnvdimm subsystem. W.r.t 
coding Style can we have consistent styles across the kernel? Otherwise, 
one would have to change the editor settings as they work across 
different subsystems in the kernel. In this specific case both 
clang-format and emacs customization tip in the kernel documentation 
directory suggest the later style.


-aneesh

Re: [PATCH v5 2/9] documention: leds: Add multicolor class documentation

2019-09-12 Thread Dan Murphy


Hello Pavel

Thanks for looking at this again

On 9/12/19 3:55 PM, Pavel Machek wrote:

Hi!


+Directory Layout Example
+
+root:/sys/class/leds/rgb:grouped_leds# ls -lR colors/
+colors/:
+drwxr-xr-x2 root root 0 Jun 28 20:21 blue
+drwxr-xr-x2 root root 0 Jun 28 20:21 green
+drwxr-xr-x2 root root 0 Jun 28 20:21 red
+-rw---1 root root  4096 Jun 28 20:21 color_mix
+
+colors/blue:
+-rw---1 root root  4096 Jun 28 20:21 intensity
+-r1 root root  4096 Jun 28 20:27 max_intensity
+-r1 root root  4096 Jun 28 20:21 color_id

I don't really like the directories... A bit too much complexity, and
it will have a memory footprint, too.


The directories should be fine to have I am not seeing the complexity. 
Is memory footprint really an issue? Maybe in the IoT space but this is 
small and memory footprint should be able to handle this for IoT and 
larger systems.


Having dedicated directories and files clears up issues for user space 
to know about the parameters for each LED especially with the color_mix 
file which I still am not a fan of, but conceded and implemented 
anyway.  It also gives the user space flexibility to call the monochrome 
LEDs specific intensity file.  The user space can either use the color 
intensity file or the color_mix file it is a choice for them to make.


This code was modeled off the LP50xx device which has individual LED 
intensity controls as well as a overall brightness control. Since we 
have no feedback from user space folks I feel we have to give some 
options not very many but some.




I'd expect max_intensity to be same for all the leds in
rgb:grouped_leds... Could we simply rely on max_brightness file?


I went under the assumption that not all grouped LEDs would have the 
same max_intensity.


I don't have specific use cases but wanted this as an option.

Dan


[If not, would one "max_intensity" file in rgb:grouped_leds be
enough?]

Best regards,
Pavel

Re: [PATCH 2/3] powperc/mm: read TLB Block Invalidate Characteristics

2019-09-12 Thread Aneesh Kumar K.V


On 9/13/19 12:56 AM, Laurent Dufour wrote:

Le 12/09/2019 à 16:44, Aneesh Kumar K.V a écrit :

Laurent Dufour  writes:



+
+    idx = 2;
+    while (idx < len) {
+    unsigned int block_size = local_buffer[idx++];
+    unsigned int npsize;
+
+    if (!block_size)
+    break;
+
+    block_size = 1 << block_size;
+    if (block_size != 8)
+    /* We only support 8 bytes size TLB invalidate buffer */
+    pr_warn("Unsupported H_BLOCK_REMOVE block size : %d\n",
+    block_size);


Should we skip setting block size if we find block_size != 8? Also can
we avoid doing that pr_warn in loop and only warn if we don't find
block_size 8 in the invalidate characteristics array?


My idea here is to fully read and process the data returned by the 
hcall, and to put the limitation to 8 when checking before calling 
H_BLOCK_REMOVE.

The warning is there because I want it to be displayed once at boot.




Can we have two block size reported for the same base page size/actual 
page size combination? If so we will overwrite the hblk[actual_psize] ?





+
+    for (npsize = local_buffer[idx++];  npsize > 0; npsize--)
+    check_lp_set_hblk((unsigned int) local_buffer[idx++],
+  block_size);
+    }
+
+    for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+    for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
+    if (mmu_psize_defs[bpsize].hblk[idx])
+    pr_info("H_BLOCK_REMOVE supports base psize:%d 
psize:%d block size:%d",

+    bpsize, idx,
+    mmu_psize_defs[bpsize].hblk[idx]);
+
+    return 0;
+}
+machine_arch_initcall(pseries, read_tlbbi_characteristics);
+
  /*
   * Take a spinlock around flushes to avoid bouncing the hypervisor 
tlbie

   * lock.


-aneesh

[PATCH v2] KVM: x86: Handle unexpected MMIO accesses using master abort semantics

Use master abort semantics, i.e. reads return all ones and writes are
dropped, to handle unexpected MMIO accesses when reading guest memory
instead of returning X86EMUL_IO_NEEDED, which in turn gets interpreted
as a guest page fault.

Emulation of certain instructions, notably VMX instructions, involves
reading or writing guest memory without going through the emulator.
These emulation flows are not equipped to handle MMIO accesses as no
sane and properly functioning guest kernel will target MMIO with such
instructions, and so simply inject a page fault in response to
X86EMUL_IO_NEEDED.

While not 100% correct, using master abort semantics is at least
sometimes correct, e.g. non-existent MMIO accesses do actually master
abort, whereas injecting a page fault is always wrong, i.e. the issue
lies in the physical address domain, not in the virtual to physical
translation.

Apply the logic to kvm_write_guest_virt_system() in addition to
replacing existing #PF logic in kvm_read_guest_virt(), as VMPTRST uses
the former, i.e. can also leak a host stack address.

Reported-by: Fuqian Huang 
Cc: sta...@vger.kernel.org
Signed-off-by: Sean Christopherson 
---

v2: Fix the comment for kvm_read_guest_virt_helper().

 arch/x86/kvm/x86.c | 40 +++-
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4cfd786d0b6..3da57f137470 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5234,16 +5234,24 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
   struct x86_exception *exception)
 {
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+   int r;
+
+   r = kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+  exception);
 
/*
-* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
-* is returned, but our callers are not ready for that and they blindly
-* call kvm_inject_page_fault.  Ensure that they at least do not leak
-* uninitialized kernel stack memory into cr2 and error code.
+* FIXME: this should technically call out to userspace to handle the
+* MMIO access, but our callers are not ready for that, so emulate
+* master abort behavior instead, i.e. reads return all ones.
 */
-   memset(exception, 0, sizeof(*exception));
-   return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
- exception);
+   if (r == X86EMUL_IO_NEEDED) {
+   memset(val, 0xff, bytes);
+   return 0;
+   }
+   if (r == X86EMUL_PROPAGATE_FAULT)
+   return -EFAULT;
+   WARN_ON_ONCE(r);
+   return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
@@ -5317,11 +5325,25 @@ static int emulator_write_std(struct x86_emulate_ctxt 
*ctxt, gva_t addr, void *v
 int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception 
*exception)
 {
+   int r;
+
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
 
-   return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
-  PFERR_WRITE_MASK, exception);
+   r = kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+   PFERR_WRITE_MASK, exception);
+
+   /*
+* FIXME: this should technically call out to userspace to handle the
+* MMIO access, but our callers are not ready for that, so emulate
+* master abort behavior instead, i.e. writes are dropped.
+*/
+   if (r == X86EMUL_IO_NEEDED)
+   return 0;
+   if (r == X86EMUL_PROPAGATE_FAULT)
+   return -EFAULT;
+   WARN_ON_ONCE(r);
+   return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
-- 
2.22.0

Re: [PATCH] KVM: x86: Handle unexpected MMIO accesses using master abort semantics

On Thu, Sep 12, 2019 at 04:56:03PM -0700, Sean Christopherson wrote:
> Use master abort semantics, i.e. reads return all ones and writes are
> dropped, to handle unexpected MMIO accesses when reading guest memory
> instead of returning X86EMUL_IO_NEEDED, which in turn gets interpreted
> as a guest page fault.
> 
> Emulation of certain instructions, notably VMX instructions, involves
> reading or writing guest memory without going through the emulator.
> These emulation flows are not equipped to handle MMIO accesses as no
> sane and properly functioning guest kernel will target MMIO with such
> instructions, and so simply inject a page fault in response to
> X86EMUL_IO_NEEDED.
> 
> While not 100% correct, using master abort semantics is at least
> sometimes correct, e.g. non-existent MMIO accesses do actually master
> abort, whereas injecting a page fault is always wrong, i.e. the issue
> lies in the physical address domain, not in the virtual to physical
> translation.
> 
> Apply the logic to kvm_write_guest_virt_system() in addition to
> replacing existing #PF logic in kvm_read_guest_virt(), as VMPTRST uses
> the former, i.e. can also leak a host stack address.
> 
> Reported-by: Fuqian Huang 
> Cc: sta...@vger.kernel.org
> Signed-off-by: Sean Christopherson 
> ---
>  arch/x86/kvm/x86.c | 40 +++-
>  1 file changed, 31 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b4cfd786d0b6..d1d7e9fac17a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5234,16 +5234,24 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
>  struct x86_exception *exception)
>  {
>   u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
> + int r;
> +
> + r = kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
> +exception);
>  
>   /*
> -  * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
> -  * is returned, but our callers are not ready for that and they blindly
> -  * call kvm_inject_page_fault.  Ensure that they at least do not leak
> -  * uninitialized kernel stack memory into cr2 and error code.
> +  * FIXME: this should technically call out to userspace to handle the
> +  * MMIO access, but our callers are not ready for that, so emulate
> +  * master abort behavior instead, i.e. writes are dropped.

Dagnabbit, fixed this to make it 'reads return all ones' and forgot to
commit..  v2 on its way.

>*/
> - memset(exception, 0, sizeof(*exception));
> - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
> -   exception);
> + if (r == X86EMUL_IO_NEEDED) {
> + memset(val, 0xff, bytes);
> + return 0;
> + }
> + if (r == X86EMUL_PROPAGATE_FAULT)
> + return -EFAULT;
> + WARN_ON_ONCE(r);
> + return 0;
>  }
>  EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
>  
> @@ -5317,11 +5325,25 @@ static int emulator_write_std(struct x86_emulate_ctxt 
> *ctxt, gva_t addr, void *v
>  int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
>   unsigned int bytes, struct x86_exception 
> *exception)
>  {
> + int r;
> +
>   /* kvm_write_guest_virt_system can pull in tons of pages. */
>   vcpu->arch.l1tf_flush_l1d = true;
>  
> - return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
> -PFERR_WRITE_MASK, exception);
> + r = kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
> + PFERR_WRITE_MASK, exception);
> +
> + /*
> +  * FIXME: this should technically call out to userspace to handle the
> +  * MMIO access, but our callers are not ready for that, so emulate
> +  * master abort behavior instead, i.e. writes are dropped.
> +  */
> + if (r == X86EMUL_IO_NEEDED)
> + return 0;
> + if (r == X86EMUL_PROPAGATE_FAULT)
> + return -EFAULT;
> + WARN_ON_ONCE(r);
> + return 0;
>  }
>  EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
>  
> -- 
> 2.22.0
>

Re: [PATCH 0/5] hugetlbfs: Disable PMD sharing for large systems

2019-09-12 Thread Dave Chinner

On Wed, Sep 11, 2019 at 04:05:32PM +0100, Waiman Long wrote:
> A customer with large SMP systems (up to 16 sockets) with application
> that uses large amount of static hugepages (~500-1500GB) are experiencing
> random multisecond delays. These delays was caused by the long time it
> took to scan the VMA interval tree with mmap_sem held.
> 
> To fix this problem while perserving existing behavior as much as
> possible, we need to allow timeout in down_write() and disabling PMD
> sharing when it is taking too long to do so. Since a transaction can
> involving touching multiple huge pages, timing out for each of the huge
> page interactions does not completely solve the problem. So a threshold
> is set to completely disable PMD sharing if too many timeouts happen.
> 
> The first 4 patches of this 5-patch series adds a new
> down_write_timedlock() API which accepts a timeout argument and return
> true is locking is successful or false otherwise. It works more or less
> than a down_write_trylock() but the calling thread may sleep.

Just on general principle, this is a non-starter. If a lock is being
held too long, then whatever the lock is protecting needs fixing.
Adding timeouts to locks and sysctls to tune them is not a viable
solution to address latencies caused by algorithm scalability
issues.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com

Re: problem starting /sbin/init (32-bit 5.3-rc8)

2019-09-12 Thread Kees Cook

On Thu, Sep 12, 2019 at 05:16:02PM -0700, Kees Cook wrote:
> On Thu, Sep 12, 2019 at 02:40:19PM -0700, Randy Dunlap wrote:
> > This is 32-bit kernel, just happens to be running on a 64-bit laptop.
> > I added the debug printk in __phys_addr() just before "[cut here]".
> > 
> > CONFIG_HARDENED_USERCOPY=y
> 
> I can reproduce this under CONFIG_DEBUG_VIRTUAL=y, and it goes back
> to at least to v5.2. Booting with "hardened_usercopy=off" or without
> CONFIG_DEBUG_VIRTUAL makes this go away (since __phys_addr() doesn't
> get called):
> 
> __check_object_size+0xff/0x1b0:
> pfn_to_section_nr at include/linux/mmzone.h:1153
> (inlined by) __pfn_to_section at include/linux/mmzone.h:1291
> (inlined by) virt_to_head_page at include/linux/mm.h:729
> (inlined by) check_heap_object at mm/usercopy.c:230
> (inlined by) __check_object_size at mm/usercopy.c:280
> 
> Is virt_to_head_page() illegal to use under some recently new conditions?

This combination appears to be bugged since the original introduction
of hardened usercopy in v4.8. Is this an untested combination until
now? (I don't usually do tests with CONFIG_DEBUG_VIRTUAL, but I guess
I will from now on!)

Note from the future (i.e. the end of this email where I figure it out):
it turns out it's actually these three together:

CONFIG_HIGHMEM=y
CONFIG_DEBUG_VIRTUAL=y
CONFIG_HARDENED_USERCOPY=y

> 
> > The BUG is this line in arch/x86/mm/physaddr.c:
> > VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
> > It's line 83 in my source file only due to adding  and
> > a conditional pr_crit() call.

What exactly is this trying to test?

> > [   19.730409][T1] debug: unmapping init [mem 0xdc7bc000-0xdca30fff]
> > [   19.734289][T1] Write protecting kernel text and read-only data: 
> > 13888k
> > [   19.737675][T1] rodata_test: all tests were successful
> > [   19.740757][T1] Run /sbin/init as init process
> > [   19.792877][T1] __phys_addr: max_low_pfn=0x36ffe, x=0xff001ff1, 
> > phys_addr=0x3f001ff1

It seems like this address is way out of range of the physical memory.
That seems like it's vmalloc or something, but that was actually
explicitly tested for back in the v4.8 version (it became unneeded
later).

> > [   19.796561][T1] [ cut here ]
> > [   19.797501][T1] kernel BUG at ../arch/x86/mm/physaddr.c:83!
> > [   19.802799][T1] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
> > [   19.803782][T1] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc8 
> > #6
> > [   19.803782][T1] Hardware name: Dell Inc. Inspiron 1318   
> > /0C236D, BIOS A04 01/15/2009
> > [   19.803782][T1] EIP: __phys_addr+0xaf/0x100
> > [   19.803782][T1] Code: 85 c0 74 67 89 f7 c1 ef 0c 39 f8 73 2e 56 53 
> > 50 68 90 9f 1f dc 68 00 eb 45 dc e8 ec b3 09 00 83 c4 14 3b 3d 30 55 cf dc 
> > 76 11 <0f> 0b b8 7c 3b 5c dc e8 45 53 4c 00 90 8d 74 26 00 89 d8 e8 39 cd
> > [   19.803782][T1] EAX: 0044 EBX: ff001ff1 ECX:  EDX: 
> > db90a471
> > [   19.803782][T1] ESI: 3f001ff1 EDI: 0003f001 EBP: f41ddea0 ESP: 
> > f41dde90
> > [   19.803782][T1] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 
> > 00010216
> > [   19.803782][T1] CR0: 80050033 CR2: dc218544 CR3: 1ca39000 CR4: 
> > 000406d0
> > [   19.803782][T1] Call Trace:
> > [   19.803782][T1]  __check_object_size+0xaf/0x3c0
> > [   19.803782][T1]  ? __might_sleep+0x80/0xa0
> > [   19.803782][T1]  copy_strings+0x1c2/0x370

Oh, this is actually copying into a kmap() pointer due to the weird
stuff exec() does:

kaddr = kmap(kmapped_page);
...
if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {

> > [   19.803782][T1]  copy_strings_kernel+0x2b/0x40
> > 
> > Full boot log or kernel .config file are available if wanted.

Is kmap somewhere "unexpected" in this case? Ah-ha, yes, it seems it is.
There is even a helper to do the "right" thing as virt_to_page(). This
seems to be used very rarely in the kernel... is there a page type for
kmap pages? This seems like a hack, but it fixes it:


diff --git a/mm/usercopy.c b/mm/usercopy.c
index 98e924864554..5a14b80ad63e 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -11,6 +11,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -227,7 +228,7 @@ static inline void check_heap_object(const void *ptr, 
unsigned long n,
if (!virt_addr_valid(ptr))
return;
 
-   page = virt_to_head_page(ptr);
+   page = compound_head(kmap_to_page((void *)ptr));
 
if (PageSlab(page)) {
/* Check slab allocator for flags and size. */


What's the right way to "ignore" the kmap range? (i.e. it's not Slab, so
ignore it here: I can't find a page type nor a "is this kmap?" helper...)

-- 
Kees Cook

[RFC V1 0/7] Add support for a new IMS interrupt mechanism

Currently, MSI (Message signaled interrupts) and MSI-X are the de facto
standard for device interrupt mechanism. MSI-X supports up to 2048
interrupts per device while MSI supports 32, which seems more than enough
for current devices. However, the introduction of SIOV (Scalable IO
virtualization) shifts the creation of assignable virtual devices from
hardware to a more software assisted approach. This flexible composition
of direct assignable devices, a.k.a. assignable device interfaces (ADIs)
unchains hardware from costly PCI standard. Under SIOV, device resource
can now be mapped directly to a guest or other user space drivers for
near native DMA performance. To complete functionality of ADIs, a matching
interrupt resource must also be introduced which will be scalable.

Interrupt message storage (IMS) is conceived as a scalable albeit device
specific interrupt mechanism to meet such a demand. With IMS, there is
theoretically no upper bound on the number of interrupts which a device
can support. The size and location of IMS is device-specific; some devices
may implement IMS as on-device storage which are memory-mapped, others may
opt to implement IMS in system memory. IMS stores each interrupt message as
a DWORD size data payload and a 64-bit address(same as MSI-X). Access to
the IMS is through the host driver due to the non-architectural nature of
device IMS unlike the architectural MSI-X table which are accessed through
PCI drivers.

In this patchset, we introduce generic IMS APIs that fits the Linux IRQ
subsystem, supports IMS IRQ chip and domains that can be used by drivers
which are capable of generating IMS interrupts.

The IMS has been introduced as part of Intel's Scalable I/O virtualization
specification:
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification

This patchset is based on Linux 5.3-rc8.

Currently there is no device out in the market which supports SIOV (Hence no
device supports IMS).

This series is a basic patchset to get the ball rolling and receive some
inital comments. As per my discussion with Marc Zyngier and Thomas Gleixner
at the Linux Plumbers, I need to do the following:
1. Since a device can support MSI-X and IMS simultaneously, ensure proper
locking mechanism for the 'msi_list' in the device structure.
2. Introduce dynamic allocation of IMS vectors perhaps by using a group ID
3. IMS support of a device needs to be discoverable. A bit in the vendor
specific capability in the PCI config is to be added rather than getting
this information from each device driver.

Jason Gunthorpe of Mellanox technologies is looking to do something similar
on ARM platforms and was wondering why IMS is x86 sepcific. Perhaps we can
use this thread to discuss further on this.

Megha Dey (7):
genirq/msi: Differentiate between various MSI based interrupts
drivers/base: Introduce callbacks for IMS interrupt domain
x86/ims: Add support for a new IMS irq domain
irq_remapping: New interfaces to support IMS irqdomain
x86/ims: Introduce x86_ims_ops
ims-msi: Add APIs to allocate/free IMS interrupts
ims: Add the set_desc callback

--
2.7.4

[RFC V1 4/7] irq_remapping: New interfaces to support IMS irqdomain

Introduce new interfaces for interrupt remapping drivers to support
IMS irqdomains:

irq_remapping_get_ims_irq_domain(): get the IMS irqdomain for an IRQ
allocation. We must build one IMS irqdomain for each interrupt remapping
unit. The driver calls this interface to get the IMS irqdomain associated
with an IR irqdomain which manages the devices.

Architecture specific hooks:
arch_create_ims_irq_domain(): create an IMS irqdomain associated with the
interrupt remapping unit.

We also add following callback into struct irq_remap_ops:
struct irq_domain *(*get_ims_irq_domain)(struct irq_alloc_info *);

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 arch/x86/include/asm/irq_remapping.h | 13 +
 drivers/iommu/intel_irq_remapping.c  | 30 ++
 drivers/iommu/irq_remapping.c|  9 +
 drivers/iommu/irq_remapping.h|  3 +++
 include/linux/intel-iommu.h  |  1 +
 5 files changed, 56 insertions(+)

diff --git a/arch/x86/include/asm/irq_remapping.h 
b/arch/x86/include/asm/irq_remapping.h
index 4bc985f..a735507 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -48,11 +48,18 @@ extern struct irq_domain *
 irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info);
 extern struct irq_domain *
 irq_remapping_get_irq_domain(struct irq_alloc_info *info);
+extern struct irq_domain *
+irq_remapping_get_ims_irq_domain(struct irq_alloc_info *info);
 
 /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
 extern struct irq_domain *
 arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int 
id);
 
+/* Create IMS irqdomain, use @parent as the parent irqdomain. */
+#ifdef CONFIG_MSI_IMS
+extern struct irq_domain *arch_create_ims_irq_domain(struct irq_domain 
*parent);
+#endif
+
 /* Get parent irqdomain for interrupt remapping irqdomain */
 static inline struct irq_domain *arch_get_ir_parent_domain(void)
 {
@@ -85,5 +92,11 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info)
return NULL;
 }
 
+static inline struct irq_domain *
+irq_remapping_get_ims_irq_domain(struct irq_alloc_info *info)
+{
+   return NULL;
+}
+
 #endif /* CONFIG_IRQ_REMAP */
 #endif /* __X86_IRQ_REMAPPING_H */
diff --git a/drivers/iommu/intel_irq_remapping.c 
b/drivers/iommu/intel_irq_remapping.c
index 4786ca0..3c0c0cb 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -573,6 +573,10 @@ static int intel_setup_irq_remapping(struct intel_iommu 
*iommu)
 "INTEL-IR-MSI",
 iommu->seq_id);
 
+#ifdef CONFIG_MSI_IMS
+   iommu->ir_ims_domain = arch_create_ims_irq_domain(iommu->ir_domain);
+#endif
+
ir_table->base = page_address(pages);
ir_table->bitmap = bitmap;
iommu->ir_table = ir_table;
@@ -633,6 +637,10 @@ static void intel_teardown_irq_remapping(struct 
intel_iommu *iommu)
irq_domain_remove(iommu->ir_msi_domain);
iommu->ir_msi_domain = NULL;
}
+   if (iommu->ir_ims_domain) {
+   irq_domain_remove(iommu->ir_ims_domain);
+   iommu->ir_ims_domain = NULL;
+   }
if (iommu->ir_domain) {
irq_domain_remove(iommu->ir_domain);
iommu->ir_domain = NULL;
@@ -1139,6 +1147,27 @@ static struct irq_domain *intel_get_irq_domain(struct 
irq_alloc_info *info)
return NULL;
 }
 
+static struct irq_domain *intel_get_ims_irq_domain(struct irq_alloc_info *info)
+{
+   struct intel_iommu *iommu;
+
+   if (!info)
+   return NULL;
+
+   switch (info->type) {
+   case X86_IRQ_ALLOC_TYPE_MSI:
+   case X86_IRQ_ALLOC_TYPE_MSIX:
+   iommu = map_dev_to_ir(info->msi_dev);
+   if (iommu)
+   return iommu->ir_ims_domain;
+   break;
+   default:
+   break;
+   }
+
+   return NULL;
+}
+
 struct irq_remap_ops intel_irq_remap_ops = {
.prepare= intel_prepare_irq_remapping,
.enable = intel_enable_irq_remapping,
@@ -1147,6 +1176,7 @@ struct irq_remap_ops intel_irq_remap_ops = {
.enable_faulting= enable_drhd_fault_handling,
.get_ir_irq_domain  = intel_get_ir_irq_domain,
.get_irq_domain = intel_get_irq_domain,
+   .get_ims_irq_domain = intel_get_ims_irq_domain,
 };
 
 static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 83f36f6..c4352fc 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -193,3 +193,12 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info)
 
return remap_ops->get_irq_domain(info);
 }
+
+stru

[RFC V1 3/7] x86/ims: Add support for a new IMS irq domain

This patch adds support for the creation of a new IMS irq domain. It
creates a new irq_chip associated with the IMS domain and adds the
necessary domain operations to it.

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 arch/x86/include/asm/msi.h   |  4 ++
 arch/x86/kernel/apic/Makefile|  1 +
 arch/x86/kernel/apic/ims.c   | 93 
 arch/x86/kernel/apic/msi.c   |  4 +-
 drivers/vfio/mdev/mdev_core.c|  6 +++
 drivers/vfio/mdev/mdev_private.h |  1 -
 include/linux/mdev.h |  2 +
 7 files changed, 108 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kernel/apic/ims.c

diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index 25ddd09..51f9d25 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -11,4 +11,8 @@ int pci_msi_prepare(struct irq_domain *domain, struct device 
*dev, int nvec,
 
 void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc);
 
+struct msi_domain_info;
+
+irq_hw_number_t msi_get_hwirq(struct msi_domain_info *info,
+   msi_alloc_info_t *arg);
 #endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index a6fcaf16..75a2270 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -12,6 +12,7 @@ obj-y += hw_nmi.o
 
 obj-$(CONFIG_X86_IO_APIC)  += io_apic.o
 obj-$(CONFIG_PCI_MSI)  += msi.o
+obj-$(CONFIG_MSI_IMS)  += ims.o
 obj-$(CONFIG_SMP)  += ipi.o
 
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/apic/ims.c b/arch/x86/kernel/apic/ims.c
new file mode 100644
index 000..d9808a5
--- /dev/null
+++ b/arch/x86/kernel/apic/ims.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2019 Intel Corporation.
+ *
+ * Author: Megha Dey 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * Determine if a dev is mdev or not. Return NULL if not mdev device.
+ * Return mdev's parent dev if success.
+ */
+static inline struct device *mdev_to_parent(struct device *dev)
+{
+   struct device *ret = NULL;
+   struct device *(*fn)(struct device *dev);
+   struct bus_type *bus = symbol_get(mdev_bus_type);
+
+   if (bus && dev->bus == bus) {
+   fn = symbol_get(mdev_dev_to_parent_dev);
+   ret = fn(dev);
+   symbol_put(mdev_dev_to_parent_dev);
+   symbol_put(mdev_bus_type);
+   }
+
+   return ret;
+}
+
+static struct pci_dev *ims_get_pci_dev(struct device *dev)
+{
+   struct pci_dev *pdev;
+
+   if (dev_is_mdev(dev)) {
+   struct device *parent = mdev_to_parent(dev);
+
+   pdev = to_pci_dev(parent);
+   } else {
+   pdev = to_pci_dev(dev);
+   }
+
+   return pdev;
+}
+
+int dev_ims_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+   msi_alloc_info_t *arg)
+{
+   struct pci_dev *pdev = ims_get_pci_dev(dev);
+
+   init_irq_alloc_info(arg, NULL);
+   arg->msi_dev = pdev;
+   arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(dev_ims_prepare);
+
+#ifdef CONFIG_IRQ_REMAP
+
+static struct msi_domain_ops dev_ims_domain_ops = {
+   .get_hwirq  = msi_get_hwirq,
+   .msi_prepare= dev_ims_prepare,
+};
+
+static struct irq_chip dev_ims_ir_controller = {
+   .name   = "IR-DEV-IMS",
+   .irq_unmask = dev_ims_unmask_irq,
+   .irq_mask   = dev_ims_mask_irq,
+   .irq_ack= irq_chip_ack_parent,
+   .irq_retrigger  = irq_chip_retrigger_hierarchy,
+   .irq_set_vcpu_affinity  = irq_chip_set_vcpu_affinity_parent,
+   .flags  = IRQCHIP_SKIP_SET_WAKE,
+   .irq_write_msi_msg  = dev_ims_write_msg,
+};
+
+static struct msi_domain_info ims_ir_domain_info = {
+   .flags  = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
+   .ops= &dev_ims_domain_ops,
+   .chip   = &dev_ims_ir_controller,
+   .handler= handle_edge_irq,
+   .handler_name   = "edge",
+};
+
+struct irq_domain *arch_create_ims_irq_domain(struct irq_domain *parent)
+{
+   return pci_msi_create_irq_domain(NULL, &ims_ir_domain_info, parent);
+}
+
+#endif
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 435bcda..65da813 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -84,7 +84,7 @@ void native_teardown_msi_irq(unsigned int irq)
irq_domain_free_irqs(irq, 1);
 }
 
-static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
+irq_hw_number_t msi_get_hwirq(struct msi_domain_info *info,
 msi_alloc_info_t *arg)
 {
return arg->msi_hwirq;
@@ -116,7

[RFC V1 5/7] x86/ims: Introduce x86_ims_ops

This patch introduces an x86 specific indirect mechanism to setup the
interrupt message storage. The IMS specific functions (setup, teardown,
restore) become function pointers in an x86_ims_ops struct, that
defaults to their implementations in ims.c and ims-msi.c.

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 arch/x86/include/asm/pci.h  |  4 
 arch/x86/include/asm/x86_init.h | 10 ++
 arch/x86/kernel/apic/ims.c  | 18 ++
 arch/x86/kernel/x86_init.c  | 23 +++
 drivers/base/ims-msi.c  | 34 ++
 include/linux/msi.h |  6 ++
 6 files changed, 95 insertions(+)

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index e662f98..2ef513f 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -114,6 +114,10 @@ struct msi_desc;
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void native_teardown_msi_irq(unsigned int irq);
 void native_restore_msi_irqs(struct pci_dev *dev);
+#ifdef CONFIG_MSI_IMS
+int native_setup_ims_irqs(struct device *dev, int nvec);
+#endif
+
 #else
 #define native_setup_msi_irqs  NULL
 #define native_teardown_msi_irqNULL
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index ac09341..9c2cbbb 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -287,6 +287,15 @@ struct x86_msi_ops {
void (*restore_msi_irqs)(struct pci_dev *dev);
 };
 
+struct device;
+
+struct x86_ims_ops {
+   int (*setup_ims_irqs)(struct device *dev, int nvec);
+   void (*teardown_ims_irq)(unsigned int irq);
+   void (*teardown_ims_irqs)(struct device *dev);
+   void (*restore_ims_irqs)(struct device *dev);
+};
+
 struct x86_apic_ops {
unsigned int(*io_apic_read)   (unsigned int apic, unsigned int reg);
void(*restore)(void);
@@ -297,6 +306,7 @@ extern struct x86_cpuinit_ops x86_cpuinit;
 extern struct x86_platform_ops x86_platform;
 extern struct x86_msi_ops x86_msi;
 extern struct x86_apic_ops x86_apic_ops;
+extern struct x86_ims_ops x86_ims;
 
 extern void x86_early_init_platform_quirks(void);
 extern void x86_init_noop(void);
diff --git a/arch/x86/kernel/apic/ims.c b/arch/x86/kernel/apic/ims.c
index d9808a5..a539666 100644
--- a/arch/x86/kernel/apic/ims.c
+++ b/arch/x86/kernel/apic/ims.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Determine if a dev is mdev or not. Return NULL if not mdev device.
@@ -45,6 +46,23 @@ static struct pci_dev *ims_get_pci_dev(struct device *dev)
return pdev;
 }
 
+int native_setup_ims_irqs(struct device *dev, int nvec)
+{
+   struct irq_domain *domain;
+   struct irq_alloc_info info;
+   struct pci_dev *pdev = ims_get_pci_dev(dev);
+
+   init_irq_alloc_info(&info, NULL);
+   info.type = X86_IRQ_ALLOC_TYPE_MSIX;
+   info.msi_dev = pdev;
+
+   domain = irq_remapping_get_ims_irq_domain(&info);
+   if (!domain)
+   return -ENOSYS;
+
+   return msi_domain_alloc_irqs(domain, dev, nvec);
+}
+
 int dev_ims_prepare(struct irq_domain *domain, struct device *dev, int nvec,
msi_alloc_info_t *arg)
 {
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 1bef687..3ce42d4 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -153,6 +153,29 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
 }
 #endif
 
+#if defined(CONFIG_MSI_IMS)
+struct x86_ims_ops x86_ims __ro_after_init = {
+   .setup_ims_irqs = native_setup_ims_irqs,
+   .teardown_ims_irqs  = dev_ims_teardown_irqs,
+   .restore_ims_irqs   = dev_ims_restore_irqs,
+};
+
+int arch_setup_ims_irqs(struct device *dev, int nvec)
+{
+   return x86_ims.setup_ims_irqs(dev, nvec);
+}
+
+void arch_teardown_ims_irqs(struct device *dev)
+{
+   x86_ims.teardown_ims_irqs(dev);
+}
+
+void arch_restore_ims_irqs(struct device *dev)
+{
+   x86_ims.restore_ims_irqs(dev);
+}
+#endif
+
 struct x86_apic_ops x86_apic_ops __ro_after_init = {
.io_apic_read   = native_io_apic_read,
.restore= native_restore_boot_irq_mode,
diff --git a/drivers/base/ims-msi.c b/drivers/base/ims-msi.c
index 68dc10f..df28ee2 100644
--- a/drivers/base/ims-msi.c
+++ b/drivers/base/ims-msi.c
@@ -92,3 +92,37 @@ void dev_ims_write_msg(struct irq_data *data, struct msi_msg 
*msg)
__dev_write_ims_msg(desc, msg);
 }
 EXPORT_SYMBOL_GPL(dev_ims_write_msg);
+
+void dev_ims_teardown_irqs(struct device *dev)
+{
+   struct msi_desc *entry;
+
+   for_each_msi_entry(entry, dev)
+   if (entry->irq && entry->tag == IRQ_MSI_TAG_IMS)
+   arch_teardown_msi_irq(entry->irq);
+}
+
+static void dev_ims_restore_irq(struct device *dev, int irq)
+{
+   struct msi_desc *entry = NULL;
+   struct dev_ims_ops *ops;
+
+   for_each_msi_e

[RFC V1 2/7] drivers/base: Introduce callbacks for IMS interrupt domain

This patch serves as a preparatory patch to introduce a new IMS
(Interrupt Message Store) domain. It consists of APIs which would
be used as callbacks to the IRQ chip associated with the IMS domain.

The APIs introduced in this patch are:
dev_ims_mask_irq - Generic irq chip callback to mask IMS interrupts
dev_ims_unmask_irq - Generic irq chip callback to unmask IMS interrupts
dev_ims_domain_write_msg - Helper to write MSI message to Device IMS

It also introduces IMS specific structures namely:
dev_ims_ops - Callbacks for IMS domain ops
dev_ims_desc - Device specific IMS msi descriptor data
dev_ims_priv_data - Internal data structure containing a unique devid
and a pointer to the IMS domain ops

Lastly, it adds a new config option MSI_IMS which must be enabled by
any driver who would want to use the IMS infrastructure.

Since IMS is not PCI compliant (like platform-msi), most of the code is
similar to platform-msi.c.

TODO: Conclude if ims-msi.c and platform-msi.c can be merged.

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 drivers/base/Kconfig   |  7 
 drivers/base/Makefile  |  1 +
 drivers/base/ims-msi.c | 94 ++
 include/linux/msi.h| 35 ++-
 4 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 drivers/base/ims-msi.c

diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index dc40449..038fabd 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -206,3 +206,10 @@ config GENERIC_ARCH_TOPOLOGY
  runtime.
 
 endmenu
+
+config MSI_IMS
+   bool "Device Specific Interrupt Message Storage (IMS)"
+   select GENERIC_MSI_IRQ
+   help
+ This allows device drivers to enable device specific
+ interrupt message storage (IMS) besides standard MSI-X interrupts.
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 1574520..659b9b0 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_SOC_BUS) += soc.o
 obj-$(CONFIG_PINCTRL) += pinctrl.o
 obj-$(CONFIG_DEV_COREDUMP) += devcoredump.o
 obj-$(CONFIG_GENERIC_MSI_IRQ_DOMAIN) += platform-msi.o
+obj-$(CONFIG_MSI_IMS) += ims-msi.o
 obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += arch_topology.o
 
 obj-y  += test/
diff --git a/drivers/base/ims-msi.c b/drivers/base/ims-msi.c
new file mode 100644
index 000..68dc10f
--- /dev/null
+++ b/drivers/base/ims-msi.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2019 Intel Corporation.
+ *
+ * Author: Megha Dey 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct dev_ims_priv_data {
+   struct device   *dev;
+   msi_alloc_info_targ;
+   int devid;
+   struct dev_ims_ops  *ims_ops;
+};
+
+u32 __dev_ims_desc_mask_irq(struct msi_desc *desc, u32 flag)
+{
+   u32 mask_bits = desc->dev_ims.masked;
+   struct dev_ims_ops *ops;
+
+   ops = desc->dev_ims.priv->ims_ops;
+   if (!ops)
+   return 0;
+
+   if (flag) {
+   if (ops->irq_mask)
+   mask_bits = ops->irq_mask(desc);
+   } else {
+   if (ops->irq_unmask)
+   mask_bits = ops->irq_unmask(desc);
+   }
+
+   return mask_bits;
+}
+
+static void ims_mask_irq(struct msi_desc *desc, u32 flag)
+{
+   desc->dev_ims.masked = __dev_ims_desc_mask_irq(desc, flag);
+}
+
+static void ims_set_mask_bit(struct irq_data *data, u32 flag)
+{
+   struct msi_desc *desc = irq_data_get_msi_desc(data);
+
+   ims_mask_irq(desc, flag);
+}
+
+static void __dev_write_ims_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+   struct dev_ims_ops *ops;
+
+   ops = desc->dev_ims.priv->ims_ops;
+   if (ops && ops->irq_write_msi_msg)
+   ops->irq_write_msi_msg(desc, msg);
+
+   desc->msg = *msg;
+}
+
+/**
+ * dev_ims_mask_irq - Generic irq chip callback to mask IMS interrupts
+ * @data: pointer to irqdata associated to that interrupt
+ */
+void dev_ims_mask_irq(struct irq_data *data)
+{
+   ims_set_mask_bit(data, 1);
+}
+EXPORT_SYMBOL_GPL(dev_ims_mask_irq);
+
+/**
+ * dev_msi_unmask_irq - Generic irq chip callback to unmask IMS interrupts
+ * @data: pointer to irqdata associated to that interrupt
+ */
+void dev_ims_unmask_irq(struct irq_data *data)
+{
+   ims_set_mask_bit(data, 0);
+}
+EXPORT_SYMBOL_GPL(dev_ims_unmask_irq);
+
+/**
+ * dev_ims_write_msg - Helper to write MSI message to Device IMS
+ * @irq_data: Pointer to interrupt data of the MSI interrupt
+ * @msg:  Pointer to the message
+ */
+void dev_ims_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+   struct msi_desc *desc = irq_data_get_msi_desc(data);
+
+   __dev_write_ims_msg(desc, msg);
+}
+EXPORT_SYMBOL_GPL(dev_ims_write_msg);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 22591b6..246285a 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -17,6 +17

[RFC V1 7/7] ims: Add the set_desc callback

Add the set_desc callback to the ims domain ops.

The set_desc callback is used to find a unique hwirq number from a given
domain.

Each mdev can have a maximum of 2048 IMS interrupts.

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 arch/x86/kernel/apic/ims.c | 7 +++
 drivers/base/ims-msi.c | 9 +
 include/linux/msi.h| 1 +
 3 files changed, 17 insertions(+)

diff --git a/arch/x86/kernel/apic/ims.c b/arch/x86/kernel/apic/ims.c
index a539666..7e36571 100644
--- a/arch/x86/kernel/apic/ims.c
+++ b/arch/x86/kernel/apic/ims.c
@@ -76,11 +76,18 @@ int dev_ims_prepare(struct irq_domain *domain, struct 
device *dev, int nvec,
 }
 EXPORT_SYMBOL_GPL(dev_ims_prepare);
 
+void dev_ims_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+   arg->msi_hwirq = dev_ims_calc_hwirq(desc);
+}
+EXPORT_SYMBOL_GPL(dev_ims_set_desc);
+
 #ifdef CONFIG_IRQ_REMAP
 
 static struct msi_domain_ops dev_ims_domain_ops = {
.get_hwirq  = msi_get_hwirq,
.msi_prepare= dev_ims_prepare,
+   .set_desc   = dev_ims_set_desc,
 };
 
 static struct irq_chip dev_ims_ir_controller = {
diff --git a/drivers/base/ims-msi.c b/drivers/base/ims-msi.c
index 3e579c9..48f3d24 100644
--- a/drivers/base/ims-msi.c
+++ b/drivers/base/ims-msi.c
@@ -22,6 +22,15 @@ struct dev_ims_priv_data {
 
 static DEFINE_IDA(dev_ims_devid_ida);
 
+irq_hw_number_t dev_ims_calc_hwirq(struct msi_desc *desc)
+{
+   u32 devid;
+
+   devid = desc->dev_ims.priv->devid;
+
+   return (devid << (32 - DEVIMS_ID_SHIFT)) | desc->dev_ims.ims_index;
+}
+
 u32 __dev_ims_desc_mask_irq(struct msi_desc *desc, u32 flag)
 {
u32 mask_bits = desc->dev_ims.masked;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 4543bbf..fe4678e 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -237,6 +237,7 @@ void dev_ims_teardown_irqs(struct device *dev);
 void dev_ims_restore_irqs(struct device *dev);
 int dev_ims_alloc_irqs(struct device *dev, int nvec, struct dev_ims_ops *ops);
 void dev_ims_free_irqs(struct device *dev);
+irq_hw_number_t dev_ims_calc_hwirq(struct msi_desc *desc);
 
 /*
  * The arch hooks to setup up msi irqs. Those functions are
-- 
2.7.4

[RFC V1 1/7] genirq/msi: Differentiate between various MSI based interrupts

Since a device can support both MSI-X and IMS interrupts simultaneously,
do away with is_msix and introduce a new enum msi_desc_tag to
differentiate between the various types of msi_descs.

Signed-off-by: Megha Dey 
---
 arch/mips/pci/msi-xlp.c|  2 +-
 arch/s390/pci/pci_irq.c|  2 +-
 arch/x86/kernel/apic/msi.c |  2 +-
 arch/x86/pci/xen.c |  2 +-
 drivers/pci/msi.c  | 19 ++-
 include/linux/msi.h| 11 ++-
 kernel/irq/msi.c   |  2 +-
 7 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/arch/mips/pci/msi-xlp.c b/arch/mips/pci/msi-xlp.c
index bb14335..0f06ad1 100644
--- a/arch/mips/pci/msi-xlp.c
+++ b/arch/mips/pci/msi-xlp.c
@@ -457,7 +457,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc 
*desc)
node = slot / 8;
lnkbase = nlm_get_pcie_base(node, link);
 
-   if (desc->msi_attrib.is_msix)
+   if (desc->tag == IRQ_MSI_TAG_MSIX)
return xlp_setup_msix(lnkbase, node, link, desc);
else
return xlp_setup_msi(lnkbase, node, link, desc);
diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
index d80616a..1938582 100644
--- a/arch/s390/pci/pci_irq.c
+++ b/arch/s390/pci/pci_irq.c
@@ -332,7 +332,7 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev)
for_each_pci_msi_entry(msi, pdev) {
if (!msi->irq)
continue;
-   if (msi->msi_attrib.is_msix)
+   if (msi->tag == IRQ_MSI_TAG_MSIX)
__pci_msix_desc_mask_irq(msi, 1);
else
__pci_msi_desc_mask_irq(msi, 1, 1);
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 7f75334..435bcda 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -98,7 +98,7 @@ int pci_msi_prepare(struct irq_domain *domain, struct device 
*dev, int nvec,
 
init_irq_alloc_info(arg, NULL);
arg->msi_dev = pdev;
-   if (desc->msi_attrib.is_msix) {
+   if (desc->tag == IRQ_MSI_TAG_MSIX) {
arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
} else {
arg->type = X86_IRQ_ALLOC_TYPE_MSI;
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 91220cc..5e850b8 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -382,7 +382,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
struct msi_desc *msidesc;
 
msidesc = first_pci_msi_entry(dev);
-   if (msidesc->msi_attrib.is_msix)
+   if (msidesc->tag == IRQ_MSI_TAG_MSIX)
xen_pci_frontend_disable_msix(dev);
else
xen_pci_frontend_disable_msi(dev);
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 0884bed..8a05416 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -235,7 +235,7 @@ static void msi_set_mask_bit(struct irq_data *data, u32 
flag)
 {
struct msi_desc *desc = irq_data_get_msi_desc(data);
 
-   if (desc->msi_attrib.is_msix) {
+   if (desc->tag == IRQ_MSI_TAG_MSIX) {
msix_mask_irq(desc, flag);
readl(desc->mask_base); /* Flush write to device */
} else {
@@ -278,7 +278,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct 
msi_msg *msg)
 
BUG_ON(dev->current_state != PCI_D0);
 
-   if (entry->msi_attrib.is_msix) {
+   if (entry->tag == IRQ_MSI_TAG_MSIX) {
void __iomem *base = pci_msix_desc_addr(entry);
 
if (!base) {
@@ -313,7 +313,7 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct 
msi_msg *msg)
 
if (dev->current_state != PCI_D0 || pci_dev_is_disconnected(dev)) {
/* Don't touch the hardware now */
-   } else if (entry->msi_attrib.is_msix) {
+   } else if (entry->tag == IRQ_MSI_TAG_MSIX) {
void __iomem *base = pci_msix_desc_addr(entry);
 
if (!base)
@@ -376,7 +376,7 @@ static void free_msi_irqs(struct pci_dev *dev)
pci_msi_teardown_msi_irqs(dev);
 
list_for_each_entry_safe(entry, tmp, msi_list, list) {
-   if (entry->msi_attrib.is_msix) {
+   if (entry->tag == IRQ_MSI_TAG_MSIX) {
if (list_is_last(&entry->list, msi_list))
iounmap(entry->mask_base);
}
@@ -471,7 +471,7 @@ static ssize_t msi_mode_show(struct device *dev, struct 
device_attribute *attr,
entry = irq_get_msi_desc(irq);
if (entry)
return sprintf(buf, "%s\n",
-   entry->msi_attrib.is_msix ? "msix" : "msi");
+   (entry->tag == IRQ_MSI_TAG_MSIX) ? "msix" : "msi");
 
return -ENODEV;
 }
@@ -570,7 +570,7 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct 
irq_affinity *affd)
 
pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control);
 
-   entry->msi_attrib.is_msix   = 0;
+   entry->tag  = IRQ_MSI_TAG

[RFC V1 6/7] ims-msi: Add APIs to allocate/free IMS interrupts

This patch introduces APIs to allocate and free IMS interrupts.

Cc: Jacob Pan 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Megha Dey 
---
 drivers/base/ims-msi.c | 216 +
 include/linux/msi.h|   2 +
 2 files changed, 218 insertions(+)

diff --git a/drivers/base/ims-msi.c b/drivers/base/ims-msi.c
index df28ee2..3e579c9 100644
--- a/drivers/base/ims-msi.c
+++ b/drivers/base/ims-msi.c
@@ -7,9 +7,12 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
+#define DEVIMS_ID_SHIFT21
+
 struct dev_ims_priv_data {
struct device   *dev;
msi_alloc_info_targ;
@@ -17,6 +20,8 @@ struct dev_ims_priv_data {
struct dev_ims_ops  *ims_ops;
 };
 
+static DEFINE_IDA(dev_ims_devid_ida);
+
 u32 __dev_ims_desc_mask_irq(struct msi_desc *desc, u32 flag)
 {
u32 mask_bits = desc->dev_ims.masked;
@@ -126,3 +131,214 @@ void dev_ims_restore_irqs(struct device *dev)
if (entry->tag == IRQ_MSI_TAG_IMS)
dev_ims_restore_irq(dev, entry->irq);
 }
+
+static void dev_ims_free_descs(struct device *dev)
+{
+   struct msi_desc *desc, *tmp;
+
+   for_each_msi_entry(desc, dev)
+   if (desc->irq && desc->tag == IRQ_MSI_TAG_IMS)
+   BUG_ON(irq_has_action(desc->irq));
+
+   dev_ims_teardown_irqs(dev);
+
+   list_for_each_entry_safe(desc, tmp, dev_to_msi_list(dev), list) {
+   if (desc->tag == IRQ_MSI_TAG_IMS) {
+   list_del(&desc->list);
+   free_msi_entry(desc);
+   }
+   }
+}
+
+static int dev_ims_setup_msi_irqs(struct device *dev, int nvec)
+{
+   struct irq_domain *domain;
+
+   domain = dev_get_msi_domain(dev);
+   if (domain && irq_domain_is_hierarchy(domain))
+   return msi_domain_alloc_irqs(domain, dev, nvec);
+
+   return arch_setup_ims_irqs(dev, nvec);
+}
+
+static struct dev_ims_priv_data *
+dev_ims_alloc_priv_data(struct device *dev, unsigned int nvec,
+   struct dev_ims_ops *ops)
+{
+   struct dev_ims_priv_data *datap;
+   int ret;
+
+   /*
+* Currently there is no limit to the number of IRQs a device can
+* allocate.
+*/
+   if (!nvec)
+   return ERR_PTR(-EINVAL);
+
+   datap = kzalloc(sizeof(*datap), GFP_KERNEL);
+   if (!datap)
+   return ERR_PTR(-ENOMEM);
+
+   ret = ida_simple_get(&dev_ims_devid_ida,
+   0, 1 << DEVIMS_ID_SHIFT, GFP_KERNEL);
+
+   if (ret < 0) {
+   kfree(datap);
+   return ERR_PTR(ret);
+   }
+
+   datap->devid = ret;
+   datap->ims_ops = ops;
+   datap->dev = dev;
+
+   return datap;
+}
+
+static int dev_ims_alloc_descs(struct device *dev,
+  int nvec, struct dev_ims_priv_data *data,
+  struct irq_affinity *affd)
+{
+   struct irq_affinity_desc *curmsk, *masks = NULL;
+   struct msi_desc *desc;
+   int i, base = 0;
+
+   if (!list_empty(dev_to_msi_list(dev))) {
+   desc = list_last_entry(dev_to_msi_list(dev),
+   struct msi_desc, list);
+   base = desc->dev_ims.ims_index + 1;
+   }
+
+   if (affd) {
+   masks = irq_create_affinity_masks(nvec, affd);
+   if (!masks)
+   dev_err(dev, "Unable to allocate affinity masks, 
ignoring\n");
+   }
+
+   for (i = 0, curmsk = masks; i < nvec; i++) {
+   desc = alloc_msi_entry(dev, 1, NULL);
+   if (!desc)
+   break;
+
+   desc->dev_ims.priv = data;
+   desc->tag = IRQ_MSI_TAG_IMS;
+   desc->dev_ims.ims_index = base + i;
+
+   list_add_tail(&desc->list, dev_to_msi_list(dev));
+
+   if (masks)
+   curmsk++;
+   }
+
+   kfree(masks);
+
+   if (i != nvec) {
+   /* Clean up the mess */
+   dev_ims_free_descs(dev);
+   return -ENOMEM;
+   }
+
+   return 0;
+}
+
+static void dev_ims_free_priv_data(struct dev_ims_priv_data *data)
+{
+   ida_simple_remove(&dev_ims_devid_ida, data->devid);
+   kfree(data);
+}
+
+/**
+ * dev_ims_enable_irqs - Allocate IMS interrupts for @dev
+ * @dev:   The device for which to allocate interrupts
+ * @nvec:  The number of interrupts to allocate
+ * @ops:   IMS device operations
+ * @affd:  optional description of the affinity requirements
+ *
+ * Returns:
+ * Zero for success, or an error code in case of failure
+ */
+int dev_ims_enable_irqs(struct device *dev, unsigned int nvec,
+   struct dev_ims_ops *ops,
+   struct irq_affinity *affd)
+{
+   struct dev_ims_priv_data *priv_data;
+   int err;
+
+   priv_data = dev_ims

Re: [PATCH] leds: remove PAGE_SIZE limit of /sys/class/leds//trigger

2019-09-12 Thread Akinobu Mita

2019年9月13日(金) 2:15 Jacek Anaszewski :
>
> Hi Akinobu,
>
> Please bump patch version each time you send an update
> of the patch with the same subject.

Oops, should I resend with the correct subject?

Re: problem starting /sbin/init (32-bit 5.3-rc8)

2019-09-12 Thread Kees Cook

On Thu, Sep 12, 2019 at 02:40:19PM -0700, Randy Dunlap wrote:
> This is 32-bit kernel, just happens to be running on a 64-bit laptop.
> I added the debug printk in __phys_addr() just before "[cut here]".
> 
> CONFIG_HARDENED_USERCOPY=y

I can reproduce this under CONFIG_DEBUG_VIRTUAL=y, and it goes back
to at least to v5.2. Booting with "hardened_usercopy=off" or without
CONFIG_DEBUG_VIRTUAL makes this go away (since __phys_addr() doesn't
get called):

__check_object_size+0xff/0x1b0:
pfn_to_section_nr at include/linux/mmzone.h:1153
(inlined by) __pfn_to_section at include/linux/mmzone.h:1291
(inlined by) virt_to_head_page at include/linux/mm.h:729
(inlined by) check_heap_object at mm/usercopy.c:230
(inlined by) __check_object_size at mm/usercopy.c:280

Is virt_to_head_page() illegal to use under some recently new conditions?

> The BUG is this line in arch/x86/mm/physaddr.c:
>   VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
> It's line 83 in my source file only due to adding  and
> a conditional pr_crit() call.
> 
> 
> [   19.730409][T1] debug: unmapping init [mem 0xdc7bc000-0xdca30fff]
> [   19.734289][T1] Write protecting kernel text and read-only data: 13888k
> [   19.737675][T1] rodata_test: all tests were successful
> [   19.740757][T1] Run /sbin/init as init process
> [   19.792877][T1] __phys_addr: max_low_pfn=0x36ffe, x=0xff001ff1, 
> phys_addr=0x3f001ff1
> [   19.796561][T1] [ cut here ]
> [   19.797501][T1] kernel BUG at ../arch/x86/mm/physaddr.c:83!
> [   19.802799][T1] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
> [   19.803782][T1] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc8 #6
> [   19.803782][T1] Hardware name: Dell Inc. Inspiron 1318 
>   /0C236D, BIOS A04 01/15/2009
> [   19.803782][T1] EIP: __phys_addr+0xaf/0x100
> [   19.803782][T1] Code: 85 c0 74 67 89 f7 c1 ef 0c 39 f8 73 2e 56 53 50 
> 68 90 9f 1f dc 68 00 eb 45 dc e8 ec b3 09 00 83 c4 14 3b 3d 30 55 cf dc 76 11 
> <0f> 0b b8 7c 3b 5c dc e8 45 53 4c 00 90 8d 74 26 00 89 d8 e8 39 cd
> [   19.803782][T1] EAX: 0044 EBX: ff001ff1 ECX:  EDX: db90a471
> [   19.803782][T1] ESI: 3f001ff1 EDI: 0003f001 EBP: f41ddea0 ESP: f41dde90
> [   19.803782][T1] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 
> 00010216
> [   19.803782][T1] CR0: 80050033 CR2: dc218544 CR3: 1ca39000 CR4: 000406d0
> [   19.803782][T1] Call Trace:
> [   19.803782][T1]  __check_object_size+0xaf/0x3c0
> [   19.803782][T1]  ? __might_sleep+0x80/0xa0
> [   19.803782][T1]  copy_strings+0x1c2/0x370
> [   19.803782][T1]  copy_strings_kernel+0x2b/0x40
> 
> Full boot log or kernel .config file are available if wanted.

I'll see if I can bisect, but I'm getting on a plane soon...

-- 
Kees Cook

Re: [alsa-devel] [PATCH] ASoC: fsl_sai: Implement set_bclk_ratio

On Wed, Sep 11, 2019 at 04:06:41PM +0300, Daniel Baluta wrote:
> On Wed, Sep 11, 2019 at 2:01 PM Mark Brown  wrote:
> >
> > On Thu, Sep 05, 2019 at 06:29:39PM -0700, Nicolin Chen wrote:
> > > On Sat, Aug 31, 2019 at 12:59:10AM +0300, Daniel Baluta wrote:
> >
> > > > This is to allow machine drivers to set a certain bitclk rate
> > > > which might not be exactly rate * frame size.
> >
> > > Just a quick thought of mine: slot_width and slots could be
> > > set via set_dai_tdm_slot() actually, while set_bclk_ratio()
> > > would override that one with your change. I'm not sure which
> > > one could be more important...so would you mind elaborating
> > > your use case?
> >
> > The reason we have both operations is partly that some hardware
> > can configure the ratio but not do TDM and partly that setting
> > TDM slots forces us to configure the slot size depending on the
> > current stream configuration while just setting the ratio means
> > we can just fix the configuration once.  I'd say it's just a user
> > error to try to do both simultaneously.
> 
> Yes, exactly. We wanted to have a better control of bclk freq.
> Sorry for the late answer, I'm traveling.

I see. Thanks for the explain. Just acked.

[PATCH] KVM: x86: Handle unexpected MMIO accesses using master abort semantics

Use master abort semantics, i.e. reads return all ones and writes are
dropped, to handle unexpected MMIO accesses when reading guest memory
instead of returning X86EMUL_IO_NEEDED, which in turn gets interpreted
as a guest page fault.

Emulation of certain instructions, notably VMX instructions, involves
reading or writing guest memory without going through the emulator.
These emulation flows are not equipped to handle MMIO accesses as no
sane and properly functioning guest kernel will target MMIO with such
instructions, and so simply inject a page fault in response to
X86EMUL_IO_NEEDED.

While not 100% correct, using master abort semantics is at least
sometimes correct, e.g. non-existent MMIO accesses do actually master
abort, whereas injecting a page fault is always wrong, i.e. the issue
lies in the physical address domain, not in the virtual to physical
translation.

Apply the logic to kvm_write_guest_virt_system() in addition to
replacing existing #PF logic in kvm_read_guest_virt(), as VMPTRST uses
the former, i.e. can also leak a host stack address.

Reported-by: Fuqian Huang 
Cc: sta...@vger.kernel.org
Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/x86.c | 40 +++-
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4cfd786d0b6..d1d7e9fac17a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5234,16 +5234,24 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
   struct x86_exception *exception)
 {
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+   int r;
+
+   r = kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+  exception);
 
/*
-* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
-* is returned, but our callers are not ready for that and they blindly
-* call kvm_inject_page_fault.  Ensure that they at least do not leak
-* uninitialized kernel stack memory into cr2 and error code.
+* FIXME: this should technically call out to userspace to handle the
+* MMIO access, but our callers are not ready for that, so emulate
+* master abort behavior instead, i.e. writes are dropped.
 */
-   memset(exception, 0, sizeof(*exception));
-   return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
- exception);
+   if (r == X86EMUL_IO_NEEDED) {
+   memset(val, 0xff, bytes);
+   return 0;
+   }
+   if (r == X86EMUL_PROPAGATE_FAULT)
+   return -EFAULT;
+   WARN_ON_ONCE(r);
+   return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
@@ -5317,11 +5325,25 @@ static int emulator_write_std(struct x86_emulate_ctxt 
*ctxt, gva_t addr, void *v
 int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception 
*exception)
 {
+   int r;
+
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
 
-   return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
-  PFERR_WRITE_MASK, exception);
+   r = kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+   PFERR_WRITE_MASK, exception);
+
+   /*
+* FIXME: this should technically call out to userspace to handle the
+* MMIO access, but our callers are not ready for that, so emulate
+* master abort behavior instead, i.e. writes are dropped.
+*/
+   if (r == X86EMUL_IO_NEEDED)
+   return 0;
+   if (r == X86EMUL_PROPAGATE_FAULT)
+   return -EFAULT;
+   WARN_ON_ONCE(r);
+   return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
-- 
2.22.0

Re: [PATCH] ASoC: fsl_sai: Implement set_bclk_ratio

On Sat, Aug 31, 2019 at 12:59:10AM +0300, Daniel Baluta wrote:
> From: Viorel Suman 
> 
> This is to allow machine drivers to set a certain bitclk rate
> which might not be exactly rate * frame size.
> 
> Cc: NXP Linux Team 
> Signed-off-by: Viorel Suman 
> Signed-off-by: Daniel Baluta 

Acked-by: Nicolin Chen 

> ---
>  sound/soc/fsl/fsl_sai.c | 21 +++--
>  sound/soc/fsl/fsl_sai.h |  1 +
>  2 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c
> index fe126029f4e3..e896b577b1f7 100644
> --- a/sound/soc/fsl/fsl_sai.c
> +++ b/sound/soc/fsl/fsl_sai.c
> @@ -137,6 +137,16 @@ static int fsl_sai_set_dai_tdm_slot(struct snd_soc_dai 
> *cpu_dai, u32 tx_mask,
>   return 0;
>  }
>  
> +static int fsl_sai_set_dai_bclk_ratio(struct snd_soc_dai *dai,
> +   unsigned int ratio)
> +{
> + struct fsl_sai *sai = snd_soc_dai_get_drvdata(dai);
> +
> + sai->bclk_ratio = ratio;
> +
> + return 0;
> +}
> +
>  static int fsl_sai_set_dai_sysclk_tr(struct snd_soc_dai *cpu_dai,
>   int clk_id, unsigned int freq, int fsl_dir)
>  {
> @@ -423,8 +433,14 @@ static int fsl_sai_hw_params(struct snd_pcm_substream 
> *substream,
>   slot_width = sai->slot_width;
>  
>   if (!sai->is_slave_mode) {
> - ret = fsl_sai_set_bclk(cpu_dai, tx,
> - slots * slot_width * params_rate(params));
> + if (sai->bclk_ratio)
> + ret = fsl_sai_set_bclk(cpu_dai, tx,
> +sai->bclk_ratio *
> +params_rate(params));
> + else
> + ret = fsl_sai_set_bclk(cpu_dai, tx,
> +slots * slot_width *
> +params_rate(params));
>   if (ret)
>   return ret;
>  
> @@ -640,6 +656,7 @@ static void fsl_sai_shutdown(struct snd_pcm_substream 
> *substream,
>  }
>  
>  static const struct snd_soc_dai_ops fsl_sai_pcm_dai_ops = {
> + .set_bclk_ratio = fsl_sai_set_dai_bclk_ratio,
>   .set_sysclk = fsl_sai_set_dai_sysclk,
>   .set_fmt= fsl_sai_set_dai_fmt,
>   .set_tdm_slot   = fsl_sai_set_dai_tdm_slot,
> diff --git a/sound/soc/fsl/fsl_sai.h b/sound/soc/fsl/fsl_sai.h
> index 3a3f6f8e5595..f96f8d97489d 100644
> --- a/sound/soc/fsl/fsl_sai.h
> +++ b/sound/soc/fsl/fsl_sai.h
> @@ -177,6 +177,7 @@ struct fsl_sai {
>   unsigned int mclk_streams;
>   unsigned int slots;
>   unsigned int slot_width;
> + unsigned int bclk_ratio;
>  
>   const struct fsl_sai_soc_data *soc_data;
>   struct snd_dmaengine_dai_dma_data dma_params_rx;
> -- 
> 2.17.1
>

Re: [PATCH 3/3] ASoC: fsl_asrc: Fix error with S24_3LE format bitstream in i.MX8

On Wed, Sep 11, 2019 at 12:08:07PM +0100, Mark Brown wrote:
> On Mon, Sep 09, 2019 at 06:52:13PM -0700, Nicolin Chen wrote:
> 
> > And a quick feeling is that below code is mostly identical to what
> > is in the soc-generic-dmaengine-pcm.c file. So I'm wondering if we
> > could abstract a helper function somewhere in the ASoC core: Mark?
> 
> That's roughly what sound/core/pcm_dmaengine.c is doing -
> possibly we should move more stuff into there.

It looks like a right place to me. Thank you!

Re: [PATCH 2/3] ASoC: fsl_asrc: update supported sample format

On Tue, Sep 10, 2019 at 02:07:25AM +, S.j. Wang wrote:
> > On Mon, Sep 09, 2019 at 06:33:20PM -0400, Shengjiu Wang wrote:
> > > The ASRC support 24bit/16bit/8bit input width, so S20_3LE format
> > > should not be supported, it is word width is 20bit.
> > 
> > I thought 3LE used 24-bit physical width. And the driver assigns
> > ASRC_WIDTH_24_BIT to "width" for all non-16bit cases, so 20-bit would go
> > for that 24-bit slot also. I don't clearly recall if I had explicitly tested
> > S20_3LE, but I feel it should work since I put there...
> 
> For S20_3LE, the width is 20bit,  but the ASRC only support 24bit, if set the
> ASRMCR1n.IWD= 24bit, because the actual width is 20 bit, the volume is
> Lower than expected,  it likes 24bit data right shift 4 bit.
> So it is not supported.

Hmm..S20_3LE right-aligns 20 bits in a 24-bit slot? I thought
they're left aligned...

If this is the case...shouldn't we have the same lower-volume
problem for all hardwares that support S20_3LE now?

Re: [PATCH] KVM: x86: work around leak of uninitialized stack contents

On Thu, Sep 12, 2019 at 02:20:09PM -0700, Jim Mattson wrote:
> On Wed, Sep 11, 2019 at 9:18 PM Fuqian Huang  wrote:
> >
> > Emulation of VMPTRST can incorrectly inject a page fault
> > when passed an operand that points to an MMIO address.
> > The page fault will use uninitialized kernel stack memory
> > as the CR2 and error code.
> >
> > The right behavior would be to abort the VM with a KVM_EXIT_INTERNAL_ERROR
> > exit to userspace; however, it is not an easy fix, so for now just ensure
> > that the error code and CR2 are zero.
> >
> > Signed-off-by: Fuqian Huang 
> > ---
> >  arch/x86/kvm/x86.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 290c3c3efb87..7f442d710858 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5312,6 +5312,7 @@ int kvm_write_guest_virt_system(struct kvm_vcpu 
> > *vcpu, gva_t addr, void *val,
> > /* kvm_write_guest_virt_system can pull in tons of pages. */
> > vcpu->arch.l1tf_flush_l1d = true;
> >
> > +   memset(exception, 0, sizeof(*exception));
> > return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
> >PFERR_WRITE_MASK, exception);
> >  }
> > --
> > 2.11.0
> >
> Perhaps you could also add a comment like the one Paolo added when he
> made the same change in kvm_read_guest_virt?
> See commit 353c0956a618 ("KVM: x86: work around leak of uninitialized
> stack contents (CVE-2019-7222)").

I have a better hack-a-fix, we can handle the unexpected MMIO using master
abort semantics, i.e. reads return all ones, writes are dropped.  It's not
100% correct as KVM won't handle the case where the address is legit MMIO,
but it's at least sometimes correct and thus better than a #PF.

Patch and a unit test incoming...

Re: [rfc patch script] treewide conversion of __section(foo) to section("foo");

On Thu, 2019-09-12 at 15:45 -0700, Nick Desaulniers wrote:
> If you want to email me just the patch file (so I don't have to
> copy+pasta from an email),

Lazy... ;)

> I'd be happy to apply it and compile+boot test a few more arch's
> than x86.

Thanks.  attached.



section.pl
Description: Perl program

Memory corruption (redzone overwritten) names_cache?

2019-09-12 Thread Jakub Jankowski


Hi,

We're getting some random memory corruption on an AWS EC2 instance with 
4.19.x kernels. I've tried 4.19.19, 4.19.52, but the results below are 
from the most recent (4.19.72). For debugging I enabled 
KASAN+slub_debug, but TBH, I can't make heads or tails from these.


Without slub_debug, the host reboots within couple of minutes of uptime. 
With slub_debug it survives a bit longer, but eventually all sorts of 
issues manifest (including: reboot; ps not being able to read some 
processes' /proc//cmdline while /proc//stack shows 
acct_collect()->down_read(), etc).


Upon multiple tests, the slab I most often seen pop up as first detected 
as corrupted was names_cache.
What is really weird is that multiple times I saw redzone being 
overwritten by the same content, which looks like part of 'sessions.py'  
Python's 'requests' module.


Any debugging hints would be greatly appreciated.


Command line: BOOT_IMAGE=(hd0,msdos2)/vmlinuz ro root=/dev/xvda5 console=tty0 
console=ttyS0,9600n8 crashkernel=512M-2G:64M,2G-:128M kmemleak=on 
slub_debug=FZPU slub_nomerge
(...)
[  262.957418] 
=
[  262.957423] BUG vm_area_struct (Tainted: GB  O ): Redzone 
overwritten
[  262.957424] 
-

[  262.957427] INFO: 0xb91cc681-0x98bd5238. First byte 0x6e 
instead of 0xcc
[  262.957433] INFO: Allocated in vm_area_dup+0x1e/0x180 age=6117 cpu=0 pid=8187
[  262.957438]  kmem_cache_alloc+0x1a4/0x1d0
[  262.957439]  vm_area_dup+0x1e/0x180
[  262.957441]  copy_process.part.4+0x2fa9/0x6cd0
[  262.957443]  _do_fork+0x151/0x7a0
[  262.957446]  do_syscall_64+0x9b/0x290
[  262.957452]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  262.957455] INFO: Freed in qlist_free_all+0x37/0xd0 age=7431 cpu=0 pid=8521
[  262.957457]  quarantine_reduce+0x1a2/0x210
[  262.957458]  kasan_kmalloc+0x95/0xc0
[  262.957460]  kmem_cache_alloc+0xc6/0x1d0
[  262.957463]  getname_flags+0xba/0x510
[  262.957465]  user_path_at_empty+0x1d/0x40
[  262.957468]  vfs_statx+0xb9/0x140
[  262.957470]  __se_sys_newstat+0x7c/0xd0
[  262.957472]  do_syscall_64+0x9b/0x290
[  262.957474]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  262.957476] INFO: Slab 0xca532806 objects=30 used=24 
fp=0x6ce6da86 flags=0x2008101
[  262.957477] INFO: Object 0x5eb7e26b @offset=8 fp=0xac807fa7

[  262.957480] Redzone b91cc681: 6e 73 2e 70 79 5c 22 2c 
 ns.py\",
[  262.957482] Object 5eb7e26b: 20 6c 69 6e 65 20 36 34 36 2c 20 69 6e 
20 73 65   line 646, in se
[  262.957484] Object 7d5d4673: 6e 64 5c 6e 20 20 20 20 72 20 3d 20 61 
64 61 70  nd\nr = adap
[  262.957485] Object a3cf6db1: 74 65 72 2e 73 65 6e 64 28 72 65 71 75 
65 73 74  ter.send(request
[  262.957487] Object d8b14cdd: 2c 20 2a 2a 6b 77 61 72 00 00 00 00 00 
00 00 00  , **kwar
[  262.957489] Object 5eca0928: 40 97 5a 73 83 88 ff ff 25 00 00 00 00 
00 00 80  @.Zs%...
[  262.957491] Object 592ffbd7: 71 00 00 00 00 00 00 00 e0 c8 22 6d 83 88 ff 
ff  q."m
[  262.957492] Object 84c88ae5: 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00  
[  262.957494] Object ea6d1cb3: 83 00 00 00 00 00 00 00 80 c0 fd 5a 83 
88 ff ff  ...Z
[  262.957495] Object a236617c: 80 c0 fd 5a 83 88 ff ff 00 00 00 00 00 
00 00 00  ...Z
[  262.957497] Object 91c7956c: 00 3a 94 b0 ff ff ff ff 75 00 00 00 00 
00 00 00  .:..u...
[  262.957499] Object 216cef35: c0 85 cc 6a 83 88 ff ff 00 00 00 00 00 
00 00 00  ...j
[  262.957500] Object e0fd506c: 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00 00  
[  262.957502] Redzone f5906e86: cc cc cc cc cc cc cc cc
  
[  262.957503] Padding 53d79574: 5a 5a 5a 5a 5a 5a 5a 5a
  
[  262.957507] CPU: 3 PID: 11769 Comm: ps Kdump: loaded Tainted: GB  O  
4.19.72_3upstreamdbg #1
[  262.957508] Hardware name: Xen HVM domU, BIOS 4.2.amazon 08/24/2006
[  262.957509] Call Trace:
[  262.957516]  dump_stack+0x9a/0xf0
[  262.957519]  check_bytes_and_report.cold.24+0x3f/0x6b
[  262.957521]  check_object+0x17c/0x280
[  262.957524]  free_debug_processing+0x105/0x2a0
[  262.957526]  ? qlist_free_all+0x37/0xd0
[  262.957527]  ? qlist_free_all+0x37/0xd0
[  262.957529]  __slab_free+0x218/0x3b0
[  262.957533]  ? __free_pages_ok+0x62f/0x840
[  262.957536]  ? _raw_spin_unlock_irqrestore+0x2b/0x40
[  262.957537]  ? qlist_free_all+0x37/0xd0
[  262.957541]  ? trace_hardirqs_on+0x35/0x140
[  262.957543]  ? qlist_free_all+0x37/0xd0
[  262.957544]  qlist_free_all+0x4c/0xd0
[  262.957546]  quarantine_reduce+0x1a2/0x210
[  262.957549]  ? getname_flags+0xba/0x510
[  262.957550]  kasan_kmalloc+0x95/0xc0
[  262.957553]  ? getname

clang-format and 'clang-format on' and 'clang-format off'

On Thu, 2019-09-12 at 23:58 +0200, Miguel Ojeda wrote:
> On Thu, Sep 12, 2019 at 11:08 PM Joe Perches  wrote:

> > Marking sections _no_auto_format_ isn't really a
> > good solution is it?
> 
> I am thinking about special tables that are hand-crafted or very
> complex macros. For those, yes, I think it is a fine solution.

Can the 'clang-format on/off' trigger be indirected into
something non-clang specific via a macro?

Not every project is going to use only the clang-format tool.

Re: [EXT] Re: [PATCH 1/3] ASoC: fsl_asrc: Use in(out)put_format instead of in(out)put_word_width

On Tue, Sep 10, 2019 at 02:22:06AM +, S.j. Wang wrote:
> Hi
> 
> > 
> > On Mon, Sep 09, 2019 at 06:33:19PM -0400, Shengjiu Wang wrote:
> > > snd_pcm_format_t is more formal than enum asrc_word_width, which
> > has
> > > two property, width and physical width, which is more accurate than
> > > enum asrc_word_width. So it is better to use in(out)put_format instead
> > > of in(out)put_word_width.
> > 
> > Hmm...I don't really see the benefit of using snd_pcm_format_t here...I
> > mean, I know it's a generic one, and would understand if we use it as a
> > param for a common API. But this patch merely packs the "width" by
> > intentionally using this snd_pcm_format_t and then adds another
> > translation to unpack it.. I feel it's a bit overcomplicated. Or am I 
> > missing
> > something?
> > 
> > And I feel it's not necessary to use ALSA common format in our own "struct
> > asrc_config" since it is more IP/register specific.
> > 
> > Thanks
> > Nicolin
> > 
> 
> As you know, we have another M2M function internally, when user want to
> Set the format through M2M API, it is better to use snd_pcm_format_t instead 
> the
> Width, for snd_pcm_format_t include two property, data with and physical width
> In driver some place need data width, some place need physical width.
> For example how to distinguish S24_LE and S24_3LE in driver,  DMA setting 
> needs
> The physical width,  but ASRC need data width. 
> 
> Another purpose is that we have another new designed ASRC, which support more
> Formats, I would like it can share same API with this ASRC, using 
> snd_pcm_format_t
> That we can use the common API, like snd_pcm_format_linear,
> snd_pcm_format_big_endian to get the property of the format, which is needed 
> by
> driver.

I see. Just acked the patch.

Re: [PATCH 1/3] ASoC: fsl_asrc: Use in(out)put_format instead of in(out)put_word_width

On Mon, Sep 09, 2019 at 06:33:19PM -0400, Shengjiu Wang wrote:
> snd_pcm_format_t is more formal than enum asrc_word_width, which has
> two property, width and physical width, which is more accurate than
> enum asrc_word_width. So it is better to use in(out)put_format
> instead of in(out)put_word_width.
> 
> Signed-off-by: Shengjiu Wang 

Acked-by: Nicolin Chen 

> ---
>  sound/soc/fsl/fsl_asrc.c | 56 +++-
>  sound/soc/fsl/fsl_asrc.h |  4 +--
>  2 files changed, 40 insertions(+), 20 deletions(-)
> 
> diff --git a/sound/soc/fsl/fsl_asrc.c b/sound/soc/fsl/fsl_asrc.c
> index cfa40ef6b1ca..4d3804a1ea55 100644
> --- a/sound/soc/fsl/fsl_asrc.c
> +++ b/sound/soc/fsl/fsl_asrc.c
> @@ -265,6 +265,8 @@ static int fsl_asrc_config_pair(struct fsl_asrc_pair 
> *pair)
>   struct asrc_config *config = pair->config;
>   struct fsl_asrc *asrc_priv = pair->asrc_priv;
>   enum asrc_pair_index index = pair->index;
> + enum asrc_word_width input_word_width;
> + enum asrc_word_width output_word_width;
>   u32 inrate, outrate, indiv, outdiv;
>   u32 clk_index[2], div[2];
>   int in, out, channels;
> @@ -283,9 +285,32 @@ static int fsl_asrc_config_pair(struct fsl_asrc_pair 
> *pair)
>   return -EINVAL;
>   }
>  
> - /* Validate output width */
> - if (config->output_word_width == ASRC_WIDTH_8_BIT) {
> - pair_err("does not support 8bit width output\n");
> + switch (snd_pcm_format_width(config->input_format)) {
> + case 8:
> + input_word_width = ASRC_WIDTH_8_BIT;
> + break;
> + case 16:
> + input_word_width = ASRC_WIDTH_16_BIT;
> + break;
> + case 24:
> + input_word_width = ASRC_WIDTH_24_BIT;
> + break;
> + default:
> + pair_err("does not support this input format, %d\n",
> +  config->input_format);
> + return -EINVAL;
> + }
> +
> + switch (snd_pcm_format_width(config->output_format)) {
> + case 16:
> + output_word_width = ASRC_WIDTH_16_BIT;
> + break;
> + case 24:
> + output_word_width = ASRC_WIDTH_24_BIT;
> + break;
> + default:
> + pair_err("does not support this output format, %d\n",
> +  config->output_format);
>   return -EINVAL;
>   }
>  
> @@ -383,8 +408,8 @@ static int fsl_asrc_config_pair(struct fsl_asrc_pair 
> *pair)
>   /* Implement word_width configurations */
>   regmap_update_bits(asrc_priv->regmap, REG_ASRMCR1(index),
>  ASRMCR1i_OW16_MASK | ASRMCR1i_IWD_MASK,
> -ASRMCR1i_OW16(config->output_word_width) |
> -ASRMCR1i_IWD(config->input_word_width));
> +ASRMCR1i_OW16(output_word_width) |
> +ASRMCR1i_IWD(input_word_width));
>  
>   /* Enable BUFFER STALL */
>   regmap_update_bits(asrc_priv->regmap, REG_ASRMCR(index),
> @@ -497,13 +522,13 @@ static int fsl_asrc_dai_hw_params(struct 
> snd_pcm_substream *substream,
> struct snd_soc_dai *dai)
>  {
>   struct fsl_asrc *asrc_priv = snd_soc_dai_get_drvdata(dai);
> - int width = params_width(params);
>   struct snd_pcm_runtime *runtime = substream->runtime;
>   struct fsl_asrc_pair *pair = runtime->private_data;
>   unsigned int channels = params_channels(params);
>   unsigned int rate = params_rate(params);
>   struct asrc_config config;
> - int word_width, ret;
> + snd_pcm_format_t format;
> + int ret;
>  
>   ret = fsl_asrc_request_pair(channels, pair);
>   if (ret) {
> @@ -513,15 +538,10 @@ static int fsl_asrc_dai_hw_params(struct 
> snd_pcm_substream *substream,
>  
>   pair->config = &config;
>  
> - if (width == 16)
> - width = ASRC_WIDTH_16_BIT;
> - else
> - width = ASRC_WIDTH_24_BIT;
> -
>   if (asrc_priv->asrc_width == 16)
> - word_width = ASRC_WIDTH_16_BIT;
> + format = SNDRV_PCM_FORMAT_S16_LE;
>   else
> - word_width = ASRC_WIDTH_24_BIT;
> + format = SNDRV_PCM_FORMAT_S24_LE;
>  
>   config.pair = pair->index;
>   config.channel_num = channels;
> @@ -529,13 +549,13 @@ static int fsl_asrc_dai_hw_params(struct 
> snd_pcm_substream *substream,
>   config.outclk = OUTCLK_ASRCK1_CLK;
>  
>   if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) {
> - config.input_word_width   = width;
> - config.output_word_width  = word_width;
> + config.input_format   = params_format(params);
> + config.output_format  = format;
>   config.input_sample_rate  = rate;
>   config.output_sample_rate = asrc_priv->asrc_rate;
>   } else {
> - config.input_word_width   = word_width;
> - config.output_word_width  = width;
> +

[PATCH] mm: memory: fix /proc/meminfo reporting for MLOCK_ONFAULT

2019-09-12 Thread Lucian Adrian Grijincu

As pages are faulted in MLOCK_ONFAULT correctly updates
/proc/self/smaps, but doesn't update /proc/meminfo's Mlocked field.

- Before this /proc/meminfo fields didn't change as pages were faulted in:

```
= Start =
/proc/meminfo
Unevictable:   10128 kB
Mlocked:   10132 kB
= Creating testfile =

= after mlock2(MLOCK_ONFAULT) =
/proc/meminfo
Unevictable:   10128 kB
Mlocked:   10132 kB
/proc/self/smaps
7f871400-7f875400 rw-s  08:04 50857050   /root/testfile
Locked:0 kB

= after reading half of the file =
/proc/meminfo
Unevictable:   10128 kB
Mlocked:   10132 kB
/proc/self/smaps
7f871400-7f875400 rw-s  08:04 50857050   /root/testfile
Locked:   524288 kB

= after reading the entire the file =
/proc/meminfo
Unevictable:   10128 kB
Mlocked:   10132 kB
/proc/self/smaps
7f871400-7f875400 rw-s  08:04 50857050   /root/testfile
Locked:  1048576 kB

= after munmap =
/proc/meminfo
Unevictable:   10128 kB
Mlocked:   10132 kB
/proc/self/smaps
```

- After: /proc/meminfo fields are properly updated as pages are touched:

```
= Start =
/proc/meminfo
Unevictable:  60 kB
Mlocked:  60 kB
= Creating testfile =

= after mlock2(MLOCK_ONFAULT) =
/proc/meminfo
Unevictable:  60 kB
Mlocked:  60 kB
/proc/self/smaps
7f2b9c60-7f2bdc60 rw-s  08:04 63045798   /root/testfile
Locked:0 kB

= after reading half of the file =
/proc/meminfo
Unevictable:  524220 kB
Mlocked:  524220 kB
/proc/self/smaps
7f2b9c60-7f2bdc60 rw-s  08:04 63045798   /root/testfile
Locked:   524288 kB

= after reading the entire the file =
/proc/meminfo
Unevictable: 1048496 kB
Mlocked: 1048508 kB
/proc/self/smaps
7f2b9c60-7f2bdc60 rw-s  08:04 63045798   /root/testfile
Locked:  1048576 kB

= after munmap =
/proc/meminfo
Unevictable: 176 kB
Mlocked:  60 kB
/proc/self/smaps
```

Repro code.
---

int mlock2wrap(const void* addr, size_t len, int flags) {
  return syscall(SYS_mlock2, addr, len, flags);
}

void smaps() {
  char smapscmd[1000];
  snprintf(
  smapscmd,
  sizeof(smapscmd) - 1,
  "grep testfile -A 20 /proc/%d/smaps | grep -E '(testfile|Locked)'",
  getpid());
  printf("/proc/self/smaps\n");
  fflush(stdout);
  system(smapscmd);
}

void meminfo() {
  const char* meminfocmd = "grep -E '(Mlocked|Unevictable)' /proc/meminfo";
  printf("/proc/meminfo\n");
  fflush(stdout);
  system(meminfocmd);
}

  { \
int rc = (call);\
if (rc != 0) {  \
  printf("error %d %s\n", rc, strerror(errno)); \
  exit(1);  \
}   \
  }
int main(int argc, char* argv[]) {
  printf("= Start =\n");
  meminfo();

  printf("= Creating testfile =\n");
  size_t size = 1 << 30; // 1 GiB
  int fd = open("testfile", O_CREAT | O_RDWR, 0666);
  {
void* buf = malloc(size);
write(fd, buf, size);
free(buf);
  }
  int ret = 0;
  void* addr = NULL;
  addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

  if (argc > 1) {
PCHECK(mlock2wrap(addr, size, MLOCK_ONFAULT));
printf("= after mlock2(MLOCK_ONFAULT) =\n");
meminfo();
smaps();

for (size_t i = 0; i < size / 2; i += 4096) {
  ret += ((char*)addr)[i];
}
printf("= after reading half of the file =\n");
meminfo();
smaps();

for (size_t i = 0; i < size; i += 4096) {
  ret += ((char*)addr)[i];
}
printf("= after reading the entire the file =\n");
meminfo();
smaps();

  } else {
PCHECK(mlock(addr, size));
printf("= after mlock =\n");
meminfo();
smaps();
  }

  PCHECK(munmap(addr, size));
  printf("= after munmap =\n");
  meminfo();
  smaps();

  return ret;
}

---

Signed-off-by: Lucian Adrian Grijincu 
---
 mm/memory.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index e0c232fe81d9..7e8dc3ed4e89 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3311,6 +3311,9 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct 
mem_cgroup *memcg,
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
+   if ((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED &&
+   !PageTransCompound(page))
+   mlock_vma_page(page);
}
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
-- 
2.17.1

Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-09-12 Thread Aubrey Li

On Thu, Sep 12, 2019 at 8:04 PM Aaron Lu  wrote:
>
> On Wed, Sep 11, 2019 at 09:19:02AM -0700, Tim Chen wrote:
> > On 9/11/19 7:02 AM, Aaron Lu wrote:
> > I think Julien's result show that my patches did not do as well as
> > your patches for fairness. Aubrey did some other testing with the same
> > conclusion.  So I think keeping the forced idle time balanced is not
> > enough for maintaining fairness.
>
> Well, I have done following tests:
> 1 Julien's test script: https://paste.debian.net/plainh/834cf45c
> 2 start two tagged will-it-scale/page_fault1, see how each performs;
> 3 Aubrey's mysql test: https://github.com/aubreyli/coresched_bench.git
>
> They all show your patchset performs equally well...And consider what
> the patch does, I think they are really doing the same thing in
> different ways.

It looks like we are not on the same page, if you don't mind, can both of
you rebase your patchset onto v5.3-rc8 and provide a public branch so I
can fetch and test it at least by my benchmark?

Thanks,
-Aubrey

RE: [PATCH] scsi: storvsc: Add the support of hibernation

2019-09-12 Thread Dexuan Cui

> From: linux-scsi-ow...@vger.kernel.org 
> On Behalf Of kbuild test robot
> Sent: Thursday, September 12, 2019 1:54 PM
> To: Dexuan Cui 
> Cc: kbuild-...@01.org; KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; sas...@kernel.org; j...@linux.ibm.com;
> martin.peter...@oracle.com; linux-hyp...@vger.kernel.org;
> linux-s...@vger.kernel.org; linux-kernel@vger.kernel.org; Michael Kelley
> ; Dexuan Cui 
> Subject: Re: [PATCH] scsi: storvsc: Add the support of hibernation
> 
> Hi Dexuan,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on linus/master]
> [cannot apply to v5.3-rc8 next-20190904]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system]
> 
> >> drivers//scsi/storvsc_drv.c:1982:3: error: 'struct hv_driver' has no member
> named 'suspend'
>  .suspend = storvsc_suspend,
>   ^~~

This build failure is expected: In the patch mail, I mentioned this patch
has a build dependency on the commit 271b2224d42f ("Drivers: hv: vmbus: 
Implement
suspend/resume for VSC drivers for hibernation"), which is on Sasha Levin's
Hyper-V tree's hyperv-next branch:
https://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git/log/?h=hyperv-next

Thanks,
-- Dexuan

Re: [PATCH 00/13] nvdimm: Use more common kernel coding style

On Thu, 2019-09-12 at 16:00 -0700, Nick Desaulniers wrote:

> Consider the fact that not all kernel developers run checkpatch.pl.
> Is that a deficiency in checkpatch.pl, or the lack of enforcement in
> kernel developers' workflows?

No.  Mostly it's because the kernel is like a bunch of little
untethered development planets, each with a little prince that
wants to keep their own little fiefdom separate from the others.

Re: [PATCH 00/13] nvdimm: Use more common kernel coding style

2019-09-12 Thread Nick Desaulniers

On Thu, Sep 12, 2019 at 3:38 PM Joe Perches  wrote:
>
> On Thu, 2019-09-12 at 23:58 +0200, Miguel Ojeda wrote:
> > On Thu, Sep 12, 2019 at 11:08 PM Joe Perches  wrote:
> > > Please name the major projects and then point to their
> > > .clang-format equivalents.
> > >
> > > Also note the size/scope/complexity of the major projects.
> >
> > Mozilla, WebKit, LLVM and Microsoft. They have their style distributed
> > with the official clang-format, not sure if they enforce it.
>
> At least for LLVM, it appears not.

I acknowledge the irony you present, but that's because there's no
enforcement on the LLVM side.  I frequently forget to run:
$ git-clang-format HEAD~

If you have automated systems that help encourage (ie. force) the use
of the formatter, this helps.

Consider the fact that not all kernel developers run checkpatch.pl.
Is that a deficiency in checkpatch.pl, or the lack of enforcement in
kernel developers' workflows?
-- 
Thanks,
~Nick Desaulniers

Re: [PATCH 00/13] nvdimm: Use more common kernel coding style

2019-09-12 Thread Nick Desaulniers

On Thu, Sep 12, 2019 at 2:58 PM Miguel Ojeda
 wrote:
>
> On Thu, Sep 12, 2019 at 11:08 PM Joe Perches  wrote:
> >
> > Please name the major projects and then point to their
> > .clang-format equivalents.
> >
> > Also note the size/scope/complexity of the major projects.
>
> Mozilla, WebKit, LLVM and Microsoft. They have their style distributed
> with the official clang-format, not sure if they enforce it.
>
> Same for Chromium/Chrome, but it looks like they indeed enforce it:
>
>   "A checkout should give you clang-format to automatically format C++
> code. By policy, Clang's formatting of code should always be accepted
> in code reviews."
>
> I would bet other Google projects do so as well (since Chandler
> Carruth has been giving talks about clang-format for 7+ years). Nick?

So Google3 (the internal monorepo that Android, Chromium, ChromiumOS,
Fuchsia are not a part of) is pretty sweet.  You cannot even post code
unless the linter has been run on it (presubmit hook), which for our
~350 millions LoC of C++ is clang-format.  If you bypass local
presubmit hooks, our code review tool ("critique") won't let you
submit code that fails lint presubmit checks.  I suspect the initial
conversion was probably committed by bots.

>
> I hope those are major enough. There is also precedent in other
> languages (e.g. Java, C#, Rust).

Yep! Other people coming to C/C++ from these languages find the
discussion about tabs vs spaces to be highly entertaining!  When you
have an automated code formatter and an agreed upon coding style (and
hopefully enforcement), you save so much time from avoided bikesheds!
Don't like the codebase's coding style?  Then write the code how you
like and just run the formatter when you're done (might not help with
conventions though, maybe that's where checkpatch.pl can shine).
Done! No more wasted time on what color to paint the bikeshed!
-- 
Thanks,
~Nick Desaulniers

Re: [PATCH] watchdog: f71808e_wdt: Add F81803 support

2019-09-12 Thread Jaret Cantu


On 9/12/19 2:50 PM, Guenter Roeck wrote:

On Thu, Sep 12, 2019 at 01:55:50PM -0400, Jaret Cantu wrote:

This adds watchdog support for the Fintek F81803 Super I/O chip.

Testing was done on the Seneca XK-QUAD.

Signed-off-by: Jaret Cantu 


Since there is no datasheet, we can only hope that this works
for other platforms using the same chip. Nothing we can do
about that, so


I did get the register descriptions after hounding the vendor's support 
team for a good long while, which is how I was able to get the watchdog 
working in the first place.  Nothing publicly available, however.


The only real difference between this part and others in the family is 
requiring a bank select before setting the WDTRST pin.  (And the 
registers/bits which have to be twiddled to do so, of course.)




Reviewed-by: Guenter Roeck 


---
  drivers/watchdog/Kconfig   |  4 ++--
  drivers/watchdog/f71808e_wdt.c | 17 -
  2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 8188963a405b..781ff835f2a4 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -1046,8 +1046,8 @@ config F71808E_WDT
depends on X86
help
  This is the driver for the hardware watchdog on the Fintek F71808E,
- F71862FG, F71868, F71869, F71882FG, F71889FG, F81865 and F81866
- Super I/O controllers.
+ F71862FG, F71868, F71869, F71882FG, F71889FG, F81803, F81865, and
+ F81866 Super I/O controllers.
  
  	  You can compile this driver directly into the kernel, or use

  it as a module.  The module will be called f71808e_wdt.
diff --git a/drivers/watchdog/f71808e_wdt.c b/drivers/watchdog/f71808e_wdt.c
index ff5cf1b48a4d..e46104c2fd94 100644
--- a/drivers/watchdog/f71808e_wdt.c
+++ b/drivers/watchdog/f71808e_wdt.c
@@ -31,8 +31,10 @@
  #define SIO_REG_DEVID 0x20/* Device ID (2 bytes) */
  #define SIO_REG_DEVREV0x22/* Device revision */
  #define SIO_REG_MANID 0x23/* Fintek ID (2 bytes) */
+#define SIO_REG_CLOCK_SEL  0x26/* Clock select */
  #define SIO_REG_ROM_ADDR_SEL  0x27/* ROM address select */
  #define SIO_F81866_REG_PORT_SEL   0x27/* F81866 Multi-Function 
Register */
+#define SIO_REG_TSI_LEVEL_SEL  0x28/* TSI Level select */
  #define SIO_REG_MFUNCT1   0x29/* Multi function select 1 */
  #define SIO_REG_MFUNCT2   0x2a/* Multi function select 2 */
  #define SIO_REG_MFUNCT3   0x2b/* Multi function select 3 */
@@ -49,6 +51,7 @@
  #define SIO_F71869A_ID0x1007  /* Chipset ID */
  #define SIO_F71882_ID 0x0541  /* Chipset ID */
  #define SIO_F71889_ID 0x0723  /* Chipset ID */
+#define SIO_F81803_ID  0x1210  /* Chipset ID */
  #define SIO_F81865_ID 0x0704  /* Chipset ID */
  #define SIO_F81866_ID 0x1010  /* Chipset ID */
  
@@ -108,7 +111,7 @@ MODULE_PARM_DESC(start_withtimeout, "Start watchdog timer on module load with"

" given initial timeout. Zero (default) disables this feature.");
  
  enum chips { f71808fg, f71858fg, f71862fg, f71868, f71869, f71882fg, f71889fg,

-f81865, f81866};
+f81803, f81865, f81866};
  
  static const char *f71808e_names[] = {

"f71808fg",
@@ -118,6 +121,7 @@ static const char *f71808e_names[] = {
"f71869",
"f71882fg",
"f71889fg",
+   "f81803",
"f81865",
"f81866",
  };
@@ -370,6 +374,14 @@ static int watchdog_start(void)
superio_inb(watchdog.sioaddr, SIO_REG_MFUNCT3) & 0xcf);
break;
  
+	case f81803:

+   /* Enable TSI Level register bank */
+   superio_clear_bit(watchdog.sioaddr, SIO_REG_CLOCK_SEL, 3);
+   /* Set pin 27 to WDTRST# */
+   superio_outb(watchdog.sioaddr, SIO_REG_TSI_LEVEL_SEL, 0x5f &
+   superio_inb(watchdog.sioaddr, SIO_REG_TSI_LEVEL_SEL));
+   break;
+
case f81865:
/* Set pin 70 to WDTRST# */
superio_clear_bit(watchdog.sioaddr, SIO_REG_MFUNCT3, 5);
@@ -809,6 +821,9 @@ static int __init f71808e_find(int sioaddr)
/* Confirmed (by datasheet) not to have a watchdog. */
err = -ENODEV;
goto exit;
+   case SIO_F81803_ID:
+   watchdog.type = f81803;
+   break;
case SIO_F81865_ID:
watchdog.type = f81865;
break;
--
2.11.0

Re: [rfc patch script] treewide conversion of __section(foo) to section("foo");

2019-09-12 Thread Nick Desaulniers

On Sun, Sep 8, 2019 at 9:21 PM Joe Perches  wrote:

> So running the script:
>
> $ perl section.pl
>
> produces a commit
> ---
> From 04e52f34fd4ee7008ea5bf0d8896bf8d1fdf9f3f Mon Sep 17 00:00:00 2001
> Message-Id: 
> <04e52f34fd4ee7008ea5bf0d8896bf8d1fdf9f3f.1568001863.git@perches.com>
> From: Joe Perches 
> Date: Sun, 8 Sep 2019 20:53:41 -0700
> Subject: [PATCH] treewide: Convert macro and uses of __section(foo) to
>  __section("foo")
>
> Use a more generic form for __section that requires quotes to avoid
> complications with clang and gcc differences.
>
> Remove the quote operator # from compiler_attributes.h __section macro.
>
> Convert all unquoted __section(foo) uses to quoted __section("foo").
> Also convert __attribute__((section("foo"))) uses to __section("foo")
> even if the __attribute__ has multiple list entry forms.
>
> Signed-off-by: Joe Perches 
> ---
>  arch/arc/include/asm/linkage.h|  8 +++---
>  arch/arc/include/asm/mach_desc.h  |  2 +-
>  arch/arc/plat-hsdk/platform.c |  2 +-
>  arch/arm/include/asm/cache.h  |  2 +-
>  arch/arm/include/asm/cpuidle.h|  2 +-
>  arch/arm/include/asm/idmap.h  |  2 +-
>  arch/arm/include/asm/kvm_hyp.h|  2 +-
>  arch/arm/include/asm/mach/arch.h  |  4 +--
>  arch/arm/include/asm/setup.h  |  2 +-
>  arch/arm/include/asm/smp.h|  2 +-
>  arch/arm/include/asm/tcm.h|  8 +++---
>  arch/arm/kernel/cpuidle.c |  2 +-
>  arch/arm/kernel/devtree.c |  2 +-
>  arch/arm64/include/asm/cache.h|  2 +-
>  arch/arm64/include/asm/exception.h|  2 +-
>  arch/arm64/include/asm/kvm_hyp.h  |  2 +-
>  arch/arm64/kernel/efi.c   |  2 +-
>  arch/arm64/kernel/smp_spin_table.c|  2 +-
>  arch/ia64/include/asm/cache.h |  2 +-
>  arch/microblaze/kernel/setup.c|  2 +-
>  arch/mips/include/asm/cache.h |  2 +-
>  arch/mips/include/asm/mach-pmcs-msp71xx/msp_pci.h |  4 +--
>  arch/mips/include/asm/machine.h   |  2 +-
>  arch/mips/include/asm/mips_machine.h  |  2 +-
>  arch/mips/kernel/setup.c  |  2 +-
>  arch/mips/mm/init.c   |  2 +-
>  arch/parisc/include/asm/cache.h   |  2 +-
>  arch/parisc/include/asm/ldcw.h|  2 +-
>  arch/parisc/kernel/ftrace.c   |  2 +-
>  arch/parisc/mm/init.c |  6 ++--
>  arch/powerpc/boot/main.c  |  2 +-
>  arch/powerpc/boot/ps3.c   |  2 +-
>  arch/powerpc/include/asm/cache.h  |  2 +-
>  arch/powerpc/include/asm/machdep.h|  2 +-
>  arch/powerpc/kernel/btext.c   |  2 +-
>  arch/powerpc/kernel/prom_init.c   |  2 +-
>  arch/powerpc/kvm/book3s_64_vio_hv.c   |  2 +-
>  arch/s390/boot/compressed/decompressor.c  |  2 +-
>  arch/s390/boot/ipl_parm.c |  4 +--
>  arch/s390/boot/startup.c  |  2 +-
>  arch/s390/include/asm/cache.h |  2 +-
>  arch/s390/include/asm/sections.h  |  4 +--
>  arch/s390/kernel/setup.c  |  2 +-
>  arch/s390/mm/init.c   |  2 +-
>  arch/sh/boards/of-generic.c   |  2 +-
>  arch/sh/include/asm/cache.h   |  2 +-
>  arch/sh/include/asm/machvec.h |  2 +-
>  arch/sh/include/asm/smp.h |  2 +-
>  arch/sparc/include/asm/cache.h|  2 +-
>  arch/sparc/kernel/btext.c |  2 +-
>  arch/um/include/shared/init.h | 22 +++
>  arch/um/kernel/skas/clone.c   |  2 +-
>  arch/um/kernel/um_arch.c  |  2 +-
>  arch/x86/boot/compressed/pgtable_64.c |  2 +-
>  arch/x86/boot/tty.c   |  8 +++---
>  arch/x86/boot/video.h |  2 +-
>  arch/x86/include/asm/apic.h   |  4 +--
>  arch/x86/include/asm/cache.h  |  2 +-
>  arch/x86/include/asm/intel-mid.h  |  2 +-
>  arch/x86/include/asm/iommu_table.h|  2 +-
>  arch/x86/include/asm/irqflags.h   |  2 +-
>  arch/x86/include/asm/mem_encrypt.h|  2 +-
>  arch/x86/include/asm/setup.h  |  2 +-
>  arch/x86/kernel/cpu/cpu.h |  2 +-
>  arch/x86/kernel/head64.c  |  2 +-
>  arch/x86/mm/mem_encrypt.c |  4 +--
>  arch/x86/mm/mem_encrypt_identity.c|  2

Re: [PATCH 00/13] nvdimm: Use more common kernel coding style

On Thu, 2019-09-12 at 23:58 +0200, Miguel Ojeda wrote:
> On Thu, Sep 12, 2019 at 11:08 PM Joe Perches  wrote:
> > Please name the major projects and then point to their
> > .clang-format equivalents.
> > 
> > Also note the size/scope/complexity of the major projects.
> 
> Mozilla, WebKit, LLVM and Microsoft. They have their style distributed
> with the official clang-format, not sure if they enforce it.

At least for LLVM, it appears not.

I just tried a very small portion of the clang compiler:

$ git ls-files llvm/lib/CodeGen/ | wc -l
293
$ git ls-files llvm/lib/CodeGen/ | xargs clang-format -i

and got:

$ git diff --shortstat
 245 files changed, 19519 insertions(+), 17794 deletions(-)

btw: that seems a pretty small ~7% of the overall lines

$ git ls-files llvm/lib/CodeGen/ | xargs wc -l | tail -1
 251034 total

Re: [RFC] ARM: dts: omap36xx: Enable thermal throttling

2019-09-12 Thread Daniel Lezcano



Hi Adam,

On 12/09/2019 23:19, Adam Ford wrote:
> On Thu, Sep 12, 2019 at 4:12 PM Daniel Lezcano
>  wrote:
>>
>> On 12/09/2019 20:30, Adam Ford wrote:
>>> The thermal sensor in the omap3 family isn't accurate, but it's
>>> better than nothing.  The various OPP's enabled for the omap3630
>>> support up to OPP1G, however the datasheet for the DM3730 states
>>> that OPP130 and OPP1G are not available above TJ of 90C.
>>>
>>> This patch configures the thermal throttling to limit the
>>> operating points of the omap3630 to Only OPP50 and OPP100 if
>>> the thermal sensor reads a value above 90C.

Oh, that's a very interesting use case.

AFAICT the thermal framework is not designed to deal with this
situation. I agree this setup may work (even if I'm not convinced about
the stability of the whole).

May be Viresh can help for the cpufreq side?

>> Out of curiosity, what are the OPP50 and OPP100 mentioned above? and
>> what does mean "OPP130 and OPP1G are not available above TJ of 90C"?
>>
> OPP130 is the 800 MHz and OPP1G is 1GHz operating point.
> The 90C is the max junction temperature.  When the temperature exceeds
> 90C, the processor is not designed to operate at 800+ MHz.  The
> statement itself is a direct quote from the public datasheet for the
> dm3730, Table 4-19.

> The datasheet is: http://www.ti.com/lit/ds/symlink/dm3730.pdf

It is ambiguous how it is stated:

"OPP130 and OPP1G are not available above TJ of 90C"

that can be interpreted the OPP is disabled by the hardware, no?

> The operating points were updated in [1], but they haven't yet been
> fully applied yet, but during the discussion, the question came about
> regarding how to limit the speed at high temp, so that's why this
> patch was done.
> 
> [1] - https://patchwork.kernel.org/patch/11141643/

I see, you switched to opp-v2.

Thanks for the detailed answer.



>> I don't see the connection between these OPP names and the definition in
>> the DT.
>>
>>> Signed-off-by: Adam Ford 
>>>
>>> diff --git a/arch/arm/boot/dts/omap36xx.dtsi 
>>> b/arch/arm/boot/dts/omap36xx.dtsi
>>> index 4bb4f534afe2..58b9d347019f 100644
>>> --- a/arch/arm/boot/dts/omap36xx.dtsi
>>> +++ b/arch/arm/boot/dts/omap36xx.dtsi
>>> @@ -25,6 +25,7 @@
>>>
>>>   vbb-supply = <&abb_mpu_iva>;
>>>   clock-latency = <30>; /* From omap-cpufreq driver 
>>> */
>>> + #cooling-cells = <2>;
>>>   };
>>>   };
>>>
>>> @@ -195,6 +196,31 @@
>>>   };
>>>  };
>>>
>>> +&cpu_thermal {
>>> + cpu_trips: trips {
>>> + /* OPP130 and OPP1G are not available above TJ of 90C. */
>>> + cpu_alert0: cpu_alert {
>>> + temperature = <9>; /* millicelsius */
>>> + hysteresis = <2000>; /* millicelsius */
>>> + type = "passive";
>>> + };
>>> +
>>> + cpu_crit: cpu_crit {
>>> + temperature = <125000>; /* millicelsius */
>>> + hysteresis = <2000>; /* millicelsius */
>>> + type = "critical";
>>> + };
>>> + };
>>> +
>>> + cpu_cooling_maps: cooling-maps {
>>> + map0 {
>>> + trip = <&cpu_alert0>;
>>> + /* Only allow OPP50 and OPP100 */
>>> + cooling-device = <&cpu 0 1>;
>>> + };
>>> + };
>>> +};
>>> +
>>>  /* OMAP3630 needs dss_96m_fck for VENC */
>>>  &venc {
>>>   clocks = <&dss_tv_fck>, <&dss_96m_fck>;
>>>
>>
>>
>> --
>>   Linaro.org │ Open source software for ARM SoCs
>>
>> Follow Linaro:   Facebook |
>>  Twitter |
>>  Blog
>>


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog

Re: [PATCH v3 0/6] make use of gcc 9's "asm inline()"

2019-09-12 Thread Miguel Ojeda

On Fri, Sep 13, 2019 at 12:19 AM Rasmus Villemoes
 wrote:
>
> Patch 1 has already been picked up by Greg in staging-next, it's
> included here for completeness. I don't know how to route the rest, or
> if they should simply wait for 5.5 given how close we are to the merge
> window for 5.4.

If you want I can pick this up in compiler-attributes and submit it as
a whole if we get Acks from rtl8723bs/x86/...maintainers.

Cheers,
Miguel

[PATCH v3 4/6] compiler-types.h: add asm_inline definition

This adds an asm_inline macro which expands to "asm inline" [1] when
the compiler supports it. This is currently gcc 9.1+, gcc 8.3
and (once released) gcc 7.5 [2]. It expands to just "asm" for other
compilers.

Using asm inline("foo") instead of asm("foo") overrules gcc's
heuristic estimate of the size of the code represented by the asm()
statement, and makes gcc use the minimum possible size instead. That
can in turn affect gcc's inlining decisions.

I wasn't sure whether to make this a function-like macro or not - this
way, it can be combined with volatile as

  asm_inline volatile()

but perhaps we'd prefer to spell that

  asm_inline_volatile()

anyway.

The Kconfig logic is taken from an RFC patch by Masahiro Yamada [3].

[1] Technically, asm __inline, since both inline and __inline__
are macros that attach various attributes, making gcc barf if one
literally does "asm inline()". However, the third spelling __inline is
available for referring to the bare keyword.

[2] https://lore.kernel.org/lkml/20190907001411.gg9...@gate.crashing.org/

[3] 
https://lore.kernel.org/lkml/1544695154-15250-1-git-send-email-yamada.masah...@socionext.com/

Signed-off-by: Rasmus Villemoes 
---
 include/linux/compiler_types.h | 6 ++
 init/Kconfig   | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index ee49be6d6088..2bf316fe0a20 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -198,6 +198,12 @@ struct ftrace_likely_data {
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
 
+#ifdef CONFIG_CC_HAS_ASM_INLINE
+#define asm_inline asm __inline
+#else
+#define asm_inline asm
+#endif
+
 #ifndef __no_fgcse
 # define __no_fgcse
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index bd7d650d4a99..7fee5978dd73 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -30,6 +30,9 @@ config CC_CAN_LINK
 config CC_HAS_ASM_GOTO
def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC))
 
+config CC_HAS_ASM_INLINE
+   def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) 
-x c - -c -o /dev/null)
+
 config CC_HAS_WARN_MAYBE_UNINITIALIZED
def_bool $(cc-option,-Wmaybe-uninitialized)
help
-- 
2.20.1

[PATCH v3 3/6] compiler_types.h: don't #define __inline

The spellings __inline and __inline__ should be reserved for uses
where one really wants to refer to the inline keyword, regardless of
whether or not the spelling "inline" has been #defined to something
else. Due to use of __inline__ in uapi headers, we can't easily get
rid of the definition of __inline__. However, almost all users of
__inline has been converted to inline, so we can get rid of that
#define.

The exception is include/acpi/platform/acintel.h. However, that header
is only included when using the intel compiler (does anybody actually
build the kernel with that?), and the ACPI_INLINE macro is only used
in the definition of utterly trivial stub functions, where I doubt a
small change of semantics (lack of __gnu_inline) changes anything.

Signed-off-by: Rasmus Villemoes 
---
 include/linux/compiler_types.h | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 599c27b56c29..ee49be6d6088 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -150,8 +150,17 @@ struct ftrace_likely_data {
__maybe_unused notrace
 #endif
 
+/*
+ * gcc provides both __inline__ and __inline as alternate spellings of
+ * the inline keyword, though the latter is undocumented. New kernel
+ * code should only use the inline spelling, but some existing code
+ * uses __inline__. Since we #define inline above, to ensure
+ * __inline__ has the same semantics, we need this #define.
+ *
+ * However, the spelling __inline is strictly reserved for referring
+ * to the bare keyword.
+ */
 #define __inline__ inline
-#define __inline   inline
 
 /*
  * Rather then using noinline to prevent stack consumption, use
-- 
2.20.1

Re: [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT

2019-09-12 Thread Joel Fernandes

On Thu, Sep 12, 2019 at 05:38:43PM -0400, Joel Fernandes wrote:
> Hi Scott,
> 
> Would you mind CC'ing r...@vger.kernel.org on RCU related patches? I added it
> for this time.
> 
> On Wed, Sep 11, 2019 at 05:57:28PM +0100, Scott Wood wrote:
> > Besides restoring behavior that used to be default on RT, this avoids
> > a deadlock on scheduler locks:
[snip]
> > [  136.995194] 039:  May be due to missing lock nesting notation
> > 
> > [  137.001115] 039: 3 locks held by rcu_torture_rea/13474:
> > [  137.006341] 039:  #0:
> > [  137.008707] 039: 5f25146d
> > [  137.012024] 039:  (
> > [  137.014131] 039: &p->pi_lock
> > [  137.017015] 039: ){-...}
> > [  137.019558] 039: , at: try_to_wake_up+0x39/0x920
> > [  137.024175] 039:  #1:
> > [  137.026540] 039: 11c8e51d
> > [  137.029859] 039:  (
> > [  137.031966] 039: &rq->lock
> > [  137.034679] 039: ){-...}
> > [  137.037217] 039: , at: try_to_wake_up+0x241/0x920
> > [  137.041924] 039:  #2:
> > [  137.044291] 039: 098649b9
> > [  137.047610] 039:  (
> > [  137.049714] 039: rcu_read_lock
> > [  137.052774] 039: ){}
> > [  137.055314] 039: , at: cpuacct_charge+0x33/0x1e0
> > [  137.059934] 039:
> > stack backtrace:
> > [  137.063425] 039: CPU: 39 PID: 13474 Comm: rcu_torture_rea Kdump: loaded 
> > Tainted: GE 5.2.9-rt3.dbg+ #174
> > [  137.074197] 039: Hardware name: Intel Corporation S2600BT/S2600BT, BIOS 
> > SE5C620.86B.01.00.0763.022420181017 02/24/2018
> > [  137.084886] 039: Call Trace:
> > [  137.087773] 039:  
> > [  137.090226] 039:  dump_stack+0x5e/0x8b
> > [  137.093997] 039:  __lock_acquire+0x725/0x1100
> > [  137.098358] 039:  lock_acquire+0xc0/0x240
> > [  137.102374] 039:  ? try_to_wake_up+0x39/0x920
> > [  137.106737] 039:  _raw_spin_lock_irqsave+0x47/0x90
> > [  137.111534] 039:  ? try_to_wake_up+0x39/0x920
> > [  137.115910] 039:  try_to_wake_up+0x39/0x920
> > [  137.120098] 039:  rcu_read_unlock_special+0x65/0xb0
> > [  137.124977] 039:  __rcu_read_unlock+0x5d/0x70
> > [  137.129337] 039:  cpuacct_charge+0xd9/0x1e0
> > [  137.133522] 039:  ? cpuacct_charge+0x33/0x1e0
> > [  137.137880] 039:  update_curr+0x14b/0x420
> > [  137.141894] 039:  enqueue_entity+0x42/0x370
> > [  137.146080] 039:  enqueue_task_fair+0xa9/0x490
> > [  137.150528] 039:  activate_task+0x5a/0xf0
> > [  137.154539] 039:  ttwu_do_activate+0x4e/0x90
> > [  137.158813] 039:  try_to_wake_up+0x277/0x920
> > [  137.163086] 039:  irq_exit+0xb6/0xf0
[snip]
> > Signed-off-by: Scott Wood 
> > ---
> > The prohibition on use_softirq should be able to be dropped once RT gets
> > the latest RCU code, but the question of what use_softirq should default
> > to on PREEMPT_RT remains.
> > 
> > v3: Use IS_ENABLED
> 
> Out of curiosity, does PREEMPT_RT use the NOCB callback offloading? If no,
> should it use it? IIUC, that does make the work the softirq have to do less
> work since the callbacks are executed in threaded context.
> 
> If yes, can RT tolerate use_softirq=false and what could a realistic softirq

s/use_softirq=false/use_softirq=true/

thanks,

 - Joel

[PATCH v3 6/6] x86: bug.h: use asm_inline in _BUG_FLAGS definitions

This helps preventing a BUG* or WARN* in some static inline from
preventing that (or one of its callers) being inlined, so should allow
gcc to make better informed inlining decisions.

For example, with gcc 9.2, tcp_fastopen_no_cookie() vanishes from
net/ipv4/tcp_fastopen.o. It does not itself have any BUG or WARN, but
it calls dst_metric() which has a WARN_ON_ONCE - and despite that
WARN_ON_ONCE vanishing since the condition is compile-time false,
dst_metric() is apparently sufficiently "large" that when it gets
inlined into tcp_fastopen_no_cookie(), the latter becomes too large
for inlining.

Overall, if one asks size(1), .text decreases a little and .data
increases by about the same amount (x86-64 defconfig)

$ size vmlinux.{before,after}
   textdata bss dec hex filename
197097265202600 1630280 26542606195020e vmlinux.before
197093305203068 1630280 265426781950256 vmlinux.after

while bloat-o-meter says

add/remove: 10/28 grow/shrink: 103/51 up/down: 3669/-2854 (815)
...
Total: Before=14783683, After=14784498, chg +0.01%

Signed-off-by: Rasmus Villemoes 
---
 arch/x86/include/asm/bug.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 6804d6642767..facba9bc30ca 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -32,7 +32,7 @@
 
 #define _BUG_FLAGS(ins, flags) \
 do {   \
-   asm volatile("1:\t" ins "\n"\
+   asm_inline volatile("1:\t" ins "\n" \
 ".pushsection __bug_table,\"aw\"\n"\
 "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n"   \
 "\t"  __BUG_REL(%c0) "\t# bug_entry::file\n"   \
@@ -49,7 +49,7 @@ do {  
\
 
 #define _BUG_FLAGS(ins, flags) \
 do {   \
-   asm volatile("1:\t" ins "\n"\
+   asm_inline volatile("1:\t" ins "\n" \
 ".pushsection __bug_table,\"aw\"\n"\
 "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n"   \
 "\t.word %c0""\t# bug_entry::flags\n"  \
-- 
2.20.1

[PATCH v3 2/6] lib/zstd/mem.h: replace __inline by inline

Currently, compiler_types.h #defines __inline as inline (and further
#defines inline to automatically attach some attributes), so this does
not change functionality. It serves as preparation for removing the
#define of __inline.

While at it, also remove the __attribute__((unused)) - it's already
included in the definition of the inline macro, and "open-coded"
__attribute__(()) should be avoided.

Since commit a95b37e20db9 (kbuild: get  out of
), compiler_types.h is automatically included by all
kernel C code - i.e., the definition of inline including the unused
attribute is guaranteed to be in effect whenever ZSTD_STATIC is
expanded.

Signed-off-by: Rasmus Villemoes 
---
 lib/zstd/mem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/zstd/mem.h b/lib/zstd/mem.h
index 3a0f34c8706c..93d7a2c377fe 100644
--- a/lib/zstd/mem.h
+++ b/lib/zstd/mem.h
@@ -27,7 +27,7 @@
 /*-
 *  Compiler specifics
 **/
-#define ZSTD_STATIC static __inline __attribute__((unused))
+#define ZSTD_STATIC static inline
 
 /*-**
 *  Basic Types
-- 
2.20.1

[PATCH v3 5/6] x86: alternative.h: use asm_inline for all alternative variants