[PATCH v13 0/3] mfd: Add Renesas R-Car Gen3 RPC-IF MFD & SPI driver
Hi, v13 patch including: 1) rename mfd to ddata for SPI driver. 2) Patch RPC-IF devicetree for SPI and HyperFlash. v12 patch including: 1) add back "wbuf" in dts example. 2) RPC-IF replace rpc-if in dts. v11 patch including: 1) Patch mfd include header file. 2) mfd coding style. 3) add back wbuf description in dts. v10 patch including: 1) Address range for > 64M byte flash. 2) Removed dirmap_write due to WBUF 256 bytes transfer issue. 3) Dummy bytes setting according to spi-nor.c layer. v9 patch is for RPC MFD driver and RPC SPI driver. v8 patch including: 1) Supported SoC-specific values in DTS. 2) Rename device node name as flash. v7 patch is according to Geert and Sergei's comments: 1) Add all R-Car Gen3 model in dts. 2) patch rpc-if child node search. 3) minor coding style. v6 patch is according to Geert, Marek and Sergei's comments: 1) spi_controller for new code. 2) "renesas,rcar-gen3-rpc" instead of "renesas,r8a77995-rpc." 3) patch external address read mode w/o u64 readq(). 4) patch dts for write buffer & drop "renesas,rpc-mode". 5) coding style and so on. v5 patch is according to Sergei's comments: 1) Read 6 bytes ID from Sergei's patch. 2) regmap_update_bits(). 3) C++ style comment. v4 patch is according to Sergei's comments including: 1) Drop soc_device_match(). 2) Drop unused RPC registers. 3) Use ilog2() instead of fls(). 4) Patch read 6 bytes ID w/ one command. 5) Coding style and so on. v3 patch is according to Marek and Geert's comments including: 1) soc_device_match() to set up RPC_PHYCNT_STRTIM. 2) get_unaligned(). 3) rpc-mode for rpi-spi-flash or rpc-hyperflash. 4) coding style and so on. v2 patch including: 1) remove RPC clock enable/disable control, 2) patch run time PM. 3) add RPC module software reset, 4) add regmap. 5) other coding style and so on. thanks for your review. 
best regards, Mason Mason Yang (3): mfd: Add Renesas R-Car Gen3 RPC-IF MFD driver spi: Add Renesas R-Car Gen3 RPC-IF SPI controller driver dt-bindings: mfd: Document Renesas R-Car Gen3 RPC-IF controller bindings .../devicetree/bindings/mfd/renesas-rpc-if.txt | 65 +++ drivers/mfd/Kconfig| 9 + drivers/mfd/Makefile | 1 + drivers/mfd/renesas-rpc.c | 125 + drivers/spi/Kconfig| 6 + drivers/spi/Makefile | 1 + drivers/spi/spi-renesas-rpc.c | 573 + include/linux/mfd/renesas-rpc.h| 141 + 8 files changed, 921 insertions(+) create mode 100644 Documentation/devicetree/bindings/mfd/renesas-rpc-if.txt create mode 100644 drivers/mfd/renesas-rpc.c create mode 100644 drivers/spi/spi-renesas-rpc.c create mode 100644 include/linux/mfd/renesas-rpc.h -- 1.9.1
[PATCH v13 1/3] mfd: Add Renesas R-Car Gen3 RPC-IF MFD driver
Add a driver for Renesas R-Car Gen3 RPC-IF MFD Signed-off-by: Mason Yang --- drivers/mfd/Kconfig | 9 +++ drivers/mfd/Makefile| 1 + drivers/mfd/renesas-rpc.c | 125 +++ include/linux/mfd/renesas-rpc.h | 141 4 files changed, 276 insertions(+) create mode 100644 drivers/mfd/renesas-rpc.c create mode 100644 include/linux/mfd/renesas-rpc.h diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 294d956..cdbde79 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1002,6 +1002,15 @@ config MFD_RDC321X southbridge which provides access to GPIOs and Watchdog using the southbridge PCI device configuration space. +config MFD_RENESAS_RPC + tristate "Renesas R-Car Gen3 RPC-IF controller driver" + select MFD_CORE + depends on ARCH_RENESAS + help + This supports Renesas R-Car Gen3 RPC-IF controller which provides + either SPI host or HyperFlash. + You have to select individual components under the corresponding menu. + config MFD_RT5033 tristate "Richtek RT5033 Power Management IC" depends on I2C diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 52b1a90..459eb2f 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -184,6 +184,7 @@ obj-$(CONFIG_MFD_INTEL_QUARK_I2C_GPIO) += intel_quark_i2c_gpio.o obj-$(CONFIG_LPC_SCH) += lpc_sch.o obj-$(CONFIG_LPC_ICH) += lpc_ich.o obj-$(CONFIG_MFD_RDC321X) += rdc321x-southbridge.o +obj-$(CONFIG_MFD_RENESAS_RPC) += renesas-rpc.o obj-$(CONFIG_MFD_JANZ_CMODIO) += janz-cmodio.o obj-$(CONFIG_MFD_JZ4740_ADC) += jz4740-adc.o obj-$(CONFIG_MFD_TPS6586X) += tps6586x.o diff --git a/drivers/mfd/renesas-rpc.c b/drivers/mfd/renesas-rpc.c new file mode 100644 index 000..c80c8d1 --- /dev/null +++ b/drivers/mfd/renesas-rpc.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Copyright (C) 2018 ~ 2019 Renesas Solutions Corp. +// Copyright (C) 2019 Macronix International Co., Ltd. 
+// +// R-Car Gen3 RPC-IF MFD driver +// +// Author: +// Mason Yang +// + +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell rpc_hf_ctlr = { + .name = "rpc-hf", +}; + +static const struct mfd_cell rpc_spi_ctlr = { + .name = "rpc-spi", +}; + +static const struct regmap_range rpc_mfd_volatile_ranges[] = { + regmap_reg_range(RPC_SMRDR0, RPC_SMRDR1), + regmap_reg_range(RPC_SMWDR0, RPC_SMWDR1), + regmap_reg_range(RPC_CMNSR, RPC_CMNSR), +}; + +static const struct regmap_access_table rpc_mfd_volatile_table = { + .yes_ranges = rpc_mfd_volatile_ranges, + .n_yes_ranges = ARRAY_SIZE(rpc_mfd_volatile_ranges), +}; + +static const struct regmap_config rpc_mfd_regmap_config = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .fast_io = true, + .max_register = RPC_PHYOFFSET2, + .volatile_table = &rpc_mfd_volatile_table, +}; + +static int rpc_mfd_probe(struct platform_device *pdev) +{ + struct device_node *flash; + const struct mfd_cell *cell; + struct resource *res; + struct rpc_mfd *rpc; + void __iomem *base; + + flash = of_get_next_child(pdev->dev.of_node, NULL); + if (!flash) { + dev_warn(&pdev->dev, "no flash node found\n"); + return -ENODEV; + } + + if (of_device_is_compatible(flash, "jedec,spi-nor")) { + cell = &rpc_spi_ctlr; + } else if (of_device_is_compatible(flash, "cfi-flash")) { + cell = &rpc_hf_ctlr; + } else { + dev_warn(&pdev->dev, "unknown flash type\n"); + return -ENODEV; + } + + rpc = devm_kzalloc(&pdev->dev, sizeof(*rpc), GFP_KERNEL); + if (!rpc) + return -ENOMEM; + + rpc->clk_rpc = devm_clk_get(&pdev->dev, "rpc"); + if (IS_ERR(rpc->clk_rpc)) + return PTR_ERR(rpc->clk_rpc); + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "regs"); + base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(base)) + return PTR_ERR(base); + + rpc->regmap = devm_regmap_init_mmio(&pdev->dev, base, + &rpc_mfd_regmap_config); + if (IS_ERR(rpc->regmap)) { + dev_err(&pdev->dev, + "failed to init regmap 
for rpc-mfd, error %ld\n", + PTR_ERR(rpc->regmap)); + return PTR_ERR(rpc->regmap); + } + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dirmap"); + rpc->dirmap = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(rpc->dirmap)) + rpc->dirmap = NULL; + + rpc->rstc = devm_reset_control_get_exclusive(&pdev->dev, NULL); + if (IS_ERR(rpc->rstc)) + re
[PATCH v13 2/3] spi: Add Renesas R-Car Gen3 RPC-IF SPI controller driver
Add a driver for Renesas R-Car Gen3 RPC-IF SPI controller. Signed-off-by: Mason Yang Signed-off-by: Sergei Shtylyov --- drivers/spi/Kconfig | 6 + drivers/spi/Makefile | 1 + drivers/spi/spi-renesas-rpc.c | 573 ++ 3 files changed, 580 insertions(+) create mode 100644 drivers/spi/spi-renesas-rpc.c diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 0fba8f4..229f6d7 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -570,6 +570,12 @@ config SPI_RSPI help SPI driver for Renesas RSPI and QSPI blocks. +config SPI_RENESAS_RPC + tristate "Renesas R-Car Gen3 RPC-IF SPI controller" + depends on ARCH_RENESAS || COMPILE_TEST + help + SPI driver for Renesas R-Car Gen3 RPC-IF. + config SPI_QCOM_QSPI tristate "QTI QSPI controller" depends on ARCH_QCOM diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile index f2f78d0..2a6e052 100644 --- a/drivers/spi/Makefile +++ b/drivers/spi/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_SPI_QUP) += spi-qup.o obj-$(CONFIG_SPI_ROCKCHIP) += spi-rockchip.o obj-$(CONFIG_SPI_RB4XX)+= spi-rb4xx.o obj-$(CONFIG_SPI_RSPI) += spi-rspi.o +obj-$(CONFIG_SPI_RENESAS_RPC) += spi-renesas-rpc.o obj-$(CONFIG_SPI_S3C24XX) += spi-s3c24xx-hw.o spi-s3c24xx-hw-y := spi-s3c24xx.o spi-s3c24xx-hw-$(CONFIG_SPI_S3C24XX_FIQ) += spi-s3c24xx-fiq.o diff --git a/drivers/spi/spi-renesas-rpc.c b/drivers/spi/spi-renesas-rpc.c new file mode 100644 index 000..86931cf --- /dev/null +++ b/drivers/spi/spi-renesas-rpc.c @@ -0,0 +1,573 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Copyright (C) 2018 ~ 2019 Renesas Solutions Corp. +// Copyright (C) 2019 Macronix International Co., Ltd. 
+// +// R-Car Gen3 RPC-IF SPI/QSPI/Octa driver +// +// Author: +// Mason Yang +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct rpc_spi { + struct rpc_mfd *ddata; + u32 cur_speed_hz; + u32 cmd; + u32 addr; + u32 dummy; + u32 smcr; + u32 smenr; + u32 xferlen; + u32 totalxferlen; + enum spi_mem_data_dir xfer_dir; +}; + +static int rpc_spi_set_freq(struct rpc_spi *rpc, unsigned long freq) +{ + int ret; + + if (rpc->cur_speed_hz == freq) + return 0; + + ret = clk_set_rate(rpc->ddata->clk_rpc, freq); + if (ret) + return ret; + + rpc->cur_speed_hz = freq; + return ret; +} + +static void rpc_spi_hw_init(struct rpc_spi *rpc) +{ + // + // NOTE: The 0x260 are undocumented bits, but they must be set. + // RPC_PHYCNT_STRTIM is strobe timing adjustment bit, + // 0x0 : the delay is biggest, + // 0x1 : the delay is 2nd biggest, + // On H3 ES1.x, the value should be 0, while on others, + // the value should be 6. + // + regmap_write(rpc->ddata->regmap, RPC_PHYCNT, RPC_PHYCNT_CAL | + RPC_PHYCNT_STRTIM(6) | 0x260); + + // + // NOTE: The 0x1511144 are undocumented bits, but they must be set + // for RPC_PHYOFFSET1. + // The 0x31 are undocumented bits, but they must be set + // for RPC_PHYOFFSET2. 
+ // + regmap_write(rpc->ddata->regmap, RPC_PHYOFFSET1, +RPC_PHYOFFSET1_DDRTMG(3) | 0x1511144); + regmap_write(rpc->ddata->regmap, RPC_PHYOFFSET2, 0x31 | +RPC_PHYOFFSET2_OCTTMG(4)); + regmap_write(rpc->ddata->regmap, RPC_SSLDR, RPC_SSLDR_SPNDL(7) | +RPC_SSLDR_SLNDL(7) | RPC_SSLDR_SCKDL(7)); + regmap_write(rpc->ddata->regmap, RPC_CMNCR, RPC_CMNCR_MD | +RPC_CMNCR_SFDE | RPC_CMNCR_MOIIO_HIZ | RPC_CMNCR_IOFV_HIZ | +RPC_CMNCR_BSZ(0)); +} + +static int wait_msg_xfer_end(struct rpc_spi *rpc) +{ + u32 sts; + + return regmap_read_poll_timeout(rpc->ddata->regmap, RPC_CMNSR, sts, + sts & RPC_CMNSR_TEND, 0, USEC_PER_SEC); +} + +static u8 rpc_bits_set(u32 nbytes) +{ + nbytes = clamp(nbytes, 1U, 4U); + + return GENMASK(3, 4 - nbytes); +} + +static int rpc_spi_io_xfer(struct rpc_spi *rpc, + const void *tx_buf, void *rx_buf) +{ + u32 smenr, smcr, data, pos = 0; + int ret; + + regmap_update_bits(rpc->ddata->regmap, RPC_CMNCR, RPC_CMNCR_MD, + RPC_CMNCR_MD); + regmap_write(rpc->ddata->regmap, RPC_SMDRENR, 0); + regmap_write(rpc->ddata->regmap, RPC_SMCMR, rpc->cmd); + regmap_write(rpc->ddata->regmap, RPC_SMDMCR, rpc->dummy); + regmap_write(rpc->ddata->regmap, RPC_SMADR, rpc->addr); + smenr = rpc->smenr; + + if (tx_buf) { +
[PATCH v13 3/3] dt-bindings: mfd: Document Renesas R-Car Gen3 RPC-IF controller bindings
Document the bindings used by the Renesas R-Car Gen3 RPC-IF controller. Signed-off-by: Mason Yang --- .../devicetree/bindings/mfd/renesas-rpc-if.txt | 65 ++ 1 file changed, 65 insertions(+) create mode 100644 Documentation/devicetree/bindings/mfd/renesas-rpc-if.txt diff --git a/Documentation/devicetree/bindings/mfd/renesas-rpc-if.txt b/Documentation/devicetree/bindings/mfd/renesas-rpc-if.txt new file mode 100644 index 000..20ec85b --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/renesas-rpc-if.txt @@ -0,0 +1,65 @@ +Renesas R-Car Gen3 RPC-IF controller Device Tree Bindings +- + +RPC-IF supports both SPI NOR and HyperFlash (CFI-compliant flash) + +Required properties: +- compatible: should be an SoC-specific compatible value, followed by + "renesas,rcar-gen3-rpc" as a fallback. + supported SoC-specific values are: + "renesas,r8a77995-rpc" (R-Car D3) +- reg: should contain three register areas: + first for RPC-IF registers, + second for the direct mapping read mode and + third for the write buffer area. 
+- reg-names: should contain "regs", "dirmap" and "wbuf" +- clocks: should contain 1 entries for the module's clock +- clock-names: should contain "rpc" +- power-domains: should contain system-controller(sysc) for power-domain-cell +- resets: should contain clock pulse generator(cpg) for reset-cell, + power-domain-cell and clock-cell +- #address-cells: should be 1 +- #size-cells: should be 0 + +Example: +- SPI mode: + + rpc: spi@ee20 { + compatible = "renesas,r8a77995-rpc", "renesas,rcar-gen3-rpc"; + reg = <0 0xee20 0 0x200>, <0 0x0800 0 0x400>, + <0 0xee208000 0 0x100>; + reg-names = "regs", "dirmap", "wbuf"; + clocks = <&cpg CPG_MOD 917>; + clock-names = "rpc"; + power-domains = <&sysc R8A77995_PD_ALWAYS_ON>; + resets = <&cpg 917>; + #address-cells = <1>; + #size-cells = <0>; + + flash@0 { + compatible = "jedec,spi-nor"; + reg = <0>; + spi-max-frequency = <4000>; + spi-tx-bus-width = <1>; + spi-rx-bus-width = <1>; + }; + }; + +- HF mode: + rpc: spi@ee20 { + compatible = "renesas,r8a77995-rpc", "renesas,rcar-gen3-rpc"; + reg = <0 0xee20 0 0x200>, <0 0x0800 0 0x400>, + <0 0xee208000 0 0x100>; + reg-names = "regs", "dirmap", "wbuf"; + clocks = <&cpg CPG_MOD 917>; + clock-names = "rpc"; + power-domains = <&sysc R8A77995_PD_ALWAYS_ON>; + resets = <&cpg 917>; + #address-cells = <1>; + #size-cells = <0>; + + flash@0 { + compatible = "cypress,hyperflash", "cfi-flash"; + reg = <0>; + }; + }; -- 1.9.1
Re: [v2 PATCH] mm: vmscan: correct nr_reclaimed for THP
[ check_move_unevictable_pages() seems weird. It gets a pagevec from find_get_entries(), which, if I understand the THP page cache code correctly, might contain the same compound page over and over. It'll be !unevictable after the first iteration, so will only run once. So it produces incorrect numbers now, but it is probably best to ignore it until we figure out THP cache. Maybe add an XXX comment. ] The commit 5fd4ca2d84b2 ("mm: page cache: store only head pages in i_pages") changed how THP is stored in page cache, but find_get_entries() would return base page by calling find_subpage(), so check_move_unevictable_pages() should just return the number of base pages.
Re: [PATCH] nvme: target: use struct_size() in kmalloc()
Looks ok to me, although for a fixed size argument the whole overflow detection thing in struct_size() is rather pointless..
Re: [PATCH] powerpc/mm: mark more tlb functions as __always_inline
Le 21/05/2019 à 08:16, Masahiro Yamada a écrit : With CONFIG_OPTIMIZE_INLINING enabled, Laura Abbott reported error with gcc 9.1.1: arch/powerpc/mm/book3s64/radix_tlb.c: In function '_tlbiel_pid': arch/powerpc/mm/book3s64/radix_tlb.c:104:2: warning: asm operand 3 probably doesn't match constraints 104 | asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) | ^~~ arch/powerpc/mm/book3s64/radix_tlb.c:104:2: error: impossible constraint in 'asm' Fixing _tlbiel_pid() is enough to address the warning above, but I inlined more functions to fix all potential issues. To meet the 'i' (immediate) constraint for the asm operands, functions propagating propagated 'ric' must be always inlined. Fixes: 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING") Reported-by: Laura Abbott Signed-off-by: Masahiro Yamada --- arch/powerpc/mm/book3s64/hash_native.c | 8 +++-- arch/powerpc/mm/book3s64/radix_tlb.c | 44 +++--- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index aaa28fd918fe..bc2c35c0d2b1 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -60,9 +60,11 @@ static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) * tlbiel instruction for hash, set invalidation * i.e., r=1 and is=01 or is=10 or is=11 */ -static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) +static __always_inline void tlbiel_hash_set_isa300(unsigned int set, + unsigned int is, + unsigned int pid, + unsigned int ric, + unsigned int prs) Please don't split the line more than it is. 
powerpc accepts lines up to 90 chars, see arch/powerpc/tools/checkpatch.pl { unsigned long rb; unsigned long rs; diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 4d841369399f..91c4242c1be3 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -29,9 +29,11 @@ * tlbiel instruction for radix, set invalidation * i.e., r=1 and is=01 or is=10 or is=11 */ -static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) +static __always_inline void tlbiel_radix_set_isa300(unsigned int set, + unsigned int is, + unsigned int pid, + unsigned int ric, + unsigned int prs) Please don't split the line more than it is. { unsigned long rb; unsigned long rs; @@ -150,8 +152,8 @@ static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } -static inline void __tlbiel_lpid_guest(unsigned long lpid, int set, - unsigned long ric) +static __always_inline void __tlbiel_lpid_guest(unsigned long lpid, int set, + unsigned long ric) { unsigned long rb,rs,prs,r; @@ -167,8 +169,8 @@ static inline void __tlbiel_lpid_guest(unsigned long lpid, int set, } -static inline void __tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -183,8 +185,8 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid, trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -199,8 +201,10 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid, 
trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbie_lpid_va(unsigned long va, + unsigned long lpid, +
Re: [RFC PATCH v2 0/4] Input: mpr121-polled: Add polled driver for MPR121
On 21. 05. 19 7:37, Dmitry Torokhov wrote: Hi Michal, On Fri, May 17, 2019 at 03:12:49PM +0200, Michal Vokáč wrote: Hi, I have to deal with a situation where we have a custom i.MX6 based platform in production that uses the MPR121 touchkey controller. Unfortunately the chip is connected using only the I2C interface. The interrupt line is not used. Back in 2015 (Linux v3.14), my colleague modded the existing mpr121_touchkey.c driver to use polling instead of interrupt. For quite some time yet I am in a process of updating the product from the ancient Freescale v3.14 kernel to the latest mainline and pushing any needed changes upstream. The DT files for our imx6dl-yapp4 platform already made it into v5.1-rc. I rebased and updated our mpr121 patch to the latest mainline. It is created as a separate driver, similarly to gpio_keys_polled. The I2C device is quite susceptible to ESD. An ESD test quite often causes reset of the chip or some register randomly changes its value. The [PATCH 3/4] adds a write-through register cache. With the cache this state can be detected and the device can be re-initialized. The main question is: Is there any chance that such a polled driver could be accepted? Is it correct to implement it as a separate driver or should it be done as an option in the existing driver? I can not really imagine how I would do that though.. There are also certain worries that the MPR121 chip may no longer be available in nonspecifically distant future. In case of EOL I will need to add a polled driver for another touchkey chip. May it be already in mainline or a completely new one. I think that my addition of input_polled_dev was ultimately a wrong thing to do. I am looking into enabling polling mode for regular input devices as we then can enable polling mode in existing drivers. OK, that sounds good. Especially when one needs to switch from one chip to another that is already in tree, the need for a whole new polling driver is eliminated. 
I am still quite a novice in all kernel areas as I literally jump from one subsystem to another to fix issues related to our platform. Anyway, do you see any opportunity to help with that work? As far as gpio-keys vs gpio-key-polled, I feel that the capabilities of polling driver is sufficiently different from interrupt-driven one, so we will likely keep them separate. OK, understand. Thank you, Michal
Re: [RFC 4/7] mm: factor out madvise's core functionality
On Tue 21-05-19 08:36:28, Oleksandr Natalenko wrote: [...] > Regarding restricting the hints, I'm definitely interested in having > remote MADV_MERGEABLE/MADV_UNMERGEABLE. But, OTOH, doing it via remote > madvise() introduces another issue with traversing remote VMAs reliably. > IIUC, one can do this via userspace by parsing [s]maps file only, which > is not very consistent, and once some range is parsed, and then it is > immediately gone, a wrong hint will be sent. > > Isn't this a problem we should worry about? See http://lkml.kernel.org/r/20190520091829.gy6...@dhcp22.suse.cz -- Michal Hocko SUSE Labs
Re: [PATCH 2/2] Input: synaptics - remove X240 from the topbuttonpad list
On Tue, May 21, 2019 at 7:09 AM Dmitry Torokhov wrote: > > Hi Aaron, > > On Sun, May 19, 2019 at 03:27:11PM +0800, Aaron Ma wrote: > > Lenovo ThinkPad X240 does not have the top software button. > > When this wrong ID in top button list, smbus mode will fail to probe, > > so keep it working at PS2 mode. > > > > Cc: sta...@vger.kernel.org > > Signed-off-by: Aaron Ma > > --- > > drivers/input/mouse/synaptics.c | 1 - > > 1 file changed, 1 deletion(-) > > > > diff --git a/drivers/input/mouse/synaptics.c > > b/drivers/input/mouse/synaptics.c > > index b6da0c1267e3..6ae7bc92476b 100644 > > --- a/drivers/input/mouse/synaptics.c > > +++ b/drivers/input/mouse/synaptics.c > > @@ -140,7 +140,6 @@ static const char * const topbuttonpad_pnp_ids[] = { > > "LEN002E", > > "LEN0033", /* Helix */ > > "LEN0034", /* T431s, L440, L540, T540, W540, X1 Carbon 2nd */ > > - "LEN0035", /* X240 */ > > According to the history this came from Synaptics through Hans, so I'd > like to make sure there are no several X240 versions floating around... A quick google image search showed that the X240 had 2 versions: one with the top software buttons, one without. And this definitively rings a bell. I am sure we asked Lenovo and Synaptics to change the PnPID when they would do such a change, but they "forgot" during the *40 series refresh. We have code in place to fix the reported ranges of the coordinates, and we had to check against the board id (see min_max_pnpid_table[] in synaptics.c). Unfortunately, X240 (LEN0035) is not part of this table, so I don't know which refresh of the board ID has implemented the non top software buttons. Cheers, Benjamin > > > "LEN0036", /* T440 */ > > "LEN0037", /* X1 Carbon 2nd */ > > "LEN0038", > > -- > > 2.17.1 > > > > Thanks. > > -- > Dmitry
Re: [PATCH] arm64: dts: allwinner: a64-oceanic-5205-5inmfd: Enable CAN
Hi Maxime On Thu, Apr 18, 2019 at 4:56 PM Maxime Ripard wrote: > > On Thu, Apr 18, 2019 at 07:46:58PM +0530, Jagan Teki wrote: > > Oceanic 5205 5inMFD has MCP2515 CAN device connected via SPI1. > > > > - via SPI1 bus > > - vdd supplied by 5V supply along with PL2 enable pin > > - xceiver supply same as vdd > > - can oscillator connected at 20MHz > > - PB2 gpio as interrupt pin > > - PD6 gpio as RX_BUF1_CAN0 > > - PD7 gpio as RX_BUF0_CAN0 > > > > Tested-by: Tamas Papp > > Signed-off-by: Jagan Teki > > --- > > .../sun50i-a64-oceanic-5205-5inmfd.dts| 43 +++ > > 1 file changed, 43 insertions(+) > > > > diff --git > > a/arch/arm64/boot/dts/allwinner/sun50i-a64-oceanic-5205-5inmfd.dts > > b/arch/arm64/boot/dts/allwinner/sun50i-a64-oceanic-5205-5inmfd.dts > > index f0cd6587f619..22535a297f51 100644 > > --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-oceanic-5205-5inmfd.dts > > +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-oceanic-5205-5inmfd.dts > > @@ -21,6 +21,24 @@ > > chosen { > > stdout-path = "serial0:115200n8"; > > }; > > + > > + can_osc: can-osc { > > + compatible = "fixed-clock"; > > + #clock-cells = <0>; > > + clock-frequency = <2000>; > > + }; > > + > > + reg_can_v5v: reg-can-v5v { > > + compatible = "regulator-fixed"; > > + regulator-name = "reg-can-v5v"; > > + regulator-min-microvolt = <500>; > > + regulator-max-microvolt = <500>; > > + regulator-boot-on; > > + enable-active-high; > > + gpio = <&r_pio 0 2 GPIO_ACTIVE_HIGH>; /* CAN_3V3_EN: PL2 */ > > + status = "okay"; > > You don't need the status property here. > Correct, need to be dropped > > + }; > > + > > }; > > > > &ehci0 { > > @@ -77,6 +95,31 @@ > > status = "okay"; > > }; > > > > +&pio { > > + can_pins: can-pins { > > + pins = "PD6", /* RX_BUF1_CAN0 */ > > +"PD7"; /* RX_BUF0_CAN0 */ > > + function = "gpio_in"; > > + }; > > +}; > > That isn't needed. What are they used for, you're not tying them to > anything? Mux of their function is correct. 
They are connected in the schematics but not used right now. I can guarantee that the kernel will always be configured in the right way and if I want I can export in userspace for debug purpose Michael > > Maxime > > -- > Maxime Ripard, Bootlin > Embedded Linux and Kernel engineering > https://bootlin.com -- | Michael Nazzareno Trimarchi Amarula Solutions BV | | COO - Founder Cruquiuskade 47 | | +31(0)851119172 Amsterdam 1018 AM NL | | [`as] http://www.amarulasolutions.com |
Re: [PATCH v3] vt: Fix a missing-check bug in drivers/tty/vt/vt.c
Cc'ing Grzegorz. On Tue, May 21, 2019 at 11:21:18AM +0800, Gen Zhang wrote: > On Mon, May 20, 2019 at 10:55:40PM -0400, Nicolas Pitre wrote: > > As soon as you release the lock, another thread could come along and > > start using the memory pointed by vc_cons[currcons].d you're about to > > free here. This is unlikely for an initcall, but still. > > > > You should consider this ordering instead: > > > > err_vc_screenbuf: > > kfree(vc); > > vc_cons[currcons].d = NULL; > > err_vc: > > console_unlock(); > > return -ENOMEM; > In function con_init(), the pointer variable vc_cons[currcons].d, vc and > vc->vc_screenbuf is allocated a memory space via kzalloc(). And they are > used in the following codes. > However, when there is a memory allocation error, kzalloc() can fail. > Thus null pointer (vc_cons[currcons].d, vc and vc->vc_screenbuf) > dereference may happen. And it will cause the kernel to crash. Therefore, > we should check return value and handle the error. > Further,the loop condition MIN_NR_CONSOLES is defined as 1 in > include/uapi/linux/vt.h and it is not changed. So there is no need to > unwind the loop. 
> > Signed-off-by: Gen Zhang > > --- > diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c > index fdd12f8..ea47eb3 100644 > --- a/drivers/tty/vt/vt.c > +++ b/drivers/tty/vt/vt.c > @@ -3350,10 +3350,14 @@ static int __init con_init(void) > > for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) { > vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), > GFP_NOWAIT); > + if (!vc) > + goto err_vc; > INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); > tty_port_init(&vc->port); > visual_init(vc, currcons, 1); > vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT); > + if (!vc->vc_screenbuf) > + goto err_vc_screenbuf; > vc_init(vc, vc->vc_rows, vc->vc_cols, > currcons || !vc->vc_sw->con_save_screen); > } > @@ -3375,6 +3379,13 @@ static int __init con_init(void) > register_console(&vt_console_driver); > #endif > return 0; > +err_vc_screenbuf: > + kfree(vc); > + vc_cons[currcons].d = NULL; > +err_vc: > + console_unlock(); > + return -ENOMEM; > + > } > console_initcall(con_init); > --- -- Best regards, Oleksandr Natalenko (post-factum) Senior Software Maintenance Engineer
Re: [RFC 6/7] mm: extend process_madvise syscall to support vector arrary
On Tue 21-05-19 11:48:20, Minchan Kim wrote: > On Mon, May 20, 2019 at 11:22:58AM +0200, Michal Hocko wrote: > > [Cc linux-api] > > > > On Mon 20-05-19 12:52:53, Minchan Kim wrote: > > > Currently, process_madvise syscall works for only one address range > > > so user should call the syscall several times to give hints to > > > multiple address range. > > > > Is that a problem? How big of a problem? Any numbers? > > We easily have 2000+ vma so it's not trivial overhead. I will come up > with number in the description at respin. Does this really have to be a fast operation? I would expect the monitor is by no means a fast path. The system call overhead is not what it used to be, sigh, but still for something that is not a hot path it should be tolerable, especially when the whole operation is quite expensive on its own (wrt. the syscall entry/exit). I am not saying we do not need a multiplexing API, I am just not sure we need it right away. Btw. there was some demand for other MM syscalls to provide a multiplexing API (e.g. mprotect), maybe it would be better to handle those in one go? -- Michal Hocko SUSE Labs
[PATCH] perf docs: description of header HEADER_BPF_PROG_INFO and HEADER_BPF_BTF
This patch adds description of HEADER_BPF_PROG_INFO and HEADER_BPF_BTF to perf.data-file-format.txt. Signed-off-by: Song Liu --- .../perf/Documentation/perf.data-file-format.txt | 16 1 file changed, 16 insertions(+) diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index 6967e9b02be5..ab48db21ff16 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -272,6 +272,22 @@ struct { Two uint64_t for the time of first sample and the time of last sample. +HEADER_BPF_PROG_INFO = 25, + +struct bpf_prog_info_linear, which contains detailed information about +a BPF program, including type, id, tag, jited/xlated instructions, etc. + +HEADER_BPF_BTF = 26, + +Contains BPF Type Format (BTF). For more information about BTF, please +refer to Documentation/bpf/btf.rst. + +struct { + u32 id; + u32 data_size; + char data[]; +}; + HEADER_COMPRESSED = 27, struct { -- 2.17.1
Re: [PATCH] arm64: dts: allwinner: a64: Add lradc node
On May 20, 2019 1:07:42 PM GMT+02:00, Maxime Ripard wrote: >Hi! > >On Sat, May 18, 2019 at 07:09:30PM +0200, Luca Weiss wrote: >> Add a node describing the KEYADC on the A64. >> >> Signed-off-by: Luca Weiss >> --- >> arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi | 7 +++ >> 1 file changed, 7 insertions(+) >> >> diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi >b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi >> index 7734f70e1057..dc1bf8c1afb5 100644 >> --- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi >> +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi >> @@ -704,6 +704,13 @@ >> status = "disabled"; >> }; >> >> +lradc: lradc@1c21800 { >> +compatible = "allwinner,sun4i-a10-lradc-keys"; >> +reg = <0x01c21800 0x100>; >> +interrupts = ; >> +status = "disabled"; >> +}; >> + > >The controller is pretty different on the A64 compared to the A10. The >A10 has two channels for example, while the A64 has only one. > >It looks like the one in the A83t though, so you can use that >compatible instead. > >Maxime > >-- >Maxime Ripard, Bootlin >Embedded Linux and Kernel engineering >https://bootlin.com Hi, Looking at the patch for the A83t, the only difference is that it uses a 3/4 instead of a 2/3 voltage divider, nothing is changed with the channels. But I'm also not sure which one (or a different one) is used from looking at the "A64 User Manual". Thanks, Luca
Re: [PATCH 1/2] Input: elantech - enable middle button support on 2 ThinkPads
Hi, On Tue, May 21, 2019 at 7:11 AM Dmitry Torokhov wrote: > > Hi Aaron, > > On Sun, May 19, 2019 at 03:27:10PM +0800, Aaron Ma wrote: > > Adding 2 new touchpad PNPIDs to enable middle button support. > > Could you add their names in the comments please? > > > > > Cc: sta...@vger.kernel.org > > Signed-off-by: Aaron Ma > > --- > > drivers/input/mouse/elantech.c | 2 ++ > > 1 file changed, 2 insertions(+) > > > > diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c > > index a7f8b1614559..530142b5a115 100644 > > --- a/drivers/input/mouse/elantech.c > > +++ b/drivers/input/mouse/elantech.c > > @@ -1189,6 +1189,8 @@ static const char * const middle_button_pnp_ids[] = { > > "LEN2132", /* ThinkPad P52 */ > > "LEN2133", /* ThinkPad P72 w/ NFC */ > > "LEN2134", /* ThinkPad P72 */ > > + "LEN0407", AFAICT, this one is the Lenovo P53. However, having a whitelist of new models is not going to scale IMO. I was thinking at submitting a patch to enable middle button unconditionally, but then I realized that libinput disables middle click emulation on touchpads with an actual physical middle button. And this morning I just realized that we might have a better chance at this if we consider the new models to have the physical middle button. I know LEN0407 attempts to bind over SMBus, so I would think we can reduce the set of whitelist by just checking for ETP_NEW_IC_SMBUS_HOST_NOTIFY (patch coming in). Cheers, Benjamin > > + "LEN0408", > > These should come first - I'd like to keep the list sorted > alphabetically. > > > NULL > > }; > > > > -- > > 2.17.1 > > > > Thanks. > > -- > Dmitry
Re: [PATCH 2/2] clk: sprd: Add check for return value of sprd_clk_regmap_init()
On Tue, 21 May 2019 at 14:15, Chunyan Zhang wrote: > > sprd_clk_regmap_init() doesn't always return success, adding check > for its return value should make the code more strong. > > Signed-off-by: Chunyan Zhang Reviewed-by: Baolin Wang > --- > drivers/clk/sprd/sc9860-clk.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/drivers/clk/sprd/sc9860-clk.c b/drivers/clk/sprd/sc9860-clk.c > index 9980ab55271b..1ed45b4f2fe8 100644 > --- a/drivers/clk/sprd/sc9860-clk.c > +++ b/drivers/clk/sprd/sc9860-clk.c > @@ -2031,7 +2031,9 @@ static int sc9860_clk_probe(struct platform_device > *pdev) > } > > desc = match->data; > - sprd_clk_regmap_init(pdev, desc); > + ret = sprd_clk_regmap_init(pdev, desc); > + if (ret) > + return ret; > > return sprd_clk_probe(&pdev->dev, desc->hw_clks); > } > -- > 2.17.1 > -- Baolin Wang Best Regards
Re: [PATCH 1/2] clk: sprd: Switch from of_iomap() to devm_ioremap_resource()
Hi Chunyan, On Tue, 21 May 2019 at 14:15, Chunyan Zhang wrote: > > devm_ioremap_resources() automatically requests resources and devm_ wrappers > do better error handling and unmapping of the I/O region when needed, > that would make drivers more clean and simple. > > Signed-off-by: Chunyan Zhang > --- > drivers/clk/sprd/common.c | 9 +++-- > 1 file changed, 7 insertions(+), 2 deletions(-) > > diff --git a/drivers/clk/sprd/common.c b/drivers/clk/sprd/common.c > index e038b0447206..a5bdca1de5d0 100644 > --- a/drivers/clk/sprd/common.c > +++ b/drivers/clk/sprd/common.c > @@ -42,6 +42,7 @@ int sprd_clk_regmap_init(struct platform_device *pdev, > void __iomem *base; > struct device_node *node = pdev->dev.of_node; > struct regmap *regmap; > + struct resource *res; > > if (of_find_property(node, "sprd,syscon", NULL)) { > regmap = syscon_regmap_lookup_by_phandle(node, "sprd,syscon"); > @@ -50,10 +51,14 @@ int sprd_clk_regmap_init(struct platform_device *pdev, > return PTR_ERR(regmap); > } > } else { > - base = of_iomap(node, 0); > + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); > + base = devm_ioremap_resource(&pdev->dev, res); > + if (IS_ERR(base)) > + return PTR_ERR(base); > + > regmap = devm_regmap_init_mmio(&pdev->dev, base, >&sprdclk_regmap_config); > - if (IS_ERR_OR_NULL(regmap)) { > + if (IS_ERR(regmap)) { You did not mention this fix in your commit message, and it's better to move into one separate patch. > pr_err("failed to init regmap\n"); > return PTR_ERR(regmap); > } > -- > 2.17.1 > -- Baolin Wang Best Regards
Re: [RFC 4/7] mm: factor out madvise's core functionality
Hi. On Tue, May 21, 2019 at 10:26:49AM +0900, Minchan Kim wrote: > On Mon, May 20, 2019 at 04:26:33PM +0200, Oleksandr Natalenko wrote: > > Hi. > > > > On Mon, May 20, 2019 at 12:52:51PM +0900, Minchan Kim wrote: > > > This patch factor out madvise's core functionality so that upcoming > > > patch can reuse it without duplication. > > > > > > It shouldn't change any behavior. > > > > > > Signed-off-by: Minchan Kim > > > --- > > > mm/madvise.c | 168 +++ > > > 1 file changed, 89 insertions(+), 79 deletions(-) > > > > > > diff --git a/mm/madvise.c b/mm/madvise.c > > > index 9a6698b56845..119e82e1f065 100644 > > > --- a/mm/madvise.c > > > +++ b/mm/madvise.c > > > @@ -742,7 +742,8 @@ static long madvise_dontneed_single_vma(struct > > > vm_area_struct *vma, > > > return 0; > > > } > > > > > > -static long madvise_dontneed_free(struct vm_area_struct *vma, > > > +static long madvise_dontneed_free(struct task_struct *tsk, > > > + struct vm_area_struct *vma, > > > struct vm_area_struct **prev, > > > unsigned long start, unsigned long end, > > > int behavior) > > > @@ -754,8 +755,8 @@ static long madvise_dontneed_free(struct > > > vm_area_struct *vma, > > > if (!userfaultfd_remove(vma, start, end)) { > > > *prev = NULL; /* mmap_sem has been dropped, prev is stale */ > > > > > > - down_read(&current->mm->mmap_sem); > > > - vma = find_vma(current->mm, start); > > > + down_read(&tsk->mm->mmap_sem); > > > + vma = find_vma(tsk->mm, start); > > > if (!vma) > > > return -ENOMEM; > > > if (start < vma->vm_start) { > > > @@ -802,7 +803,8 @@ static long madvise_dontneed_free(struct > > > vm_area_struct *vma, > > > * Application wants to free up the pages and associated backing store. > > > * This is effectively punching a hole into the middle of a file. 
> > > */ > > > -static long madvise_remove(struct vm_area_struct *vma, > > > +static long madvise_remove(struct task_struct *tsk, > > > + struct vm_area_struct *vma, > > > struct vm_area_struct **prev, > > > unsigned long start, unsigned long end) > > > { > > > @@ -836,13 +838,13 @@ static long madvise_remove(struct vm_area_struct > > > *vma, > > > get_file(f); > > > if (userfaultfd_remove(vma, start, end)) { > > > /* mmap_sem was not released by userfaultfd_remove() */ > > > - up_read(&current->mm->mmap_sem); > > > + up_read(&tsk->mm->mmap_sem); > > > } > > > error = vfs_fallocate(f, > > > FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, > > > offset, end - start); > > > fput(f); > > > - down_read(&current->mm->mmap_sem); > > > + down_read(&tsk->mm->mmap_sem); > > > return error; > > > } > > > > > > @@ -916,12 +918,13 @@ static int madvise_inject_error(int behavior, > > > #endif > > > > What about madvise_inject_error() and get_user_pages_fast() in it > > please? > > Good point. Maybe, there more places where assume context is "current" so > I'm thinking to limit hints we could allow from external process. > It would be better for maintainance point of view in that we could know > the workload/usecases when someone ask new advises from external process > without making every hints works both contexts. Well, for madvise_inject_error() we still have a remote variant of get_user_pages(), and that should work, no? Regarding restricting the hints, I'm definitely interested in having remote MADV_MERGEABLE/MADV_UNMERGEABLE. But, OTOH, doing it via remote madvise() introduces another issue with traversing remote VMAs reliably. IIUC, one can do this via userspace by parsing [s]maps file only, which is not very consistent, and once some range is parsed, and then it is immediately gone, a wrong hint will be sent. Isn't this a problem we should worry about? > > Thanks. -- Best regards, Oleksandr Natalenko (post-factum) Senior Software Maintenance Engineer
RE: [RESEND] input: keyboard: imx: make sure keyboard can always wake up system
Hi, Dmitry > -Original Message- > From: dmitry.torok...@gmail.com [mailto:dmitry.torok...@gmail.com] > Sent: Tuesday, May 21, 2019 1:31 PM > To: Anson Huang > Cc: shawn...@kernel.org; s.ha...@pengutronix.de; > ker...@pengutronix.de; feste...@gmail.com; linux-in...@vger.kernel.org; > linux-arm-ker...@lists.infradead.org; linux-kernel@vger.kernel.org; dl-linux- > imx > Subject: Re: [RESEND] input: keyboard: imx: make sure keyboard can always > wake up system > > Hi Anson, > On Thu, Apr 04, 2019 at 01:40:16AM +, Anson Huang wrote: > > There are several scenarios that keyboard can NOT wake up system from > > suspend, e.g., if a keyboard is depressed between system device > > suspend phase and device noirq suspend phase, the keyboard ISR will be > > called and both keyboard depress and release interrupts will be > > disabled, then keyboard will no longer be able to wake up system. > > Another scenario would be, if a keyboard is kept depressed, and then > > system goes into suspend, the expected behavior would be when keyboard > > is released, system will be waked up, but current implementation can > > NOT achieve that, because both depress and release interrupts are > > disabled in ISR, and the event check is still in progress. > > > > To fix these issues, need to make sure keyboard's depress or release > > interrupt is enabled after noirq device suspend phase, this patch > > moves the suspend/resume callback to noirq suspend/resume phase, and > > enable the corresponding interrupt according to current keyboard status. > > I believe it is possible for IRQ to be disabled and still being enabled as > wakeup source. What happens if you call disable_irq() before disabling the > clock? 
Doing below does NOT fix the scenario/issue 100%, if the keypad's IRQ arrived during suspend phase but before disabling its IRQ in its suspend callback, then issue is still there, as the issue is that when system suspend, keypad's irq arrived during suspend and noirq suspend phase, then keypad's hardware interrupt detection will be disabled in the ISR handler, and the timer event setup by ISR handler is NOT fired, imx_keypad_check_for_events() is NOT executed and hardware keypad's depress/release interrupt is NOT re-enabled yet, so it can NOT wake up system anymore... So I think the solid fix is to make sure keypad can generate IRQ (either depress or release) at any time during system suspend flow. +++ b/drivers/input/keyboard/imx_keypad.c @@ -533,6 +533,8 @@ static int __maybe_unused imx_kbd_suspend(struct device *dev) /* imx kbd can wake up system even clock is disabled */ mutex_lock(&input_dev->mutex); + disable_irq(kbd->irq); + if (input_dev->users) clk_disable_unprepare(kbd->clk); @@ -562,6 +569,8 @@ static int __maybe_unused imx_kbd_resume(struct device *dev) goto err_clk; } + enable_irq(kbd->irq); + err_clk: Anson. > > Thanks. > > -- > Dmitry
Re: [PATCH] riscv: include generic support for MSI irqdomains
On Mon, May 20, 2019 at 11:25:28AM -0700, Paul Walmsley wrote: > Some RISC-V systems include PCIe host controllers that support PCIe > message-signaled interrupts. For this to work on Linux, we need to > enable PCI_MSI_IRQ_DOMAIN and define struct msi_alloc_info. Support > for the latter is enabled by including the architecture-generic msi.h > include. > > Based on a patch from Wesley Terpstra : > > https://github.com/riscv/riscv-linux/commit/7d55f38fb79f459d2e88bcee7e147796400cafa8 > > Signed-off-by: Paul Walmsley > Signed-off-by: Paul Walmsley > Cc: Wesley Terpstra Well, this is very much Wes' patch as-is. It should probably be attributed to him and you should ask for his signoff. Otherwise this looks fine: Reviewed-by: Christoph Hellwig
[PATCH v2 1/2] mfd: rk808: Check pm_power_off pointer
The function pointer pm_power_off may point to function from other module (PSCI for example). If rk808 is removed, pm_power_off is overwritten to NULL and the system cannot be powered off. This patch checks if pm_power_off points to a module function. Signed-off-by: Stefan Mavrodiev --- Changes in v2: - Initial release actually drivers/mfd/rk808.c | 13 +++-- include/linux/mfd/rk808.h | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c index 94377782d208..c0b179792bbf 100644 --- a/drivers/mfd/rk808.c +++ b/drivers/mfd/rk808.c @@ -438,7 +438,6 @@ static int rk808_probe(struct i2c_client *client, struct rk808 *rk808; const struct rk808_reg_data *pre_init_reg; const struct mfd_cell *cells; - void (*pm_pwroff_fn)(void); int nr_pre_init_regs; int nr_cells; int pm_off = 0, msb, lsb; @@ -475,7 +474,7 @@ static int rk808_probe(struct i2c_client *client, nr_pre_init_regs = ARRAY_SIZE(rk805_pre_init_reg); cells = rk805s; nr_cells = ARRAY_SIZE(rk805s); - pm_pwroff_fn = rk805_device_shutdown; + rk808->pm_pwroff_fn = rk805_device_shutdown; break; case RK808_ID: rk808->regmap_cfg = &rk808_regmap_config; @@ -484,7 +483,7 @@ static int rk808_probe(struct i2c_client *client, nr_pre_init_regs = ARRAY_SIZE(rk808_pre_init_reg); cells = rk808s; nr_cells = ARRAY_SIZE(rk808s); - pm_pwroff_fn = rk808_device_shutdown; + rk808->pm_pwroff_fn = rk808_device_shutdown; break; case RK818_ID: rk808->regmap_cfg = &rk818_regmap_config; @@ -493,7 +492,7 @@ static int rk808_probe(struct i2c_client *client, nr_pre_init_regs = ARRAY_SIZE(rk818_pre_init_reg); cells = rk818s; nr_cells = ARRAY_SIZE(rk818s); - pm_pwroff_fn = rk818_device_shutdown; + rk808->pm_pwroff_fn = rk818_device_shutdown; break; default: dev_err(&client->dev, "Unsupported RK8XX ID %lu\n", @@ -548,7 +547,7 @@ static int rk808_probe(struct i2c_client *client, "rockchip,system-power-controller"); if (pm_off && !pm_power_off) { rk808_i2c_client = client; - pm_power_off = 
pm_pwroff_fn; + pm_power_off = rk808->pm_pwroff_fn; } return 0; @@ -563,7 +562,9 @@ static int rk808_remove(struct i2c_client *client) struct rk808 *rk808 = i2c_get_clientdata(client); regmap_del_irq_chip(client->irq, rk808->irq_data); - pm_power_off = NULL; + + if (rk808->pm_pwroff_fn && pm_power_off == rk808->pm_pwroff_fn) + pm_power_off = NULL; return 0; } diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h index d3156594674c..8b5d68a7bb9c 100644 --- a/include/linux/mfd/rk808.h +++ b/include/linux/mfd/rk808.h @@ -453,5 +453,6 @@ struct rk808 { longvariant; const struct regmap_config *regmap_cfg; const struct regmap_irq_chip*regmap_irq_chip; + void(*pm_pwroff_fn)(void); }; #endif /* __LINUX_REGULATOR_RK808_H */ -- 2.17.1
[PATCH v2 0/2] mfd: rk808: Fix pointers and poweroff
This patch is actually a follow-up to: [PATCH 1/1] mfd: rk808: Prepare rk8085 for poweroff The patchset fixes the poweroff function for boards with RK8085 PMU. During the preparation of v2 a possible wrong pointer access was spotted (pm_power_off), so one more patch was introduced in the series. Stefan Mavrodiev (2): mfd: rk808: Check pm_power_off pointer mfd: rk808: Prepare rk805 for poweroff drivers/mfd/rk808.c | 42 +-- include/linux/mfd/rk808.h | 2 ++ 2 files changed, 38 insertions(+), 6 deletions(-) -- 2.17.1
[PATCH v2 2/2] mfd: rk808: Prepare rk805 for poweroff
RK805 has SLEEP signal, which can put the device into SLEEP or OFF mode. The default is SLEEP mode. However, when the kernel performs power-off (actually the ATF) the device will not go fully off and this will result in higher power consumption and inability to wake the device with RTC alarm. The solution is to enable pm_power_off_prepare function, which will configure SLEEP pin for OFF function. Signed-off-by: Stefan Mavrodiev --- Changes for v2: - Move pm_pwroff_prep_fn to header - Check pm_power_off_prepare before make it NULL drivers/mfd/rk808.c | 29 + include/linux/mfd/rk808.h | 1 + 2 files changed, 30 insertions(+) diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c index c0b179792bbf..fb6cdf900899 100644 --- a/drivers/mfd/rk808.c +++ b/drivers/mfd/rk808.c @@ -387,6 +387,24 @@ static void rk805_device_shutdown(void) dev_err(&rk808_i2c_client->dev, "power off error!\n"); } +static void rk805_device_shutdown_prepare(void) +{ + int ret; + struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client); + + if (!rk808) { + dev_warn(&rk808_i2c_client->dev, +"have no rk805, so do nothing here\n"); + return; + } + + ret = regmap_update_bits(rk808->regmap, +RK805_GPIO_IO_POL_REG, +SLP_SD_MSK, SHUTDOWN_FUN); + if (ret) + dev_err(&rk808_i2c_client->dev, "power off error!\n"); +} + static void rk808_device_shutdown(void) { int ret; @@ -475,6 +493,7 @@ static int rk808_probe(struct i2c_client *client, cells = rk805s; nr_cells = ARRAY_SIZE(rk805s); rk808->pm_pwroff_fn = rk805_device_shutdown; + rk808->pm_pwroff_prep_fn = rk805_device_shutdown_prepare; break; case RK808_ID: rk808->regmap_cfg = &rk808_regmap_config; @@ -550,6 +569,12 @@ static int rk808_probe(struct i2c_client *client, pm_power_off = rk808->pm_pwroff_fn; } + if (pm_off && !pm_power_off_prepare) { + if (!rk808_i2c_client) + rk808_i2c_client = client; + pm_power_off_prepare = rk808->pm_pwroff_prep_fn; + } + return 0; err_irq: @@ -566,6 +591,10 @@ static int rk808_remove(struct i2c_client *client) if 
(rk808->pm_pwroff_fn && pm_power_off == rk808->pm_pwroff_fn) pm_power_off = NULL; + if (rk808->pm_pwroff_prep_fn && + pm_power_off_prepare == rk808->pm_pwroff_prep_fn) + pm_power_off_prepare = NULL; + return 0; } diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h index 8b5d68a7bb9c..ec928173e507 100644 --- a/include/linux/mfd/rk808.h +++ b/include/linux/mfd/rk808.h @@ -454,5 +454,6 @@ struct rk808 { const struct regmap_config *regmap_cfg; const struct regmap_irq_chip*regmap_irq_chip; void(*pm_pwroff_fn)(void); + void(*pm_pwroff_prep_fn)(void); }; #endif /* __LINUX_REGULATOR_RK808_H */ -- 2.17.1
Re: [RFC 0/7] introduce memory hinting API for external process
[linux-api] On Mon 20-05-19 18:44:52, Matthew Wilcox wrote: > On Mon, May 20, 2019 at 12:52:47PM +0900, Minchan Kim wrote: > > IMHO we should spell it out that this patchset complements MADV_WONTNEED > > and MADV_FREE by adding non-destructive ways to gain some free memory > > space. MADV_COLD is similar to MADV_WONTNEED in a way that it hints the > > kernel that memory region is not currently needed and should be reclaimed > > immediately; MADV_COOL is similar to MADV_FREE in a way that it hints the > > kernel that memory region is not currently needed and should be reclaimed > > when memory pressure rises. > > Do we tear down page tables for these ranges? That seems like a good > way of reclaiming potentially a substantial amount of memory. I do not think we can in general because this is a non-destructive operation. So at least we cannot tear down anonymous ptes (they will turn into swap entries). -- Michal Hocko SUSE Labs
Re: [PATCH 6/7] staging: vt6656: clean-up registers initialization error path
On Mon, May 20, 2019 at 04:39:04PM +, Quentin Deslandes wrote: > Avoid discarding function's return code during register initialization. > Handle it instead and return 0 on success or a negative errno value on > error. > > Signed-off-by: Quentin Deslandes > --- > drivers/staging/vt6656/main_usb.c | 163 ++ > 1 file changed, 96 insertions(+), 67 deletions(-) > > diff --git a/drivers/staging/vt6656/main_usb.c > b/drivers/staging/vt6656/main_usb.c > index 5fd845cbdd52..8ed96e8eedbe 100644 > --- a/drivers/staging/vt6656/main_usb.c > +++ b/drivers/staging/vt6656/main_usb.c > @@ -109,33 +109,38 @@ static void vnt_set_options(struct vnt_private *priv) > */ > static int vnt_init_registers(struct vnt_private *priv) > { > + int ret = 0; Minor nit here, no need to set this to 0 as you instantly set it with this call: > struct vnt_cmd_card_init *init_cmd = &priv->init_command; > struct vnt_rsp_card_init *init_rsp = &priv->init_response; > u8 antenna; > int ii; > - int status = STATUS_SUCCESS; > u8 tmp; > u8 calib_tx_iq = 0, calib_tx_dc = 0, calib_rx_iq = 0; > > dev_dbg(&priv->usb->dev, ">INIbInitAdapter. [%d][%d]\n", > DEVICE_INIT_COLD, priv->packet_type); > > - if (!vnt_check_firmware_version(priv)) { > - if (vnt_download_firmware(priv) == true) { > - if (vnt_firmware_branch_to_sram(priv) == false) { > - dev_dbg(&priv->usb->dev, > - " vnt_firmware_branch_to_sram fail\n"); > - return false; > - } > - } else { > - dev_dbg(&priv->usb->dev, "FIRMWAREbDownload fail\n"); > - return false; > + ret = vnt_check_firmware_version(priv); You can fix that up in a later patch :) At first glance, these all look really good, thanks for doing this work. greg k-h
Re: [RFC 0/7] introduce memory hinting API for external process
[Cc linux-api] On Tue 21-05-19 13:39:50, Minchan Kim wrote: > On Mon, May 20, 2019 at 12:46:05PM -0400, Johannes Weiner wrote: > > On Mon, May 20, 2019 at 12:52:47PM +0900, Minchan Kim wrote: > > > - Approach > > > > > > The approach we chose was to use a new interface to allow userspace to > > > proactively reclaim entire processes by leveraging platform information. > > > This allowed us to bypass the inaccuracy of the kernel’s LRUs for pages > > > that are known to be cold from userspace and to avoid races with lmkd > > > by reclaiming apps as soon as they entered the cached state. Additionally, > > > it could provide many chances for platform to use much information to > > > optimize memory efficiency. > > > > > > IMHO we should spell it out that this patchset complements MADV_WONTNEED > > > and MADV_FREE by adding non-destructive ways to gain some free memory > > > space. MADV_COLD is similar to MADV_WONTNEED in a way that it hints the > > > kernel that memory region is not currently needed and should be reclaimed > > > immediately; MADV_COOL is similar to MADV_FREE in a way that it hints the > > > kernel that memory region is not currently needed and should be reclaimed > > > when memory pressure rises. > > > > I agree with this approach and the semantics. But these names are very > > vague and extremely easy to confuse since they're so similar. > > > > MADV_COLD could be a good name, but for deactivating pages, not > > reclaiming them - marking memory "cold" on the LRU for later reclaim. > > > > For the immediate reclaim one, I think there is a better option too: > > In virtual memory speak, putting a page into secondary storage (or > > ensuring it's already there), and then freeing its in-memory copy, is > > called "paging out". And that's what this flag is supposed to do. So > > how about MADV_PAGEOUT? 
> > > > With that, we'd have: > > > > MADV_FREE: Mark data invalid, free memory when needed > > MADV_DONTNEED: Mark data invalid, free memory immediately > > > > MADV_COLD: Data is not used for a while, free memory when needed > > MADV_PAGEOUT: Data is not used for a while, free memory immediately > > > > What do you think? > > There are several suggestions until now. Thanks, Folks! > > For deactivating: > > - MADV_COOL > - MADV_RECLAIM_LAZY > - MADV_DEACTIVATE > - MADV_COLD > - MADV_FREE_PRESERVE > > > For reclaiming: > > - MADV_COLD > - MADV_RECLAIM_NOW > - MADV_RECLAIMING > - MADV_PAGEOUT > - MADV_DONTNEED_PRESERVE > > It seems everybody doesn't like MADV_COLD so want to go with other. > For consisteny of view with other existing hints of madvise, -preserve > postfix suits well. However, originally, I don't like the naming FREE > vs DONTNEED from the beginning. They were easily confused. > I prefer PAGEOUT to RECLAIM since it's more likely to be nuance to > represent reclaim with memory pressure and is supposed to paged-in > if someone need it later. So, it imply PRESERVE. > If there is not strong against it, I want to go with MADV_COLD and > MADV_PAGEOUT. > > Other opinion? I do not really care strongly. I am pretty sure we will have a lot of suggestions because people tend to be good at arguing about that... Anyway, unlike DONTNEED/FREE we do not have any other OS to implement these features, right? So we shouldn't be tight to existing names. On the other hand I kinda like the reference to the existing names but DEACTIVATE/PAGEOUT seem a good fit to me as well. Unless there is way much better name suggested I would go with one of those. Up to you. -- Michal Hocko SUSE Labs
[PATCH] platform/x86: asus-wmi: Only Tell EC the OS will handle display hotkeys from asus_nb_wmi
Commit 78f3ac76d9e5 ("platform/x86: asus-wmi: Tell the EC the OS will handle the display off hotkey") causes the backlight to be permanently off on various EeePC laptop models using the eeepc-wmi driver (Asus EeePC 1015BX, Asus EeePC 1025C). The asus_wmi_set_devstate(ASUS_WMI_DEVID_BACKLIGHT, 2, NULL) call added by that commit is made conditional in this commit and only enabled in the quirk_entry structs in the asus-nb-wmi driver fixing the broken display / backlight on various EeePC laptop models. Cc: João Paulo Rechi Vita Fixes: 78f3ac76d9e5 ("platform/x86: asus-wmi: Tell the EC the OS will handle the display off hotkey") Signed-off-by: Hans de Goede --- drivers/platform/x86/asus-nb-wmi.c | 8 drivers/platform/x86/asus-wmi.c| 2 +- drivers/platform/x86/asus-wmi.h| 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index b6f2ff95c3ed..59f3a37a44d7 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -78,10 +78,12 @@ static bool asus_q500a_i8042_filter(unsigned char data, unsigned char str, static struct quirk_entry quirk_asus_unknown = { .wapf = 0, + .wmi_backlight_set_devstate = true, }; static struct quirk_entry quirk_asus_q500a = { .i8042_filter = asus_q500a_i8042_filter, + .wmi_backlight_set_devstate = true, }; /* @@ -92,26 +94,32 @@ static struct quirk_entry quirk_asus_q500a = { static struct quirk_entry quirk_asus_x55u = { .wapf = 4, .wmi_backlight_power = true, + .wmi_backlight_set_devstate = true, .no_display_toggle = true, }; static struct quirk_entry quirk_asus_wapf4 = { .wapf = 4, + .wmi_backlight_set_devstate = true, }; static struct quirk_entry quirk_asus_x200ca = { .wapf = 2, + .wmi_backlight_set_devstate = true, }; static struct quirk_entry quirk_asus_ux303ub = { .wmi_backlight_native = true, + .wmi_backlight_set_devstate = true, }; static struct quirk_entry quirk_asus_x550lb = { + .wmi_backlight_set_devstate = true, .xusb2pr = 
0x01D9, }; static struct quirk_entry quirk_asus_forceals = { + .wmi_backlight_set_devstate = true, .wmi_force_als_set = true, }; diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index ee1fa93708ec..a66e99500c12 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -2131,7 +2131,7 @@ static int asus_wmi_add(struct platform_device *pdev) err = asus_wmi_backlight_init(asus); if (err && err != -ENODEV) goto fail_backlight; - } else + } else if (asus->driver->quirks->wmi_backlight_set_devstate) err = asus_wmi_set_devstate(ASUS_WMI_DEVID_BACKLIGHT, 2, NULL); status = wmi_install_notify_handler(asus->driver->event_guid, diff --git a/drivers/platform/x86/asus-wmi.h b/drivers/platform/x86/asus-wmi.h index 6c1311f4b04d..57a79bddb286 100644 --- a/drivers/platform/x86/asus-wmi.h +++ b/drivers/platform/x86/asus-wmi.h @@ -44,6 +44,7 @@ struct quirk_entry { bool store_backlight_power; bool wmi_backlight_power; bool wmi_backlight_native; + bool wmi_backlight_set_devstate; bool wmi_force_als_set; int wapf; /* -- 2.21.0
Re: [PATCH] usb: fix typos in code comments
On Sun, May 19, 2019 at 11:55:42AM +0800, Weitao Hou wrote: > fix lenght to length > > Signed-off-by: Weitao Hou > --- > Documentation/devicetree/bindings/usb/s3c2410-usb.txt | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) You sent 2 different patches that do different things, yet have identical subject lines :( Please fix that up and resend these as a patch series, with unique subjects. thanks, greg k-h
Re: [PATCH] vt/fbcon: deinitialize resources in visual_init() after failed memory allocation
Hi. On Fri, Apr 26, 2019 at 04:43:57PM +0200, Grzegorz Halat wrote: > After memory allocation failure vc_allocate() doesn't clean up data > which has been initialized in visual_init(). In case of fbcon this > leads to divide-by-0 in fbcon_init() on next open of the same tty. > > memory allocation in vc_allocate() may fail here: > 1097: vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_KERNEL); > > on next open() fbcon_init() skips vc_font.data initialization: > 1088: if (!p->fontdata) { > > division by zero in fbcon_init() happens here: > 1149: new_cols /= vc->vc_font.width; > > Additional check is needed in fbcon_deinit() to prevent > usage of uninitialized vc_screenbuf: > > 1251:if (vc->vc_hi_font_mask && vc->vc_screenbuf) > 1252:set_vc_hi_font(vc, false); > > Crash: > > #6 [c90001eafa60] divide_error at 81a00be4 > [exception RIP: fbcon_init+463] > RIP: 814b860f RSP: c90001eafb18 RFLAGS: 00010246 > ... > #7 [c90001eafb60] visual_init at 8154c36e > #8 [c90001eafb80] vc_allocate at 8154f53c > #9 [c90001eafbc8] con_install at 8154f624 > ... 
> > Signed-off-by: Grzegorz Halat > --- > drivers/tty/vt/vt.c | 11 +-- > drivers/video/fbdev/core/fbcon.c | 2 +- > 2 files changed, 10 insertions(+), 3 deletions(-) > > diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c > index 650c66886c80..ec85d195678f 100644 > --- a/drivers/tty/vt/vt.c > +++ b/drivers/tty/vt/vt.c > @@ -1056,6 +1056,13 @@ static void visual_init(struct vc_data *vc, int num, > int init) > vc->vc_screenbuf_size = vc->vc_rows * vc->vc_size_row; > } > > + > +static void visual_deinit(struct vc_data *vc) > +{ > + vc->vc_sw->con_deinit(vc); > + module_put(vc->vc_sw->owner); > +} > + > int vc_allocate(unsigned int currcons) /* return 0 on success */ > { > struct vt_notifier_param param; > @@ -1103,6 +1110,7 @@ int vc_allocate(unsigned int currcons) /* return 0 on > success */ > > return 0; > err_free: > + visual_deinit(vc); > kfree(vc); > vc_cons[currcons].d = NULL; > return -ENOMEM; > @@ -1331,9 +1339,8 @@ struct vc_data *vc_deallocate(unsigned int currcons) > param.vc = vc = vc_cons[currcons].d; > atomic_notifier_call_chain(&vt_notifier_list, VT_DEALLOCATE, > &param); > vcs_remove_sysfs(currcons); > - vc->vc_sw->con_deinit(vc); > + visual_deinit(vc); > put_pid(vc->vt_pid); > - module_put(vc->vc_sw->owner); > vc_uniscr_set(vc, NULL); > kfree(vc->vc_screenbuf); > vc_cons[currcons].d = NULL; > diff --git a/drivers/video/fbdev/core/fbcon.c > b/drivers/video/fbdev/core/fbcon.c > index cd059a801662..c59b23f6e9ba 100644 > --- a/drivers/video/fbdev/core/fbcon.c > +++ b/drivers/video/fbdev/core/fbcon.c > @@ -1248,7 +1248,7 @@ static void fbcon_deinit(struct vc_data *vc) > if (free_font) > vc->vc_font.data = NULL; > > - if (vc->vc_hi_font_mask) > + if (vc->vc_hi_font_mask && vc->vc_screenbuf) > set_vc_hi_font(vc, false); > > if (!con_is_bound(&fb_con)) > -- > 2.20.1 > LGTM. Reviewed-by: Oleksandr Natalenko -- Best regards, Oleksandr Natalenko (post-factum) Senior Software Maintenance Engineer
Re: [RFC 7/7] mm: madvise support MADV_ANONYMOUS_FILTER and MADV_FILE_FILTER
On Tue 21-05-19 11:55:33, Minchan Kim wrote: > On Mon, May 20, 2019 at 11:28:01AM +0200, Michal Hocko wrote: > > [cc linux-api] > > > > On Mon 20-05-19 12:52:54, Minchan Kim wrote: > > > System could have much faster swap device like zRAM. In that case, > > > swapping > > > is extremely cheaper than file-IO on the low-end storage. > > > In this configuration, userspace could handle different strategy for each > > > kinds of vma. IOW, they want to reclaim anonymous pages by MADV_COLD > > > while it keeps file-backed pages in inactive LRU by MADV_COOL because > > > file IO is more expensive in this case so want to keep them in memory > > > until memory pressure happens. > > > > > > To support such strategy easier, this patch introduces > > > MADV_ANONYMOUS_FILTER and MADV_FILE_FILTER options in madvise(2) like > > > that /proc//clear_refs already has supported same filters. > > > They are filters could be Ored with other existing hints using top two > > > bits > > > of (int behavior). > > > > madvise operates on top of ranges and it is quite trivial to do the > > filtering from the userspace so why do we need any additional filtering? > > > > > Once either of them is set, the hint could affect only the interested vma > > > either anonymous or file-backed. > > > > > > With that, user could call a process_madvise syscall simply with a entire > > > range(0x0 - 0x) but either of MADV_ANONYMOUS_FILTER and > > > MADV_FILE_FILTER so there is no need to call the syscall range by range. > > > > OK, so here is the reason you want that. The immediate question is why > > cannot the monitor do the filtering from the userspace. Slightly more > > work, all right, but less of an API to expose and that itself is a > > strong argument against. > > What I should do if we don't have such filter option is to enumerate all of > vma via /proc//maps and then parse every ranges and inode from string, > which would be painful for 2000+ vmas. Painful is not an argument to add a new user API. 
If the existing API suits the purpose then it should be used. If it is not usable, we can think of a different way. -- Michal Hocko SUSE Labs
[PATCH] MIPS: mark ginvt() as __always_inline
To meet the 'i' (immediate) constraint for the asm operands, this function must be always inlined. Signed-off-by: Masahiro Yamada --- arch/mips/include/asm/ginvt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/ginvt.h b/arch/mips/include/asm/ginvt.h index 49c6dbe37338..6eb7c2b94dc7 100644 --- a/arch/mips/include/asm/ginvt.h +++ b/arch/mips/include/asm/ginvt.h @@ -19,7 +19,7 @@ _ASM_MACRO_1R1I(ginvt, rs, type, # define _ASM_SET_GINV #endif -static inline void ginvt(unsigned long addr, enum ginvt_type type) +static __always_inline void ginvt(unsigned long addr, enum ginvt_type type) { asm volatile( ".set push\n" -- 2.17.1
[PATCH] powerpc/mm: mark more tlb functions as __always_inline
With CONFIG_OPTIMIZE_INLINING enabled, Laura Abbott reported error with gcc 9.1.1: arch/powerpc/mm/book3s64/radix_tlb.c: In function '_tlbiel_pid': arch/powerpc/mm/book3s64/radix_tlb.c:104:2: warning: asm operand 3 probably doesn't match constraints 104 | asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) | ^~~ arch/powerpc/mm/book3s64/radix_tlb.c:104:2: error: impossible constraint in 'asm' Fixing _tlbiel_pid() is enough to address the warning above, but I inlined more functions to fix all potential issues. To meet the 'i' (immediate) constraint for the asm operands, functions propagating propagated 'ric' must be always inlined. Fixes: 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING") Reported-by: Laura Abbott Signed-off-by: Masahiro Yamada --- arch/powerpc/mm/book3s64/hash_native.c | 8 +++-- arch/powerpc/mm/book3s64/radix_tlb.c | 44 +++--- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index aaa28fd918fe..bc2c35c0d2b1 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -60,9 +60,11 @@ static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) * tlbiel instruction for hash, set invalidation * i.e., r=1 and is=01 or is=10 or is=11 */ -static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) +static __always_inline void tlbiel_hash_set_isa300(unsigned int set, + unsigned int is, + unsigned int pid, + unsigned int ric, + unsigned int prs) { unsigned long rb; unsigned long rs; diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 4d841369399f..91c4242c1be3 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -29,9 +29,11 @@ * tlbiel instruction for radix, set invalidation * i.e., r=1 and is=01 or is=10 or is=11 */ -static 
inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) +static __always_inline void tlbiel_radix_set_isa300(unsigned int set, + unsigned int is, + unsigned int pid, + unsigned int ric, + unsigned int prs) { unsigned long rb; unsigned long rs; @@ -150,8 +152,8 @@ static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } -static inline void __tlbiel_lpid_guest(unsigned long lpid, int set, - unsigned long ric) +static __always_inline void __tlbiel_lpid_guest(unsigned long lpid, int set, + unsigned long ric) { unsigned long rb,rs,prs,r; @@ -167,8 +169,8 @@ static inline void __tlbiel_lpid_guest(unsigned long lpid, int set, } -static inline void __tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -183,8 +185,8 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid, trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -199,8 +201,10 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid, trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbie_lpid_va(unsigned long va, + unsigned long lpid, + unsigned long ap, + unsigned long ric) { unsigned long rb,rs,prs,r; @@ -239,7 +243,7 @@ static inline void fixup_tlbie_lpid(unsigned long lpid) /* * We use 128 set
Re: linux-next: Tree for May 21
Hi Masahiro, On Tue, 21 May 2019 14:48:21 +0900 Masahiro Yamada wrote: > > FYI. > Commit 15e57a12d4df3c662f6cceaec6d1efa98a3d70f8 > is equivalent to commit ecebc5ce59a003163eb608ace38a01d7ffeb0a95 > which is already in the mainline. > > The former should be dropped, shouldn't it? I have dropped it. I assume Andrew will take it out of his patch queue in due course. Andrew, this is: 15e57a12d4df ("kdb: get rid of broken attempt to print CCVERSION in kdb summary") I dropped most (all?) of the rest that you have sent to Linus, but this hit Linus' tree via another tree. BTW, Masahiro, please trim your quoting to just the relevant bits. -- Cheers, Stephen Rothwell pgpQ0mFSmQLxl.pgp Description: OpenPGP digital signature
[PATCH 0/2] Return immediately if sprd_clk_regmap_init() fails
The function sprd_clk_regmap_init() doesn't always return success, so drivers should return immediately when it fails rather than continue the clock initialization. Patch 1/2 in this set switches to devm_ioremap_resource() instead of of_iomap(), which makes the callers simpler. Chunyan Zhang (2): clk: sprd: Switch from of_iomap() to devm_ioremap_resource() clk: sprd: Add check for return value of sprd_clk_regmap_init() drivers/clk/sprd/common.c | 9 +++-- drivers/clk/sprd/sc9860-clk.c | 4 +++- 2 files changed, 10 insertions(+), 3 deletions(-) -- 2.17.1
[PATCH 2/2] clk: sprd: Add check for return value of sprd_clk_regmap_init()
sprd_clk_regmap_init() doesn't always return success; checking its return value makes the code more robust. Signed-off-by: Chunyan Zhang --- drivers/clk/sprd/sc9860-clk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/clk/sprd/sc9860-clk.c b/drivers/clk/sprd/sc9860-clk.c index 9980ab55271b..1ed45b4f2fe8 100644 --- a/drivers/clk/sprd/sc9860-clk.c +++ b/drivers/clk/sprd/sc9860-clk.c @@ -2031,7 +2031,9 @@ static int sc9860_clk_probe(struct platform_device *pdev) } desc = match->data; - sprd_clk_regmap_init(pdev, desc); + ret = sprd_clk_regmap_init(pdev, desc); + if (ret) + return ret; return sprd_clk_probe(&pdev->dev, desc->hw_clks); } -- 2.17.1
[PATCH 1/2] clk: sprd: Switch from of_iomap() to devm_ioremap_resource()
devm_ioremap_resource() automatically requests the resource, and the devm_ wrappers handle errors and unmap the I/O region when needed, which makes drivers cleaner and simpler. Signed-off-by: Chunyan Zhang --- drivers/clk/sprd/common.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/clk/sprd/common.c b/drivers/clk/sprd/common.c index e038b0447206..a5bdca1de5d0 100644 --- a/drivers/clk/sprd/common.c +++ b/drivers/clk/sprd/common.c @@ -42,6 +42,7 @@ int sprd_clk_regmap_init(struct platform_device *pdev, void __iomem *base; struct device_node *node = pdev->dev.of_node; struct regmap *regmap; + struct resource *res; if (of_find_property(node, "sprd,syscon", NULL)) { regmap = syscon_regmap_lookup_by_phandle(node, "sprd,syscon"); @@ -50,10 +51,14 @@ int sprd_clk_regmap_init(struct platform_device *pdev, return PTR_ERR(regmap); } } else { - base = of_iomap(node, 0); + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(base)) + return PTR_ERR(base); + regmap = devm_regmap_init_mmio(&pdev->dev, base, &sprdclk_regmap_config); - if (IS_ERR_OR_NULL(regmap)) { + if (IS_ERR(regmap)) { pr_err("failed to init regmap\n"); return PTR_ERR(regmap); } -- 2.17.1
Re: [RFC 5/7] mm: introduce external memory hinting API
On Tue 21-05-19 11:41:07, Minchan Kim wrote: > On Mon, May 20, 2019 at 11:18:29AM +0200, Michal Hocko wrote: > > [Cc linux-api] > > > > On Mon 20-05-19 12:52:52, Minchan Kim wrote: > > > There is some usecase that centralized userspace daemon want to give > > > a memory hint like MADV_[COOL|COLD] to other process. Android's > > > ActivityManagerService is one of them. > > > > > > It's similar in spirit to madvise(MADV_WONTNEED), but the information > > > required to make the reclaim decision is not known to the app. Instead, > > > it is known to the centralized userspace daemon(ActivityManagerService), > > > and that daemon must be able to initiate reclaim on its own without > > > any app involvement. > > > > Could you expand some more about how this all works? How does the > > centralized daemon track respective ranges? How does it synchronize > > against parallel modification of the address space etc. > > Currently, we don't track each address ranges because we have two > policies at this moment: > > deactive file pages and reclaim anonymous pages of the app. > > Since the daemon has a ability to let background apps resume(IOW, process > will be run by the daemon) and both hints are non-disruptive stabilty point > of view, we are okay for the race. Fair enough but the API should consider future usecases where this might be a problem. So we should really think about those potential scenarios now. If we are ok with that, fine, but then we should be explicit and document it that way. Essentially say that any sort of synchronization is supposed to be done by monitor. This will make the API less usable but maybe that is enough. > > > To solve the issue, this patch introduces new syscall process_madvise(2) > > > which works based on pidfd so it could give a hint to the exeternal > > > process. > > > > > > int process_madvise(int pidfd, void *addr, size_t length, int advise); > > > > OK, this makes some sense from the API point of view. 
When we have > > discussed that at LSFMM I was contemplating about something like that > > except the fd would be a VMA fd rather than the process. We could extend > > and reuse /proc//map_files interface which doesn't support the > > anonymous memory right now. > > > > I am not saying this would be a better interface but I wanted to mention > > it here for a further discussion. One slight advantage would be that > > you know the exact object that you are operating on because you have a > > fd for the VMA and we would have a more straightforward way to reject > > operation if the underlying object has changed (e.g. unmapped and reused > > for a different mapping). > > I agree your point. If I didn't miss something, such kinds of vma level > modify notification doesn't work even file mapped vma at this moment. > For anonymous vma, I think we could use userfaultfd, pontentially. > It would be great if someone want to do with disruptive hints like > MADV_DONTNEED. > > I'd like to see it further enhancement after landing address range based > operation via limiting hints process_madvise supports to non-disruptive > only(e.g., MADV_[COOL|COLD]) so that we could catch up the usercase/workload > when someone want to extend the API. So do you think we want both interfaces (process_madvise and madvisefd)? -- Michal Hocko SUSE Labs
GNÚ
Microsoft seems to be more and more interested in GNU. Do you not want to be part of that money? Indeed, may I suggest a slight stylistic change of GNU, to GNÚ. Peace, YC.
Re: Re: Re: [PATCH v3 11/14] dmaengine: imx-sdma: fix ecspi1 rx dma not work on i.mx8mm
On 21-05-19, 05:41, Robin Gong wrote: > > -Original Message- > > From: Vinod Koul > > Sent: 2019年5月21日 13:13 > > > > On 21-05-19, 04:58, Robin Gong wrote: > > > > -Original Message- > > > > From: Vinod Koul > > > > Sent: 2019年5月21日 12:18 > > > > > > > > On 07-05-19, 09:16, Robin Gong wrote: > > > > > Because the number of ecspi1 rx event on i.mx8mm is 0, the > > > > > condition check ignore such special case without dma channel > > > > > enabled, which caused > > > > > ecspi1 rx works failed. Actually, no need to check event_id0, > > > > > checking > > > > > event_id1 is enough for DEV_2_DEV case because it's so lucky that > > > > > event_id1 never be 0. > > > > > > > > Well is that by chance or design that event_id1 will be never 0? > > > > > > > That's by chance. DEV_2_DEV is just for Audio case and non-zero for > > event_id1 on current i.MX family. > > > > Then it wont be fgood to rely on chance :) > Yes, I knew that. May I create another independent patch for event_id1 since > that's potential issue is not related with this ecspi patch set? Sure a patch should change one thing but I think it should come before this one. The log for this should be fixed up as well -- ~Vinod
Re: [RFC 3/7] mm: introduce MADV_COLD
On Tue 21-05-19 08:00:38, Minchan Kim wrote: > On Mon, May 20, 2019 at 10:27:03AM +0200, Michal Hocko wrote: > > [Cc linux-api] > > > > On Mon 20-05-19 12:52:50, Minchan Kim wrote: > > > When a process expects no accesses to a certain memory range > > > for a long time, it could hint kernel that the pages can be > > > reclaimed instantly but data should be preserved for future use. > > > This could reduce workingset eviction so it ends up increasing > > > performance. > > > > > > This patch introduces the new MADV_COLD hint to madvise(2) > > > syscall. MADV_COLD can be used by a process to mark a memory range > > > as not expected to be used for a long time. The hint can help > > > kernel in deciding which pages to evict proactively. > > > > As mentioned in other email this looks like a non-destructive > > MADV_DONTNEED alternative. > > > > > Internally, it works via reclaiming memory in process context > > > the syscall is called. If the page is dirty but backing storage > > > is not synchronous device, the written page will be rotate back > > > into LRU's tail once the write is done so they will reclaim easily > > > when memory pressure happens. If backing storage is > > > synchrnous device(e.g., zram), hte page will be reclaimed instantly. > > > > Why do we special case async backing storage? Please always try to > > explain _why_ the decision is made. > > I didn't make any decesion. ;-) That's how current reclaim works to > avoid latency of freeing page in interrupt context. I had a patchset > to resolve the concern a few years ago but got distracted. Please articulate that in the changelog then. Or even do not go into implementation details and stick with - reuse the current reclaim implementation. If you call out some of the specific details you are risking people will start depending on them. The fact that this reuses the currect reclaim logic is enough from the review point of view because we know that there is no additional special casing to worry about. 
-- Michal Hocko SUSE Labs
[PATCH 1/3] KVM: Documentation: Add disable pause exits to KVM_CAP_X86_DISABLE_EXITS
From: Wanpeng Li Commit b31c114b (KVM: X86: Provide a capability to disable PAUSE intercepts) forgot to add the KVM_X86_DISABLE_EXITS_PAUSE into api doc. This patch adds it. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Liran Alon Signed-off-by: Wanpeng Li --- Documentation/virtual/kvm/api.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index ba6c42c..33cd92d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4893,6 +4893,7 @@ Valid bits in args[0] are #define KVM_X86_DISABLE_EXITS_MWAIT(1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) +#define KVM_X86_DISABLE_EXITS_PAUSE(1 << 2) Enabling this capability on a VM provides userspace with a way to no longer intercept some instructions for improved latency in some -- 2.7.4
[PATCH v2 2/3] KVM: X86: Provide a capability to disable cstate msr read intercepts
From: Wanpeng Li Allow guest reads CORE cstate when exposing host CPU power management capabilities to the guest. PKG cstate is restricted to avoid a guest to get the whole package information in multi-tenant scenario. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Liran Alon Signed-off-by: Wanpeng Li --- v1 -> v2: * use a separate bit for KVM_CAP_X86_DISABLE_EXITS Documentation/virtual/kvm/api.txt | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c| 6 ++ arch/x86/kvm/x86.c| 5 - arch/x86/kvm/x86.h| 5 + include/uapi/linux/kvm.h | 4 +++- tools/include/uapi/linux/kvm.h| 4 +++- 7 files changed, 23 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 33cd92d..91fd86f 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4894,6 +4894,7 @@ Valid bits in args[0] are #define KVM_X86_DISABLE_EXITS_MWAIT(1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE(1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) Enabling this capability on a VM provides userspace with a way to no longer intercept some instructions for improved latency in some diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d5457c7..1ce8289 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -882,6 +882,7 @@ struct kvm_arch { bool mwait_in_guest; bool hlt_in_guest; bool pause_in_guest; + bool cstate_in_guest; unsigned long irq_sources_bitmap; s64 kvmclock_offset; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0861c71..da24f18 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6637,6 +6637,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + if (kvm_cstate_in_guest(kvm)) { + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + } vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 202048e..765fe59 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3098,7 +3098,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_CLOCK_TSC_STABLE; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE; + r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_CSTATE; if(kvm_can_mwait_in_guest()) r |= KVM_X86_DISABLE_EXITS_MWAIT; break; @@ -4612,6 +4613,8 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a470ff0..275b3b6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -333,6 +333,11 @@ static inline bool kvm_pause_in_guest(struct kvm *kvm) return kvm->arch.pause_in_guest; } +static inline bool kvm_cstate_in_guest(struct kvm *kvm) +{ + return kvm->arch.cstate_in_guest; +} + DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2fe12b4..c2152f3 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -696,9 +696,11 @@ struct kvm_ioeventfd { #define 
KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) #define KVM_X86_DISABLE_EXITS_HLT(1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) #define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \ KVM_X86_DISABLE_EXITS_HLT | \ - K
[PATCH v2 3/3] KVM: X86: Emulate MSR_IA32_MISC_ENABLE MWAIT bit
From: Wanpeng Li MSR IA32_MISC_ENABLE bit 18, according to SDM: | When this bit is set to 0, the MONITOR feature flag is not set (CPUID.01H:ECX[bit 3] = 0). | This indicates that MONITOR/MWAIT are not supported. | | Software attempts to execute MONITOR/MWAIT will cause #UD when this bit is 0. | | When this bit is set to 1 (default), MONITOR/MWAIT are supported (CPUID.01H:ECX[bit 3] = 1). The CPUID.01H:ECX[bit 3] ought to mirror the value of the MSR bit, CPUID.01H:ECX[bit 3] is a better guard than kvm_mwait_in_guest(). kvm_mwait_in_guest() affects the behavior of MONITOR/MWAIT, not its guest visibility. This patch implements toggling of the CPUID bit based on guest writes to the MSR. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Liran Alon Cc: Konrad Rzeszutek Wilk Signed-off-by: Wanpeng Li --- v1 -> v2: * hide behind KVM_CAP_DISABLE_QUIRKS arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/cpuid.c| 10 ++ arch/x86/kvm/x86.c | 10 ++ 3 files changed, 21 insertions(+) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7a0e64c..e3ae96b5 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -382,6 +382,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_CD_NW_CLEARED(1 << 1) #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) +#define KVM_X86_QUIRK_MISC_ENABLE_MWAIT (1 << 4) #define KVM_STATE_NESTED_GUEST_MODE0x0001 #define KVM_STATE_NESTED_RUN_PENDING 0x0002 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e18a9f9..f54d266 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -137,6 +137,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_MWAIT)) { + best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + if (best) { + if (vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT) + 
best->ecx |= F(MWAIT); + else + best->ecx &= ~F(MWAIT); + } + } + /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); kvm_mmu_reset_context(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 765fe59..a4eb711 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2547,6 +2547,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_MISC_ENABLE: + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_MWAIT) && + ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + if ((vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT) && + !(data & MSR_IA32_MISC_ENABLE_MWAIT)) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) + return 1; + } + vcpu->arch.ia32_misc_enable_msr = data; + kvm_update_cpuid(vcpu); + } vcpu->arch.ia32_misc_enable_msr = data; break; case MSR_IA32_SMBASE: -- 2.7.4
Re: [RFC 1/7] mm: introduce MADV_COOL
On Tue 21-05-19 07:54:19, Minchan Kim wrote: > On Mon, May 20, 2019 at 10:16:21AM +0200, Michal Hocko wrote: [...] > > > Internally, it works via deactivating memory from active list to > > > inactive's head so when the memory pressure happens, they will be > > > reclaimed earlier than other active pages unless there is no > > > access until the time. > > > > Could you elaborate about the decision to move to the head rather than > > tail? What should happen to inactive pages? Should we move them to the > > tail? Your implementation seems to ignore those completely. Why? > > Normally, inactive LRU could have used-once pages without any mapping > to user's address space. Such pages would be better candicate to > reclaim when the memory pressure happens. With deactivating only > active LRU pages of the process to the head of inactive LRU, we will > keep them in RAM longer than used-once pages and could have more chance > to be activated once the process is resumed. You are making some assumptions here. You have an explicit call what is cold now you are assuming something is even colder. Is this assumption a general enough to make people depend on it? Not that we wouldn't be able to change to logic later but that will always be risky - especially in the area when somebody want to make a user space driven memory management. > > What should happen for shared pages? In other words do we want to allow > > less privileged process to control evicting of shared pages with a more > > privileged one? E.g. think of all sorts of side channel attacks. Maybe > > we want to do the same thing as for mincore where write access is > > required. > > It doesn't work with shared pages(ie, page_mapcount > 1). I will add it > in the description. OK, this is good for the starter. It makes the implementation simpler and we can add shared mappings coverage later. Although I would argue that touching only writeable mappings should be reasonably safe. -- Michal Hocko SUSE Labs
Re: [PATCH] drivers: md: Unify common definitions of raid1 and raid10
On Thu, May 16, 2019 at 8:39 AM Song Liu wrote: > > On Thu, May 16, 2019 at 4:24 AM Marcos Paulo de Souza > wrote: > > > > ping. > > Applied to https://github.com/liu-song-6/linux/tree/md-next. > > Thanks for the patch. I will process it after the merge window closes. > > Song > > > On Thu, May 09, 2019 at 08:18:49AM -0300, Marcos Paulo de Souza wrote: > > > These definitions are being moved to raid1-10.c. > > > > > > Signed-off-by: Marcos Paulo de Souza > > > --- > > > drivers/md/raid1-10.c | 25 + > > > drivers/md/raid1.c| 29 ++--- > > > drivers/md/raid10.c | 27 +-- > > > 3 files changed, 28 insertions(+), 53 deletions(-) > > > > > > diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c > > > index 41b815db..7d968bf08e54 100644 > > > --- a/drivers/md/raid1-10.c > > > +++ b/drivers/md/raid1-10.c > > > @@ -3,6 +3,31 @@ > > > #define RESYNC_BLOCK_SIZE (64*1024) > > > #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) > > > > > > +/* > > > + * Number of guaranteed raid bios in case of extreme VM load: > > > + */ > > > +#define NR_RAID_BIOS 256 > > > + > > > +/* when we get a read error on a read-only array, we redirect to another > > > + * device without failing the first device, or trying to over-write to > > > + * correct the read error. To keep track of bad blocks on a per-bio > > > + * level, we store IO_BLOCKED in the appropriate 'bios' pointer > > > + */ > > > +#define IO_BLOCKED ((struct bio *)1) > > > +/* When we successfully write to a known bad-block, we need to remove the > > > + * bad-block marking which must be done from process context. So we > > > record > > > + * the success by setting devs[n].bio to IO_MADE_GOOD > > > + */ > > > +#define IO_MADE_GOOD ((struct bio *)2) > > > + > > > +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) > > > + > > > +/* When there are this many requests queue to be written by > > > + * the raid thread, we become 'congested' to provide back-pressure > > > + * for writeback. 
> > > + */ > > > +static int max_queued_requests = 1024; > > > + > > > /* for managing resync I/O pages */ > > > struct resync_pages { > > > void*raid_bio; > > > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c > > > index 0c8a098d220e..bb052c35bf29 100644 > > > --- a/drivers/md/raid1.c > > > +++ b/drivers/md/raid1.c > > > @@ -50,31 +50,6 @@ > > >(1L << MD_HAS_PPL) | \ > > >(1L << MD_HAS_MULTIPLE_PPLS)) > > > > > > -/* > > > - * Number of guaranteed r1bios in case of extreme VM load: > > > - */ > > > -#define NR_RAID1_BIOS 256 > > > - > > > -/* when we get a read error on a read-only array, we redirect to another > > > - * device without failing the first device, or trying to over-write to > > > - * correct the read error. To keep track of bad blocks on a per-bio > > > - * level, we store IO_BLOCKED in the appropriate 'bios' pointer > > > - */ > > > -#define IO_BLOCKED ((struct bio *)1) > > > -/* When we successfully write to a known bad-block, we need to remove the > > > - * bad-block marking which must be done from process context. So we > > > record > > > - * the success by setting devs[n].bio to IO_MADE_GOOD > > > - */ > > > -#define IO_MADE_GOOD ((struct bio *)2) > > > - > > > -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) > > > - > > > -/* When there are this many requests queue to be written by > > > - * the raid1 thread, we become 'congested' to provide back-pressure > > > - * for writeback. 
> > > - */ > > > -static int max_queued_requests = 1024; > > > - > > > static void allow_barrier(struct r1conf *conf, sector_t sector_nr); > > > static void lower_barrier(struct r1conf *conf, sector_t sector_nr); > > > > > > @@ -2955,7 +2930,7 @@ static struct r1conf *setup_conf(struct mddev > > > *mddev) > > > if (!conf->poolinfo) > > > goto abort; > > > conf->poolinfo->raid_disks = mddev->raid_disks * 2; > > > - err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, > > > r1bio_pool_alloc, > > > + err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, > > > r1bio_pool_alloc, > > > r1bio_pool_free, conf->poolinfo); > > > if (err) > > > goto abort; > > > @@ -3240,7 +3215,7 @@ static int raid1_reshape(struct mddev *mddev) > > > newpoolinfo->mddev = mddev; > > > newpoolinfo->raid_disks = raid_disks * 2; > > > > > > - ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc, > > > + ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, > > > r1bio_pool_free, newpoolinfo); > > > if (ret) { > > > kfree(newpoolinfo); > > > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c > > > index 3b6880dd648d..24cb116d950f 100644 > > > --- a/drivers/md/raid10.c > > > +++ b/drivers/md/raid10.c > > > @@ -73,31 +73,6 @@ > > > *[B A] [D C][B A] [E C D] > > > */ > > > >
Re: [PATCH RESEND] kvm: make kvm_vcpu_(un)map dependency on CONFIG_HAS_IOMEM explicit
On Mon, May 20, 2019 at 07:23:43PM +0200, Paolo Bonzini wrote: > On 20/05/19 18:44, Michal Kubecek wrote: > > Recently introduced functions kvm_vcpu_map() and kvm_vcpu_unmap() call > > memremap() and memunmap() which are only available if HAS_IOMEM is enabled > > but this dependency is not explicit, so that the build fails with HAS_IOMEM > > disabled. > > > > As both function are only used on x86 where HAS_IOMEM is always enabled, > > the easiest fix seems to be to only provide them when HAS_IOMEM is enabled. > > > > Fixes: e45adf665a53 ("KVM: Introduce a new guest mapping API") > > Signed-off-by: Michal Kubecek > > --- > > Thank you very much. However, it's better if only the memremap part is > hidden behind CONFIG_HAS_IOMEM. I'll send a patch tomorrow and have it > reach Linus at most on Wednesday. That sounds like a better solution. As I'm not familiar with the code, I didn't want to risk and suggested the easiest way around. Michal > There is actually nothing specific to CONFIG_HAS_IOMEM in them, > basically the functionality we want is remap_pfn_range but without a > VMA. However, it's for a niche use case where KVM guest memory is > mmap-ed from /dev/mem and it's okay if for now that part remains > disabled on s390. > > Paolo
Re: [PATCH 2/4] md: raid0: Remove return statement from void function
On Mon, May 20, 2019 at 2:45 PM Marcos Paulo de Souza wrote: > > This return statement was introduced in commit > 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 ("Linux-2.6.12-rc2") and can be > safely removed. Wow, that's a really old commit. :) I think 3/4 and 4/4 of the set makes git-blame more difficult to follow. Let's not apply them. Thanks, Song > > Signed-off-by: Marcos Paulo de Souza > --- > drivers/md/raid0.c | 1 - > 1 file changed, 1 deletion(-) > > diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c > index f3fb5bb8c82a..42b0287104bd 100644 > --- a/drivers/md/raid0.c > +++ b/drivers/md/raid0.c > @@ -609,7 +609,6 @@ static bool raid0_make_request(struct mddev *mddev, > struct bio *bio) > static void raid0_status(struct seq_file *seq, struct mddev *mddev) > { > seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); > - return; > } > > static void *raid0_takeover_raid45(struct mddev *mddev) > -- > 2.21.0 >
Re: [PATCH RESEND] kvm: make kvm_vcpu_(un)map dependency on CONFIG_HAS_IOMEM explicit
On Mon, May 20, 2019 at 03:45:29PM -0700, Bjorn Andersson wrote: > On Mon, May 20, 2019 at 9:44 AM Michal Kubecek wrote: > > > > Recently introduced functions kvm_vcpu_map() and kvm_vcpu_unmap() call > > memremap() and memunmap() which are only available if HAS_IOMEM is enabled > > but this dependency is not explicit, so that the build fails with HAS_IOMEM > > disabled. > > > > As both function are only used on x86 where HAS_IOMEM is always enabled, > > the easiest fix seems to be to only provide them when HAS_IOMEM is enabled. > > > > Fixes: e45adf665a53 ("KVM: Introduce a new guest mapping API") > > Signed-off-by: Michal Kubecek > > Hi Michal, > > I see the same build issue on arm64 and as CONFIG_HAS_IOMEM is set > there this patch has no effect on solving that. Instead I had to > include linux/io.h in kvm_main.c to make it compile. This sounds like a different problem which was already resolved in mainline by commit c011d23ba046 ("kvm: fix compilation on aarch64") which is present in v5.2-rc1. The issue I'm trying to address is link time failure (unresolved reference to memremap()/memunmap()) when CONFIG_HAS_IOMEM is disabled (in our case it affects a special minimalistic s390x config for zfcpdump). Michal
Re: linux-next: Tree for May 21
Hi Stephen, Andrew, On Tue, May 21, 2019 at 2:15 PM Stephen Rothwell wrote: > > Hi all, FYI. Commit 15e57a12d4df3c662f6cceaec6d1efa98a3d70f8 is equivalent to commit ecebc5ce59a003163eb608ace38a01d7ffeb0a95 which is already in the mainline. The former should be dropped, shouldn't it? Thanks. > > Changes since 20190520: > > New trees: soc-fsl, soc-fsl-fixes > > Removed trees: (not updated for more than a year) > alpine, samsung, sh, befs, kconfig, dwmw2-iommu, trivial, > target-updates, target-bva, init_task > > The imx-mxs tree gained a build failure so I used the version from > next-20190520. > > The sunxi tree gained a conflict against the imx-mxs tree. > > The drm-misc tree gained conflicts against Linis' and the amdgpu trees. > > Non-merge commits (relative to Linus' tree): 991 > 998 files changed, 29912 insertions(+), 14691 deletions(-) > > > > I have created today's linux-next tree at > git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git > (patches at http://www.kernel.org/pub/linux/kernel/next/ ). If you > are tracking the linux-next tree using git, you should not use "git pull" > to do so as that will try to merge the new linux-next release with the > old one. You should use "git fetch" and checkout or reset to the new > master. > > You can see which trees have been included by looking in the Next/Trees > file in the source. There are also quilt-import.log and merge.log > files in the Next directory. Between each merge, the tree was built > with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a > multi_v7_defconfig for arm and a native build of tools/perf. After > the final fixups (if any), I do an x86_64 modules_install followed by > builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit), > ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc > and sparc64 defconfig. And finally, a simple boot test of the powerpc > pseries_le_defconfig kernel in qemu (with and without kvm enabled). 
> > Below is a summary of the state of the merge. > > I am currently merging 290 trees (counting Linus' and 70 trees of bug > fix patches pending for the current merge release). > > Stats about the size of the tree over time can be seen at > http://neuling.org/linux-next-size.html . > > Status of my local build tests will be at > http://kisskb.ellerman.id.au/linux-next . If maintainers want to give > advice about cross compilers/configs that work, we are always open to add > more builds. > > Thanks to Randy Dunlap for doing many randconfig builds. And to Paul > Gortmaker for triage and bug fixes. > > -- > Cheers, > Stephen Rothwell > > $ git checkout master > $ git reset --hard stable > Merging origin/master (f49aa1de9836 Merge tag 'for-5.2-rc1-tag' of > git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux) > Merging fixes/master (2bbacd1a9278 Merge tag 'kconfig-v5.2' of > git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild) > Merging kspp-gustavo/for-next/kspp (b324f1b28dc0 afs: yfsclient: Mark > expected switch fall-throughs) > Merging kbuild-current/fixes (a2d635decbfa Merge tag 'drm-next-2019-05-09' of > git://anongit.freedesktop.org/drm/drm) > Merging arc-current/for-curr (c5a1726d7383 ARC: entry: EV_Trap expects r10 > (vs. 
r9) to have exception cause) > Merging arm-current/fixes (e17b1af96b2a ARM: 8857/1: efi: enable CP15 DMB > instructions before cleaning the cache) > Merging arm64-fixes/for-next/fixes (7a0a93c51799 arm64: vdso: Explicitly add > build-id option) > Merging m68k-current/for-linus (fdd20ec8786a Documentation/features/time: > Mark m68k having modern-timekeeping) > Merging powerpc-fixes/fixes (672eaf37db9f powerpc/cacheinfo: Remove double > free) > Merging sparc/master (f49aa1de9836 Merge tag 'for-5.2-rc1-tag' of > git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux) > Merging fscrypt-current/for-stable (ae64f9bd1d36 Linux 4.15-rc2) > Merging net/master (fa2c52be7129 vlan: Mark expected switch fall-through) > Merging bpf/master (6a0a923dfa14 of_net: fix of_get_mac_address retval if > compiled without CONFIG_OF) > Merging ipsec/master (9b3040a6aafd ipv4: Define __ipv4_neigh_lookup_noref > when CONFIG_INET is disabled) > Merging netfilter/master (2c82c7e724ff netfilter: nf_tables: fix oops during > rule dump) > Merging ipvs/master (b2e3d68d1251 netfilter: nft_compat: destroy function > must not have side effects) > Merging wireless-drivers/master (7a0f8ad5ff63 Merge ath-current from > git://git.kernel.org/pub/scm/linux/kernel/git/k
RE: [PATCH v4] clk: qoriq: add support for lx2160a
Hello Stephen, I have incorporated review comments from https://patchwork.kernel.org/patch/10917171/ A gentle reminder to apply the patch https://patchwork.kernel.org/patch/10918407/. Regards, Vabhav > -Original Message- > From: Vabhav Sharma > Sent: Friday, April 26, 2019 12:24 PM > To: linux-kernel@vger.kernel.org; linux-...@vger.kernel.org > Cc: sb...@kernel.org; mturque...@baylibre.com; Vabhav Sharma > ; Andy Tang ; Yogesh > Narayan Gaur > Subject: [PATCH v4] clk: qoriq: add support for lx2160a > > Add clockgen support and configuration for NXP SoC lx2160a with > compatible property as "fsl,lx2160a-clockgen". > > Signed-off-by: Tang Yuantian > Signed-off-by: Yogesh Gaur > Signed-off-by: Vabhav Sharma > Acked-by: Scott Wood > Acked-by: Stephen Boyd > Acked-by: Viresh Kumar > --- > Changes for v4: > - Incorporated review comments from Stephen Boyd > > Changes for v3: > - Incorporated review comments of Rafael J. Wysocki > - Updated commit message > > Changes for v2: > - Subject line updated > > drivers/clk/clk-qoriq.c | 12 > 1 file changed, 12 insertions(+) > > diff --git a/drivers/clk/clk-qoriq.c b/drivers/clk/clk-qoriq.c index > 3d51d7c..1a15201 100644 > --- a/drivers/clk/clk-qoriq.c > +++ b/drivers/clk/clk-qoriq.c > @@ -570,6 +570,17 @@ static const struct clockgen_chipinfo chipinfo[] = { > .flags = CG_VER3 | CG_LITTLE_ENDIAN, > }, > { > + .compat = "fsl,lx2160a-clockgen", > + .cmux_groups = { > + &clockgen2_cmux_cga12, &clockgen2_cmux_cgb > + }, > + .cmux_to_group = { > + 0, 0, 0, 0, 1, 1, 1, 1, -1 > + }, > + .pll_mask = 0x37, > + .flags = CG_VER3 | CG_LITTLE_ENDIAN, > + }, > + { > .compat = "fsl,p2041-clockgen", > .guts_compat = "fsl,qoriq-device-config-1.0", > .init_periph = p2041_init_periph, > @@ -1427,6 +1438,7 @@ CLK_OF_DECLARE(qoriq_clockgen_ls1043a, > "fsl,ls1043a-clockgen", clockgen_init); > CLK_OF_DECLARE(qoriq_clockgen_ls1046a, "fsl,ls1046a-clockgen", > clockgen_init); CLK_OF_DECLARE(qoriq_clockgen_ls1088a, "fsl,ls1088a- > clockgen", 
clockgen_init); CLK_OF_DECLARE(qoriq_clockgen_ls2080a, > "fsl,ls2080a-clockgen", clockgen_init); > +CLK_OF_DECLARE(qoriq_clockgen_lx2160a, "fsl,lx2160a-clockgen", > +clockgen_init); > CLK_OF_DECLARE(qoriq_clockgen_p2041, "fsl,p2041-clockgen", clockgen_init); > CLK_OF_DECLARE(qoriq_clockgen_p3041, "fsl,p3041-clockgen", clockgen_init); > CLK_OF_DECLARE(qoriq_clockgen_p4080, "fsl,p4080-clockgen", clockgen_init); > -- > 2.7.4
RE: Re: Re: [PATCH v3 11/14] dmaengine: imx-sdma: fix ecspi1 rx dma not work on i.mx8mm
> -Original Message- > From: Vinod Koul > Sent: 2019年5月21日 13:13 > > On 21-05-19, 04:58, Robin Gong wrote: > > > -Original Message- > > > From: Vinod Koul > > > Sent: 2019年5月21日 12:18 > > > > > > On 07-05-19, 09:16, Robin Gong wrote: > > > > Because the number of ecspi1 rx event on i.mx8mm is 0, the > > > > condition check ignore such special case without dma channel > > > > enabled, which caused > > > > ecspi1 rx works failed. Actually, no need to check event_id0, > > > > checking > > > > event_id1 is enough for DEV_2_DEV case because it's so lucky that > > > > event_id1 never be 0. > > > > > > Well is that by chance or design that event_id1 will be never 0? > > > > > That's by chance. DEV_2_DEV is just for Audio case and non-zero for > event_id1 on current i.MX family. > > Then it wont be fgood to rely on chance :) Yes, I knew that. May I create another independent patch for event_id1, since that potential issue is not related to this ecspi patch set? > > -- > ~Vinod
Re: [PATCHv1 4/8] arm64: dts: qcom: msm8916: Use more generic idle state names
On Wed, May 15, 2019 at 6:33 PM Niklas Cassel wrote: > > On Wed, May 15, 2019 at 03:43:19PM +0530, Amit Kucheria wrote: > > On Tue, May 14, 2019 at 9:42 PM Niklas Cassel > > wrote: > > > > > > On Fri, May 10, 2019 at 04:59:42PM +0530, Amit Kucheria wrote: > > > > Instead of using Qualcomm-specific terminology, use generic node names > > > > for the idle states that are easier to understand. Move the description > > > > into the "idle-state-name" property. > > > > > > > > Signed-off-by: Amit Kucheria > > > > --- > > > > arch/arm64/boot/dts/qcom/msm8916.dtsi | 11 ++- > > > > 1 file changed, 6 insertions(+), 5 deletions(-) > > > > > > > > diff --git a/arch/arm64/boot/dts/qcom/msm8916.dtsi > > > > b/arch/arm64/boot/dts/qcom/msm8916.dtsi > > > > index ded1052e5693..400b609bb3fd 100644 > > > > --- a/arch/arm64/boot/dts/qcom/msm8916.dtsi > > > > +++ b/arch/arm64/boot/dts/qcom/msm8916.dtsi > > > > @@ -110,7 +110,7 @@ > > > > reg = <0x0>; > > > > next-level-cache = <&L2_0>; > > > > enable-method = "psci"; > > > > - cpu-idle-states = <&CPU_SPC>; > > > > + cpu-idle-states = <&CPU_SLEEP_0>; > > > > clocks = <&apcs>; > > > > operating-points-v2 = <&cpu_opp_table>; > > > > #cooling-cells = <2>; > > > > @@ -122,7 +122,7 @@ > > > > reg = <0x1>; > > > > next-level-cache = <&L2_0>; > > > > enable-method = "psci"; > > > > - cpu-idle-states = <&CPU_SPC>; > > > > + cpu-idle-states = <&CPU_SLEEP_0>; > > > > clocks = <&apcs>; > > > > operating-points-v2 = <&cpu_opp_table>; > > > > #cooling-cells = <2>; > > > > @@ -134,7 +134,7 @@ > > > > reg = <0x2>; > > > > next-level-cache = <&L2_0>; > > > > enable-method = "psci"; > > > > - cpu-idle-states = <&CPU_SPC>; > > > > + cpu-idle-states = <&CPU_SLEEP_0>; > > > > clocks = <&apcs>; > > > > operating-points-v2 = <&cpu_opp_table>; > > > > #cooling-cells = <2>; > > > > @@ -146,7 +146,7 @@ > > > > reg = <0x3>; > > > > next-level-cache = <&L2_0>; > > > > enable-method = "psci"; > > > > - cpu-idle-states = <&CPU_SPC>; > > > > + cpu-idle-states = 
<&CPU_SLEEP_0>; > > > > clocks = <&apcs>; > > > > operating-points-v2 = <&cpu_opp_table>; > > > > #cooling-cells = <2>; > > > > @@ -160,8 +160,9 @@ > > > > idle-states { > > > > entry-method="psci"; > > > > > > Please add a space before and after "=". > > > > > > > > > > > - CPU_SPC: spc { > > > > + CPU_SLEEP_0: cpu-sleep-0 { > > > > > > While I like your idea of using power state names from > > > Server Base System Architecture document (SBSA) where applicable, > > > does each qcom power state have a matching state in SBSA? > > > > > > These are the qcom power states: > > > https://source.codeaurora.org/quic/la/kernel/msm-4.4/tree/Documentation/devicetree/bindings/arm/msm/lpm-levels.txt?h=msm-4.4#n53 > > > > > > Note that qcom defines: > > > "wfi", "retention", "gdhs", "pc", "fpc" > > > while SBSA simply defines "idle_standby" (aka wfi), "idle_retention", > > > "sleep". > > > > > > Unless you know the equivalent name for each qcom power state > > > (perhaps several qcom power states are really the same SBSA state?), > > > I think that you should omit the renaming from this patch series. > > > > That is what SLEEP_0, SLEEP_1, SLEEP_2 could be used for. > > Ok, sounds good to me. > > > > > IOW, all these qcom definitions are nicely represented in the > > state-name and we could simply stick to SLEEP_0, SLEEP_1 for the node > > names. There is wide variability in the the names of the qcom idle > > states across SoC families downstream, so I'd argue against using > > those for the node names. > > > > Just for cpu states (non-wfi) I see the use of the following names > > downstream across families. 
The C seems to come from x86 > > world[1]: > > > > - C4, standalone power collapse (spc) > > - C4, power collapse (fpc) > > - C2D, retention > > - C3, power collapse (pc) > > - C4, rail power collapse (rail-pc) > > > > [1] > > https://www.hardwaresecrets.com/everything-you-need-to-know-about-the-cpu-c-states-power-saving-modes/ > > Indeed, there seems to be mixed names used, I've also seen "fpc-def". > > So, you have convinced me. > > > Kind regards, > Niklas Can I take that as a Reviewed-by?
linux-next: Signed-off-by missing for commit in the amlogic tree
Hi all, Commit 5d32a77c6e2e ("arm64: dts: meson-g12a: Add PWM nodes") is missing a Signed-off-by from its committer. -- Cheers, Stephen Rothwell pgpBisrW4v1Uo.pgp Description: OpenPGP digital signature
Re: [RFC PATCH v2 0/4] Input: mpr121-polled: Add polled driver for MPR121
Hi Michal, On Fri, May 17, 2019 at 03:12:49PM +0200, Michal Vokáč wrote: > Hi, > > I have to deal with a situation where we have a custom i.MX6 based > platform in production that uses the MPR121 touchkey controller. > Unfortunately the chip is connected using only the I2C interface. > The interrupt line is not used. Back in 2015 (Linux v3.14), my > colleague modded the existing mpr121_touchkey.c driver to use polling > instead of interrupt. > > For quite some time yet I am in a process of updating the product from > the ancient Freescale v3.14 kernel to the latest mainline and pushing > any needed changes upstream. The DT files for our imx6dl-yapp4 platform > already made it into v5.1-rc. > > I rebased and updated our mpr121 patch to the latest mainline. > It is created as a separate driver, similarly to gpio_keys_polled. > > The I2C device is quite susceptible to ESD. An ESD test quite often > causes reset of the chip or some register randomly changes its value. > The [PATCH 3/4] adds a write-through register cache. With the cache > this state can be detected and the device can be re-initialied. > > The main question is: Is there any chance that such a polled driver > could be accepted? Is it correct to implement it as a separate driver > or should it be done as an option in the existing driver? I can not > really imagine how I would do that though.. > > There are also certain worries that the MPR121 chip may no longer be > available in nonspecifically distant future. In case of EOL I will need > to add a polled driver for an other touchkey chip. May it be already > in mainline or a completely new one. I think that my addition of input_polled_dev was ultimately a wrong thing to do. I am looking into enabling polling mode for regular input devices as we then can enable polling mode in existing drivers. 
As far as gpio-keys vs gpio-key-polled goes, I feel that the capabilities of the polling driver are sufficiently different from the interrupt-driven one, so we will likely keep them separate. Thanks. -- Dmitry
Re: [PATCH 1/2] Input: atmel_mxt_ts - add wakeup support
On Sat, May 18, 2019 at 06:55:10PM +0200, stefano.ma...@gmail.com wrote: > Hi Dmitry, > > On Fri, 2019-05-17 at 14:30 -0700, Dmitry Torokhov wrote: > > Hi Sefano, > > > > On Fri, May 17, 2019 at 11:17:40PM +0200, Stefano Manni wrote: > > > Add wakeup support to the maxtouch driver. > > > The device can wake up the system from suspend, > > > mark the IRQ as wakeup capable, so that device > > > irq is not disabled during system suspend. > > > > This should already be handled by I2C core, see lines after "if > > (client->flags & I2C_CLIENT_WAKE)" in drivers/i2c/i2c-core-base.c. > > > > Unless there is dedicated wakeup interrupt we configure main > > interrupt > > as wake source. > > > > what's about the other drivers (e.g. ili210x.c) doing like this? > Shall they be purged? They were likely done before I2C and driver core were enhanced to handle wakeup automatically. We might want to clean them up, as long as we verify that they keep working. Thanks. -- Dmitry
Re: [PATCH v2] edac: sifive: Add EDAC platform driver for SiFive SoCs
On Mon, May 6, 2019 at 4:57 PM Yash Shah wrote: > > The initial ver of EDAC driver supports: > - ECC event monitoring and reporting through the EDAC framework for SiFive > L2 cache controller. > > The EDAC driver registers for notifier events from the L2 cache controller > driver (arch/riscv/mm/sifive_l2_cache.c) for L2 ECC events > > Signed-off-by: Yash Shah > Reviewed-by: James Morse > --- > This patch depends on patch > 'RISC-V: sifive_l2_cache: Add L2 cache controller driver for SiFive SoCs' > https://lkml.org/lkml/2019/5/6/255 The prerequisite patch (sifive_l2_cache driver) has been merged into mainline v5.2-rc1. It should be OK to merge this edac driver now. - Yash
Re: [PATCH 0/5] firmware: Add support for loading compressed files
On Mon, 20 May 2019 11:56:07 +0200, Takashi Iwai wrote: > > On Mon, 20 May 2019 11:39:29 +0200, > Greg Kroah-Hartman wrote: > > > > On Mon, May 20, 2019 at 11:26:42AM +0200, Takashi Iwai wrote: > > > Hi, > > > > > > this is a patch set to add the support for loading compressed firmware > > > files. > > > > > > The primary motivation is to reduce the storage size; e.g. currently > > > the amount of /lib/firmware on my machine counts up to 419MB, and this > > > can be reduced to 130MB file compression. No bad deal. > > > > > > The feature adds only fallback to the compressed file, so it should > > > work as it was as long as the normal firmware file is present. The > > > f/w loader decompresses the content, so that there is no change needed > > > in the caller side. > > > > > > Currently only XZ format is supported. A caveat is that the kernel XZ > > > helper code supports only CRC32 (or none) integrity check type, so > > > you'll have to compress the files via xz -C crc32 option. > > > > > > The patch set begins with a few other improvements and refactoring, > > > followed by the compression support. > > > > > > In addition to this, dracut needs a small fix to deal with the *.xz > > > files. > > > > > > Also, the latest patchset is found in topic/fw-decompress branch of my > > > sound.git tree: > > > git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git > > > > After a quick review, these all look good to me, nice job. > > > > One recommendation, can we add support for testing this to the > > tools/testing/selftests/firmware/ tests? And you did run those > > regression tests to verify that you didn't get any of the config options > > messed up, right? :) > > Oh, do you believe I'm a so modern person who lets computer working on > everything? ;) I only tested manually, so far, this will be my > homework today. After fixing the regression in kselftest, I could verify and confirm that no regression was introduced by my patchset. 
Also, below is the patch to add tests for the compressed firmware load. I'll add to the series at the next respin, if needed. thanks, Takashi -- 8< -- From: Takashi Iwai Subject: [PATCH] selftests: firmware: Add compressed firmware tests This patch adds the test cases for checking compressed firmware load. Two more cases are added to fw_filesystem.sh: - Both a plain file and an xz file are present, and load the former - Only an xz file is present, and load without '.xz' suffix The tests are enabled only when CONFIG_FW_LOADER_COMPRESS is enabled and xz program is installed. Signed-off-by: Takashi Iwai --- tools/testing/selftests/firmware/fw_filesystem.sh | 73 +++ tools/testing/selftests/firmware/fw_lib.sh| 7 +++ tools/testing/selftests/firmware/fw_run_tests.sh | 1 + 3 files changed, 71 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/firmware/fw_filesystem.sh b/tools/testing/selftests/firmware/fw_filesystem.sh index a4320c4b44dc..f901076aa2ea 100755 --- a/tools/testing/selftests/firmware/fw_filesystem.sh +++ b/tools/testing/selftests/firmware/fw_filesystem.sh @@ -153,13 +153,18 @@ config_set_read_fw_idx() read_firmwares() { + if [ "$1" = "xzonly" ]; then + fwfile="${FW}-orig" + else + fwfile="$FW" + fi for i in $(seq 0 3); do config_set_read_fw_idx $i # Verify the contents are what we expect. # -Z required for now -- check for yourself, md5sum # on $FW and DIR/read_firmware will yield the same. Even # cmp agrees, so something is off. - if ! diff -q -Z "$FW" $DIR/read_firmware 2>/dev/null ; then + if ! 
diff -q -Z "$fwfile" $DIR/read_firmware 2>/dev/null ; then echo "request #$i: firmware was not loaded" >&2 exit 1 fi @@ -246,17 +251,17 @@ test_request_firmware_nowait_custom_nofile() test_batched_request_firmware() { - echo -n "Batched request_firmware() try #$1: " + echo -n "Batched request_firmware() $2 try #$1: " config_reset config_trigger_sync - read_firmwares + read_firmwares $2 release_all_firmware echo "OK" } test_batched_request_firmware_direct() { - echo -n "Batched request_firmware_direct() try #$1: " + echo -n "Batched request_firmware_direct() $2 try #$1: " config_reset config_set_sync_direct config_trigger_sync @@ -266,7 +271,7 @@ test_batched_request_firmware_direct() test_request_firmware_nowait_uevent() { - echo -n "Batched request_firmware_nowait(uevent=true) try #$1: " + echo -n "Batched request_firmware_nowait(uevent=true) $2 try #$1: " config_reset config_trigger_async release_all_firmware @@ -275,11 +280,16 @@ test_request_firmware_nowait_uevent() test_request_firmware_nowait_custom() { - echo -n "Ba
Re: [RESEND] input: keyboard: imx: make sure keyboard can always wake up system
Hi Anson, On Thu, Apr 04, 2019 at 01:40:16AM +, Anson Huang wrote: > There are several scenarios that keyboard can NOT wake up system > from suspend, e.g., if a keyboard is depressed between system > device suspend phase and device noirq suspend phase, the keyboard > ISR will be called and both keyboard depress and release interrupts > will be disabled, then keyboard will no longer be able to wake up > system. Another scenario would be, if a keyboard is kept depressed, > and then system goes into suspend, the expected behavior would be > when keyboard is released, system will be waked up, but current > implementation can NOT achieve that, because both depress and release > interrupts are disabled in ISR, and the event check is still in > progress. > > To fix these issues, need to make sure keyboard's depress or release > interrupt is enabled after noirq device suspend phase, this patch > moves the suspend/resume callback to noirq suspend/resume phase, and > enable the corresponding interrupt according to current keyboard status. I believe it is possible for an IRQ to be disabled and still be enabled as a wakeup source. What happens if you call disable_irq() before disabling the clock? Thanks. -- Dmitry
Re: [PATCH 1/2] selftests: Remove forced unbuffering for test running
On Tue, 21 May 2019 00:37:48 +0200, Kees Cook wrote: > > As it turns out, the "stdbuf" command will actually force all > subprocesses into unbuffered output, and some implementations of "echo" > turn into single-character writes, which utterly wrecks writes to /sys > and /proc files. > > Instead, drop the "stdbuf" usage, and for any tests that want explicit > flushing between newlines, they'll have to add "fflush(stdout);" as > needed. > > Reported-by: Takashi Iwai > Fixes: 5c069b6dedef ("selftests: Move test output to diagnostic lines") > Signed-off-by: Kees Cook Tested-by: Takashi Iwai BTW, this might be specific to shell invocation. As in the original discussion thread, it starts working when I replace "echo" with "/usr/bin/echo". Still it's not easy to control in a script itself, so dropping the unbuffered mode is certainly safer, yes. Thanks! Takashi > --- > tools/testing/selftests/kselftest/runner.sh | 12 +--- > 1 file changed, 1 insertion(+), 11 deletions(-) > > diff --git a/tools/testing/selftests/kselftest/runner.sh > b/tools/testing/selftests/kselftest/runner.sh > index eff3ee303d0d..00c9020bdda8 100644 > --- a/tools/testing/selftests/kselftest/runner.sh > +++ b/tools/testing/selftests/kselftest/runner.sh > @@ -24,16 +24,6 @@ tap_prefix() > fi > } > > -# If stdbuf is unavailable, we must fall back to line-at-a-time piping. > -tap_unbuffer() > -{ > - if ! which stdbuf >/dev/null ; then > - "$@" > - else > - stdbuf -i0 -o0 -e0 "$@" > - fi > -} > - > run_one() > { > DIR="$1" > @@ -54,7 +44,7 @@ run_one() > echo "not ok $test_num $TEST_HDR_MSG" > else > cd `dirname $TEST` > /dev/null > - (tap_unbuffer ./$BASENAME_TEST 2>&1; echo $? >&3) | > + (./$BASENAME_TEST 2>&1; echo $? >&3) | > tap_prefix >&4) 3>&1) | > (read xs; exit $xs)) 4>>"$logfile" && > echo "ok $test_num $TEST_HDR_MSG") || > -- > 2.17.1 >
Re: [PATCH v2 1/9] media: ov6650: Fix MODDULE_DESCRIPTION
On Tue, May 21, 2019 at 12:49:59AM +0200, Janusz Krzysztofik wrote: > Commit 23a52386fabe ("media: ov6650: convert to standalone v4l2 > subdevice") converted the driver from a soc_camera sensor to a > standalone V4L subdevice driver. Unfortunately, module description was > not updated to reflect the change. Fix it. > > While being at it, update email address of the module author. > > Fixes: 23a52386fabe ("media: ov6650: convert to standalone v4l2 subdevice") > Signed-off-by: Janusz Krzysztofik > cc: sta...@vger.kernel.org > --- > drivers/media/i2c/ov6650.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/drivers/media/i2c/ov6650.c b/drivers/media/i2c/ov6650.c > index 1b972e591b48..a3d00afcb0c8 100644 > --- a/drivers/media/i2c/ov6650.c > +++ b/drivers/media/i2c/ov6650.c > @@ -1045,6 +1045,6 @@ static struct i2c_driver ov6650_i2c_driver = { > > module_i2c_driver(ov6650_i2c_driver); > > -MODULE_DESCRIPTION("SoC Camera driver for OmniVision OV6650"); > -MODULE_AUTHOR("Janusz Krzysztofik "); > +MODULE_DESCRIPTION("V4L2 subdevice driver for OmniVision OV6650 camera > sensor"); > +MODULE_AUTHOR("Janusz Krzysztofik MODULE_LICENSE("GPL v2"); > -- > 2.21.0 > is this _really_ a patch that meets the stable kernel requirements? Same for this whole series... thanks, greg k-h
Re: [PATCH 11/12] powerpc/pseries/svm: Force SWIOTLB for secure guests
> diff --git a/arch/powerpc/include/asm/mem_encrypt.h > b/arch/powerpc/include/asm/mem_encrypt.h > new file mode 100644 > index ..45d5e4d0e6e0 > --- /dev/null > +++ b/arch/powerpc/include/asm/mem_encrypt.h > @@ -0,0 +1,19 @@ > +/* SPDX-License-Identifier: GPL-2.0+ */ > +/* > + * SVM helper functions > + * > + * Copyright 2019 IBM Corporation > + */ > + > +#ifndef _ASM_POWERPC_MEM_ENCRYPT_H > +#define _ASM_POWERPC_MEM_ENCRYPT_H > + > +#define sme_me_mask 0ULL > + > +static inline bool sme_active(void) { return false; } > +static inline bool sev_active(void) { return false; } > + > +int set_memory_encrypted(unsigned long addr, int numpages); > +int set_memory_decrypted(unsigned long addr, int numpages); > + > +#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */ S/390 seems to be adding a stub header just like this. Can you please clean up the Kconfig and generic headers bits for memory encryption so that we don't need all this boilerplate code? > config PPC_SVM > bool "Secure virtual machine (SVM) support for POWER" > depends on PPC_PSERIES > + select SWIOTLB > + select ARCH_HAS_MEM_ENCRYPT > default n n is the default default, no need to explicitly specify it.
Re: [PATCH] input: imx6ul_tsc: use devm_platform_ioremap_resource() to simplify code
On Mon, Apr 01, 2019 at 05:19:55AM +, Anson Huang wrote: > Use the new helper devm_platform_ioremap_resource() which wraps the > platform_get_resource() and devm_ioremap_resource() together, to > simplify the code. > > Signed-off-by: Anson Huang Applied, thank you. > --- > drivers/input/touchscreen/imx6ul_tsc.c | 8 ++-- > 1 file changed, 2 insertions(+), 6 deletions(-) > > diff --git a/drivers/input/touchscreen/imx6ul_tsc.c > b/drivers/input/touchscreen/imx6ul_tsc.c > index c10fc59..e04eecd 100644 > --- a/drivers/input/touchscreen/imx6ul_tsc.c > +++ b/drivers/input/touchscreen/imx6ul_tsc.c > @@ -364,8 +364,6 @@ static int imx6ul_tsc_probe(struct platform_device *pdev) > struct device_node *np = pdev->dev.of_node; > struct imx6ul_tsc *tsc; > struct input_dev *input_dev; > - struct resource *tsc_mem; > - struct resource *adc_mem; > int err; > int tsc_irq; > int adc_irq; > @@ -403,16 +401,14 @@ static int imx6ul_tsc_probe(struct platform_device > *pdev) > return err; > } > > - tsc_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); > - tsc->tsc_regs = devm_ioremap_resource(&pdev->dev, tsc_mem); > + tsc->tsc_regs = devm_platform_ioremap_resource(pdev, 0); > if (IS_ERR(tsc->tsc_regs)) { > err = PTR_ERR(tsc->tsc_regs); > dev_err(&pdev->dev, "failed to remap tsc memory: %d\n", err); > return err; > } > > - adc_mem = platform_get_resource(pdev, IORESOURCE_MEM, 1); > - tsc->adc_regs = devm_ioremap_resource(&pdev->dev, adc_mem); > + tsc->adc_regs = devm_platform_ioremap_resource(pdev, 1); > if (IS_ERR(tsc->adc_regs)) { > err = PTR_ERR(tsc->adc_regs); > dev_err(&pdev->dev, "failed to remap adc memory: %d\n", err); > -- > 2.7.4 > -- Dmitry
Re: [RFC PATCH 02/12] powerpc: Add support for adding an ESM blob to the zImage wrapper
On Tue, May 21, 2019 at 01:49:02AM -0300, Thiago Jung Bauermann wrote: > From: Benjamin Herrenschmidt > > For secure VMs, the signing tool will create a ticket called the "ESM blob" > for the Enter Secure Mode ultravisor call with the signatures of the kernel > and initrd among other things. > > This adds support to the wrapper script for adding that blob via the "-e" > option to the zImage.pseries. > > It also adds code to the zImage wrapper itself to retrieve and if necessary > relocate the blob, and pass its address to Linux via the device-tree, to be > later consumed by prom_init. Where does the "BLOB" come from? How is it licensed and how can we satisfy the GPL with it?
Re: [RFC 1/1] Add dm verity root hash pkcs7 sig validation.
On 5/21/19 7:54 AM, Jaskaran Khurana wrote: > Adds in-kernel pkcs7 signature checking for the roothash of > the dm-verity hash tree. > > The verification is to support cases where the roothash is not secured by > Trusted Boot, UEFI Secureboot or similar technologies. > One of the use cases for this is for dm-verity volumes mounted after boot, > the root hash provided during the creation of the dm-verity volume has to > be secure and thus in-kernel validation implemented here will be used > before we trust the root hash and allow the block device to be created. > The first patch was your cover letter, I'd suggest name it that way in the subject. > The signature being provided for verification must verify the root hash and > must be trusted by the builtin keyring for verification to succeed. > > Adds DM_VERITY_VERIFY_ROOTHASH_SIG: roothash verification > against the roothash signature file *if* specified, if signature file is > specified verification must succeed prior to creation of device mapper > block device. > > Adds DM_VERITY_VERIFY_ROOTHASH_SIG_FORCE: roothash signature *must* be > specified for all dm verity volumes and verification must succeed prior > to creation of device mapper block device. > > Signed-off-by: Jaskaran Khurana > --- > drivers/md/Kconfig| 23 ++ > drivers/md/Makefile | 2 +- > drivers/md/dm-verity-target.c | 44 -- > drivers/md/dm-verity-verify-sig.c | 129 ++ > drivers/md/dm-verity-verify-sig.h | 32 > 5 files changed, 222 insertions(+), 8 deletions(-) > create mode 100644 drivers/md/dm-verity-verify-sig.c > create mode 100644 drivers/md/dm-verity-verify-sig.h > > diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig > index db269a348b20..da4115753f25 100644 > --- a/drivers/md/Kconfig > +++ b/drivers/md/Kconfig > @@ -489,6 +489,29 @@ config DM_VERITY > > If unsure, say N. 
> > +config DM_VERITY_VERIFY_ROOTHASH_SIG > + def_bool n > + bool "Verity data device root hash signature verification support" > + depends on DM_VERITY > + select SYSTEM_DATA_VERIFICATION > + help > + The device mapper target created by DM-VERITY can be validated if the > + pre-generated tree of cryptographic checksums passed has a pkcs#7 > + signature file that can validate the roothash of the tree. > + > + If unsure, say N. > + > +config DM_VERITY_VERIFY_ROOTHASH_SIG_FORCE > + def_bool n > + bool "Forces all dm verity data device root hash should be signed" > + depends on DM_VERITY_VERIFY_ROOTHASH_SIG > + help > + The device mapper target created by DM-VERITY will succeed only if the > + pre-generated tree of cryptographic checksums passed also has a pkcs#7 > + signature file that can validate the roothash of the tree. > + > + If unsure, say N. > + > config DM_VERITY_FEC > bool "Verity forward error correction support" > depends on DM_VERITY > diff --git a/drivers/md/Makefile b/drivers/md/Makefile > index be7a6eb92abc..8a8c142bcfe1 100644 > --- a/drivers/md/Makefile > +++ b/drivers/md/Makefile > @@ -61,7 +61,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o > obj-$(CONFIG_DM_ZERO)+= dm-zero.o > obj-$(CONFIG_DM_RAID)+= dm-raid.o > obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o > -obj-$(CONFIG_DM_VERITY) += dm-verity.o > +obj-$(CONFIG_DM_VERITY) += dm-verity.o dm-verity-verify-sig.o > obj-$(CONFIG_DM_CACHE) += dm-cache.o > obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o > obj-$(CONFIG_DM_ERA) += dm-era.o > diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c > index f4c31ffaa88e..53aebfa8bc38 100644 > --- a/drivers/md/dm-verity-target.c > +++ b/drivers/md/dm-verity-target.c > @@ -16,7 +16,7 @@ > > #include "dm-verity.h" > #include "dm-verity-fec.h" > - > +#include "dm-verity-verify-sig.h" > #include > #include > > @@ -34,7 +34,11 @@ > #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" > #define DM_VERITY_OPT_AT_MOST_ONCE 
"check_at_most_once" > > -#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) > +#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC + \ > + DM_VERITY_ROOT_HASH_VERIFICATION_OPTS) > + > +#define DM_VERITY_MANDATORY_ARGS10 > + > > static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; > > @@ -855,7 +859,8 @@ static int verity_alloc_zero_digest(struct dm_verity *v) > return r; > } > > -static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) > +static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, > + struct dm_verity_sig_opts *verify_args) > { > int r; > unsigned argc; > @@ -904,6 +909,15 @@ static int verity_parse_opt_args(struct dm_arg_set *as, > struct dm_v
Re: [RFC 0/7] introduce memory hinting API for external process
On Tue, May 21, 2019 at 08:25:55AM +0530, Anshuman Khandual wrote: > > > On 05/20/2019 10:29 PM, Tim Murray wrote: > > On Sun, May 19, 2019 at 11:37 PM Anshuman Khandual > > wrote: > >> > >> Or Is the objective here is reduce the number of processes which get > >> killed by > >> lmkd by triggering swapping for the unused memory (user hinted) sooner so > >> that > >> they dont get picked by lmkd. Under utilization for zram hardware is a > >> concern > >> here as well ? > > > > The objective is to avoid some instances of memory pressure by > > proactively swapping pages that userspace knows to be cold before > > those pages reach the end of the LRUs, which in turn can prevent some > > apps from being killed by lmk/lmkd. As soon as Android userspace knows > > that an application is not being used and is only resident to improve > > performance if the user returns to that app, we can kick off > > process_madvise on that process's pages (or some portion of those > > pages) in a power-efficient way to reduce memory pressure long before > > the system hits the free page watermark. This allows the system more > > time to put pages into zram versus waiting for the watermark to > > trigger kswapd, which decreases the likelihood that later memory > > allocations will cause enough pressure to trigger a kill of one of > > these apps. > > So this opens up bit of LRU management to user space hints. Also because the > app > in itself wont know about the memory situation of the entire system, new > system > call needs to be called from an external process. That's why process_madvise is introduced here. > > > > >> Swapping out memory into zram wont increase the latency for a hot start ? > >> Or > >> is it because as it will prevent a fresh cold start which anyway will be > >> slower > >> than a slow hot start. Just being curious. > > > > First, not all swapped pages will be reloaded immediately once an app > > is resumed. 
We've found that an app's working set post-process_madvise > > is significantly smaller than what an app allocates when it first > > launches (see the delta between pswpin and pswpout in Minchan's > > results). Presumably because of this, faulting to fetch from zram does > > pswpin 4176131392647 975034 233.00 > pswpout127422426617311387507 108.00 > > IIUC the swap-in ratio is way higher in comparison to that of swap out. Is > that > always the case ? Or it tend to swap out from an active area of the working > set > which faulted back again. I think it's because apps are alive longer via reducing being killed so turn into from pgpgin to swapin. > > > not seem to introduce a noticeable hot start penalty, not does it > > cause an increase in performance problems later in the app's > > lifecycle. I've measured with and without process_madvise, and the > > differences are within our noise bounds. Second, because we're not > > That is assuming that post process_madvise() working set for the application > is > always smaller. There is another challenge. The external process should > ideally > have the knowledge of active areas of the working set for an application in > question for it to invoke process_madvise() correctly to prevent such > scenarios. There are several ways to detect workingset more accurately at the cost of runtime. For example, with idle page tracking or clear_refs. Accuracy is always trade-off of overhead for LRU aging. > > > preemptively evicting file pages and only making them more likely to > > be evicted when there's already memory pressure, we avoid the case > > where we process_madvise an app then immediately return to the app and > > reload all file pages in the working set even though there was no > > intervening memory pressure. Our initial version of this work evicted > > That would be the worst case scenario which should be avoided. Memory pressure > must be a parameter before actually doing the swap out. 
But pages if know to > be > inactive/cold can be marked high priority to be swapped out. > > > file pages preemptively and did cause a noticeable slowdown (~15%) for > > that case; this patch set avoids that slowdown. Finally, the benefit > > from avoiding cold starts is huge. The performance improvement from > > having a hot start instead of a cold start ranges from 3x for very > > small apps to 50x+ for larger apps like high-fidelity games. > > Is there any other real world scenario apart from this app based ecosystem > where > user hinted LRU management might be helpful ? Just being curious. Thanks for > the > detailed explanation. I will continue looking into this series.
linux-next: Tree for May 21
Hi all, Changes since 20190520: New trees: soc-fsl, soc-fsl-fixes Removed trees: (not updated for more than a year) alpine, samsung, sh, befs, kconfig, dwmw2-iommu, trivial, target-updates, target-bva, init_task The imx-mxs tree gained a build failure so I used the version from next-20190520. The sunxi tree gained a conflict against the imx-mxs tree. The drm-misc tree gained conflicts against Linus' and the amdgpu trees. Non-merge commits (relative to Linus' tree): 991 998 files changed, 29912 insertions(+), 14691 deletions(-) I have created today's linux-next tree at git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git (patches at http://www.kernel.org/pub/linux/kernel/next/ ). If you are tracking the linux-next tree using git, you should not use "git pull" to do so as that will try to merge the new linux-next release with the old one. You should use "git fetch" and checkout or reset to the new master. You can see which trees have been included by looking in the Next/Trees file in the source. There are also quilt-import.log and merge.log files in the Next directory. Between each merge, the tree was built with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a multi_v7_defconfig for arm and a native build of tools/perf. After the final fixups (if any), I do an x86_64 modules_install followed by builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit), ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc and sparc64 defconfig. And finally, a simple boot test of the powerpc pseries_le_defconfig kernel in qemu (with and without kvm enabled). Below is a summary of the state of the merge. I am currently merging 290 trees (counting Linus' and 70 trees of bug fix patches pending for the current merge release). Stats about the size of the tree over time can be seen at http://neuling.org/linux-next-size.html . Status of my local build tests will be at http://kisskb.ellerman.id.au/linux-next . 
If maintainers want to give advice about cross compilers/configs that work, we are always open to add more builds. Thanks to Randy Dunlap for doing many randconfig builds. And to Paul Gortmaker for triage and bug fixes. -- Cheers, Stephen Rothwell $ git checkout master $ git reset --hard stable Merging origin/master (f49aa1de9836 Merge tag 'for-5.2-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux) Merging fixes/master (2bbacd1a9278 Merge tag 'kconfig-v5.2' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild) Merging kspp-gustavo/for-next/kspp (b324f1b28dc0 afs: yfsclient: Mark expected switch fall-throughs) Merging kbuild-current/fixes (a2d635decbfa Merge tag 'drm-next-2019-05-09' of git://anongit.freedesktop.org/drm/drm) Merging arc-current/for-curr (c5a1726d7383 ARC: entry: EV_Trap expects r10 (vs. r9) to have exception cause) Merging arm-current/fixes (e17b1af96b2a ARM: 8857/1: efi: enable CP15 DMB instructions before cleaning the cache) Merging arm64-fixes/for-next/fixes (7a0a93c51799 arm64: vdso: Explicitly add build-id option) Merging m68k-current/for-linus (fdd20ec8786a Documentation/features/time: Mark m68k having modern-timekeeping) Merging powerpc-fixes/fixes (672eaf37db9f powerpc/cacheinfo: Remove double free) Merging sparc/master (f49aa1de9836 Merge tag 'for-5.2-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux) Merging fscrypt-current/for-stable (ae64f9bd1d36 Linux 4.15-rc2) Merging net/master (fa2c52be7129 vlan: Mark expected switch fall-through) Merging bpf/master (6a0a923dfa14 of_net: fix of_get_mac_address retval if compiled without CONFIG_OF) Merging ipsec/master (9b3040a6aafd ipv4: Define __ipv4_neigh_lookup_noref when CONFIG_INET is disabled) Merging netfilter/master (2c82c7e724ff netfilter: nf_tables: fix oops during rule dump) Merging ipvs/master (b2e3d68d1251 netfilter: nft_compat: destroy function must not have side effects) Merging wireless-drivers/master (7a0f8ad5ff63 Merge 
ath-current from git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git) Merging mac80211/master (933b40530b4b mac80211: remove set but not used variable 'old') Merging rdma-fixes/for-rc (2557fabd6e29 RDMA/hns: Bugfix for mapping user db) Merging sound-current/for-linus (c7b55fabfa44 ALSA: hdac: fix memory release for SST and SOF drivers) Merging sound-asoc-fixes/for-linus (08b9e0213aeb Merge branch 'asoc-5.1' into asoc-linus) Merging regmap-fixes/for-linus (1d6106cafb37 Merge branch 'regmap-5.1' into regmap-linus) Merging regulator-fixes/for-linus (0d183fc1760f Merge branch 'regulator-5.1' into regulator-linus) Merging spi-fixes/for-linus (72e3b3285a43 Merge branch 'spi-5.1' into spi-linus) Merging pci-current/for-linus (a188339ca5a3 Linux 5.2-rc1) Me
Re: Re: [PATCH v3 11/14] dmaengine: imx-sdma: fix ecspi1 rx dma not work on i.mx8mm
On 21-05-19, 04:58, Robin Gong wrote: > > -Original Message- > > From: Vinod Koul > > Sent: 2019年5月21日 12:18 > > > > On 07-05-19, 09:16, Robin Gong wrote: > > > Because the number of ecspi1 rx event on i.mx8mm is 0, the condition > > > check ignore such special case without dma channel enabled, which > > > caused > > > ecspi1 rx works failed. Actually, no need to check event_id0, checking > > > event_id1 is enough for DEV_2_DEV case because it's so lucky that > > > event_id1 never be 0. > > > > Well is that by chance or design that event_id1 will be never 0? > > > That's by chance. DEV_2_DEV is just for Audio case and non-zero for event_id1 > on current i.MX family. Then it won't be good to rely on chance :) -- ~Vinod
Re: [PATCH] input: keyboard: imx: use devm_platform_ioremap_resource() to simplify code
On Mon, Apr 01, 2019 at 05:28:12AM +, Anson Huang wrote: > Use the new helper devm_platform_ioremap_resource() which wraps the > platform_get_resource() and devm_ioremap_resource() together, to > simplify the code. > > Signed-off-by: Anson Huang Applied, thank you. > --- > drivers/input/keyboard/imx_keypad.c | 4 +--- > 1 file changed, 1 insertion(+), 3 deletions(-) > > diff --git a/drivers/input/keyboard/imx_keypad.c > b/drivers/input/keyboard/imx_keypad.c > index 539cb67..cf08f4a 100644 > --- a/drivers/input/keyboard/imx_keypad.c > +++ b/drivers/input/keyboard/imx_keypad.c > @@ -422,7 +422,6 @@ static int imx_keypad_probe(struct platform_device *pdev) > dev_get_platdata(&pdev->dev); > struct imx_keypad *keypad; > struct input_dev *input_dev; > - struct resource *res; > int irq, error, i, row, col; > > if (!keymap_data && !pdev->dev.of_node) { > @@ -455,8 +454,7 @@ static int imx_keypad_probe(struct platform_device *pdev) > timer_setup(&keypad->check_matrix_timer, > imx_keypad_check_for_events, 0); > > - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); > - keypad->mmio_base = devm_ioremap_resource(&pdev->dev, res); > + keypad->mmio_base = devm_platform_ioremap_resource(pdev, 0); > if (IS_ERR(keypad->mmio_base)) > return PTR_ERR(keypad->mmio_base); > > -- > 2.7.4 > -- Dmitry
Re: [PATCH 1/2] Input: elantech - enable middle button support on 2 ThinkPads
Hi Aaron, On Sun, May 19, 2019 at 03:27:10PM +0800, Aaron Ma wrote: > Adding 2 new touchpad PNPIDs to enable middle button support. Could you add their names in the comments please? > > Cc: sta...@vger.kernel.org > Signed-off-by: Aaron Ma > --- > drivers/input/mouse/elantech.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c > index a7f8b1614559..530142b5a115 100644 > --- a/drivers/input/mouse/elantech.c > +++ b/drivers/input/mouse/elantech.c > @@ -1189,6 +1189,8 @@ static const char * const middle_button_pnp_ids[] = { > "LEN2132", /* ThinkPad P52 */ > "LEN2133", /* ThinkPad P72 w/ NFC */ > "LEN2134", /* ThinkPad P72 */ > + "LEN0407", > + "LEN0408", These should come first - I'd like to keep the list sorted alphabetically. > NULL > }; > > -- > 2.17.1 > Thanks. -- Dmitry
[PATCH v3] kernel: fix typos and some coding style in comments
fix lenght to length Signed-off-by: Weitao Hou --- Changes in v3: - fix all other same typos with git grep --- .../devicetree/bindings/usb/s3c2410-usb.txt| 2 +- .../wireless/mediatek/mt76/mt76x02_usb_core.c | 2 +- kernel/sysctl.c| 18 +- sound/soc/qcom/qdsp6/q6asm.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/usb/s3c2410-usb.txt b/Documentation/devicetree/bindings/usb/s3c2410-usb.txt index e45b38ce2986..26c85afd0b53 100644 --- a/Documentation/devicetree/bindings/usb/s3c2410-usb.txt +++ b/Documentation/devicetree/bindings/usb/s3c2410-usb.txt @@ -4,7 +4,7 @@ OHCI Required properties: - compatible: should be "samsung,s3c2410-ohci" for USB host controller - - reg: address and lenght of the controller memory mapped region + - reg: address and length of the controller memory mapped region - interrupts: interrupt number for the USB OHCI controller - clocks: Should reference the bus and host clocks - clock-names: Should contain two strings diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c index 6b89f7eab26c..e0f5e6202a27 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c @@ -53,7 +53,7 @@ int mt76x02u_skb_dma_info(struct sk_buff *skb, int port, u32 flags) pad = round_up(skb->len, 4) + 4 - skb->len; /* First packet of a A-MSDU burst keeps track of the whole burst -* length, need to update lenght of it and the last packet. +* length, need to update length of it and the last packet. 
*/ skb_walk_frags(skb, iter) { last = iter; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 943c89178e3d..f78f725f225e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -187,17 +187,17 @@ extern int no_unaligned_warning; * enum sysctl_writes_mode - supported sysctl write modes * * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value - * to be written, and multiple writes on the same sysctl file descriptor - * will rewrite the sysctl value, regardless of file position. No warning - * is issued when the initial position is not 0. + * to be written, and multiple writes on the same sysctl file descriptor + * will rewrite the sysctl value, regardless of file position. No warning + * is issued when the initial position is not 0. * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is - * not 0. + * not 0. * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at - * file position 0 and the value must be fully contained in the buffer - * sent to the write syscall. If dealing with strings respect the file - * position, but restrict this to the max length of the buffer, anything - * passed the max lenght will be ignored. Multiple writes will append - * to the buffer. + * file position 0 and the value must be fully contained in the buffer + * sent to the write syscall. If dealing with strings respect the file + * position, but restrict this to the max length of the buffer, anything + * passed the max length will be ignored. Multiple writes will append + * to the buffer. * * These write modes control how current file position affects the behavior of * updating sysctl values through the proc interface on each write. 
diff --git a/sound/soc/qcom/qdsp6/q6asm.c b/sound/soc/qcom/qdsp6/q6asm.c index 4f85cb19a309..e8141a33a55e 100644 --- a/sound/soc/qcom/qdsp6/q6asm.c +++ b/sound/soc/qcom/qdsp6/q6asm.c @@ -1194,7 +1194,7 @@ EXPORT_SYMBOL_GPL(q6asm_open_read); * q6asm_write_async() - non blocking write * * @ac: audio client pointer - * @len: lenght in bytes + * @len: length in bytes * @msw_ts: timestamp msw * @lsw_ts: timestamp lsw * @wflags: flags associated with write -- 2.18.0
Re: [PATCH 2/2] Input: synaptics - remove X240 from the topbuttonpad list
Hi Aaron, On Sun, May 19, 2019 at 03:27:11PM +0800, Aaron Ma wrote: > Lenovo ThinkPad X240 does not have the top software button. > When this wrong ID in top button list, smbus mode will fail to probe, > so keep it working at PS2 mode. > > Cc: sta...@vger.kernel.org > Signed-off-by: Aaron Ma > --- > drivers/input/mouse/synaptics.c | 1 - > 1 file changed, 1 deletion(-) > > diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c > index b6da0c1267e3..6ae7bc92476b 100644 > --- a/drivers/input/mouse/synaptics.c > +++ b/drivers/input/mouse/synaptics.c > @@ -140,7 +140,6 @@ static const char * const topbuttonpad_pnp_ids[] = { > "LEN002E", > "LEN0033", /* Helix */ > "LEN0034", /* T431s, L440, L540, T540, W540, X1 Carbon 2nd */ > - "LEN0035", /* X240 */ According to the history this came from Synaptics through Hans, so I'd like to make sure there aren't several X240 versions floating around... > "LEN0036", /* T440 */ > "LEN0037", /* X1 Carbon 2nd */ > "LEN0038", > -- > 2.17.1 > Thanks. -- Dmitry
Re: [PATCH V6 02/15] PCI/PME: Export pcie_pme_disable_msi() & pcie_pme_no_msi() APIs
On 5/20/2019 11:27 PM, Bjorn Helgaas wrote: On Sat, May 18, 2019 at 07:28:29AM +0530, Vidya Sagar wrote: On 5/18/2019 12:25 AM, Bjorn Helgaas wrote: On Fri, May 17, 2019 at 11:23:36PM +0530, Vidya Sagar wrote: On 5/17/2019 6:54 PM, Bjorn Helgaas wrote: Do you have "lspci -vvxxx" output for the root ports handy? If there's some clue in the standard config space that would tell us that MSI works for some events but not others, we could make the PCI core pay attention it. That would be the best solution because it wouldn't require Tegra-specific code. Here is the output of 'lspci vvxxx' for one of Tegra194's root ports. Thanks! This port advertises both MSI and MSI-X, and neither one is enabled. This particular port doesn't have a slot, so hotplug isn't applicable to it. But if I understand correctly, if MSI or MSI-X were enabled and the port had a slot, the port would generate MSI/MSI-X hotplug interrupts. But PME and AER events would still cause INTx interrupts (even with MSI or MSI-X enabled). Do I have that right? I just want to make sure that the reason for PME being INTx is a permanent hardware choice and that it's not related to MSI and MSI-X currently being disabled. Yes. Thats right. Its hardware choice that our hardware engineers made to use INTx for PME instead of MSI irrespective of MSI/MSI-X enabled/disabled in the root port. Here are more spec references that seem applicable: - PCIe r4.0, sec 7.7.1.2 (Message Control Register for MSI) says: MSI Enable – If Set and the MSI-X Enable bit in the MSI-X Message Control register (see Section 7.9.2) is Clear, the Function is permitted to use MSI to request service and is prohibited from using INTx interrupts. - PCIe r4.0, sec 7.7.2.2 (Message Control Register for MSI-X) says: MSI-X Enable – If Set and the MSI Enable bit in the MSI Message Control register (see Section 6.8.1.3) is Clear, the Function is permitted to use MSI-X to request service and is prohibited from using INTx interrupts (if implemented). 
I read that to mean a device is prohibited from using MSI/MSI-X for some interrupts and INTx for others. Since Tegra194 cannot use MSI/MSI-X for PME, it should use INTx for *all* interrupts. That makes the MSI/MSI-X Capabilities superfluous, and they should be omitted. If we set pdev->no_msi for Tegra194, we'll avoid MSI/MSI-X completely, so we'll assume *all* interrupts including hotplug will be INTx. Will that work? Yes. We are fine with having all root port originated interrupts getting generated through INTx instead of MSI/MSI-X.
Re: [RFC 0/7] introduce memory hinting API for external process
On Mon, May 20, 2019 at 06:44:52PM -0700, Matthew Wilcox wrote: > On Mon, May 20, 2019 at 12:52:47PM +0900, Minchan Kim wrote: > > IMHO we should spell it out that this patchset complements MADV_WONTNEED > > and MADV_FREE by adding non-destructive ways to gain some free memory > > space. MADV_COLD is similar to MADV_WONTNEED in a way that it hints the > > kernel that memory region is not currently needed and should be reclaimed > > immediately; MADV_COOL is similar to MADV_FREE in a way that it hints the > > kernel that memory region is not currently needed and should be reclaimed > > when memory pressure rises. > > Do we tear down page tables for these ranges? That seems like a good True for MADV_COLD(reclaiming) but false for MADV_COOL(deactivating) in this implementation. > way of reclaiming potentially a substantial amount of memory. Given that refaults are spread out over time and reclaim occurs in bursts, it does make sense to speed up the reclaiming. However, my concern is anonymous pages, since they need swap cache insertion, which would be wasteful if they end up not being reclaimed.
[PATCH 07/12] powerpc/pseries/svm: Use shared memory for Debug Trace Log (DTL)
From: Anshuman Khandual Secure guests need to share the DTL buffers with the hypervisor. To that end, use a kmem_cache constructor which converts the underlying buddy allocated SLUB cache pages into shared memory. Signed-off-by: Anshuman Khandual Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/include/asm/svm.h | 5 arch/powerpc/platforms/pseries/Makefile | 1 + arch/powerpc/platforms/pseries/setup.c | 5 +++- arch/powerpc/platforms/pseries/svm.c| 40 + 4 files changed, 50 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h index fef3740f46a6..f253116c31fc 100644 --- a/arch/powerpc/include/asm/svm.h +++ b/arch/powerpc/include/asm/svm.h @@ -15,6 +15,9 @@ static inline bool is_secure_guest(void) return mfmsr() & MSR_S; } +void dtl_cache_ctor(void *addr); +#define get_dtl_cache_ctor() (is_secure_guest() ? dtl_cache_ctor : NULL) + #else /* CONFIG_PPC_SVM */ static inline bool is_secure_guest(void) @@ -22,5 +25,7 @@ static inline bool is_secure_guest(void) return false; } +#define get_dtl_cache_ctor() NULL + #endif /* CONFIG_PPC_SVM */ #endif /* _ASM_POWERPC_SVM_H */ diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index a43ec843c8e2..b7b6e6f52bd0 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_PAPR_SCM) += papr_scm.o +obj-$(CONFIG_PPC_SVM) += svm.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index e4f0dfd4ae33..c928e6e8a279 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -71,6 +71,7 @@ #include #include #include +#include #include "pseries.h" #include "../../../../drivers/pci/pci.h" @@ -329,8 +330,10 @@ static inline int 
alloc_dispatch_logs(void) static int alloc_dispatch_log_kmem_cache(void) { + void (*ctor)(void *) = get_dtl_cache_ctor(); + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, - DISPATCH_LOG_BYTES, 0, NULL); + DISPATCH_LOG_BYTES, 0, ctor); if (!dtl_cache) { pr_warn("Failed to create dispatch trace log buffer cache\n"); pr_warn("Stolen time statistics will be unreliable\n"); diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c new file mode 100644 index ..c508196f7c83 --- /dev/null +++ b/arch/powerpc/platforms/pseries/svm.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Secure VM platform + * + * Copyright 2019 IBM Corporation + * Author: Anshuman Khandual + */ + +#include +#include + +/* There's one dispatch log per CPU. */ +#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE) + +static struct page *dtl_page_store[NR_DTL_PAGE]; +static long dtl_nr_pages; + +static bool is_dtl_page_shared(struct page *page) +{ + long i; + + for (i = 0; i < dtl_nr_pages; i++) + if (dtl_page_store[i] == page) + return true; + + return false; +} + +void dtl_cache_ctor(void *addr) +{ + unsigned long pfn = PHYS_PFN(__pa(addr)); + struct page *page = pfn_to_page(pfn); + + if (!is_dtl_page_shared(page)) { + dtl_page_store[dtl_nr_pages] = page; + dtl_nr_pages++; + WARN_ON(dtl_nr_pages >= NR_DTL_PAGE); + uv_share_page(pfn, 1); + } +}
RE: Re: [PATCH v3 11/14] dmaengine: imx-sdma: fix ecspi1 rx dma not work on i.mx8mm
> -Original Message- > From: Vinod Koul > Sent: 2019年5月21日 12:18 > > On 07-05-19, 09:16, Robin Gong wrote: > > Because the number of ecspi1 rx event on i.mx8mm is 0, the condition > > check ignore such special case without dma channel enabled, which > > caused > > ecspi1 rx works failed. Actually, no need to check event_id0, checking > > event_id1 is enough for DEV_2_DEV case because it's so lucky that > > event_id1 never be 0. > > Well is that by chance or design that event_id1 will be never 0? > That's by chance. DEV_2_DEV is just for Audio case and non-zero for event_id1 on current i.MX family.
[PATCH 01/12] powerpc/pseries: Introduce option to build secure virtual machines
Introduce CONFIG_PPC_SVM to control support for secure guests and include Ultravisor-related helpers when it is selected Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/include/asm/ultravisor.h | 2 +- arch/powerpc/kernel/Makefile | 4 +++- arch/powerpc/platforms/pseries/Kconfig | 12 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h index 4ffec7a36acd..09e0a615d96f 100644 --- a/arch/powerpc/include/asm/ultravisor.h +++ b/arch/powerpc/include/asm/ultravisor.h @@ -28,7 +28,7 @@ extern int early_init_dt_scan_ultravisor(unsigned long node, const char *uname, * This call supports up to 6 arguments and 4 return arguments. Use * UCALL_BUFSIZE to size the return argument buffer. */ -#if defined(CONFIG_PPC_UV) +#if defined(CONFIG_PPC_UV) || defined(CONFIG_PPC_SVM) long ucall(unsigned long opcode, unsigned long *retbuf, ...); #else static long ucall(unsigned long opcode, unsigned long *retbuf, ...) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 43ff4546e469..1e9b721634c8 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -154,7 +154,9 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST)+= kvm.o kvm_emul.o -obj-$(CONFIG_PPC_UV) += ultravisor.o ucall.o +ifneq ($(CONFIG_PPC_UV)$(CONFIG_PPC_SVM),) +obj-y += ultravisor.o ucall.o +endif # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 9c6b3d860518..82c16aa4f1ce 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -144,3 +144,15 @@ config PAPR_SCM tristate "Support for the PAPR Storage Class Memory interface" help Enable access to hypervisor provided storage class memory. 
+ +config PPC_SVM + bool "Secure virtual machine (SVM) support for POWER" + depends on PPC_PSERIES + default n + help +Support secure guests on POWER. There are certain POWER platforms which +support secure guests using the Protected Execution Facility, with the +help of an Ultravisor executing below the hypervisor layer. This +enables the support for those guests. + +If unsure, say "N".
Re: [PATCH] dma: dw-axi-dmac: fix null dereference when pointer first is null
On 08-05-19, 23:33, Colin King wrote: > From: Colin Ian King > > In the unlikely event that axi_desc_get returns a null desc in the > very first iteration of the while-loop the error exit path ends > up calling axi_desc_put on a null pointer 'first' and this causes > a null pointer dereference. Fix this by adding a null check on > pointer 'first' before calling axi_desc_put. Applied, thanks -- ~Vinod
[RFC PATCH 02/12] powerpc: Add support for adding an ESM blob to the zImage wrapper
From: Benjamin Herrenschmidt For secure VMs, the signing tool will create a ticket called the "ESM blob" for the Enter Secure Mode ultravisor call with the signatures of the kernel and initrd among other things. This adds support to the wrapper script for adding that blob via the "-e" option to the zImage.pseries. It also adds code to the zImage wrapper itself to retrieve and if necessary relocate the blob, and pass its address to Linux via the device-tree, to be later consumed by prom_init. Signed-off-by: Benjamin Herrenschmidt [ Minor adjustments to some comments. ] Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/boot/main.c | 41 ++ arch/powerpc/boot/ops.h| 2 ++ arch/powerpc/boot/wrapper | 24 +--- arch/powerpc/boot/zImage.lds.S | 8 +++ 4 files changed, 72 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c index 78aaf4ffd7ab..ca612efd3e81 100644 --- a/arch/powerpc/boot/main.c +++ b/arch/powerpc/boot/main.c @@ -150,6 +150,46 @@ static struct addr_range prep_initrd(struct addr_range vmlinux, void *chosen, return (struct addr_range){(void *)initrd_addr, initrd_size}; } +#ifdef __powerpc64__ +static void prep_esm_blob(struct addr_range vmlinux, void *chosen) +{ + unsigned long esm_blob_addr, esm_blob_size; + + /* Do we have an ESM (Enter Secure Mode) blob? */ + if (_esm_blob_end <= _esm_blob_start) + return; + + printf("Attached ESM blob at 0x%p-0x%p\n\r", + _esm_blob_start, _esm_blob_end); + esm_blob_addr = (unsigned long)_esm_blob_start; + esm_blob_size = _esm_blob_end - _esm_blob_start; + + /* +* If the ESM blob is too low it will be clobbered when the +* kernel relocates to its final location. In this case, +* allocate a safer place and move it. 
+*/ + if (esm_blob_addr < vmlinux.size) { + void *old_addr = (void *)esm_blob_addr; + + printf("Allocating 0x%lx bytes for esm_blob ...\n\r", + esm_blob_size); + esm_blob_addr = (unsigned long)malloc(esm_blob_size); + if (!esm_blob_addr) + fatal("Can't allocate memory for ESM blob !\n\r"); + printf("Relocating ESM blob 0x%lx <- 0x%p (0x%lx bytes)\n\r", + esm_blob_addr, old_addr, esm_blob_size); + memmove((void *)esm_blob_addr, old_addr, esm_blob_size); + } + + /* Tell the kernel ESM blob address via device tree. */ + setprop_val(chosen, "linux,esm-blob-start", (u32)(esm_blob_addr)); + setprop_val(chosen, "linux,esm-blob-end", (u32)(esm_blob_addr + esm_blob_size)); +} +#else +static inline void prep_esm_blob(struct addr_range vmlinux, void *chosen) { } +#endif + /* A buffer that may be edited by tools operating on a zImage binary so as to * edit the command line passed to vmlinux (by setting /chosen/bootargs). * The buffer is put in it's own section so that tools may locate it easier. @@ -218,6 +258,7 @@ void start(void) vmlinux = prep_kernel(); initrd = prep_initrd(vmlinux, chosen, loader_info.initrd_addr, loader_info.initrd_size); + prep_esm_blob(vmlinux, chosen); prep_cmdline(chosen); printf("Finalizing device tree..."); diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h index cd043726ed88..e0606766480f 100644 --- a/arch/powerpc/boot/ops.h +++ b/arch/powerpc/boot/ops.h @@ -251,6 +251,8 @@ extern char _initrd_start[]; extern char _initrd_end[]; extern char _dtb_start[]; extern char _dtb_end[]; +extern char _esm_blob_start[]; +extern char _esm_blob_end[]; static inline __attribute__((const)) int __ilog2_u32(u32 n) diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index f9141eaec6ff..36b2ad6cd5b7 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -14,6 +14,7 @@ # -i initrdspecify initrd file # -d devtree specify device-tree blob # -s tree.dts specify device-tree source file (needs dtc installed) +# -e esm_blob 
specify ESM blob for secure images # -c cache $kernel.strip.gz (use if present & newer, else make) # -C prefixspecify command prefix for cross-building tools # (strip, objcopy, ld) @@ -38,6 +39,7 @@ platform=of initrd= dtb= dts= +esm_blob= cacheit= binary= compression=.gz @@ -60,9 +62,9 @@ tmpdir=. usage() { echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2 -echo ' [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2 -echo ' [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2 -echo ' [--no-compression] [vmlinux]' >&2 +echo ' [-d devtree] [-s tree.dts] [-e esm_blob]' >&2 +echo ' [-c] [-C cross-prefix] [-D datadir] [-W workingdir]' >&2 +echo ' [-Z (gz|xz|none)] [--no-compression]
[PATCH 09/12] powerpc/pseries/svm: Disable doorbells in SVM guests
From: Sukadev Bhattiprolu Normally, the HV emulates some instructions like MSGSNDP, MSGCLRP from a KVM guest. To emulate the instructions, it must first read the instruction from the guest's memory and decode its parameters. However for a secure guest (aka SVM), the page containing the instruction is in secure memory and the HV cannot access directly. It would need the Ultravisor (UV) to facilitate accessing the instruction and parameters but the UV currently does not have the support for such accesses. Until the UV has such support, disable doorbells in SVMs. This might incur a performance hit but that is yet to be quantified. With this patch applied (needed only in SVMs not needed for HV) we are able to launch SVM guests with multi-core support. Eg: qemu -smp sockets=2,cores=2,threads=2. Fix suggested by Benjamin Herrenschmidt. Thanks to input from Paul Mackerras, Ram Pai and Michael Anderson. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/platforms/pseries/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 3df46123cce3..95a5c24a1544 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "pseries.h" #include "offline_states.h" @@ -225,7 +226,7 @@ static __init void pSeries_smp_probe_xics(void) { xics_smp_probe(); - if (cpu_has_feature(CPU_FTR_DBELL)) + if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest()) smp_ops->cause_ipi = smp_pseries_cause_ipi; else smp_ops->cause_ipi = icp_ops->cause_ipi;
Re: [PATCH v1] dmaengine: tegra-apb: Handle DMA_PREP_INTERRUPT flag properly
On 08-05-19, 10:24, Jon Hunter wrote: > > On 05/05/2019 19:12, Dmitry Osipenko wrote: > > The DMA_PREP_INTERRUPT flag means that descriptor's callback should be > > invoked upon transfer completion and that's it. For some reason driver > > completely disables the hardware interrupt handling, leaving channel in > > unusable state if transfer is issued with the flag being unset. Note > > that there are no occurrences in the relevant drivers that do not set > > the flag, hence this patch doesn't fix any actual bug and merely fixes > > potential problem. > > > > Signed-off-by: Dmitry Osipenko > > From having a look at this, I am guessing that we have never really > tested the case where DMA_PREP_INTERRUPT flag is not set because as you > mentioned it does not look like this will work at all! That is a fair argument > > Is there a use-case you are looking at where you don't set the > DMA_PREP_INTERRUPT flag? > > If not I am wondering if we should even bother supporting this and warn > if it is not set. AFAICT it does not appear to be mandatory, but maybe > Vinod can comment more on this. This is supposed to be used in the cases where you submit a bunch of descriptors and selectively don't want an interrupt in a few cases... Is this such a case? Thanks ~Vinod
[PATCH 11/12] powerpc/pseries/svm: Force SWIOTLB for secure guests
From: Anshuman Khandual SWIOTLB checks range of incoming CPU addresses to be bounced and sees if the device can access it through its DMA window without requiring bouncing. In such cases it just chooses to skip bouncing. But for cases like secure guests on powerpc platform all addresses need to be bounced into the shared pool of memory because the host cannot access it otherwise. Hence the need to do the bouncing is not related to device's DMA window and use of bounce buffers is forced by setting swiotlb_force. Also, connect the shared memory conversion functions into the ARCH_HAS_MEM_ENCRYPT hooks and call swiotlb_update_mem_attributes() to convert SWIOTLB's memory pool to shared memory. Signed-off-by: Anshuman Khandual [ Use ARCH_HAS_MEM_ENCRYPT hooks to share swiotlb memory pool. ] Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/include/asm/mem_encrypt.h | 19 +++ arch/powerpc/platforms/pseries/Kconfig | 5 +++ arch/powerpc/platforms/pseries/svm.c | 45 ++ 3 files changed, 69 insertions(+) diff --git a/arch/powerpc/include/asm/mem_encrypt.h b/arch/powerpc/include/asm/mem_encrypt.h new file mode 100644 index ..45d5e4d0e6e0 --- /dev/null +++ b/arch/powerpc/include/asm/mem_encrypt.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * SVM helper functions + * + * Copyright 2019 IBM Corporation + */ + +#ifndef _ASM_POWERPC_MEM_ENCRYPT_H +#define _ASM_POWERPC_MEM_ENCRYPT_H + +#define sme_me_mask0ULL + +static inline bool sme_active(void) { return false; } +static inline bool sev_active(void) { return false; } + +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */ diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 82c16aa4f1ce..41b10f3bc729 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -145,9 +145,14 @@ config PAPR_SCM help Enable access to 
hypervisor provided storage class memory. +config ARCH_HAS_MEM_ENCRYPT + def_bool n + config PPC_SVM bool "Secure virtual machine (SVM) support for POWER" depends on PPC_PSERIES + select SWIOTLB + select ARCH_HAS_MEM_ENCRYPT default n help Support secure guests on POWER. There are certain POWER platforms which diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index c508196f7c83..618622d636d5 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -7,8 +7,53 @@ */ #include +#include +#include +#include #include +static int __init init_svm(void) +{ + if (!is_secure_guest()) + return 0; + + /* Don't release the SWIOTLB buffer. */ + ppc_swiotlb_enable = 1; + + /* +* Since the guest memory is inaccessible to the host, devices always +* need to use the SWIOTLB buffer for DMA even if dma_capable() says +* otherwise. +*/ + swiotlb_force = SWIOTLB_FORCE; + + /* Share the SWIOTLB buffer with the host. */ + swiotlb_update_mem_attributes(); + + return 0; +} +machine_early_initcall(pseries, init_svm); + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_unshare_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_share_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + /* There's one dispatch log per CPU. */ #define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)
[PATCH 07/14] fs: teach the mm about range locking
Conversion is straightforward, mmap_sem is used within the the same function context most of the time. No change in semantics. Signed-off-by: Davidlohr Bueso --- fs/aio.c | 5 +++-- fs/coredump.c | 5 +++-- fs/exec.c | 19 +--- fs/io_uring.c | 5 +++-- fs/proc/base.c| 23 fs/proc/internal.h| 2 ++ fs/proc/task_mmu.c| 32 +++ fs/proc/task_nommu.c | 22 +++ fs/userfaultfd.c | 50 ++- include/linux/userfaultfd_k.h | 5 +++-- 10 files changed, 100 insertions(+), 68 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 3490d1fa0e16..215d19dbbefa 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -461,6 +461,7 @@ static const struct address_space_operations aio_ctx_aops = { static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) { + DEFINE_RANGE_LOCK_FULL(mmrange); struct aio_ring *ring; struct mm_struct *mm = current->mm; unsigned long size, unused; @@ -521,7 +522,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); - if (down_write_killable(&mm->mmap_sem)) { + if (mm_write_lock_killable(mm, &mmrange)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -EINTR; @@ -530,7 +531,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 0, &unused, NULL); - up_write(&mm->mmap_sem); + mm_write_unlock(mm, &mmrange); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; aio_free_ring(ctx); diff --git a/fs/coredump.c b/fs/coredump.c index e42e17e55bfd..433713b63187 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -409,6 +409,7 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, static int coredump_wait(int exit_code, struct core_state *core_state) { + DEFINE_RANGE_LOCK_FULL(mmrange); struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; int core_waiters = -EBUSY; @@ -417,12 +418,12 @@ static int 
coredump_wait(int exit_code, struct core_state *core_state) core_state->dumper.task = tsk; core_state->dumper.next = NULL; - if (down_write_killable(&mm->mmap_sem)) + if (mm_write_lock_killable(mm, &mmrange)) return -EINTR; if (!mm->core_state) core_waiters = zap_threads(tsk, mm, core_state, exit_code); - up_write(&mm->mmap_sem); + mm_write_unlock(mm, &mmrange); if (core_waiters > 0) { struct core_thread *ptr; diff --git a/fs/exec.c b/fs/exec.c index e96fd5328739..fbcb36bc4fd1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -241,6 +241,7 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, static int __bprm_mm_init(struct linux_binprm *bprm) { + DEFINE_RANGE_LOCK_FULL(mmrange); int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; @@ -250,7 +251,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) return -ENOMEM; vma_set_anonymous(vma); - if (down_write_killable(&mm->mmap_sem)) { + if (mm_write_lock_killable(mm, &mmrange)) { err = -EINTR; goto err_free; } @@ -273,11 +274,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm) mm->stack_vm = mm->total_vm = 1; arch_bprm_mm_init(mm, vma); - up_write(&mm->mmap_sem); + mm_write_unlock(mm, &mmrange); bprm->p = vma->vm_end - sizeof(void *); return 0; err: - up_write(&mm->mmap_sem); + mm_write_unlock(mm, &mmrange); err_free: bprm->vma = NULL; vm_area_free(vma); @@ -691,6 +692,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { + DEFINE_RANGE_LOCK_FULL(mmrange); unsigned long ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; @@ -738,7 +740,7 @@ int setup_arg_pages(struct linux_binprm *bprm, bprm->loader -= stack_shift; bprm->exec -= stack_shift; - if (down_write_killable(&mm->mmap_sem)) + if (mm_write_lock_killable(mm, &mmrange)) return -EINTR; vm_flags = VM_STACK_FLAGS; @@ -795,7 +797,7 @@ int setup_arg_pages(struct linux_binprm *bprm, ret = -EFAULT; out_unlock: - up_write(&mm->mmap_sem); + mm_write_unlock
[PATCH 12/12] powerpc/configs: Enable secure guest support in pseries and ppc64 defconfigs
From: Ryan Grimm Enables running as a secure guest in platforms with an Ultravisor. Signed-off-by: Ryan Grimm Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/configs/ppc64_defconfig | 1 + arch/powerpc/configs/pseries_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index d7c381009636..725297438320 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -31,6 +31,7 @@ CONFIG_DTL=y CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y CONFIG_IBMEBUS=y +CONFIG_PPC_SVM=y CONFIG_PPC_MAPLE=y CONFIG_PPC_PASEMI=y CONFIG_PPC_PASEMI_IOMMU=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 62e12f61a3b2..724a574fe4b2 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -42,6 +42,7 @@ CONFIG_DTL=y CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y CONFIG_IBMEBUS=y +CONFIG_PPC_SVM=y # CONFIG_PPC_PMAC is not set CONFIG_RTAS_FLASH=m CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
[PATCH 13/14] drivers: teach the mm about range locking
Conversion is straightforward, mmap_sem is used within the the same function context most of the time. No change in semantics. Signed-off-by: Davidlohr Bueso --- drivers/android/binder_alloc.c | 7 --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 7 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 9 + drivers/gpu/drm/amd/amdkfd/kfd_events.c | 5 +++-- drivers/gpu/drm/i915/i915_gem.c | 5 +++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 11 +++ drivers/gpu/drm/nouveau/nouveau_svm.c| 23 ++- drivers/gpu/drm/radeon/radeon_cs.c | 5 +++-- drivers/gpu/drm/radeon/radeon_gem.c | 8 +--- drivers/gpu/drm/radeon/radeon_mn.c | 7 --- drivers/gpu/drm/ttm/ttm_bo_vm.c | 4 ++-- drivers/infiniband/core/umem.c | 7 --- drivers/infiniband/core/umem_odp.c | 12 +++- drivers/infiniband/core/uverbs_main.c| 5 +++-- drivers/infiniband/hw/mlx4/mr.c | 5 +++-- drivers/infiniband/hw/qib/qib_user_pages.c | 7 --- drivers/infiniband/hw/usnic/usnic_uiom.c | 5 +++-- drivers/iommu/amd_iommu_v2.c | 4 ++-- drivers/iommu/intel-svm.c| 4 ++-- drivers/media/v4l2-core/videobuf-core.c | 5 +++-- drivers/media/v4l2-core/videobuf-dma-contig.c| 5 +++-- drivers/media/v4l2-core/videobuf-dma-sg.c| 5 +++-- drivers/misc/cxl/cxllib.c| 5 +++-- drivers/misc/cxl/fault.c | 5 +++-- drivers/misc/sgi-gru/grufault.c | 20 drivers/misc/sgi-gru/grufile.c | 5 +++-- drivers/misc/sgi-gru/grukservices.c | 4 +++- drivers/misc/sgi-gru/grumain.c | 6 -- drivers/misc/sgi-gru/grutables.h | 5 - drivers/oprofile/buffer_sync.c | 12 +++- drivers/staging/kpc2000/kpc_dma/fileops.c| 5 +++-- drivers/tee/optee/call.c | 5 +++-- drivers/vfio/vfio_iommu_type1.c | 9 + drivers/xen/gntdev.c | 5 +++-- drivers/xen/privcmd.c| 17 ++--- include/linux/hmm.h | 7 --- 37 files changed, 160 insertions(+), 109 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index bb929eb87116..0b9cd9becd76 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -195,6 
+195,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, struct vm_area_struct *vma = NULL; struct mm_struct *mm = NULL; bool need_mm = false; + DEFINE_RANGE_LOCK_FULL(mmrange); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: %s pages %pK-%pK\n", alloc->pid, @@ -220,7 +221,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, mm = alloc->vma_vm_mm; if (mm) { - down_read(&mm->mmap_sem); + mm_read_lock(mm, &mmrange); vma = alloc->vma; } @@ -279,7 +280,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, /* vm_insert_page does not seem to increment the refcount */ } if (mm) { - up_read(&mm->mmap_sem); + mm_read_unlock(mm, &mmrange); mmput(mm); } return 0; @@ -310,7 +311,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } err_no_vma: if (mm) { - up_read(&mm->mmap_sem); + mm_read_unlock(mm, &mmrange); mmput(mm); } return vma ? -ENOMEM : -ESRCH; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 123eb0d7e2e9..28ddd42b27be 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1348,9 +1348,9 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( * concurrently and the queues are actually stopped */ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { - down_write(¤t->mm->mmap_sem); + mm_write_lock(current->mm, &mmrange); is_invalid_userptr = atomic_read(&mem->invalid); - up_write(¤t->mm->mmap_sem); + mm_write_unlock(current->mm, &mmrange); } mutex_lock(&mem->lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 58ed401c5996..d002df91c7b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++
[PATCH 14/14] mm: convert mmap_sem to range mmap_lock
With mmrange now in place and everyone using the mm locking wrappers, we can convert the rwsem to the range locking scheme. Every single user of mmap_sem will use a full range, which means that there is no more parallelism than what we already had. This is the worst case scenario. Prefetching and some lockdep stuff have been blindly converted (for now). This lays out the foundations for later mm address space locking scalability. Signed-off-by: Davidlohr Bueso --- arch/x86/events/core.c | 2 +- arch/x86/kernel/tboot.c| 2 +- arch/x86/mm/fault.c| 2 +- drivers/firmware/efi/efi.c | 2 +- include/linux/mm.h | 26 +- include/linux/mm_types.h | 4 ++-- kernel/bpf/stackmap.c | 9 + kernel/fork.c | 2 +- mm/init-mm.c | 2 +- mm/memory.c| 2 +- 10 files changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f315425d8468..45ecca077255 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2179,7 +2179,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. 
*/ - lockdep_assert_held_exclusive(&mm->mmap_sem); + lockdep_assert_held_exclusive(&mm->mmap_lock); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 6e5ef8fb8a02..e5423e2451d3 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -104,7 +104,7 @@ static struct mm_struct tboot_mm = { .pgd= swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .mmap_lock = __RANGE_LOCK_TREE_INITIALIZER(init_mm.mmap_lock), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), }; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fbb060c89e7d..9f285ba76f1e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1516,7 +1516,7 @@ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { - prefetchw(&current->mm->mmap_sem); + prefetchw(&current->mm->mmap_lock); if (unlikely(kmmio_fault(regs, address))) return; diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 55b77c576c42..01e4937f3cea 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -80,7 +80,7 @@ struct mm_struct efi_mm = { .mm_rb = RB_ROOT, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), + .mmap_lock = __RANGE_LOCK_TREE_INITIALIZER(efi_mm.mmap_lock), .page_table_lock= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bf3e2542047..5ac33c46679f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2899,74 +2899,74 @@ static inline void setup_nr_node_ids(void) {} static inline bool mm_is_locked(struct mm_struct *mm, struct range_lock *mmrange) { 
- return rwsem_is_locked(&mm->mmap_sem); + return range_is_locked(&mm->mmap_lock, mmrange); } /* Reader wrappers */ static inline int mm_read_trylock(struct mm_struct *mm, struct range_lock *mmrange) { - return down_read_trylock(&mm->mmap_sem); + return range_read_trylock(&mm->mmap_lock, mmrange); } static inline void mm_read_lock(struct mm_struct *mm, struct range_lock *mmrange) { - down_read(&mm->mmap_sem); + range_read_lock(&mm->mmap_lock, mmrange); } static inline void mm_read_lock_nested(struct mm_struct *mm, struct range_lock *mmrange, int subclass) { - down_read_nested(&mm->mmap_sem, subclass); + range_read_lock_nested(&mm->mmap_lock, mmrange, subclass); } static inline void mm_read_unlock(struct mm_struct *mm, struct range_lock *mmrange) { - up_read(&mm->mmap_sem); + range_read_unlock(&mm->mmap_lock, mmrange); } /* Writer wrappers */ static inline int mm_write_trylock(struct mm_struct *mm, struct range_lock *mmrange) { - return down_write_trylock(&mm->mmap_sem); + return
[PATCH 06/14] mm: teach the mm about range locking
Conversion is straightforward, mmap_sem is used within the the same function context most of the time, and we already have vmf updated. No changes in semantics. Signed-off-by: Davidlohr Bueso --- include/linux/mm.h | 8 +++--- mm/filemap.c | 8 +++--- mm/frame_vector.c | 4 +-- mm/gup.c | 21 +++ mm/hmm.c | 3 ++- mm/khugepaged.c| 54 +-- mm/ksm.c | 42 +- mm/madvise.c | 36 ++ mm/memcontrol.c| 10 +--- mm/memory.c| 10 +--- mm/mempolicy.c | 25 ++ mm/migrate.c | 10 +--- mm/mincore.c | 6 +++-- mm/mlock.c | 20 +-- mm/mmap.c | 69 -- mm/mmu_notifier.c | 9 --- mm/mprotect.c | 15 ++- mm/mremap.c| 9 --- mm/msync.c | 9 --- mm/nommu.c | 25 ++ mm/oom_kill.c | 5 ++-- mm/process_vm_access.c | 4 +-- mm/shmem.c | 2 +- mm/swapfile.c | 5 ++-- mm/userfaultfd.c | 21 --- mm/util.c | 10 +--- 26 files changed, 252 insertions(+), 188 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 044e428b1905..8bf3e2542047 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1459,6 +1459,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * right now." 1 means "skip the current vma." 
* @mm:mm_struct representing the target process of page table walk * @vma: vma currently walked (NULL if walking outside vmas) + * @mmrange: mm address space range locking * @private: private data for callbacks' usage * * (see the comment on walk_page_range() for more details) @@ -2358,8 +2359,8 @@ static inline int check_data_rlimit(unsigned long rlim, return 0; } -extern int mm_take_all_locks(struct mm_struct *mm); -extern void mm_drop_all_locks(struct mm_struct *mm); +extern int mm_take_all_locks(struct mm_struct *mm, struct range_lock *mmrange); +extern void mm_drop_all_locks(struct mm_struct *mm, struct range_lock *mmrange); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); @@ -2389,7 +2390,8 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); extern int __do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf, bool downgrade); + struct list_head *uf, bool downgrade, + struct range_lock *); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); diff --git a/mm/filemap.c b/mm/filemap.c index 959022841bab..71f0d8a18f40 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1388,7 +1388,7 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, if (flags & FAULT_FLAG_RETRY_NOWAIT) return 0; - up_read(&mm->mmap_sem); + mm_read_unlock(mm, mmrange); if (flags & FAULT_FLAG_KILLABLE) wait_on_page_locked_killable(page); else @@ -1400,7 +1400,7 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, ret = __lock_page_killable(page); if (ret) { - up_read(&mm->mmap_sem); + mm_read_unlock(mm, mmrange); return 0; } } else @@ -2317,7 +2317,7 @@ static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == FAULT_FLAG_ALLOW_RETRY) { fpin = 
get_file(vmf->vma->vm_file); - up_read(&vmf->vma->vm_mm->mmap_sem); + mm_read_unlock(vmf->vma->vm_mm, vmf->lockrange); } return fpin; } @@ -2357,7 +2357,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, * mmap_sem here and return 0 if we don't have a fpin. */ if (*fpin == NULL) - up_read(&vmf->vma->vm_mm->mmap_sem); + mm_read_unlock(vmf->vma->vm_mm, vmf->lockrange); return 0; } } else diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 4e1a577cbb79..ef33d21b3f39 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -47,7 +47,7 @@ int get_vaddr_frames(unsigned long start
[PATCH 02/14] Introduce range reader/writer lock
This implements a sleepable range rwlock, based on interval tree, serializing conflicting/intersecting/overlapping ranges within the tree. The largest range is given by [0, ~0] (inclusive). Unlike traditional locks, range locking involves dealing with the tree itself and the range to be locked, normally stack allocated and always explicitly prepared/initialized by the user in a [a0, a1] a0 <= a1 sorted manner, before actually taking the lock. Interval-tree based range locking is about controlling tasks' forward progress when adding an arbitrary interval (node) to the tree, depending on any overlapping ranges. A task can only continue (wakeup) if there are no intersecting ranges, thus achieving mutual exclusion. To this end, a reference counter is kept for each intersecting range in the tree (_before_ adding itself to it). To enable shared locking semantics, the reader to-be-locked will not take reference if an intersecting node is also a reader, therefore ignoring the node altogether. Fairness and freedom from starvation are guaranteed by the lack of lock stealing, thus range locks depend directly on interval tree semantics. This is particularly for iterations, where the key for the rbtree is given by the interval's low endpoint, and duplicates are walked as they would be in an inorder traversal of the tree. The cost of lock and unlock of a range is O((1+R_int)log(R_all)) where R_all is the total number of ranges and R_int is the number of ranges intersecting the operated range. How much does it cost: -- The cost of lock and unlock of a range is O((1+R_int)log(R_all)) where R_all is the total number of ranges and R_int is the number of ranges intersecting the new range to be added. Due to its sharable nature, full range locks can be compared with rw-semaphores, which also serve from a mutex standpoint as writer-only situations are pretty similar nowadays. 
The first is the memory footprint, tree locks are smaller than rwsems: 32 vs 40 bytes, but require an additional 72 bytes of stack for the range structure. Secondly, because every range call is serialized by the tree->lock, any lock() fastpath will at least have an interval_tree_insert() and spinlock lock+unlock overhead compared to a single atomic insn in the case of rwsems. Similar scenario obviously for the unlock() case. The torture module was used to measure 1-1 differences in lock acquisition with increasing core counts over a period of 10 minutes. Readers and writers are interleaved, with a slight advantage to writers as its the first kthread that is created. The following shows the avg ops/minute with various thread-setups on boxes with small and large core-counts. ** 4-core AMD Opteron ** (write-only) rwsem-2thr: 4198.5, stddev: 7.77 range-2thr: 4199.1, stddev: 0.73 rwsem-4thr: 6036.8, stddev: 50.91 range-4thr: 6004.9, stddev: 126.57 rwsem-8thr: 6245.6, stddev: 59.39 range-8thr: 6229.3, stddev: 10.60 (read-only) rwsem-2thr: 5930.7, stddev: 21.92 range-2thr: 5917.3, stddev: 25.45 rwsem-4thr: 9881.6, stddev: 0.70 range-4thr: 9540.2, stddev: 98.28 rwsem-8thr: 11633.2, stddev: 7.72 range-8thr: 11314.7, stddev: 62.22 For the read/write-only cases, there is very little difference between the range lock and rwsems, with up to a 3% hit, which could very well be considered in the noise range. 
(read-write) rwsem-write-1thr: 1744.8, stddev: 11.59 rwsem-read-1thr: 1043.1, stddev: 3.97 range-write-1thr: 1740.2, stddev: 5.99 range-read-1thr: 1022.5, stddev: 6.41 rwsem-write-2thr: 1662.5, stddev: 0.70 rwsem-read-2thr: 1278.0, stddev: 25.45 range-write-2thr: 1321.5, stddev: 51.61 range-read-2thr: 1243.5, stddev: 30.40 rwsem-write-4thr: 1761.0, stddev: 11.31 rwsem-read-4thr: 1426.0, stddev: 7.07 range-write-4thr: 1417.0, stddev: 29.69 range-read-4thr: 1398.0, stddev: 56.56 While single reader and writer threads do not show much difference, increasing core counts shows that in reader/writer workloads, writer threads can take a hit in raw performance of up to ~20%, while reader throughput is quite similar among both locks. ** 240-core (ht) IvyBridge ** (write-only) rwsem-120thr: 6844.5, stddev: 82.73 range-120thr: 6070.5, stddev: 85.55 rwsem-240thr: 6292.5, stddev: 146.3 range-240thr: 6099.0, stddev: 15.55 rwsem-480thr: 6164.8, stddev: 33.94 range-480thr: 6062.3, stddev: 19.79 (read-only) rwsem-120thr: 136860.4, stddev: 2539.92 range-120thr: 138052.2, stddev: 327.39 rwsem-240thr: 235297.5, stddev: 2220.50 range-240thr: 232099.1, stddev: 3614.72 rwsem-480thr: 272683.0, stddev: 3924.32 range-480thr: 256539.2, stddev: 9541.69 Similar to the small box, larger machines show that range locks take only a minor (up to ~6% for 480 threads) hit even in completely exclusive or shared scenarios. (read-write) rwsem-write-60thr: 4658.1, stddev: 1303.19 rwsem-read-60thr: 1108.7, stddev: 718.42 range-write-60thr: 3203.6, stddev: 139.30 range-read-60thr: 1852.8, stddev: 147.5 rwsem-write-120thr: 3971.3, stddev:
[PATCH 12/14] kernel: teach the mm about range locking
Conversion is straightforward, mmap_sem is used within the same function context most of the time. No change in semantics. Signed-off-by: Davidlohr Bueso --- kernel/acct.c | 5 +++-- kernel/bpf/stackmap.c | 7 +-- kernel/events/core.c| 5 +++-- kernel/events/uprobes.c | 20 kernel/exit.c | 9 + kernel/fork.c | 16 ++-- kernel/futex.c | 5 +++-- kernel/sched/fair.c | 5 +++-- kernel/sys.c| 22 +- kernel/trace/trace_output.c | 5 +++-- 10 files changed, 60 insertions(+), 39 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 81f9831a7859..2bbcecbd78ef 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -538,14 +538,15 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + DEFINE_RANGE_LOCK_FULL(mmrange); - down_read(&current->mm->mmap_sem); + mm_read_lock(current->mm, &mmrange); vma = current->mm->mmap; while (vma) { vsize += vma->vm_end - vma->vm_start; vma = vma->vm_next; } - up_read(&current->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); } spin_lock_irq(&current->sighand->siglock); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 950ab2f28922..fdb352bea7e8 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -37,6 +37,7 @@ struct bpf_stack_map { struct stack_map_irq_work { struct irq_work irq_work; struct rw_semaphore *sem; + struct range_lock *mmrange; }; static void do_up_read(struct irq_work *entry) @@ -291,6 +292,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, struct vm_area_struct *vma; bool irq_work_busy = false; struct stack_map_irq_work *work = NULL; + DEFINE_RANGE_LOCK_FULL(mmrange); if (in_nmi()) { work = this_cpu_ptr(&up_read_work); @@ -309,7 +311,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. 
*/ if (!user || !current || !current->mm || irq_work_busy || - down_read_trylock(¤t->mm->mmap_sem) == 0) { + mm_read_trylock(current->mm, &mmrange) == 0) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -334,9 +336,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - up_read(¤t->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); } else { work->sem = ¤t->mm->mmap_sem; + work->mmrange = &mmrange; irq_work_queue(&work->irq_work); /* * The irq_work will release the mmap_sem with diff --git a/kernel/events/core.c b/kernel/events/core.c index abbd4b3b96c2..3b43cfe63b54 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9079,6 +9079,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) struct mm_struct *mm = NULL; unsigned int count = 0; unsigned long flags; + DEFINE_RANGE_LOCK_FULL(mmrange); /* * We may observe TASK_TOMBSTONE, which means that the event tear-down @@ -9092,7 +9093,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (!mm) goto restart; - down_read(&mm->mmap_sem); + mm_read_lock(mm, &mmrange); } raw_spin_lock_irqsave(&ifh->lock, flags); @@ -9118,7 +9119,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { - up_read(&mm->mmap_sem); + mm_read_unlock(mm, &mmrange); mmput(mm); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3689eceb8d0c..6779c237799a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -997,6 +997,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) bool is_register = !!new; struct map_info *info; int err = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, @@ -1013,7 +1014,7 @@ register_for_each_vma(struct uprobe *uprobe, struct 
uprobe_consumer *new) if (err && is_register) goto free; - down_write(&mm->mmap_sem); + mm_write_lock(mm, &mmrange); vma = find_vma(
[PATCH 11/14] ipc: teach the mm about range locking
Conversion is straightforward, mmap_sem is used within the same function context most of the time. No change in semantics. Signed-off-by: Davidlohr Bueso --- ipc/shm.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index ce1ca9f7c6e9..3666fa71bfc2 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1418,6 +1418,7 @@ COMPAT_SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, void __user *, uptr) long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, unsigned long shmlba) { + DEFINE_RANGE_LOCK_FULL(mmrange); struct shmid_kernel *shp; unsigned long addr = (unsigned long)shmaddr; unsigned long size; @@ -1544,7 +1545,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, if (err) goto out_fput; - if (down_write_killable(&current->mm->mmap_sem)) { + if (mm_write_lock_killable(current->mm, &mmrange)) { err = -EINTR; goto out_fput; } @@ -1564,7 +1565,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, if (IS_ERR_VALUE(addr)) err = (long)addr; invalid: - up_write(&current->mm->mmap_sem); + mm_write_unlock(current->mm, &mmrange); if (populate) mm_populate(addr, populate); @@ -1625,6 +1626,7 @@ COMPAT_SYSCALL_DEFINE3(shmat, int, shmid, compat_uptr_t, shmaddr, int, shmflg) */ long ksys_shmdt(char __user *shmaddr) { + DEFINE_RANGE_LOCK_FULL(mmrange); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long addr = (unsigned long)shmaddr; @@ -1638,7 +1640,7 @@ long ksys_shmdt(char __user *shmaddr) if (addr & ~PAGE_MASK) return retval; - if (down_write_killable(&mm->mmap_sem)) + if (mm_write_lock_killable(mm, &mmrange)) return -EINTR; /* @@ -1726,7 +1728,7 @@ long ksys_shmdt(char __user *shmaddr) #endif - up_write(&mm->mmap_sem); + mm_write_unlock(mm, &mmrange); return retval; } -- 2.16.4
[PATCH 08/14] arch/x86: teach the mm about range locking
Conversion is straightforward, mmap_sem is used within the the same function context most of the time. No change in semantics. Signed-off-by: Davidlohr Bueso --- arch/x86/entry/vdso/vma.c | 12 +++- arch/x86/kernel/vm86_32.c | 5 +++-- arch/x86/kvm/paging_tmpl.h | 9 + arch/x86/mm/debug_pagetables.c | 8 arch/x86/mm/fault.c| 8 arch/x86/mm/mpx.c | 15 +-- arch/x86/um/vdso/vma.c | 5 +++-- 7 files changed, 35 insertions(+), 27 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index babc4e7a519c..f6d8950f37b8 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -145,12 +145,13 @@ static const struct vm_special_mapping vvar_mapping = { */ static int map_vdso(const struct vdso_image *image, unsigned long addr) { + DEFINE_RANGE_LOCK_FULL(mmrange); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long text_start; int ret = 0; - if (down_write_killable(&mm->mmap_sem)) + if (mm_write_lock_killable(mm, &mmrange)) return -EINTR; addr = get_unmapped_area(NULL, addr, @@ -193,7 +194,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) } up_fail: - up_write(&mm->mmap_sem); + mm_write_unlock(mm, &mmrange); return ret; } @@ -254,8 +255,9 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + DEFINE_RANGE_LOCK_FULL(mmrange); - down_write(&mm->mmap_sem); + mm_write_lock(mm, &mmrange); /* * Check if we have already mapped vdso blob - fail to prevent * abusing from userspace install_speciall_mapping, which may @@ -266,11 +268,11 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma_is_special_mapping(vma, &vdso_mapping) || vma_is_special_mapping(vma, &vvar_mapping)) { - up_write(&mm->mmap_sem); + mm_write_unlock(mm, &mmrange); return -EEXIST; } } - up_write(&mm->mmap_sem); + mm_write_unlock(mm, &mmrange); return map_vdso(image, addr); } 
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 6a38717d179c..39eecee07dcd 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -171,8 +171,9 @@ static void mark_screen_rdonly(struct mm_struct *mm) pmd_t *pmd; pte_t *pte; int i; + DEFINE_RANGE_LOCK_FULL(mmrange); - down_write(&mm->mmap_sem); + mm_write_lock(mm, &mmrange); pgd = pgd_offset(mm, 0xA); if (pgd_none_or_clear_bad(pgd)) goto out; @@ -198,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) } pte_unmap_unlock(pte, ptl); out: - up_write(&mm->mmap_sem); + mm_write_unlock(mm, &mmrange); flush_tlb_mm_range(mm, 0xA, 0xA + 32*PAGE_SIZE, PAGE_SHIFT, false); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 367a47df4ba0..347d3ba41974 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -152,23 +152,24 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK; unsigned long pfn; unsigned long paddr; + DEFINE_RANGE_LOCK_FULL(mmrange); - down_read(¤t->mm->mmap_sem); + mm_read_lock(current->mm, &mmrange); vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE); if (!vma || !(vma->vm_flags & VM_PFNMAP)) { - up_read(¤t->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); return -EFAULT; } pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; paddr = pfn << PAGE_SHIFT; table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB); if (!table) { - up_read(¤t->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); return -EFAULT; } ret = CMPXCHG(&table[index], orig_pte, new_pte); memunmap(table); - up_read(¤t->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); } return (ret != orig_pte); diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index cd84f067e41d..0d131edc6a75 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -15,9 +15,9 @@ DEFI
[PATCH 09/14] virt: teach the mm about range locking
Conversion is straightforward, mmap_sem is used within the the same function context most of the time. No change in semantics. Signed-off-by: Davidlohr Bueso --- virt/kvm/arm/mmu.c | 17 ++--- virt/kvm/async_pf.c | 4 ++-- virt/kvm/kvm_main.c | 11 ++- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 74b6582eaa3c..85f8b9ccfabe 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -980,9 +980,10 @@ void stage2_unmap_vm(struct kvm *kvm) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int idx; + DEFINE_RANGE_LOCK_FULL(mmrange); idx = srcu_read_lock(&kvm->srcu); - down_read(¤t->mm->mmap_sem); + mm_read_lock(current->mm, &mmrange); spin_lock(&kvm->mmu_lock); slots = kvm_memslots(kvm); @@ -990,7 +991,7 @@ void stage2_unmap_vm(struct kvm *kvm) stage2_unmap_memslot(kvm, memslot); spin_unlock(&kvm->mmu_lock); - up_read(¤t->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); srcu_read_unlock(&kvm->srcu, idx); } @@ -1688,6 +1689,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_pfn_t pfn; pgprot_t mem_type = PAGE_S2; bool logging_active = memslot_is_logging(memslot); + DEFINE_RANGE_LOCK_FULL(mmrange); unsigned long vma_pagesize, flags = 0; write_fault = kvm_is_write_fault(vcpu); @@ -1700,11 +1702,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } /* Let's check if we will get back a huge page backed by hugetlbfs */ - down_read(¤t->mm->mmap_sem); + mm_read_lock(current->mm, &mmrange); vma = find_vma_intersection(current->mm, hva, hva + 1); if (unlikely(!vma)) { kvm_err("Failed to find VMA for hva 0x%lx\n", hva); - up_read(¤t->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); return -EFAULT; } @@ -1725,7 +1727,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (vma_pagesize == PMD_SIZE || (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> 
PAGE_SHIFT; - up_read(¤t->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); /* We need minimum second+third level pages */ ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), @@ -2280,6 +2282,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, hva_t reg_end = hva + mem->memory_size; bool writable = !(mem->flags & KVM_MEM_READONLY); int ret = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && change != KVM_MR_FLAGS_ONLY) @@ -2293,7 +2296,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, (kvm_phys_size(kvm) >> PAGE_SHIFT)) return -EFAULT; - down_read(¤t->mm->mmap_sem); + mm_read_lock(current->mm, &mmrange); /* * A memory region could potentially cover multiple VMAs, and any holes * between them, so iterate over all of them to find out if we can map @@ -2361,7 +2364,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, stage2_flush_memslot(kvm, memslot); spin_unlock(&kvm->mmu_lock); out: - up_read(¤t->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); return ret; } diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index e93cd8515134..03d9f9bc5270 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -87,11 +87,11 @@ static void async_pf_execute(struct work_struct *work) * mm and might be done in another context, so we must * access remotely. 
*/ - down_read(&mm->mmap_sem); + mm_read_lock(mm, &mmrange); get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL, &locked, &mmrange); if (locked) - up_read(&mm->mmap_sem); + mm_read_unlock(mm, &mmrange); kvm_async_page_present_sync(vcpu, apf); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e1484150a3dd..421652e66a03 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1331,6 +1331,7 @@ EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) { struct vm_area_struct *vma; + DEFINE_RANGE_LOCK_FULL(mmrange); unsigned long addr, size; size = PAGE_SIZE; @@ -1339,7 +1340,7 @@ unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) if (kvm_is_error_hva(addr)) return PAGE_SIZE; - down_read(¤t->mm->mmap_sem); + mm_read_lock(current->mm, &mmrange); vma = find_vma(current->mm, addr); if (!v
[PATCH 10/14] net: teach the mm about range locking
Conversion is straightforward, mmap_sem is used within the same function context most of the time. No change in semantics. Signed-off-by: Davidlohr Bueso --- net/ipv4/tcp.c | 5 +++-- net/xdp/xdp_umem.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 53d61ca3ac4b..2be929dcafa8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1731,6 +1731,7 @@ static int tcp_zerocopy_receive(struct sock *sk, struct tcp_sock *tp; int inq; int ret; + DEFINE_RANGE_LOCK_FULL(mmrange); if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; @@ -1740,7 +1741,7 @@ static int tcp_zerocopy_receive(struct sock *sk, sock_rps_record_flow(sk); - down_read(&current->mm->mmap_sem); + mm_read_lock(current->mm, &mmrange); ret = -EINVAL; vma = find_vma(current->mm, address); @@ -1802,7 +1803,7 @@ static int tcp_zerocopy_receive(struct sock *sk, frags++; } out: - up_read(&current->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); if (length) { tp->copied_seq = seq; tcp_rcv_space_adjust(sk); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 2b18223e7eb8..2bf444fb998d 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -246,16 +246,17 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) unsigned int gup_flags = FOLL_WRITE; long npgs; int err; + DEFINE_RANGE_LOCK_FULL(mmrange); umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; - down_read(&current->mm->mmap_sem); + mm_read_lock(current->mm, &mmrange); npgs = get_user_pages(umem->address, umem->npgs, gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); - up_read(&current->mm->mmap_sem); + mm_read_unlock(current->mm, &mmrange); if (npgs != umem->npgs) { if (npgs >= 0) { -- 2.16.4
[PATCH 10/12] powerpc/pseries/iommu: Don't use dma_iommu_ops on secure guests
Secure guest memory is inaccessible to devices so regular DMA isn't possible. In that case set devices' dma_map_ops to NULL so that the generic DMA code path will use SWIOTLB to bounce buffers for DMA. Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/platforms/pseries/iommu.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 03bbb299320e..7d9550edb700 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "pseries.h" @@ -1332,7 +1333,10 @@ void iommu_init_early_pSeries(void) of_reconfig_notifier_register(&iommu_reconfig_nb); register_memory_notifier(&iommu_mem_nb); - set_pci_dma_ops(&dma_iommu_ops); + if (is_secure_guest()) + set_pci_dma_ops(NULL); + else + set_pci_dma_ops(&dma_iommu_ops); } static int __init disable_multitce(char *str)
[PATCH 05/14] mm: remove some BUG checks wrt mmap_sem
This patch is a collection of hacks that shamelessly remove mmap_sem state checks in order to not have to teach file_operations about range locking; for thp and huge pagecache: By dropping the rwsem_is_locked checks in zap_pmd_range() and zap_pud_range() we can avoid having to teach file_operations about mmrange. For example in xfs: iomap_dio_rw() is called by .read_iter file callbacks. We also avoid mmap_sem trylock in vm_insert_page(): The rules to this function state that mmap_sem must be acquired by the caller: - for write if used in f_op->mmap() (by far the most common case) - for read if used from vma_op->fault() (with VM_MIXEDMAP) The only exception is: mmap_vmcore() remap_vmalloc_range_partial() mmap_vmcore() But there is no concurrency here, thus mmap_sem is not held. After auditing the kernel, the following drivers use the fault path and correctly set VM_MIXEDMAP: .fault = etnaviv_gem_fault .fault = udl_gem_fault tegra_bo_fault() As such, drop the reader trylock BUG_ON() for the common case. This avoids having file_operations know about mmranges, as mmap_sem is held during mmap(), for example. 
Signed-off-by: Davidlohr Bueso --- include/linux/huge_mm.h | 2 -- mm/memory.c | 2 -- mm/mmap.c | 4 ++-- mm/pagewalk.c | 3 --- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7cd5c150c21d..a4a9cfa78d8f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -194,7 +194,6 @@ static inline int is_swap_pmd(pmd_t pmd) static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else @@ -203,7 +202,6 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { - VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); if (pud_trans_huge(*pud) || pud_devmap(*pud)) return __pud_trans_huge_lock(pud, vma); else diff --git a/mm/memory.c b/mm/memory.c index 9516c95108a1..73971f859035 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1212,7 +1212,6 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, next = pud_addr_end(addr, end); if (pud_trans_huge(*pud) || pud_devmap(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { - VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma); split_huge_pud(vma, pud, addr); } else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; @@ -1519,7 +1518,6 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, if (!page_count(page)) return -EINVAL; if (!(vma->vm_flags & VM_MIXEDMAP)) { - BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); BUG_ON(vma->vm_flags & VM_PFNMAP); vma->vm_flags |= VM_MIXEDMAP; } diff --git a/mm/mmap.c b/mm/mmap.c index af228ae3508d..a03ded49f9eb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3466,7 +3466,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * 
because we hold the mm_all_locks_mutex. */ - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); + down_write(&mm->mmap_sem); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares @@ -3496,7 +3496,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); + down_write(&mm->mmap_sem); } } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3084ff2569d..6246acf17054 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -303,8 +303,6 @@ int walk_page_range(unsigned long start, unsigned long end, if (!walk->mm) return -EINVAL; - VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); - vma = find_vma(walk->mm, start); do { if (!vma) { /* after the last vma */ @@ -346,7 +344,6 @@ int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) if (!walk->mm) return -EINVAL; - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); VM_BUG_ON(!vma); walk->vma = vma; err
[PATCH 03/14] mm: introduce mm locking wrappers
This patch adds the necessary wrappers to encapsulate mmap_sem locking and will enable any future changes to be a lot more confined to here. In addition, future users will incrementally be added in the next patches. mm_[read/write]_[un]lock() naming is used. Signed-off-by: Davidlohr Bueso --- include/linux/mm.h | 76 ++ 1 file changed, 76 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e8834ac32b7..780b6097ee47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -2880,5 +2881,80 @@ void __init setup_nr_node_ids(void); static inline void setup_nr_node_ids(void) {} #endif +/* + * Address space locking wrappers. + */ +static inline bool mm_is_locked(struct mm_struct *mm, + struct range_lock *mmrange) +{ + return rwsem_is_locked(&mm->mmap_sem); +} + +/* Reader wrappers */ +static inline int mm_read_trylock(struct mm_struct *mm, + struct range_lock *mmrange) +{ + return down_read_trylock(&mm->mmap_sem); +} + +static inline void mm_read_lock(struct mm_struct *mm, + struct range_lock *mmrange) +{ + down_read(&mm->mmap_sem); +} + +static inline void mm_read_lock_nested(struct mm_struct *mm, + struct range_lock *mmrange, int subclass) +{ + down_read_nested(&mm->mmap_sem, subclass); +} + +static inline void mm_read_unlock(struct mm_struct *mm, + struct range_lock *mmrange) +{ + up_read(&mm->mmap_sem); +} + +/* Writer wrappers */ +static inline int mm_write_trylock(struct mm_struct *mm, + struct range_lock *mmrange) +{ + return down_write_trylock(&mm->mmap_sem); +} + +static inline void mm_write_lock(struct mm_struct *mm, +struct range_lock *mmrange) +{ + down_write(&mm->mmap_sem); +} + +static inline int mm_write_lock_killable(struct mm_struct *mm, +struct range_lock *mmrange) +{ + return down_write_killable(&mm->mmap_sem); +} + +static inline void mm_downgrade_write(struct mm_struct *mm, + struct range_lock *mmrange) +{ + downgrade_write(&mm->mmap_sem); +} + 
+static inline void mm_write_unlock(struct mm_struct *mm, + struct range_lock *mmrange) +{ + up_write(&mm->mmap_sem); +} + +static inline void mm_write_lock_nested(struct mm_struct *mm, + struct range_lock *mmrange, + int subclass) +{ + down_write_nested(&mm->mmap_sem, subclass); +} + +#define mm_write_nest_lock(mm, range, nest_lock) \ + down_write_nest_lock(&(mm)->mmap_sem, nest_lock) + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ -- 2.16.4
[PATCH 04/14] mm: teach pagefault paths about range locking
When handling a page fault, it happens that the mmap_sem is released during the processing. As moving to range lock requires remembering the range parameter to do the lock/unlock, this patch adds a pointer to struct vm_fault. As such, we work outwards from arming the vmf from: handle_mm_fault(), __collapse_huge_page_swapin() and hugetlb_no_page() The idea is to use a local, stack allocated variable (no concurrency) whenever the mmap_sem is originally taken and we end up in pf paths that end up retaking the lock. I.e.: DEFINE_RANGE_LOCK_FULL(mmrange); down_write(&mm->mmap_sem); some_fn(a, b, c, &mmrange); ... handle_mm_fault(vma, addr, flags, mmrange); ... up_write(&mm->mmap_sem); Consequently, we also end up updating lock_page_or_retry(), which can drop the mmap_sem. For the gup family, we pass nil for scenarios when the semaphore will remain untouched. Semantically nothing changes at all, and the 'mmrange' ends up being unused for now. Later patches will use the variable when the mmap_sem wrappers replace straightforward down/up. *** For simplicity, this patch breaks when used in ksm and hmm. 
*** Signed-off-by: Davidlohr Bueso --- arch/x86/mm/fault.c | 27 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/infiniband/core/umem_odp.c | 2 +- drivers/iommu/amd_iommu_v2.c| 3 +- drivers/iommu/intel-svm.c | 3 +- drivers/vfio/vfio_iommu_type1.c | 2 +- fs/exec.c | 2 +- include/linux/hugetlb.h | 9 +++-- include/linux/mm.h | 24 include/linux/pagemap.h | 6 +-- kernel/events/uprobes.c | 7 ++-- kernel/futex.c | 2 +- mm/filemap.c| 2 +- mm/frame_vector.c | 6 ++- mm/gup.c| 65 - mm/hmm.c| 4 +- mm/hugetlb.c| 14 --- mm/internal.h | 3 +- mm/khugepaged.c | 24 +++- mm/ksm.c| 3 +- mm/memory.c | 14 --- mm/mempolicy.c | 9 +++-- mm/mmap.c | 4 +- mm/mprotect.c | 2 +- mm/process_vm_access.c | 4 +- security/tomoyo/domain.c| 2 +- virt/kvm/async_pf.c | 3 +- virt/kvm/kvm_main.c | 9 +++-- 29 files changed, 159 insertions(+), 100 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 46df4c6aae46..fb869c292b91 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -938,7 +938,8 @@ bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static void __bad_area(struct pt_regs *regs, unsigned long error_code, - unsigned long address, u32 pkey, int si_code) + unsigned long address, u32 pkey, int si_code, + struct range_lock *mmrange) { struct mm_struct *mm = current->mm; /* @@ -951,9 +952,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, } static noinline void -bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, +struct range_lock *mmrange) { - __bad_area(regs, error_code, address, 0, SEGV_MAPERR); + __bad_area(regs, error_code, address, 0, SEGV_MAPERR, mmrange); } static inline bool bad_area_access_from_pkeys(unsigned long error_code, @@ -975,7 +977,8 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, static noinline void bad_area_access_error(struct 
pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma) + unsigned long address, struct vm_area_struct *vma, + struct range_lock *mmrange) { /* * This OSPKE check is not strictly necessary at runtime. @@ -1005,9 +1008,9 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, */ u32 pkey = vma_pkey(vma); - __bad_area(regs, error_code, address, pkey, SEGV_PKUERR); + __bad_area(regs, error_code, address, pkey, SEGV_PKUERR, mmrange); } else { - __bad_area(regs, error_code, address, 0, SEGV_ACCERR); + __bad_area(regs, error_code, address, 0, SEGV_ACCERR, mmrange); } } @@ -1306,6 +1309,7 @@ void do_user_addr_fault(struct pt_regs *regs, struct mm_struct *mm; vm_fault_t fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RA
[PATCH 08/12] powerpc/pseries/svm: Export guest SVM status to user space via sysfs
From: Ryan Grimm User space might want to know it's running in a secure VM. It can't do a mfmsr because mfmsr is a privileged instruction. The solution here is to create a cpu attribute: /sys/devices/system/cpu/svm which will read 0 or 1 based on the S bit of the guest's CPU 0. Signed-off-by: Ryan Grimm Reviewed-by: Ram Pai Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/kernel/sysfs.c | 29 + 1 file changed, 29 insertions(+) diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index e8e93c2c7d03..8fdab134e9ae 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "cacheinfo.h" #include "setup.h" @@ -714,6 +715,32 @@ static struct device_attribute pa6t_attrs[] = { #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ +#ifdef CONFIG_PPC_SVM +static void get_svm(void *val) +{ + u32 *value = val; + + *value = is_secure_guest(); +} + +static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char *buf) +{ + u32 val; + smp_call_function_single(0, get_svm, &val, 1); + return sprintf(buf, "%u\n", val); +} +static DEVICE_ATTR(svm, 0444, show_svm, NULL); + +static void create_svm_file(void) +{ + device_create_file(cpu_subsys.dev_root, &dev_attr_svm); +} +#else +static void create_svm_file(void) +{ +} +#endif /* CONFIG_PPC_SVM */ + static int register_cpu_online(unsigned int cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); @@ -1057,6 +1084,8 @@ static int __init topology_init(void) sysfs_create_dscr_default(); #endif /* CONFIG_PPC64 */ + create_svm_file(); + return 0; } subsys_initcall(topology_init);
[RFC PATCH 00/14] mmap_sem range locking
Hi, The following is a summarized repost of the range locking mmap_sem idea[1] and is _not_ intended for being considered upstream as there are quite a few issues that arise with this approach of tackling mmap_sem contention (keep reading). In fact this patch is quite incomplete and will break compiling on anything non-x86, and is also _completely broken_ for ksm and hmm. That being said, this does build an enterprise kernel and survives a number of workloads as well as 'runltp -f syscalls'. The previous series is a complete range locking conversion, which ensured we had all the range locking apis we needed. The changelog also included a number of performance numbers and overall design. While finding issues with the code itself is always welcome, the idea of this series is to discuss what can be done on top of it, if anything. From a locking pov, most recently there has been a revival of interest in the range lock code for dchinner's plans of range locking the i_rwsem. However, it showed that xfs's extent tree significantly outperformed[2] the (full) range lock. The performance differences when doing 1:1 rwsem comparisons, have already been shown in [1]. Considering both the range lock and the extent tree lock the whole tree, most of these performance penalties are due to the fact that rbtrees' depth is a lot larger than btree's, so the latter avoids most of the pointer chasing which is a common performance issue. This was a trade-off for not having to allocate memory for the range nodes. 
However, on the _positive side_, and which is what we care most about for mmap_sem, when actually using the lock as intended, the range locking did show its purpose: IOPS read/write (buffered IO) fio processes rwsem rangelock 1 57k / 57k 64k / 64k 2 61k / 61k 111k / 111k 4 61k / 61k 228k / 228k 8 55k / 55k 195k / 195k 16 15k / 15k 40k / 40k So it would be nice to apply this concept to our address space and allow mmaps, munmaps and pagefaults to all work concurrently in non-overlapping scenarios -- which is what is provided by userspace mm related syscalls. However, when using the range lock without a full range, a number of issues around the vma immediately pop up as a consequence of this *top-down* approach to solving scalability: Races within a vma: non-overlapping regions can still belong to the same vma, hence wrecking merges and splits. One popular idea is to have a vma->rwsem (taken, for example, after a find_vma()), however, this throws out the window any potential scalability gains for large vmas as we end up just moving down the point of contention. The same problem occurs when refcounting the vma (such as with speculative pfs). There's also the fact that we can end up taking numerous vma locks as the vma list is later traversed once the first vma is found. Alternatively, we could just expand the passed range such that it covers the whole first and last vma(s) endpoints; of course we don't have that information a priori (protected by mmap_sem :), and enlarging the range _after_ acquiring the lock opens a can of worms because now we have to inform userspace and/or deadlock, among others. Similarly, there's the issue of keeping the vma tree correct during modifications as well as regular find_vma()s. Laurent has already pointed out that we have too many ways of getting a vma: the tree, the list and the vmacache, all currently protected by mmap_sem, and which break because of the above when not using full ranges. 
This also touches a bit on a more *bottom-up* approach to mmap_sem performance, which scales from within, instead of putting a big rangelock tree on top of the address space. Matthew has pointed out the xarray as well as an rcu based maple tree[3] replacement of the rbtree, however we already have the vmacache so most of the benefits of a shallower data structure are unnecessary, in cache-hot situations, naturally. The vma-list is easily removable once we have O(1) next/prev pointers, which for rbtrees can be done via threading the data structure (at the cost of an extra branch for every level down the tree when inserting). Maple trees already give us this. So all in all, if we were going to go down this path of a cache friendlier tree, we'd end up needing comparisons of the maple tree vs the current vmacache+rbtree combo. Regarding rcu-ifying the vma tree and replacing read locking (which therefore plays nicer with cachelines): while it sounds nice, it does not seem practical considering that the page tables cannot be rcu-ified. I'm sure I'm missing a lot more, but I'm hoping to kickstart the conversation again. Patches 1-2: adds the range locking machinery. This is rebased on the rbtree optimizations for interval trees such that we
[PATCH 01/14] interval-tree: build unconditionally
In preparation for range locking, this patch gets rid of CONFIG_INTERVAL_TREE option as we will unconditionally build it. Signed-off-by: Davidlohr Bueso --- drivers/gpu/drm/Kconfig | 2 -- drivers/gpu/drm/i915/Kconfig | 1 - drivers/iommu/Kconfig| 1 - lib/Kconfig | 14 -- lib/Kconfig.debug| 1 - lib/Makefile | 3 +-- 6 files changed, 1 insertion(+), 21 deletions(-) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index e360a4a131e1..3405336175ed 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -200,7 +200,6 @@ config DRM_RADEON select POWER_SUPPLY select HWMON select BACKLIGHT_CLASS_DEVICE - select INTERVAL_TREE help Choose this option if you have an ATI Radeon graphics card. There are both PCI and AGP versions. You don't need to choose this to @@ -220,7 +219,6 @@ config DRM_AMDGPU select POWER_SUPPLY select HWMON select BACKLIGHT_CLASS_DEVICE - select INTERVAL_TREE select CHASH help Choose this option if you have a recent AMD Radeon graphics card. diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 3d5f1cb6a76c..54d4bc8d141f 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -3,7 +3,6 @@ config DRM_I915 depends on DRM depends on X86 && PCI select INTEL_GTT - select INTERVAL_TREE # we need shmfs for the swappable backing store, and in particular # the shmem_readpage() which depends upon tmpfs select SHMEM diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index a2ed2b51a0f7..d21e6dc2adae 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -477,7 +477,6 @@ config VIRTIO_IOMMU depends on VIRTIO=y depends on ARM64 select IOMMU_API - select INTERVAL_TREE help Para-virtualised IOMMU driver with virtio. diff --git a/lib/Kconfig b/lib/Kconfig index 8d9239a4156c..e089ac40c062 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -409,20 +409,6 @@ config TEXTSEARCH_FSM config BTREE bool -config INTERVAL_TREE - bool - help - Simple, embeddable, interval-tree. 
Can find the start of an - overlapping range in log(n) time and then iterate over all - overlapping nodes. The algorithm is implemented as an - augmented rbtree. - - See: - - Documentation/rbtree.txt - - for more information. - config XARRAY_MULTI bool help diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4c35e52c5a2e..54bafed8ba70 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1759,7 +1759,6 @@ config RBTREE_TEST config INTERVAL_TREE_TEST tristate "Interval tree test" depends on DEBUG_KERNEL - select INTERVAL_TREE help A benchmark measuring the performance of the interval tree library diff --git a/lib/Makefile b/lib/Makefile index fb7697031a79..39fd34156692 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -50,7 +50,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o rhashtable.o \ once.o refcount.o usercopy.o errseq.o bucket_locks.o \ -generic-radix-tree.o +generic-radix-tree.o interval_tree.o obj-$(CONFIG_STRING_SELFTEST) += test_string.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o @@ -115,7 +115,6 @@ obj-y += logic_pio.o obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o -obj-$(CONFIG_INTERVAL_TREE) += interval_tree.o obj-$(CONFIG_ASSOCIATIVE_ARRAY) += assoc_array.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o -- 2.16.4