[PATCH net-next 1/1] stmmac: intel: use managed PCI function on probe and resume

2021-03-31 Thread Wong Vee Khee
Update dwmac-intel to use the managed function pcim_enable_device().

This lets the devres framework call the resource release functions for us.
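
For context, a minimal sketch of the managed-probe pattern this relies on
(generic example, not the dwmac-intel code itself): once pcim_enable_device()
succeeds, devres remembers the device and disables it automatically when the
driver detaches, so the remove() and error paths no longer need explicit
pci_disable_device() calls.

    #include <linux/pci.h>

    static int example_pci_probe(struct pci_dev *pdev,
                                 const struct pci_device_id *id)
    {
            int ret;

            /* Managed enable: devres disables the device on detach. */
            ret = pcim_enable_device(pdev);
            if (ret)
                    return ret;

            /* Managed BAR mapping: also released automatically. */
            return pcim_iomap_regions(pdev, BIT(0), "example");
    }

    static void example_pci_remove(struct pci_dev *pdev)
    {
            /* No pci_disable_device() or pcim_iounmap_regions() needed. */
    }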

Signed-off-by: Wong Vee Khee 
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index 3d9a57043af2..add95e20548d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -924,7 +924,7 @@ static int intel_eth_pci_probe(struct pci_dev *pdev,
return -ENOMEM;
 
/* Enable pci device */
-   ret = pci_enable_device(pdev);
+   ret = pcim_enable_device(pdev);
if (ret) {
dev_err(&pdev->dev, "%s: ERROR: failed to enable device\n",
__func__);
@@ -1006,13 +1006,9 @@ static void intel_eth_pci_remove(struct pci_dev *pdev)
 
stmmac_dvr_remove(&pdev->dev);
 
-   pci_free_irq_vectors(pdev);
-
clk_unregister_fixed_rate(priv->plat->stmmac_clk);
 
pcim_iounmap_regions(pdev, BIT(0));
-
-   pci_disable_device(pdev);
 }
 
 static int __maybe_unused intel_eth_pci_suspend(struct device *dev)
@@ -1028,7 +1024,6 @@ static int __maybe_unused intel_eth_pci_suspend(struct 
device *dev)
if (ret)
return ret;
 
-   pci_disable_device(pdev);
pci_wake_from_d3(pdev, true);
return 0;
 }
@@ -1041,7 +1036,7 @@ static int __maybe_unused intel_eth_pci_resume(struct 
device *dev)
pci_restore_state(pdev);
pci_set_power_state(pdev, PCI_D0);
 
-   ret = pci_enable_device(pdev);
+   ret = pcim_enable_device(pdev);
if (ret)
return ret;
 
-- 
2.25.1



[PATCH] misc: vmw_vmci: initialize payload passed to vmci_send_datagram()

2021-03-31 Thread Tetsuo Handa
KMSAN complains that the vmci_use_ppn64() == false path in
vmci_dbell_register_notification_bitmap() left the upper 32 bits of the
bitmap_set_msg.bitmap_ppn64 member uninitialized.

KMSAN also complains that vmci_check_host_caps() left the payload part
of check_msg uninitialized.
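
The general shape of the fix is to zero the whole on-stack message before
the used fields are filled in; a minimal sketch of the pattern (the variable
name is taken from the KMSAN report below, this is not a quote of the actual
hunk):

    /*
     * Zero the entire on-stack datagram (header + payload) so that
     * unused members such as the upper half of bitmap_ppn64 are
     * initialized before vmci_send_datagram() pushes it to the device.
     */
    memset(&bitmap_set_msg, 0, sizeof(bitmap_set_msg));
    /* ... then fill in the header and the PPN fields as before ... */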

  [   21.458023][T1] vmw_vmci :00:07.7: Found VMCI PCI device at 
0x11080, irq 16
  [   21.461252][T1] vmw_vmci :00:07.7: Using capabilities 0xc
  [   21.463199][T1] =
  [   21.465014][T1] BUG: KMSAN: uninit-value in kmsan_check_memory+0xd/0x10
  [   21.465014][T1] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.11.0-rc7+ 
#4
  [   21.465014][T1] Hardware name: VMware, Inc. VMware Virtual 
Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
  [   21.465014][T1] Call Trace:
  [   21.465014][T1]  dump_stack+0x21c/0x280
  [   21.465014][T1]  kmsan_report+0xfb/0x1e0
  [   21.465014][T1]  kmsan_internal_check_memory+0x484/0x520
  [   21.465014][T1]  ? kmsan_get_metadata+0x116/0x180
  [   21.465014][T1]  kmsan_check_memory+0xd/0x10
  [   21.465014][T1]  iowrite8_rep+0x86/0x380
  [   21.465014][T1]  vmci_send_datagram+0x150/0x280
  [   21.465014][T1]  vmci_dbell_register_notification_bitmap+0x133/0x1e0
  [   21.465014][T1]  vmci_guest_probe_device+0xcab/0x1e70
  [   21.465014][T1]  ? vmci_send_datagram+0x280/0x280
  [   21.465014][T1]  pci_device_probe+0xab3/0xe70
  [   21.465014][T1]  ? pci_uevent+0x830/0x830
  [   21.465014][T1]  really_probe+0xd16/0x24d0
  [   21.465014][T1]  driver_probe_device+0x29d/0x3a0
  [   21.465014][T1]  device_driver_attach+0x25a/0x490
  [   21.465014][T1]  __driver_attach+0x78c/0x840
  [   21.465014][T1]  ? kmsan_get_metadata+0x116/0x180
  [   21.465014][T1]  bus_for_each_dev+0x210/0x340
  [   21.465014][T1]  ? driver_attach+0xb0/0xb0
  [   21.465014][T1]  driver_attach+0x89/0xb0
  [   21.465014][T1]  bus_add_driver+0x677/0xc40
  [   21.465014][T1]  driver_register+0x485/0x8e0
  [   21.465014][T1]  __pci_register_driver+0x1ff/0x350
  [   21.465014][T1]  vmci_guest_init+0x3e/0x41
  [   21.465014][T1]  vmci_drv_init+0x1d6/0x43f
  [   21.465014][T1]  do_one_initcall+0x39c/0x9a0
  [   21.465014][T1]  ? null_init+0x11dc/0x11dc
  [   21.465014][T1]  ? kmsan_get_metadata+0x116/0x180
  [   21.465014][T1]  ? kmsan_get_shadow_origin_ptr+0x84/0xb0
  [   21.465014][T1]  ? null_init+0x11dc/0x11dc
  [   21.465014][T1]  do_initcall_level+0x1d7/0x259
  [   21.465014][T1]  do_initcalls+0x127/0x1cb
  [   21.465014][T1]  ? cpu_init_udelay+0xcf/0xcf
  [   21.465014][T1]  ? debug_boot_weak_hash_enable+0x61/0x61
  [   21.465014][T1]  do_basic_setup+0x33/0x36
  [   21.465014][T1]  kernel_init_freeable+0x29a/0x3ed
  [   21.465014][T1]  ? rest_init+0x1f0/0x1f0
  [   21.465014][T1]  kernel_init+0x1f/0x840
  [   21.465014][T1]  ? rest_init+0x1f0/0x1f0
  [   21.465014][T1]  ret_from_fork+0x1f/0x30
  [   21.465014][T1]
  [   21.465014][T1] Local variable 
bitmap_set_msg@vmci_dbell_register_notification_bitmap created at:
  [   21.465014][T1]  vmci_dbell_register_notification_bitmap+0x50/0x1e0
  [   21.465014][T1]  vmci_dbell_register_notification_bitmap+0x50/0x1e0
  [   21.465014][T1]
  [   21.465014][T1] Bytes 28-31 of 32 are uninitialized
  [   21.465014][T1] Memory access of size 32 starts at 88810098f570
  [   21.465014][T1] =
  [   21.465014][T1] Disabling lock debugging due to kernel taint
  [   21.539748][T1] =
  [   21.541627][T1] BUG: KMSAN: uninit-value in kmsan_check_memory+0xd/0x10
  [   21.543636][T1] CPU: 1 PID: 1 Comm: swapper/0 Tainted: GB  
   5.11.0-rc7+ #4
  [   21.546134][T1] Hardware name: VMware, Inc. VMware Virtual 
Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
  [   21.549126][T1] Call Trace:
  [   21.549639][T1]  dump_stack+0x21c/0x280
  [   21.549639][T1]  kmsan_report+0xfb/0x1e0
  [   21.549639][T1]  kmsan_internal_check_memory+0x202/0x520
  [   21.549639][T1]  ? kmsan_get_metadata+0x116/0x180
  [   21.549639][T1]  kmsan_check_memory+0xd/0x10
  [   21.549639][T1]  iowrite8_rep+0x86/0x380
  [   21.549639][T1]  vmci_guest_probe_device+0xf0b/0x1e70
  [   21.549639][T1]  ? vmci_send_datagram+0x280/0x280
  [   21.549639][T1]  pci_device_probe+0xab3/0xe70
  [   21.549639][T1]  ? pci_uevent+0x830/0x830
  [   21.549639][T1]  really_probe+0xd16/0x24d0
  [   21.549639][T1]  driver_probe_device+0x29d/0x3a0
  [   21.549639][T1]  device_driver_attach+0x25a/0x490
  [   21.549639][T1]  __driver_attach+0x78c/0x840
  [   21.549639][T1]  ? kmsan_get_metadata+0x116/0x180
  [   21.549639][T1]  bus_for_each_dev+0x210/0x340
  [   

Re: [PATCH 0/3] Fix block comment warnings

2021-03-31 Thread Greg KH
On Wed, Mar 31, 2021 at 01:05:34PM -0700, Deborah Brouwer wrote:
> This patchset fixes checkpatch warnings arising
> from the block comments

Note, your 0/X email subject should also have the subsystem/driver
prefix in there so that we know what this series is for.  Much like your
individual patches do.
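
For example, something along the lines of:

  [PATCH 0/3] staging: <driver>: Fix block comment warnings

(with the actual subsystem/driver the series touches) makes it obvious at a
glance where the series belongs.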

thanks,

greg k-h


Re: [PATCH v1 3/3] KEYS: trusted: Introduce support for NXP CAAM-based trusted keys

2021-03-31 Thread Jarkko Sakkinen
On Thu, Apr 01, 2021 at 12:11:32PM +1100, Herbert Xu wrote:
> On Wed, Mar 31, 2021 at 04:34:29PM -0700, Eric Biggers wrote:
> > On Thu, Apr 01, 2021 at 02:31:46AM +0300, Jarkko Sakkinen wrote:
> > > 
> > > It's a bummer but uapi is the god in the end. Since TPM does not do it
> > > today, that behaviour must be supported forever. That's why a boot option
> > > AND a warning would be the best compromise.
> > 
> > It's not UAPI if there is no way for userspace to tell if it changed.
> 
> Exactly.  UAPI is only an issue if something *breaks*.

If there's even one user who comes shouting that he has a user space
configuration where, e.g., RNG entropy is consumed constantly and the
code assumes that trusted keys do not add to that, then something
would break.

It would be a crap user space, yes, but I don't want to end up reverting
because of that. I think there is a small but real chance that
something could break.

Why not just add a boot parameter instead of making brutally enforced
changes that are indirectly visible to user space?

/Jarkko


Re: [PATCH v1 3/3] KEYS: trusted: Introduce support for NXP CAAM-based trusted keys

2021-03-31 Thread Jarkko Sakkinen
On Wed, Mar 31, 2021 at 04:34:29PM -0700, Eric Biggers wrote:
> On Thu, Apr 01, 2021 at 02:31:46AM +0300, Jarkko Sakkinen wrote:
> > 
> > It's a bummer but uapi is the god in the end. Since TPM does not do it
> > today, that behaviour must be supported forever. That's why a boot option
> > AND a warning would be the best compromise.
> > 
> 
> It's not UAPI if there is no way for userspace to tell if it changed.
> 
> - Eric

It's enough uapi for me. People might assume that the entropy source is
TPM for this, since it has been so far.

/Jarkko


Re: [PATCH 1/1] arm: dts: owl-s500-roseapplepi: Add ATC2603C PMIC

2021-03-31 Thread Manivannan Sadhasivam
On Fri, Mar 12, 2021 at 11:49:27AM +0200, Cristian Ciocaltea wrote:
> Add device tree node for ATC2603C PMIC and remove the 'fixed-3.1V'
> dummy regulator used for the uSD supply.
> 
> Additionally, add 'SYSPWR' fixed regulator and provide cpu0 supply.
> 
> Signed-off-by: Cristian Ciocaltea 

Applied to for-next after fixing the patch subject as below:

"ARM: dts: owl-s500-roseapplepi: Add ATC2603C PMIC"

Thanks,
Mani

> ---
> Please note the patch depends on the ATC260x PMIC support which is queued
> for merging in v5.13:
> 
> https://lore.kernel.org/lkml/cover.1611653995.git.cristian.ciocal...@gmail.com/
> https://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git/log/?h=for-mfd-next=range=a38fd8748464831584a19438cbb3082b5a2dab15..eac013a0b7041f5cfc8feedf429a767675350102
> 
>  arch/arm/boot/dts/owl-s500-roseapplepi.dts | 132 -
>  1 file changed, 126 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm/boot/dts/owl-s500-roseapplepi.dts 
> b/arch/arm/boot/dts/owl-s500-roseapplepi.dts
> index ff91561ca99c..b8c5db2344aa 100644
> --- a/arch/arm/boot/dts/owl-s500-roseapplepi.dts
> +++ b/arch/arm/boot/dts/owl-s500-roseapplepi.dts
> @@ -2,7 +2,7 @@
>  /*
>   * Roseapple Pi
>   *
> - * Copyright (C) 2020 Cristian Ciocaltea 
> + * Copyright (C) 2020-2021 Cristian Ciocaltea 
>   */
>  
>  /dts-v1/;
> @@ -27,20 +27,140 @@ memory@0 {
>   reg = <0x0 0x8000>; /* 2GB */
>   };
>  
> - /* Fixed regulator used in the absence of PMIC */
> - sd_vcc: sd-vcc {
> + syspwr: regulator-5v0 {
>   compatible = "regulator-fixed";
> - regulator-name = "fixed-3.1V";
> - regulator-min-microvolt = <310>;
> - regulator-max-microvolt = <310>;
> + regulator-name = "SYSPWR";
> + regulator-min-microvolt = <500>;
> + regulator-max-microvolt = <500>;
>   regulator-always-on;
>   };
>  };
>  
> + {
> + cpu0-supply = <_cpu>;
> +};
> +
>   {
>   status = "okay";
>   pinctrl-names = "default";
>   pinctrl-0 = <_pins>;
> +
> + atc260x: pmic@65 {
> + compatible = "actions,atc2603c";
> + reg = <0x65>;
> + interrupt-parent = <>;
> + interrupts = <2 IRQ_TYPE_LEVEL_HIGH>;
> +
> + reset-time-sec = <6>;
> +
> + regulators {
> + compatible = "actions,atc2603c-regulator";
> +
> + dcdc1-supply = <>;
> + dcdc2-supply = <>;
> + dcdc3-supply = <>;
> + ldo1-supply = <>;
> + ldo2-supply = <>;
> + ldo3-supply = <>;
> + ldo5-supply = <>;
> + ldo6-supply = <>;
> + ldo7-supply = <>;
> + ldo8-supply = <>;
> + ldo11-supply = <>;
> + ldo12-supply = <>;
> + switchldo1-supply = <>;
> +
> + vdd_cpu: dcdc1 {
> + regulator-name = "VDD_CPU";
> + regulator-min-microvolt = <70>;
> + regulator-max-microvolt = <140>;
> + regulator-always-on;
> + };
> +
> + vddq: dcdc2 {
> + regulator-name = "VDDQ";
> + regulator-min-microvolt = <130>;
> + regulator-max-microvolt = <215>;
> + regulator-always-on;
> + regulator-boot-on;
> + };
> +
> + vcc: dcdc3 {
> + regulator-name = "VCC";
> + regulator-min-microvolt = <260>;
> + regulator-max-microvolt = <330>;
> + regulator-always-on;
> + };
> +
> + vcc_3v3: ldo1 {
> + regulator-name = "VCC_3V3";
> + regulator-min-microvolt = <260>;
> + regulator-max-microvolt = <330>;
> + regulator-always-on;
> + };
> +
> + avcc: ldo2 {
> + regulator-name = "AVCC";
> + regulator-min-microvolt = <260>;
> + regulator-max-microvolt = <330>;
> + regulator-always-on;
> + };
> +
> + vdd_1v8: ldo3 {
> + regulator-name = "VDD_1V8";
> + regulator-min-microvolt = <150>;
> + regulator-max-microvolt = <200>;
> + regulator-always-on;
> + };
> +
> + 

[PATCH] PCI: merge slot and bus reset implementations

2021-03-31 Thread Raphael Norwitz
Slot resets are bus resets with additional logic to prevent a device
from being removed during the reset. Currently slot and bus resets have
separate implementations in pci.c, complicating higher level logic. As
discussed on the mailing list, they should be combined into a generic
function which performs an SBR. This change adds a function,
pci_reset_bus_function(), which first attempts a slot reset and then
attempts a bus reset if -ENOTTY is returned, such that there is now a
single, device-agnostic function to perform an SBR.

This new function is also needed to add SBR reset quirks and therefore
is exposed in pci.h.
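
As a rough illustration of why the export is useful (hypothetical quirk, not
part of this patch), a device-specific reset quirk could then fall back to an
SBR through the single combined entry point:

    /* Hypothetical example only; not part of this patch. */
    static int example_reset_quirk(struct pci_dev *dev, int probe)
    {
            /* Device-specific preparation would go here. */

            /* Slot reset if available, otherwise a plain bus reset. */
            return pci_reset_bus_function(dev, probe);
    }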

Link: https://lkml.org/lkml/2021/3/23/911

Suggested-by: Alex Williamson 
Signed-off-by: Amey Narkhede 
Signed-off-by: Raphael Norwitz 
---
 drivers/pci/pci.c   | 17 +
 include/linux/pci.h |  1 +
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 16a17215f633..12a91af2ade4 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4982,6 +4982,13 @@ static int pci_dev_reset_slot_function(struct pci_dev 
*dev, int probe)
return pci_reset_hotplug_slot(dev->slot->hotplug, probe);
 }
 
+int pci_reset_bus_function(struct pci_dev *dev, int probe)
+{
+   int rc = pci_dev_reset_slot_function(dev, probe);
+
+   return (rc == -ENOTTY) ? pci_parent_bus_reset(dev, probe) : rc;
+}
+
 static void pci_dev_lock(struct pci_dev *dev)
 {
pci_cfg_access_lock(dev);
@@ -5102,10 +5109,7 @@ int __pci_reset_function_locked(struct pci_dev *dev)
rc = pci_pm_reset(dev, 0);
if (rc != -ENOTTY)
return rc;
-   rc = pci_dev_reset_slot_function(dev, 0);
-   if (rc != -ENOTTY)
-   return rc;
-   return pci_parent_bus_reset(dev, 0);
+   return pci_reset_bus_function(dev, 0);
 }
 EXPORT_SYMBOL_GPL(__pci_reset_function_locked);
 
@@ -5135,13 +5139,10 @@ int pci_probe_reset_function(struct pci_dev *dev)
if (rc != -ENOTTY)
return rc;
rc = pci_pm_reset(dev, 1);
-   if (rc != -ENOTTY)
-   return rc;
-   rc = pci_dev_reset_slot_function(dev, 1);
if (rc != -ENOTTY)
return rc;
 
-   return pci_parent_bus_reset(dev, 1);
+   return pci_reset_bus_function(dev, 1);
 }
 
 /**
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 86c799c97b77..979d54335ac1 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1228,6 +1228,7 @@ int pci_probe_reset_bus(struct pci_bus *bus);
 int pci_reset_bus(struct pci_dev *dev);
 void pci_reset_secondary_bus(struct pci_dev *dev);
 void pcibios_reset_secondary_bus(struct pci_dev *dev);
+int pci_reset_bus_function(struct pci_dev *dev, int probe);
 void pci_update_resource(struct pci_dev *dev, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int __must_check pci_reassign_resource(struct pci_dev *dev, int i, 
resource_size_t add_size, resource_size_t align);
-- 
2.20.1


Re: [PATCH v2 2/3] drivers/tty/serial/8250: add DT property for aspeed vuart sirq polarity

2021-03-31 Thread Andrew Jeffery



On Thu, 1 Apr 2021, at 15:48, Zev Weiss wrote:
> On Wed, Mar 31, 2021 at 11:15:44PM CDT, Joel Stanley wrote:
> >On Thu, 1 Apr 2021 at 00:57, Zev Weiss  wrote:
> >>
> >> This provides a simple boolean to use instead of the deprecated
> >> aspeed,sirq-polarity-sense property.
> >>
> >> Signed-off-by: Zev Weiss 
> >> ---
> >>  drivers/tty/serial/8250/8250_aspeed_vuart.c | 3 +++
> >>  1 file changed, 3 insertions(+)
> >>
> >> diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c 
> >> b/drivers/tty/serial/8250/8250_aspeed_vuart.c
> >> index c33e02cbde93..e5ef9f957f9a 100644
> >> --- a/drivers/tty/serial/8250/8250_aspeed_vuart.c
> >> +++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c
> >> @@ -482,6 +482,9 @@ static int aspeed_vuart_probe(struct platform_device 
> >> *pdev)
> >> of_node_put(sirq_polarity_sense_args.np);
> >> }
> >>
> >> +   if (of_property_read_bool(np, "aspeed,sirq-active-high"))
> >> +   aspeed_vuart_set_sirq_polarity(vuart, 1);
> >
> >This assumes the default is always low, so we don't need a property to
> >set it to that state?
> >
> >Would it make more sense to have the property describe if it's high or
> >low? (I'm happy for the answer to be "no", as we've gotten by for the
> >past few years without it).
> >
> 
> Yeah, that sounds like better way to approach it -- I think I'll 
> rearrange as Andrew suggested in 
> https://lore.kernel.org/openbmc/d66753ee-7db2-41e5-9fe5-762b1ab67...@www.fastmail.com/
> 
> >This brings up another point. We already have the sysfs file for
> >setting the lpc address, from userspace. In OpenBMC land this can be
> >set with obmc-console-client (/etc/obmc-console.conf). Should we add
> >support to that application for setting the irq polarity too, and do
> >away with device tree descriptions?
> >
> 
> I guess I might lean slightly toward keeping the DT description so that 
> if for whatever reason obmc-console-server flakes out and doesn't start 
> you're better positioned to try banging on /dev/ttyS* manually if you're 
> desperate.  Though I suppose that in turn might imply that I'm arguing 
> for adding DT properties for lpc_address and sirq too,

Why not just adopt exactly what I've done with KCS, where we have 
aspeed,lpc-io-reg and aspeed,lpc-interrupts?
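
For reference, on the VUART node that would look roughly like this (the
property names come from the KCS bindings; the I/O port, SerIRQ number and
polarity below are purely illustrative):

    &vuart {
            status = "okay";
            aspeed,lpc-io-reg = <0x3f8>;
            aspeed,lpc-interrupts = <4 IRQ_TYPE_LEVEL_LOW>;
    };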

Andrew


[RESEND PATCH v4 1/2] dt-bindings: drm/bridge: anx7625: Add power supplies

2021-03-31 Thread Hsin-Yi Wang
anx7625 requires 3 power supply regulators.

Signed-off-by: Hsin-Yi Wang 
Reviewed-by: Rob Herring 
Reviewed-by: Robert Foss 
---
v3->v4: rebase to drm-misc/for-linux-next
---
 .../bindings/display/bridge/analogix,anx7625.yaml | 15 +++
 1 file changed, 15 insertions(+)

diff --git 
a/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml 
b/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml
index c789784efe306..ab48ab2f4240d 100644
--- a/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml
@@ -34,6 +34,15 @@ properties:
 description: used for reset chip control, RESET_N pin B7.
 maxItems: 1
 
+  vdd10-supply:
+description: Regulator that provides the supply 1.0V power.
+
+  vdd18-supply:
+description: Regulator that provides the supply 1.8V power.
+
+  vdd33-supply:
+description: Regulator that provides the supply 3.3V power.
+
   ports:
 $ref: /schemas/graph.yaml#/properties/ports
 
@@ -55,6 +64,9 @@ properties:
 required:
   - compatible
   - reg
+  - vdd10-supply
+  - vdd18-supply
+  - vdd33-supply
   - ports
 
 additionalProperties: false
@@ -72,6 +84,9 @@ examples:
 reg = <0x58>;
 enable-gpios = < 45 GPIO_ACTIVE_HIGH>;
 reset-gpios = < 73 GPIO_ACTIVE_HIGH>;
+vdd10-supply = <_mipibrdg>;
+vdd18-supply = <_mipibrdg>;
+vdd33-supply = <_mipibrdg>;
 
 ports {
 #address-cells = <1>;
-- 
2.31.0.291.g576ba9dcdaf-goog



[RESEND PATCH v4 2/2] drm/bridge: anx7625: disable regulators when power off

2021-03-31 Thread Hsin-Yi Wang
When suspending the driver, anx7625_power_standby() will be called to
turn off reset-gpios and enable-gpios. However, power supplies are not
disabled. To save power, the driver can get the power supply regulators
and turn them off in anx7625_power_standby().

Signed-off-by: Hsin-Yi Wang 
Reviewed-by: Robert Foss 
Reviewed-by: Xin Ji 
---
 drivers/gpu/drm/bridge/analogix/anx7625.c | 34 +++
 drivers/gpu/drm/bridge/analogix/anx7625.h |  1 +
 2 files changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/bridge/analogix/anx7625.c 
b/drivers/gpu/drm/bridge/analogix/anx7625.c
index 65cc05982f826..23283ba0c4f93 100644
--- a/drivers/gpu/drm/bridge/analogix/anx7625.c
+++ b/drivers/gpu/drm/bridge/analogix/anx7625.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include <linux/regulator/consumer.h>
 #include 
 #include 
 #include 
@@ -875,12 +876,25 @@ static int sp_tx_edid_read(struct anx7625_data *ctx,
 static void anx7625_power_on(struct anx7625_data *ctx)
 {
struct device *dev = &ctx->client->dev;
+   int ret, i;
 
if (!ctx->pdata.low_power_mode) {
DRM_DEV_DEBUG_DRIVER(dev, "not low power mode!\n");
return;
}
 
+   for (i = 0; i < ARRAY_SIZE(ctx->pdata.supplies); i++) {
+   ret = regulator_enable(ctx->pdata.supplies[i].consumer);
+   if (ret < 0) {
+   DRM_DEV_DEBUG_DRIVER(dev, "cannot enable supply %d: 
%d\n",
+i, ret);
+   goto reg_err;
+   }
+   usleep_range(2000, 2100);
+   }
+
+   usleep_range(4000, 4100);
+
/* Power on pin enable */
gpiod_set_value(ctx->pdata.gpio_p_on, 1);
usleep_range(1, 11000);
@@ -889,11 +903,16 @@ static void anx7625_power_on(struct anx7625_data *ctx)
usleep_range(1, 11000);
 
DRM_DEV_DEBUG_DRIVER(dev, "power on !\n");
+   return;
+reg_err:
+   for (--i; i >= 0; i--)
+   regulator_disable(ctx->pdata.supplies[i].consumer);
 }
 
 static void anx7625_power_standby(struct anx7625_data *ctx)
 {
struct device *dev = &ctx->client->dev;
+   int ret;
 
if (!ctx->pdata.low_power_mode) {
DRM_DEV_DEBUG_DRIVER(dev, "not low power mode!\n");
@@ -904,6 +923,12 @@ static void anx7625_power_standby(struct anx7625_data *ctx)
usleep_range(1000, 1100);
gpiod_set_value(ctx->pdata.gpio_p_on, 0);
usleep_range(1000, 1100);
+
+   ret = regulator_bulk_disable(ARRAY_SIZE(ctx->pdata.supplies),
+ctx->pdata.supplies);
+   if (ret < 0)
+   DRM_DEV_DEBUG_DRIVER(dev, "cannot disable supplies %d\n", ret);
+
DRM_DEV_DEBUG_DRIVER(dev, "power down\n");
 }
 
@@ -1742,6 +1767,15 @@ static int anx7625_i2c_probe(struct i2c_client *client,
platform->client = client;
i2c_set_clientdata(client, platform);
 
+   pdata->supplies[0].supply = "vdd10";
+   pdata->supplies[1].supply = "vdd18";
+   pdata->supplies[2].supply = "vdd33";
+   ret = devm_regulator_bulk_get(dev, ARRAY_SIZE(pdata->supplies),
+ pdata->supplies);
+   if (ret) {
+   DRM_DEV_ERROR(dev, "fail to get power supplies: %d\n", ret);
+   return ret;
+   }
anx7625_init_gpio(platform);
 
atomic_set(&platform->power_status, 0);
diff --git a/drivers/gpu/drm/bridge/analogix/anx7625.h 
b/drivers/gpu/drm/bridge/analogix/anx7625.h
index 193ad86c54503..e4a086b3a3d7b 100644
--- a/drivers/gpu/drm/bridge/analogix/anx7625.h
+++ b/drivers/gpu/drm/bridge/analogix/anx7625.h
@@ -350,6 +350,7 @@ struct s_edid_data {
 struct anx7625_platform_data {
struct gpio_desc *gpio_p_on;
struct gpio_desc *gpio_reset;
+   struct regulator_bulk_data supplies[3];
struct drm_bridge *panel_bridge;
int intp_irq;
u32 low_power_mode;
-- 
2.31.0.291.g576ba9dcdaf-goog



Re: [PATCH 1/1] arm: dts: owl-s500-roseapplepi: Add ATC2603C PMIC

2021-03-31 Thread Manivannan Sadhasivam
On Fri, Mar 12, 2021 at 11:49:27AM +0200, Cristian Ciocaltea wrote:
> Add device tree node for ATC2603C PMIC and remove the 'fixed-3.1V'
> dummy regulator used for the uSD supply.
> 
> Additionally, add 'SYSPWR' fixed regulator and provide cpu0 supply.
> 
> Signed-off-by: Cristian Ciocaltea 

Reviewed-by: Manivannan Sadhasivam 

Thanks,
Mani

> ---
> Please note the patch depends on the ATC260x PMIC support which is queued
> for merging in v5.13:
> 
> https://lore.kernel.org/lkml/cover.1611653995.git.cristian.ciocal...@gmail.com/
> https://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git/log/?h=for-mfd-next=range=a38fd8748464831584a19438cbb3082b5a2dab15..eac013a0b7041f5cfc8feedf429a767675350102
> 
>  arch/arm/boot/dts/owl-s500-roseapplepi.dts | 132 -
>  1 file changed, 126 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm/boot/dts/owl-s500-roseapplepi.dts 
> b/arch/arm/boot/dts/owl-s500-roseapplepi.dts
> index ff91561ca99c..b8c5db2344aa 100644
> --- a/arch/arm/boot/dts/owl-s500-roseapplepi.dts
> +++ b/arch/arm/boot/dts/owl-s500-roseapplepi.dts
> @@ -2,7 +2,7 @@
>  /*
>   * Roseapple Pi
>   *
> - * Copyright (C) 2020 Cristian Ciocaltea 
> + * Copyright (C) 2020-2021 Cristian Ciocaltea 
>   */
>  
>  /dts-v1/;
> @@ -27,20 +27,140 @@ memory@0 {
>   reg = <0x0 0x8000>; /* 2GB */
>   };
>  
> - /* Fixed regulator used in the absence of PMIC */
> - sd_vcc: sd-vcc {
> + syspwr: regulator-5v0 {
>   compatible = "regulator-fixed";
> - regulator-name = "fixed-3.1V";
> - regulator-min-microvolt = <310>;
> - regulator-max-microvolt = <310>;
> + regulator-name = "SYSPWR";
> + regulator-min-microvolt = <500>;
> + regulator-max-microvolt = <500>;
>   regulator-always-on;
>   };
>  };
>  
> + {
> + cpu0-supply = <_cpu>;
> +};
> +
>   {
>   status = "okay";
>   pinctrl-names = "default";
>   pinctrl-0 = <_pins>;
> +
> + atc260x: pmic@65 {
> + compatible = "actions,atc2603c";
> + reg = <0x65>;
> + interrupt-parent = <>;
> + interrupts = <2 IRQ_TYPE_LEVEL_HIGH>;
> +
> + reset-time-sec = <6>;
> +
> + regulators {
> + compatible = "actions,atc2603c-regulator";
> +
> + dcdc1-supply = <>;
> + dcdc2-supply = <>;
> + dcdc3-supply = <>;
> + ldo1-supply = <>;
> + ldo2-supply = <>;
> + ldo3-supply = <>;
> + ldo5-supply = <>;
> + ldo6-supply = <>;
> + ldo7-supply = <>;
> + ldo8-supply = <>;
> + ldo11-supply = <>;
> + ldo12-supply = <>;
> + switchldo1-supply = <>;
> +
> + vdd_cpu: dcdc1 {
> + regulator-name = "VDD_CPU";
> + regulator-min-microvolt = <70>;
> + regulator-max-microvolt = <140>;
> + regulator-always-on;
> + };
> +
> + vddq: dcdc2 {
> + regulator-name = "VDDQ";
> + regulator-min-microvolt = <130>;
> + regulator-max-microvolt = <215>;
> + regulator-always-on;
> + regulator-boot-on;
> + };
> +
> + vcc: dcdc3 {
> + regulator-name = "VCC";
> + regulator-min-microvolt = <260>;
> + regulator-max-microvolt = <330>;
> + regulator-always-on;
> + };
> +
> + vcc_3v3: ldo1 {
> + regulator-name = "VCC_3V3";
> + regulator-min-microvolt = <260>;
> + regulator-max-microvolt = <330>;
> + regulator-always-on;
> + };
> +
> + avcc: ldo2 {
> + regulator-name = "AVCC";
> + regulator-min-microvolt = <260>;
> + regulator-max-microvolt = <330>;
> + regulator-always-on;
> + };
> +
> + vdd_1v8: ldo3 {
> + regulator-name = "VDD_1V8";
> + regulator-min-microvolt = <150>;
> + regulator-max-microvolt = <200>;
> + regulator-always-on;
> + };
> +
> + vcc_3v1: ldo5 {
> + regulator-name = "VCC_3V1";
> 

Re: [PATCH v2 0/6] Add support for Actions Semi Owl socinfo

2021-03-31 Thread Manivannan Sadhasivam
On Tue, Mar 30, 2021 at 04:48:15PM +0300, Cristian Ciocaltea wrote:
> This patchset adds a socinfo driver which provides information about
> Actions Semi Owl SoCs to user space via sysfs: machine, family, soc_id,
> serial_number.
> 
> Please note the serial number is currently available only for the S500
> SoC variant.
> 
> This has been tested on the S500 SoC based RoseapplePi SBC.
> 

Is this the soc_id provided by the vendor bootloader (U-Boot)? If so, on
what basis does it provide it? I don't think the SoC has a provision for
deriving the soc_id from HW parameters.

Thanks,
Mani

> Thanks,
> Cristi
> 
> Changes in v2:
>  - Exposed the memory range for reading the SoC serial number under
>/reserved-memory DT node, according to Rob's review; as a consequence
>added a new binding document (actions,owl-soc-serial.yaml) and updated
>owl-socinfo.yaml
> 
>  - Replaced the unportable usage of system_serial_{low,high} globals
>with a public API to provide external access to SoC serial number
>parts (e.g. Owl Ethernet MAC driver will use this to generate a
>stable MAC address)
> 
>  - Rebased patch series on v5.12-rc5
> 
> Cristian Ciocaltea (6):
>   dt-bindings: reserved-memory: Add Owl SoC serial number binding
>   dt-bindings: soc: actions: Add Actions Semi Owl socinfo binding
>   soc: actions: Add Actions Semi Owl socinfo driver
>   arm: dts: owl-s500: Add reserved-memory range for Owl SoC serial
> number
>   arm: dts: owl-s500: Add socinfo support
>   MAINTAINERS: Add entries for Owl reserved-memory and socinfo bindings
> 
>  .../actions,owl-soc-serial.yaml   |  53 ++
>  .../bindings/soc/actions/owl-socinfo.yaml |  57 +++
>  MAINTAINERS   |   2 +
>  arch/arm/boot/dts/owl-s500.dtsi   |  13 +-
>  drivers/soc/actions/Kconfig   |   8 +
>  drivers/soc/actions/Makefile  |   1 +
>  drivers/soc/actions/owl-socinfo.c | 152 ++
>  include/linux/soc/actions/owl-serial-number.h |  20 +++
>  8 files changed, 305 insertions(+), 1 deletion(-)
>  create mode 100644 
> Documentation/devicetree/bindings/reserved-memory/actions,owl-soc-serial.yaml
>  create mode 100644 
> Documentation/devicetree/bindings/soc/actions/owl-socinfo.yaml
>  create mode 100644 drivers/soc/actions/owl-socinfo.c
>  create mode 100644 include/linux/soc/actions/owl-serial-number.h
> 
> -- 
> 2.31.1
> 


[PATCH 1/2] x86/sgx: Do not update sgx_nr_free_pages in sgx_setup_epc_section()

2021-03-31 Thread Jarkko Sakkinen
Now that the sanitization process makes pages available by calling
sgx_free_epc_page(), sgx_setup_epc_section() should not touch
sgx_nr_free_pages. Otherwise sgx_nr_free_pages ends up containing 2x the
number of actual free pages. Simply remove the statement.
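
In other words, each EPC page was being counted twice: once at section setup
and once again when the sanitizer freed it. Roughly (a simplified sketch of
the two paths, not a verbatim quote of the code):

    /* sgx_setup_epc_section(), before this patch: */
    sgx_nr_free_pages += nr_pages;      /* counted here ...              */

    /* ksgxd sanitization, for each page on sgx_dirty_page_list: */
    sgx_free_epc_page(page);            /* ... and again in here, which  */
                                        /* also bumps sgx_nr_free_pages. */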

Fixes: 51ab30eb2ad4 ("x86/sgx: Replace section->init_laundry_list with 
sgx_dirty_page_list")
Signed-off-by: Jarkko Sakkinen 
---
 arch/x86/kernel/cpu/sgx/main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 13a7599ce7d4..7df7048cb1c9 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -657,7 +657,6 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 
size,
list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
 
-   sgx_nr_free_pages += nr_pages;
return true;
 }
 
-- 
2.31.1



[PATCH 2/2] x86/sgx: Add sgx_nr_{all, free}_pages to the debugfs

2021-03-31 Thread Jarkko Sakkinen
Add two debugfs attributes:

* /sys/kernel/debug/x86/sgx_nr_all_pages
* /sys/kernel/debug/x86/sgx_nr_free_pages

These provide useful statistics for testing purposes.

E.g. on a NUC7CJYH2, when no enclaves are running, and EPC set to 32 MB:

$ sudo cat /sys/kernel/debug/x86/sgx_nr_all_pages
5632

$ sudo cat /sys/kernel/debug/x86/sgx_nr_free_pages
5632

Signed-off-by: Jarkko Sakkinen 
---
 arch/x86/kernel/cpu/sgx/main.c | 53 +-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 7df7048cb1c9..190c96735c9f 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
+#include <linux/debugfs.h>
 #include 
 #include 
 #include 
@@ -25,7 +26,10 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
 static LIST_HEAD(sgx_active_page_list);
 static DEFINE_SPINLOCK(sgx_reclaimer_lock);
 
-/* The free page list lock protected variables prepend the lock. */
+/* The number of EPC pages in total in all nodes. */
+static unsigned long sgx_nr_all_pages;
+
+/* The number of free EPC pages in all nodes. */
 static unsigned long sgx_nr_free_pages;
 
 /* Nodes with one or more EPC sections. */
@@ -657,6 +661,8 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 
size,
list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
 
+   sgx_nr_all_pages += nr_pages;
+
return true;
 }
 
@@ -730,6 +736,44 @@ static bool __init sgx_page_cache_init(void)
return true;
 }
 
+#ifdef CONFIG_DEBUG_FS
+static ssize_t sgx_nr_all_pages_read(struct file *file, char __user *user_buf,
+size_t count, loff_t *pos)
+{
+   char kernel_buf[32];
+   int len;
+
+   len = snprintf(kernel_buf, sizeof(kernel_buf), "%lu\n", 
sgx_nr_all_pages);
+   if (len < 0)
+   return len;
+
+   return simple_read_from_buffer(user_buf, count, pos, kernel_buf, len);
+}
+
+static const struct file_operations sgx_nr_all_pages_fops = {
+   .read = sgx_nr_all_pages_read,
+   .llseek = default_llseek,
+};
+
+static ssize_t sgx_nr_free_pages_read(struct file *file, char __user *user_buf,
+size_t count, loff_t *pos)
+{
+   char kernel_buf[32];
+   int len;
+
+   len = snprintf(kernel_buf, sizeof(kernel_buf), "%lu\n", 
sgx_nr_free_pages);
+   if (len < 0)
+   return len;
+
+   return simple_read_from_buffer(user_buf, count, pos, kernel_buf, len);
+}
+
+static const struct file_operations sgx_nr_free_pages_fops = {
+   .read = sgx_nr_free_pages_read,
+   .llseek = default_llseek,
+};
+#endif /* CONFIG_DEBUG_FS */
+
 static int __init sgx_init(void)
 {
int ret;
@@ -750,6 +794,13 @@ static int __init sgx_init(void)
if (ret)
goto err_kthread;
 
+#ifdef CONFIG_DEBUG_FS
+   debugfs_create_file("sgx_nr_all_pages", 0400, arch_debugfs_dir, NULL,
+   &sgx_nr_all_pages_fops);
+   debugfs_create_file("sgx_nr_free_pages", 0400, arch_debugfs_dir, NULL,
+   &sgx_nr_free_pages_fops);
+#endif /* CONFIG_DEBUG_FS */
+
return 0;
 
 err_kthread:
-- 
2.31.1



Re: [PATCH v2 2/3] drivers/tty/serial/8250: add DT property for aspeed vuart sirq polarity

2021-03-31 Thread Zev Weiss

On Wed, Mar 31, 2021 at 11:15:44PM CDT, Joel Stanley wrote:

On Thu, 1 Apr 2021 at 00:57, Zev Weiss  wrote:


This provides a simple boolean to use instead of the deprecated
aspeed,sirq-polarity-sense property.

Signed-off-by: Zev Weiss 
---
 drivers/tty/serial/8250/8250_aspeed_vuart.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c 
b/drivers/tty/serial/8250/8250_aspeed_vuart.c
index c33e02cbde93..e5ef9f957f9a 100644
--- a/drivers/tty/serial/8250/8250_aspeed_vuart.c
+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c
@@ -482,6 +482,9 @@ static int aspeed_vuart_probe(struct platform_device *pdev)
of_node_put(sirq_polarity_sense_args.np);
}

+   if (of_property_read_bool(np, "aspeed,sirq-active-high"))
+   aspeed_vuart_set_sirq_polarity(vuart, 1);


This assumes the default is always low, so we don't need a property to
set it to that state?

Would it make more sense to have the property describe if it's high or
low? (I'm happy for the answer to be "no", as we've gotten by for the
past few years without it).



Yeah, that sounds like a better way to approach it -- I think I'll
rearrange as Andrew suggested in 
https://lore.kernel.org/openbmc/d66753ee-7db2-41e5-9fe5-762b1ab67...@www.fastmail.com/



This brings up another point. We already have the sysfs file for
setting the lpc address, from userspace. In OpenBMC land this can be
set with obmc-console-client (/etc/obmc-console.conf). Should we add
support to that application for setting the irq polarity too, and do
away with device tree descriptions?



I guess I might lean slightly toward keeping the DT description so that 
if for whatever reason obmc-console-server flakes out and doesn't start 
you're better positioned to try banging on /dev/ttyS* manually if you're 
desperate.  Though I suppose that in turn might imply that I'm arguing 
for adding DT properties for lpc_address and sirq too, and if you're 
really that desperate you can just fiddle with sysfs anyway, so...shrug?  
I could be convinced either way fairly easily.



Zev



Re: [PATCH v3] sysfs: Unconditionally use vmalloc for buffer

2021-03-31 Thread Greg Kroah-Hartman
On Wed, Mar 31, 2021 at 07:21:45PM -0700, Kees Cook wrote:
> The sysfs interface to seq_file continues to be rather fragile
> (seq_get_buf() should not be used outside of seq_file), as seen with
> some recent exploits[1]. Move the seq_file buffer to the vmap area
> (while retaining the accounting flag), since it has guard pages that
> will catch and stop linear overflows. This seems justified given that
> sysfs's use of seq_file already uses kvmalloc(), is almost always using
> a PAGE_SIZE or larger allocation, has normally short-lived allocations,
> and is not normally on a performance critical path.
> 
> Once seq_get_buf() has been removed (and all sysfs callbacks using
> seq_file directly), this change can also be removed.
> 
> [1] https://blog.grimm-co.com/2021/03/new-old-bugs-in-linux-kernel.html
> 
> Signed-off-by: Kees Cook 
> ---
> v3:
> - Limit to only sysfs (instead of all of seq_file).
> v2: 
> https://lore.kernel.org/lkml/20210315174851.68-1-keesc...@chromium.org/
> v1: 
> https://lore.kernel.org/lkml/20210312205558.2947488-1-keesc...@chromium.org/
> ---
>  fs/sysfs/file.c | 23 +++
>  1 file changed, 23 insertions(+)
> 
> diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
> index 9aefa7779b29..70e7a450e5d1 100644
> --- a/fs/sysfs/file.c
> +++ b/fs/sysfs/file.c
> @@ -16,6 +16,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "sysfs.h"
>  
> @@ -32,6 +33,25 @@ static const struct sysfs_ops *sysfs_file_ops(struct 
> kernfs_node *kn)
>   return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
>  }
>  
> +/*
> + * To be proactively defensive against sysfs show() handlers that do not
> + * correctly stay within their PAGE_SIZE buffer, use the vmap area to gain
> + * the trailing guard page which will stop linear buffer overflows.
> + */
> +static void *sysfs_kf_seq_start(struct seq_file *sf, loff_t *ppos)
> +{
> + struct kernfs_open_file *of = sf->private;
> + struct kernfs_node *kn = of->kn;
> +
> + WARN_ON_ONCE(sf->buf);

How can buf ever not be NULL?  And if it is, we will leak memory on the
next line, so we shouldn't have _ONCE; we should always know, but not
rebooting the machine would be nice.

> + sf->buf = __vmalloc(kn->attr.size, GFP_KERNEL_ACCOUNT);
> + if (!sf->buf)
> + return ERR_PTR(-ENOMEM);
> + sf->size = kn->attr.size;
> +
> + return NULL + !*ppos;
> +}

Will this also cause the vmalloc fragmentation/abuse that others have
mentioned, since userspace can trigger this?

And what code frees it?

thanks,

greg k-h


Re: [PATCH v5 00/27] Memory Folios

2021-03-31 Thread Al Viro
On Tue, Mar 30, 2021 at 10:09:29PM +0100, Matthew Wilcox wrote:

> That's a very Intel-centric way of looking at it.  Other architectures
> support a multitude of page sizes, from the insane ia64 (4k, 8k, 16k, then
> every power of four up to 4GB) to more reasonable options like (4k, 32k,
> 256k, 2M, 16M, 128M).  But we (in software) shouldn't constrain ourselves
> to thinking in terms of what the hardware currently supports.  Google
> have data showing that for their workloads, 32kB is the goldilocks size.
> I'm sure for some workloads, it's much higher and for others it's lower.
> But for almost no workload is 4kB the right choice any more, and probably
> hasn't been since the late 90s.

Out of curiosity I looked at the distribution of file sizes in the
kernel tree:
71455 files total
0--4Kb        36702
4--8Kb        11820
8--16Kb       10066
16--32Kb       6984
32--64Kb       3804
64--128Kb      1498
128--256Kb      393
256--512Kb      108
512Kb--1Mb       35
1--2Mb           25
2--4Mb            5
4--6Mb            7
6--8Mb            4
12Mb              2
14Mb              1
16Mb              1

... incidentally, everything bigger than 1.2Mb lives^Wshambles under
drivers/gpu/drm/amd/include/asic_reg/

Page size    Footprint
4Kb            1128Mb
8Kb            1324Mb
16Kb           1764Mb
32Kb           2739Mb
64Kb           4832Mb
128Kb          9191Mb
256Kb         18062Mb
512Kb         35883Mb
1Mb           71570Mb
2Mb          142958Mb

So for kernel builds (as well as grep over the tree, etc.) uniform 2Mb pages
would be... interesting.


Re: [PATCH v2] kernel/resource: Fix locking in request_free_mem_region

2021-03-31 Thread Alistair Popple
On Thursday, 1 April 2021 3:56:05 PM AEDT Muchun Song wrote:
> 
> On Fri, Mar 26, 2021 at 9:22 AM Alistair Popple  wrote:
> >
> > request_free_mem_region() is used to find an empty range of physical
> > addresses for hotplugging ZONE_DEVICE memory. It does this by iterating
> > over the range of possible addresses using region_intersects() to see if
> > the range is free.
> >
> > region_intersects() obtains a read lock before walking the resource tree
> > to protect against concurrent changes. However it drops the lock prior
> > to returning. This means by the time request_mem_region() is called in
> > request_free_mem_region() another thread may have already reserved the
> > requested region resulting in unexpected failures and a message in the
> > kernel log from hitting this condition:
> >
> > /*
> >  * mm/hmm.c reserves physical addresses which then
> >  * become unavailable to other users.  Conflicts are
> >  * not expected.  Warn to aid debugging if encountered.
> >  */
> > if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
> > pr_warn("Unaddressable device %s %pR conflicts with %pR",
> > conflict->name, conflict, res);
> >
> > To fix this create versions of region_intersects() and
> > request_mem_region() that allow the caller to take the appropriate lock
> > such that it may be held over the required calls.
> >
> > Instead of creating another version of devm_request_mem_region() that
> > doesn't take the lock open-code it to allow the caller to pre-allocate
> > the required memory prior to taking the lock.
> >
> > Fixes: 0c385190392d8 ("resource: add a not device managed 
request_free_mem_region variant")
> > Fixes: 0092908d16c60 ("mm: factor out a devm_request_free_mem_region 
helper")
> > Fixes: 4ef589dc9b10c ("mm/hmm/devmem: device memory hotplug using 
ZONE_DEVICE")
> > Signed-off-by: Alistair Popple 
> 
> +cc my email (songmuc...@bytedance.com).
> 
> Hi Alistair,
> 
> Thanks for your patch. But there is a deadlock that should be fixed.
> Please see the following scenario.
> 
> __request_region
>   write_lock(&resource_lock)
>   request_region_locked
>     revoke_iomem
>       devmem_is_allowed (arch/x86/mm/init.c)
>         region_intersects
>           read_lock(&resource_lock)   // deadlock

Thanks for the report and apologies for the breakage. The kernel test robot 
caught it pretty quickly - see https://lore.kernel.org/linux-mm/
20210330003842.18948-1-apop...@nvidia.com/ for an updated version that fixes 
this.

 - Alistair

> >
> > ---
> >
> > v2:
> >  - Added Fixes tag
> >
> > ---
> >  kernel/resource.c | 146 +-
> >  1 file changed, 94 insertions(+), 52 deletions(-)
> >
> > diff --git a/kernel/resource.c b/kernel/resource.c
> > index 627e61b0c124..2d4652383dd2 100644
> > --- a/kernel/resource.c
> > +++ b/kernel/resource.c
> > @@ -523,6 +523,34 @@ int __weak page_is_ram(unsigned long pfn)
> >  }
> >  EXPORT_SYMBOL_GPL(page_is_ram);
> >
> > +static int __region_intersects(resource_size_t start, size_t size,
> > +  unsigned long flags, unsigned long desc)
> > +{
> > +   struct resource res;
> > +   int type = 0; int other = 0;
> > +   struct resource *p;
> > +
> > +   res.start = start;
> > +   res.end = start + size - 1;
> > +
> > +   for (p = iomem_resource.child; p ; p = p->sibling) {
> > +   bool is_type = (((p->flags & flags) == flags) &&
> > +   ((desc == IORES_DESC_NONE) ||
> > +(desc == p->desc)));
> > +
> > +   if (resource_overlaps(p, ))
> > +   is_type ? type++ : other++;
> > +   }
> > +
> > +   if (type == 0)
> > +   return REGION_DISJOINT;
> > +
> > +   if (other == 0)
> > +   return REGION_INTERSECTS;
> > +
> > +   return REGION_MIXED;
> > +}
> > +
> >  /**
> >   * region_intersects() - determine intersection of region with known 
resources
> >   * @start: region start address
> > @@ -546,31 +574,12 @@ EXPORT_SYMBOL_GPL(page_is_ram);
> >  int region_intersects(resource_size_t start, size_t size, unsigned long 
flags,
> >   unsigned long desc)
> >  {
> > -   struct resource res;
> > -   int type = 0; int other = 0;
> > -   struct resource *p;
> > -
> > -   res.start = start;
> > -   res.end = start + size - 1;
> > +   int rc;
> >
> > read_lock(_lock);
> > -   for (p = iomem_resource.child; p ; p = p->sibling) {
> > -   bool is_type = (((p->flags & flags) == flags) &&
> > -   ((desc == IORES_DESC_NONE) ||
> > -(desc == p->desc)));
> > -
> > -   if (resource_overlaps(p, ))
> > -   is_type ? type++ : 

Re: [PATCH v1 1/4] docs: make reporting-issues.rst official and delete reporting-bugs.rst

2021-03-31 Thread Alex Shi



On 2021/3/31 4:33 PM, Wu X.C. wrote:
> Cc Alex Shi's new email 
> 
> On Tue, Mar 30, 2021 at 04:13:04PM +0200, Thorsten Leemhuis wrote:
>> Removing Documentation/admin-guide/reporting-bugs.rst will break links
>> in some of the translations. I was unsure if simply changing them to
>> Documentation/admin-guide/reporting-issue.rst was wise, so I didn't

A bit of late info won't hurt anything; people will update them soon if
they care.

>> touch anything for now and CCed the maintainers for the Chinese and
>> Italian translation. I couldn't find one for the Japanse translation.
>>
>> Please advice. For completeness, this are the places where things will
>> break afaics:
>>
>> $ grep -ri 'reporting-bugs.rst' Documentation/
>> Documentation/translations/zh_CN/SecurityBugs:是有帮助的信息,那就请重温一下admin-guide/reporting-bugs.rst文件中的概述过程。任
>> Documentation/translations/zh_CN/process/howto.rst:内核源码主目录中的:ref:`admin-guide/reporting-bugs.rst
>>  `
>> Documentation/translations/zh_CN/admin-guide/reporting-issues.rst:   
>> 本文档将取代“Documentation/admin-guide/reporting-bugs.rst”。主要的工作
>> Documentation/translations/zh_CN/admin-guide/reporting-issues.rst:   
>> “Documentation/admin-guide/reporting-bugs.rst”中的旧文字非常相似。它和它
> 
> Yeah, as Greg said, we will solve that after you patches be merged in next
> tree. Since I have translate the zh reporting-issues.rst in the next tree,
> will correct the link when I sync it with your new version. May cause 
> Warning for some days, but don't worry about it.

Yes, and thanks for the generous commitment!

thanks
Alex


Re: [PATCH v2 3/3] dt-bindings: serial: 8250: add aspeed,sirq-active-high

2021-03-31 Thread Zev Weiss

On Wed, Mar 31, 2021 at 11:04:44PM CDT, Andrew Jeffery wrote:



On Thu, 1 Apr 2021, at 11:27, Zev Weiss wrote:

This provides a simpler, more direct alternative to the deprecated
aspeed,sirq-polarity-sense property for indicating the polarity of
the Aspeed VUART's SIRQ line.

Signed-off-by: Zev Weiss 
---
 Documentation/devicetree/bindings/serial/8250.yaml | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/serial/8250.yaml
b/Documentation/devicetree/bindings/serial/8250.yaml
index 491b9297432d..e79bb6ab9d2c 100644
--- a/Documentation/devicetree/bindings/serial/8250.yaml
+++ b/Documentation/devicetree/bindings/serial/8250.yaml
@@ -12,8 +12,9 @@ maintainers:
 allOf:
   - $ref: /schemas/serial.yaml#
   - if:
-  required:
-- aspeed,sirq-polarity-sense
+  anyOf:
+- required: [ aspeed,sirq-active-high ]


Do you think we could make use of the approach I put forward here?

https://lore.kernel.org/openbmc/20210319062752.145730-18-and...@aj.id.au/T/#u

Andrew


If you mean using a u32 property (say aspeed,sirq-polarity) with an 
explicit IRQ_TYPE_LEVEL_{LOW,HIGH} instead of a present/absent bool,
sure, I guess that seems like a somewhat clearer, more orthogonal 
arrangement.
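
Concretely, a minimal sketch of that arrangement (node label and value purely
illustrative, and the property name is of course still up for discussion):

    &vuart {
            status = "okay";
            aspeed,sirq-polarity = <IRQ_TYPE_LEVEL_HIGH>;
    };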



Zev



Re: [PATCH v2] kernel/resource: Fix locking in request_free_mem_region

2021-03-31 Thread Muchun Song
On Fri, Mar 26, 2021 at 9:22 AM Alistair Popple  wrote:
>
> request_free_mem_region() is used to find an empty range of physical
> addresses for hotplugging ZONE_DEVICE memory. It does this by iterating
> over the range of possible addresses using region_intersects() to see if
> the range is free.
>
> region_intersects() obtains a read lock before walking the resource tree
> to protect against concurrent changes. However it drops the lock prior
> to returning. This means by the time request_mem_region() is called in
> request_free_mem_region() another thread may have already reserved the
> requested region resulting in unexpected failures and a message in the
> kernel log from hitting this condition:
>
> /*
>  * mm/hmm.c reserves physical addresses which then
>  * become unavailable to other users.  Conflicts are
>  * not expected.  Warn to aid debugging if encountered.
>  */
> if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
> pr_warn("Unaddressable device %s %pR conflicts with %pR",
> conflict->name, conflict, res);
>
> To fix this create versions of region_intersects() and
> request_mem_region() that allow the caller to take the appropriate lock
> such that it may be held over the required calls.
>
> Instead of creating another version of devm_request_mem_region() that
> doesn't take the lock open-code it to allow the caller to pre-allocate
> the required memory prior to taking the lock.
>
> Fixes: 0c385190392d8 ("resource: add a not device managed 
> request_free_mem_region variant")
> Fixes: 0092908d16c60 ("mm: factor out a devm_request_free_mem_region helper")
> Fixes: 4ef589dc9b10c ("mm/hmm/devmem: device memory hotplug using 
> ZONE_DEVICE")
> Signed-off-by: Alistair Popple 

+cc my email (songmuc...@bytedance.com).

Hi Alistair,

Thanks for your patch. But there is a deadlock that should be fixed.
Please see the following scenario.

__request_region
  write_lock(&resource_lock)
  request_region_locked
    revoke_iomem
      devmem_is_allowed (arch/x86/mm/init.c)
        region_intersects
          read_lock(&resource_lock)   // deadlock


>
> ---
>
> v2:
>  - Added Fixes tag
>
> ---
>  kernel/resource.c | 146 +-
>  1 file changed, 94 insertions(+), 52 deletions(-)
>
> diff --git a/kernel/resource.c b/kernel/resource.c
> index 627e61b0c124..2d4652383dd2 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -523,6 +523,34 @@ int __weak page_is_ram(unsigned long pfn)
>  }
>  EXPORT_SYMBOL_GPL(page_is_ram);
>
> +static int __region_intersects(resource_size_t start, size_t size,
> +  unsigned long flags, unsigned long desc)
> +{
> +   struct resource res;
> +   int type = 0; int other = 0;
> +   struct resource *p;
> +
> +   res.start = start;
> +   res.end = start + size - 1;
> +
> +   for (p = iomem_resource.child; p ; p = p->sibling) {
> +   bool is_type = (((p->flags & flags) == flags) &&
> +   ((desc == IORES_DESC_NONE) ||
> +(desc == p->desc)));
> +
> +   if (resource_overlaps(p, ))
> +   is_type ? type++ : other++;
> +   }
> +
> +   if (type == 0)
> +   return REGION_DISJOINT;
> +
> +   if (other == 0)
> +   return REGION_INTERSECTS;
> +
> +   return REGION_MIXED;
> +}
> +
>  /**
>   * region_intersects() - determine intersection of region with known 
> resources
>   * @start: region start address
> @@ -546,31 +574,12 @@ EXPORT_SYMBOL_GPL(page_is_ram);
>  int region_intersects(resource_size_t start, size_t size, unsigned long 
> flags,
>   unsigned long desc)
>  {
> -   struct resource res;
> -   int type = 0; int other = 0;
> -   struct resource *p;
> -
> -   res.start = start;
> -   res.end = start + size - 1;
> +   int rc;
>
> read_lock(_lock);
> -   for (p = iomem_resource.child; p ; p = p->sibling) {
> -   bool is_type = (((p->flags & flags) == flags) &&
> -   ((desc == IORES_DESC_NONE) ||
> -(desc == p->desc)));
> -
> -   if (resource_overlaps(p, ))
> -   is_type ? type++ : other++;
> -   }
> +   rc = __region_intersects(start, size, flags, desc);
> read_unlock(_lock);
> -
> -   if (type == 0)
> -   return REGION_DISJOINT;
> -
> -   if (other == 0)
> -   return REGION_INTERSECTS;
> -
> -   return REGION_MIXED;
> +   return rc;
>  }
>  EXPORT_SYMBOL_GPL(region_intersects);
>
> @@ -1171,31 +1180,17 @@ struct address_space *iomem_get_mapping(void)
> return smp_load_acquire(_inode)->i_mapping;
>  }
>
> -/**
> - * __request_region - create a new busy resource region
> - * @parent: parent resource 

[PATCH v5 1/1] x86/tdx: Handle MWAIT, MONITOR and WBINVD

2021-03-31 Thread Kuppuswamy Sathyanarayanan
When running as a TDX guest, there are a number of existing,
privileged instructions that do not work. If the guest kernel
uses these instructions, the hardware generates a #VE.

You can find the list of unsupported instructions in Intel
Trust Domain Extensions (Intel® TDX) Module specification,
sec 9.2.2 and in Guest-Host Communication Interface (GHCI)
Specification for Intel TDX, sec 2.4.1.
   
To prevent the TD guest from using these unsupported instructions,
the following measures are taken:

1. For the MWAIT/MONITOR instructions, support is already disabled
by the TDX module (SEAM), so the CPUID flags for these instructions
should be in the disabled state. Also, just to be sure that these
instructions are disabled, forcefully unset the X86_FEATURE_MWAIT
CPU cap in the OS.

2. For the WBINVD instruction, audit the in-kernel code that uses
this instruction and disable those uses for TD guests.

After the above-mentioned preventive measures, if a TD guest still
executes these instructions, print appropriate warning messages in
the #VE handler.

Signed-off-by: Kuppuswamy Sathyanarayanan 

Reviewed-by: Andi Kleen 
---

Changes since v4:
 * Fixed commit log and comments as per Dave's comments
 * Used WARN_ONCE for MWAIT/MONITOR #VE.
 * Removed X86_FEATURE_MWAIT suppression code.

Changes since v3:
 * WARN user if SEAM does not disable MONITOR/MWAIT instruction.
 * Fix the commit log and comments to address review comments from
   from Dave & Sean.

Changes since v2:
 * Added BUG() for WBINVD, WARN for MONITOR instructions.
 * Fixed comments as per Dave's review.

Changes since v1:
 * Added WARN() for MWAIT #VE exception.

Changes since previous series:
 * Suppressed MWAIT feature as per Andi's comment.
 * Added warning debug log for MWAIT #VE exception.

 arch/x86/kernel/tdx.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index e936b2f88bf6..9bc84caf4096 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -362,6 +362,22 @@ int tdg_handle_virtualization_exception(struct pt_regs 
*regs,
case EXIT_REASON_EPT_VIOLATION:
ve->instr_len = tdg_handle_mmio(regs, ve);
break;
+   case EXIT_REASON_WBINVD:
+   /*
+* WBINVD is not supported inside TDX guests. All in-
+* kernel uses should have been disabled.
+*/
+   pr_err("TD Guest used unsupported WBINVD instruction\n");
+   BUG();
+   break;
+   case EXIT_REASON_MONITOR_INSTRUCTION:
+   case EXIT_REASON_MWAIT_INSTRUCTION:
+   /*
+* Something in the kernel used MONITOR or MWAIT despite
+* X86_FEATURE_MWAIT being cleared for TDX guests.
+*/
+   WARN_ONCE(1, "TD Guest used unsupported MWAIT/MONITOR 
instruction\n");
+   break;
default:
pr_warn("Unexpected #VE: %d\n", ve->exit_reason);
return -EFAULT;
-- 
2.25.1



[PATCH] drivers: net: fix memory leak in atusb_probe

2021-03-31 Thread Pavel Skripkin
syzbot reported a memory leak in atusb_probe() [1].
The problem was in atusb_alloc_urbs(): usb_anchor_urb() takes its own
reference to the URB, so we need to drop the reference obtained from
usb_alloc_urb() for the URB to be freed correctly.

backtrace:
[] kmalloc include/linux/slab.h:559 [inline]
[] usb_alloc_urb+0x66/0xe0 drivers/usb/core/urb.c:74
[] atusb_alloc_urbs drivers/net/ieee802154/atusb.c:362 
[inline][2]
[] atusb_probe+0x158/0x820 
drivers/net/ieee802154/atusb.c:1038 [1]

Reported-by: syzbot+28a246747e0a46512...@syzkaller.appspotmail.com
Signed-off-by: Pavel Skripkin 
---
 drivers/net/ieee802154/atusb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ieee802154/atusb.c b/drivers/net/ieee802154/atusb.c
index 0dd0ba915ab9..23ee0b14cbfa 100644
--- a/drivers/net/ieee802154/atusb.c
+++ b/drivers/net/ieee802154/atusb.c
@@ -365,6 +365,7 @@ static int atusb_alloc_urbs(struct atusb *atusb, int n)
return -ENOMEM;
}
usb_anchor_urb(urb, &atusb->idle_urbs);
+   usb_free_urb(urb);
n--;
}
return 0;
-- 
2.30.2



[PATCH v3] ARM: dts: aspeed: add ASRock E3C246D4I BMC

2021-03-31 Thread Zev Weiss
This is a relatively low-cost AST2500-based Xeon E-2100/E-2200 series
mini-ITX board that we hope can provide a decent platform for OpenBMC
development.

This initial device-tree provides the necessary configuration for
basic BMC functionality such as host power control, serial console and
KVM support, and POST code snooping.

Signed-off-by: Zev Weiss 
Reviewed-by: Joel Stanley 
---

Changes since v2:
 - un-bungled filename in dtb-$(CONFIG_ARCH_ASPEED)
Changes since v1:
 - added entry to dtb-$(CONFIG_ARCH_ASPEED)
 - added board to compatible
 - added pinctrl properties to adc
 - split out of mostly-unrelated patch series

 arch/arm/boot/dts/Makefile|   1 +
 .../boot/dts/aspeed-bmc-asrock-e3c246d4i.dts  | 202 ++
 2 files changed, 203 insertions(+)
 create mode 100644 arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts

diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 8e5d4ab4e75e..c22151b50ddc 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -1406,6 +1406,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \
aspeed-bmc-ampere-mtjade.dtb \
aspeed-bmc-arm-centriq2400-rep.dtb \
aspeed-bmc-arm-stardragon4800-rep2.dtb \
+   aspeed-bmc-asrock-e3c246d4i.dtb \
aspeed-bmc-bytedance-g220a.dtb \
aspeed-bmc-facebook-cmm.dtb \
aspeed-bmc-facebook-galaxy100.dtb \
diff --git a/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts 
b/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts
new file mode 100644
index ..dcab6e78dfa4
--- /dev/null
+++ b/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0+
+/dts-v1/;
+
+#include "aspeed-g5.dtsi"
+#include 
+#include 
+
+/{
+   model = "ASRock E3C246D4I BMC";
+   compatible = "asrock,e3c246d4i-bmc", "aspeed,ast2500";
+
+   aliases {
+   serial4 = 
+   };
+
+   chosen {
+   stdout-path = 
+   bootargs = "console=tty0 console=ttyS4,115200 earlyprintk";
+   };
+
+   memory@8000 {
+   reg = <0x8000 0x2000>;
+   };
+
+   leds {
+   compatible = "gpio-leds";
+
+   heartbeat {
+   /* BMC_HB_LED_N */
+   gpios = < ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>;
+   linux,default-trigger = "timer";
+   };
+
+   system-fault {
+   /* SYSTEM_FAULT_LED_N */
+   gpios = < ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>;
+   panic-indicator;
+   };
+   };
+
+   gpio-keys {
+   compatible = "gpio-keys";
+
+   uid-button {
+   label = "uid-button";
+   gpios = < ASPEED_GPIO(F, 1) GPIO_ACTIVE_LOW>;
+   linux,code = ;
+   };
+   };
+
+   iio-hwmon {
+   compatible = "iio-hwmon";
+   io-channels = < 0>, < 1>, < 2>, < 3>, < 4>,
+   < 5>, < 6>, < 7>, < 8>, < 9>,
+   < 10>, < 11>, < 12>;
+   };
+};
+
+ {
+   status = "okay";
+   flash@0 {
+   status = "okay";
+   m25p,fast-read;
+   label = "bmc";
+   spi-max-frequency = <1>; /* 100 MHz */
+#include "openbmc-flash-layout.dtsi"
+   };
+};
+
+ {
+   status = "okay";
+};
+
+ {
+   status = "okay";
+   aspeed,sirq-active-high;
+};
+
+ {
+   status = "okay";
+
+   pinctrl-names = "default";
+   pinctrl-0 = <_rgmii1_default _mdio1_default>;
+};
+
+ {
+   status = "okay";
+
+   /* thermal sensor, one diode run to a disconnected header */
+   w83773g@4c {
+   compatible = "nuvoton,w83773g";
+   reg = <0x4c>;
+   };
+};
+
+ {
+   status = "okay";
+
+   /* FRU EEPROM */
+   eeprom@57 {
+   compatible = "st,24c128", "atmel,24c128";
+   reg = <0x57>;
+   pagesize = <16>;
+   };
+};
+
+ {
+   status = "okay";
+};
+
+ {
+   status = "okay";
+};
+
+_ctrl {
+   status = "okay";
+};
+
+_snoop {
+   status = "okay";
+   snoop-ports = <0x80>;
+};
+
+ {
+   status = "okay";
+   gpio-line-names =
+   /*  A */ "BMC_MAC1_INTB", "BMC_MAC2_INTB", "NMI_BTN_N", 
"BMC_NMI",
+   "", "", "", "",
+   /*  B */ "", "", "", "", "", "IRQ_BMC_PCH_SMI_LPC_N", "", "",
+   /*  C */ "", "", "", "", "", "", "", "",
+   /*  D */ "BMC_PSIN", "BMC_PSOUT", "BMC_RESETCON", "RESETCON",
+   "", "", "", "",
+   /*  E */ "", "", "", "", "", "", "", "",
+   /*  F */ "LOCATORLED_STATUS_N", "LOCATORBTN", "", "",
+   "", "", "BMC_PCH_SCI_LPC", "BMC_NCSI_MUX_CTL",
+   /*  G */ "HWM_BAT_EN", "CHASSIS_ID0", "CHASSIS_ID1", 
"CHASSIS_ID2",
+   

RE: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-31 Thread Liu, Yi L
> From: Jason Gunthorpe 
> Sent: Wednesday, March 31, 2021 8:41 PM
> 
> On Wed, Mar 31, 2021 at 07:38:36AM +, Liu, Yi L wrote:
> 
> > The reason is /dev/ioasid FD is per-VM since the ioasid allocated to
> > the VM should be able to be shared by all assigned device for the VM.
> > But the SVA operations (bind/unbind page table, cache_invalidate) should
> > be per-device.
> 
> It is not *per-device* it is *per-ioasid*
>
> And as /dev/ioasid is an interface for controlling multiple ioasid's
> there is no issue to also multiplex the page table manipulation for
> multiple ioasids as well.
> 
> What you should do next is sketch out in some RFC the exact ioctls
> each FD would have and show how the parts I outlined would work and
> point out any remaining gaps.
> 
> The device FD is something like the vfio_device FD from VFIO, it has
> *nothing* to do with PASID beyond having a single ioctl to authorize
> the device to use the PASID. All control of the PASID is in
> /dev/ioasid.

good to see this reply. Your idea is much clearer to me now. If I'm getting
you correctly, I think the skeleton is something like below:

1) userspace opens a /dev/ioasid, meanwhile an ioasid is allocated along
   with a per-ioasid context which can be used to bind a page table and do
   cache invalidation; an ioasid FD is returned to userspace.
2) userspace passes the ioasid FD to VFIO, let it associated with a device
   FD (like vfio_device FD).
3) userspace binds page table on the ioasid FD with the page table info.
4) userspace unbinds the page table on the ioasid FD
5) userspace de-associates the ioasid FD and device FD

Does the above suit your outline?
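
To make it concrete, a rough userspace sketch of that flow could look like
below (all the ioctl names and the bind argument are purely hypothetical
placeholders for discussion, nothing like this exists yet):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* IOASID_ALLOC, IOASID_BIND_PGTABLE, IOASID_UNBIND_PGTABLE,
 * VFIO_DEVICE_ATTACH_IOASID and VFIO_DEVICE_DETACH_IOASID are made-up
 * names standing in for a uapi that does not exist yet.
 */
static int ioasid_flow(int device_fd, void *pgtbl_info)
{
	int ioasid_fd, ioasid;

	ioasid_fd = open("/dev/ioasid", O_RDWR);			/* 1) */
	ioasid = ioctl(ioasid_fd, IOASID_ALLOC);			/* 1) ioasid + per-ioasid context */

	ioctl(device_fd, VFIO_DEVICE_ATTACH_IOASID, &ioasid_fd);	/* 2) associate with device FD */
	ioctl(ioasid_fd, IOASID_BIND_PGTABLE, pgtbl_info);		/* 3) bind page table */
	ioctl(ioasid_fd, IOASID_UNBIND_PGTABLE, pgtbl_info);		/* 4) unbind page table */
	ioctl(device_fd, VFIO_DEVICE_DETACH_IOASID, &ioasid_fd);	/* 5) de-associate */

	close(ioasid_fd);
	return ioasid;
}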

If yes, I still have below concern and wish to see your opinion.
- the ioasid FD and device association will happen at runtime instead of
  just happening in the setup phase.
- how about AMD and ARM's vSVA support? Their PASID allocation and page table
  setup happen within the guest. They only need to bind the guest PASID table
  to the host. The above model seems unable to fit them. (Jean, Eric, Jacob
  please feel free to correct me)
- these per-ioasid SVA operations are not aligned with the native SVA usage
  model. Native SVA bind is per-device.

Regards,
Yi Liu


Re: [PATCH v7 3/8] mm/rmap: Split try_to_munlock from try_to_unmap

2021-03-31 Thread Alistair Popple
On Wednesday, 31 March 2021 10:57:46 PM AEDT Jason Gunthorpe wrote:
> On Wed, Mar 31, 2021 at 03:15:47PM +1100, Alistair Popple wrote:
> > On Wednesday, 31 March 2021 2:56:38 PM AEDT John Hubbard wrote:
> > > On 3/30/21 3:56 PM, Alistair Popple wrote:
> > > ...
> > > >> +1 for renaming "munlock*" items to "mlock*", where applicable. good 
> > grief.
> > > > 
> > > > At least the situation was weird enough to prompt further 
investigation :)
> > > > 
> > > > Renaming to mlock* doesn't feel like the right solution to me either 
> > though. I
> > > > am not sure if you saw me responding to myself earlier but I am 
thinking
> > > > renaming try_to_munlock() -> page_mlocked() and try_to_munlock_one() ->
> > > > page_mlock_one() might be better. Thoughts?
> > > > 
> > > 
> > > Quite confused by this naming idea. Because: try_to_munlock() returns
> > > void, so a boolean-style name such as "page_mlocked()" is already not a
> > > good fit.
> > > 
> > > Even more important, though, is that try_to_munlock() is mlock-ing the
> > > page, right? Is there some subtle point I'm missing? It really is doing
> > > an mlock to the best of my knowledge here. Although the kerneldoc
> > > comment for try_to_munlock() seems questionable too:
> > 
> > It's mlocking the page if it turns out it still needs to be locked after 
> > unlocking it. But I don't think you're missing anything.
> 
> It is really searching all VMA's to see if the VMA flag is set and if
> any are found then it mlocks the page.
> 
> But presenting this rountine in its simplified form raises lots of
> questions:
> 
>  - What locking is being used to read the VMA flag?
>  - Why do we need to manipulate global struct page flags under the
>page table locks of a single VMA?

I was wondering that and questioned it in an earlier version of this series. I 
have done some digging and the commit log for b87537d9e2fe ("mm: rmap use pte 
lock not mmap_sem to set PageMlocked") provides the original justification.

It's fairly long so I won't quote it here but the summary seems to be that 
among other things the combination of page lock and ptl makes this safe. I 
have yet to verify if everything there still holds and is sensible, but the 
last paragraph certainly is :-)

"Stopped short of separating try_to_munlock_one() from try_to_munmap_one()
on this occasion, but that's probably the sensible next step - with a
rename, given that try_to_munlock()'s business is to try to set Mlocked."

>  - Why do we need to check for huge pages inside the VMA loop, not
>before going to the rmap? PageTransCompoundHead() is not sensitive to
>the PTEs. (and what happens if the huge page breaks up concurrently?)
>  - Why do we clear the mlock bit then run around to try and set it?

I don't have an answer for that as I'm not (yet) across all the mlock code 
paths, but I'm hoping this patch at least won't change anything.

>Feels racey.
>
> Jason
> 






Re: [PATCH] psi: allow unprivileged users with CAP_SYS_RESOURCE to write psi files

2021-03-31 Thread Eric W. Biederman
Josh Hunt  writes:

> Currently only root can write files under /proc/pressure. Relax this to
> allow tasks running as unprivileged users with CAP_SYS_RESOURCE to be
> able to write to these files.

The test for CAP_SYS_RESOURCE really needs to be in open rather
than in write.

Otherwise a suid root executable could have stdout redirected
into these files.
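
I.e. roughly something like the sketch below, with the check applied when
the file is opened for writing (psi_open_common() is a made-up helper name
here; the real change would hook whatever proc open handlers the pressure
files already use):

static int psi_open_common(struct file *file,
			   int (*show)(struct seq_file *, void *))
{
	/* Refuse a writable open without CAP_SYS_RESOURCE, so a suid
	 * executable with redirected stdout never gets a writable fd.
	 */
	if ((file->f_mode & FMODE_WRITE) && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	return single_open(file, show, NULL);
}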

Eric


> Signed-off-by: Josh Hunt 
> ---
>  kernel/sched/psi.c | 9 ++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index b1b00e9bd7ed..98ff7baf1ba8 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -1270,6 +1270,9 @@ static ssize_t psi_write(struct file *file, const char 
> __user *user_buf,
>   if (!nbytes)
>   return -EINVAL;
>  
> + if (!capable(CAP_SYS_RESOURCE))
> + return -EPERM;
> +
>   buf_size = min(nbytes, sizeof(buf));
>   if (copy_from_user(buf, user_buf, buf_size))
>   return -EFAULT;
> @@ -1353,9 +1356,9 @@ static int __init psi_proc_init(void)
>  {
>   if (psi_enable) {
>   proc_mkdir("pressure", NULL);
> - proc_create("pressure/io", 0, NULL, _io_proc_ops);
> - proc_create("pressure/memory", 0, NULL, _memory_proc_ops);
> - proc_create("pressure/cpu", 0, NULL, _cpu_proc_ops);
> + proc_create("pressure/io", 0666, NULL, _io_proc_ops);
> + proc_create("pressure/memory", 0666, NULL, 
> _memory_proc_ops);
> + proc_create("pressure/cpu", 0666, NULL, _cpu_proc_ops);
>   }
>   return 0;
>  }


Re: [PATCH] powerpc/8xx: Load modules closer to kernel text

2021-03-31 Thread Michael Ellerman
Christophe Leroy  writes:
> Le 31/03/2021 à 15:39, Michael Ellerman a écrit :
>> Christophe Leroy  writes:
>>> On the 8xx, TASK_SIZE is 0x8000. The space between TASK_SIZE and
>>> PAGE_OFFSET is not used.
>>>
>>> Use it to load modules in order to minimise the distance between
>>> kernel text and modules and avoid trampolines in modules to access
>>> kernel functions or other module functions.
>>>
>>> Define a 16Mbytes area for modules, that's more than enough.
>> 
>> 16MB seems kind of small.
>> 
>> At least on 64-bit we could potentially have hundreds of MBs of modules.
>> 
>
> Well, with a 16 MB kernel and 16 MB modules, my board is full :)

Heh.

> Even on the more recent board that has 128 MB, I don't expect more than a few 
> MBs of modules in 
> addition to the kernel which is approx 8M.
>
> But ok, I'll do something more generic, though it will conflict with Jordan's 
> series.

Don't feel you have to. You're the expert on 8xx, not me.

cheers


mmotm 2021-03-31-21-27 uploaded

2021-03-31 Thread akpm
The mm-of-the-moment snapshot 2021-03-31-21-27 has been uploaded to

   https://www.ozlabs.org/~akpm/mmotm/

mmotm-readme.txt says

README for mm-of-the-moment:

https://www.ozlabs.org/~akpm/mmotm/

This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
more than once a week.

You will need quilt to apply these patches to the latest Linus release (5.x
or 5.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
https://ozlabs.org/~akpm/mmotm/series

The file broken-out.tar.gz contains two datestamp files: .DATE and
.DATE-yyyy-mm-dd-hh-mm-ss.  Both contain the string yyyy-mm-dd-hh-mm-ss,
followed by the base kernel version against which this patch series is to
be applied.

This tree is partially included in linux-next.  To see which patches are
included in linux-next, consult the `series' file.  Only the patches
within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in
linux-next.


A full copy of the full kernel tree with the linux-next and mmotm patches
already applied is available through git within an hour of the mmotm
release.  Individual mmotm releases are tagged.  The master branch always
points to the latest release, so it's constantly rebasing.

https://github.com/hnaz/linux-mm

The directory https://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second)
contains daily snapshots of the -mm tree.  It is updated more frequently
than mmotm, and is untested.

A git copy of this tree is also available at

https://github.com/hnaz/linux-mm



This mmotm tree contains the following patches against 5.12-rc5:
(patches marked "*" will be included in linux-next)

  origin.patch
* maintainers-update-cznics-turris-information.patch
* treewide-change-my-e-mail-address-fix-my-name.patch
* mailmap-update-email-address-for-jordan-crouse.patch
* kasan-fix-hwasan-build-for-gcc.patch
* kasan-remove-redundant-config-option.patch
* kasan-remove-redundant-config-option-fix.patch
* mm-gup-check-page-posion-status-for-coredump.patch
* mm-gup-check-page-posion-status-for-coredump-fix.patch
* mm-gup-check-page-posion-status-for-coredump-v4.patch
* 
nds32-flush_dcache_page-use-page_mapping_file-to-avoid-races-with-swapoff.patch
* fs-direct-io-fix-missing-sdio-boundary.patch
* kasan-fix-conflict-with-page-poisoning.patch
* kfence-x86-fix-preemptible-warning-on-kpti-enabled-systems.patch
* lib-fix-kconfig-dependency-on-arch_want_frame_pointers.patch
* ocfs2-fix-deadlock-between-setattr-and-dio_end_io_write.patch
* ia64-fix-user_stack_pointer-for-ptrace.patch
* proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags.patch
* proc-kpageflags-do-not-use-uninitialized-struct-pages.patch
* module-remove-duplicate-include-in-arch-ia64-kernel-heads.patch
* ia64-kernel-few-typos-fixed-in-the-file-fsyss.patch
* ia64-include-asm-minor-typo-fixes-in-the-file-pgtableh.patch
* ia64-ensure-proper-numa-distance-and-possible-map-initialization.patch
* ia64-drop-unused-ia64_fw_emu-ifdef.patch
* ia64-simplify-code-flow-around-swiotlb-init.patch
* ia64-tools-remove-inclusion-of-ia64-specific-version-of-errnoh-header.patch
* ia64-tools-remove-duplicate-definition-of-ia64_mf-on-ia64.patch
* ia64-trivial-spelling-fixes.patch
* ia64-fix-efi_debug-build.patch
* ia64-mca-always-make-ia64_mca_debug-an-expression.patch
* sparse-can-do-constant-folding-of-__builtin_bswap.patch
* scripts-spellingtxt-add-entries-for-recent-discoveries.patch
* sh-remove-duplicate-include-in-tlbh.patch
* ocfs2-replace-define_simple_attribute-with-define_debugfs_attribute.patch
* ocfs2-map-flags-directly-in-flags_to_o2dlm.patch
* ocfs2-fix-a-typo.patch
* ocfs2-clear-links-count-in-ocfs2_mknod-if-an-error-occurs.patch
* ocfs2-fix-ocfs2-corrupt-when-iputting-an-inode.patch
* watchdog-rename-__touch_watchdog-to-a-better-descriptive-name.patch
* watchdog-explicitly-update-timestamp-when-reporting-softlockup.patch
* watchdog-softlockup-report-the-overall-time-of-softlockups.patch
* watchdog-softlockup-remove-logic-that-tried-to-prevent-repeated-reports.patch
* watchdog-fix-barriers-when-printing-backtraces-from-all-cpus.patch
* watchdog-fix-barriers-when-printing-backtraces-from-all-cpus-fix.patch
* watchdog-cleanup-handling-of-false-positives.patch
  mm.patch
* 
mm-slab_common-provide-slab_merge-option-for-is_enabledconfig_slab_merge_default-builds.patch
* 
mm-slub-enable-slub_debug-static-key-when-creating-cache-with-explicit-debug-flags.patch
* kunit-add-a-kunit-test-for-slub-debugging-functionality.patch
* slub-remove-resiliency_test-function.patch
* mm-slubc-trivial-typo-fixes.patch
* mm-kmemleak-fix-a-typo.patch
* mm-page_owner-record-the-timestamp-of-all-pages-during-free.patch
* mm-page_owner-remove-unused-parameter-in-__set_page_owner_handle.patch
* mm-provide-filemap_range_needs_writeback-helper.patch
* mm-use-filemap_range_needs_writeback-for-o_direct-reads.patch
* iomap-use-filemap_range_needs_writeback-for-o_direct-reads.patch
* mm-filemap-use-filemap_read_page-in-filemap_fault.patch
* 

Re: [PATCH v4 1/1] x86/tdx: Handle MWAIT, MONITOR and WBINVD

2021-03-31 Thread Andi Kleen
On Wed, Mar 31, 2021 at 08:46:18PM -0700, Dave Hansen wrote:
> On 3/31/21 8:28 PM, Andi Kleen wrote:
> >> The hardware (and VMMs and SEAM) have ways of telling the guest kernel
> >> what is supported: CPUID.  If it screws up, and the guest gets an
> >> unexpected #VE, so be it.
> > The main reason for disabling stuff is actually that we don't need
> > to harden it. All these things are potential attack paths.
> 
> Wait, MWAIT is an attack path?  If it were an attack path, wouldn't it

No, MWAIT is not, but lots of other things that can be controlled by the
host are. And that will be a motivation to disable things.

> >> We don't have all kinds of crazy handling in the kernel's #UD handler
> >> just in case a CPU mis-enumerates a feature and we get a #UD.  We have
> >> to trust the underlying hardware to be sane.  If it isn't, we die a
> >> horrible death as fast as possible.  Why should TDX be any different?
> > That's what the original patch did -- no unnecessary checks -- but reviewers
> > keep asking for the extra checks, so Sathya added more. We have the not
> > unusual problem here that reviewers don't agree among themselves.
> 
> Getting consensus is a pain in the neck, eh?

It seems more like a circular argument currently.
> 
> It's too bad all the reviewers in the community aren't like all of the
> engineers at big companies where everyone always agrees. :)

I would propose to go back to the original patch without all the extra
checks. I think that's what you're arguing too. IIRC the person
who originally requested extra checks was Andy; if he's ok with
that too, we can do it, so that you guys can finally move on
to the other patches that actually do more than just trivial things.

-Andi


[PATCH] psi: allow unprivileged users with CAP_SYS_RESOURCE to write psi files

2021-03-31 Thread Josh Hunt
Currently only root can write files under /proc/pressure. Relax this to
allow tasks running as unprivileged users with CAP_SYS_RESOURCE to be
able to write to these files.

Signed-off-by: Josh Hunt 
---
 kernel/sched/psi.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index b1b00e9bd7ed..98ff7baf1ba8 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1270,6 +1270,9 @@ static ssize_t psi_write(struct file *file, const char 
__user *user_buf,
if (!nbytes)
return -EINVAL;
 
+   if (!capable(CAP_SYS_RESOURCE))
+   return -EPERM;
+
buf_size = min(nbytes, sizeof(buf));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
@@ -1353,9 +1356,9 @@ static int __init psi_proc_init(void)
 {
if (psi_enable) {
proc_mkdir("pressure", NULL);
-   proc_create("pressure/io", 0, NULL, _io_proc_ops);
-   proc_create("pressure/memory", 0, NULL, _memory_proc_ops);
-   proc_create("pressure/cpu", 0, NULL, _cpu_proc_ops);
+   proc_create("pressure/io", 0666, NULL, _io_proc_ops);
+   proc_create("pressure/memory", 0666, NULL, 
_memory_proc_ops);
+   proc_create("pressure/cpu", 0666, NULL, _cpu_proc_ops);
}
return 0;
 }
-- 
2.17.1



Re: [PATCH v2 2/3] drivers/tty/serial/8250: add DT property for aspeed vuart sirq polarity

2021-03-31 Thread Joel Stanley
On Thu, 1 Apr 2021 at 00:57, Zev Weiss  wrote:
>
> This provides a simple boolean to use instead of the deprecated
> aspeed,sirq-polarity-sense property.
>
> Signed-off-by: Zev Weiss 
> ---
>  drivers/tty/serial/8250/8250_aspeed_vuart.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c 
> b/drivers/tty/serial/8250/8250_aspeed_vuart.c
> index c33e02cbde93..e5ef9f957f9a 100644
> --- a/drivers/tty/serial/8250/8250_aspeed_vuart.c
> +++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c
> @@ -482,6 +482,9 @@ static int aspeed_vuart_probe(struct platform_device 
> *pdev)
> of_node_put(sirq_polarity_sense_args.np);
> }
>
> +   if (of_property_read_bool(np, "aspeed,sirq-active-high"))
> +   aspeed_vuart_set_sirq_polarity(vuart, 1);

This assumes the default is always low, so we don't need a property to
set it to that state?

Would it make more sense to have the property describe if it's high or
low? (I'm happy for the answer to be "no", as we've gotten by for the
past few years without it).

This brings up another point. We already have the sysfs file for
setting the lpc address, from userspace. In OpenBMC land this can be
set with obmc-console-client (/etc/obmc-console.conf). Should we add
support to that application for setting the irq polarity too, and do
away with device tree descriptions?

> +
> aspeed_vuart_set_enabled(vuart, true);
> aspeed_vuart_set_host_tx_discard(vuart, true);
> platform_set_drvdata(pdev, vuart);
> --
> 2.31.1
>


Re: [PATCH] ARM: dts: aspeed: add ASRock E3C246D4I BMC

2021-03-31 Thread Zev Weiss

On Wed, Mar 31, 2021 at 10:51:42PM CDT, Joel Stanley wrote:

Hi Zev,

On Thu, 1 Apr 2021 at 02:57, Zev Weiss  wrote:


This is a relatively low-cost AST2500-based Xeon E-2100/E-2200 series
mini-ITX board that we hope can provide a decent platform for OpenBMC
development.

This initial device-tree provides the necessary configuration for
basic BMC functionality such as host power control, serial console and
KVM support, and POST code snooping.


The patch looks good! Some minor things below.

When sending subsequent versions, make sure to add -v N to your git
format-patch to mark it as the Nth version.

You've also set this to be threaded with a previous version of the
patch. We normally don't do that, and in this case it's doubly
confusing as you've split this patch out from the previous series.

I noticed you cc'd s...@kernel.org. We normally only do this when we
want the soc maintainers to apply a patch directly without going
through another maintainer. In this case the patch should go through
the aspeed maintainer's tree (me), so you don't need to cc that
address.



Hmm, that came from using './scripts/get_maintainer.pl --no-rolestats' 
with git send-email's --cc-cmd flag; does there happen to be a similarly 
easy alternative that wouldn't do the "wrong" thing there?


Ack on the rest, will send v3 soon.


Thanks,
Zev


Signed-off-by: Zev Weiss 
Reviewed-by: Joel Stanley 
---


This spot just here is where you should put the changes between v1 and v2.


 arch/arm/boot/dts/Makefile|   1 +
 .../boot/dts/aspeed-bmc-asrock-e3c246d4i.dts  | 202 ++
 2 files changed, 203 insertions(+)
 create mode 100644 arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts

diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 8e5d4ab4e75e..b12911262ca1 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -1406,6 +1406,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \
aspeed-bmc-ampere-mtjade.dtb \
aspeed-bmc-arm-centriq2400-rep.dtb \
aspeed-bmc-arm-stardragon4800-rep2.dtb \
+   aspeed-bmc-asrock-e3c246d4i.dts \


This should be the output name (.dtb).


aspeed-bmc-bytedance-g220a.dtb \
aspeed-bmc-facebook-cmm.dtb \
aspeed-bmc-facebook-galaxy100.dtb \
diff --git a/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts 
b/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts
new file mode 100644
index ..dcab6e78dfa4
--- /dev/null
+++ b/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts


The device tree itself looks good!

If you fix up the things I mentioned and send a v3 I will apply it.

Cheers,

Joel


@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0+
+/dts-v1/;
+
+#include "aspeed-g5.dtsi"
+#include 
+#include 
+
+/{
+   model = "ASRock E3C246D4I BMC";
+   compatible = "asrock,e3c246d4i-bmc", "aspeed,ast2500";
+
+   aliases {
+   serial4 = 
+   };
+
+   chosen {
+   stdout-path = 
+   bootargs = "console=tty0 console=ttyS4,115200 earlyprintk";
+   };
+
+   memory@8000 {
+   reg = <0x8000 0x2000>;
+   };
+
+   leds {
+   compatible = "gpio-leds";
+
+   heartbeat {
+   /* BMC_HB_LED_N */
+   gpios = < ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>;
+   linux,default-trigger = "timer";
+   };
+
+   system-fault {
+   /* SYSTEM_FAULT_LED_N */
+   gpios = < ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>;
+   panic-indicator;
+   };
+   };
+
+   gpio-keys {
+   compatible = "gpio-keys";
+
+   uid-button {
+   label = "uid-button";
+   gpios = < ASPEED_GPIO(F, 1) GPIO_ACTIVE_LOW>;
+   linux,code = ;
+   };
+   };
+
+   iio-hwmon {
+   compatible = "iio-hwmon";
+   io-channels = < 0>, < 1>, < 2>, < 3>, < 4>,
+   < 5>, < 6>, < 7>, < 8>, < 9>,
+   < 10>, < 11>, < 12>;
+   };
+};
+
+ {
+   status = "okay";
+   flash@0 {
+   status = "okay";
+   m25p,fast-read;
+   label = "bmc";
+   spi-max-frequency = <1>; /* 100 MHz */
+#include "openbmc-flash-layout.dtsi"
+   };
+};
+
+ {
+   status = "okay";
+};
+
+ {
+   status = "okay";
+   aspeed,sirq-active-high;
+};
+
+ {
+   status = "okay";
+
+   pinctrl-names = "default";
+   pinctrl-0 = <_rgmii1_default _mdio1_default>;
+};
+
+ {
+   status = "okay";
+
+   /* thermal sensor, one diode run to a disconnected header */
+   w83773g@4c {
+   compatible = "nuvoton,w83773g";
+   reg = <0x4c>;
+   };
+};
+
+ {
+   status = "okay";
+
+   /* FRU EEPROM */
+   eeprom@57 {
+   compatible = 

Re: [PATCH v2 3/3] dt-bindings: serial: 8250: add aspeed,sirq-active-high

2021-03-31 Thread Andrew Jeffery



On Thu, 1 Apr 2021, at 11:27, Zev Weiss wrote:
> This provides a simpler, more direct alternative to the deprecated
> aspeed,sirq-polarity-sense property for indicating the polarity of
> the Aspeed VUART's SIRQ line.
> 
> Signed-off-by: Zev Weiss 
> ---
>  Documentation/devicetree/bindings/serial/8250.yaml | 13 ++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/devicetree/bindings/serial/8250.yaml 
> b/Documentation/devicetree/bindings/serial/8250.yaml
> index 491b9297432d..e79bb6ab9d2c 100644
> --- a/Documentation/devicetree/bindings/serial/8250.yaml
> +++ b/Documentation/devicetree/bindings/serial/8250.yaml
> @@ -12,8 +12,9 @@ maintainers:
>  allOf:
>- $ref: /schemas/serial.yaml#
>- if:
> -  required:
> -- aspeed,sirq-polarity-sense
> +  anyOf:
> +- required: [ aspeed,sirq-active-high ]

Do you think we could make use of the approach I put forward here?

https://lore.kernel.org/openbmc/20210319062752.145730-18-and...@aj.id.au/T/#u

Andrew


Re: [PATCH 1/2] gpio: sysfs: Obey valid_mask

2021-03-31 Thread Matti Vaittinen


On Wed, 2021-03-31 at 20:29 +0200, Bartosz Golaszewski wrote:
> On Wed, Mar 31, 2021 at 2:25 PM Andy Shevchenko
>  wrote:
> > On Wed, Mar 31, 2021 at 10:58 AM Bartosz Golaszewski
> >  wrote:
> > > On Mon, Mar 29, 2021 at 1:41 PM Matti Vaittinen
> > >  wrote:
> > > > Do not allow exporting GPIOs which are set invalid
> > > > by the driver's valid mask.
> > > > 
> > > > Fixes: 726cb3ba49692bdae6caff457755e7cdb432efa4
> > 
> > I have just noticed that this is invalid format for the Fixes tag
> > (luckily, haha, due to a blank line it's not recognized as a tag!).
> > 
> > Matti, I highly recommend to add in your .gitconfig file an alias:
> > one = show -s --pretty='format:%h (\"%s\")'
> > 
> > Bart, there are real Fixes tag issues from another series. I will
> > comment there as well to let an author know.
> > 
> > --
> 
> Eek, sorry I should have looked more carefully. I'll fix it in my
> tree.

Thanks for fixing this Bartosz.
Andy - well spotted. And the alias you pointed out is something I've missed
:)

Sorry for the trouble! I should have used the correct tag format.

Thanks again!

Best Regards
Matti Vaittinen




Re: [PATCH v2 1/3] dt-bindings: serial: 8250: deprecate aspeed,sirq-polarity-sense

2021-03-31 Thread Joel Stanley
On Thu, 1 Apr 2021 at 00:57, Zev Weiss  wrote:
>
> This property ties SIRQ polarity to SCU register bits that don't
> necessarily have any direct relationship to it; the only use of it
> was removed in commit c82bf6e133d30e0f9172a20807814fa28aef0f67.
>
> Signed-off-by: Zev Weiss 

Reviewed-by: Joel Stanley 

> ---
>  Documentation/devicetree/bindings/serial/8250.yaml | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/Documentation/devicetree/bindings/serial/8250.yaml 
> b/Documentation/devicetree/bindings/serial/8250.yaml
> index f54cae9ff7b2..491b9297432d 100644
> --- a/Documentation/devicetree/bindings/serial/8250.yaml
> +++ b/Documentation/devicetree/bindings/serial/8250.yaml
> @@ -188,6 +188,7 @@ properties:
>offset and bit number to identify how the SIRQ polarity should be
>configured. One possible data source is the LPC/eSPI mode bit. Only
>applicable to aspeed,ast2500-vuart.
> +deprecated: true
>
>  required:
>- reg
> --
> 2.31.1
>


Re: [PATCH] ARM: dts: aspeed: add ASRock E3C246D4I BMC

2021-03-31 Thread Joel Stanley
Hi Zev,

On Thu, 1 Apr 2021 at 02:57, Zev Weiss  wrote:
>
> This is a relatively low-cost AST2500-based Xeon E-2100/E-2200 series
> mini-ITX board that we hope can provide a decent platform for OpenBMC
> development.
>
> This initial device-tree provides the necessary configuration for
> basic BMC functionality such as host power control, serial console and
> KVM support, and POST code snooping.

The patch looks good! Some minor things below.

When sending subsequent versions, make sure to add -v N to your git
format-patch to mark it as the Nth version.

You've also set this to be threaded with a previous version of the
patch. We normally don't do that, and in this case it's doubly
confusing as you've split this patch out from the previous series.

I noticed you cc'd s...@kernel.org. We normally only do this when we
want the soc maintainers to apply a patch directly without going
through another maintainer. In this case the patch should go through
the aspeed maintainer's tree (me), so you don't need to cc that
address.

> Signed-off-by: Zev Weiss 
> Reviewed-by: Joel Stanley 
> ---

This spot just here is where you should put the changes between v1 and v2.

>  arch/arm/boot/dts/Makefile|   1 +
>  .../boot/dts/aspeed-bmc-asrock-e3c246d4i.dts  | 202 ++
>  2 files changed, 203 insertions(+)
>  create mode 100644 arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts
>
> diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
> index 8e5d4ab4e75e..b12911262ca1 100644
> --- a/arch/arm/boot/dts/Makefile
> +++ b/arch/arm/boot/dts/Makefile
> @@ -1406,6 +1406,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \
> aspeed-bmc-ampere-mtjade.dtb \
> aspeed-bmc-arm-centriq2400-rep.dtb \
> aspeed-bmc-arm-stardragon4800-rep2.dtb \
> +   aspeed-bmc-asrock-e3c246d4i.dts \

This should be the output name (.dtb).

> aspeed-bmc-bytedance-g220a.dtb \
> aspeed-bmc-facebook-cmm.dtb \
> aspeed-bmc-facebook-galaxy100.dtb \
> diff --git a/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts 
> b/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts
> new file mode 100644
> index ..dcab6e78dfa4
> --- /dev/null
> +++ b/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts

The device tree itself looks good!

If you fix up the things I mentioned and send a v3 I will apply it.

Cheers,

Joel

> @@ -0,0 +1,202 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/dts-v1/;
> +
> +#include "aspeed-g5.dtsi"
> +#include 
> +#include 
> +
> +/{
> +   model = "ASRock E3C246D4I BMC";
> +   compatible = "asrock,e3c246d4i-bmc", "aspeed,ast2500";
> +
> +   aliases {
> +   serial4 = 
> +   };
> +
> +   chosen {
> +   stdout-path = 
> +   bootargs = "console=tty0 console=ttyS4,115200 earlyprintk";
> +   };
> +
> +   memory@8000 {
> +   reg = <0x8000 0x2000>;
> +   };
> +
> +   leds {
> +   compatible = "gpio-leds";
> +
> +   heartbeat {
> +   /* BMC_HB_LED_N */
> +   gpios = < ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>;
> +   linux,default-trigger = "timer";
> +   };
> +
> +   system-fault {
> +   /* SYSTEM_FAULT_LED_N */
> +   gpios = < ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>;
> +   panic-indicator;
> +   };
> +   };
> +
> +   gpio-keys {
> +   compatible = "gpio-keys";
> +
> +   uid-button {
> +   label = "uid-button";
> +   gpios = < ASPEED_GPIO(F, 1) GPIO_ACTIVE_LOW>;
> +   linux,code = ;
> +   };
> +   };
> +
> +   iio-hwmon {
> +   compatible = "iio-hwmon";
> +   io-channels = < 0>, < 1>, < 2>, < 3>, < 
> 4>,
> +   < 5>, < 6>, < 7>, < 8>, < 9>,
> +   < 10>, < 11>, < 12>;
> +   };
> +};
> +
> + {
> +   status = "okay";
> +   flash@0 {
> +   status = "okay";
> +   m25p,fast-read;
> +   label = "bmc";
> +   spi-max-frequency = <1>; /* 100 MHz */
> +#include "openbmc-flash-layout.dtsi"
> +   };
> +};
> +
> + {
> +   status = "okay";
> +};
> +
> + {
> +   status = "okay";
> +   aspeed,sirq-active-high;
> +};
> +
> + {
> +   status = "okay";
> +
> +   pinctrl-names = "default";
> +   pinctrl-0 = <_rgmii1_default _mdio1_default>;
> +};
> +
> + {
> +   status = "okay";
> +
> +   /* thermal sensor, one diode run to a disconnected header */
> +   w83773g@4c {
> +   compatible = "nuvoton,w83773g";
> +   reg = <0x4c>;
> +   };
> +};
> +
> + {
> +   status = "okay";
> +
> +   /* FRU EEPROM */
> +   eeprom@57 {
> +   compatible = "st,24c128", "atmel,24c128";
> +   reg = <0x57>;
> + 

Re: [PATCH v4 1/1] x86/tdx: Handle MWAIT, MONITOR and WBINVD

2021-03-31 Thread Dave Hansen
On 3/31/21 8:28 PM, Andi Kleen wrote:
>> The hardware (and VMMs and SEAM) have ways of telling the guest kernel
>> what is supported: CPUID.  If it screws up, and the guest gets an
>> unexpected #VE, so be it.
> The main reason for disabling stuff is actually that we don't need
> to harden it. All these things are potential attack paths.

Wait, MWAIT is an attack path?  If it were an attack path, wouldn't it
be an attack path that was created from the SEAM layer or the hardware
being broken?  Aren't those two things within the trust boundary?  Do we
harden against other things within the trust boundary?

>> We don't have all kinds of crazy handling in the kernel's #UD handler
>> just in case a CPU mis-enumerates a feature and we get a #UD.  We have
>> to trust the underlying hardware to be sane.  If it isn't, we die a
>> horrible death as fast as possible.  Why should TDX be any different?
> That's what the original patch did -- no unnecessary checks -- but reviewers
> keep asking for the extra checks, so Sathya added more. We have the not
> unusual problem here that reviewers don't agree among themselves.

Getting consensus is a pain in the neck, eh?

It's too bad all the reviewers in the community aren't like all of the
engineers at big companies where everyone always agrees. :)


Re: [PATCH -next] erofs: Clean up spelling mistakes found in fs/erofs

2021-03-31 Thread Gao Xiang
On Wed, Mar 31, 2021 at 05:39:20AM -0400, Ruiqi Gong wrote:
> zmap.c:  s/correspoinding/corresponding
> zdata.c: s/endding/ending
> 
> Reported-by: Hulk Robot 
> Signed-off-by: Ruiqi Gong 

Reviewed-by: Gao Xiang 

Thanks,
Gao Xiang



Re: [PATCH v4 1/2] scsi: ufs: Fix task management request completion timeout

2021-03-31 Thread Bart Van Assche
On 3/31/21 9:45 AM, Avri Altman wrote:
>> ufshcd_tmc_handler() calls blk_mq_tagset_busy_iter(fn =
>> ufshcd_compl_tm()),
>> but since blk_mq_tagset_busy_iter() only iterates over all reserved tags
>> and requests which are not in IDLE state, ufshcd_compl_tm() never gets a
>> chance to run. Thus, TMR always ends up with completion timeout. Fix it by
>> calling blk_mq_start_request() in  __ufshcd_issue_tm_cmd().
>>
>> Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to allocate and
>> free TMFs")
>>
>> Signed-off-by: Can Guo 
>> ---
>>  drivers/scsi/ufs/ufshcd.c | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> index b49555fa..d4f8cb2 100644
>> --- a/drivers/scsi/ufs/ufshcd.c
>> +++ b/drivers/scsi/ufs/ufshcd.c
>> @@ -6464,6 +6464,7 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba
>> *hba,
>>
>> spin_lock_irqsave(host->host_lock, flags);
>> task_tag = hba->nutrs + free_slot;
>> +   blk_mq_start_request(req);
> Maybe just set req->state to MQ_RQ_IN_FLIGHT
> Without all other irrelevant initializations such as add timeout etc.

Hmm ... I'm not sure that any of the actions performed by
blk_mq_start_request() are irrelevant in this context. Additionally, no
other block or SCSI driver sets MQ_RQ_IN_FLIGHT directly.

Thanks,

Bart.


Re: [syzbot] KASAN: vmalloc-out-of-bounds Read in bpf_trace_run2

2021-03-31 Thread syzbot
syzbot suspects this issue was fixed by commit:

commit befe6d946551d65cddbd32b9cb0170b0249fd5ed
Author: Steven Rostedt (VMware) 
Date:   Wed Nov 18 14:34:05 2020 +

tracepoint: Do not fail unregistering a probe due to memory failure

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=123358a1d0
start commit:   70b97111 bpf: Use hlist_add_head_rcu when linking to local..
git tree:   bpf-next
kernel config:  https://syzkaller.appspot.com/x/.config?x=7e0ca96a9b6ee858
dashboard link: https://syzkaller.appspot.com/bug?extid=845923d2172947529b58
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=10193f3b90
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=168c729b90

If the result looks correct, please mark the issue as fixed by replying with:

#syz fix: tracepoint: Do not fail unregistering a probe due to memory failure

For information about bisection process see: https://goo.gl/tpsmEJ#bisection


Re: [PATCH -next] i2c: gpio: use DEFINE_SPINLOCK() for spinlock

2021-03-31 Thread chenlifu

Kindly pinging ...

Best Regards,
Chen Lifu

在 2021/3/27 17:52, Chen Lifu 写道:

From: Lifu Chen 

spinlock can be initialized automatically with DEFINE_SPINLOCK()
rather than explicitly calling spin_lock_init().

Reported-by: Hulk Robot 
Signed-off-by: Lifu Chen 
---
  arch/arm/mach-sa1100/simpad.c | 4 +---
  1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/arm/mach-sa1100/simpad.c b/arch/arm/mach-sa1100/simpad.c
index c7fb9a73e4c5..c183432880d3 100644
--- a/arch/arm/mach-sa1100/simpad.c
+++ b/arch/arm/mach-sa1100/simpad.c
@@ -45,7 +45,7 @@
   */
  
  static long cs3_shadow;

-static spinlock_t cs3_lock;
+static DEFINE_SPINLOCK(cs3_lock);
  static struct gpio_chip cs3_gpio;
  
  long simpad_get_cs3_ro(void)

@@ -379,8 +379,6 @@ static int __init simpad_init(void)
  {
int ret;
  
-	spin_lock_init(_lock);

-
cs3_gpio.label = "simpad_cs3";
cs3_gpio.base = SIMPAD_CS3_GPIO_BASE;
cs3_gpio.ngpio = 24;

.



Re: [PATCH] mm: memcontrol: fix forget to obtain the ref to objcg in split_page_memcg

2021-03-31 Thread Miaohe Lin
On 2021/4/1 11:35, Roman Gushchin wrote:
> On Thu, Apr 01, 2021 at 11:31:16AM +0800, Miaohe Lin wrote:
>> On 2021/4/1 11:01, Muchun Song wrote:
>>> Christian Borntraeger reported a warning about "percpu ref
>>> (obj_cgroup_release) <= 0 (-1) after switching to atomic".
>>> Because we forgot to obtain the reference to the objcg and
>>> wrongly obtain the reference of memcg.
>>>
>>> Reported-by: Christian Borntraeger 
>>> Signed-off-by: Muchun Song 
>>
>> Thanks for the patch.
>> Is a Fixes tag needed?
> 
> No, as the original patch hasn't been merged into the Linus's tree yet.
> So the fix can be simply squashed.
> 
> Btw, the fix looks good to me.
> 
> Acked-by: Roman Gushchin 
> 

I see. Many thanks for the explanation!

The code looks good to me.
Reviewed-by: Miaohe Lin 

>>
>>> ---
>>>  include/linux/memcontrol.h | 6 ++
>>>  mm/memcontrol.c| 6 +-
>>>  2 files changed, 11 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>> index 0e8907957227..c960fd49c3e8 100644
>>> --- a/include/linux/memcontrol.h
>>> +++ b/include/linux/memcontrol.h
>>> @@ -804,6 +804,12 @@ static inline void obj_cgroup_get(struct obj_cgroup 
>>> *objcg)
>>> percpu_ref_get(>refcnt);
>>>  }
>>>  
>>> +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
>>> +  unsigned long nr)
>>> +{
>>> +   percpu_ref_get_many(>refcnt, nr);
>>> +}
>>> +
>>>  static inline void obj_cgroup_put(struct obj_cgroup *objcg)
>>>  {
>>> percpu_ref_put(>refcnt);
>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>> index c0b83a396299..64ada9e650a5 100644
>>> --- a/mm/memcontrol.c
>>> +++ b/mm/memcontrol.c
>>> @@ -3133,7 +3133,11 @@ void split_page_memcg(struct page *head, unsigned 
>>> int nr)
>>>  
>>> for (i = 1; i < nr; i++)
>>> head[i].memcg_data = head->memcg_data;
>>> -   css_get_many(>css, nr - 1);
>>> +
>>> +   if (PageMemcgKmem(head))
>>> +   obj_cgroup_get_many(__page_objcg(head), nr - 1);
>>> +   else
>>> +   css_get_many(>css, nr - 1);
>>>  }
>>>  
>>>  #ifdef CONFIG_MEMCG_SWAP
>>>
>>
> .
> 



Re: [PATCH] mm: memcontrol: fix forget to obtain the ref to objcg in split_page_memcg

2021-03-31 Thread Roman Gushchin
On Thu, Apr 01, 2021 at 11:31:16AM +0800, Miaohe Lin wrote:
> On 2021/4/1 11:01, Muchun Song wrote:
> > Christian Borntraeger reported a warning about "percpu ref
> > (obj_cgroup_release) <= 0 (-1) after switching to atomic".
> > Because we forgot to obtain the reference to the objcg and
> > wrongly obtain the reference of memcg.
> > 
> > Reported-by: Christian Borntraeger 
> > Signed-off-by: Muchun Song 
> 
> Thanks for the patch.
> Is a Fixes tag needed?

No, as the original patch hasn't been merged into the Linus's tree yet.
So the fix can be simply squashed.

Btw, the fix looks good to me.

Acked-by: Roman Gushchin 

> 
> > ---
> >  include/linux/memcontrol.h | 6 ++
> >  mm/memcontrol.c| 6 +-
> >  2 files changed, 11 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 0e8907957227..c960fd49c3e8 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -804,6 +804,12 @@ static inline void obj_cgroup_get(struct obj_cgroup 
> > *objcg)
> > percpu_ref_get(>refcnt);
> >  }
> >  
> > +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
> > +  unsigned long nr)
> > +{
> > +   percpu_ref_get_many(>refcnt, nr);
> > +}
> > +
> >  static inline void obj_cgroup_put(struct obj_cgroup *objcg)
> >  {
> > percpu_ref_put(>refcnt);
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index c0b83a396299..64ada9e650a5 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -3133,7 +3133,11 @@ void split_page_memcg(struct page *head, unsigned 
> > int nr)
> >  
> > for (i = 1; i < nr; i++)
> > head[i].memcg_data = head->memcg_data;
> > -   css_get_many(>css, nr - 1);
> > +
> > +   if (PageMemcgKmem(head))
> > +   obj_cgroup_get_many(__page_objcg(head), nr - 1);
> > +   else
> > +   css_get_many(>css, nr - 1);
> >  }
> >  
> >  #ifdef CONFIG_MEMCG_SWAP
> > 
> 


[PATCH V2 4/4] doc: watchdog: Modify the doc related to "watchdog/%u"

2021-03-31 Thread Wang Qing
"watchdog/%u" threads has be replaced by cpu_stop_work. The current 
description is extremely misleading.
---
 Documentation/admin-guide/sysctl/kernel.rst | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/kernel.rst 
b/Documentation/admin-guide/sysctl/kernel.rst
index 1d56a6b..32b0791
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1282,11 +1282,11 @@ This parameter can be used to control the soft lockup 
detector.
 = =
 
 The soft lockup detector monitors CPUs for threads that are hogging the CPUs
-without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads
-from running. The mechanism depends on the CPUs ability to respond to timer
-interrupts which are needed for the 'watchdog/N' threads to be woken up by
-the watchdog timer function, otherwise the NMI watchdog — if enabled — can
-detect a hard lockup condition.
+without rescheduling voluntarily, and thus prevent the 'migration/N' threads
+from running, causing the watchdog work to fail to execute. The mechanism depends
+on the CPUs ability to respond to timer interrupts which are needed for the
+watchdog work to be queued by the watchdog timer function, otherwise the NMI
+watchdog — if enabled — can detect a hard lockup condition.
 
 
 stack_erasing
-- 
2.7.4



[PATCH V2 3/4] doc: watchdog: Modify the explanation related to watchdog thread

2021-03-31 Thread Wang Qing
"watchdog/%u" threads has be replaced by cpu_stop_work. The current 
description is extremely misleading.

Signed-off-by: Wang Qing 
---
 Documentation/admin-guide/lockup-watchdogs.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/lockup-watchdogs.rst 
b/Documentation/admin-guide/lockup-watchdogs.rst
index 290840c..3e09284
--- a/Documentation/admin-guide/lockup-watchdogs.rst
+++ b/Documentation/admin-guide/lockup-watchdogs.rst
@@ -39,7 +39,7 @@ in principle, they should work in any architecture where these
 subsystems are present.
 
 A periodic hrtimer runs to generate interrupts and kick the watchdog
-task. An NMI perf event is generated every "watchdog_thresh"
+job. An NMI perf event is generated every "watchdog_thresh"
 (compile-time initialized to 10 and configurable through sysctl of the
 same name) seconds to check for hardlockups. If any CPU in the system
 does not receive any hrtimer interrupt during that time the
@@ -47,7 +47,7 @@ does not receive any hrtimer interrupt during that time the
 generate a kernel warning or call panic, depending on the
 configuration.
 
-The watchdog task is a high priority kernel thread that updates a
+The watchdog job runs in a stop scheduling thread that updates a
 timestamp every time it is scheduled. If that timestamp is not updated
 for 2*watchdog_thresh seconds (the softlockup threshold) the
 'softlockup detector' (coded inside the hrtimer callback function)
-- 
2.7.4



[PATCH V2 1/4] kernel: watchdog: Modify the explanation related to watchdog thread

2021-03-31 Thread Wang Qing
The watchdog thread has been replaced by cpu_stop_work; modify the
related explanation accordingly.

Signed-off-by: Wang Qing 
---
 kernel/watchdog.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7110906..d7fb4fb
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -92,7 +92,7 @@ __setup("nmi_watchdog=", hardlockup_panic_setup);
  * own hardlockup detector.
  *
  * watchdog_nmi_enable/disable can be implemented to start and stop when
- * softlockup watchdog threads start and stop. The arch must select the
+ * softlockup watchdog start and stop. The arch must select the
  * SOFTLOCKUP_DETECTOR Kconfig.
  */
 int __weak watchdog_nmi_enable(unsigned int cpu)
@@ -322,7 +322,7 @@ static DEFINE_PER_CPU(struct completion, 
softlockup_completion);
 static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
 
 /*
- * The watchdog thread function - touches the timestamp.
+ * The watchdog feed function - touches the timestamp.
  *
  * It only runs once every sample_period seconds (4 seconds by
  * default) to reset the softlockup timestamp. If this gets delayed
@@ -551,11 +551,7 @@ static void lockup_detector_reconfigure(void)
 }
 
 /*
- * Create the watchdog thread infrastructure and configure the detector(s).
- *
- * The threads are not unparked as watchdog_allowed_mask is empty.  When
- * the threads are successfully initialized, take the proper locks and
- * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
+ * Create the watchdog infrastructure and configure the detector(s).
  */
 static __init void lockup_detector_setup(void)
 {
@@ -621,7 +617,7 @@ void lockup_detector_soft_poweroff(void)
 
 #ifdef CONFIG_SYSCTL
 
-/* Propagate any changes to the watchdog threads */
+/* Propagate any changes to the watchdog infrastructure */
 static void proc_watchdog_update(void)
 {
/* Remove impossible cpus to keep sysctl output clean. */
-- 
2.7.4



[PATCH V2 0/4] kernel/watchdog: Modify the explanation and doc related to watchdog thread

2021-03-31 Thread Wang Qing
"watchdog/%u" threads has be replaced by cpu_stop_work. The current 
description is extremely misleading, so we need to modify the 
explanation and documentation related to this.

Wang Qing (4):
  kernel: watchdog: Modify the explanation related to watchdog thread
  doc: watchdog: Delete the explanation about "watchdog/%u".
  doc: watchdog: Modify the explanation related to watchdog thread
  doc: watchdog: Modify the doc related to "watchdog/%u"

 .../admin-guide/kernel-per-CPU-kthreads.rst  | 20 
 Documentation/admin-guide/lockup-watchdogs.rst   |  4 ++--
 Documentation/admin-guide/sysctl/kernel.rst  | 10 +-
 kernel/watchdog.c| 12 
 4 files changed, 11 insertions(+), 35 deletions(-)

-- 
2.7.4



[PATCH V2 2/4] doc: watchdog: Delete the explanation about "watchdog/%u".

2021-03-31 Thread Wang Qing
"watchdog/%u" threads has be replaced by cpu_stop_work. The current description
is extremely misleading, so delete the explanation about "watchdog/%u".

Signed-off-by: Wang Qing 
---
 .../admin-guide/kernel-per-CPU-kthreads.rst  | 20 
 1 file changed, 20 deletions(-)

diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst 
b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
index 531f689..5e51ee5
--- a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
+++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
@@ -332,23 +332,3 @@ To reduce its OS jitter, do at least one of the following:
kthreads from being created in the first place.  However, please
note that this will not eliminate OS jitter, but will instead
shift it to RCU_SOFTIRQ.
-
-Name:
-  watchdog/%u
-
-Purpose:
-  Detect software lockups on each CPU.
-
-To reduce its OS jitter, do at least one of the following:
-
-1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
-   kthreads from being created in the first place.
-2. Boot with "nosoftlockup=0", which will also prevent these kthreads
-   from being created.  Other related watchdog and softlockup boot
-   parameters may be found in 
Documentation/admin-guide/kernel-parameters.rst
-   and Documentation/watchdog/watchdog-parameters.rst.
-3. Echo a zero to /proc/sys/kernel/watchdog to disable the
-   watchdog timer.
-4. Echo a large number of /proc/sys/kernel/watchdog_thresh in
-   order to reduce the frequency of OS jitter due to the watchdog
-   timer down to a level that is acceptable for your workload.
-- 
2.7.4



Re: [PATCH] mm: memcontrol: fix forget to obtain the ref to objcg in split_page_memcg

2021-03-31 Thread Miaohe Lin
On 2021/4/1 11:01, Muchun Song wrote:
> Christian Borntraeger reported a warning about "percpu ref
> (obj_cgroup_release) <= 0 (-1) after switching to atomic".
> Because we forgot to obtain the reference to the objcg and
> wrongly obtain the reference of memcg.
> 
> Reported-by: Christian Borntraeger 
> Signed-off-by: Muchun Song 

Thanks for the patch.
Is a Fixes tag needed?

> ---
>  include/linux/memcontrol.h | 6 ++
>  mm/memcontrol.c| 6 +-
>  2 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 0e8907957227..c960fd49c3e8 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -804,6 +804,12 @@ static inline void obj_cgroup_get(struct obj_cgroup 
> *objcg)
>   percpu_ref_get(>refcnt);
>  }
>  
> +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
> +unsigned long nr)
> +{
> + percpu_ref_get_many(>refcnt, nr);
> +}
> +
>  static inline void obj_cgroup_put(struct obj_cgroup *objcg)
>  {
>   percpu_ref_put(>refcnt);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c0b83a396299..64ada9e650a5 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3133,7 +3133,11 @@ void split_page_memcg(struct page *head, unsigned int 
> nr)
>  
>   for (i = 1; i < nr; i++)
>   head[i].memcg_data = head->memcg_data;
> - css_get_many(>css, nr - 1);
> +
> + if (PageMemcgKmem(head))
> + obj_cgroup_get_many(__page_objcg(head), nr - 1);
> + else
> + css_get_many(>css, nr - 1);
>  }
>  
>  #ifdef CONFIG_MEMCG_SWAP
> 



[PATCH v2 10/10] erofs: enable big pcluster feature

2021-03-31 Thread Gao Xiang
From: Gao Xiang 

Enable COMPR_CFGS and BIG_PCLUSTER since the implementations are
all settled properly.

Signed-off-by: Gao Xiang 
---
 fs/erofs/erofs_fs.h | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index ecc3a0ea0bc4..8739d3adf51f 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -20,7 +20,10 @@
 #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING0x0001
 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS  0x0002
 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER0x0002
-#define EROFS_ALL_FEATURE_INCOMPAT 
EROFS_FEATURE_INCOMPAT_LZ4_0PADDING
+#define EROFS_ALL_FEATURE_INCOMPAT \
+   (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
+EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
+EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER)
 
 #define EROFS_SB_EXTSLOT_SIZE  16
 
-- 
2.20.1



[PATCH v2 04/10] erofs: fix up inplace I/O pointer for big pcluster

2021-03-31 Thread Gao Xiang
From: Gao Xiang 

When picking up inplace I/O pages, they should be traversed in reverse
order, in alignment with the traversal order of file-backed online pages.
Also, the index should be updated together when preloading compressed pages.

Previously, only page-sized pclustersize was supported so no problem
at all. Also rename `compressedpages' to `icpage_ptr' to reflect its
functionality.

Signed-off-by: Gao Xiang 
---
 fs/erofs/zdata.c | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 7f572086b4e3..03f106ead8d2 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -204,7 +204,8 @@ struct z_erofs_collector {
 
struct z_erofs_pcluster *pcl, *tailpcl;
struct z_erofs_collection *cl;
-   struct page **compressedpages;
+   /* a pointer used to pick up inplace I/O pages */
+   struct page **icpage_ptr;
z_erofs_next_pcluster_t owned_head;
 
enum z_erofs_collectmode mode;
@@ -238,17 +239,19 @@ static void preload_compressed_pages(struct 
z_erofs_collector *clt,
 enum z_erofs_cache_alloctype type,
 struct list_head *pagepool)
 {
-   const struct z_erofs_pcluster *pcl = clt->pcl;
-   struct page **pages = clt->compressedpages;
-   pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
+   struct z_erofs_pcluster *pcl = clt->pcl;
bool standalone = true;
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+   struct page **pages;
+   pgoff_t index;
 
if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
return;
 
-   for (; pages < pcl->compressed_pages + pcl->pclusterpages; ++pages) {
+   pages = pcl->compressed_pages;
+   index = pcl->obj.index;
+   for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) {
struct page *page;
compressed_page_t t;
struct page *newpage = NULL;
@@ -360,16 +363,14 @@ int erofs_try_to_free_cached_page(struct address_space 
*mapping,
 }
 
 /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
-static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
- struct page *page)
+static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+  struct page *page)
 {
struct z_erofs_pcluster *const pcl = clt->pcl;
 
-   while (clt->compressedpages <
-  pcl->compressed_pages + pcl->pclusterpages) {
-   if (!cmpxchg(clt->compressedpages++, NULL, page))
+   while (clt->icpage_ptr > pcl->compressed_pages)
+   if (!cmpxchg(--clt->icpage_ptr, NULL, page))
return true;
-   }
return false;
 }
 
@@ -576,9 +577,8 @@ static int z_erofs_collector_begin(struct z_erofs_collector 
*clt,
z_erofs_pagevec_ctor_init(>vector, Z_EROFS_NR_INLINE_PAGEVECS,
  clt->cl->pagevec, clt->cl->vcnt);
 
-   clt->compressedpages = clt->pcl->compressed_pages;
-   if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
-   clt->compressedpages += clt->pcl->pclusterpages;
+   /* since file-backed online pages are traversed in reverse order */
+   clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages;
return 0;
 }
 
-- 
2.20.1



[PATCH v2 07/10] erofs: support parsing big pcluster compress indexes

2021-03-31 Thread Gao Xiang
From: Gao Xiang 

When the INCOMPAT_BIG_PCLUSTER sb feature is enabled, legacy compress indexes
will also have the same on-disk header as compact indexes to keep per-file
configurations instead of leaving it zeroed.

If ADVISE_BIG_PCLUSTER is set for a file, CBLKCNT will be loaded for each
pcluster in this file by parsing 1st non-head lcluster.

Signed-off-by: Gao Xiang 
---
 fs/erofs/zmap.c | 79 +
 1 file changed, 73 insertions(+), 6 deletions(-)

diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index bd7e10c2fdd3..d34ff810cc15 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -11,8 +11,10 @@
 int z_erofs_fill_inode(struct inode *inode)
 {
struct erofs_inode *const vi = EROFS_I(inode);
+   struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
 
-   if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
+   if (!erofs_sb_has_big_pcluster(sbi) &&
+   vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
vi->z_advise = 0;
vi->z_algorithmtype[0] = 0;
vi->z_algorithmtype[1] = 0;
@@ -49,7 +51,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
if (test_bit(EROFS_I_Z_INITED_BIT, >flags))
goto out_unlock;
 
-   DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
+   DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+ vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
 
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, 8);
@@ -96,7 +99,7 @@ struct z_erofs_maprecorder {
u8  type;
u16 clusterofs;
u16 delta[2];
-   erofs_blk_t pblk;
+   erofs_blk_t pblk, compressedlcs;
 };
 
 static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
@@ -159,6 +162,15 @@ static int legacy_load_cluster_from_disk(struct 
z_erofs_maprecorder *m,
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
m->clusterofs = 1 << vi->z_logical_clusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
+   if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+   if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+   DBG_BUGON(1);
+   return -EFSCORRUPTED;
+   }
+   m->compressedlcs = m->delta[0] &
+   ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+   m->delta[0] = 1;
+   }
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
break;
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
@@ -366,6 +378,58 @@ static int z_erofs_extent_lookback(struct 
z_erofs_maprecorder *m,
return 0;
 }
 
+static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
+   unsigned int initial_lcn)
+{
+   struct erofs_inode *const vi = EROFS_I(m->inode);
+   struct erofs_map_blocks *const map = m->map;
+   const unsigned int lclusterbits = vi->z_logical_clusterbits;
+   unsigned long lcn;
+   int err;
+
+   DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
+   if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
+   !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+   map->m_plen = 1 << lclusterbits;
+   return 0;
+   }
+
+   lcn = m->lcn + 1;
+   if (m->compressedlcs)
+   goto out;
+   if (lcn == initial_lcn)
+   goto err_bonus_cblkcnt;
+
+   err = z_erofs_load_cluster_from_disk(m, lcn);
+   if (err)
+   return err;
+
+   switch (m->type) {
+   case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+   if (m->delta[0] != 1)
+   goto err_bonus_cblkcnt;
+   if (m->compressedlcs)
+   break;
+   fallthrough;
+   default:
+   erofs_err(m->inode->i_sb,
+ "cannot found CBLKCNT @ lcn %lu of nid %llu",
+ lcn, vi->nid);
+   DBG_BUGON(1);
+   return -EFSCORRUPTED;
+   }
+out:
+   map->m_plen = m->compressedlcs << lclusterbits;
+   return 0;
+err_bonus_cblkcnt:
+   erofs_err(m->inode->i_sb,
+ "bogus CBLKCNT @ lcn %lu of nid %llu",
+ lcn, vi->nid);
+   DBG_BUGON(1);
+   return -EFSCORRUPTED;
+}
+
 int z_erofs_map_blocks_iter(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
@@ -377,6 +441,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
};
int err = 0;
unsigned int lclusterbits, endoff;
+   unsigned long initial_lcn;
unsigned long long ofs, end;
 
trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
@@ -395,10 +460,10 @@ int 

[PATCH v2 08/10] erofs: support parsing big pcluster compact indexes

2021-03-31 Thread Gao Xiang
From: Gao Xiang 

Different from non-compact indexes, several lclusters are packed
as the compact form at once and a unique base blkaddr is stored for
each pack, so each lcluster index takes less space on average
(e.g. 2 bytes for COMPACT_2B.) That is also why the BIG_PCLUSTER
switch should be consistent for compact head0/1.

Prior to big pcluster, the size of all pclusters is 1 lcluster.
Therefore, when a new HEAD lcluster was scanned, blkaddr would be
bumped by 1 lcluster. However, that way doesn't work anymore for
big pcluster since we actually don't know the compressed size of
pclusters in advance (before reading CBLKCNT).

So, instead, let blkaddr of each pack be the first pcluster blkaddr
with a valid CBLKCNT, in detail,

 1) if CBLKCNT starts at the pack, this first valid pcluster is
itself, e.g.
  _
 |_CBLKCNT0_|_NONHEAD_| .. |_HEAD_|_CBLKCNT1_| ... |_HEAD_| ...
 ^ = blkaddr base  ^ += CBLKCNT0   ^ += CBLKCNT1

 2) if CBLKCNT doesn't start at the pack, the first valid pcluster
is the next pcluster, e.g.
  _
 | NONHEAD_| .. |_HEAD_|_CBLKCNT0_| ... |_HEAD_|_HEAD_| ...
^ = blkaddr base^ += CBLKCNT0
   ^ += 1

When a CBLKCNT is found, blkaddr will be increased by CBLKCNT
lclusters; or, if a new HEAD is found immediately, blkaddr is bumped
by 1 instead (see the picture above.)

Also note that if CBLKCNT is at the end of the pack, instead of storing
delta1 (distance to the next HEAD lcluster) as normal NONHEADs do,
it still stores the compressed block count (delta0) since delta1
can be calculated indirectly but the block count can't.

Adjust decoding logic to fit big pcluster compact indexes as well.
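
A deliberately simplified sketch of the accumulation rule above (illustrative
userspace code with invented names, not the decoder in this patch):

  #include <stdint.h>

  /*
   * Derive the start block of each pcluster in a pack from the pack's base
   * blkaddr: bump by CBLKCNT when one was recorded, or by 1 when the next
   * lcluster is another HEAD (so there was no room for a CBLKCNT index).
   */
  static void fill_pblks(uint32_t base, const uint16_t *cblkcnt,
                         unsigned int nr_heads, uint32_t *pblk)
  {
          unsigned int k;

          for (k = 0; k < nr_heads; ++k) {
                  pblk[k] = base;
                  base += cblkcnt[k] ? cblkcnt[k] : 1;
          }
  }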

Signed-off-by: Gao Xiang 
---
 fs/erofs/zmap.c | 63 +
 1 file changed, 53 insertions(+), 10 deletions(-)

diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index d34ff810cc15..545cd5989e6a 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -77,6 +77,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
}
 
vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
+   if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
+   !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
+   !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+   erofs_err(sb, "big pcluster head1/2 of compact indexes should 
be consistent for nid %llu",
+ vi->nid);
+   return -EFSCORRUPTED;
+   }
/* paired with smp_mb() at the beginning of the function */
smp_mb();
	set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
@@ -207,6 +214,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
unsigned int vcnt, base, lo, encodebits, nblk;
int i;
u8 *in, type;
+   bool big_pcluster;
 
if (1 << amortizedshift == 4)
vcnt = 2;
@@ -215,6 +223,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
 
+   big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
base = round_down(eofs, vcnt << amortizedshift);
in = m->kaddr + base;
@@ -226,7 +235,15 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
m->type = type;
if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
m->clusterofs = 1 << lclusterbits;
-   if (i + 1 != vcnt) {
+   if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+   if (!big_pcluster) {
+   DBG_BUGON(1);
+   return -EFSCORRUPTED;
+   }
+   m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+   m->delta[0] = 1;
+   return 0;
+   } else if (i + 1 != (int)vcnt) {
m->delta[0] = lo;
return 0;
}
@@ -239,22 +256,48 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
  in, encodebits * (i - 1), &type);
if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
lo = 0;
+   else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT)
+   lo = 1;
m->delta[0] = lo + 1;
return 0;
}
m->clusterofs = lo;
m->delta[0] = 0;
/* figout out blkaddr (pblk) for HEAD lclusters */
-   nblk = 1;
-   while (i > 0) {
-   --i;
-   lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
-   if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
- 

[PATCH v2 09/10] erofs: support decompress big pcluster for lz4 backend

2021-03-31 Thread Gao Xiang
From: Gao Xiang 

Prior to big pcluster, there was only one compressed page so it was
easy to map it. However, when big pcluster is enabled, more work
needs to be done to handle multiple compressed pages. In detail,

 - (maptype 0) if there is only one compressed page + no need
   to copy inplace I/O, just map it directly what we did before;

 - (maptype 1) if there are more compressed pages + no need to
   copy inplace I/O, vmap such compressed pages instead;

 - (maptype 2) if inplace I/O needs to be copied, use per-CPU
   buffers for decompression then.

Another thing is how to detect whether inplace decompression is feasible
or not (it's still quite easy for non-big pclusters): apart from the
inplace margin calculation, the inplace I/O page reuse order also needs
to be considered for each compressed page. Currently, if the
compressed page is the xth page, it shouldn't be reused as [0 ...
nrpages_out - nrpages_in + x], otherwise a full copy will be triggered.

Although there are some extra optimization ideas for this, I'd like
to make big pcluster work correctly first and obviously it can be
further optimized later since it has nothing to do with the on-disk
format at all.
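
The decision above boils down to roughly the following (a hedged sketch only;
the real checks live in z_erofs_handle_inplace_io() below):

  enum maptype {
          MAP_DIRECT = 0, /* single compressed page, map it as before    */
          MAP_VMAP   = 1, /* multiple compressed pages, vmap them        */
          MAP_PCPU   = 2, /* inplace copy needed, use the per-CPU buffer */
  };

  static enum maptype pick_maptype(unsigned int nrpages_in, int inplace_io,
                                   int overlap_or_no_margin)
  {
          if (inplace_io && overlap_or_no_margin)
                  return MAP_PCPU;
          if (nrpages_in <= 1)
                  return MAP_DIRECT;
          return MAP_VMAP;
  }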

Signed-off-by: Gao Xiang 
---
 fs/erofs/decompressor.c | 202 
 1 file changed, 122 insertions(+), 80 deletions(-)

diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 5d9f9dbd3681..c7b1d3fe8184 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -116,44 +116,87 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
return kaddr ? 1 : 0;
 }
 
-static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
-  u8 *src, unsigned int pageofs_in)
+static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+   void *inpage, unsigned int *inputmargin, int *maptype,
+   bool support_0padding)
 {
-   /*
-* if in-place decompression is ongoing, those decompressed
-* pages should be copied in order to avoid being overlapped.
-*/
-   struct page **in = rq->in;
-   u8 *const tmp = erofs_get_pcpubuf(1);
-   u8 *tmpp = tmp;
-   unsigned int inlen = rq->inputsize - pageofs_in;
-   unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
-
-   while (tmpp < tmp + inlen) {
-   if (!src)
-   src = kmap_atomic(*in);
-   memcpy(tmpp, src + pageofs_in, count);
-   kunmap_atomic(src);
-   src = NULL;
-   tmpp += count;
-   pageofs_in = 0;
-   count = PAGE_SIZE;
+   unsigned int nrpages_in, nrpages_out;
+   unsigned int ofull, oend, inputsize, total, i, j;
+   struct page **in;
+   void *src, *tmp;
+
+   inputsize = rq->inputsize;
+   nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT;
+   oend = rq->pageofs_out + rq->outputsize;
+   ofull = PAGE_ALIGN(oend);
+   nrpages_out = ofull >> PAGE_SHIFT;
+
+   if (rq->inplace_io) {
+   if (rq->partial_decoding || !support_0padding ||
+   ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize))
+   goto docopy;
+
+   for (i = 0; i < nrpages_in; ++i) {
+   DBG_BUGON(rq->in[i] == NULL);
+   for (j = 0; j < nrpages_out - nrpages_in + i; ++j)
+   if (rq->out[j] == rq->in[i])
+   goto docopy;
+   }
+   }
+
+   if (nrpages_in <= 1) {
+   *maptype = 0;
+   return inpage;
+   }
+   kunmap_atomic(inpage);
+   might_sleep();
+   while (1) {
+   src = vm_map_ram(rq->in, nrpages_in, -1);
+   /* retry two more times (totally 3 times) */
+   if (src || ++i >= 3)
+   break;
+   vm_unmap_aliases();
+   }
+   *maptype = 1;
+   return src;
+docopy:
+   /* Or copy compressed data which can be overlapped to per-CPU buffer */
+   in = rq->in;
+   src = erofs_get_pcpubuf(nrpages_in);
+   if (!src) {
+   DBG_BUGON(1);
+   return ERR_PTR(-EFAULT);
+   }
+
+   tmp = src;
+   total = rq->inputsize;
+   while (total) {
+   unsigned int page_copycnt =
+   min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
+
+   if (!inpage)
+   inpage = kmap_atomic(*in);
+   memcpy(tmp, inpage + *inputmargin, page_copycnt);
+   kunmap_atomic(inpage);
+   inpage = NULL;
+   tmp += page_copycnt;
+   total -= page_copycnt;
++in;
+   *inputmargin = 0;
}
-   return tmp;
+   *maptype = 2;
+   return src;
 }
 
 static int 

[PATCH v2 06/10] erofs: adjust per-CPU buffers according to max_pclusterblks

2021-03-31 Thread Gao Xiang
From: Gao Xiang 

Adjust per-CPU buffers on demand now that the big pcluster definition is
available. Also, bail out on unsupported pcluster sizes according to
Z_EROFS_PCLUSTER_MAX_SIZE.

Signed-off-by: Gao Xiang 
---
 fs/erofs/decompressor.c | 16 
 fs/erofs/internal.h |  2 ++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index fb4838c0f0df..5d9f9dbd3681 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -32,6 +32,7 @@ int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_lz4_cfgs *lz4, int size)
 {
+   struct erofs_sb_info *sbi = EROFS_SB(sb);
u16 distance;
 
if (lz4) {
@@ -40,16 +41,23 @@ int z_erofs_load_lz4_config(struct super_block *sb,
return -EINVAL;
}
distance = le16_to_cpu(lz4->max_distance);
+
+   sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks);
+   if (sbi->lz4.max_pclusterblks >
+   Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) {
+   erofs_err(sb, "too large lz4 pcluster blocks %u",
+ sbi->lz4.max_pclusterblks);
+   return -EINVAL;
+   }
} else {
distance = le16_to_cpu(dsb->u1.lz4_max_distance);
+   sbi->lz4.max_pclusterblks = 1;
}
 
-   EROFS_SB(sb)->lz4.max_distance_pages = distance ?
+   sbi->lz4.max_distance_pages = distance ?
DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
LZ4_MAX_DISTANCE_PAGES;
-
-   /* TODO: use max pclusterblks after bigpcluster is enabled */
-   return erofs_pcpubuf_growsize(1);
+   return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
 }
 
 static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index c4b3938a7e56..f1305af50f67 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -63,6 +63,8 @@ struct erofs_fs_context {
 struct erofs_sb_lz4_info {
/* # of pages needed for EROFS lz4 rolling decompression */
u16 max_distance_pages;
+   /* maximum possible blocks for pclusters in the filesystem */
+   u16 max_pclusterblks;
 };
 
 struct erofs_sb_info {
-- 
2.20.1



[PATCH v2 05/10] erofs: add big physical cluster definition

2021-03-31 Thread Gao Xiang
From: Gao Xiang 

Big pcluster indicates that the size of compressed data for each physical
pcluster is no longer fixed as the block size, but could be more than 1
block (more accurately, more than 1 lcluster).

When the big pcluster feature is enabled for head0/1, delta0 of the 1st
non-head lcluster index will keep the block count of this pcluster in
lcluster units instead of 1. Otherwise, the compressed size of the pcluster
is 1 lcluster if the pcluster has no non-head lcluster index.

Also note that the BIG_PCLUSTER feature reuses the COMPR_CFGS feature bit
since it depends on COMPR_CFGS and they will be released together.
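
As a concrete illustration (numbers made up): a pcluster compressed into 16
blocks is recorded by setting the CBLKCNT flag bit in delta0 of its 1st
non-head lcluster index, i.e. delta0 = (1 << 11) | 16, and the decoder masks
the flag off again to recover 16.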

Signed-off-by: Gao Xiang 
---
 fs/erofs/erofs_fs.h | 19 +++
 fs/erofs/internal.h |  1 +
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 76777673eb63..ecc3a0ea0bc4 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -19,6 +19,7 @@
  */
 #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING0x0001
 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS  0x0002
+#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER0x0002
 #define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING
 
 #define EROFS_SB_EXTSLOT_SIZE  16
@@ -214,17 +215,20 @@ enum {
 /* 14 bytes (+ length field = 16 bytes) */
 struct z_erofs_lz4_cfgs {
__le16 max_distance;
-   u8 reserved[12];
+   __le16 max_pclusterblks;
+   u8 reserved[10];
 } __packed;
 
 /*
  * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
  *  e.g. for 4k logical cluster size,  4Bif compacted 2B is off;
  *  (4B) + 2B + (4B) if compacted 2B is on.
+ * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
+ * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
  */
-#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0
-
-#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT)
+#define Z_EROFS_ADVISE_COMPACTED_2B0x0001
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_1  0x0002
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_2  0x0004
 
 struct z_erofs_map_header {
__le32  h_reserved1;
@@ -279,6 +283,13 @@ enum {
 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS2
 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0
 
+/*
+ * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
+ * compressed block count of a compressed extent (in logical clusters, aka.
+ * block count of a pcluster).
+ */
+#define Z_EROFS_VLE_DI_D0_CBLKCNT  (1 << 11)
+
 struct z_erofs_vle_decompressed_index {
__le16 di_advise;
/* where to decompress in the head cluster */
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 06c294929069..c4b3938a7e56 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -230,6 +230,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
 
 EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
 EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
+EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
 EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
 
 /* atomic flag definitions */
-- 
2.20.1



[PATCH v2 02/10] erofs: introduce multipage per-CPU buffers

2021-03-31 Thread Gao Xiang
From: Gao Xiang 

To deal with the cases in which inplace decompression is infeasible
for some inplace I/O, per-CPU buffers were introduced to get rid of page
allocation latency and thrashing for low-latency decompression algorithms
such as lz4.

For the big pcluster feature, introduce multipage per-CPU buffers to
keep such inplace I/O pclusters temporarily as well, but note that
per-CPU pages are just virtually consecutive.

When a new big pcluster fs is mounted, its max pclustersize will be
read and per-CPU buffers can be grown if needed. Shrinking adjustable
per-CPU buffers is more complex (because we don't know if such a size
is still in use), so currently just release them all when unloading.
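
The intended call pattern looks roughly like this (a usage sketch of the API
added below, with error handling elided; not a quote from any actual caller):

  /* hypothetical caller, for illustration only */
  static void example_use_pcpubuf(unsigned int nrpages)
  {
          void *buf;

          /* at mount/config time: erofs_pcpubuf_growsize(max_pclusterblks); */
          buf = erofs_get_pcpubuf(nrpages);  /* this CPU's buffer, preemption off */
          /* ... copy or decompress compressed data into buf ... */
          erofs_put_pcpubuf(buf);            /* release it, preemption on again */
  }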

Signed-off-by: Gao Xiang 
---
 fs/erofs/Makefile   |   2 +-
 fs/erofs/decompressor.c |   8 ++-
 fs/erofs/internal.h |  24 ++-
 fs/erofs/pcpubuf.c  | 134 
 fs/erofs/super.c|   1 +
 fs/erofs/utils.c|  12 
 6 files changed, 147 insertions(+), 34 deletions(-)
 create mode 100644 fs/erofs/pcpubuf.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index af159539fc1b..1f9aced49070 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 27aa6a99b371..fb4838c0f0df 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -47,7 +47,9 @@ int z_erofs_load_lz4_config(struct super_block *sb,
EROFS_SB(sb)->lz4.max_distance_pages = distance ?
DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
LZ4_MAX_DISTANCE_PAGES;
-   return 0;
+
+   /* TODO: use max pclusterblks after bigpcluster is enabled */
+   return erofs_pcpubuf_growsize(1);
 }
 
 static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
@@ -114,7 +116,7 @@ static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
 * pages should be copied in order to avoid being overlapped.
 */
struct page **in = rq->in;
-   u8 *const tmp = erofs_get_pcpubuf(0);
+   u8 *const tmp = erofs_get_pcpubuf(1);
u8 *tmpp = tmp;
unsigned int inlen = rq->inputsize - pageofs_in;
unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
@@ -271,7 +273,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
 * compressed data is preferred.
 */
if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
-   dst = erofs_get_pcpubuf(0);
+   dst = erofs_get_pcpubuf(1);
if (IS_ERR(dst))
return PTR_ERR(dst);
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 05b02f99324c..f707d28a46d9 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -197,9 +197,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
 
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES   (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
-#define EROFS_PCPUBUF_NR_PAGES  Z_EROFS_CLUSTER_MAX_PAGES
-#else
-#define EROFS_PCPUBUF_NR_PAGES  0
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
 /* we strictly follow PAGE_SIZE and no buffer head yet */
@@ -405,24 +402,15 @@ int erofs_namei(struct inode *dir, struct qstr *name,
 /* dir.c */
 extern const struct file_operations erofs_dir_fops;
 
+/* pcpubuf.c */
+void *erofs_get_pcpubuf(unsigned int requiredpages);
+void erofs_put_pcpubuf(void *ptr);
+int erofs_pcpubuf_growsize(unsigned int nrpages);
+void erofs_pcpubuf_exit(void);
+
 /* utils.c / zdata.c */
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
 
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-void *erofs_get_pcpubuf(unsigned int pagenr);
-#define erofs_put_pcpubuf(buf) do { \
-   (void)&(buf);   \
-   preempt_enable();   \
-} while (0)
-#else
-static inline void *erofs_get_pcpubuf(unsigned int pagenr)
-{
-   return ERR_PTR(-EOPNOTSUPP);
-}
-
-#define erofs_put_pcpubuf(buf) do {} while (0)
-#endif
-
 #ifdef CONFIG_EROFS_FS_ZIP
 int erofs_workgroup_put(struct erofs_workgroup *grp);
 struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
new file mode 100644
index ..24ad0a1a4dd1
--- /dev/null
+++ b/fs/erofs/pcpubuf.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020-2021 Gao Xiang 
+ *
+ * For low-latency decompression algorithms (e.g. lz4), reserve continuous
+ * per-CPU virtual memory (in pages) in advance to store such inplace I/O
+ * data if inplace decompression is 

[PATCH v2 03/10] erofs: introduce physical cluster slab pools

2021-03-31 Thread Gao Xiang
From: Gao Xiang 

Since multiple pcluster sizes could be used at once, the number of
compressed pages will become a variable factor. It's necessary to
introduce slab pools rather than a single slab cache now.

This limits the pclustersize to 1M (Z_EROFS_PCLUSTER_MAX_SIZE), and
gets rid of the obsolete EROFS_FS_CLUSTER_PAGE_LIMIT, which has no
use now.
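
The resulting allocation round-trip looks roughly like this (a usage sketch of
the helpers added below, not a quote from the collector code):

  /* hypothetical caller, for illustration only */
  static struct z_erofs_pcluster *example_alloc(unsigned int nrpages)
  {
          struct z_erofs_pcluster *pcl;

          pcl = z_erofs_alloc_pcluster(nrpages); /* smallest pool with maxpages >= nrpages */
          if (IS_ERR(pcl))
                  return pcl;
          /* ... fill pcl->compressed_pages[0 .. pcl->pclusterpages - 1] ... */
          return pcl;  /* freed later with z_erofs_free_pcluster(pcl) */
  }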

Signed-off-by: Gao Xiang 
---
 fs/erofs/Kconfig|  14 
 fs/erofs/erofs_fs.h |   3 +
 fs/erofs/internal.h |   3 -
 fs/erofs/zdata.c| 172 +---
 fs/erofs/zdata.h|  14 ++--
 5 files changed, 126 insertions(+), 80 deletions(-)

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 74b0aaa7114c..858b3339f381 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -76,17 +76,3 @@ config EROFS_FS_ZIP
 
  If you don't want to enable compression feature, say N.
 
-config EROFS_FS_CLUSTER_PAGE_LIMIT
-   int "EROFS Cluster Pages Hard Limit"
-   depends on EROFS_FS_ZIP
-   range 1 256
-   default "1"
-   help
- Indicates maximum # of pages of a compressed
- physical cluster.
-
- For example, if files in a image were compressed
- into 8k-unit, hard limit should not be configured
- less than 2. Otherwise, the image will be refused
- to mount on this kernel.
-
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 626b7d3e9ab7..76777673eb63 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -201,6 +201,9 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
 e->e_name_len + le16_to_cpu(e->e_value_size));
 }
 
+/* maximum supported size of a physical compression cluster */
+#define Z_EROFS_PCLUSTER_MAX_SIZE  (1024 * 1024)
+
 /* available compression algorithm types (for h_algorithmtype) */
 enum {
Z_EROFS_COMPRESSION_LZ4 = 0,
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index f707d28a46d9..06c294929069 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -194,9 +194,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
return v;
 }
 #endif /* !CONFIG_SMP */
-
-/* hard limit of pages per compressed cluster */
-#define Z_EROFS_CLUSTER_MAX_PAGES   (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
 /* we strictly follow PAGE_SIZE and no buffer head yet */
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index eabfd8873e12..7f572086b4e3 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -10,6 +10,93 @@
 
 #include 
 
+/*
+ * since pclustersize is variable for big pcluster feature, introduce slab
+ * pools implementation for different pcluster sizes.
+ */
+struct z_erofs_pcluster_slab {
+   struct kmem_cache *slab;
+   unsigned int maxpages;
+   char name[48];
+};
+
+#define _PCLP(n) { .maxpages = n }
+
+static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
+   _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
+   _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
+};
+
+static void z_erofs_destroy_pcluster_pool(void)
+{
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+   if (!pcluster_pool[i].slab)
+   continue;
+   kmem_cache_destroy(pcluster_pool[i].slab);
+   pcluster_pool[i].slab = NULL;
+   }
+}
+
+static int z_erofs_create_pcluster_pool(void)
+{
+   struct z_erofs_pcluster_slab *pcs;
+   struct z_erofs_pcluster *a;
+   unsigned int size;
+
+   for (pcs = pcluster_pool;
+pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
+   size = struct_size(a, compressed_pages, pcs->maxpages);
+
+   sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
+   pcs->slab = kmem_cache_create(pcs->name, size, 0,
+ SLAB_RECLAIM_ACCOUNT, NULL);
+   if (pcs->slab)
+   continue;
+
+   z_erofs_destroy_pcluster_pool();
+   return -ENOMEM;
+   }
+   return 0;
+}
+
+static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
+{
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+   struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+   struct z_erofs_pcluster *pcl;
+
+   if (nrpages > pcs->maxpages)
+   continue;
+
+   pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+   if (!pcl)
+   return ERR_PTR(-ENOMEM);
+   pcl->pclusterpages = nrpages;
+   return pcl;
+   }
+   return ERR_PTR(-EINVAL);
+}
+
+static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
+{
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+   struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+
+   if 

[PATCH v2 00/10] erofs: add big pcluster compression support

2021-03-31 Thread Gao Xiang
Hi folks,

This is the formal version of EROFS big pcluster support, which means
EROFS can compress data into more than 1 fs block after this patchset.

{l,p}cluster are EROFS-specific concepts, standing for `logical cluster'
and `physical cluster' correspondingly. Logical cluster is the basic unit
of compress indexes in file logical mapping, e.g. it can build compress
indexes in 2 blocks rather than 1 block (currently only 1 block lcluster
is supported). Physical cluster is a container of physical compressed
blocks which contains compressed data, the size of which is a multiple
of lclustersize.

Different from previous thoughts, which had fixed-sized pclusterblks
recorded in the on-disk compress index header, our on-disk design allows
variable-sized pclusterblks now. The main reasons are
 - user data varies in compression ratio locally, so a fixed-sized
   clustersize approach is space-wasting and causes extra read
   amplification for high CR cases;

 - inplace decompression needs zero padding to guarantee its safe margin,
   but we don't want to pad more than 1 fs block for big pcluster;

 - end users can now customize the pcluster size according to data type
   since various pclustersize can exist in a file, for example, using
   different pcluster sizes for executable code and one-shot data. Such a
   design should be more flexible than many other public compression fses
   (Btw, each file in EROFS can have maximum 2 algorithms at the same time
   by using HEAD1/2, which will be formally added with LZMA support.)

In brief, EROFS can now compress from variable-sized input to
variable-sized pcluster blocks, as illustrated below:

  |<-_lcluster_->||<-_lcluster_->|
  |._|_ .. ___|___.__|
..
 . .
  .__.
  |__| .. |__|
  |<-  pcluster->|

The next step would be how to record the compressed block count in
lclusters. In compress indexes, there are 2 concepts called HEAD and
NONHEAD lclusters. The difference is that a HEAD lcluster starts a new
pcluster in the lcluster, but a NONHEAD one does not. It's easy to understand
that big pclusters cover at least 2 lclusters, thus at least 2 compress
indexes as well.

Therefore, let the delta0 (distance to its HEAD lcluster) of the first NONHEAD
compress index store the compressed block count with a special flag, forming a
new kind of index called a CBLKCNT compress index. It's also easy to know that
its delta0 is constantly 1, as illustrated below:
  
 |_HEAD_|_CBLKCNT_|_NONHEAD_|_..._|_NONHEAD_|_HEAD | HEAD |
|<-- a pcluster with CBLKCNT ->|<-- -->|
   ^ a pcluster with 1

If another HEAD follows a HEAD lcluster, there is no room to record
CBLKCNT, but it's easy to know the size of pcluster will be 1.
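
For instance (illustrative numbers only), a pcluster compressed into 16 blocks
is described by its HEAD lcluster followed by a CBLKCNT compress index whose
delta0 carries 16 with the dedicated flag bit set, while the remaining NONHEAD
indexes keep their usual delta values.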

More implementation details about this and compact indexes are in the
commit message.

On the runtime performance side, the current EROFS test results are:
 
|  file system  |   size| seq read | rand read | rand9m read |
|___|___|_ MiB/s __|__ MiB/s __|___ MiB/s ___|
|___erofs_4k|_556879872_|_ 781.4 __|__ 55.3 ___|___ 25.3  ___|
|___erofs_16k___|_452509696_|_ 864.8 __|_ 123.2 ___|___ 20.8  ___|
|___erofs_32k___|_415223808_|_ 899.8 __|_ 105.8 _*_|___ 16.8 |
|___erofs_64k___|_393814016_|_ 906.6 __|__ 66.6 _*_|___ 11.8 |
|__squashfs_8k__|_556191744_|_  64.9 __|__ 19.3 ___| 9.1 |
|__squashfs_16k_|_502661120_|_  98.9 __|__ 38.0 ___| 9.8 |
|__squashfs_32k_|_458784768_|_ 115.4 __|__ 71.6 _*_|___ 10.0 |
|_squashfs_128k_|_398204928_|_ 257.2 __|_ 253.8 _*_|___ 10.9 |
|ext4_4k|()_|_ 786.6 __|__ 28.6 ___|___ 27.8 |


* Squashfs grabs more page cache to keep all decompressed data with
  grab_cache_page_nowait() than the normal requested readahead (see
  squashfs_copy_cache and squashfs_readpage_block).
  In principle, EROFS can also cache such all decompressed data
  if necessary, yet it's low priority for now and has little use
  (rand9m is actually a better rand read workload, since the amount
   of I/O is 9m rather than full-sized 1000m).

More details are in
https://lore.kernel.org/r/20210329053654.ga3281...@xiangao.remote.csb

Also, it's easy to see that EROFS is not a fixed pcluster design, so users
can apply several optimized strategies according to data type at mkfs time.
And there is still room to optimize runtime performance for big pcluster
even further.

Finally, it passes ro_fsstress and can also successfully boot buildroot
& Android system with android-mainline repo.

current mkfs repo for big pcluster:
https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs-utils.git -b 
experimental-bigpcluster-compact

Thanks for your 

[PATCH v2 01/10] erofs: reserve physical_clusterbits[]

2021-03-31 Thread Gao Xiang
From: Gao Xiang 

The formal big pcluster design is actually more powerful / flexible than
the previous thought, whose pclustersize was fixed as power-of-2 blocks,
which was obviously inefficient and space-wasting. Instead, pclustersize
can now be set independently for each pcluster, so various pcluster
sizes can also be used together in one file if mkfs wants (for example,
according to data type and/or compression ratio).

Let's get rid of previous physical_clusterbits[] setting (also notice
that corresponding on-disk fields are still 0 for now). Therefore,
head1/2 can be used for at most 2 different algorithms in one file and
again pclustersize is now independent of these.

Signed-off-by: Gao Xiang 
---
 fs/erofs/erofs_fs.h |  4 +---
 fs/erofs/internal.h |  1 -
 fs/erofs/zdata.c|  3 +--
 fs/erofs/zmap.c | 15 ---
 4 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 17bc0b5f117d..626b7d3e9ab7 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -233,9 +233,7 @@ struct z_erofs_map_header {
__u8h_algorithmtype;
/*
 * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
-* bit 3-4 : (physical - logical) cluster bits of head 1:
-*   For example, if logical clustersize = 4096, 1 for 8192.
-* bit 5-7 : (physical - logical) cluster bits of head 2.
+* bit 3-7 : reserved.
 */
__u8h_clusterbits;
 };
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 6006391a..05b02f99324c 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -266,7 +266,6 @@ struct erofs_inode {
unsigned short z_advise;
unsigned char  z_algorithmtype[2];
unsigned char  z_logical_clusterbits;
-   unsigned char  z_physical_clusterbits[2];
};
 #endif /* CONFIG_EROFS_FS_ZIP */
};
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index cd9b76216925..eabfd8873e12 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -430,8 +430,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
else
pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
 
-   pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0];
-   pcl->clusterbits -= PAGE_SHIFT;
+   pcl->clusterbits = 0;
 
/* new pclusters should be claimed as type 1, primary and followed */
pcl->next = clt->owned_head;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 14d2de35110c..bd7e10c2fdd3 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -17,11 +17,8 @@ int z_erofs_fill_inode(struct inode *inode)
vi->z_algorithmtype[0] = 0;
vi->z_algorithmtype[1] = 0;
vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
-   vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits;
-   vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits;
	set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
}
-
inode->i_mapping->a_ops = _erofs_aops;
return 0;
 }
@@ -77,18 +74,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
}
 
vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
-   vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits +
-   ((h->h_clusterbits >> 3) & 3);
-
-   if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) {
-   erofs_err(sb, "unsupported physical clusterbits %u for nid 
%llu, please upgrade kernel",
- vi->z_physical_clusterbits[0], vi->nid);
-   err = -EOPNOTSUPP;
-   goto unmap_done;
-   }
-
-   vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
-   ((h->h_clusterbits >> 5) & 7);
/* paired with smp_mb() at the beginning of the function */
smp_mb();
	set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
-- 
2.20.1



Re: [PATCH v4 1/1] x86/tdx: Handle MWAIT, MONITOR and WBINVD

2021-03-31 Thread Andi Kleen
> The hardware (and VMMs and SEAM) have ways of telling the guest kernel
> what is supported: CPUID.  If it screws up, and the guest gets an
> unexpected #VE, so be it.

The main reason for disabling stuff is actually that we don't need
to harden it. All these things are potential attack paths.

> 
> We don't have all kinds of crazy handling in the kernel's #UD handler
> just in case a CPU mis-enumerates a feature and we get a #UD.  We have
> to trust the underlying hardware to be sane.  If it isn't, we die a
> horrible death as fast as possible.  Why should TDX be any different?

That's what the original patch did -- no unnecessary checks -- but reviewers
keep asking for the extra checks, so Sathya added more. We have the not
unusual problem here that reviewers don't agree among themselves.

-Andi


[PATCH 1/1] watchdog: Fix a typo in Kconfig

2021-03-31 Thread Wong Vee Khee
s/thershold/threshold

Cc: Vijayakannan Ayyathurai 
Signed-off-by: Wong Vee Khee 
---
 drivers/watchdog/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 0470dc15c085..aa382e5edfef 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -2111,7 +2111,7 @@ config KEEMBAY_WATCHDOG
 This option enable support for an In-secure watchdog timer driver for
 Intel Keem Bay SoC. This WDT has a 32 bit timer and decrements in every
 count unit. An interrupt will be triggered, when the count crosses
-the thershold configured in the register.
+the threshold configured in the register.
 
 To compile this driver as a module, choose M here: the
 module will be called keembay_wdt.
-- 
2.25.1



[PATCH net-next] net: ipv6: Refactor in rt6_age_examine_exception

2021-03-31 Thread Xu Jia
The logic in rt6_age_examine_exception() is confusing. This commit
refactors the code to make the intent clearer.

Signed-off-by: Xu Jia 
---
 net/ipv6/route.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ebb7519bec2a..f15c7605b11d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2085,13 +2085,10 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
 
if (rt->rt6i_flags & RTF_GATEWAY) {
struct neighbour *neigh;
-   __u8 neigh_flags = 0;
 
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
-   if (neigh)
-   neigh_flags = neigh->flags;
 
-   if (!(neigh_flags & NTF_ROUTER)) {
+   if (!(neigh && (neigh->flags & NTF_ROUTER))) {
RT6_TRACE("purging route %p via non-router but 
gateway\n",
  rt);
rt6_remove_exception(bucket, rt6_ex);
-- 
2.25.1



[PATCH -next v2] staging: greybus: camera: Switch to memdup_user_nul()

2021-03-31 Thread Yang Yingliang
Use memdup_user_nul() helper instead of open-coding to
simplify the code.

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
 drivers/staging/greybus/camera.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/staging/greybus/camera.c b/drivers/staging/greybus/camera.c
index b570e13394ac..2ecdc1bc5092 100644
--- a/drivers/staging/greybus/camera.c
+++ b/drivers/staging/greybus/camera.c
@@ -1120,16 +1120,9 @@ static ssize_t gb_camera_debugfs_write(struct file *file,
if (len > 1024)
return -EINVAL;
 
-   kbuf = kmalloc(len + 1, GFP_KERNEL);
-   if (!kbuf)
-   return -ENOMEM;
-
-   if (copy_from_user(kbuf, buf, len)) {
-   ret = -EFAULT;
-   goto done;
-   }
-
-   kbuf[len] = '\0';
+   kbuf = memdup_user_nul(buf, len);
+   if (IS_ERR(kbuf))
+   return PTR_ERR(kbuf);
 
ret = op->execute(gcam, kbuf, len);
 
-- 
2.25.1



[PATCH] Bluetooth: Check inquiry status before sending one

2021-03-31 Thread Archie Pusaka
From: Archie Pusaka 

There is a possibility that the HCI_INQUIRY flag is set but we still
send HCI_OP_INQUIRY anyway.

Such a case can be reproduced by connecting to an LE device while
active scanning. When the device is discovered, we initiate a
connection, stop LE Scan, and send Discovery MGMT with status
disabled, but we don't cancel the inquiry.

Signed-off-by: Archie Pusaka 
Reviewed-by: Sonny Sasaka 
---

 net/bluetooth/hci_request.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 8ace5d34b01e..5a5ec7ed15ea 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -2952,6 +2952,9 @@ static int bredr_inquiry(struct hci_request *req, 
unsigned long opt)
const u8 liac[3] = { 0x00, 0x8b, 0x9e };
struct hci_cp_inquiry cp;
 
+   if (test_bit(HCI_INQUIRY, >hdev->flags))
+   return 0;
+
bt_dev_dbg(req->hdev, "");
 
hci_dev_lock(req->hdev);
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH 4/9] platform/x86: intel_pmc_core: Show LPM residency in microseconds

2021-03-31 Thread David E. Box
From: Gayatri Kammela 

Modify the low power mode (LPM or sub-state) residency counters to display
in microseconds just like the slp_s0_residency counter. The granularity of
the counter is approximately 30.5us per tick, so the step is stored
pre-doubled (61) and the product is divided by two to maintain accuracy.
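
For example (numbers for illustration only), a raw reading of 1000 ticks
becomes (1000 * 61) >> 1 = 30500 microseconds, i.e. 1000 * 30.5us.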

Signed-off-by: Gayatri Kammela 
Signed-off-by: David E. Box 
---
 drivers/platform/x86/intel_pmc_core.c | 14 --
 drivers/platform/x86/intel_pmc_core.h |  3 +++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/platform/x86/intel_pmc_core.c 
b/drivers/platform/x86/intel_pmc_core.c
index ce300c2942d0..ba0db301f07b 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -578,6 +578,7 @@ static const struct pmc_reg_map tgl_reg_map = {
.pm_read_disable_bit = CNP_PMC_READ_DISABLE_BIT,
.ltr_ignore_max = TGL_NUM_IP_IGN_ALLOWED,
.lpm_num_maps = TGL_LPM_NUM_MAPS,
+   .lpm_res_counter_step_x2 = TGL_PMC_LPM_RES_COUNTER_STEP_X2,
.lpm_en_offset = TGL_LPM_EN_OFFSET,
.lpm_priority_offset = TGL_LPM_PRI_OFFSET,
.lpm_residency_offset = TGL_LPM_RESIDENCY_OFFSET,
@@ -1026,17 +1027,26 @@ static int pmc_core_ltr_show(struct seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(pmc_core_ltr);
 
+static inline u64 adjust_lpm_residency(struct pmc_dev *pmcdev, u32 offset,
+  const int lpm_adj_x2)
+{
+   u64 lpm_res = pmc_core_reg_read(pmcdev, offset);
+
+   return GET_X2_COUNTER((u64)lpm_adj_x2 * lpm_res);
+}
+
 static int pmc_core_substate_res_show(struct seq_file *s, void *unused)
 {
struct pmc_dev *pmcdev = s->private;
+   const int lpm_adj_x2 = pmcdev->map->lpm_res_counter_step_x2;
u32 offset = pmcdev->map->lpm_residency_offset;
int i, mode;
 
seq_printf(s, "%-10s %-15s\n", "Substate", "Residency");
 
pmc_for_each_mode(i, mode, pmcdev) {
-   seq_printf(s, "%-10s %-15u\n", pmc_lpm_modes[mode],
-  pmc_core_reg_read(pmcdev, offset + (4 * mode)));
+   seq_printf(s, "%-10s %-15llu\n", pmc_lpm_modes[mode],
+  adjust_lpm_residency(pmcdev, offset + (4 * mode), lpm_adj_x2));
}
 
return 0;
diff --git a/drivers/platform/x86/intel_pmc_core.h 
b/drivers/platform/x86/intel_pmc_core.h
index 5a4e3a49f5b1..3800c1ba6fb7 100644
--- a/drivers/platform/x86/intel_pmc_core.h
+++ b/drivers/platform/x86/intel_pmc_core.h
@@ -188,9 +188,11 @@ enum ppfear_regs {
 #define ICL_PMC_SLP_S0_RES_COUNTER_STEP0x64
 
 #define LPM_MAX_NUM_MODES  8
+#define GET_X2_COUNTER(v)  ((v) >> 1)
 
 #define TGL_NUM_IP_IGN_ALLOWED 22
 #define TGL_PMC_SLP_S0_RES_COUNTER_STEP0x7A
+#define TGL_PMC_LPM_RES_COUNTER_STEP_X261  /* 30.5us * 2 */
 
 /*
  * Tigerlake Power Management Controller register offsets
@@ -263,6 +265,7 @@ struct pmc_reg_map {
const u32 pm_vric1_offset;
/* Low Power Mode registers */
const int lpm_num_maps;
+   const int lpm_res_counter_step_x2;
const u32 lpm_en_offset;
const u32 lpm_priority_offset;
const u32 lpm_residency_offset;
-- 
2.25.1



[PATCH 5/9] platform/x86: intel_pmc_core: Get LPM requirements for Tiger Lake

2021-03-31 Thread David E. Box
From: Gayatri Kammela 

Platforms that support low power modes (LPM) such as Tiger Lake maintain
requirements for each sub-state that are readable in the PMC. However, unlike
the LPM status registers, the requirement registers are not memory mapped but
are available from an ACPI _DSM. Collect the requirements for Tiger Lake using
the _DSM method and store them in a buffer.

Signed-off-by: Gayatri Kammela 
Co-developed-by: David E. Box 
Signed-off-by: David E. Box 
---
 drivers/platform/x86/intel_pmc_core.c | 49 +++
 drivers/platform/x86/intel_pmc_core.h |  2 ++
 2 files changed, 51 insertions(+)

diff --git a/drivers/platform/x86/intel_pmc_core.c 
b/drivers/platform/x86/intel_pmc_core.c
index ba0db301f07b..0ec26a4c715e 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -23,7 +23,9 @@
 #include 
 #include 
 #include 
+#include 
 
+#include 
 #include 
 #include 
 #include 
@@ -31,6 +33,9 @@
 
 #include "intel_pmc_core.h"
 
+#define ACPI_S0IX_DSM_UUID "57a6512e-3979-4e9d-9708-ff13b2508972"
+#define ACPI_GET_LOW_MODE_REGISTERS1
+
 /* PKGC MSRs are common across Intel Core SoCs */
 static const struct pmc_bit_map msr_map[] = {
{"Package C2",  MSR_PKG_C2_RESIDENCY},
@@ -587,6 +592,46 @@ static const struct pmc_reg_map tgl_reg_map = {
.lpm_live_status_offset = TGL_LPM_LIVE_STATUS_OFFSET,
 };
 
+static void pmc_core_get_tgl_lpm_reqs(struct platform_device *pdev)
+{
+   struct pmc_dev *pmcdev = platform_get_drvdata(pdev);
+   const int num_maps = pmcdev->map->lpm_num_maps;
+   size_t lpm_size = LPM_MAX_NUM_MODES * num_maps * 4;
+   union acpi_object *out_obj;
+   struct acpi_device *adev;
+   guid_t s0ix_dsm_guid;
+   u32 *lpm_req_regs;
+
+   adev = ACPI_COMPANION(&pdev->dev);
+   if (!adev)
+   return;
+
+   lpm_req_regs = devm_kzalloc(&pdev->dev, lpm_size * sizeof(u32),
+GFP_KERNEL);
+   if (!lpm_req_regs)
+   return;
+
+   guid_parse(ACPI_S0IX_DSM_UUID, &s0ix_dsm_guid);
+
+   out_obj = acpi_evaluate_dsm(adev->handle, &s0ix_dsm_guid, 0,
+   ACPI_GET_LOW_MODE_REGISTERS, NULL);
+   if (out_obj && out_obj->type == ACPI_TYPE_BUFFER) {
+   u32 *addr = (u32 *)out_obj->buffer.pointer;
+   int size = out_obj->buffer.length;
+
+   if (size != lpm_size)
+   return;
+
+   memcpy_fromio(lpm_req_regs, addr, lpm_size);
+   } else
+   acpi_handle_debug(adev->handle,
+ "_DSM function 0 evaluation failed\n");
+
+   ACPI_FREE(out_obj);
+
+   pmcdev->lpm_req_regs = lpm_req_regs;
+}
+
 static inline u32 pmc_core_reg_read(struct pmc_dev *pmcdev, int reg_offset)
 {
return readl(pmcdev->regbase + reg_offset);
@@ -1312,10 +1357,14 @@ static int pmc_core_probe(struct platform_device *pdev)
return -ENOMEM;
 
	mutex_init(&pmcdev->lock);
+
pmcdev->pmc_xram_read_bit = pmc_core_check_read_lock_bit(pmcdev);
pmc_core_get_low_power_modes(pmcdev);
pmc_core_do_dmi_quirks(pmcdev);
 
+   if (pmcdev->map == _reg_map)
+   pmc_core_get_tgl_lpm_reqs(pdev);
+
/*
 * On TGL, due to a hardware limitation, the GBE LTR blocks PC10 when
 * a cable is attached. Tell the PMC to ignore it.
diff --git a/drivers/platform/x86/intel_pmc_core.h 
b/drivers/platform/x86/intel_pmc_core.h
index 3800c1ba6fb7..81d797feed33 100644
--- a/drivers/platform/x86/intel_pmc_core.h
+++ b/drivers/platform/x86/intel_pmc_core.h
@@ -288,6 +288,7 @@ struct pmc_reg_map {
  * @s0ix_counter:  S0ix residency (step adjusted)
  * @num_modes: Count of enabled modes
  * @lpm_en_modes:  Array of enabled modes from lowest to highest priority
+ * @lpm_req_regs:  List of substate requirements
  *
  * pmc_dev contains info about power management controller device.
  */
@@ -304,6 +305,7 @@ struct pmc_dev {
u64 s0ix_counter;
int num_modes;
int lpm_en_modes[LPM_MAX_NUM_MODES];
+   u32 *lpm_req_regs;
 };
 
 #define pmc_for_each_mode(i, mode, pmcdev) \
-- 
2.25.1



[PATCH 0/9] intel_pmc_core: Add sub-state requirements and mode latching support

2021-03-31 Thread David E. Box
- Patch 1 and 2 remove the use of the global struct pmc_dev
- Patches 3-7 add support for reading low power mode sub-state
  requirements, latching sub-state status on different low power mode
  events, and displaying the sub-state residency in microseconds
- Patch 8 adds missing LTR IPs for TGL
- Patch 9 adds support for ADL-P which is based on TGL

Applied on top of latest 5.12-rc2 based hans-review/review-hans

David E. Box (4):
  platform/x86: intel_pmc_core: Don't use global pmcdev in quirks
  platform/x86: intel_pmc_core: Remove global struct pmc_dev
  platform/x86: intel_pmc_core: Add option to set/clear LPM mode
  platform/x86: intel_pmc_core: Add support for Alder Lake PCH-P

Gayatri Kammela (5):
  platform/x86: intel_pmc_core: Handle sub-states generically
  platform/x86: intel_pmc_core: Show LPM residency in microseconds
  platform/x86: intel_pmc_core: Get LPM requirements for Tiger Lake
  platform/x86: intel_pmc_core: Add requirements file to debugfs
  platform/x86: intel_pmc_core: Add LTR registers for Tiger Lake

 drivers/platform/x86/intel_pmc_core.c | 359 +++---
 drivers/platform/x86/intel_pmc_core.h |  47 +++-
 2 files changed, 370 insertions(+), 36 deletions(-)

-- 
2.25.1



[PATCH 3/9] platform/x86: intel_pmc_core: Handle sub-states generically

2021-03-31 Thread David E. Box
From: Gayatri Kammela 

The current implementation of pmc_core_substate_res_show() is written
specifically for Tiger Lake. However, new platforms will also have
sub-states and may support different modes. Therefore rewrite the code to
handle sub-states generically.

Read the number and type of enabled states from the PMC. Use the Low
Power Mode (LPM) priority register to store the states in order from
shallowest to deepest for displaying. Add a for_each macro to simplify
this. While changing the sub-state display it makes sense to show only the
"enabled" sub-states instead of showing all possible ones. After this
patch, the debugfs file looks like this:

Substate   Residency
S0i2.0 0
S0i3.0 0
S0i2.1 9329279
S0i3.1 0
S0i3.2 0
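
The priority-byte decoding amounts to the following (an illustrative,
standalone sketch, not the driver code below):

  #include <stdint.h>

  /*
   * Each priority byte packs two modes: bits 3:0 hold the priority slot of
   * the even mode, bits 7:4 the slot of the odd one (slots assumed < 8 here).
   */
  static void decode_priority_byte(uint8_t byte, unsigned int mode,
                                   uint8_t lpm_priority[8])
  {
          lpm_priority[byte & 0xf] = mode;      /* e.g. byte 0x51: mode 2 -> slot 1 */
          lpm_priority[byte >> 4]  = mode + 1;  /*      byte 0x51: mode 3 -> slot 5 */
  }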

Suggested-by: David E. Box 
Signed-off-by: Gayatri Kammela 
Signed-off-by: David E. Box 
---
 drivers/platform/x86/intel_pmc_core.c | 59 ++-
 drivers/platform/x86/intel_pmc_core.h | 18 +++-
 2 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/drivers/platform/x86/intel_pmc_core.c 
b/drivers/platform/x86/intel_pmc_core.c
index 5ca40fe3da59..ce300c2942d0 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -577,8 +577,9 @@ static const struct pmc_reg_map tgl_reg_map = {
.pm_cfg_offset = CNP_PMC_PM_CFG_OFFSET,
.pm_read_disable_bit = CNP_PMC_READ_DISABLE_BIT,
.ltr_ignore_max = TGL_NUM_IP_IGN_ALLOWED,
-   .lpm_modes = tgl_lpm_modes,
+   .lpm_num_maps = TGL_LPM_NUM_MAPS,
.lpm_en_offset = TGL_LPM_EN_OFFSET,
+   .lpm_priority_offset = TGL_LPM_PRI_OFFSET,
.lpm_residency_offset = TGL_LPM_RESIDENCY_OFFSET,
.lpm_sts = tgl_lpm_maps,
.lpm_status_offset = TGL_LPM_STATUS_OFFSET,
@@ -1028,18 +1029,14 @@ DEFINE_SHOW_ATTRIBUTE(pmc_core_ltr);
 static int pmc_core_substate_res_show(struct seq_file *s, void *unused)
 {
struct pmc_dev *pmcdev = s->private;
-   const char **lpm_modes = pmcdev->map->lpm_modes;
u32 offset = pmcdev->map->lpm_residency_offset;
-   u32 lpm_en;
-   int index;
+   int i, mode;
 
-   lpm_en = pmc_core_reg_read(pmcdev, pmcdev->map->lpm_en_offset);
-   seq_printf(s, "status substate residency\n");
-   for (index = 0; lpm_modes[index]; index++) {
-   seq_printf(s, "%7s %7s %-15u\n",
-  BIT(index) & lpm_en ? "Enabled" : " ",
-  lpm_modes[index], pmc_core_reg_read(pmcdev, offset));
-   offset += 4;
+   seq_printf(s, "%-10s %-15s\n", "Substate", "Residency");
+
+   pmc_for_each_mode(i, mode, pmcdev) {
+   seq_printf(s, "%-10s %-15u\n", pmc_lpm_modes[mode],
+  pmc_core_reg_read(pmcdev, offset + (4 * mode)));
}
 
return 0;
@@ -1091,6 +1088,45 @@ static int pmc_core_pkgc_show(struct seq_file *s, void 
*unused)
 }
 DEFINE_SHOW_ATTRIBUTE(pmc_core_pkgc);
 
+static void pmc_core_get_low_power_modes(struct pmc_dev *pmcdev)
+{
+   u8 lpm_priority[LPM_MAX_NUM_MODES];
+   u32 lpm_en;
+   int mode, i, p;
+
+   /* Use LPM Maps to indicate support for substates */
+   if (!pmcdev->map->lpm_num_maps)
+   return;
+
+   lpm_en = pmc_core_reg_read(pmcdev, pmcdev->map->lpm_en_offset);
+   pmcdev->num_modes = hweight32(lpm_en);
+
+   /* Each byte contains information for 2 modes (7:4 and 3:0) */
+   for (mode = 0; mode < LPM_MAX_NUM_MODES; mode += 2) {
+   u8 priority = pmc_core_reg_read_byte(pmcdev,
+   pmcdev->map->lpm_priority_offset + (mode / 2));
+   int pri0 = GENMASK(3, 0) & priority;
+   int pri1 = (GENMASK(7, 4) & priority) >> 4;
+
+   lpm_priority[pri0] = mode;
+   lpm_priority[pri1] = mode + 1;
+   }
+
+   /*
+* Loop though all modes from lowest to highest priority,
+* and capture all enabled modes in order
+*/
+   i = 0;
+   for (p = LPM_MAX_NUM_MODES - 1; p >= 0; p--) {
+   int mode = lpm_priority[p];
+
+   if (!(BIT(mode) & lpm_en))
+   continue;
+
+   pmcdev->lpm_en_modes[i++] = mode;
+   }
+}
+
 static void pmc_core_dbgfs_unregister(struct pmc_dev *pmcdev)
 {
debugfs_remove_recursive(pmcdev->dbgfs_dir);
@@ -1267,6 +1303,7 @@ static int pmc_core_probe(struct platform_device *pdev)
 
mutex_init(>lock);
pmcdev->pmc_xram_read_bit = pmc_core_check_read_lock_bit(pmcdev);
+   pmc_core_get_low_power_modes(pmcdev);
pmc_core_do_dmi_quirks(pmcdev);
 
/*
diff --git a/drivers/platform/x86/intel_pmc_core.h 
b/drivers/platform/x86/intel_pmc_core.h
index f33cd2c34835..5a4e3a49f5b1 100644
--- a/drivers/platform/x86/intel_pmc_core.h
+++ b/drivers/platform/x86/intel_pmc_core.h
@@ -187,6 +187,8 @@ enum ppfear_regs {
 #define ICL_PMC_LTR_WIGIG  0x1BFC
 #define 

[PATCH 8/9] platform/x86: intel_pmc_core: Add LTR registers for Tiger Lake

2021-03-31 Thread David E. Box
From: Gayatri Kammela 

Just like Ice Lake, Tiger Lake uses Cannon Lake's LTR information
and supports a few additional registers. Hence add the LTR registers
specific to Tiger Lake to the cnp_ltr_show_map[].

Also adjust the number of LTR IPs for Tiger Lake to the correct amount.

Signed-off-by: Gayatri Kammela 
Signed-off-by: David E. Box 
---
 drivers/platform/x86/intel_pmc_core.c | 2 ++
 drivers/platform/x86/intel_pmc_core.h | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/x86/intel_pmc_core.c 
b/drivers/platform/x86/intel_pmc_core.c
index 458c0056e7a1..9168062c927e 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -383,6 +383,8 @@ static const struct pmc_bit_map cnp_ltr_show_map[] = {
 * a list of core SoCs using this.
 */
{"WIGIG",   ICL_PMC_LTR_WIGIG},
+   {"THC0",TGL_PMC_LTR_THC0},
+   {"THC1",TGL_PMC_LTR_THC1},
/* Below two cannot be used for LTR_IGNORE */
{"CURRENT_PLATFORM",CNP_PMC_LTR_CUR_PLT},
{"AGGREGATED_SYSTEM",   CNP_PMC_LTR_CUR_ASLT},
diff --git a/drivers/platform/x86/intel_pmc_core.h 
b/drivers/platform/x86/intel_pmc_core.h
index f41f61aa7008..634130b589a2 100644
--- a/drivers/platform/x86/intel_pmc_core.h
+++ b/drivers/platform/x86/intel_pmc_core.h
@@ -192,8 +192,10 @@ enum ppfear_regs {
 #define ETR3_CLEAR_LPM_EVENTS_BIT  28
 #define LPM_STS_LATCH_MODE_BIT 31
 
-#define TGL_NUM_IP_IGN_ALLOWED 22
 #define TGL_PMC_SLP_S0_RES_COUNTER_STEP0x7A
+#define TGL_PMC_LTR_THC0   0x1C04
+#define TGL_PMC_LTR_THC1   0x1C08
+#define TGL_NUM_IP_IGN_ALLOWED 23
 #define TGL_PMC_LPM_RES_COUNTER_STEP_X261  /* 30.5us * 2 */
 
 /*
-- 
2.25.1



[PATCH 7/9] platform/x86: intel_pmc_core: Add option to set/clear LPM mode

2021-03-31 Thread David E. Box
By default the Low Power Mode (LPM or sub-state) status registers will
latch condition status on every entry into Package C10. This is
configurable in the PMC to allow latching on any achievable sub-state. Add
a debugfs file to support this.

Also add the option to clear the status registers to 0. Clearing the status
registers before testing removes ambiguity around when the current values
were set.

The new file, lpm_latch_mode, looks like this:

[c10] S0i2.0 S0i3.0 S0i2.1 S0i3.1 S0i3.2 clear
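
As a usage illustration (hypothetical session, not from a test log): writing
one of the sub-state names, e.g. "S0i2.1", latches status on entry to that
sub-state; writing "c10" restores the default latch-on-C10 behaviour; and
writing "clear" zeroes the status registers before a new measurement.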

Signed-off-by: David E. Box 
---
 drivers/platform/x86/intel_pmc_core.c | 94 +++
 drivers/platform/x86/intel_pmc_core.h | 20 ++
 2 files changed, 114 insertions(+)

diff --git a/drivers/platform/x86/intel_pmc_core.c 
b/drivers/platform/x86/intel_pmc_core.c
index 0b47a1da5f49..458c0056e7a1 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -584,6 +584,8 @@ static const struct pmc_reg_map tgl_reg_map = {
.ltr_ignore_max = TGL_NUM_IP_IGN_ALLOWED,
.lpm_num_maps = TGL_LPM_NUM_MAPS,
.lpm_res_counter_step_x2 = TGL_PMC_LPM_RES_COUNTER_STEP_X2,
+   .etr3_offset = TGL_ETR3_OFFSET,
+   .lpm_sts_latch_en_offset = TGL_LPM_STS_LATCH_EN_OFFSET,
.lpm_en_offset = TGL_LPM_EN_OFFSET,
.lpm_priority_offset = TGL_LPM_PRI_OFFSET,
.lpm_residency_offset = TGL_LPM_RESIDENCY_OFFSET,
@@ -1202,6 +1204,95 @@ static int pmc_core_substate_req_regs_show(struct seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(pmc_core_substate_req_regs);
 
+static int pmc_core_lpm_latch_mode_show(struct seq_file *s, void *unused)
+{
+   struct pmc_dev *pmcdev = s->private;
+   bool c10;
+   u32 reg;
+   int idx, mode;
+
+   reg = pmc_core_reg_read(pmcdev, pmcdev->map->lpm_sts_latch_en_offset);
+   if (reg & BIT(LPM_STS_LATCH_MODE_BIT)) {
+   seq_puts(s, "c10");
+   c10 = false;
+   } else {
+   seq_puts(s, "[c10]");
+   c10 = true;
+   }
+
+   pmc_for_each_mode(idx, mode, pmcdev) {
+   if ((BIT(mode) & reg) && !c10)
+   seq_printf(s, " [%s]", pmc_lpm_modes[mode]);
+   else
+   seq_printf(s, " %s", pmc_lpm_modes[mode]);
+   }
+
+   seq_puts(s, " clear\n");
+
+   return 0;
+}
+
+static ssize_t pmc_core_lpm_latch_mode_write(struct file *file,
+const char __user *userbuf,
+size_t count, loff_t *ppos)
+{
+   struct seq_file *s = file->private_data;
+   struct pmc_dev *pmcdev = s->private;
+   bool clear = false, c10 = false;
+   unsigned char buf[10] = {0};
+   size_t ret;
+   int mode;
+   u32 reg;
+
+   ret = simple_write_to_buffer(buf, 10, ppos, userbuf, count);
+   if (ret < 0)
+   return ret;
+
+   mode = sysfs_match_string(pmc_lpm_modes, buf);
+   if (mode < 0) {
+   if (strncmp("clear", buf, 5) == 0)
+   clear = true;
+   else if (strncmp("c10", buf, 3) == 0)
+   c10 = true;
+   else
+   return mode;
+   }
+
+   if (clear) {
+   mutex_lock(&pmcdev->lock);
+
+   reg = pmc_core_reg_read(pmcdev, pmcdev->map->etr3_offset);
+   reg |= BIT(ETR3_CLEAR_LPM_EVENTS_BIT);
+   pmc_core_reg_write(pmcdev, pmcdev->map->etr3_offset, reg);
+
+   mutex_unlock(&pmcdev->lock);
+
+   return count;
+   }
+
+   if (c10) {
+   mutex_lock(&pmcdev->lock);
+
+   reg = pmc_core_reg_read(pmcdev, pmcdev->map->lpm_sts_latch_en_offset);
+   reg &= ~BIT(LPM_STS_LATCH_MODE_BIT);
+   pmc_core_reg_write(pmcdev, pmcdev->map->lpm_sts_latch_en_offset, reg);
+
+   mutex_unlock(&pmcdev->lock);
+
+   return count;
+   }
+
+   /*
+* For LPM mode latching we set the latch enable bit and selected mode
+* and clear everything else.
+*/
+   reg = BIT(LPM_STS_LATCH_MODE_BIT) | BIT(mode);
+   pmc_core_reg_write(pmcdev, pmcdev->map->lpm_sts_latch_en_offset, reg);
+
+   return count;
+}
+DEFINE_PMC_CORE_ATTR_WRITE(pmc_core_lpm_latch_mode);
+
 static int pmc_core_pkgc_show(struct seq_file *s, void *unused)
 {
struct pmc_dev *pmcdev = s->private;
@@ -1320,6 +1411,9 @@ static void pmc_core_dbgfs_register(struct pmc_dev *pmcdev)
debugfs_create_file("substate_live_status_registers", 0444,
pmcdev->dbgfs_dir, pmcdev,
&pmc_core_substate_l_sts_regs_fops);
+   debugfs_create_file("lpm_latch_mode", 0644,
+   pmcdev->dbgfs_dir, pmcdev,
+   &pmc_core_lpm_latch_mode_fops);
}
 
if (pmcdev->lpm_req_regs) {
diff --git 

[PATCH 9/9] platform/x86: intel_pmc_core: Add support for Alder Lake PCH-P

2021-03-31 Thread David E. Box
Alder Lake PCH-P is based on Tiger Lake PCH.

Signed-off-by: David E. Box 
---
 drivers/platform/x86/intel_pmc_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/platform/x86/intel_pmc_core.c 
b/drivers/platform/x86/intel_pmc_core.c
index 9168062c927e..88d582df829f 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -1440,6 +1440,7 @@ static const struct x86_cpu_id intel_pmc_core_ids[] = {
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT,_reg_map),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L,  _reg_map),
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,  _reg_map),
+   X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, _reg_map),
{}
 };
 
-- 
2.25.1



[PATCH 1/9] platform/x86: intel_pmc_core: Don't use global pmcdev in quirks

2021-03-31 Thread David E. Box
The DMI callbacks, used for quirks, currently access the PMC by getting
the address of a global pmc_dev struct. Instead, have the callbacks set a
global quirk-specific variable. In probe, after calling dmi_check_system(),
pass pmc_dev to a function that will handle each quirk if its variable
condition is met. This allows removing the global pmc_dev later.

Signed-off-by: David E. Box 
---
 drivers/platform/x86/intel_pmc_core.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/platform/x86/intel_pmc_core.c 
b/drivers/platform/x86/intel_pmc_core.c
index b5888aeb4bcf..260d49dca1ad 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -1186,9 +1186,15 @@ static const struct pci_device_id pmc_pci_ids[] = {
  * the platform BIOS enforces 24Mhz crystal to shutdown
  * before PMC can assert SLP_S0#.
  */
+static bool xtal_ignore;
 static int quirk_xtal_ignore(const struct dmi_system_id *id)
 {
-   struct pmc_dev *pmcdev = &pmc;
+   xtal_ignore = true;
+   return 0;
+}
+
+static void pmc_core_xtal_ignore(struct pmc_dev *pmcdev)
+{
u32 value;
 
value = pmc_core_reg_read(pmcdev, pmcdev->map->pm_vric1_offset);
@@ -1197,7 +1203,6 @@ static int quirk_xtal_ignore(const struct dmi_system_id 
*id)
/* Low Voltage Mode Enable */
value &= ~SPT_PMC_VRIC1_SLPS0LVEN;
pmc_core_reg_write(pmcdev, pmcdev->map->pm_vric1_offset, value);
-   return 0;
 }
 
 static const struct dmi_system_id pmc_core_dmi_table[]  = {
@@ -1212,6 +1217,14 @@ static const struct dmi_system_id pmc_core_dmi_table[]  
= {
{}
 };
 
+static void pmc_core_do_dmi_quirks(struct pmc_dev *pmcdev)
+{
+   dmi_check_system(pmc_core_dmi_table);
+
+   if (xtal_ignore)
+   pmc_core_xtal_ignore(pmcdev);
+}
+
 static int pmc_core_probe(struct platform_device *pdev)
 {
static bool device_initialized;
@@ -1253,7 +1266,7 @@ static int pmc_core_probe(struct platform_device *pdev)
mutex_init(&pmcdev->lock);
platform_set_drvdata(pdev, pmcdev);
pmcdev->pmc_xram_read_bit = pmc_core_check_read_lock_bit();
-   dmi_check_system(pmc_core_dmi_table);
+   pmc_core_do_dmi_quirks(pmcdev);
 
/*
 * On TGL, due to a hardware limitation, the GBE LTR blocks PC10 when
-- 
2.25.1



[PATCH 2/9] platform/x86: intel_pmc_core: Remove global struct pmc_dev

2021-03-31 Thread David E. Box
The intel_pmc_core driver did not always bind to a device which meant it
lacked a struct device that could be used to maintain driver data. So a
global instance of struct pmc_dev was used for this purpose and functions
accessed this directly. Since the driver now binds to an ACPI device,
remove the global pmc_dev in favor of one that is allocated during probe.
Modify users of the global to obtain the object by argument instead.

Signed-off-by: David E. Box 
---
 drivers/platform/x86/intel_pmc_core.c | 41 ++-
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/platform/x86/intel_pmc_core.c 
b/drivers/platform/x86/intel_pmc_core.c
index 260d49dca1ad..5ca40fe3da59 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -31,8 +31,6 @@
 
 #include "intel_pmc_core.h"
 
-static struct pmc_dev pmc;
-
 /* PKGC MSRs are common across Intel Core SoCs */
 static const struct pmc_bit_map msr_map[] = {
{"Package C2",  MSR_PKG_C2_RESIDENCY},
@@ -617,9 +615,8 @@ static int pmc_core_dev_state_get(void *data, u64 *val)
 
 DEFINE_DEBUGFS_ATTRIBUTE(pmc_core_dev_state, pmc_core_dev_state_get, NULL, 
"%llu\n");
 
-static int pmc_core_check_read_lock_bit(void)
+static int pmc_core_check_read_lock_bit(struct pmc_dev *pmcdev)
 {
-   struct pmc_dev *pmcdev = &pmc;
u32 value;
 
value = pmc_core_reg_read(pmcdev, pmcdev->map->pm_cfg_offset);
@@ -744,28 +741,26 @@ static int pmc_core_ppfear_show(struct seq_file *s, void 
*unused)
 DEFINE_SHOW_ATTRIBUTE(pmc_core_ppfear);
 
 /* This function should return link status, 0 means ready */
-static int pmc_core_mtpmc_link_status(void)
+static int pmc_core_mtpmc_link_status(struct pmc_dev *pmcdev)
 {
-   struct pmc_dev *pmcdev = &pmc;
u32 value;
 
value = pmc_core_reg_read(pmcdev, SPT_PMC_PM_STS_OFFSET);
return value & BIT(SPT_PMC_MSG_FULL_STS_BIT);
 }
 
-static int pmc_core_send_msg(u32 *addr_xram)
+static int pmc_core_send_msg(struct pmc_dev *pmcdev, u32 *addr_xram)
 {
-   struct pmc_dev *pmcdev = &pmc;
u32 dest;
int timeout;
 
for (timeout = NUM_RETRIES; timeout > 0; timeout--) {
-   if (pmc_core_mtpmc_link_status() == 0)
+   if (pmc_core_mtpmc_link_status(pmcdev) == 0)
break;
msleep(5);
}
 
-   if (timeout <= 0 && pmc_core_mtpmc_link_status())
+   if (timeout <= 0 && pmc_core_mtpmc_link_status(pmcdev))
return -EBUSY;
 
dest = (*addr_xram & MTPMC_MASK) | (1U << 1);
@@ -791,7 +786,7 @@ static int pmc_core_mphy_pg_show(struct seq_file *s, void 
*unused)
 
mutex_lock(&pmcdev->lock);
 
-   if (pmc_core_send_msg(&mphy_core_reg_low) != 0) {
+   if (pmc_core_send_msg(pmcdev, &mphy_core_reg_low) != 0) {
err = -EBUSY;
goto out_unlock;
}
@@ -799,7 +794,7 @@ static int pmc_core_mphy_pg_show(struct seq_file *s, void 
*unused)
msleep(10);
val_low = pmc_core_reg_read(pmcdev, SPT_PMC_MFPMC_OFFSET);
 
-   if (pmc_core_send_msg(&mphy_core_reg_high) != 0) {
+   if (pmc_core_send_msg(pmcdev, &mphy_core_reg_high) != 0) {
err = -EBUSY;
goto out_unlock;
}
@@ -842,7 +837,7 @@ static int pmc_core_pll_show(struct seq_file *s, void 
*unused)
mphy_common_reg  = (SPT_PMC_MPHY_COM_STS_0 << 16);
mutex_lock(&pmcdev->lock);
 
-   if (pmc_core_send_msg(&mphy_common_reg) != 0) {
+   if (pmc_core_send_msg(pmcdev, &mphy_common_reg) != 0) {
err = -EBUSY;
goto out_unlock;
}
@@ -863,9 +858,8 @@ static int pmc_core_pll_show(struct seq_file *s, void 
*unused)
 }
 DEFINE_SHOW_ATTRIBUTE(pmc_core_pll);
 
-static int pmc_core_send_ltr_ignore(u32 value)
+static int pmc_core_send_ltr_ignore(struct pmc_dev *pmcdev, u32 value)
 {
-   struct pmc_dev *pmcdev = &pmc;
const struct pmc_reg_map *map = pmcdev->map;
u32 reg;
int err = 0;
@@ -891,6 +885,8 @@ static ssize_t pmc_core_ltr_ignore_write(struct file *file,
 const char __user *userbuf,
 size_t count, loff_t *ppos)
 {
+   struct seq_file *s = file->private_data;
+   struct pmc_dev *pmcdev = s->private;
u32 buf_size, value;
int err;
 
@@ -900,7 +896,7 @@ static ssize_t pmc_core_ltr_ignore_write(struct file *file,
if (err)
return err;
 
-   err = pmc_core_send_ltr_ignore(value);
+   err = pmc_core_send_ltr_ignore(pmcdev, value);
 
return err == 0 ? count : err;
 }
@@ -1228,13 +1224,19 @@ static void pmc_core_do_dmi_quirks(struct pmc_dev 
*pmcdev)
 static int pmc_core_probe(struct platform_device *pdev)
 {
static bool device_initialized;
-   struct pmc_dev *pmcdev = &pmc;
+   struct pmc_dev *pmcdev;
const struct x86_cpu_id *cpu_id;
u64 slp_s0_addr;
 
if (device_initialized)
  

[PATCH 6/9] platform/x86: intel_pmc_core: Add requirements file to debugfs

2021-03-31 Thread David E. Box
From: Gayatri Kammela 

Add the debugfs file, substate_requirements, to view the low power mode
(LPM) requirements for each enabled mode alongside the last latched status
of the condition.

After this patch, the new file will look like this:

                       Element |    S0i2.0 |    S0i3.0 |    S0i2.1 |    S0i3.1 |    S0i3.2 |    Status |
               USB2PLL_OFF_STS |  Required |  Required |  Required |  Required |  Required |           |
   PCIe/USB3.1_Gen2PLL_OFF_STS |  Required |  Required |  Required |  Required |  Required |           |
          PCIe_Gen3PLL_OFF_STS |  Required |  Required |  Required |  Required |  Required |       Yes |
               OPIOPLL_OFF_STS |  Required |  Required |  Required |  Required |  Required |       Yes |
                 OCPLL_OFF_STS |  Required |  Required |  Required |  Required |  Required |       Yes |
               MainPLL_OFF_STS |           |  Required |           |  Required |  Required |           |

Signed-off-by: Gayatri Kammela 
Co-developed-by: David E. Box 
Signed-off-by: David E. Box 
---
 drivers/platform/x86/intel_pmc_core.c | 86 +++
 1 file changed, 86 insertions(+)

diff --git a/drivers/platform/x86/intel_pmc_core.c 
b/drivers/platform/x86/intel_pmc_core.c
index 0ec26a4c715e..0b47a1da5f49 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -1122,6 +1122,86 @@ static int pmc_core_substate_l_sts_regs_show(struct 
seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(pmc_core_substate_l_sts_regs);
 
+static void pmc_core_substate_req_header_show(struct seq_file *s)
+{
+   struct pmc_dev *pmcdev = s->private;
+   int i, mode;
+
+   seq_printf(s, "%30s |", "Element");
+   pmc_for_each_mode(i, mode, pmcdev)
+   seq_printf(s, " %9s |", pmc_lpm_modes[mode]);
+
+   seq_printf(s, " %9s |\n", "Status");
+}
+
+static int pmc_core_substate_req_regs_show(struct seq_file *s, void *unused)
+{
+   struct pmc_dev *pmcdev = s->private;
+   const struct pmc_bit_map **maps = pmcdev->map->lpm_sts;
+   const struct pmc_bit_map *map;
+   const int num_maps = pmcdev->map->lpm_num_maps;
+   u32 sts_offset = pmcdev->map->lpm_status_offset;
+   u32 *lpm_req_regs = pmcdev->lpm_req_regs;
+   int mp;
+
+   /* Display the header */
+   pmc_core_substate_req_header_show(s);
+
+   /* Loop over maps */
+   for (mp = 0; mp < num_maps; mp++) {
+   u32 req_mask = 0;
+   u32 lpm_status;
+   int mode, idx, i, len = 32;
+
+   /*
+* Capture the requirements and create a mask so that we only
+* show an element if it's required for at least one of the
+* enabled low power modes
+*/
+   pmc_for_each_mode(idx, mode, pmcdev)
+   req_mask |= lpm_req_regs[mp + (mode * num_maps)];
+
+   /* Get the last latched status for this map */
+   lpm_status = pmc_core_reg_read(pmcdev, sts_offset + (mp * 4));
+
+   /*  Loop over elements in this map */
+   map = maps[mp];
+   for (i = 0; map[i].name && i < len; i++) {
+   u32 bit_mask = map[i].bit_mask;
+
+   if (!(bit_mask & req_mask))
+   /*
+* Not required for any enabled states
+* so don't display
+*/
+   continue;
+
+   /* Display the element name in the first column */
+   seq_printf(s, "%30s |", map[i].name);
+
+   /* Loop over the enabled states and display if required */
+   pmc_for_each_mode(idx, mode, pmcdev) {
+   if (lpm_req_regs[mp + (mode * num_maps)] & bit_mask)
+   seq_printf(s, " %9s |",
+  "Required");
+   else
+   seq_printf(s, " %9s |", " ");
+   }
+
+   /* In Status column, show the last captured state of this agent */
+   if (lpm_status & bit_mask)
+   seq_printf(s, " %9s |", "Yes");
+   else
+   seq_printf(s, " %9s |", " ");
+
+   seq_puts(s, "\n");
+   }
+   }
+
+   return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(pmc_core_substate_req_regs);
+
 static int pmc_core_pkgc_show(struct seq_file *s, void *unused)
 {
struct pmc_dev *pmcdev = s->private;
@@ -1241,6 +1321,12 @@ static void pmc_core_dbgfs_register(struct pmc_dev 
*pmcdev)
pmcdev->dbgfs_dir, pmcdev,
&pmc_core_substate_l_sts_regs_fops);
}
+
+   

Re: [PATCH] mm: memcontrol: fix forget to obtain the ref to objcg in split_page_memcg

2021-03-31 Thread Shakeel Butt
On Wed, Mar 31, 2021 at 8:02 PM Muchun Song  wrote:
>
> Christian Borntraeger reported a warning about "percpu ref
> (obj_cgroup_release) <= 0 (-1) after switching to atomic".
> Because we forgot to obtain the reference to the objcg and
> wrongly obtain the reference of memcg.
>
> Reported-by: Christian Borntraeger 
> Signed-off-by: Muchun Song 

Looks good to me.

Reviewed-by: Shakeel Butt 


[PATCH] crypto: rockchip/rk3288_crypto_ahash - delete unneeded variable initialization

2021-03-31 Thread Kai Ye
Delete unneeded variable initialization

Signed-off-by: Kai Ye 
---
 drivers/crypto/rockchip/rk3288_crypto_ahash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c 
b/drivers/crypto/rockchip/rk3288_crypto_ahash.c
index 81befe7..ed03058 100644
--- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c
+++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c
@@ -48,7 +48,7 @@ static void rk_ahash_reg_init(struct rk_crypto_info *dev)
 {
struct ahash_request *req = ahash_request_cast(dev->async_req);
struct rk_ahash_rctx *rctx = ahash_request_ctx(req);
-   int reg_status = 0;
+   int reg_status;
 
reg_status = CRYPTO_READ(dev, RK_CRYPTO_CTRL) |
 RK_CRYPTO_HASH_FLUSH | _SBF(0x, 16);
-- 
2.8.1



[PATCH] mm: memcontrol: fix forget to obtain the ref to objcg in split_page_memcg

2021-03-31 Thread Muchun Song
Christian Borntraeger reported a warning about "percpu ref
(obj_cgroup_release) <= 0 (-1) after switching to atomic".
This happened because we forgot to obtain a reference to the objcg
and wrongly obtained a reference to the memcg instead.

Reported-by: Christian Borntraeger 
Signed-off-by: Muchun Song 
---
 include/linux/memcontrol.h | 6 ++
 mm/memcontrol.c| 6 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0e8907957227..c960fd49c3e8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -804,6 +804,12 @@ static inline void obj_cgroup_get(struct obj_cgroup *objcg)
percpu_ref_get(&objcg->refcnt);
 }
 
+static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
+  unsigned long nr)
+{
+   percpu_ref_get_many(&objcg->refcnt, nr);
+}
+
 static inline void obj_cgroup_put(struct obj_cgroup *objcg)
 {
percpu_ref_put(&objcg->refcnt);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c0b83a396299..64ada9e650a5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3133,7 +3133,11 @@ void split_page_memcg(struct page *head, unsigned int nr)
 
for (i = 1; i < nr; i++)
head[i].memcg_data = head->memcg_data;
-   css_get_many(&memcg->css, nr - 1);
+
+   if (PageMemcgKmem(head))
+   obj_cgroup_get_many(__page_objcg(head), nr - 1);
+   else
+   css_get_many(&memcg->css, nr - 1);
 }
 
 #ifdef CONFIG_MEMCG_SWAP
-- 
2.11.0



[PATCH] f2fs: fix to avoid GC/mmap race with f2fs_truncate()

2021-03-31 Thread Chao Yu
f2fs_file_write_iter() missed holding i_gc_rwsem and i_mmap_sem around
f2fs_truncate() to avoid racing with background GC and mmap; fix it.

Signed-off-by: Chao Yu 
---
 fs/f2fs/file.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index dc79694e512c..f3ca63b55843 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4443,8 +4443,13 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
clear_inode_flag(inode, FI_NO_PREALLOC);
 
/* if we couldn't write data, we should deallocate blocks. */
-   if (preallocated && i_size_read(inode) < target_size)
+   if (preallocated && i_size_read(inode) < target_size) {
+   down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+   down_write(&F2FS_I(inode)->i_mmap_sem);
f2fs_truncate(inode);
+   up_write(&F2FS_I(inode)->i_mmap_sem);
+   up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+   }
 
if (ret > 0)
f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
-- 
2.29.2



Re: [PATCH v4 1/2] scsi: ufs: Fix task management request completion timeout

2021-03-31 Thread Can Guo

On 2021-04-01 00:45, Avri Altman wrote:

ufshcd_tmc_handler() calls blk_mq_tagset_busy_iter(fn = ufshcd_compl_tm()),
but since blk_mq_tagset_busy_iter() only iterates over all reserved tags
and requests which are not in IDLE state, ufshcd_compl_tm() never gets a
chance to run. Thus, TMR always ends up with completion timeout. Fix it by
calling blk_mq_start_request() in __ufshcd_issue_tm_cmd().

Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to allocate and free TMFs")

Signed-off-by: Can Guo 
---
 drivers/scsi/ufs/ufshcd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index b49555fa..d4f8cb2 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -6464,6 +6464,7 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba
*hba,

spin_lock_irqsave(host->host_lock, flags);
task_tag = hba->nutrs + free_slot;
+   blk_mq_start_request(req);

Maybe just set req->state to MQ_RQ_IN_FLIGHT,
without all the other irrelevant initializations such as adding a timeout etc.



I don't see any other drivers doing that; is it appropriate
to call WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT) outside the
block layer?

Thanks,
Can Guo.


Thanks,
Avri


treq->req_header.dword_0 |= cpu_to_be32(task_tag);

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a
Linux Foundation Collaborative Project.


Re: [PATCH AUTOSEL 5.11 10/38] net: correct sk_acceptq_is_full()

2021-03-31 Thread Sasha Levin

On Wed, Mar 31, 2021 at 06:17:27PM +0200, Eric Dumazet wrote:



On 3/30/21 12:21 AM, Sasha Levin wrote:

From: liuyacan 

[ Upstream commit f211ac154577ec9ccf07c15f18a6abf0d9bdb4ab ]

The "backlog" argument in listen() specifies
the maximum length of pending connections,
so the accept queue should be considered full
if there are exactly "backlog" elements.

Signed-off-by: liuyacan 
Signed-off-by: David S. Miller 
Signed-off-by: Sasha Levin 
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 129d200bccb4..a95f38a4b8c6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -936,7 +936,7 @@ static inline void sk_acceptq_added(struct sock *sk)

 static inline bool sk_acceptq_is_full(const struct sock *sk)
 {
-   return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
+   return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog);
 }

 /*






I have not seen this patch going in our trees.

First, there was no Fixes: tag, so this is quite unfortunate.

Second, we already had such wrong patches in the past.

Please look at commits
64a146513f8f12ba204b7bf5cb7e9505594ead42 [NET]: Revert incorrect accept queue 
backlog changes.
8488df894d05d6fa41c2bd298c335f944bb0e401 [NET]: Fix bugs in "Whether sock accept 
queue is full" checking

Please revert  this patch, thanks !


Dropped, thanks for letting me know!

--
Thanks,
Sasha


[PATCH] ARM: dts: aspeed: add ASRock E3C246D4I BMC

2021-03-31 Thread Zev Weiss
This is a relatively low-cost AST2500-based Xeon E-2100/E-2200 series
mini-ITX board that we hope can provide a decent platform for OpenBMC
development.

This initial device-tree provides the necessary configuration for
basic BMC functionality such as host power control, serial console and
KVM support, and POST code snooping.

Signed-off-by: Zev Weiss 
Reviewed-by: Joel Stanley 
---
 arch/arm/boot/dts/Makefile|   1 +
 .../boot/dts/aspeed-bmc-asrock-e3c246d4i.dts  | 202 ++
 2 files changed, 203 insertions(+)
 create mode 100644 arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts

diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 8e5d4ab4e75e..b12911262ca1 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -1406,6 +1406,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \
aspeed-bmc-ampere-mtjade.dtb \
aspeed-bmc-arm-centriq2400-rep.dtb \
aspeed-bmc-arm-stardragon4800-rep2.dtb \
+   aspeed-bmc-asrock-e3c246d4i.dtb \
aspeed-bmc-bytedance-g220a.dtb \
aspeed-bmc-facebook-cmm.dtb \
aspeed-bmc-facebook-galaxy100.dtb \
diff --git a/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts 
b/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts
new file mode 100644
index ..dcab6e78dfa4
--- /dev/null
+++ b/arch/arm/boot/dts/aspeed-bmc-asrock-e3c246d4i.dts
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0+
+/dts-v1/;
+
+#include "aspeed-g5.dtsi"
+#include 
+#include 
+
+/{
+   model = "ASRock E3C246D4I BMC";
+   compatible = "asrock,e3c246d4i-bmc", "aspeed,ast2500";
+
+   aliases {
+   serial4 = 
+   };
+
+   chosen {
+   stdout-path = 
+   bootargs = "console=tty0 console=ttyS4,115200 earlyprintk";
+   };
+
+   memory@8000 {
+   reg = <0x8000 0x2000>;
+   };
+
+   leds {
+   compatible = "gpio-leds";
+
+   heartbeat {
+   /* BMC_HB_LED_N */
+   gpios = < ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>;
+   linux,default-trigger = "timer";
+   };
+
+   system-fault {
+   /* SYSTEM_FAULT_LED_N */
+   gpios = < ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>;
+   panic-indicator;
+   };
+   };
+
+   gpio-keys {
+   compatible = "gpio-keys";
+
+   uid-button {
+   label = "uid-button";
+   gpios = < ASPEED_GPIO(F, 1) GPIO_ACTIVE_LOW>;
+   linux,code = ;
+   };
+   };
+
+   iio-hwmon {
+   compatible = "iio-hwmon";
+   io-channels = < 0>, < 1>, < 2>, < 3>, < 4>,
+   < 5>, < 6>, < 7>, < 8>, < 9>,
+   < 10>, < 11>, < 12>;
+   };
+};
+
+ {
+   status = "okay";
+   flash@0 {
+   status = "okay";
+   m25p,fast-read;
+   label = "bmc";
+   spi-max-frequency = <1>; /* 100 MHz */
+#include "openbmc-flash-layout.dtsi"
+   };
+};
+
+ {
+   status = "okay";
+};
+
+ {
+   status = "okay";
+   aspeed,sirq-active-high;
+};
+
+ {
+   status = "okay";
+
+   pinctrl-names = "default";
+   pinctrl-0 = <_rgmii1_default _mdio1_default>;
+};
+
+ {
+   status = "okay";
+
+   /* thermal sensor, one diode run to a disconnected header */
+   w83773g@4c {
+   compatible = "nuvoton,w83773g";
+   reg = <0x4c>;
+   };
+};
+
+ {
+   status = "okay";
+
+   /* FRU EEPROM */
+   eeprom@57 {
+   compatible = "st,24c128", "atmel,24c128";
+   reg = <0x57>;
+   pagesize = <16>;
+   };
+};
+
+ {
+   status = "okay";
+};
+
+ {
+   status = "okay";
+};
+
+_ctrl {
+   status = "okay";
+};
+
+_snoop {
+   status = "okay";
+   snoop-ports = <0x80>;
+};
+
+ {
+   status = "okay";
+   gpio-line-names =
+   /*  A */ "BMC_MAC1_INTB", "BMC_MAC2_INTB", "NMI_BTN_N", 
"BMC_NMI",
+   "", "", "", "",
+   /*  B */ "", "", "", "", "", "IRQ_BMC_PCH_SMI_LPC_N", "", "",
+   /*  C */ "", "", "", "", "", "", "", "",
+   /*  D */ "BMC_PSIN", "BMC_PSOUT", "BMC_RESETCON", "RESETCON",
+   "", "", "", "",
+   /*  E */ "", "", "", "", "", "", "", "",
+   /*  F */ "LOCATORLED_STATUS_N", "LOCATORBTN", "", "",
+   "", "", "BMC_PCH_SCI_LPC", "BMC_NCSI_MUX_CTL",
+   /*  G */ "HWM_BAT_EN", "CHASSIS_ID0", "CHASSIS_ID1", 
"CHASSIS_ID2",
+   "BMC_ALERT1_N_R", "BMC_ALERT2_N_R", "BMC_ALERT3_N", 
"SML0ALERT",
+   /*  H */ "FM_ME_RCVR_N", "O_PWROK", "SKL_CNL_R", 
"D4_DIMM_EVENT_3V_N",
+   "MFG_MODE_N", "BMC_RTCRST", "BMC_HB_LED_N", 
"BMC_CASEOPEN",
+

Re: [PATCH -next] staging: greybus: camera: Switch to memdup_user_nul()

2021-03-31 Thread Yang Yingliang

Hi,

On 2021/3/31 18:24, Dan Carpenter wrote:

On Wed, Mar 31, 2021 at 05:52:01PM +0800, Yang Yingliang wrote:

Use memdup_user_nul() helper instead of open-coding to
simplify the code.

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
  drivers/staging/greybus/camera.c | 10 ++
  1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/staging/greybus/camera.c b/drivers/staging/greybus/camera.c
index b570e13394ac..0f005facffbc 100644
--- a/drivers/staging/greybus/camera.c
+++ b/drivers/staging/greybus/camera.c
@@ -1120,16 +1120,10 @@ static ssize_t gb_camera_debugfs_write(struct file 
*file,
if (len > 1024)
return -EINVAL;
  
-	kbuf = kmalloc(len + 1, GFP_KERNEL);

-   if (!kbuf)
+   kbuf = memdup_user_nul(buf, len);
+   if (IS_ERR(kbuf))
return -ENOMEM;

return PTR_ERR(kbuf);

  
-	if (copy_from_user(kbuf, buf, len)) {

-   ret = -EFAULT;
-   goto done;
-   }
-
-   kbuf[len] = '\0';
  


Please delete this blank line so there aren't two blank lines in a row.


I will change it and send a v2.

Thanks,

Yang




ret = op->execute(gcam, kbuf, len);

regards,
dan carpenter

.


Re: [PATCH v2] powerpc/traps: Enhance readability for trap types

2021-03-31 Thread Michael Ellerman
Segher Boessenkool  writes:
> On Wed, Mar 31, 2021 at 08:58:17PM +1100, Michael Ellerman wrote:
>> So perhaps:
>> 
>>   EXC_SYSTEM_RESET
>>   EXC_MACHINE_CHECK
>>   EXC_DATA_STORAGE
>>   EXC_DATA_SEGMENT
>>   EXC_INST_STORAGE
>>   EXC_INST_SEGMENT
>>   EXC_EXTERNAL_INTERRUPT
>>   EXC_ALIGNMENT
>>   EXC_PROGRAM_CHECK
>>   EXC_FP_UNAVAILABLE
>>   EXC_DECREMENTER
>>   EXC_HV_DECREMENTER
>>   EXC_SYSTEM_CALL
>>   EXC_HV_DATA_STORAGE
>>   EXC_PERF_MONITOR
>
> These are interrupt (vectors), not exceptions.  It doesn't matter all
> that much, but confusing things more isn't useful either!  There can be
> multiple exceptions that all can trigger the same interrupt.

Yeah I know, but I think that ship has already sailed as far as the
naming we have in the kernel.

We have over 250 uses of "exc", and several files called "exception"
something.

Using "interrupt" can also be confusing because Linux uses that to mean
"external interrupt".

But I dunno, maybe INT or VEC is clearer? .. or TRAP :)
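
For concreteness, the INT_ spelling would come out roughly like the sketch
below (the values are just the usual Book3S interrupt vector offsets; this is
purely illustrative and not taken from the actual patch):

/* Illustrative only: trap-type constants named by interrupt vector. */
#define INT_SYSTEM_RESET	0x100
#define INT_MACHINE_CHECK	0x200
#define INT_DATA_STORAGE	0x300
#define INT_DATA_SEGMENT	0x380
#define INT_INST_STORAGE	0x400
#define INT_INST_SEGMENT	0x480
#define INT_EXTERNAL		0x500
#define INT_ALIGNMENT		0x600
#define INT_PROGRAM_CHECK	0x700
#define INT_FP_UNAVAILABLE	0x800
#define INT_DECREMENTER		0x900
#define INT_SYSTEM_CALL		0xc00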

cheers


[PATCH -next] ACPI: processor: Fix a preprocessor warning

2021-03-31 Thread Shixin Liu
When compiling with defconfig on x86_64, I got a warning:

drivers/acpi/processor_idle.c: In function ‘acpi_idle_play_dead’:
drivers/acpi/processor_idle.c:542:15: warning: extra tokens at end of #ifdef 
directive
  542 | #ifdef defined(CONFIG_X86) && defined(CONFIG_HOTPLUG_CPU)
  |

Fixes: bc5706eaeae0 ("ACPI: processor: Fix CPU0 wakeup in 
acpi_idle_play_dead()")
Signed-off-by: Shixin Liu 
---
 drivers/acpi/processor_idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 19fb28a8005b..0925b1477230 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -539,7 +539,7 @@ static int acpi_idle_play_dead(struct cpuidle_device *dev, 
int index)
} else
return -ENODEV;
 
-#ifdef defined(CONFIG_X86) && defined(CONFIG_HOTPLUG_CPU)
+#if defined(CONFIG_X86) && defined(CONFIG_HOTPLUG_CPU)
/* If NMI wants to wake up CPU0, start CPU0. */
if (wakeup_cpu0())
start_cpu0();
-- 
2.25.1



Re: [PATCH] phy: hisilicon: Use the correct HiSilicon copyright

2021-03-31 Thread fanghao (A)




On 2021/3/31 20:29, Vinod Koul wrote:

On 30-03-21, 14:47, Hao Fang wrote:

s/Hisilicon/HiSilicon/g.
It should use capital S,
according to https://www.hisilicon.com/en/terms-of-use.


And I have not agreed to those terms of use! If you wish to change the
name, please do send the patch dropping this terms of use link. I dont
mind name appearing properly...



I put the link there to show the correct form of the copyright; maybe that
caused a misunderstanding.
I will change it to "according to the official website", and send V2.

Thanks.

Hao


Thanks


Signed-off-by: Hao Fang 
---
 drivers/phy/hisilicon/phy-hi6220-usb.c   | 2 +-
 drivers/phy/hisilicon/phy-hix5hd2-sata.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/phy/hisilicon/phy-hi6220-usb.c 
b/drivers/phy/hisilicon/phy-hi6220-usb.c
index be05292..e92ba78 100644
--- a/drivers/phy/hisilicon/phy-hi6220-usb.c
+++ b/drivers/phy/hisilicon/phy-hi6220-usb.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright (c) 2015 Linaro Ltd.
- * Copyright (c) 2015 Hisilicon Limited.
+ * Copyright (c) 2015 HiSilicon Limited.
  */

 #include 
diff --git a/drivers/phy/hisilicon/phy-hix5hd2-sata.c 
b/drivers/phy/hisilicon/phy-hix5hd2-sata.c
index c67b78c..b0f99a9 100644
--- a/drivers/phy/hisilicon/phy-hix5hd2-sata.c
+++ b/drivers/phy/hisilicon/phy-hix5hd2-sata.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright (c) 2014 Linaro Ltd.
- * Copyright (c) 2014 Hisilicon Limited.
+ * Copyright (c) 2014 HiSilicon Limited.
  */

 #include 
--
2.8.1






Re: [PATCH -next] staging: rtl8723bs: os_dep: remove unused variable 'ret'

2021-03-31 Thread Yang Yingliang

Hi,

On 2021/3/31 18:27, Greg KH wrote:

On Wed, Mar 31, 2021 at 05:42:47PM +0800, Yang Yingliang wrote:

GCC reports the following warning with W=1:

drivers/staging/rtl8723bs/os_dep/recv_linux.c:101:6: warning:
  variable ‘ret’ set but not used [-Wunused-but-set-variable]
   101 |  int ret;
   |  ^~~

This variable is not used in rtw_os_recv_indicate_pkt(), so remove
it to fix the warning.

Fixes: de69e2b3f105 ("staging: rtl8723bs: remove DBG_COUNTER calls from 
os_dep/recv_linux.c")
Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
  drivers/staging/rtl8723bs/os_dep/recv_linux.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/staging/rtl8723bs/os_dep/recv_linux.c 
b/drivers/staging/rtl8723bs/os_dep/recv_linux.c
index fbdbcd04d44a..f6a9482be8e3 100644
--- a/drivers/staging/rtl8723bs/os_dep/recv_linux.c
+++ b/drivers/staging/rtl8723bs/os_dep/recv_linux.c
@@ -98,7 +98,6 @@ struct sk_buff *rtw_os_alloc_msdu_pkt(union recv_frame 
*prframe, u16 nSubframe_L
  void rtw_os_recv_indicate_pkt(struct adapter *padapter, struct sk_buff *pkt, 
struct rx_pkt_attrib *pattrib)
  {
struct mlme_priv *pmlmepriv = &padapter->mlmepriv;
-   int ret;
  
  	/* Indicate the packets to upper layer */

if (pkt) {
@@ -140,7 +139,7 @@ void rtw_os_recv_indicate_pkt(struct adapter *padapter, 
struct sk_buff *pkt, str
  
  		pkt->ip_summed = CHECKSUM_NONE;
  
-		ret = rtw_netif_rx(padapter->pnetdev, pkt);

+   rtw_netif_rx(padapter->pnetdev, pkt);

Why not handle the result of this call properly?


The return type of rtw_os_recv_indicate_pkt() is void, so it can't use this
return code.


I will try to make a patch later to change the return type of
rtw_os_recv_indicate_pkt() so it can use this return code.

Thanks,

Yang



.


Re: [PATCH v5] mm/gup: check page hwposion status for coredump.

2021-03-31 Thread Aili Yao
On Wed, 31 Mar 2021 08:44:53 +0200
David Hildenbrand  wrote:

> On 31.03.21 06:32, HORIGUCHI NAOYA(堀口 直也) wrote:
> > On Wed, Mar 31, 2021 at 10:43:36AM +0800, Aili Yao wrote:  
> >> On Wed, 31 Mar 2021 01:52:59 + HORIGUCHI NAOYA(堀口 直也) 
> >>  wrote:  
> >>> On Fri, Mar 26, 2021 at 03:22:49PM +0100, David Hildenbrand wrote:  
>  On 26.03.21 15:09, David Hildenbrand wrote:  
> > On 22.03.21 12:33, Aili Yao wrote:  
> >> When we do coredump for user process signal, this may be one SIGBUS 
> >> signal
> >> with BUS_MCEERR_AR or BUS_MCEERR_AO code, which means this signal is
> >> resulted from ECC memory fail like SRAR or SRAO, we expect the memory
> >> recovery work is finished correctly, then the get_dump_page() will not
> >> return the error page as its process pte is set invalid by
> >> memory_failure().
> >>
> >> But memory_failure() may fail, and the process's related pte may not be
> >> correctly set invalid, for current code, we will return the poison 
> >> page,
> >> get it dumped, and then lead to system panic as its in kernel code.
> >>
> >> So check the hwpoison status in get_dump_page(), and if TRUE, return 
> >> NULL.
> >>
> >> There maybe other scenario that is also better to check hwposion status
> >> and not to panic, so make a wrapper for this check, Thanks to David's
> >> suggestion().
> >>
> >> Link: 
> >> https://lkml.kernel.org/r/20210319104437.6f30e80d@alex-virtual-machine
> >> Signed-off-by: Aili Yao 
> >> Cc: David Hildenbrand 
> >> Cc: Matthew Wilcox 
> >> Cc: Naoya Horiguchi 
> >> Cc: Oscar Salvador 
> >> Cc: Mike Kravetz 
> >> Cc: Aili Yao 
> >> Cc: sta...@vger.kernel.org
> >> Signed-off-by: Andrew Morton 
> >> ---
> >> mm/gup.c  |  4 
> >> mm/internal.h | 20 
> >> 2 files changed, 24 insertions(+)
> >>
> >> diff --git a/mm/gup.c b/mm/gup.c
> >> index e4c224c..6f7e1aa 100644
> >> --- a/mm/gup.c
> >> +++ b/mm/gup.c
> >> @@ -1536,6 +1536,10 @@ struct page *get_dump_page(unsigned long addr)
> >>  FOLL_FORCE | FOLL_DUMP | 
> >> FOLL_GET);
> >>if (locked)
> >>mmap_read_unlock(mm);  
> >
> > Thinking again, wouldn't we get -EFAULT from __get_user_pages_locked()
> > when stumbling over a hwpoisoned page?
> >
> > See __get_user_pages_locked()->__get_user_pages()->faultin_page():
> >
> > handle_mm_fault()->vm_fault_to_errno(), which translates
> > VM_FAULT_HWPOISON to -EFAULT, unless FOLL_HWPOISON is set (-> 
> > -EHWPOISON)
> >
> > ?  
> >>>
> >>> We could get -EFAULT, but sometimes not (depends on how memory_failure() 
> >>> fails).
> >>>
> >>> If we failed to unmap, the page table is not converted to hwpoison entry,
> >>> so __get_user_pages_locked() get the hwpoisoned page.
> >>>
> >>> If we successfully unmapped but failed in truncate_error_page() for 
> >>> example,
> >>> the processes mapping the page would get -EFAULT as expected.  But even in
> >>> this case, other processes could reach the error page via page cache and
> >>> __get_user_pages_locked() for them could return the hwpoisoned page.
> >>>  
> 
>  Or doesn't that happen as you describe "But memory_failure() may fail, 
>  and
>  the process's related pte may not be correctly set invalid" -- but why 
>  does
>  that happen?  
> >>>
> >>> Simply because memory_failure() doesn't handle some page types like ksm 
> >>> page
> >>> and zero page. Or maybe shmem thp also belongs to this class.  
> 
> Thanks for that info!
> 
> >>>  
> 
>  On a similar thought, should get_user_pages() never return a page that 
>  has
>  HWPoison set? E.g., check also for existing PTEs if the page is 
>  hwpoisoned?  
> >>>
> >>> Make sense to me. Maybe inserting hwpoison check into follow_page_pte() 
> >>> and
> >>> follow_huge_pmd() would work well.  
> >>
> >> I think we should take more care to broadcast the hwpoison check to other 
> >> cases,
> >> SIGBUS coredump is such a case that it is supposed to not touch the poison 
> >> page,
> >> and if we return NULL for this, the coredump process will get a successful 
> >> finish.
> >>
> >> Other cases may also meet the requirements like coredump, but we need to 
> >> identify it,
> >> that's the poison check wrapper's purpose. If not, we may break the 
> >> integrity of the
> >> related action, which may be no better than panic.  

I think I had the wrong logic here: before this patch, the code already
returned an error for pages whose user pte has been set invalid because of
hwpoison, and this patch adds another missing scenario for the same purpose.
Without this patch, the code may still fail in gup.c for the hwpoison case,
which I think is OK as it's already there. Then the same rule will apply to
this missing case, I 

Re: [PATCH v2] fs: Improve eventpoll logging to stop indicting timerfd

2021-03-31 Thread Al Viro
On Wed, Mar 31, 2021 at 07:16:45PM -0700, Manish Varma wrote:
> timerfd doesn't create any wakelocks, but eventpoll can.  When it does,
> it names them after the underlying file descriptor, and since all
> timerfd file descriptors are named "[timerfd]" (which saves memory on
> systems like desktops with potentially many timerfd instances), all
> wakesources created as a result of using the eventpoll-on-timerfd idiom
> are called... "[timerfd]".
> 
> However, it becomes impossible to tell which "[timerfd]" wakesource is
> affiliated with which process and hence troubleshooting is difficult.
> 
> This change addresses this problem by changing the way eventpoll
> wakesources are named:
> 
> 1) the top-level per-process eventpoll wakesource is now named "epoll:P"
> (instead of just "eventpoll"), where P, is the PID of the creating
> process.
> 2) individual per-underlying-filedescriptor eventpoll wakesources are
> now named "epollitemN:P.F", where N is a unique ID token and P is PID
> of the creating process and F is the name of the underlying file
> descriptor.
> 
> Altogether, that should be split up into a change to eventpoll and
> timerfd (or other file descriptors).

FWIW, it smells like a variant of wakeup_source_register() that would
take printf format + arguments would be a good idea.  I.e. something
like

> + snprintf(buf, sizeof(buf), "epoll:%d", task_pid);
> + epi->ep->ws = wakeup_source_register(NULL, buf);

... = wakeup_source_register(NULL, "epoll:%d", task_pid);

etc.
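
A minimal sketch of such a variant, assuming a hypothetical
wakeup_source_register_fmt() helper built on kvasprintf() and the existing
wakeup_source_register() (which duplicates the name internally); the name and
placement are illustrative only:

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/pm_wakeup.h>
#include <linux/slab.h>

/* Sketch only: printf-style wrapper around wakeup_source_register(). */
static struct wakeup_source *wakeup_source_register_fmt(struct device *dev,
							const char *fmt, ...)
{
	struct wakeup_source *ws;
	va_list args;
	char *name;

	va_start(args, fmt);
	name = kvasprintf(GFP_KERNEL, fmt, args);
	va_end(args);
	if (!name)
		return NULL;

	/* wakeup_source_register() copies the name, so the temporary can go. */
	ws = wakeup_source_register(dev, name);
	kfree(name);
	return ws;
}

With that, the call sites above reduce to
ep->ws = wakeup_source_register_fmt(NULL, "epoll:%d", task_pid); and the
per-item case can pass its format string directly instead of building buf
by hand.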


[PATCH] mm/page_alloc: redundant definition of variable pfn in for loop

2021-03-31 Thread huxiang
The variable pfn is already declared at function scope, so the redundant
definition inside the loop can be deleted.

Signed-off-by: huxiang 
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cfc728739..740224232 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3277,7 +3277,7 @@ void free_unref_page_list(struct list_head *list)
 
local_irq_save(flags);
list_for_each_entry_safe(page, next, list, lru) {
-   unsigned long pfn = page_private(page);
+   pfn = page_private(page);
 
set_page_private(page, 0);
trace_mm_page_free_batched(page);
-- 
2.20.1





[PATCH -next 3/3] mm/debug_vm_pgtable: Remove useless pfn_pmd()

2021-03-31 Thread Shixin Liu
The call to pfn_pmd() here is redundant.

Signed-off-by: Shixin Liu 
---
 mm/debug_vm_pgtable.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c379bbe42c2a..9f4c4a114229 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -196,7 +196,6 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 
pgtable_trans_huge_deposit(mm, pmdp, pgtable);
 
-   pmd = pfn_pmd(pfn, prot);
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_set_wrprotect(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
-- 
2.25.1



[PATCH -next 1/3] mm/debug_vm_pgtable: Fix one comment mistake

2021-03-31 Thread Shixin Liu
The branch condition should be CONFIG_TRANSPARENT_HUGEPAGE instead of
CONFIG_ARCH_HAS_PTE_DEVMAP.

Signed-off-by: Shixin Liu 
---
 mm/debug_vm_pgtable.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 05efe98a9ac2..a5c71a94e804 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -755,12 +755,12 @@ static void __init pmd_swap_soft_dirty_tests(unsigned 
long pfn, pgprot_t prot)
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
 }
-#else  /* !CONFIG_ARCH_HAS_PTE_DEVMAP */
+#else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
 static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
 static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
 {
 }
-#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
 {
-- 
2.25.1



[PATCH -next 2/3] mm/debug_vm_pgtable: Move {pmd/pud}_huge_tests out of CONFIG_TRANSPARENT_HUGEPAGE

2021-03-31 Thread Shixin Liu
The functions {pmd/pud}_set_huge and {pmd/pud}_clear_huge do not depend on THP,
but currently we have to enable THP to test them. So move
{pmd/pud}_huge_tests out of CONFIG_TRANSPARENT_HUGEPAGE.

Signed-off-by: Shixin Liu 
---
 mm/debug_vm_pgtable.c | 91 +++
 1 file changed, 39 insertions(+), 52 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index a5c71a94e804..c379bbe42c2a 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -242,29 +242,6 @@ static void __init pmd_leaf_tests(unsigned long pfn, 
pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
 }
 
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t 
prot)
-{
-   pmd_t pmd;
-
-   if (!arch_vmap_pmd_supported(prot))
-   return;
-
-   pr_debug("Validating PMD huge\n");
-   /*
-* X86 defined pmd_set_huge() verifies that the given
-* PMD is not a populated non-leaf entry.
-*/
-   WRITE_ONCE(*pmdp, __pmd(0));
-   WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
-   WARN_ON(!pmd_clear_huge(pmdp));
-   pmd = READ_ONCE(*pmdp);
-   WARN_ON(!pmd_none(pmd));
-}
-#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t 
prot) { }
-#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-
 static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
pmd_t pmd = pfn_pmd(pfn, prot);
@@ -379,30 +356,6 @@ static void __init pud_leaf_tests(unsigned long pfn, 
pgprot_t prot)
pud = pud_mkhuge(pud);
WARN_ON(!pud_leaf(pud));
 }
-
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t 
prot)
-{
-   pud_t pud;
-
-   if (!arch_vmap_pud_supported(prot))
-   return;
-
-   pr_debug("Validating PUD huge\n");
-   /*
-* X86 defined pud_set_huge() verifies that the given
-* PUD is not a populated non-leaf entry.
-*/
-   WRITE_ONCE(*pudp, __pud(0));
-   WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
-   WARN_ON(!pud_clear_huge(pudp));
-   pud = READ_ONCE(*pudp);
-   WARN_ON(!pud_none(pud));
-}
-#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t 
prot) { }
-#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-
 #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, 
int idx) { }
 static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -412,9 +365,6 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
 {
 }
 static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t 
prot)
-{
-}
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 #else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
 static void __init pmd_basic_tests(unsigned long pfn, int idx) { }
@@ -433,14 +383,51 @@ static void __init pud_advanced_tests(struct mm_struct 
*mm,
 }
 static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
 static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t 
prot)
 {
+   pmd_t pmd;
+
+   if (!arch_vmap_pmd_supported(prot))
+   return;
+
+   pr_debug("Validating PMD huge\n");
+   /*
+* X86 defined pmd_set_huge() verifies that the given
+* PMD is not a populated non-leaf entry.
+*/
+   WRITE_ONCE(*pmdp, __pmd(0));
+   WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
+   WARN_ON(!pmd_clear_huge(pmdp));
+   pmd = READ_ONCE(*pmdp);
+   WARN_ON(!pmd_none(pmd));
 }
+
 static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t 
prot)
 {
+   pud_t pud;
+
+   if (!arch_vmap_pud_supported(prot))
+   return;
+
+   pr_debug("Validating PUD huge\n");
+   /*
+* X86 defined pud_set_huge() verifies that the given
+* PUD is not a populated non-leaf entry.
+*/
+   WRITE_ONCE(*pudp, __pud(0));
+   WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
+   WARN_ON(!pud_clear_huge(pudp));
+   pud = READ_ONCE(*pudp);
+   WARN_ON(!pud_none(pud));
 }
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t 
prot) { }
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t 
prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
 static 

[PATCH v3] sysfs: Unconditionally use vmalloc for buffer

2021-03-31 Thread Kees Cook
The sysfs interface to seq_file continues to be rather fragile
(seq_get_buf() should not be used outside of seq_file), as seen with
some recent exploits[1]. Move the seq_file buffer to the vmap area
(while retaining the accounting flag), since it has guard pages that
will catch and stop linear overflows. This seems justified given that
sysfs's use of seq_file already uses kvmalloc(), is almost always using
a PAGE_SIZE or larger allocation, has normally short-lived allocations,
and is not normally on a performance critical path.

Once seq_get_buf() has been removed (and all sysfs callbacks using
seq_file directly), this change can also be removed.

[1] https://blog.grimm-co.com/2021/03/new-old-bugs-in-linux-kernel.html

Signed-off-by: Kees Cook 
---
v3:
- Limit to only sysfs (instead of all of seq_file).
v2: https://lore.kernel.org/lkml/20210315174851.68-1-keesc...@chromium.org/
v1: https://lore.kernel.org/lkml/20210312205558.2947488-1-keesc...@chromium.org/
---
 fs/sysfs/file.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 9aefa7779b29..70e7a450e5d1 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "sysfs.h"
 
@@ -32,6 +33,25 @@ static const struct sysfs_ops *sysfs_file_ops(struct 
kernfs_node *kn)
return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
 }
 
+/*
+ * To be proactively defensive against sysfs show() handlers that do not
+ * correctly stay within their PAGE_SIZE buffer, use the vmap area to gain
+ * the trailing guard page which will stop linear buffer overflows.
+ */
+static void *sysfs_kf_seq_start(struct seq_file *sf, loff_t *ppos)
+{
+   struct kernfs_open_file *of = sf->private;
+   struct kernfs_node *kn = of->kn;
+
+   WARN_ON_ONCE(sf->buf);
+   sf->buf = __vmalloc(kn->attr.size, GFP_KERNEL_ACCOUNT);
+   if (!sf->buf)
+   return ERR_PTR(-ENOMEM);
+   sf->size = kn->attr.size;
+
+   return NULL + !*ppos;
+}
+
 /*
  * Reads on sysfs are handled through seq_file, which takes care of hairy
  * details like buffering and seeking.  The following function pipes
@@ -206,14 +226,17 @@ static const struct kernfs_ops sysfs_file_kfops_empty = {
 };
 
 static const struct kernfs_ops sysfs_file_kfops_ro = {
+   .seq_start  = sysfs_kf_seq_start,
.seq_show   = sysfs_kf_seq_show,
 };
 
 static const struct kernfs_ops sysfs_file_kfops_wo = {
+   .seq_start  = sysfs_kf_seq_start,
.write  = sysfs_kf_write,
 };
 
 static const struct kernfs_ops sysfs_file_kfops_rw = {
+   .seq_start  = sysfs_kf_seq_start,
.seq_show   = sysfs_kf_seq_show,
.write  = sysfs_kf_write,
 };
-- 
2.25.1



Re: [PATCH v7 5/8] mm: Device exclusive memory access

2021-03-31 Thread Alistair Popple
On Thursday, 1 April 2021 11:48:13 AM AEDT Jason Gunthorpe wrote:
> On Thu, Apr 01, 2021 at 11:45:57AM +1100, Alistair Popple wrote:
> > On Thursday, 1 April 2021 12:46:04 AM AEDT Jason Gunthorpe wrote:
> > > On Thu, Apr 01, 2021 at 12:27:52AM +1100, Alistair Popple wrote:
> > > > On Thursday, 1 April 2021 12:18:54 AM AEDT Jason Gunthorpe wrote:
> > > > > On Wed, Mar 31, 2021 at 11:59:28PM +1100, Alistair Popple wrote:
> > > > > 
> > > > > > I guess that makes sense as the split could go either way at the
> > > > > > moment but I should add a check to make sure this isn't used with
> > > > > > pinned pages anyway.
> > > > > 
> > > > > Is it possible to have a pinned page under one of these things? If I
> > > > > pin it before you migrate it then it remains pinned but hidden under
> > > > > the swap entry?
> > > > 
> > > > At the moment yes. But I had planned (and this reminded me) to add a 
check 
> > to 
> > > > prevent marking pinned pages for exclusive access. 
> > > 
> > > How do you even do that without races with GUP fast?
> > 
> > Unless I've missed something I think I've convinced myself it should be 
safe 
> > to do the pin check after make_device_exclusive() has replaced all the 
PTEs 
> > with exclusive entries.
> > 
> > GUP fast sequence:
> > 1. Read PTE
> > 2. Pin page
> > 3. Check PTE
> > 4. if PTE changed -> unpin and fallback
> > 
> > If make_device_exclusive() runs after (1) it will either succeed or see 
the 
> > pin from (2) and fail (as desired). GUP should always see the PTE change 
and 
> > fallback which will revoke the exclusive access.
> 
> AFAICT the user can trigger fork at that instant and fork will try to
> copy the desposited migration entry before it has been checked

In that case the child will get a read-only exclusive entry and eventually a 
page copy via do_wp_page() and GUP will fallback (or fail in the case of fast 
only) so the parent's exclusive entry will get removed before the page can be 
pinned and therefore shouldn't split the wrong way.

But that is sounding rather complex, and I am not convinced I haven't missed a 
corner case. It also seems like it shouldn't be necessary to copy exclusive 
entries anyway. I could just remove them and restore the original entry, which 
would be far simpler.

> Jason
> 






Re: [RFC PATCH -tip 3/3] x86/kprobes,orc: Unwind optprobe trampoline correctly

2021-03-31 Thread Masami Hiramatsu
On Thu, 1 Apr 2021 10:44:52 +0900
Masami Hiramatsu  wrote:

> On Wed, 31 Mar 2021 10:57:36 -0500
> Josh Poimboeuf  wrote:
> 
> > On Wed, Mar 31, 2021 at 02:44:56PM +0900, Masami Hiramatsu wrote:
> > > +#ifdef CONFIG_UNWINDER_ORC
> > > +unsigned long recover_optprobe_trampoline(unsigned long addr, unsigned 
> > > long *sp)
> > > +{
> > > + unsigned long offset, entry, probe_addr;
> > > + struct optimized_kprobe *op;
> > > + struct orc_entry *orc;
> > > +
> > > + entry = find_kprobe_optinsn_slot_entry(addr);
> > > + if (!entry)
> > > + return addr;
> > > +
> > > + offset = addr - entry;
> > > +
> > > + /* Decode arg1 and get the optprobe */
> > > + op = (void *)extract_set_arg1((void *)(entry + TMPL_MOVE_IDX));
> > > + if (!op)
> > > + return addr;
> > > +
> > > + probe_addr = (unsigned long)op->kp.addr;
> > > +
> > > + if (offset < TMPL_END_IDX) {
> > > + orc = orc_find((unsigned long)optprobe_template_func + offset);
> > > + if (!orc || orc->sp_reg != ORC_REG_SP)
> > > + return addr;
> > > + /*
> > > +  * Since optprobe trampoline doesn't push caller on the stack,
> > > +  * need to decrement 1 stack entry size
> > > +  */
> > > + *sp += orc->sp_offset - sizeof(long);
> > > + return probe_addr;
> > > + } else {
> > > + return probe_addr + offset - TMPL_END_IDX;
> > > + }
> > > +}
> > > +#endif
> > 
> > Hm, I'd like to avoid intertwining kprobes and ORC like this.
> > 
> > ORC unwinds other generated code by assuming the generated code uses a
> > frame pointer.  Could we do that here?
> 
> No, because the optprobe is not a function call. I considered to make
> it call, but since it has to execute copied instructions directly on
> the trampoline code (without changing stack frame) it is not possible.
> 
> > With CONFIG_FRAME_POINTER, unwinding works because SAVE_REGS_STRING has
> > ENCODE_FRAME_POINTER, but that's not going to work for ORC.
> 
> Even in that case, the problem is that any interrupt can happen
> before doing ENCODE_FRAME_POINTER. I think this ENCODE_FRAME_POINTER
> in the SAVE_REGS_STRING is for probing right before the target
> function setup a frame pointer.
> 
> > Instead of these patches, can we 'push %rbp; mov %rsp, %rbp' at the
> > beginning of the template and 'pop %rbp' at the end?
> 
> No, since the trampoline code is not called, it is jumped into.
> This means there is no "return address" in the stack. If we setup
> the frame, there is no return address, thus it might stop there.
> (Moreover, optprobe can copy multiple instructins on trampoline
> buffer, since relative jump consumes 5bytes. where is the "return address"?)
> 
> > 
> > I guess SAVE_REGS_STRING would need to be smart enough to push the
> > original saved version of %rbp.  Of course then that breaks the
> > kretprobe_trampoline() usage, so it may need to be a separate macro.
> > 
> > [ Or make the same change to kretprobe_trampoline().  Then the other
> >   patch set wouldn't be needed either ;-) ]
> 
> Hmm, I don't think it is a good idea which making such change on the
> optimized (hot) path only for the stack tracing. Moreover, that maybe
> not transparent with the stack made by int3.
> 
> > Of course the downside is, when you get an interrupt during the frame
> > pointer setup, unwinding is broken.  But I think that's acceptable for
> > generated code.  We've lived with that limitation for all code, with
> > CONFIG_FRAME_POINTER, for many years.
> 
> But above code can fix such issue too. To fix a corner case, non-generic
> code may be required, even it is not so simple.

Hmm, I would like to confirm your policy on the ORC unwinder. If it doesn't
care about the stacktrace from the interrupt handler, I think your suggestion
is OK. But in that case, from a developer viewpoint, I need to recommend
that users configure CONFIG_UNWIND_FRAME=y when CONFIG_KPROBES=y.

> > Eventually we may want to have a way to register generated code (and the
> > ORC for it).

I see, but the generated code usually does not have a generic way to
handle it. E.g. bpf has a solid entry point, but kretprobe trampoline's
entry point is any "RET", optprobe trampoline's entry point is a jump
which is also generated (patched) ...

Thank you,

-- 
Masami Hiramatsu 


Re: [PATCH 1/2] fs/namespace: corrent/improve kernel-doc notation

2021-03-31 Thread Al Viro
On Wed, Mar 31, 2021 at 02:24:18PM -0600, Jonathan Corbet wrote:
> Randy Dunlap  writes:
> 
> > Fix kernel-doc warnings in fs/namespace.c:
> >
> > ./fs/namespace.c:1379: warning: Function parameter or member 'm' not 
> > described in 'may_umount_tree'
> > ./fs/namespace.c:1379: warning: Excess function parameter 'mnt' description 
> > in 'may_umount_tree'
> > ./fs/namespace.c:1950: warning: Function parameter or member 'path' not 
> > described in 'clone_private_mount'
> >
> > Also convert path_is_mountpoint() comments to kernel-doc.
> >
> > Signed-off-by: Randy Dunlap 
> > Cc: Al Viro 
> > Cc: Jonathan Corbet 
> > Cc: linux-...@vger.kernel.org
> > ---
> > Jon, Al has OK-ed you to merge this patch (and the next one, please).
> >
> >  fs/namespace.c |   14 --
> >  1 file changed, 8 insertions(+), 6 deletions(-)
> 
> An actual acked-by would have been nice, oh well.  Meanwhile, I've
> applied them with fixes to the typos in both changelogs :)

Generally speaking, I'm only glad to leave handling kernel-doc patches
to somebody else, especially when they are that trivial ;-)

Anyway,
Acked-by: Al Viro 


[PATCH v2] fs: Improve eventpoll logging to stop indicting timerfd

2021-03-31 Thread Manish Varma
timerfd doesn't create any wakelocks, but eventpoll can.  When it does,
it names them after the underlying file descriptor, and since all
timerfd file descriptors are named "[timerfd]" (which saves memory on
systems like desktops with potentially many timerfd instances), all
wakesources created as a result of using the eventpoll-on-timerfd idiom
are called... "[timerfd]".

However, it becomes impossible to tell which "[timerfd]" wakesource is
affiliated with which process and hence troubleshooting is difficult.

This change addresses this problem by changing the way eventpoll
wakesources are named:

1) the top-level per-process eventpoll wakesource is now named "epoll:P"
(instead of just "eventpoll"), where P, is the PID of the creating
process.
2) individual per-underlying-filedescriptor eventpoll wakesources are
now named "epollitemN:P.F", where N is a unique ID token and P is PID
of the creating process and F is the name of the underlying file
descriptor.

Altogether, that should be split up into a change to eventpoll and
timerfd (or other file descriptors).

Co-developed-by: Kelly Rossmoyer 
Signed-off-by: Kelly Rossmoyer 
Signed-off-by: Manish Varma 
---
 fs/eventpoll.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7df8c0fa462b..8d3369a02633 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -297,6 +297,7 @@ static LIST_HEAD(tfile_check_list);
 
 static long long_zero;
 static long long_max = LONG_MAX;
+static atomic_t wakesource_create_id  = ATOMIC_INIT(0);
 
 struct ctl_table epoll_table[] = {
{
@@ -1451,15 +1452,23 @@ static int ep_create_wakeup_source(struct epitem *epi)
 {
struct name_snapshot n;
struct wakeup_source *ws;
+   pid_t task_pid;
+   char buf[64];
+   int id;
+
+   task_pid = task_pid_nr(current);
 
if (!epi->ep->ws) {
-   epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
+   snprintf(buf, sizeof(buf), "epoll:%d", task_pid);
+   epi->ep->ws = wakeup_source_register(NULL, buf);
if (!epi->ep->ws)
return -ENOMEM;
}
 
+   id = atomic_inc_return(&wakesource_create_id);
take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
-   ws = wakeup_source_register(NULL, n.name.name);
+   snprintf(buf, sizeof(buf), "epollitem%d:%d.%s", id, task_pid, n.name.name);
+   ws = wakeup_source_register(NULL, buf);
release_dentry_name_snapshot(&n);
 
if (!ws)
-- 
2.31.0.291.g576ba9dcdaf-goog



Re: [PATCH v31 07/12] landlock: Support filesystem access-control

2021-03-31 Thread Al Viro
On Wed, Mar 31, 2021 at 07:33:50PM +0200, Mickaël Salaün wrote:

> > +static inline u64 unmask_layers(
> > +   const struct landlock_ruleset *const domain,
> > +   const struct path *const path, const u32 access_request,
> > +   u64 layer_mask)
> > +{
> > +   const struct landlock_rule *rule;
> > +   const struct inode *inode;
> > +   size_t i;
> > +
> > +   if (d_is_negative(path->dentry))
> > +   /* Continues to walk while there is no mapped inode. */
 ^
Odd comment, that...

> > +static int check_access_path(const struct landlock_ruleset *const domain,
> > +   const struct path *const path, u32 access_request)
> > +{

> > +   walker_path = *path;
> > +   path_get(_path);

> > +   while (true) {
> > +   struct dentry *parent_dentry;
> > +
> > +   layer_mask = unmask_layers(domain, _path,
> > +   access_request, layer_mask);
> > +   if (layer_mask == 0) {
> > +   /* Stops when a rule from each layer grants access. */
> > +   allowed = true;
> > +   break;
> > +   }
> > +
> > +jump_up:
> > +   if (walker_path.dentry == walker_path.mnt->mnt_root) {
> > +   if (follow_up(_path)) {
> > +   /* Ignores hidden mount points. */
> > +   goto jump_up;
> > +   } else {
> > +   /*
> > +* Stops at the real root.  Denies access
> > +* because not all layers have granted access.
> > +*/
> > +   allowed = false;
> > +   break;
> > +   }
> > +   }
> > +   if (unlikely(IS_ROOT(walker_path.dentry))) {
> > +   /*
> > +* Stops at disconnected root directories.  Only allows
> > +* access to internal filesystems (e.g. nsfs, which is
> > +* reachable through /proc//ns/).
> > +*/
> > +   allowed = !!(walker_path.mnt->mnt_flags & MNT_INTERNAL);
> > +   break;
> > +   }
> > +   parent_dentry = dget_parent(walker_path.dentry);
> > +   dput(walker_path.dentry);
> > +   walker_path.dentry = parent_dentry;
> > +   }
> > +   path_put(_path);
> > +   return allowed ? 0 : -EACCES;

That's a whole lot of grabbing/dropping references...  I realize that it's
an utterly tactless question, but... how costly is it?  IOW, do you have
profiling data?

> > +/*
> > + * pivot_root(2), like mount(2), changes the current mount namespace.  It 
> > must
> > + * then be forbidden for a landlocked process.

... and cross-directory rename(2) can change the tree topology.  Do you ban that
as well?

[snip]

> > +static int hook_path_rename(const struct path *const old_dir,
> > +   struct dentry *const old_dentry,
> > +   const struct path *const new_dir,
> > +   struct dentry *const new_dentry)
> > +{
> > +   const struct landlock_ruleset *const dom =
> > +   landlock_get_current_domain();
> > +
> > +   if (!dom)
> > +   return 0;
> > +   /* The mount points are the same for old and new paths, cf. EXDEV. */
> > +   if (old_dir->dentry != new_dir->dentry)
> > +   /* For now, forbids reparenting. */
> > +   return -EACCES;

You do, apparently, and not in a way that would have the userland fall
back to copy+unlink.  Lovely...  Does e.g. git survive such a restriction?
Same question for your average package build...


  1   2   3   4   5   6   7   8   9   10   >