[PATCH v2] uml: fix a boot splat wrt use of cpu_all_mask

2019-04-10 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

Memory: 509108K/542612K available (3835K kernel code, 919K rwdata, 1028K 
rodata, 129K init, 211K bss, 33504K reserved, 0K cma-reserved)
NR_IRQS: 15
clocksource: timer: mask: 0xffffffffffffffff max_cycles: 0x1cd42e205, 
max_idle_ns: 881590404426 ns
[ cut here ]
WARNING: CPU: 0 PID: 0 at kernel/time/clockevents.c:458 
clockevents_register_device+0x72/0x140
posix-timer cpumask == cpu_all_mask, using cpu_possible_mask instead
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 5.1.0-rc4-00048-ged79cc87302b #4
Stack:
 604ebda0 603c5370 604ebe20 6046fd17
  6006fcbb 604ebdb0 603c53b5
 604ebe10 6003bfc4 604ebdd0 901ca
Call Trace:
 [<6006fcbb>] ? printk+0x0/0x94
 [<60083160>] ? clockevents_register_device+0x72/0x140
 [<6001f16e>] show_stack+0x13b/0x155
 [<603c5370>] ? dump_stack_print_info+0xe2/0xeb
 [<6006fcbb>] ? printk+0x0/0x94
 [<603c53b5>] dump_stack+0x2a/0x2c
 [<6003bfc4>] __warn+0x10e/0x13e
 [<60070320>] ? vprintk_func+0xc8/0xcf
 [<60030fd6>] ? block_signals+0x0/0x16
 [<6006fcbb>] ? printk+0x0/0x94
 [<6003c08b>] warn_slowpath_fmt+0x97/0x99
 [<600311a1>] ? set_signals+0x0/0x3f
 [<6003bff4>] ? warn_slowpath_fmt+0x0/0x99
 [<600842cb>] ? tick_oneshot_mode_active+0x44/0x4f
 [<60030fd6>] ? block_signals+0x0/0x16
 [<6006fcbb>] ? printk+0x0/0x94
 [<6007d2d5>] ? __clocksource_select+0x20/0x1b1
 [<60030fd6>] ? block_signals+0x0/0x16
 [<6006fcbb>] ? printk+0x0/0x94
 [<60083160>] clockevents_register_device+0x72/0x140
 [<60031192>] ? get_signals+0x0/0xf
 [<60030fd6>] ? block_signals+0x0/0x16
 [<6006fcbb>] ? printk+0x0/0x94
 [<60002eec>] um_timer_setup+0xc8/0xca
 [<60001b59>] start_kernel+0x47f/0x57e
 [<600035bc>] start_kernel_proc+0x49/0x4d
 [<6006c483>] ? kmsg_dump_register+0x82/0x8a
 [<6001de62>] new_thread_handler+0x81/0xb2
 [<60003571>] ? kmsg_dumper_stdout_init+0x1a/0x1c
 [<60020c75>] uml_finishsetup+0x54/0x59

random: get_random_bytes called from init_oops_id+0x27/0x34 with crng_init=0
---[ end trace 00173d0117a88acb ]---
Calibrating delay loop... 6941.90 BogoMIPS (lpj=34709504)

Signed-off-by: Maciej Żenczykowski 
Cc: Jeff Dike 
Cc: Richard Weinberger 
Cc: Anton Ivanov 
Cc: linux...@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
 arch/um/kernel/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 052de4c8acb2..0c572a48158e 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -56,7 +56,7 @@ static int itimer_one_shot(struct clock_event_device *evt)
 static struct clock_event_device timer_clockevent = {
.name   = "posix-timer",
.rating = 250,
-   .cpumask= cpu_all_mask,
+   .cpumask= cpu_possible_mask,
.features   = CLOCK_EVT_FEAT_PERIODIC |
  CLOCK_EVT_FEAT_ONESHOT,
.set_state_shutdown = itimer_shutdown,
-- 
2.21.0.392.gf8f6787159e-goog



Re: [PATCH v3 4/9] livepatch: Add klp-convert annotation helpers

2019-04-10 Thread Joe Lawrence
On Wed, Apr 10, 2019 at 11:50:53AM -0400, Joe Lawrence wrote:
> From: Josh Poimboeuf 
> 
> Define macros KLP_MODULE_RELOC and KLP_SYMPOS in
> include/linux/livepatch.h to improve user-friendliness of the
> livepatch annotation process.
> 
> Signed-off-by: Josh Poimboeuf 
> Signed-off-by: Joao Moreira 
> Signed-off-by: Joe Lawrence 
> ---
>  include/linux/livepatch.h | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
> index 16b48e8b29a2..947cfc2d1980 100644
> --- a/include/linux/livepatch.h
> +++ b/include/linux/livepatch.h
> @@ -236,6 +236,18 @@ void *klp_shadow_get_or_alloc(void *obj, unsigned long 
> id,
>  void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor);
>  void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor);
>  
> +/* Used to annotate symbol relocations in livepatches */
> +#define KLP_MODULE_RELOC(obj)
> \
> + struct klp_module_reloc \
> + __attribute__((__section__(".klp.module_relocs." #obj)))\
> + __attribute__((aligned (4)))
> +
> +#define KLP_SYMPOS(symbol, pos)  
> \
> + {   \
> + .sym = &symbol, \
> + .sympos = pos,  \
> + },
^^
nit: if we dropped the trailing array comma delimiter from KLP_SYMPOS
macro, the invocations would look more intuitively like an array.  For
example:

  KLP_MODULE_RELOC(test_klp_convert_mod) test_klp_convert_mod_relocs_a[] = {
KLP_SYMPOS(driver_name, 0),
KLP_SYMPOS(homonym_string, 2),
KLP_SYMPOS(get_homonym_string, 2),
  };

But I could not figure out a good regex to reference if other such
kernel preprocessor macros include or exclude the delimiter.  Are there
reasons to include it?

-- Joe


Re: [PATCH 1/2] module: Prepare for addition of new ro_after_init sections

2019-04-10 Thread Kees Cook
On Wed, Apr 10, 2019 at 10:48 AM Joel Fernandes  wrote:
> Thanks, if you are Ok with it, I will add your Reviewed-by tag as well.

With those fixes, absolutely. :) Thanks!

-- 
Kees Cook


Re: [PATCH V5 08/12] perf/x86/intel: Add Icelake support

2019-04-10 Thread Liang, Kan




On 4/8/2019 11:45 AM, Liang, Kan wrote:



On 4/8/2019 11:06 AM, Peter Zijlstra wrote:
On Tue, Apr 02, 2019 at 12:45:05PM -0700, kan.li...@linux.intel.com 
wrote:

+static struct event_constraint *
+icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+  struct perf_event *event)
+{
+    /*
+ * Fixed counter 0 has less skid.
+ * Force instruction:ppp in Fixed counter 0
+ */
+    if ((event->attr.precise_ip == 3) &&
+    ((event->hw.config & X86_RAW_EVENT_MASK) == 0x00c0))
+    return &fixed_counter0_constraint;


Does that want to be:

    event->hw.config == X86_CONFIG(.event=0xc0)

?

That is, are there really bits we want to mask in there?


For instruction event, right, we don't need mask it.
I will change it.



Actually, we have to mask some bits here, e.g. 
ARCH_PERFMON_EVENTSEL_INT, ARCH_PERFMON_EVENTSEL_USR and 
ARCH_PERFMON_EVENTSEL_OS. Those bits will be set in hw_config().


Also, other fields, e.g. the INV, ANY, E, or CMASK fields are not allowed 
for reduced Skid PEBS.



diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index dae3d84..3fa36c9 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3463,6 +3463,9 @@ hsw_get_event_constraints(struct cpu_hw_events 
*cpuc, int idx,

return c;
 }

+#define EVENT_CONFIG(config)   \
+   (config & (X86_ALL_EVENT_FLAGS | INTEL_ARCH_EVENT_MASK))
+
 static struct event_constraint *
 icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
  struct perf_event *event)
@@ -3472,7 +3475,7 @@ icl_get_event_constraints(struct cpu_hw_events 
*cpuc, int idx,

 * Force instruction:ppp in Fixed counter 0
 */
if ((event->attr.precise_ip == 3) &&
-   (event->hw.config == X86_CONFIG(.event=0xc0)))
+   (EVENT_CONFIG(event->hw.config) == X86_CONFIG(.event=0xc0)))
return &fixed_counter0_constraint;

return hsw_get_event_constraints(cpuc, idx, event);

Thanks,
Kan


Re: [PATCH v3 3/9] livepatch: Add klp-convert tool

2019-04-10 Thread Joe Lawrence
On Wed, Apr 10, 2019 at 11:50:52AM -0400, Joe Lawrence wrote:
>
> [ ... snip ... ]
>
> diff --git a/scripts/livepatch/klp-convert.c b/scripts/livepatch/klp-convert.c
> new file mode 100644
> index ..62bd26941081
> --- /dev/null
> +++ b/scripts/livepatch/klp-convert.c
> 
> [ ... snip ... ]
>
> + if (argc != 4) {
> + WARN("Usage: %s <symbols.klp> <input.ko> <output.ko>", argv[0]);
   ^^
nit: since we're using .klp.o prefix, should this be "input.klp.o"

-- Joe


[PATCH][next] iio: temperature: max31856: fix uninitialized error return

2019-04-10 Thread Colin King
From: Colin Ian King 

Currently if mask is neither CHAN_INFO_RAW nor CHAN_INFO_SCALE then
an uninitialized error return 'ret' is returned. Fix this by
adding a default case that ensures -EINVAL is returned for this
specific case.

Addresses-Coverity: ("Uninitialized scalar variable")
Signed-off-by: Colin Ian King 
---
 drivers/iio/temperature/max31856.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/iio/temperature/max31856.c 
b/drivers/iio/temperature/max31856.c
index 6b67d6b95cf9..f184ba5601d9 100644
--- a/drivers/iio/temperature/max31856.c
+++ b/drivers/iio/temperature/max31856.c
@@ -210,6 +210,9 @@ static int max31856_read_raw(struct iio_dev *indio_dev,
return IIO_VAL_INT_PLUS_MICRO;
}
break;
+   default:
+   ret = -EINVAL;
+   break;
}
 
return ret;
-- 
2.20.1



Re: [PATCH 1/2] module: Prepare for addition of new ro_after_init sections

2019-04-10 Thread Steven Rostedt
On Wed, 10 Apr 2019 11:21:23 -0700
Kees Cook  wrote:

> On Wed, Apr 10, 2019 at 10:48 AM Joel Fernandes  
> wrote:
> > Thanks, if you are Ok with it, I will add your Reviewed-by tag as well.  
> 
> With those fixes, absolutely. :) Thanks!
> 

I'll wait for v2 before adding my reviewed-by. ;-)

-- Steve


Re: [PATCH] staging: greybus: power_supply: Use struct_size() helper

2019-04-10 Thread Gustavo A. R. Silva
Hi Johan,

On 4/4/19 1:57 AM, Johan Hovold wrote:
> 
> This patch looks good, but I noticed a bug here in the current code,
> which should be fixed before applying this clean up.
> 
> sizeof(req) should have been sizeof(*req) above.
> 

Good catch.

>> - sizeof(struct gb_power_supply_props_desc),
>> + sizeof(req),
>> + struct_size(resp, props, props_count),
>>   GFP_KERNEL);
>>  if (!op)
>>  return -ENOMEM;
> 
> I've just submitted a fix (and CCed you as well). 
> 
> Would you mind respinning on top of that one?
> 

Yep. I'll send v2 shortly.

Thanks
--
Gustavo


[PATCH] ARM: dts: rockchip: Remove unnecessary setting of UART0 SCLK rate on veyron

2019-04-10 Thread Matthias Kaehlcke
Some veyron devices have a Bluetooth controller connected on UART0.
The UART needs to operate at a high speed, however setting the clock
rate at initialization has no practical effect. During initialization
user space adjusts the UART baudrate multiple times, which ends up
changing the SCLK rate. After a successful initialization the clk
is running at the desired speed (48MHz).

Remove the unnecessary clock rate configuration from the DT.

Signed-off-by: Matthias Kaehlcke 
---
 arch/arm/boot/dts/rk3288-veyron.dtsi | 4 
 1 file changed, 4 deletions(-)

diff --git a/arch/arm/boot/dts/rk3288-veyron.dtsi 
b/arch/arm/boot/dts/rk3288-veyron.dtsi
index 0bc2409f6903..97e980383e25 100644
--- a/arch/arm/boot/dts/rk3288-veyron.dtsi
+++ b/arch/arm/boot/dts/rk3288-veyron.dtsi
@@ -378,10 +378,6 @@
 &uart0 {
status = "okay";
 
-   /* We need to go faster than 24MHz, so adjust clock parents / rates */
-   assigned-clocks = <&cru SCLK_UART0>;
-   assigned-clock-rates = <48000000>;
-
/* Pins don't include flow control by default; add that in */
pinctrl-names = "default";
pinctrl-0 = <&uart0_xfer &uart0_cts &uart0_rts>;
-- 
2.21.0.392.gf8f6787159e-goog



Re: [PATCH v2 2/5] arm64, mm: Move generic mmap layout functions to mm

2019-04-10 Thread Kees Cook
On Wed, Apr 10, 2019 at 12:33 AM Alexandre Ghiti  wrote:
>
> On 04/10/2019 08:59 AM, Christoph Hellwig wrote:
> > On Thu, Apr 04, 2019 at 01:51:25AM -0400, Alexandre Ghiti wrote:
> >> - fix the case where stack randomization should not be taken into
> >>account.
> > Hmm.  This sounds a bit vague.  It might be better if something
> > considered a fix is split out to a separate patch with a good
> > description.
>
> Ok, I will move this fix in another patch.

Yeah, I think it'd be best to break this into a few (likely small) patches:
- update the compat case in the arm64 code
- fix the "not randomized" case
- move the code to mm/ (line-for-line identical for easy review)

That'll make it much easier to review (at least for me).

Thanks!

-- 
Kees Cook


Re: [PATCH] staging: greybus: power_supply: Use struct_size() helper

2019-04-10 Thread Gustavo A. R. Silva
Johan,

On 4/4/19 2:24 AM, Johan Hovold wrote:
> On Thu, Apr 04, 2019 at 08:09:51AM +0100, Rui Miguel Silva wrote:
>> Hi Gustavo,
>> Thanks a lot for the patch.
>>
>> On Wed 03 Apr 2019 at 21:58, Gustavo A. R. Silva wrote:
>>> Make use of the struct_size() helper instead of an open-coded 
>>> version
>>> in order to avoid any potential type mistakes, in particular in 
>>> the
>>> context in which this code is being used.
>>>
>>> So, replace code of the following form:
>>>
>>> sizeof(*resp) + props_count * sizeof(struct 
>>> gb_power_supply_props_desc)
>>>
>>> with:
>>>
>>> struct_size(resp, props, props_count)
>>>
>>> This code was detected with the help of Coccinelle.
>>>
>>> Signed-off-by: Gustavo A. R. Silva 
>>
>> What are the odds of 2 people changing same code in greybus in
>> the same day :).
> 
> Well, I only noticed the bug in the current code, when reviewing
> Gustavo's diff. ;)
> 

Apparently, your patch hasn't been applied to any tree yet.  So, I'll
wait for it to be applied before sending v2.

Thanks
--
Gustavo


Re: [PATCH-tip v2 03/12] locking/rwsem: Remove rwsem_wake() wakeup optimization

2019-04-10 Thread Davidlohr Bueso

On Fri, 05 Apr 2019, Waiman Long wrote:


With the commit 59aabfc7e959 ("locking/rwsem: Reduce spinlock contention
in wakeup after up_read()/up_write()"), the rwsem_wake() forgoes doing
a wakeup if the wait_lock cannot be directly acquired and an optimistic
spinning locker is present.  This can help performance by avoiding
spinning on the wait_lock when it is contended.

With the later commit 133e89ef5ef3 ("locking/rwsem: Enable lockless
waiter wakeup(s)"), the performance advantage of the above optimization
diminishes as the average wait_lock hold time becomes much shorter.

By supporting rwsem lock handoff, we can no longer rely on the fact
that the presence of an optimistic spinning locker will ensure that the
lock will be acquired by a task soon. This can lead to missed wakeup
and application hang. So the commit 59aabfc7e959 ("locking/rwsem:
Reduce spinlock contention in wakeup after up_read()/up_write()")
will have to be reverted.

Signed-off-by: Waiman Long 
---
kernel/locking/rwsem-xadd.c | 74 -


I very much like removing this code. It was rather crazy.


Re: [PATCH] clk: rockchip: Fix video codec clocks on rk3288

2019-04-10 Thread Jonas Karlman
On 2019-04-10 17:45, Doug Anderson wrote:
> Hi,
>
> On Fri, Mar 29, 2019 at 2:55 PM Douglas Anderson  
> wrote:
>> It appears that there is a typo in the rk3288 TRM.  For
>> GRF_SOC_CON0[7] it says that 0 means "vepu" and 1 means "vdpu".  It's
>> the other way around.
>>
>> How do I know?  Here's my evidence:
>>
>> 1. Prior to commit 4d3e84f99628 ("clk: rockchip: describe aclk_vcodec
>>using the new muxgrf type on rk3288") we always pretended that we
>>were using "aclk_vdpu" and the comment in the code said that this
>>matched the default setting in the system.  In fact the default
>>setting is 0 according to the TRM and according to reading memory
>>at bootup.  In addition rk3288-based Chromebooks ran like this and
>>the video codecs worked.
>> 2. With the existing clock code if you boot up and try to enable the
>>new VIDEO_ROCKCHIP_VPU as a module (and without "clk_ignore_unused"
>>on the command line), you get errors like "failed to get ack on
>>domain 'pd_video', val=0x80208".  After flipping vepu/vdpu things
>>init OK.
>> 3. If I export and add both the vepu and vdpu to the list of clocks
>>for RK3288_PD_VIDEO I can get past the power domain errors, but now
>>I freeze when the vpu_mmu gets initted.
>> 4. If I just mark the "vdpu" as IGNORE_UNUSED then everything boots up
>>and probes OK showing that somehow the "vdpu" was important to keep
>>enabled.  This is because we were actually using it as a parent.
>> 5. After this change I can hack "aclk_vcodec_pre" to parent from
>>"aclk_vepu" using assigned-clocks and the video codec still probes
>>OK.
>>
>> Fixes: 4d3e84f99628 ("clk: rockchip: describe aclk_vcodec using the new 
>> muxgrf type on rk3288")
>> Signed-off-by: Douglas Anderson 
>> ---
>> I currently have no way to test the JPEG mem2mem driver, so hopefully
>> others can test this and make sure it's happy for them.  I'm just
>> happy not to get strange errors at boot anymore.
>>
>>  drivers/clk/rockchip/clk-rk3288.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
> Any thoughts about this patch?  I'm 99.9% certain it's correct and
> it'd be nice to get it landed.  Heiko: I assume you're still
> collecting Rockchip clock patches and would be the one to apply it and
> (at some point) send a pull request to the clock tree?
>
> -Doug

This clk fix is needed to make MPEG-2 decoding work on my RK3288 Tinker Board 
using the
rockchip vpu patchset and a patch to add RK3288 specific MPEG-2 code [1].

Also note that the same change was suggested in a previous patch [2] by 
ayaka.

If possible please also add the CLK_SET_RATE_PARENT change from [3] in a 
possible v2,
that fixes assigning the aclk_vcodec clk to 400Mhz in the rockchip vpu driver.

[1] 
https://github.com/Kwiboo/linux-rockchip/commit/1f78093e05c7360515a185f48b7c5cb8ba1e3e15
[2] https://patchwork.kernel.org/patch/9725553/
[3] 
https://github.com/Kwiboo/linux-rockchip/commit/9216da3f1521a0be5889235abe7fa093a4894160

Regards,
Jonas



Re: [PATCH v2 2/2] PCI/DPC: Add Error Disconnect Recover (EDR) support

2019-04-10 Thread Bjorn Helgaas
On Tue, Mar 19, 2019 at 01:47:29PM -0700, 
sathyanarayanan.kuppusw...@linux.intel.com wrote:
> From: Kuppuswamy Sathyanarayanan 
> 
> As per PCI firmware specification v3.2 ECN
> (https://members.pcisig.com/wg/PCI-SIG/document/12614), when firmware
> owns Downstream Port Containment (DPC), it's expected to use the "Error
> Disconnect Recover" (EDR) notification to alert OSPM of a DPC event and
> if OS supports EDR, it's expected to handle the software state invalidation
> and port recovery in OS and let firmware know the recovery status via _OST
> ACPI call.
> 
> Since EDR is a hybrid service, DPC service shall be enabled in OS even
> if AER is not natively enabled in OS.
> 
> Signed-off-by: Kuppuswamy Sathyanarayanan 
> 
> ---
>  drivers/pci/pcie/Kconfig|  10 +
>  drivers/pci/pcie/dpc.c  | 326 ++--

I'll be looking for Keith's reviewed-by for this eventually.

It looks like this could be split into some smaller patches:

  - Add dpc_dev.native_dpc (hopefully we can find a less confusing name)
  - Convert dpc_handler() to dpc_handler() + dpc_process_error()
  - Add EDR support

>  drivers/pci/pcie/portdrv_core.c |   9 +-
>  3 files changed, 329 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
> index 5cbdbca904ac..55f65d1cbc9e 100644
> --- a/drivers/pci/pcie/Kconfig
> +++ b/drivers/pci/pcie/Kconfig
> @@ -142,3 +142,13 @@ config PCIE_PTM
>  
> This is only useful if you have devices that support PTM, but it
> is safe to enable even if you don't.
> +
> +config PCIE_EDR
> + bool "PCI Express Error Disconnect Recover support"
> + default n
> + depends on PCIE_DPC && ACPI
> + help
> +   This option adds Error Disconnect Recover support as specified
> +   in PCI firmware specification v3.2 Downstream Port Containment
> +   Related Enhancements ECN. Enable this if you want to support hybrid
> +   DPC model which uses both firmware and OS to implement DPC.
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index 7b77754a82de..bdf4ca8a0acb 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -11,6 +11,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "portdrv.h"
>  #include "../pci.h"
> @@ -20,8 +21,23 @@ struct dpc_dev {
>   u16 cap_pos;
>   boolrp_extensions;
>   u8  rp_log_size;
> + boolnative_dpc;

This is going to be way too confusing with a "native_dpc" in both the
struct pci_host_bridge and the struct dpc_dev.

> + pci_ers_result_terror_state;
> +#ifdef CONFIG_ACPI
> + struct acpi_device  *adev;
> +#endif
>  };
>  
> +#ifdef CONFIG_ACPI
> +
> +#define EDR_PORT_ENABLE_DSM 0x0C
> +#define EDR_PORT_LOCATE_DSM 0x0D
> +
> +static const guid_t pci_acpi_dsm_guid =
> + GUID_INIT(0xe5c937d0, 0x3553, 0x4d7a,
> +   0x91, 0x17, 0xea, 0x4d, 0x19, 0xc3, 0x43, 0x4d);
> +#endif
> +
>  static const char * const rp_pio_error_string[] = {
>   "Configuration Request received UR Completion",  /* Bit Position 0  */
>   "Configuration Request received CA Completion",  /* Bit Position 1  */
> @@ -67,6 +83,9 @@ void pci_save_dpc_state(struct pci_dev *dev)
>   if (!dpc)
>   return;
>  
> + if (!dpc->native_dpc)
> + return;
> +
>   save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC);
>   if (!save_state)
>   return;
> @@ -88,6 +107,9 @@ void pci_restore_dpc_state(struct pci_dev *dev)
>   if (!dpc)
>   return;
>  
> + if (!dpc->native_dpc)
> + return;
> +
>   save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC);
>   if (!save_state)
>   return;
> @@ -224,10 +246,9 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev 
> *dev,
>   return 1;
>  }
>  
> -static irqreturn_t dpc_handler(int irq, void *context)
> +static void dpc_process_error(struct dpc_dev *dpc)
>  {
>   struct aer_err_info info;
> - struct dpc_dev *dpc = context;
>   struct pci_dev *pdev = dpc->dev->port;
>   struct device *dev = &dpc->dev->device;
>   u16 cap = dpc->cap_pos, status, source, reason, ext_reason;
> @@ -261,6 +282,13 @@ static irqreturn_t dpc_handler(int irq, void *context)
>  
>   /* We configure DPC so it only triggers on ERR_FATAL */
>   pcie_do_recovery(pdev, pci_channel_io_frozen, PCIE_PORT_SERVICE_DPC);
> +}
> +
> +static irqreturn_t dpc_handler(int irq, void *context)
> +{
> + struct dpc_dev *dpc = context;
> +
> + dpc_process_error(dpc);
>  
>   return IRQ_HANDLED;
>  }
> @@ -283,6 +311,230 @@ static irqreturn_t dpc_irq(int irq, void *context)
>   return IRQ_HANDLED;
>  }
>  
> +void dpc_error_resume(struct pci_dev *dev)

Looks like this should be static?

> +{
> + struct dpc_dev *dpc;
> +
> + dpc = to_dpc_dev(de

[PATCH-tip v3 01/14] locking/rwsem: Prevent unneeded warning during locking selftest

2019-04-10 Thread Waiman Long
Disable the DEBUG_RWSEMS check when locking selftest is running with
debug_locks_silent flag set.

Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 37db17890e36..64877f5294e3 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -30,7 +30,8 @@
 
 #ifdef CONFIG_DEBUG_RWSEMS
 # define DEBUG_RWSEMS_WARN_ON(c, sem)  do {\
-   if (WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 
0x%lx, curr 0x%lx, list %sempty\n",\
+   if (!debug_locks_silent &&  \
+   WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 
0x%lx, curr 0x%lx, list %sempty\n",\
#c, atomic_long_read(&(sem)->count),\
(long)((sem)->owner), (long)current,\
list_empty(&(sem)->wait_list) ? "" : "not "))   \
-- 
2.18.1



[PATCH-tip v3 03/14] locking/rwsem: Implement a new locking scheme

2019-04-10 Thread Waiman Long
The current way of using various reader, writer and waiting biases
in the rwsem code are confusing and hard to understand. I have to
reread the rwsem count guide in the rwsem-xadd.c file from time to
time to remind myself how this whole thing works. It also makes the
rwsem code harder to be optimized.

To make rwsem more sane, a new locking scheme similar to the one in
qrwlock is now being used.  The atomic long count has the following
bit definitions:

  Bit  0   - writer locked bit
  Bit  1   - waiters present bit
  Bits 2-7 - reserved for future extension
  Bits 8-X - reader count (24/56 bits)

The cmpxchg instruction is now used to acquire the write lock. The read
lock is still acquired with xadd instruction, so there is no change here.
This scheme will allow up to 16M/64P active readers which should be
more than enough. We can always use some more reserved bits if necessary.

With that change, we can deterministically know if a rwsem has been
write-locked. Looking at the count alone, however, one cannot determine
for certain if a rwsem is owned by readers or not as the readers that
set the reader count bits may be in the process of backing out. So we
still need the reader-owned bit in the owner field to be sure.

With a locking microbenchmark running on 5.1 based kernel, the total
locking rates (in kops/s) of the benchmark on a 8-socket 120-core
IvyBridge-EX system before and after the patch were as follows:

  Before Patch  After Patch
   # of Threads  wlockrlockwlockrlock
     ----
130,659   31,341   31,055   31,283
2 8,909   16,4579,884   17,659
4 9,028   15,8238,933   20,233
8 8,410   14,2127,230   17,140
   16 8,217   25,2407,479   24,607

The locking rates of the benchmark on a Power8 system were as follows:

  Before Patch  After Patch
   # of Threads  wlockrlockwlockrlock
     ----
112,963   13,647   13,275   13,601
2 7,570   11,5697,902   10,829
4 5,2325,5165,4665,435
8 5,2333,3865,4673,168

The locking rates of the benchmark on a 2-socket ARM64 system were
as follows:

  Before Patch  After Patch
   # of Threads  wlockrlockwlockrlock
     ----
121,495   21,046   21,524   21,074
2 5,293   10,5025,333   10,504
4 5,325   11,4635,358   11,631
8 5,391   11,7125,470   11,680

The performance are roughly the same before and after the patch. There
are run-to-run variations in performance. Runs with higher variances
usually have higher throughput.

Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem-xadd.c | 147 
 kernel/locking/rwsem.h  |  74 +-
 2 files changed, 86 insertions(+), 135 deletions(-)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 6b3ee9948bf1..adab6477be51 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -9,6 +9,8 @@
  *
  * Optimistic spinning by Tim Chen 
  * and Davidlohr Bueso . Based on mutexes.
+ *
+ * Rwsem count bit fields re-definition by Waiman Long .
  */
 #include 
 #include 
@@ -22,52 +24,20 @@
 #include "rwsem.h"
 
 /*
- * Guide to the rw_semaphore's count field for common values.
- * (32-bit case illustrated, similar for 64-bit)
- *
- * 0x000X  (1) X readers active or attempting lock, no writer waiting
- * X = #active_readers + #readers attempting to lock
- * (X*ACTIVE_BIAS)
- *
- * 0x  rwsem is unlocked, and no one is waiting for the lock or
- * attempting to read lock or write lock.
- *
- * 0x000X  (1) X readers active or attempting lock, with waiters for lock
- * X = #active readers + # readers attempting lock
- * (X*ACTIVE_BIAS + WAITING_BIAS)
- * (2) 1 writer attempting lock, no waiters for lock
- * X-1 = #active readers + #readers attempting lock
- * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
- * (3) 1 writer active, no waiters for lock
- * X-1 = #active readers + #readers attempting lock
- * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
- *
- * 0x0001  (1) 1 reader active or attempting lock, waiters for lock
- * (WAITING_BIAS + ACTIVE_BIAS)
- * (2) 1 writer active or attempting lock, no waiters for lock
- * (ACTIVE_WRITE_BIAS)
+ * Guide to the rw_semaphore's count field.
  *
- * 0x  (1) There are writers or readers queued but none active
- * or in the process of attempting lock.
- * (WAITING_BIAS)
- * Note: writer c

[PATCH-tip v3 10/14] locking/rwsem: Enable time-based spinning on reader-owned rwsem

2019-04-10 Thread Waiman Long
When the rwsem is owned by reader, writers stop optimistic spinning
simply because there is no easy way to figure out if all the readers
are actively running or not. However, there are scenarios where
the readers are unlikely to sleep and optimistic spinning can help
performance.

This patch provides a simple mechanism for spinning on a reader-owned
rwsem by a writer. It is a time threshold based spinning where the
allowable spinning time can vary from 10us to 25us depending on the
condition of the rwsem.

When the time threshold is exceeded, a bit will be set in the owner field
to indicate that no more optimistic spinning will be allowed on this
rwsem until it becomes writer owned again. Not even readers are allowed
to acquire the reader-locked rwsem by optimistic spinning for fairness.

The time taken for each iteration of the reader-owned rwsem spinning
loop varies. Below are sample minimum elapsed times for 16 iterations
of the loop.

  System Time for 16 Iterations
  -- --
  1-socket Skylake  ~800ns
  4-socket Broadwell~300ns
  2-socket ThunderX2 (arm64)~250ns

When the lock cacheline is contended, we can see up to almost 10X
increase in elapsed time.  So 25us will be at most 500, 1300 and 1600
iterations for each of the above systems.

With a locking microbenchmark running on 5.1 based kernel, the total
locking rates (in kops/s) on a 8-socket IvyBridge-EX system with
equal numbers of readers and writers before and after this patch were
as follows:

   # of Threads  Pre-patchPost-patch
     ---
2  1,7596,684
4  1,6846,738
8  1,0747,222
   169007,163
   324587,316
   64208  520
  128168  425
  240143  474

This patch gives a big boost in performance for mixed reader/writer
workloads.

With 32 locking threads, the rwsem lock event data were:

rwsem_opt_fail=79850
rwsem_opt_nospin=5069
rwsem_opt_rlock=597484
rwsem_opt_wlock=957339
rwsem_sleep_reader=57782
rwsem_sleep_writer=55663

With 64 locking threads, the data looked like:

rwsem_opt_fail=346723
rwsem_opt_nospin=6293
rwsem_opt_rlock=1127119
rwsem_opt_wlock=1400628
rwsem_sleep_reader=308201
rwsem_sleep_writer=72281

So a lot more threads acquired the lock in the slowpath and more threads
went to sleep.

Signed-off-by: Waiman Long 
---
 kernel/locking/lock_events_list.h |  1 +
 kernel/locking/rwsem-xadd.c   | 76 +--
 kernel/locking/rwsem.h| 45 ++
 3 files changed, 107 insertions(+), 15 deletions(-)

diff --git a/kernel/locking/lock_events_list.h 
b/kernel/locking/lock_events_list.h
index 333ed5fda333..f3550aa5866a 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -59,6 +59,7 @@ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups  
*/
 LOCK_EVENT(rwsem_opt_rlock)/* # of read locks opt-spin acquired*/
 LOCK_EVENT(rwsem_opt_wlock)/* # of write locks opt-spin acquired   */
 LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings*/
+LOCK_EVENT(rwsem_opt_nospin)   /* # of disabled reader opt-spinnings   */
 LOCK_EVENT(rwsem_rlock)/* # of read locks acquired 
*/
 LOCK_EVENT(rwsem_rlock_fast)   /* # of fast read locks acquired*/
 LOCK_EVENT(rwsem_rlock_fail)   /* # of failed read lock acquisitions   */
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 7ee78a752815..09b245e0d6f4 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "rwsem.h"
@@ -314,7 +315,7 @@ static inline bool rwsem_can_spin_on_owner(struct 
rw_semaphore *sem)
owner = READ_ONCE(sem->owner);
if (owner) {
ret = is_rwsem_owner_spinnable(owner) &&
- owner_on_cpu(owner);
+(is_rwsem_owner_reader(owner) || owner_on_cpu(owner));
}
rcu_read_unlock();
preempt_enable();
@@ -339,7 +340,7 @@ enum owner_state {
OWNER_READER= 1 << 2,
OWNER_NONSPINNABLE  = 1 << 3,
 };
-#define OWNER_SPINNABLE(OWNER_NULL | OWNER_WRITER)
+#define OWNER_SPINNABLE(OWNER_NULL | OWNER_WRITER | 
OWNER_READER)
 
 static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem)
 {
@@ -350,7 +351,8 @@ static noinline enum owner_state rwsem_spin_on_owner(struct 
rw_semaphore *sem)
return OWNER_NONSPINNABLE;
 
rcu_read_lock();
-   while (owner && (READ_ONCE(sem->owner) == owner)) {
+   while (owner && !is_rwsem_owner_reader(owner)
+&& (READ_ONCE(sem->owner) == owner)) {
   

[PATCH-tip v3 00/14] locking/rwsem: Rwsem rearchitecture part 2

2019-04-10 Thread Waiman Long
 v3:
  - Add 2 more patches in front to fix build and testing issues found.
Patch 1 can actually be merged on top of the patch "locking/rwsem:
Enhance DEBUG_RWSEMS_WARN_ON() macro" in part 1.
  - Change the handoff patch (now patch 4) to set handoff bit immediately
after wakeup for RT writers. The timeout limit is also tightened to
4ms.
  - There is no code changes in other patches other than resolving conflicts
with patches 1, 2 and 4.

 v2:
  - Move the negative reader count checking patch (patch 12->10)
forward to before the merge owner to count patch as suggested by
Linus & expand the comment.
  - Change the reader-owned rwsem spinning from count based to time
based to have better control of the max time allowed.

This is part 2 of a 3-part (0/1/2) series to rearchitect the internal
operation of rwsem.

part 0: merged into tip
part 1: https://lore.kernel.org/lkml/20190404174320.22416-1-long...@redhat.com/

This patchset revamps the current rwsem-xadd implementation to make
it saner and easier to work with. It also implements the following 3
new features:

 1) Waiter lock handoff
 2) Reader optimistic spinning
 3) Store write-lock owner in the atomic count (x86-64 only)

Waiter lock handoff is similar to the mechanism currently in the mutex
code. This ensures that lock starvation won't happen.

Reader optimistic spinning enables readers to acquire the lock more
quickly.  So workloads that use a mix of readers and writers should
see an increase in performance as long as the reader critical sections
are short.

Finally, storing the write-lock owner into the count will allow
optimistic spinners to get to the lock holder's task structure more
quickly and eliminate the timing gap where the write lock is acquired
but the owner isn't known yet. This is important for RT tasks where
spinning on a lock with an unknown owner is not allowed.

Because of the fact that multiple readers can share the same lock,
there is a natural preference for readers when measuring in terms of
locking throughput as more readers are likely to get into the locking
fast path than the writers. With waiter lock handoff, we are not going
to starve the writers.

On a 8-socket 120-core 240-thread IvyBridge-EX system with 120 reader
and writer locking threads, the min/mean/max locking operations done
in a 5-second testing window before the patchset were:

  120 readers, Iterations Min/Mean/Max = 399/400/401
  120 writers, Iterations Min/Mean/Max = 400/33,389/211,359

After the patchset, they became:

  120 readers, Iterations Min/Mean/Max = 584/10,266/26,609
  120 writers, Iterations Min/Mean/Max = 22,080/29,016/38,728

So it was much fairer to readers. With fewer locking threads, the readers
were preferred over the writers.

Patch 1 fixes a testing issue with locking selftest introduced by the
patch "locking/rwsem: Enhance DEBUG_RWSEMS_WARN_ON() macro" in part 1.

Patch 2 makes owner a permanent member of the rw_semaphore structure and
set it irrespective of CONFIG_RWSEM_SPIN_ON_OWNER.

Patch 3 implements a new rwsem locking scheme similar to what qrwlock
is current doing. Write lock is done by atomic_cmpxchg() while read
lock is still being done by atomic_add().

Patch 4 implements lock handoff to prevent lock starvation.

Patch 5 removes rwsem_wake() wakeup optimization as it doesn't work
with lock handoff.

Patch 6 makes rwsem_spin_on_owner() return the owner state.

Patch 7 disallows RT tasks to spin on a rwsem with unknown owner.

Patch 8 makes reader wakeup to wake almost all the readers in the wait
queue instead of just those in the front.

Patch 9 enables reader to spin on a writer-owned rwsem.

Patch 10 enables a writer to spin on a reader-owned rwsem for at most
25us.

Patch 11 adds some new rwsem owner access helper functions.

Patch 12 handles the case of too many readers by reserving the sign
bit to designate that a reader lock attempt will fail and the locking
reader will be put to sleep. This will ensure that we will not overflow
the reader count.

Patch 13 merges the write-lock owner task pointer into the count.
Only 64-bit count has enough space to provide a reasonable number of
bits for reader count. This is for x86-64 only for the time being.

Patch 14 eliminates redundant computation of the merged owner-count.

With a locking microbenchmark running on 5.1 based kernel, the total
locking rates (in kops/s) on a 8-socket IvyBridge-EX system with equal
numbers of readers and writers (mixed) before and after this patchset
were:

   # of Threads   Before Patch  After Patch
        ---
21,179 9,436
41,505 8,268
8  721 7,041
   16  575 7,652
   32   70 2,189
   64   39   534


Waiman Long (14):
  locking/rwsem: Prevent unneeded warning during locking selftest
  locking/rwsem: Make owner a

[PATCH-tip v3 05/14] locking/rwsem: Remove rwsem_wake() wakeup optimization

2019-04-10 Thread Waiman Long
With the commit 59aabfc7e959 ("locking/rwsem: Reduce spinlock contention
in wakeup after up_read()/up_write()"), the rwsem_wake() forgoes doing
a wakeup if the wait_lock cannot be directly acquired and an optimistic
spinning locker is present.  This can help performance by avoiding
spinning on the wait_lock when it is contended.

With the later commit 133e89ef5ef3 ("locking/rwsem: Enable lockless
waiter wakeup(s)"), the performance advantage of the above optimization
diminishes as the average wait_lock hold time become much shorter.

By supporting rwsem lock handoff, we can no longer rely on the fact
that the presence of an optimistic spinning locker will ensure that the
lock will be acquired by a task soon. This can lead to missed wakeup
and application hang. So the commit 59aabfc7e959 ("locking/rwsem:
Reduce spinlock contention in wakeup after up_read()/up_write()")
will have to be reverted.

Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem-xadd.c | 74 -
 1 file changed, 74 deletions(-)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index e6f1d218ceca..d4854bfad589 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -372,25 +372,11 @@ static bool rwsem_optimistic_spin(struct rw_semaphore 
*sem)
lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
 }
-
-/*
- * Return true if the rwsem has active spinner
- */
-static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
-{
-   return osq_is_locked(&sem->osq);
-}
-
 #else
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 {
return false;
 }
-
-static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
-{
-   return false;
-}
 #endif
 
 /*
@@ -667,67 +653,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, 
long count)
unsigned long flags;
DEFINE_WAKE_Q(wake_q);
 
-   /*
-   * __rwsem_down_write_failed_common(sem)
-   *   rwsem_optimistic_spin(sem)
-   * osq_unlock(sem->osq)
-   *   ...
-   *   atomic_long_add_return(&sem->count)
-   *
-   *  - VS -
-   *
-   *  __up_write()
-   *if (atomic_long_sub_return_release(&sem->count) < 0)
-   *  rwsem_wake(sem)
-   *osq_is_locked(&sem->osq)
-   *
-   * And __up_write() must observe !osq_is_locked() when it observes the
-   * atomic_long_add_return() in order to not miss a wakeup.
-   *
-   * This boils down to:
-   *
-   * [S.rel] X = 1[RmW] r0 = (Y += 0)
-   * MB RMB
-   * [RmW]   Y += 1   [L]   r1 = X
-   *
-   * exists (r0=1 /\ r1=0)
-   */
-   smp_rmb();
-
-   /*
-* If a spinner is present and the handoff flag isn't set, it is
-* not necessary to do the wakeup.
-*
-* Try to do wakeup only if the trylock succeeds to minimize
-* spinlock contention which may introduce too much delay in the
-* unlock operation.
-*
-*spinning writer   up_write/up_read caller
-*---   ---
-* [S]   osq_unlock()   [L]   osq
-*   MB   RMB
-* [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
-*
-* Here, it is important to make sure that there won't be a missed
-* wakeup while the rwsem is free and the only spinning writer goes
-* to sleep without taking the rwsem. Even when the spinning writer
-* is just going to break out of the waiting loop, it will still do
-* a trylock in rwsem_down_write_failed() before sleeping. IOW, if
-* rwsem_has_spinner() is true, it will guarantee at least one
-* trylock attempt on the rwsem later on.
-*/
-   if (rwsem_has_spinner(sem) && !RWSEM_COUNT_HANDOFF(count)) {
-   /*
-* The smp_rmb() here is to make sure that the spinner
-* state is consulted before reading the wait_lock.
-*/
-   smp_rmb();
-   if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
-   return sem;
-   goto locked;
-   }
raw_spin_lock_irqsave(&sem->wait_lock, flags);
-locked:
 
if (!list_empty(&sem->wait_list))
__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-- 
2.18.1



[PATCH-tip v3 14/14] locking/rwsem: Remove redundant computation of writer lock word

2019-04-10 Thread Waiman Long
On 64-bit architectures, each rwsem writer will have its unique lock
word for acquiring the lock. Right now, the writer code recomputes the
lock word every time it tries to acquire the lock. This is a waste of
time. The lock word is now cached and reused when it is needed.

On 32-bit architectures, the extra constant argument to
rwsem_try_write_lock() and rwsem_try_write_lock_unqueued() should be
optimized out by the compiler.

Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem-xadd.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index b2b9d1719965..5545184f82f0 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -230,8 +230,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
  * race conditions between checking the rwsem wait list and setting the
  * sem->count accordingly.
  */
-static inline bool
-rwsem_try_write_lock(long count, struct rw_semaphore *sem, bool first)
+static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem,
+   const long wlock, bool first)
 {
long new;
 
@@ -241,7 +241,7 @@ rwsem_try_write_lock(long count, struct rw_semaphore *sem, 
bool first)
if (!first && RWSEM_COUNT_HANDOFF(count))
return false;
 
-   new = (count & ~RWSEM_FLAG_HANDOFF) + RWSEM_WRITER_LOCKED -
+   new = (count & ~RWSEM_FLAG_HANDOFF) + wlock -
  (list_is_singular(&sem->wait_list) ? RWSEM_FLAG_WAITERS : 0);
 
if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)) {
@@ -280,13 +280,14 @@ static inline bool rwsem_try_read_lock_unqueued(struct 
rw_semaphore *sem)
 /*
  * Try to acquire write lock before the writer has been put on wait queue.
  */
-static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem,
+const long wlock)
 {
long count = atomic_long_read(&sem->count);
 
while (!RWSEM_COUNT_LOCKED_OR_HANDOFF(count)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
-   count + RWSEM_WRITER_LOCKED)) {
+   count + wlock)) {
rwsem_set_owner(sem);
lockevent_inc(rwsem_opt_wlock);
return true;
@@ -436,7 +437,7 @@ static inline u64 rwsem_rspin_threshold(struct rw_semaphore 
*sem)
: 25 * NSEC_PER_USEC);
 }
 
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem, const long wlock)
 {
bool taken = false;
bool is_rt_task = rt_task(current);
@@ -465,7 +466,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, 
bool wlock)
/*
 * Try to acquire the lock
 */
-   taken = wlock ? rwsem_try_write_lock_unqueued(sem)
+   taken = wlock ? rwsem_try_write_lock_unqueued(sem, wlock)
  : rwsem_try_read_lock_unqueued(sem);
 
if (taken)
@@ -544,7 +545,8 @@ static inline bool rwsem_can_spin_on_owner(struct 
rw_semaphore *sem)
return false;
 }
 
-static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
+static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem,
+const long wlock)
 {
return false;
 }
@@ -601,7 +603,7 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, 
int state, long count)
 */
atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
adjustment = 0;
-   if (rwsem_optimistic_spin(sem, false)) {
+   if (rwsem_optimistic_spin(sem, 0)) {
unsigned long flags;
 
/*
@@ -717,10 +719,11 @@ __rwsem_down_write_failed_common(struct rw_semaphore 
*sem, int state)
struct rwsem_waiter waiter;
struct rw_semaphore *ret = sem;
DEFINE_WAKE_Q(wake_q);
+   const long wlock = RWSEM_WRITER_LOCKED;
 
/* do optimistic spinning and steal lock if possible */
if (rwsem_can_spin_on_owner(sem) &&
-   rwsem_optimistic_spin(sem, true))
+   rwsem_optimistic_spin(sem, wlock))
return sem;
 
/*
@@ -779,7 +782,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, 
int state)
/* wait until we successfully acquire the lock */
set_current_state(state);
while (true) {
-   if (rwsem_try_write_lock(count, sem, first))
+   if (rwsem_try_write_lock(count, sem, wlock, first))
break;
 
raw_spin_unlock_irq(&sem->wait_lock);
-- 
2.18.1



[PATCH-tip v3 08/14] locking/rwsem: Wake up almost all readers in wait queue

2019-04-10 Thread Waiman Long
When the front of the wait queue is a reader, other readers
immediately following the first reader will also be woken up at the
same time. However, if there is a writer in between, those readers
behind the writer will not be woken up.

Because of optimistic spinning, the lock acquisition order is not FIFO
anyway. The lock handoff mechanism will ensure that lock starvation
will not happen.

Assuming that the lock hold times of the other readers still in the
queue will be about the same as the readers that are being woken up,
there is really not much additional cost other than the additional
latency due to the wakeup of additional tasks by the waker. Therefore
all the readers up to a maximum of 256 in the queue are woken up when
the first waiter is a reader to improve reader throughput.

With a locking microbenchmark running on 5.1 based kernel, the total
locking rates (in kops/s) on a 8-socket IvyBridge-EX system with
equal numbers of readers and writers before and after this patch were
as follows:

   # of Threads  Pre-Patch   Post-patch
     -   --
4  1,6411,674
87311,062
   16564  924
   32 78  300
   64 38  195
  240 50  149

There is no performance gain at low contention level. At high contention
level, however, this patch gives a pretty decent performance boost.

Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem-xadd.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index de1bf9fea1e2..ecd4bddc343a 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -88,6 +88,13 @@ enum rwsem_wake_type {
  */
 #define RWSEM_WAIT_TIMEOUT (HZ/250)
 
+/*
+ * We limit the maximum number of readers that can be woken up for a
+ * wake-up call to not penalizing the waking thread for spending too
+ * much time doing it.
+ */
+#define MAX_READERS_WAKEUP 0x100
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_(), then the RWSEM_FLAG_WAITERS bit must
@@ -158,16 +165,16 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
}
 
/*
-* Grant an infinite number of read locks to the readers at the front
-* of the queue. We know that woken will be at least 1 as we accounted
-* for above. Note we increment the 'active part' of the count by the
+* Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
+* queue. We know that woken will be at least 1 as we accounted for
+* above. Note we increment the 'active part' of the count by the
 * number of readers before waking any processes up.
 */
list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
struct task_struct *tsk;
 
if (waiter->type == RWSEM_WAITING_FOR_WRITE)
-   break;
+   continue;
 
woken++;
tsk = waiter->task;
@@ -186,6 +193,12 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
 * after setting the reader waiter to nil.
 */
wake_q_add_safe(wake_q, tsk);
+
+   /*
+* Limit # of readers that can be woken up per wakeup call.
+*/
+   if (woken >= MAX_READERS_WAKEUP)
+   break;
}
 
adjustment = woken * RWSEM_READER_BIAS - adjustment;
-- 
2.18.1



[PATCH-tip v3 11/14] locking/rwsem: Add more rwsem owner access helpers

2019-04-10 Thread Waiman Long
Before combining owner and count, we are adding two new helpers for
accessing the owner value in the rwsem.

 1) struct task_struct *rwsem_get_owner(struct rw_semaphore *sem)
 2) bool is_rwsem_reader_owned(struct rw_semaphore *sem)

Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem-xadd.c | 15 ++-
 kernel/locking/rwsem.c  |  3 +--
 kernel/locking/rwsem.h  | 32 ++--
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09b245e0d6f4..196729fd7f94 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -312,7 +312,7 @@ static inline bool rwsem_can_spin_on_owner(struct 
rw_semaphore *sem)
 
preempt_disable();
rcu_read_lock();
-   owner = READ_ONCE(sem->owner);
+   owner = rwsem_get_owner(sem);
if (owner) {
ret = is_rwsem_owner_spinnable(owner) &&
 (is_rwsem_owner_reader(owner) || owner_on_cpu(owner));
@@ -344,15 +344,21 @@ enum owner_state {
 
 static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem)
 {
-   struct task_struct *owner = READ_ONCE(sem->owner);
+   struct task_struct *owner = rwsem_get_owner(sem);
long count;
 
if (!is_rwsem_owner_spinnable(owner))
return OWNER_NONSPINNABLE;
 
rcu_read_lock();
-   while (owner && !is_rwsem_owner_reader(owner)
-&& (READ_ONCE(sem->owner) == owner)) {
+   while (owner && !is_rwsem_owner_reader(owner)) {
+   struct task_struct *new_owner = rwsem_get_owner(sem);
+
+   if (new_owner != owner) {
+   owner = new_owner;
+   break;  /* The owner has changed */
+   }
+
/*
 * Ensure we emit the owner->on_cpu, dereference _after_
 * checking sem->owner still matches owner, if that fails,
@@ -379,7 +385,6 @@ static noinline enum owner_state rwsem_spin_on_owner(struct 
rw_semaphore *sem)
 * spinning except when here is no active locks and the handoff bit
 * is set. In this case, we have to stop spinning.
 */
-   owner = READ_ONCE(sem->owner);
if (!is_rwsem_owner_spinnable(owner))
return OWNER_NONSPINNABLE;
if (owner && !is_rwsem_owner_reader(owner))
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index ccbf18f560ff..38d24676e01c 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -198,8 +198,7 @@ EXPORT_SYMBOL(down_write_killable_nested);
 
 void up_read_non_owner(struct rw_semaphore *sem)
 {
-   DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
-   sem);
+   DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
__up_read(sem);
 }
 
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 19ae51ff7b91..ef7287f273fe 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -92,6 +92,11 @@ static inline void rwsem_clear_owner(struct rw_semaphore 
*sem)
WRITE_ONCE(sem->owner, NULL);
 }
 
+static inline struct task_struct *rwsem_get_owner(struct rw_semaphore *sem)
+{
+   return READ_ONCE(sem->owner);
+}
+
 /*
  * The task_struct pointer of the last owning reader will be left in
  * the owner field.
@@ -136,6 +141,23 @@ static inline bool is_rwsem_spinnable(struct rw_semaphore 
*sem)
return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
 }
 
+/*
+ * Return true if the rwsem is owned by a reader.
+ */
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+#ifdef CONFIG_DEBUG_RWSEMS
+   /*
+* Check the count to see if it is write-locked.
+*/
+   long count = atomic_long_read(&sem->count);
+
+   if (count & RWSEM_WRITER_MASK)
+   return false;
+#endif
+   return (unsigned long)sem->owner & RWSEM_READER_OWNED;
+}
+
 /*
  * Return true if rwsem is owned by an anonymous writer or readers.
  */
@@ -155,6 +177,7 @@ static inline void rwsem_clear_reader_owned(struct 
rw_semaphore *sem)
 {
unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
   | RWSEM_ANONYMOUSLY_OWNED;
+
if (READ_ONCE(sem->owner) == (struct task_struct *)val)
cmpxchg_relaxed((unsigned long *)&sem->owner, val,
RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
@@ -196,8 +219,7 @@ static inline void __down_read(struct rw_semaphore *sem)
if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
&sem->count) & RWSEM_READ_FAILED_MASK)) {
rwsem_down_read_failed(sem);
-   DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
-   RWSEM_READER_OWNED), sem);
+   DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
   

[PATCH-tip v3 07/14] locking/rwsem: Ensure an RT task will not spin on reader

2019-04-10 Thread Waiman Long
An RT task can do optimistic spinning only if the lock holder is
actually running. If the state of the lock holder isn't known, there
is a possibility that high priority of the RT task may block forward
progress of the lock holder if it happens to reside on the same CPU.
This will lead to deadlock. So we have to make sure that an RT task
will not spin on a reader-owned rwsem.

When the owner is temporarily set to NULL, it is more tricky to decide
if an RT task should stop spinning as it may be a temporary state
where another writer may have just stolen the lock which then failed
the task's trylock attempt. So one more retry is allowed to make sure
that the lock is not spinnable by an RT task.

When testing on a 8-socket IvyBridge-EX system, the one additional retry
seems to improve locking performance of RT write locking threads under
heavy contentions. The table below shows the locking rates (in kops/s)
with various write locking threads before and after the patch.

Locking threads Pre-patch Post-patch
--- - ---
4 2,753  2,608
8 2,529  2,520
   16 1,727  1,918
   32 1,263  1,956
   64   889  1,343

Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem-xadd.c | 36 +---
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index d38cf76ac17c..de1bf9fea1e2 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -349,6 +349,8 @@ static noinline enum owner_state rwsem_spin_on_owner(struct 
rw_semaphore *sem)
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 {
bool taken = false;
+   bool is_rt_task = rt_task(current);
+   int prev_owner_state = OWNER_NULL;
 
preempt_disable();
 
@@ -366,7 +368,12 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 *  2) readers own the lock as we can't determine if they are
 * actively running or not.
 */
-   while (rwsem_spin_on_owner(sem) & OWNER_SPINNABLE) {
+   for (;;) {
+   enum owner_state owner_state = rwsem_spin_on_owner(sem);
+
+   if (!(owner_state & OWNER_SPINNABLE))
+   break;
+
/*
 * Try to acquire the lock
 */
@@ -376,13 +383,28 @@ static bool rwsem_optimistic_spin(struct rw_semaphore 
*sem)
}
 
/*
-* When there's no owner, we might have preempted between the
-* owner acquiring the lock and setting the owner field. If
-* we're an RT task that will live-lock because we won't let
-* the owner complete.
+* An RT task cannot do optimistic spinning if it cannot
+* be sure the lock holder is running or live-lock may
+* happen if the current task and the lock holder happen
+* to run in the same CPU.
+*
+* When there's no owner or is reader-owned, an RT task
+* will stop spinning if the owner state is not a writer
+* at the previous iteration of the loop. This allows the
+* RT task to recheck if the task that steals the lock is
+* a spinnable writer. If so, it can keeps on spinning.
+*
+* If the owner is a writer, the need_resched() check is
+* done inside rwsem_spin_on_owner(). If the owner is not
+* a writer, need_resched() check needs to be done here.
 */
-   if (!sem->owner && (need_resched() || rt_task(current)))
-   break;
+   if (owner_state != OWNER_WRITER) {
+   if (need_resched())
+   break;
+   if (is_rt_task && (prev_owner_state != OWNER_WRITER))
+   break;
+   }
+   prev_owner_state = owner_state;
 
/*
 * The cpu_relax() call is a compiler barrier which forces
-- 
2.18.1



[PATCH-tip v3 12/14] locking/rwsem: Guard against making count negative

2019-04-10 Thread Waiman Long
The upper bits of the count field are used as the reader count. When
sufficient number of active readers are present, the most significant
bit will be set and the count becomes negative. If the number of active
readers keep on piling up, we may eventually overflow the reader counts.
This is not likely to happen unless the number of bits reserved for
reader count is reduced because those bits are needed for other purposes.

To prevent this count overflow from happening, the most significant bit
is now treated as a guard bit (RWSEM_FLAG_READFAIL). Read-lock attempts
will now fail for both the fast and optimistic spinning paths whenever
this bit is set. So all those extra readers will be put to sleep in
the wait queue. Wakeup will not happen until the reader count reaches 0.

Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem-xadd.c | 38 +++-
 kernel/locking/rwsem.h  | 59 ++---
 2 files changed, 73 insertions(+), 24 deletions(-)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 196729fd7f94..db13ed13c360 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -92,7 +92,8 @@ enum rwsem_wake_type {
 /*
  * We limit the maximum number of readers that can be woken up for a
  * wake-up call to not penalizing the waking thread for spending too
- * much time doing it.
+ * much time doing it as well as the unlikely possiblity of overflowing
+ * the reader count.
  */
 #define MAX_READERS_WAKEUP 0x100
 
@@ -558,12 +559,35 @@ rwsem_waiter_is_first(struct rw_semaphore *sem, struct 
rwsem_waiter *waiter)
  * Wait for the read lock to be granted
  */
 static inline struct rw_semaphore __sched *
-__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
+__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state, long 
count)
 {
-   long count, adjustment = -RWSEM_READER_BIAS;
+   long adjustment = -RWSEM_READER_BIAS;
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
 
+   if (unlikely(count < 0)) {
+   /*
+* The sign bit has been set meaning that too many active
+* readers are present. We need to decrement reader count &
+* enter wait queue immediately to avoid overflowing the
+* reader count.
+*
+* As preemption is not disabled, there is a remote
+* possibility that premption can happen in the narrow
+* timing window between incrementing and decrementing
+* the reader count and the task is put to sleep for a
+* considerable amount of time. If sufficient number
+* of such unfortunate sequence of events happen, we
+* may still overflow the reader count. It is extremely
+* unlikey, though. If this is a concern, we should consider
+* disable preemption during this timing window to make
+* sure that such unfortunate event will not happen.
+*/
+   atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
+   adjustment = 0;
+   goto queue;
+   }
+
if (!rwsem_can_spin_on_owner(sem))
goto queue;
 
@@ -664,16 +688,16 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, 
int state)
 }
 
 __visible struct rw_semaphore * __sched
-rwsem_down_read_failed(struct rw_semaphore *sem)
+rwsem_down_read_failed(struct rw_semaphore *sem, long cnt)
 {
-   return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
+   return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE, cnt);
 }
 EXPORT_SYMBOL(rwsem_down_read_failed);
 
 __visible struct rw_semaphore * __sched
-rwsem_down_read_failed_killable(struct rw_semaphore *sem)
+rwsem_down_read_failed_killable(struct rw_semaphore *sem, long cnt)
 {
-   return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
+   return __rwsem_down_read_failed_common(sem, TASK_KILLABLE, cnt);
 }
 EXPORT_SYMBOL(rwsem_down_read_failed_killable);
 
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index ef7287f273fe..e4d67c0a167b 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -44,13 +44,28 @@
 #endif
 
 /*
- * The definition of the atomic counter in the semaphore:
+ * On 64-bit architectures, the bit definitions of the count are:
  *
- * Bit  0   - writer locked bit
- * Bit  1   - waiters present bit
- * Bit  2   - lock handoff bit
- * Bits 3-7 - reserved
- * Bits 8-X - 24-bit (32-bit) or 56-bit reader count
+ * Bit  0- writer locked bit
+ * Bit  1- waiters present bit
+ * Bit  2- lock handoff bit
+ * Bits 3-7  - reserved
+ * Bits 8-62 - 55-bit reader count
+ * Bit  63   - read fail bit
+ *
+ * On 32-bit architectures, the bit definitions of the count are:
+ *
+ * Bit  0- writer locked bit
+ * Bit  1- waiters present bit
+ * Bit  2- lock handoff bit
+ * 

[PATCH-tip v3 13/14] locking/rwsem: Merge owner into count on x86-64

2019-04-10 Thread Waiman Long
With separate count and owner, there are timing windows where the two
values are inconsistent. That can cause problem when trying to figure
out the exact state of the rwsem. For instance, a RT task will stop
optimistic spinning if the lock is acquired by a writer but the owner
field isn't set yet. That can be solved by combining the count and
owner together in a single atomic value.

On 32-bit architectures, there aren't enough bits to hold both.
64-bit architectures, however, can have enough bits to do that. For
x86-64, the physical address can use up to 52 bits. That is 4PB of
memory. That leaves 12 bits available for other use. The task structure
pointer is aligned to the L1 cache size. That means another 6 bits
(64 bytes cacheline) will be available. Reserving 2 bits for status
flags, we will have 16 bits for the reader count and the read fail bit.
That can support up to (32k-1) readers.

The owner value will still be duplicated in the owner field as that
will ease debugging when looking at core dump.

This change is currently enabled for x86-64 only. Other 64-bit
architectures may be enabled in the future if the need arises.

With a locking microbenchmark running on 5.1 based kernel, the total
locking rates (in kops/s) on a 8-socket IvyBridge-EX system with
writer-only locking threads and then equal numbers of readers and writers
(mixed) before patch and after this and subsequent related patches were
as follows:

  Before Patch  After Patch
   # of Threads  wlockmixedwlockmixed
     ----
130,422   31,034   30,323   30,379
2 6,4276,6847,8049,436
4 6,7426,7387,5688,268
8 7,0927,2225,6797,041
   16 6,8827,1636,8487,652
   32 7,4587,3167,9752,189
   64 7,906  5208,269  534
  128 1,680  4258,047  448

In the single thread case, the complex write-locking operation does
introduce a little bit of overhead (about 0.3%). For the contended cases,
except for some anomalies in the data, there is no evidence that this
change will adversely impact performance.

When running the same microbenchmark with RT locking threads instead,
we got the following results:

  Before Patch  After Patch
   # of Threads  wlockmixedwlockmixed
     ----
2 4,0653,6424,7565,062
4 2,2541,9073,4602,496
8 2,386  9643,0121,964
   16 2,0951,5963,0831,862
   32 2,388  5303,717  359
   64 1,424  3224,060  401
  128 1,642  5104,488  628

It is obvious that RT tasks can benefit pretty significantly with this set
of patches.

Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem-xadd.c |  11 ++--
 kernel/locking/rwsem.h  | 101 +---
 2 files changed, 103 insertions(+), 9 deletions(-)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index db13ed13c360..b2b9d1719965 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -27,11 +27,11 @@
 /*
  * Guide to the rw_semaphore's count field.
  *
- * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
- * by a writer.
+ * When any of the RWSEM_WRITER_MASK bits in count is set, the lock is
+ * owned by a writer.
  *
  * The lock is owned by readers when
- * (1) the RWSEM_WRITER_LOCKED isn't set in count,
+ * (1) none of the RWSEM_WRITER_MASK bits is set in count,
  * (2) some of the reader bits are set in count, and
  * (3) the owner field has RWSEM_READ_OWNED bit set.
  *
@@ -47,6 +47,11 @@
 void __init_rwsem(struct rw_semaphore *sem, const char *name,
  struct lock_class_key *key)
 {
+   /*
+* We should support at least (4k-1) concurrent readers
+*/
+   BUILD_BUG_ON(sizeof(long) * 8 - RWSEM_READER_SHIFT < 12);
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Make sure we are not reinitializing a held semaphore:
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index e4d67c0a167b..2ad4d7261219 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -44,7 +44,41 @@
 #endif
 
 /*
- * On 64-bit architectures, the bit definitions of the count are:
+ * Enable the merging of owner into count for x86-64 only.
+ */
+#ifdef CONFIG_X86_64
+#define RWSEM_MERGE_OWNER_TO_COUNT
+#endif
+
+/*
+ * With separate count and owner, there are timing windows where the two
+ * values are inconsistent. That can cause problem when trying to figure
+ * out the exact state of the rwsem. That can be solved by combining
+ * the count and owner together in a single atomic value.
+ *
+ * On 64-bit architectures, the owner task structure pointer can be
+ * compre

[PATCH-tip v3 09/14] locking/rwsem: Enable readers spinning on writer

2019-04-10 Thread Waiman Long
This patch enables readers to optimistically spin on a
rwsem when it is owned by a writer instead of going to sleep
directly.  The rwsem_can_spin_on_owner() function is extracted
out of rwsem_optimistic_spin() and is called directly by
__rwsem_down_read_failed_common() and __rwsem_down_write_failed_common().

With a locking microbenchmark running on 5.1 based kernel, the total
locking rates (in kops/s) on an 8-socket IvyBridge-EX system with equal
numbers of readers and writers before and after the patch were as
follows:

   # of Threads  Pre-patchPost-patch
     ---
4  1,6741,684
8  1,0621,074
   16924  900
   32300  458
   64195  208
  128164  168
  240149  143

The performance change wasn't significant in this case, but this change
is required by a follow-on patch.

Signed-off-by: Waiman Long 
---
 kernel/locking/lock_events_list.h |  1 +
 kernel/locking/rwsem-xadd.c   | 88 ++-
 kernel/locking/rwsem.h|  3 ++
 3 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/kernel/locking/lock_events_list.h 
b/kernel/locking/lock_events_list.h
index 29e5c52197fa..333ed5fda333 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -56,6 +56,7 @@ LOCK_EVENT(rwsem_sleep_reader)/* # of reader sleeps   
*/
 LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps   */
 LOCK_EVENT(rwsem_wake_reader)  /* # of reader wakeups  */
 LOCK_EVENT(rwsem_wake_writer)  /* # of writer wakeups  */
+LOCK_EVENT(rwsem_opt_rlock)/* # of read locks opt-spin acquired*/
 LOCK_EVENT(rwsem_opt_wlock)/* # of write locks opt-spin acquired   */
 LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings*/
 LOCK_EVENT(rwsem_rlock)/* # of read locks acquired 
*/
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index ecd4bddc343a..7ee78a752815 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -246,6 +246,30 @@ rwsem_try_write_lock(long count, struct rw_semaphore *sem, 
bool first)
 }
 
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * Try to acquire read lock before the reader is put on wait queue.
+ * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff
+ * is ongoing.
+ */
+static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem)
+{
+   long count = atomic_long_read(&sem->count);
+
+   if (RWSEM_COUNT_WLOCKED_OR_HANDOFF(count))
+   return false;
+
+   count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count);
+   if (!RWSEM_COUNT_WLOCKED_OR_HANDOFF(count)) {
+   rwsem_set_reader_owned(sem);
+   lockevent_inc(rwsem_opt_rlock);
+   return true;
+   }
+
+   /* Back out the change */
+   atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
+   return false;
+}
+
 /*
  * Try to acquire write lock before the writer has been put on wait queue.
  */
@@ -280,9 +304,12 @@ static inline bool rwsem_can_spin_on_owner(struct 
rw_semaphore *sem)
 
BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
 
-   if (need_resched())
+   if (need_resched()) {
+   lockevent_inc(rwsem_opt_fail);
return false;
+   }
 
+   preempt_disable();
rcu_read_lock();
owner = READ_ONCE(sem->owner);
if (owner) {
@@ -290,6 +317,9 @@ static inline bool rwsem_can_spin_on_owner(struct 
rw_semaphore *sem)
  owner_on_cpu(owner);
}
rcu_read_unlock();
+   preempt_enable();
+
+   lockevent_cond_inc(rwsem_opt_fail, !ret);
return ret;
 }
 
@@ -359,7 +389,7 @@ static noinline enum owner_state rwsem_spin_on_owner(struct 
rw_semaphore *sem)
return !owner ? OWNER_NULL : OWNER_READER;
 }
 
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
 {
bool taken = false;
bool is_rt_task = rt_task(current);
@@ -368,9 +398,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
preempt_disable();
 
/* sem->wait_lock should not be held when doing optimistic spinning */
-   if (!rwsem_can_spin_on_owner(sem))
-   goto done;
-
if (!osq_lock(&sem->osq))
goto done;
 
@@ -390,10 +417,11 @@ static bool rwsem_optimistic_spin(struct rw_semaphore 
*sem)
/*
 * Try to acquire the lock
 */
-   if (rwsem_try_write_lock_unqueued(sem)) {
-   taken = true;
+   taken = wlock ? rwsem_try_write_lock_unqueued(sem)
+ : rwsem_try_read_lock_u

[PATCH-tip v3 04/14] locking/rwsem: Implement lock handoff to prevent lock starvation

2019-04-10 Thread Waiman Long
Because of writer lock stealing, it is possible that a constant
stream of incoming writers will cause a waiting writer or reader to
wait indefinitely leading to lock starvation.

The mutex code has a lock handoff mechanism to prevent lock starvation.
This patch implements a similar lock handoff mechanism to disable
lock stealing and force lock handoff to the first waiter in the queue
after at least a 4ms waiting period unless it is a RT writer task which
doesn't need to wait. The waiting period is used to avoid discouraging
lock stealing so much that it affects performance.

A rwsem microbenchmark was run for 5 seconds on a 2-socket 40-core
80-thread Skylake system with a v5.1 based kernel and 240 write_lock
threads with 5us sleep critical section.

Before the patch, the min/mean/max numbers of locking operations for
the locking threads were 1/7,792/173,696. After the patch, the figures
became 5,842/6,542/7,458.  It can be seen that the rwsem became much
more fair, though there was a drop of about 16% in the mean locking
operations done which was a tradeoff of having better fairness.

Making the waiter set the handoff bit right after the first wakeup can
impact performance especially with a mixed reader/writer workload. With
the same microbenchmark with short critical section and equal number of
reader and writer threads (40/40), the reader/writer locking operation
counts with the current patch were:

  40 readers, Iterations Min/Mean/Max = 1,793/1,794/1,796
  40 writers, Iterations Min/Mean/Max = 1,793/34,956/86,081

By making waiter set handoff bit immediately after wakeup:

  40 readers, Iterations Min/Mean/Max = 43/44/46
  40 writers, Iterations Min/Mean/Max = 43/1,263/3,191

Signed-off-by: Waiman Long 
---
 kernel/locking/lock_events_list.h |   2 +
 kernel/locking/rwsem-xadd.c   | 159 +++---
 kernel/locking/rwsem.h|  23 +++--
 3 files changed, 139 insertions(+), 45 deletions(-)

diff --git a/kernel/locking/lock_events_list.h 
b/kernel/locking/lock_events_list.h
index ad7668cfc9da..29e5c52197fa 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -61,7 +61,9 @@ LOCK_EVENT(rwsem_opt_fail)/* # of failed opt-spinnings
*/
 LOCK_EVENT(rwsem_rlock)/* # of read locks acquired 
*/
 LOCK_EVENT(rwsem_rlock_fast)   /* # of fast read locks acquired*/
 LOCK_EVENT(rwsem_rlock_fail)   /* # of failed read lock acquisitions   */
+LOCK_EVENT(rwsem_rlock_handoff)/* # of read lock handoffs  
*/
 LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls  */
 LOCK_EVENT(rwsem_wlock)/* # of write locks acquired
*/
 LOCK_EVENT(rwsem_wlock_fail)   /* # of failed write lock acquisitions  */
+LOCK_EVENT(rwsem_wlock_handoff)/* # of write lock handoffs 
*/
 LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index adab6477be51..e6f1d218ceca 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -73,6 +73,7 @@ struct rwsem_waiter {
struct list_head list;
struct task_struct *task;
enum rwsem_waiter_type type;
+   unsigned long timeout;
 };
 
 enum rwsem_wake_type {
@@ -81,6 +82,12 @@ enum rwsem_wake_type {
RWSEM_WAKE_READ_OWNED   /* Waker thread holds the read lock */
 };
 
+/*
+ * The typical HZ value is either 250 or 1000. So set the minimum waiting
+ * time to 4ms in the wait queue before initiating the handoff protocol.
+ */
+#define RWSEM_WAIT_TIMEOUT (HZ/250)
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_(), then the RWSEM_FLAG_WAITERS bit must
@@ -131,6 +138,15 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
adjustment = RWSEM_READER_BIAS;
oldcount = atomic_long_fetch_add(adjustment, &sem->count);
if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
+   /*
+* Initiate handoff to reader, if applicable.
+*/
+   if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
+   time_after(jiffies, waiter->timeout)) {
+   adjustment -= RWSEM_FLAG_HANDOFF;
+   lockevent_inc(rwsem_rlock_handoff);
+   }
+
atomic_long_sub(adjustment, &sem->count);
return;
}
@@ -179,6 +195,12 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
adjustment -= RWSEM_FLAG_WAITERS;
}
 
+   /*
+* Clear the handoff flag
+*/
+   if (woken && RWSEM_COUNT_HANDOFF(atomic_long_read(&sem->count)))
+   adjustment -= RWSEM_FLAG_HANDOFF;
+
if (adjustment)
atomic_long_add(adjustment,

[PATCH-tip v3 06/14] locking/rwsem: Make rwsem_spin_on_owner() return owner state

2019-04-10 Thread Waiman Long
This patch modifies rwsem_spin_on_owner() to return four possible
values to better reflect the state of lock holder which enables us to
make a better decision of what to do next.

In the special case that there is no active lock and the handoff bit
is set, optimistic spinning has to be stopped.

Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem-xadd.c | 40 ++---
 kernel/locking/rwsem.h  |  5 +
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index d4854bfad589..d38cf76ac17c 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -281,14 +281,30 @@ static inline bool rwsem_can_spin_on_owner(struct 
rw_semaphore *sem)
 }
 
 /*
- * Return true only if we can still spin on the owner field of the rwsem.
+ * Return the folowing 4 values depending on the lock owner state.
+ *   OWNER_NULL  : owner is currently NULL
+ *   OWNER_WRITER: when owner changes and is a writer
+ *   OWNER_READER: when owner changes and the new owner may be a reader.
+ *   OWNER_NONSPINNABLE:
+ *when optimistic spinning has to stop because either the
+ *owner stops running, is unknown, or its timeslice has
+ *been used up.
  */
-static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
+enum owner_state {
+   OWNER_NULL  = 1 << 0,
+   OWNER_WRITER= 1 << 1,
+   OWNER_READER= 1 << 2,
+   OWNER_NONSPINNABLE  = 1 << 3,
+};
+#define OWNER_SPINNABLE(OWNER_NULL | OWNER_WRITER)
+
+static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem)
 {
struct task_struct *owner = READ_ONCE(sem->owner);
+   long count;
 
if (!is_rwsem_owner_spinnable(owner))
-   return false;
+   return OWNER_NONSPINNABLE;
 
rcu_read_lock();
while (owner && (READ_ONCE(sem->owner) == owner)) {
@@ -306,7 +322,7 @@ static noinline bool rwsem_spin_on_owner(struct 
rw_semaphore *sem)
 */
if (need_resched() || !owner_on_cpu(owner)) {
rcu_read_unlock();
-   return false;
+   return OWNER_NONSPINNABLE;
}
 
cpu_relax();
@@ -315,9 +331,19 @@ static noinline bool rwsem_spin_on_owner(struct 
rw_semaphore *sem)
 
/*
 * If there is a new owner or the owner is not set, we continue
-* spinning.
+* spinning except when here is no active locks and the handoff bit
+* is set. In this case, we have to stop spinning.
 */
-   return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
+   owner = READ_ONCE(sem->owner);
+   if (!is_rwsem_owner_spinnable(owner))
+   return OWNER_NONSPINNABLE;
+   if (owner && !is_rwsem_owner_reader(owner))
+   return OWNER_WRITER;
+
+   count = atomic_long_read(&sem->count);
+   if (RWSEM_COUNT_HANDOFF(count) && !RWSEM_COUNT_LOCKED(count))
+   return OWNER_NONSPINNABLE;
+   return !owner ? OWNER_NULL : OWNER_READER;
 }
 
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -340,7 +366,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 *  2) readers own the lock as we can't determine if they are
 * actively running or not.
 */
-   while (rwsem_spin_on_owner(sem)) {
+   while (rwsem_spin_on_owner(sem) & OWNER_SPINNABLE) {
/*
 * Try to acquire the lock
 */
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 809a73be391e..83148a7d4f41 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -119,6 +119,11 @@ static inline bool is_rwsem_owner_spinnable(struct 
task_struct *owner)
return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
 }
 
+static inline bool is_rwsem_owner_reader(struct task_struct *owner)
+{
+   return (unsigned long)owner & RWSEM_READER_OWNED;
+}
+
 /*
  * Return true if rwsem is owned by an anonymous writer or readers.
  */
-- 
2.18.1



[PATCH-tip v3 02/14] locking/rwsem: Make owner available even if !CONFIG_RWSEM_SPIN_ON_OWNER

2019-04-10 Thread Waiman Long
The owner field in the rw_semaphore structure is used primarily for
optimistic spinning. However, identifying the rwsem owner can also be
helpful in debugging as well as tracing locking related issues when
analyzing crash dump. The owner field may also store state information
that can be important to the operation of the rwsem.

So the owner field is now made a permanent member of the rw_semaphore
structure irrespective of CONFIG_RWSEM_SPIN_ON_OWNER.

Signed-off-by: Waiman Long 
---
 include/linux/rwsem.h  |  6 +++---
 kernel/locking/rwsem.h | 23 ---
 lib/Kconfig.debug  |  8 
 3 files changed, 7 insertions(+), 30 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 2ea18a3def04..6b902121389f 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -34,12 +34,12 @@
  */
 struct rw_semaphore {
atomic_long_t count;
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
-* Write owner. Used as a speculative check to see
-* if the owner is running on the cpu.
+* Write owner or one of the read owners. Can be used as a
+* speculative check to see if the owner is running on the cpu.
 */
struct task_struct *owner;
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
struct optimistic_spin_queue osq; /* spinner MCS lock */
 #endif
raw_spinlock_t wait_lock;
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 64877f5294e3..eb9c8534299b 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -61,7 +61,6 @@
 #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS(RWSEM_WAITING_BIAS + 
RWSEM_ACTIVE_BIAS)
 
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 /*
  * All writes to owner are protected by WRITE_ONCE() to make sure that
  * store tearing can't happen as optimistic spinners may read and use
@@ -126,7 +125,6 @@ static inline bool rwsem_has_anonymous_owner(struct 
task_struct *owner)
  * real owner or one of the real owners. The only exception is when the
  * unlock is done by up_read_non_owner().
  */
-#define rwsem_clear_reader_owned rwsem_clear_reader_owned
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 {
unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
@@ -135,28 +133,7 @@ static inline void rwsem_clear_reader_owned(struct 
rw_semaphore *sem)
cmpxchg_relaxed((unsigned long *)&sem->owner, val,
RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
 }
-#endif
-
 #else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
-  struct task_struct *owner)
-{
-}
-
-static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
-{
-}
-#endif
-
-#ifndef rwsem_clear_reader_owned
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 {
 }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0d9e81779e37..2047f3884540 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1067,7 +1067,7 @@ config PROVE_LOCKING
select DEBUG_SPINLOCK
select DEBUG_MUTEXES
select DEBUG_RT_MUTEXES if RT_MUTEXES
-   select DEBUG_RWSEMS if RWSEM_SPIN_ON_OWNER
+   select DEBUG_RWSEMS
select DEBUG_WW_MUTEX_SLOWPATH
select DEBUG_LOCK_ALLOC
select TRACE_IRQFLAGS
@@ -1171,10 +1171,10 @@ config DEBUG_WW_MUTEX_SLOWPATH
 
 config DEBUG_RWSEMS
bool "RW Semaphore debugging: basic checks"
-   depends on DEBUG_KERNEL && RWSEM_SPIN_ON_OWNER
+   depends on DEBUG_KERNEL
help
- This debugging feature allows mismatched rw semaphore locks and 
unlocks
- to be detected and reported.
+ This debugging feature allows mismatched rw semaphore locks
+ and unlocks to be detected and reported.
 
 config DEBUG_LOCK_ALLOC
bool "Lock debugging: detect incorrect freeing of live locks"
-- 
2.18.1



Re: [PATCH-tip v2 02/12] locking/rwsem: Implement lock handoff to prevent lock starvation

2019-04-10 Thread Peter Zijlstra
On Fri, Apr 05, 2019 at 03:21:05PM -0400, Waiman Long wrote:
> Because of writer lock stealing, it is possible that a constant
> stream of incoming writers will cause a waiting writer or reader to
> wait indefinitely leading to lock starvation.
> 
> The mutex code has a lock handoff mechanism to prevent lock starvation.
> This patch implements a similar lock handoff mechanism to disable
> lock stealing and force lock handoff to the first waiter in the queue
> after at least a 5ms waiting period. The waiting period is used to
> avoid discouraging lock stealing too much to affect performance.

I would say the handoff it not at all similar to the mutex code. It is
in fact radically different.

> @@ -131,6 +138,15 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
>   adjustment = RWSEM_READER_BIAS;
>   oldcount = atomic_long_fetch_add(adjustment, &sem->count);
>   if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
> + /*
> +  * Initiate handoff to reader, if applicable.
> +  */
> + if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
> + time_after(jiffies, waiter->timeout)) {
> + adjustment -= RWSEM_FLAG_HANDOFF;
> + lockevent_inc(rwsem_rlock_handoff);
> + }
> +
>   atomic_long_sub(adjustment, &sem->count);
>   return;
>   }

That confuses the heck out of me...

The above seems to rely on __rwsem_mark_wake() to be fully serialized
(and it is, by ->wait_lock, but that isn't spelled out anywhere) such
that we don't get double increment of FLAG_HANDOFF.

So there is NO __rwsem_mark_wake() vs __rwsem_mark_wake() race like:

  CPU0  CPU1

  oldcount = atomic_long_fetch_add(adjustment, &sem->count)

oldcount = 
atomic_long_fetch_add(adjustment, &sem->count)

  if (!(oldcount & HANDOFF))
adjustment -= HANDOFF;

if (!(oldcount & HANDOFF))
  adjustment -= HANDOFF;
  atomic_long_sub(adjustment)
atomic_long_sub(adjustment)


*whoops* double negative decrement of HANDOFF (aka double increment).


However there is another site that fiddles with the HANDOFF bit, namely
__rwsem_down_write_failed_common(), and that does:

+   atomic_long_or(RWSEM_FLAG_HANDOFF, &sem->count);

_OUTSIDE_ of ->wait_lock, which would yield:

  CPU0  CPU1

  oldcount = atomic_long_fetch_add(adjustment, &sem->count)

atomic_long_or(HANDOFF)

  if (!(oldcount & HANDOFF))
adjustment -= HANDOFF;

  atomic_long_sub(adjustment)

*whoops*, incremented HANDOFF on HANDOFF.


And there's not a comment in sight that would elucidate if this is
possible or not.


Also:

+   atomic_long_or(RWSEM_FLAG_HANDOFF, &sem->count);
+   first++;
+
+   /*
+* Make sure the handoff bit is seen by
+* others before proceeding.
+*/
+   smp_mb__after_atomic();

That comment is utter nonsense. smp_mb() doesn't (and cannot) 'make
visible'. There needs to be order between two memops on both sides.



[GIT PULL] Please pull RDMA subsystem changes

2019-04-10 Thread Jason Gunthorpe
Hi Linus,

Second rc pull request

The following changes since commit 8c2ffd9174779014c3fe1f96d9dc3641d9175f00:

  Linux 5.1-rc2 (2019-03-24 14:02:26 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git tags/for-linus

for you to fetch changes up to d737b25b1ae0540ba13cbd45ebb9b58a1d6d7f0d:

  IB/hfi1: Do not flush send queue in the TID RDMA second leg (2019-04-10 
15:09:30 -0300)


5.1 Second RC pull request

Several driver bug fixes posted in the last several weeks

- Several bug fixes for the hfi1 driver 'TID RDMA' functionality merged
  into 5.1. Since TID RDMA is on by default these all seem to be
  regressions.

- Wrong software permission checks on memory in mlx5

- Memory leak in vmw_pvrdma during driver remove

- Several bug fixes for hns driver features merged into 5.1


Kaike Wan (5):
  IB/hfi1: Failed to drain send queue when QP is put into error state
  IB/hfi1: Clear the IOWAIT pending bits when QP is put into error state
  IB/hfi1: Eliminate opcode tests on mr deref
  IB/hfi1: Fix the allocation of RSM table
  IB/hfi1: Do not flush send queue in the TID RDMA second leg

Kamal Heib (1):
  RDMA/vmw_pvrdma: Fix memory leak on pvrdma_pci_remove

Lijun Ou (1):
  RDMA/hns: Fix bug that caused srq creation to fail

Moni Shoua (1):
  IB/mlx5: Reset access mask when looping inside page fault handler

Yangyang Li (1):
  RDMA/hns: Bugfix for SCC hem free

 drivers/infiniband/hw/hfi1/chip.c  | 26 +++--
 drivers/infiniband/hw/hfi1/qp.c|  4 +++-
 drivers/infiniband/hw/hfi1/rc.c|  4 ++--
 drivers/infiniband/hw/hfi1/tid_rdma.c  | 31 +++---
 drivers/infiniband/hw/hns/hns_roce_hem.c   |  6 +++--
 drivers/infiniband/hw/hns/hns_roce_mr.c|  4 ++--
 drivers/infiniband/hw/hns/hns_roce_qp.c|  3 ---
 drivers/infiniband/hw/mlx5/odp.c   |  3 ++-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c |  2 ++
 9 files changed, 42 insertions(+), 41 deletions(-)


signature.asc
Description: PGP signature


Re: [PATCH 2/2] kernel: use sysctl shared variables for range check

2019-04-10 Thread Kees Cook
On Mon, Apr 8, 2019 at 3:09 PM Matteo Croce  wrote:
>
> Use the shared variables for range check, instead of declaring a local one
> in every source file.

I was expecting this to be a tree-wide change for all the cases found
by patch 1's "git grep".

Slight change to the grep for higher accuracy:

$ git grep -E '\.extra[12].*&(zero|one|int_max)\b' |wc -l
245

Only 31 sources:
$ git grep -E '\.extra[12].*&(zero|one|int_max)\b' | cut -d: -f1 |
sort -u > /tmp/list.txt
$ wc -l /tmp/list.txt
31

One thing I wonder about is if any of these cases depend on the extra
variable being non-const (many of these are just "static int").

$ egrep -H '\b(zero|one|int_max)\b.*=' $(cat /tmp/list.txt) | grep -v static

Looks like none, so it'd be safe. How about doing this tree-wide for
all 31 cases? (Coccinelle might be able to help.)

-Kees

>
> Signed-off-by: Matteo Croce 
> ---
>  kernel/pid_namespace.c |   3 +-
>  kernel/sysctl.c| 193 -
>  kernel/ucount.c|   6 +-
>  3 files changed, 98 insertions(+), 104 deletions(-)
>
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index aa6e72fb7c08..ddbb51bc4968 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -290,14 +290,13 @@ static int pid_ns_ctl_handler(struct ctl_table *table, 
> int write,
>  }
>
>  extern int pid_max;
> -static int zero = 0;
>  static struct ctl_table pid_ns_ctl_table[] = {
> {
> .procname = "ns_last_pid",
> .maxlen = sizeof(int),
> .mode = 0666, /* permissions are checked in the handler */
> .proc_handler = pid_ns_ctl_handler,
> -   .extra1 = &zero,
> +   .extra1 = (void *)&sysctl_zero,
> .extra2 = &pid_max,
> },
> { }
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 553b19439714..d6f4b26951e1 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -123,9 +123,6 @@ static int sixty = 60;
>  #endif
>
>  static int __maybe_unused neg_one = -1;
> -
> -static int zero;
> -static int __maybe_unused one = 1;
>  static int __maybe_unused two = 2;
>  static int __maybe_unused four = 4;
>  static unsigned long zero_ul;
> @@ -388,8 +385,8 @@ static struct ctl_table kern_table[] = {
> .maxlen = sizeof(unsigned int),
> .mode   = 0644,
> .proc_handler   = sysctl_schedstats,
> -   .extra1 = &zero,
> -   .extra2 = &one,
> +   .extra1 = (void *)&sysctl_zero,
> +   .extra2 = (void *)&sysctl_one,
> },
>  #endif /* CONFIG_SCHEDSTATS */
>  #endif /* CONFIG_SMP */
> @@ -421,7 +418,7 @@ static struct ctl_table kern_table[] = {
> .maxlen = sizeof(unsigned int),
> .mode   = 0644,
> .proc_handler   = proc_dointvec_minmax,
> -   .extra1 = &one,
> +   .extra1 = (void *)&sysctl_one,
> },
> {
> .procname   = "numa_balancing",
> @@ -429,8 +426,8 @@ static struct ctl_table kern_table[] = {
> .maxlen = sizeof(unsigned int),
> .mode   = 0644,
> .proc_handler   = sysctl_numa_balancing,
> -   .extra1 = &zero,
> -   .extra2 = &one,
> +   .extra1 = (void *)&sysctl_zero,
> +   .extra2 = (void *)&sysctl_one,
> },
>  #endif /* CONFIG_NUMA_BALANCING */
>  #endif /* CONFIG_SCHED_DEBUG */
> @@ -462,8 +459,8 @@ static struct ctl_table kern_table[] = {
> .maxlen = sizeof(unsigned int),
> .mode   = 0644,
> .proc_handler   = proc_dointvec_minmax,
> -   .extra1 = &zero,
> -   .extra2 = &one,
> +   .extra1 = (void *)&sysctl_zero,
> +   .extra2 = (void *)&sysctl_one,
> },
>  #endif
>  #ifdef CONFIG_CFS_BANDWIDTH
> @@ -473,7 +470,7 @@ static struct ctl_table kern_table[] = {
> .maxlen = sizeof(unsigned int),
> .mode   = 0644,
> .proc_handler   = proc_dointvec_minmax,
> -   .extra1 = &one,
> +   .extra1 = (void *)&sysctl_one,
> },
>  #endif
>  #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
> @@ -483,8 +480,8 @@ static struct ctl_table kern_table[] = {
> .maxlen = sizeof(unsigned int),
> .mode   = 0644,
> .proc_handler   = sched_energy_aware_handler,
> -   .extra1 = &zero,
> -   .extra2 = &one,
> +   .extra1 = (void *)&sysctl_zero,
> +   .extra2 = (void *)&sysctl_one,
> },
>  #endif
>  #ifdef CONFIG_PROVE_

Re: [PATCH 18/22] watchdog: mt7621_wdt: Use 'dev' instead of dereferencing it repeatedly

2019-04-10 Thread Joe Perches
On Wed, 2019-04-10 at 09:27 -0700, Guenter Roeck wrote:
> Introduce local variable 'struct device *dev' and use it instead of
> dereferencing it repeatedly.
> 
> The conversion was done automatically with coccinelle using the
> following semantic patches. The semantic patches and the scripts
> used to generate this commit log are available at
> https://github.com/groeck/coccinelle-patches

Interesting collection.  It would be useful to specify which
particular script generated or enabled this patch.

Just scanning briefly, it might have been this one:
https://github.com/groeck/coccinelle-patches/blob/master/common/deref.cocci
But it looks like some manual bit might have been required too.

And trivially:

> diff --git a/drivers/watchdog/mt7621_wdt.c b/drivers/watchdog/mt7621_wdt.c
[]
> @@ -133,18 +133,19 @@ static struct watchdog_device mt7621_wdt_dev = {
[]
>   watchdog_init_timeout(&mt7621_wdt_dev, mt7621_wdt_dev.max_timeout,
> -   &pdev->dev);
> +   dev);

This could be on one line.




Re: [PATCH v3 1/9] ARM: dts: imx6qdl: Specify IMX6QDL_CLK_IPG as "ipg" clock to SDMA

2019-04-10 Thread Adam Ford
On Fri, Mar 29, 2019 at 1:49 AM Andrey Smirnov  wrote:
>
> Since 25aaa75df1e6 SDMA driver uses clock rates of "ipg" and "ahb"
> clock to determine if it needs to configure the IP block as operating
> at 1:1 or 1:2 clock ratio (ACR bit in SDMAARM_CONFIG). Specifying both
> clocks as IMX6QDL_CLK_SDMA results in driver incorrectly thinking that
> ratio is 1:1 which results in broken SDMA functionality (this at least
> breaks RAVE SP serdev driver on RDU2). Fix the code to specify
> IMX6QDL_CLK_IPG as "ipg" clock for SDMA, to avoid detecting incorrect
> clock ratio.
>
Thank you for this!  I think the 5.1 kernel is the first revision to include
the i.MX6 kit from Logic PD and I thought I messed something up, so I
was off in a completely different direction trying to figure out why
it was broken.  With your patch, my board works again!

> Fixes: 25aaa75df1e6 ("dmaengine: imx-sdma: add clock ratio 1:1 check")
> Signed-off-by: Andrey Smirnov 
> Reviewed-by: Lucas Stach 

Tested-by: Adam Ford  #imx6q-logicpd

> Cc: Angus Ainslie (Purism) 
> Cc: Chris Healy 
> Cc: Lucas Stach 
> Cc: Fabio Estevam 
> Cc: Shawn Guo 
> Cc: linux-arm-ker...@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  arch/arm/boot/dts/imx6qdl.dtsi | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/arm/boot/dts/imx6qdl.dtsi b/arch/arm/boot/dts/imx6qdl.dtsi
> index 9f9aa6e7ed0e..354feba077b2 100644
> --- a/arch/arm/boot/dts/imx6qdl.dtsi
> +++ b/arch/arm/boot/dts/imx6qdl.dtsi
> @@ -949,7 +949,7 @@
> compatible = "fsl,imx6q-sdma", 
> "fsl,imx35-sdma";
> reg = <0x020ec000 0x4000>;
> interrupts = <0 2 IRQ_TYPE_LEVEL_HIGH>;
> -   clocks = <&clks IMX6QDL_CLK_SDMA>,
> +   clocks = <&clks IMX6QDL_CLK_IPG>,
>  <&clks IMX6QDL_CLK_SDMA>;
> clock-names = "ipg", "ahb";
> #dma-cells = <3>;
> --
> 2.20.1
>
>
> ___
> linux-arm-kernel mailing list
> linux-arm-ker...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel


[PATCH] watchdog: machzwd: Mark expected switch fall-through

2019-04-10 Thread Gustavo A. R. Silva
In preparation to enabling -Wimplicit-fallthrough, mark switch
cases where we are expecting to fall through.

This patch fixes the following warnings:

drivers/watchdog/machzwd.c: In function ‘zf_set_timer’:
./arch/x86/include/asm/io.h:355:14: warning: this statement may fall through 
[-Wimplicit-fallthrough=]
 #define outw outw
drivers/watchdog/machzwd.c:80:53: note: in expansion of macro ‘outw’
 #define zf_writew(port, data)  { outb(port, INDEX); outw(data, DATA_W); }
 ^~~~
drivers/watchdog/machzwd.c:179:3: note: in expansion of macro ‘zf_writew’
   zf_writew(COUNTER_1, new);
   ^
drivers/watchdog/machzwd.c:180:2: note: here
  case WD2:
  ^~~~

Warning level 3 was used: -Wimplicit-fallthrough=3

This patch is part of the ongoing efforts to enable
-Wimplicit-fallthrough.

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/watchdog/machzwd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/watchdog/machzwd.c b/drivers/watchdog/machzwd.c
index 88d823d87a4b..108928dbc754 100644
--- a/drivers/watchdog/machzwd.c
+++ b/drivers/watchdog/machzwd.c
@@ -177,6 +177,7 @@ static inline void zf_set_timer(unsigned short new, 
unsigned char n)
switch (n) {
case WD1:
zf_writew(COUNTER_1, new);
+   /* fall through */
case WD2:
zf_writeb(COUNTER_2, new > 0xff ? 0xff : new);
default:
-- 
2.21.0



Re: [PATCH] of: del redundant type conversion

2019-04-10 Thread Frank Rowand
On 4/10/19 1:29 AM, xiaojiangfeng wrote:
> The type of variable l in early_init_dt_scan_chosen is
> int, there is no need to convert to int.
> 
> Signed-off-by: xiaojiangfeng 
> ---
>  drivers/of/fdt.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
> index 4734223..de893c9 100644
> --- a/drivers/of/fdt.c
> +++ b/drivers/of/fdt.c
> @@ -1091,7 +1091,7 @@ int __init early_init_dt_scan_chosen(unsigned long 
> node, const char *uname,
>   /* Retrieve command line */
>   p = of_get_flat_dt_prop(node, "bootargs", &l);
>   if (p != NULL && l > 0)
> - strlcpy(data, p, min((int)l, COMMAND_LINE_SIZE));
> + strlcpy(data, p, min(l, COMMAND_LINE_SIZE));
>  
>   /*
>* CONFIG_CMDLINE is meant to be a default in case nothing else
> 

Thanks for catching the redundant cast.

There is a second problem detected by sparse on that line:

  drivers/of/fdt.c:1094:34: warning: expression using sizeof(void)

Can you please fix both issues?

Thanks,

Frank


Re: [PATCH 12/12] [PROBABLY WRONG] s390: void '0' constraint in inline assembly

2019-04-10 Thread Arnd Bergmann
On Wed, Apr 10, 2019 at 3:55 PM Martin Schwidefsky
 wrote:
>
> On Mon,  8 Apr 2019 23:26:25 +0200
> Arnd Bergmann  wrote:
>
> > diff --git a/arch/s390/include/asm/processor.h 
> > b/arch/s390/include/asm/processor.h
> > index 700c650ffd4f..84c59c99668a 100644
> > --- a/arch/s390/include/asm/processor.h
> > +++ b/arch/s390/include/asm/processor.h
> > @@ -262,7 +262,7 @@ static __no_kasan_or_inline unsigned short stap(void)
> >   register unsigned long r4 asm("6") = (unsigned long)(arg5)
> >
> >  #define CALL_FMT_0
> > -#define CALL_FMT_1 CALL_FMT_0, "0" (r2)
> > +#define CALL_FMT_1 CALL_FMT_0, "d" (r2)
> >  #define CALL_FMT_2 CALL_FMT_1, "d" (r3)
> >  #define CALL_FMT_3 CALL_FMT_2, "d" (r4)
> >  #define CALL_FMT_4 CALL_FMT_3, "d" (r5)
>
> This is (slightly) wrong. %r2 is used as the input register for the first 
> argument
> and the result value for the call. With your patch you force the compiler to 
> load
> the first argument in two registers. One solution would be to CALL_FMT1 as
>
> #define CALL_FMT1 CALL_FMT_0
>
> It still is not optimal though as for CALL_FMT_0 the "+&d" (r2) indicates an
> input but CALL_ARGS_0 does not initialize r2.

Ok, thanks for taking a closer look!

> I am thinking about the following patch to cover all cases:
> --
> From 91a4abbec91a9f26f84f7386f2c0f96de669b0eb Mon Sep 17 00:00:00 2001
> From: Martin Schwidefsky 
> Date: Wed, 10 Apr 2019 15:48:43 +0200
> Subject: [PATCH] s390: fine-tune stack switch helper
>
> The CALL_ON_STACK helper currently does not work with clang and for
> calls without arguments it does not initialize r2 although the contraint
> is "+&d". Rework the CALL_FMT_x and the CALL_ON_STACK macros to work
> with clang and produce optimal code in all cases.
>
> Reported-by: Arnd Bergmann 
> Signed-off-by: Martin Schwidefsky 

I did another build test to confirm that your patch works fine with clang
as well, looks good to me.

  Arnd


Re: [PATCH 05/12] s390: zcrypt: initialize variables before_use

2019-04-10 Thread Arnd Bergmann
On Wed, Apr 10, 2019 at 5:59 PM Martin Schwidefsky
 wrote:
> On Tue, 9 Apr 2019 11:54:30 +0200 Harald Freudenberger  
> wrote:
> > On 08.04.19 23:26, Arnd Bergmann wrote:
> > > }
> > Thanks Arnd, but as Nathan already wrote, I'd prefer to have the
> > variable initialized with 0 instead of -1.
> > If you agree with this, I'll rewrite the patch and apply it to our
> > internal git and it will appear at kernel org with the next s390 code merge 
> > then.
>
> Do we have agreement on func_code=0 for this one?

Yes, I think that was the consensus.

   Arnd


[PATCH V2 0/2] perf: Add Tremont support

2019-04-10 Thread kan . liang
From: Kan Liang 

The patch series intends to add Tremont support for Linux perf.

The patch series is on top of Icelake V5 patch series (with Peter's cleanup 
patch).
https://lkml.org/lkml/2019/4/8/630

PATCH 1: A fix for Icelake V5 patch series (with Peter's cleanup patch).
 It can be merged back into "Subject: perf/x86/intel: Add Icelake 
support"
PATCH 2: Tremont core PMU support.

Changes since V1:
- The previous patch "perf/x86/intel: Support adaptive PEBS for fixed counters"
  will be merged back.
- New patch to fix the checking for instruction event.
- Allow instruction:ppp on generic purpose counter 0

Kan Liang (2):
  perf/x86/intel: Fix the checking for instruction event
  perf/x86/intel: Add Tremont core PMU support

 arch/x86/events/intel/core.c | 96 +++-
 1 file changed, 95 insertions(+), 1 deletion(-)

-- 
2.7.4



[PATCH V2 2/2] perf/x86/intel: Add Tremont core PMU support

2019-04-10 Thread kan . liang
From: Kan Liang 

Add perf core PMU support for Intel Tremont CPU.

The init code is based on Goldmont plus.

The generic purpose counter 0 and fixed counter 0 have less skid.
Force :ppp events on generic purpose counter 0.
Force instruction:ppp on generic purpose counter 0 and fixed counter 0.

Updates LLC cache event table and OFFCORE_RESPONSE mask.

The adaptive PEBS, which is already enabled on ICL, is also supported
on Tremont. No extra codes required.

Signed-off-by: Kan Liang 
---

Changes since v1:
- Allow instruction:ppp on generic purpose counter 0
- Fix the checking for instruction event.

 arch/x86/events/intel/core.c | 91 
 1 file changed, 91 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 34220ab..9f1e000 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1856,6 +1856,45 @@ static __initconst const u64 glp_hw_cache_extra_regs
},
 };
 
+#define TNT_LOCAL_DRAM BIT_ULL(26)
+#define TNT_DEMAND_READGLM_DEMAND_DATA_RD
+#define TNT_DEMAND_WRITE   GLM_DEMAND_RFO
+#define TNT_LLC_ACCESS GLM_ANY_RESPONSE
+#define TNT_SNP_ANY(SNB_SNP_NOT_NEEDED|SNB_SNP_MISS| \
+SNB_NO_FWD|SNB_SNP_FWD|SNB_HITM)
+#define TNT_LLC_MISS   
(TNT_SNP_ANY|SNB_NON_DRAM|TNT_LOCAL_DRAM)
+
+static __initconst const u64 tnt_hw_cache_extra_regs
+   [PERF_COUNT_HW_CACHE_MAX]
+   [PERF_COUNT_HW_CACHE_OP_MAX]
+   [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+   [C(LL)] = {
+   [C(OP_READ)] = {
+   [C(RESULT_ACCESS)]  = TNT_DEMAND_READ|
+ TNT_LLC_ACCESS,
+   [C(RESULT_MISS)]= TNT_DEMAND_READ|
+ TNT_LLC_MISS,
+   },
+   [C(OP_WRITE)] = {
+   [C(RESULT_ACCESS)]  = TNT_DEMAND_WRITE|
+ TNT_LLC_ACCESS,
+   [C(RESULT_MISS)]= TNT_DEMAND_WRITE|
+ TNT_LLC_MISS,
+   },
+   [C(OP_PREFETCH)] = {
+   [C(RESULT_ACCESS)]  = 0x0,
+   [C(RESULT_MISS)]= 0x0,
+   },
+   },
+};
+
+static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
+   /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+   INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xff9fffull, 
RSP_0),
+   INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff9fffull, 
RSP_1),
+   EVENT_EXTRA_END
+};
+
 #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
 #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
 #define KNL_MCDRAM_LOCAL   BIT_ULL(21)
@@ -3403,6 +3442,9 @@ static struct event_constraint counter2_constraint =
 static struct event_constraint fixed_counter0_constraint =
FIXED_EVENT_CONSTRAINT(0x00c0, 0);
 
+static struct event_constraint fixed0_counter0_constraint =
+   INTEL_ALL_EVENT_CONSTRAINT(0, 0x10001ULL);
+
 static struct event_constraint *
 hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
  struct perf_event *event)
@@ -3454,6 +3496,29 @@ glp_get_event_constraints(struct cpu_hw_events *cpuc, 
int idx,
return c;
 }
 
+static struct event_constraint *
+tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+   struct event_constraint *c;
+
+   /*
+* :ppp means to do reduced skid PEBS,
+* which is available on PMC0 and fixed counter 0.
+*/
+   if (event->attr.precise_ip == 3) {
+   /* Force instruction:ppp on PMC0 and Fixed counter 0 */
+   if (EVENT_CONFIG(event->hw.config) == X86_CONFIG(.event=0xc0))
+   return &fixed0_counter0_constraint;
+
+   return &counter0_constraint;
+   }
+
+   c = intel_get_event_constraints(cpuc, idx, event);
+
+   return c;
+}
+
 static bool allow_tsx_force_abort = true;
 
 static struct event_constraint *
@@ -4533,6 +4598,32 @@ __init int intel_pmu_init(void)
name = "goldmont_plus";
break;
 
+   case INTEL_FAM6_ATOM_TREMONT_X:
+   x86_pmu.late_ack = true;
+   memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
+  sizeof(hw_cache_event_ids));
+   memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
+  sizeof(hw_cache_extra_regs));
+   hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+
+   intel_pmu_lbr_init_skl();

[PATCH V2 1/2] perf/x86/intel: Fix the checking for instruction event

2019-04-10 Thread kan . liang
From: Kan Liang 

Some bits must be masked before checking X86_CONFIG(.event=0xc0), e.g.
ARCH_PERFMON_EVENTSEL_INT, ARCH_PERFMON_EVENTSEL_USR and
ARCH_PERFMON_EVENTSEL_OS. Those bits will be set in hw_config().
Otherwise the condition will never be met.

Other fields, e.g the INV, ANY, E, or CMASK fields are not allowed for
the reduced Skid PEBS.

Signed-off-by: Kan Liang 
---

New patch to fix a bug on top of Icelake V5 patch series
(with Peter's cleanup patch).

The patch may be merged back into:

  Subject: perf/x86/intel: Add Icelake support

 arch/x86/events/intel/core.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f34d92b..34220ab 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3421,6 +3421,9 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, int 
idx,
return c;
 }
 
+#define EVENT_CONFIG(config)   \
+   (config & (X86_ALL_EVENT_FLAGS | INTEL_ARCH_EVENT_MASK))
+
 static struct event_constraint *
 icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
  struct perf_event *event)
@@ -3430,7 +3433,7 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int 
idx,
 * Force instruction:ppp in Fixed counter 0
 */
if ((event->attr.precise_ip == 3) &&
-   (event->hw.config == X86_CONFIG(.event=0xc0)))
+   (EVENT_CONFIG(event->hw.config) == X86_CONFIG(.event=0xc0)))
return &fixed_counter0_constraint;
 
return hsw_get_event_constraints(cpuc, idx, event);
-- 
2.7.4



Re: [PATCH 02/12] s390: don't build vdso32 with clang

2019-04-10 Thread Arnd Bergmann
On Wed, Apr 10, 2019 at 6:26 PM 'Nick Desaulniers' via Clang Built
Linux  wrote:
>
> On Mon, Apr 8, 2019 at 2:27 PM Arnd Bergmann  wrote:
> >
> > clang does not support 31 bit object files on s390, so skip
> > the 32-bit vdso here, and only build it when using gcc to compile
> > the kernel.
>
> What's the build failure?  Would you mind filing a bug against LLVM's
> issue tracker for it, please?

As far as I can tell, llvm only supports 64-bit output for s390, so this
is not a bug but rather a missing feature that seems highly unlikely to
ever get added.

32-bit (31-bit) mode on s390 is only used for very old existing binaries,
and the vdso support is optional there.

  Arnd


Re: [RESEND PATCHv3 0/3] Update Stratix10 EDAC Bindings

2019-04-10 Thread Borislav Petkov
On Tue, Apr 09, 2019 at 05:32:07PM -0500, Thor Thayer wrote:
> I have ACKs on patches 1 & 3. Patch 2 has a Reviewed-by from Rob Herring
> which was sufficient in the past.

Sorry about that - I missed those because I looked only at the diffstat
and decided this series is not for me :)

Anyway, all three applied now.

Thx.

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.


Re: [PATCH 10/12] s390: avoid __builtin_return_address(n) on clang

2019-04-10 Thread Arnd Bergmann
On Wed, Apr 10, 2019 at 6:14 PM Steven Rostedt  wrote:
> On Wed, 10 Apr 2019 18:03:57 +0200 Martin Schwidefsky 
>  wrote:
>
> > > --- a/arch/s390/include/asm/ftrace.h
> > > +++ b/arch/s390/include/asm/ftrace.h
> > > @@ -13,7 +13,12 @@
> > >
> > >  #ifndef __ASSEMBLY__
> > >
> > > +#ifdef CONFIG_CC_IS_CLANG
> > > +/* https://bugs.llvm.org/show_bug.cgi?id=41424 */
> > > +#define ftrace_return_address(n) __builtin_return_address(0)
> > > +#else
> > >  #define ftrace_return_address(n) __builtin_return_address(n)
> > > +#endif
> > >
> > >  void _mcount(void);
> > >  void ftrace_caller(void);
> >
> > I can't say I like this one. If the compiler can not do 
> > __builtin_return_address(n)
> > it feels wrong to just use __builtin_return_address(0).
>
> I agree. The proper return value is 0UL, see include/linux/ftrace.h
>
> /* Archs may use other ways for ADDR1 and beyond */
> #ifndef ftrace_return_address
> # ifdef CONFIG_FRAME_POINTER
> #  define ftrace_return_address(n) __builtin_return_address(n)
> # else
> #  define ftrace_return_address(n) 0UL
> # endif
> #endif
>
> This is why we treat zero differently:
>
> #define CALLER_ADDR0 ((unsigned long)ftrace_return_address0)
> #define CALLER_ADDR1 ((unsigned long)ftrace_return_address(1))
> #define CALLER_ADDR2 ((unsigned long)ftrace_return_address(2))
> #define CALLER_ADDR3 ((unsigned long)ftrace_return_address(3))
> #define CALLER_ADDR4 ((unsigned long)ftrace_return_address(4))
> #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
> #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))

Right, got it.

Martin, do you want me to send a replacement patch, or can you
commit the patch with

#ifdef CONFIG_CC_IS_CLANG
/* https://bugs.llvm.org/show_bug.cgi?id=41424 */
#define ftrace_return_address(n) 0UL
#else
#define ftrace_return_address(n) __builtin_return_address(n)
#endif

instead?

Arnd


[PATCH v2 2/3] module: Make srcu_struct ptr array as read-only post init

2019-04-10 Thread Joel Fernandes (Google)
Since commit title ("srcu: Allocate per-CPU data for DEFINE_SRCU() in
modules"), modules that call DEFINE_{STATIC,}SRCU will have a new array
of srcu_struct pointers which is used by srcu code to initialize and
clean up these structures.

There is no reason for this array of pointers to be writable, and it can
cause security or other hidden bugs. Mark these as read-only after the
module init has completed.

Suggested-by: paul...@linux.vnet.ibm.com
Suggested-by: keesc...@chromium.org
Acked-by: keesc...@chromium.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/module.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 1acddb93282a..8b9631e789f0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3305,7 +3305,7 @@ core_param(module_blacklist, module_blacklist, charp, 
0400);
  * layout_sections() can put it in the right place.
  * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
  */
-static char *ro_after_init_sections[] = {
+static const char * const ro_after_init_sections[] = {
".data..ro_after_init",
 
/*
@@ -3314,6 +3314,12 @@ static char *ro_after_init_sections[] = {
 * annotated as such at module load time.
 */
"__jump_table",
+
+   /*
+* Used for SRCU structures which need to be initialized/cleaned up
+* by the SRCU notifiers
+*/
+   "___srcu_struct_ptrs",
 };
 
 static struct module *layout_and_allocate(struct load_info *info, int flags)
@@ -3336,7 +3342,7 @@ static struct module *layout_and_allocate(struct 
load_info *info, int flags)
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
 
/* Set sh_flags for read-only after init sections */
-   for (i = 0; ro_after_init_sections[i]; i++) {
+   for (i = 0; i < ARRAY_SIZE(ro_after_init_sections); i++) {
ndx = find_sec(info, ro_after_init_sections[i]);
if (ndx)
info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v2 1/3] module: Prepare for addition of new ro_after_init sections

2019-04-10 Thread Joel Fernandes (Google)
For the purposes of hardening modules by adding sections to
ro_after_init sections, prepare for addition of new ro_after_init
entries which we do in future patches. Create a table to which new
entries could be added later. This makes it less error prone and reduces
code duplication.

Cc: paul...@linux.vnet.ibm.com
Cc: rost...@goodmis.org
Cc: mathieu.desnoy...@efficios.com
Cc: r...@vger.kernel.org
Cc: kernel-harden...@lists.openwall.com
Cc: kernel-t...@android.com
Suggested-by: keesc...@chromium.org
Reviewed-by: keesc...@chromium.org
Acked-by: rost...@goodmis.org
Signed-off-by: Joel Fernandes (Google) 

---
 kernel/module.c | 41 +++--
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 524da609c884..1acddb93282a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3300,11 +3300,27 @@ static bool blacklisted(const char *module_name)
 }
 core_param(module_blacklist, module_blacklist, charp, 0400);
 
+/*
+ * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ * layout_sections() can put it in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+static char *ro_after_init_sections[] = {
+   ".data..ro_after_init",
+
+   /*
+* __jump_table structures are never modified, with the exception of
+* entries that refer to code in the __init section, which are
+* annotated as such at module load time.
+*/
+   "__jump_table",
+};
+
 static struct module *layout_and_allocate(struct load_info *info, int flags)
 {
struct module *mod;
unsigned int ndx;
-   int err;
+   int err, i;
 
err = check_modinfo(info->mod, info, flags);
if (err)
@@ -3319,23 +3335,12 @@ static struct module *layout_and_allocate(struct 
load_info *info, int flags)
/* We will do a special allocation for per-cpu sections later. */
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
 
-   /*
-* Mark ro_after_init section with SHF_RO_AFTER_INIT so that
-* layout_sections() can put it in the right place.
-* Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
-*/
-   ndx = find_sec(info, ".data..ro_after_init");
-   if (ndx)
-   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-   /*
-* Mark the __jump_table section as ro_after_init as well: these data
-* structures are never modified, with the exception of entries that
-* refer to code in the __init section, which are annotated as such
-* at module load time.
-*/
-   ndx = find_sec(info, "__jump_table");
-   if (ndx)
-   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+   /* Set sh_flags for read-only after init sections */
+   for (i = 0; ro_after_init_sections[i]; i++) {
+   ndx = find_sec(info, ro_after_init_sections[i]);
+   if (ndx)
+   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+   }
 
/* Determine total sizes, and put offsets in sh_entsize.  For now
   this is done generically; there doesn't appear to be any
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v2 3/3] module: Make __tracepoints_ptrs as read-only

2019-04-10 Thread Joel Fernandes (Google)
This series hardens the tracepoints in modules by making the array of
pointers referring to the tracepoints as read-only. This array is needed
during module unloading to verify that the tracepoint is quiescent.
There is no reason for the array to be writable after init, and it can
cause security or other hidden bugs. Mark these as ro_after_init.

Suggested-by: paul...@linux.vnet.ibm.com
Suggested-by: keesc...@chromium.org
Suggested-by: mathieu.desnoy...@efficios.com
Cc: rost...@goodmis.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/module.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index 8b9631e789f0..be980aaa8804 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3320,6 +3320,12 @@ static const char * const ro_after_init_sections[] = {
 * by the SRCU notifiers
 */
"___srcu_struct_ptrs",
+
+   /*
+* Array of tracepoint pointers used for checking if tracepoints are
+* quiescent during unloading.
+*/
+   "__tracepoints_ptrs",
 };
 
 static struct module *layout_and_allocate(struct load_info *info, int flags)
-- 
2.21.0.392.gf8f6787159e-goog



Re: [PATCH v2 1/3] module: Prepare for addition of new ro_after_init sections

2019-04-10 Thread Joel Fernandes
On Wed, Apr 10, 2019 at 03:08:21PM -0400, Joel Fernandes (Google) wrote:
> For the purposes of hardening modules by adding sections to
> ro_after_init sections, prepare for addition of new ro_after_init
> entries which we do in future patches. Create a table to which new
> entries could be added later. This makes it less error prone and reduce
> code duplication.
> 
> Cc: paul...@linux.vnet.ibm.com
> Cc: rost...@goodmis.org
> Cc: mathieu.desnoy...@efficios.com
> Cc: r...@vger.kernel.org
> Cc: kernel-harden...@lists.openwall.com
> Cc: kernel-t...@android.com
> Suggested-by: keesc...@chromium.org
> Reviewed-by: keesc...@chromium.org
> Acked-by: rost...@goodmis.org
> Signed-off-by: Joel Fernandes (Google) 
> 
> ---
>  kernel/module.c | 41 +++--
>  1 file changed, 23 insertions(+), 18 deletions(-)
> 
> diff --git a/kernel/module.c b/kernel/module.c
> index 524da609c884..1acddb93282a 100644
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -3300,11 +3300,27 @@ static bool blacklisted(const char *module_name)
>  }
>  core_param(module_blacklist, module_blacklist, charp, 0400);
>  
> +/*
> + * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
> + * layout_sections() can put it in the right place.
> + * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
> + */
> +static char *ro_after_init_sections[] = {
> + ".data..ro_after_init",
> +
> + /*
> +  * __jump_table structures are never modified, with the exception of
> +  * entries that refer to code in the __init section, which are
> +  * annotated as such at module load time.
> +  */
> + "__jump_table",
> +};
> +
>  static struct module *layout_and_allocate(struct load_info *info, int flags)
>  {
>   struct module *mod;
>   unsigned int ndx;
> - int err;
> + int err, i;
>  
>   err = check_modinfo(info->mod, info, flags);
>   if (err)
> @@ -3319,23 +3335,12 @@ static struct module *layout_and_allocate(struct 
> load_info *info, int flags)
>   /* We will do a special allocation for per-cpu sections later. */
>   info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
>  
> - /*
> -  * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
> -  * layout_sections() can put it in the right place.
> -  * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
> -  */
> - ndx = find_sec(info, ".data..ro_after_init");
> - if (ndx)
> - info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
> - /*
> -  * Mark the __jump_table section as ro_after_init as well: these data
> -  * structures are never modified, with the exception of entries that
> -  * refer to code in the __init section, which are annotated as such
> -  * at module load time.
> -  */
> - ndx = find_sec(info, "__jump_table");
> - if (ndx)
> - info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
> + /* Set sh_flags for read-only after init sections */
> + for (i = 0; ro_after_init_sections[i]; i++) {

Seems the fixup for this based on Kees suggestion of using NULL got squashed
into 2/3, so allow me to send a v3 to fix this ;-) Sorry! I am doing that
now.

The patches applied together are still code-correct though.

thanks,

 - Joel



Trade Inquiry 10/04/2019

2019-04-10 Thread Daniel Murray
Hi,friend,

This is Daniel Murray and i am from Sinara Group Co.,LTD in Russia.
We are glad to know about your company from the web and we are interested in 
your products.
Could you kindly send us your Latest catalog and price list for our trial order.

Thanks and Best Regards,

Daniel Murray
Purchasing Manager
Sinara Group Co.,LTD




Re: [PATCH 2/2] kernel: use sysctl shared variables for range check

2019-04-10 Thread Matteo Croce
On Wed, Apr 10, 2019 at 8:46 PM Kees Cook  wrote:
>
> On Mon, Apr 8, 2019 at 3:09 PM Matteo Croce  wrote:
> >
> > Use the shared variables for range check, instead of declaring a local one
> > in every source file.
>
> I was expecting this to be a tree-wide change for all the cases found
> by patch 1's "git grep".
>

Hi Kees,

I have already the whole patch ready, but I was frightened by the
output of get_maintainer.pl, so I decided to split the patch into
small pieces and send the first one.
Patches for /proc/sys/net and drivers/ are pretty big, and can be
merged after the 1/2 inclusion.

> Slight change to the grep for higher accuracy:
>
> $ git grep -E '\.extra[12].*&(zero|one|int_max)\b' |wc -l
> 245
>

Right, my regexp also wrongly matches one_hundred, one_jiffy, etc.
Anyway, I did the changes by hand, so apart from the commit message, the
content should be safe.

> Only 31 sources:
> $ git grep -E '\.extra[12].*&(zero|one|int_max)\b' | cut -d: -f1 |
> sort -u > /tmp/list.txt
> $ wc -l /tmp/list.txt
> 31
>
> One thing I wonder about is if any of these cases depend on the extra
> variable being non-const (many of these are just "static int").
>
> $ egrep -H '\b(zero|one|int_max)\b.*=' $(cat /tmp/list.txt) | grep -v static
>
> Looks like none, so it'd be safe. How about doing this tree-wide for
> all 31 cases? (Coccinelle might be able to help.)
>

It could be true for other sysctl values like
xpc_disengage_max_timelimit or fscache_op_wq, but it's very unlikely
that someone writes, for example, 5 into a variable named "zero". If
they do, it is most likely a bug, so const is our friend.

Regards,
-- 
Matteo Croce
per aspera ad upstream


[PATCH v2 12/21] docs: hwmon: asc7621: convert to ReST format

2019-04-10 Thread Mauro Carvalho Chehab
Convert asc7621 to ReST format, in order to allow it to
be parsed by Sphinx.

Signed-off-by: Mauro Carvalho Chehab 
---
 Documentation/hwmon/asc7621 | 146 ++--
 1 file changed, 88 insertions(+), 58 deletions(-)

diff --git a/Documentation/hwmon/asc7621 b/Documentation/hwmon/asc7621
index 7287be7e1f21..b5a9fad0f172 100644
--- a/Documentation/hwmon/asc7621
+++ b/Documentation/hwmon/asc7621
@@ -1,10 +1,15 @@
+=
 Kernel driver asc7621
-==
+=
 
 Supported chips:
+
 Andigilog aSC7621 and aSC7621a
+
 Prefix: 'asc7621'
+
 Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+
 Datasheet: http://www.fairview5.com/linux/asc7621/asc7621.pdf
 
 Author:
@@ -73,8 +78,10 @@ Finally, we have added a tach disable function that turns 
off the tach
 measurement system for individual tachs in order to save power. That is
 in register 75h.
 
---
+--
+
 aSC7621 Product Description
+===
 
 The aSC7621 has a two wire digital interface compatible with SMBus 2.0.
 Using a 10-bit ADC, the aSC7621 measures the temperature of two remote diode
@@ -102,6 +109,8 @@ System voltages of VCCP, 2.5V, 3.3V, 5.0V, and 12V 
motherboard power are
 monitored efficiently with internal scaling resistors.
 
 Features
+
+
 - Supports PECI interface and monitors internal and remote thermal diodes
 - 2-wire, SMBus 2.0 compliant, serial interface
 - 10-bit ADC
@@ -110,7 +119,7 @@ Features
 - Noise filtering of temperature reading for fan speed control
 - 0.25C digital temperature sensor resolution
 - 3 PWM fan speed control outputs for 2-, 3- or 4-wire fans and up to 4 fan
-   tachometer inputs
+  tachometer inputs
 - Enhanced measured temperature to Temperature Zone assignment.
 - Provides high and low PWM frequency ranges
 - 3 GPIO pins for custom use
@@ -123,17 +132,20 @@ Except where noted below, the sysfs entries created by 
this driver follow
 the standards defined in "sysfs-interface".
 
 temp1_source
+   =   ===
0   (default) peci_legacy = 0, Remote 1 Temperature
-   peci_legacy = 1, PECI Processor Temperature 0
+   peci_legacy = 1, PECI Processor Temperature 0
1   Remote 1 Temperature
2   Remote 2 Temperature
3   Internal Temperature
4   PECI Processor Temperature 0
5   PECI Processor Temperature 1
6   PECI Processor Temperature 2
-   7  PECI Processor Temperature 3
+   7   PECI Processor Temperature 3
+   =   ===
 
 temp2_source
+   =   ===
0   (default) Internal Temperature
1   Remote 1 Temperature
2   Remote 2 Temperature
@@ -142,8 +154,10 @@ temp2_source
5   PECI Processor Temperature 1
6   PECI Processor Temperature 2
7   PECI Processor Temperature 3
+   =   ===
 
 temp3_source
+   =   ===
0   (default) Remote 2 Temperature
1   Remote 1 Temperature
2   Remote 2 Temperature
@@ -152,10 +166,12 @@ temp3_source
5   PECI Processor Temperature 1
6   PECI Processor Temperature 2
7   PECI Processor Temperature 3
+   =   ===
 
 temp4_source
+   =   ===
0   (default) peci_legacy = 0, PECI Processor Temperature 0
-   peci_legacy = 1, Remote 1 Temperature
+   peci_legacy = 1, Remote 1 Temperature
1   Remote 1 Temperature
2   Remote 2 Temperature
3   Internal Temperature
@@ -163,58 +179,65 @@ temp4_source
5   PECI Processor Temperature 1
6   PECI Processor Temperature 2
7   PECI Processor Temperature 3
+   =   ===
 
-temp[1-4]_smoothing_enable
-temp[1-4]_smoothing_time
+temp[1-4]_smoothing_enable / temp[1-4]_smoothing_time
Smooths spikes in temp readings caused by noise.
Valid values in milliseconds are:
-   35000
-   17600
-   11800
-7000
-4400
-3000
-1600
- 800
+
+   * 35000
+   * 17600
+   * 11800
+   *  7000
+   *  4400
+   *  3000
+   *  1600
+   *   800
 
 temp[1-4]_crit
When the corresponding zone temperature reaches this value,
ALL pwm outputs will got to 100%.
 
-temp[5-8]_input
-temp[5-8]_enable
+temp[5-8]_input / temp[5-8]_enable
The aSC7621 can also read temperatures provided by the processor
via the PECI bus.  Usually these are

[PATCH v2 03/21] docs: hwmon: menf21bmc: convert to ReST format

2019-04-10 Thread Mauro Carvalho Chehab
Convert menf21bmc to ReST format, in order to allow it to
be parsed by Sphinx.

Signed-off-by: Mauro Carvalho Chehab 
---
 Documentation/hwmon/menf21bmc | 5 +
 1 file changed, 5 insertions(+)

diff --git a/Documentation/hwmon/menf21bmc b/Documentation/hwmon/menf21bmc
index 2a273a065c5e..1f0c6b2235ab 100644
--- a/Documentation/hwmon/menf21bmc
+++ b/Documentation/hwmon/menf21bmc
@@ -2,8 +2,11 @@ Kernel driver menf21bmc_hwmon
 =
 
 Supported chips:
+
* MEN 14F021P00
+
  Prefix: 'menf21bmc_hwmon'
+
  Adresses scanned: -
 
 Author: Andreas Werner 
@@ -34,6 +37,7 @@ Sysfs entries
 The following attributes are supported. All attributes are read only
 The Limits are read once by the driver.
 
+=== ==
 in0_input  +3.3V input voltage
 in1_input  +5.0V input voltage
 in2_input  +12.0V input voltage
@@ -48,3 +52,4 @@ in1_label "MON_5V"
 in2_label  "MON_12V"
 in3_label  "5V_STANDBY"
 in4_label  "VBAT"
+=== ==
-- 
2.20.1



[PATCH v2 14/21] docs: hwmon: dme1737, vt1211: convert to ReST format

2019-04-10 Thread Mauro Carvalho Chehab
Convert dme1737 and vt1211 to ReST format, in order to allow
them to be parsed by Sphinx.

Signed-off-by: Mauro Carvalho Chehab 
---
 Documentation/hwmon/dme1737 | 88 ++---
 Documentation/hwmon/vt1211  | 84 +--
 2 files changed, 114 insertions(+), 58 deletions(-)

diff --git a/Documentation/hwmon/dme1737 b/Documentation/hwmon/dme1737
index 4d2935145a1c..82fcbc6b2b43 100644
--- a/Documentation/hwmon/dme1737
+++ b/Documentation/hwmon/dme1737
@@ -2,21 +2,37 @@ Kernel driver dme1737
 =
 
 Supported chips:
+
   * SMSC DME1737 and compatibles (like Asus A8000)
+
 Prefix: 'dme1737'
+
 Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+
 Datasheet: Provided by SMSC upon request and under NDA
+
   * SMSC SCH3112, SCH3114, SCH3116
+
 Prefix: 'sch311x'
+
 Addresses scanned: none, address read from Super-I/O config space
+
 Datasheet: Available on the Internet
+
   * SMSC SCH5027
+
 Prefix: 'sch5027'
+
 Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+
 Datasheet: Provided by SMSC upon request and under NDA
+
   * SMSC SCH5127
+
 Prefix: 'sch5127'
+
 Addresses scanned: none, address read from Super-I/O config space
+
 Datasheet: Provided by SMSC upon request and under NDA
 
 Authors:
@@ -26,11 +42,14 @@ Authors:
 Module Parameters
 -
 
-* force_start: boolEnables the monitoring of voltage, fan and temp inputs
+* force_start: bool
+   Enables the monitoring of voltage, fan and temp inputs
and PWM output control functions. Using this parameter
shouldn't be required since the BIOS usually takes care
of this.
-* probe_all_addr: bool Include non-standard LPC addresses 0x162e and 0x164e
+
+* probe_all_addr: bool
+   Include non-standard LPC addresses 0x162e and 0x164e
when probing for ISA devices. This is required for the
following boards:
- VIA EPIA SN18000
@@ -70,7 +89,8 @@ scaling resistors. The values returned by the driver 
therefore reflect true
 millivolts and don't need scaling. The voltage inputs are mapped as follows
 (the last column indicates the input ranges):
 
-DME1737, A8000:
+DME1737, A8000::
+
in0: +5VTR  (+5V standby)   0V - 6.64V
in1: Vccp   (processor core)0V - 3V
in2: VCC(internal +3.3V)0V - 4.38V
@@ -79,7 +99,8 @@ DME1737, A8000:
in5: VTR(+3.3V standby) 0V - 4.38V
in6: Vbat   (+3.0V) 0V - 4.38V
 
-SCH311x:
+SCH311x::
+
in0: +2.5V  0V - 3.32V
in1: Vccp   (processor core)0V - 2V
in2: VCC(internal +3.3V)0V - 4.38V
@@ -88,7 +109,8 @@ SCH311x:
in5: VTR(+3.3V standby) 0V - 4.38V
in6: Vbat   (+3.0V) 0V - 4.38V
 
-SCH5027:
+SCH5027::
+
in0: +5VTR  (+5V standby)   0V - 6.64V
in1: Vccp   (processor core)0V - 3V
in2: VCC(internal +3.3V)0V - 4.38V
@@ -97,7 +119,8 @@ SCH5027:
in5: VTR(+3.3V standby) 0V - 4.38V
in6: Vbat   (+3.0V) 0V - 4.38V
 
-SCH5127:
+SCH5127::
+
in0: +2.5   0V - 3.32V
in1: Vccp   (processor core)0V - 3V
in2: VCC(internal +3.3V)0V - 4.38V
@@ -119,7 +142,7 @@ Celsius. The chip also features offsets for all 3 
temperature inputs which -
 when programmed - get added to the input readings. The chip does all the
 scaling by itself and the driver therefore reports true temperatures that don't
 need any user-space adjustments. The temperature inputs are mapped as follows
-(the last column indicates the input ranges):
+(the last column indicates the input ranges)::
 
temp1: Remote diode 1 (3904 type) temperature   -127C - +127C
temp2: DME1737 internal temperature -127C - +127C
@@ -171,6 +194,7 @@ pwm[1-3]_auto_pwm_min, respectively. The thermal thresholds 
of the zones are
 programmed via zone[1-3]_auto_point[1-3]_temp and
 zone[1-3]_auto_point1_temp_hyst:
 
+   === ===
pwm[1-3]_auto_point2_pwmfull-speed duty-cycle (255, i.e., 100%)
pwm[1-3]_auto_point1_pwmlow-speed duty-cycle
pwm[1-3]_auto_pwm_min   min-speed duty-cycle
@@ -179,6 +203,7 @@ zone[1-3]_auto_point1_temp_hyst:
zone[1-3]_auto_point2_temp  full-speed temp
zone[1-3]_auto_point1_temp  low-speed temp
zone[1-3]_auto_point1_temp_hyst min-speed temp
+   === ===
 
 The chip adjusts the output duty-cycle linearly in the range of auto_point1_pwm
 to auto_point2_pwm

[PATCH v2 11/21] docs: hwmon: ibmpowernv: convert to ReST format

2019-04-10 Thread Mauro Carvalho Chehab
Convert ibmpowernv to ReST format, in order to allow it to
be parsed by Sphinx.

Signed-off-by: Mauro Carvalho Chehab 
---
 Documentation/hwmon/ibmpowernv | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/hwmon/ibmpowernv b/Documentation/hwmon/ibmpowernv
index 56468258711f..3f1feae3901c 100644
--- a/Documentation/hwmon/ibmpowernv
+++ b/Documentation/hwmon/ibmpowernv
@@ -2,6 +2,7 @@ Kernel Driver IBMPOWERNV
 
 
 Supported systems:
+
   * Any recent IBM P servers based on POWERNV platform
 
 Author: Neelesh Gupta
@@ -29,6 +30,7 @@ CONFIG_SENSORS_IBMPOWERNV. It can also be built as module 
'ibmpowernv'.
 Sysfs attributes
 
 
+=== ===
 fanX_input Measured RPM value.
 fanX_min   Threshold RPM for alert generation.
 fanX_fault 0: No fail condition
@@ -78,3 +80,4 @@ currX_enable  Enable/disable all current sensors 
belonging to the
0: Disable
 
 energyX_input  Cumulative energy (microJoule)
+=== ===
-- 
2.20.1



[PATCH v2 08/21] docs: hwmon: w83791d: convert to ReST format

2019-04-10 Thread Mauro Carvalho Chehab
Convert w83791d to ReST format, in order to allow it to
be parsed by Sphinx.

Signed-off-by: Mauro Carvalho Chehab 
---
 Documentation/hwmon/w83791d | 123 +---
 1 file changed, 71 insertions(+), 52 deletions(-)

diff --git a/Documentation/hwmon/w83791d b/Documentation/hwmon/w83791d
index f4021a285460..a91f9e5fb0c6 100644
--- a/Documentation/hwmon/w83791d
+++ b/Documentation/hwmon/w83791d
@@ -2,9 +2,13 @@ Kernel driver w83791d
 =
 
 Supported chips:
+
   * Winbond W83791D
+
 Prefix: 'w83791d'
+
 Addresses scanned: I2C 0x2c - 0x2f
+
 Datasheet: 
http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/W83791D_W83791Gb.pdf
 
 Author: Charles Spirakis 
@@ -12,39 +16,46 @@ Author: Charles Spirakis 
 This driver was derived from the w83781d.c and w83792d.c source files.
 
 Credits:
+
   w83781d.c:
-Frodo Looijaard ,
-Philip Edelbrock ,
-and Mark Studebaker 
+
+- Frodo Looijaard ,
+- Philip Edelbrock ,
+- Mark Studebaker 
+
   w83792d.c:
-Shane Huang (Winbond),
-Rudolf Marek 
+
+- Shane Huang (Winbond),
+- Rudolf Marek 
 
 Additional contributors:
-Sven Anders 
-Marc Hulsman 
+
+- Sven Anders 
+- Marc Hulsman 
 
 Module Parameters
 -
 
 * init boolean
-  (default 0)
-  Use 'init=1' to have the driver do extra software initializations.
-  The default behavior is to do the minimum initialization possible
-  and depend on the BIOS to properly setup the chip. If you know you
-  have a w83791d and you're having problems, try init=1 before trying
-  reset=1.
+(default 0)
+
+Use 'init=1' to have the driver do extra software initializations.
+The default behavior is to do the minimum initialization possible
+and depend on the BIOS to properly setup the chip. If you know you
+have a w83791d and you're having problems, try init=1 before trying
+reset=1.
 
 * reset boolean
-  (default 0)
-  Use 'reset=1' to reset the chip (via index 0x40, bit 7). The default
-  behavior is no chip reset to preserve BIOS settings.
+(default 0)
+
+Use 'reset=1' to reset the chip (via index 0x40, bit 7). The default
+behavior is no chip reset to preserve BIOS settings.
 
 * force_subclients=bus,caddr,saddr,saddr
-  This is used to force the i2c addresses for subclients of
-  a certain chip. Example usage is `force_subclients=0,0x2f,0x4a,0x4b'
-  to force the subclients of chip 0x2f on bus 0 to i2c addresses
-  0x4a and 0x4b.
+This is used to force the i2c addresses for subclients of
+a certain chip. Example usage is `force_subclients=0,0x2f,0x4a,0x4b`
+to force the subclients of chip 0x2f on bus 0 to i2c addresses
+0x4a and 0x4b.
 
 
 Description
@@ -91,11 +102,11 @@ This file is used for both legacy and new code.
 
 The sysfs interface to the beep bitmask has migrated from the original legacy
 method of a single sysfs beep_mask file to a newer method using multiple
-*_beep files as described in .../Documentation/hwmon/sysfs-interface.
+`*_beep` files as described in `Documentation/hwmon/sysfs-interface`.
 
 A similar change has occurred for the bitmap corresponding to the alarms. The
 original legacy method used a single sysfs alarms file containing a bitmap
-of triggered alarms. The newer method uses multiple sysfs *_alarm files
+of triggered alarms. The newer method uses multiple sysfs `*_alarm` files
 (again following the pattern described in sysfs-interface).
 
 Since both methods read and write the underlying hardware, they can be used
@@ -116,46 +127,54 @@ User mode code requesting values more often will receive 
cached values.
 The sysfs-interface is documented in the 'sysfs-interface' file. Only
 chip-specific options are documented here.
 
-pwm[1-3]_enable -  this file controls mode of fan/temperature control for
+=== ===
+pwm[1-3]_enablethis file controls mode of fan/temperature 
control for
fan 1-3. Fan/PWM 4-5 only support manual mode.
-   * 1 Manual mode
-   * 2 Thermal Cruise mode
-   * 3 Fan Speed Cruise mode (no further support)
 
-temp[1-3]_target - defines the target temperature for Thermal Cruise mode.
+   * 1 Manual mode
+   * 2 Thermal Cruise mode
+   * 3 Fan Speed Cruise mode (no further support)
+
+temp[1-3]_target   defines the target temperature for Thermal Cruise mode.
Unit: millidegree Celsius
RW
 
-temp[1-3]_tolerance -  temperature tolerance for Thermal Cruise mode.
+temp[1-3]_tolerancetemperature tolerance for Thermal Cruise mode.
Specifies an interval around the target temperature
in which the fan speed is not changed.
Unit: millidegree 

[PATCH] cifs: fix page reference leak with readv/writev

2019-04-10 Thread jglisse
From: Jérôme Glisse 

CIFS can leak page references gotten through GUP (get_user_pages*()
through iov_iter_get_pages()). This happens if cifs_send_async_read()
or cifs_write_from_iter() calls fail from within __cifs_readv() and
__cifs_writev() respectively. This patch moves page unreferencing to
cifs_aio_ctx_release(), which happens on all code paths, so this is
all simpler to follow for correctness.

Signed-off-by: Jérôme Glisse 
Cc: Steve French 
Cc: linux-c...@vger.kernel.org
Cc: samba-techni...@lists.samba.org
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Linus Torvalds 
Cc: Stable 
---
 fs/cifs/file.c | 15 +--
 fs/cifs/misc.c | 23 ++-
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 89006e044973..a756a4d3f70f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2858,7 +2858,6 @@ static void collect_uncached_write_data(struct 
cifs_aio_ctx *ctx)
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
struct dentry *dentry = ctx->cfile->dentry;
-   unsigned int i;
int rc;
 
tcon = tlink_tcon(ctx->cfile->tlink);
@@ -2922,10 +2921,6 @@ static void collect_uncached_write_data(struct 
cifs_aio_ctx *ctx)
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
 
-   if (!ctx->direct_io)
-   for (i = 0; i < ctx->npages; i++)
-   put_page(ctx->bv[i].bv_page);
-
cifs_stats_bytes_written(tcon, ctx->total_len);
set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
 
@@ -3563,7 +3558,6 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx)
struct iov_iter *to = &ctx->iter;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
-   unsigned int i;
int rc;
 
tcon = tlink_tcon(ctx->cfile->tlink);
@@ -3647,15 +3641,8 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx)
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
 
-   if (!ctx->direct_io) {
-   for (i = 0; i < ctx->npages; i++) {
-   if (ctx->should_dirty)
-   set_page_dirty(ctx->bv[i].bv_page);
-   put_page(ctx->bv[i].bv_page);
-   }
-
+   if (!ctx->direct_io)
ctx->total_len = ctx->len - iov_iter_count(to);
-   }
 
/* mask nodata case */
if (rc == -ENODATA)
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index bee203055b30..9bc0d17a9d77 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -768,6 +768,11 @@ cifs_aio_ctx_alloc(void)
 {
struct cifs_aio_ctx *ctx;
 
+   /*
+* Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io
+* to false so that we know when we have to unreference pages within
+* cifs_aio_ctx_release()
+*/
ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL);
if (!ctx)
return NULL;
@@ -786,7 +791,23 @@ cifs_aio_ctx_release(struct kref *refcount)
struct cifs_aio_ctx, refcount);
 
cifsFileInfo_put(ctx->cfile);
-   kvfree(ctx->bv);
+
+   /*
+* ctx->bv is only set if setup_aio_ctx_iter() was call successfuly
+* which means that iov_iter_get_pages() was a success and thus that
+* we have taken reference on pages.
+*/
+   if (ctx->bv) {
+   unsigned i;
+
+   for (i = 0; i < ctx->npages; i++) {
+   if (ctx->should_dirty)
+   set_page_dirty(ctx->bv[i].bv_page);
+   put_page(ctx->bv[i].bv_page);
+   }
+   kvfree(ctx->bv);
+   }
+
kfree(ctx);
 }
 
-- 
2.20.1



Re: [PATCH v3 3/7] clk: Add of_clk_hw_register() API for early clk drivers

2019-04-10 Thread Jeffrey Hugo

On 4/10/2019 10:53 AM, Stephen Boyd wrote:

Quoting Jeffrey Hugo (2019-04-08 14:46:11)

On 4/4/2019 3:53 PM, Stephen Boyd wrote:

In some circumstances drivers register clks early and don't have access
to a struct device because the device model isn't initialized yet. Add
an API to let drivers register clks associated with a struct device_node
so that these drivers can participate in getting parent clks through DT.


NACK.  This patch broke boot for me.  I had to pull the below from JTAG.
   What do you need to debug this?



Here's a patch to try to squash in:


No dice.  Same issue.



---8<
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 709492d901a1..040ce083c89e 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -3662,7 +3662,7 @@ __clk_register(struct device *dev, struct device_node 
*np, struct clk_hw *hw)
   */
  struct clk *clk_register(struct device *dev, struct clk_hw *hw)
  {
-   return __clk_register(dev, dev->of_node, hw);
+   return __clk_register(dev, dev_of_node(dev), hw);
  }
  EXPORT_SYMBOL_GPL(clk_register);
  
@@ -3678,7 +3678,7 @@ EXPORT_SYMBOL_GPL(clk_register);

   */
  int clk_hw_register(struct device *dev, struct clk_hw *hw)
  {
-   return PTR_ERR_OR_ZERO(__clk_register(dev, dev->of_node, hw));
+   return PTR_ERR_OR_ZERO(__clk_register(dev, dev_of_node(dev), hw));
  }
  EXPORT_SYMBOL_GPL(clk_hw_register);
  




--
Jeffrey Hugo
Qualcomm Datacenter Technologies as an affiliate of Qualcomm 
Technologies, Inc.

Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


Re: [PATCH-tip v3 04/14] locking/rwsem: Implement lock handoff to prevent lock starvation

2019-04-10 Thread Peter Zijlstra


Hurph, I was still looking at v2.. I suppose I'll go stare at this
version, I don't think you said there were many changes, right?

This version seems to still suffer that HANDOFF issue I found on v2.

On Wed, Apr 10, 2019 at 02:42:21PM -0400, Waiman Long wrote:
> Because of writer lock stealing, it is possible that a constant
> stream of incoming writers will cause a waiting writer or reader to
> wait indefinitely leading to lock starvation.
> 
> The mutex code has a lock handoff mechanism to prevent lock starvation.
> This patch implements a similar lock handoff mechanism to disable
> lock stealing and force lock handoff to the first waiter in the queue
> after at least a 4ms waiting period unless it is a RT writer task which
> doesn't need to wait. The waiting period is used to avoid discouraging
> lock stealing too much to affect performance.
> 
> A rwsem microbenchmark was run for 5 seconds on a 2-socket 40-core
> 80-thread Skylake system with a v5.1 based kernel and 240 write_lock
> threads with 5us sleep critical section.
> 
> Before the patch, the min/mean/max numbers of locking operations for
> the locking threads were 1/7,792/173,696. After the patch, the figures
> became 5,842/6,542/7,458.  It can be seen that the rwsem became much
> more fair, though there was a drop of about 16% in the mean locking
> operations done which was a tradeoff of having better fairness.
> 
> Making the waiter set the handoff bit right after the first wakeup can

What does 'right after the first wakeup' mean? Is that the top-waiter
setting it if it fails to acquire the lock due to steals?

> impact performance especially with a mixed reader/writer workload. With
> the same microbenchmark with short critical section and equal number of
> reader and writer threads (40/40), the reader/writer locking operation
> counts with the current patch were:
> 
>   40 readers, Iterations Min/Mean/Max = 1,793/1,794/1,796
>   40 writers, Iterations Min/Mean/Max = 1,793/34,956/86,081
> 
> By making waiter set handoff bit immediately after wakeup:
> 
>   40 readers, Iterations Min/Mean/Max = 43/44/46
>   40 writers, Iterations Min/Mean/Max = 43/1,263/3,191




RE: [PATCH RESEND 2/5] x86/MCE: Handle MCA controls in a per_cpu way

2019-04-10 Thread Ghannam, Yazen
> -Original Message-
> From: linux-edac-ow...@vger.kernel.org  On 
> Behalf Of Borislav Petkov
> Sent: Wednesday, April 10, 2019 12:26 PM
> To: Ghannam, Yazen 
> Cc: linux-e...@vger.kernel.org; linux-kernel@vger.kernel.org; 
> tony.l...@intel.com; x...@kernel.org
> Subject: Re: [PATCH RESEND 2/5] x86/MCE: Handle MCA controls in a per_cpu way
> 
> On Wed, Apr 10, 2019 at 04:58:12PM +, Ghannam, Yazen wrote:
> > Yes, unused banks in the middle are counted in the MCG_CAP[Count] value.
> 
> Good.
> 
> > Okay, so you're saying the sysfs access should fail if a bank is
> > disabled. Is that correct?
> 
> Well, think about it. If a bank is not operational for whatever reason,
> we should tell the user that.
> 
> > Does "disabled" mean one or both of these?
> > Unused = RAZ/WI in hardware
> > Uninitialized = Not initialized by kernel due to quirks, etc.
> >
> > For an unused bank, it doesn't hurt to write MCA_CTL, but really
> > there's no reason to do so and go through mce_restart().
> 
> Yes, but that bank is non-operational in some form. So we should prevent
> all writes to it because, well, it is not going to do anything. And this
> would be a good way to give feedback to the user that that is the case.
> 
> > For an uninitialized bank, should we prevent users from overriding the
> > kernel's settings?
> 
> That all depends on the quirks. Whether we should allow them to be
> overridden or not. I don't think we've ever thought about it, though.
> 
> Let's look at one:
> 
> if (c->x86_vendor == X86_VENDOR_AMD) {
> if (c->x86 == 15 && cfg->banks > 4) {
> /*
>  * disable GART TBL walk error reporting, which
>  * trips off incorrectly with the IOMMU & 3ware
>  * & Cerberus:
>  */
> clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
> 
> 
> Yah, so if the user reenables those GART errors, then she/he will see a
> lot of MCEs reported and will maybe complain about it. And then we'll
> say, but why did you enable them then. And she/he'll say: uh, didn't
> know. Or, I was just poking at sysfs and this happened.
> 
> Then we can say, well, don't do that then! :-)
> 
> So my current position is, meh, who cares. But then I'm looking at
> another quirk:
> 
> if (c->x86_vendor == X86_VENDOR_INTEL) {
> /*
>  * SDM documents that on family 6 bank 0 should not be written
>  * because it aliases to another special BIOS controlled
>  * register.
>  * But it's not aliased anymore on model 0x1a+
>  * Don't ignore bank 0 completely because there could be a
>  * valid event later, merely don't write CTL0.
>  */
> 
> if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
> mce_banks[0].init = 0;
> 
> 
> which basically prevents that bank from being reinitialized. So I guess
> we have that functionality already - we simply need to pay attention to
> w->init.
> 
> Right?

Okay, I'm with you.

So I'm thinking to add another patch to the set. This will set mce_bank.init=0 
if we read MCA_CTL=0 from the hardware.

Then we check if mce_bank.init=0 in the set/show functions and give a message 
if the bank is not used.

How does that sound?

Thanks,
Yazen


Re: [PATCH] zram: pass down the bvec we need to read into in the work struct

2019-04-10 Thread Jerome Glisse
Adding more Cc and stable (i thought this was a 5.1 addition). Note that
without this patch on arch/kernel where PAGE_SIZE != 4096 userspace
could read random memory through a zram block device (though userspace
probably would have no control on the address being read).

On Mon, Apr 08, 2019 at 02:32:19PM -0400, jgli...@redhat.com wrote:
> From: Jérôme Glisse 
> 
> When scheduling work item to read page we need to pass down the proper
> bvec struct which point to the page to read into. Before this patch it
> uses randomly initialized bvec (only if PAGE_SIZE != 4096) which is
> wrong.
> 
> Signed-off-by: Jérôme Glisse 
> Cc: Minchan Kim 
> Cc: Nitin Gupta 
> Cc: Sergey Senozhatsky 
> Cc: linux-kernel@vger.kernel.org
> ---
>  drivers/block/zram/zram_drv.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 399cad7daae7..d58a359a6622 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -774,18 +774,18 @@ struct zram_work {
>   struct zram *zram;
>   unsigned long entry;
>   struct bio *bio;
> + struct bio_vec bvec;
>  };
>  
>  #if PAGE_SIZE != 4096
>  static void zram_sync_read(struct work_struct *work)
>  {
> - struct bio_vec bvec;
>   struct zram_work *zw = container_of(work, struct zram_work, work);
>   struct zram *zram = zw->zram;
>   unsigned long entry = zw->entry;
>   struct bio *bio = zw->bio;
>  
> - read_from_bdev_async(zram, &bvec, entry, bio);
> + read_from_bdev_async(zram, &zw->bvec, entry, bio);
>  }
>  
>  /*
> @@ -798,6 +798,7 @@ static int read_from_bdev_sync(struct zram *zram, struct 
> bio_vec *bvec,
>  {
>   struct zram_work work;
>  
> + work.bvec = *bvec;
>   work.zram = zram;
>   work.entry = entry;
>   work.bio = bio;
> -- 
> 2.20.1
> 


Re: [PATCH 0/7] introduce cpu.headroom knob to cpu controller

2019-04-10 Thread Song Liu
Hi Morten,

> On Apr 10, 2019, at 4:59 AM, Morten Rasmussen  
> wrote:
> 
> Hi,
> 
> On Mon, Apr 08, 2019 at 02:45:32PM -0700, Song Liu wrote:
>> Servers running latency sensitive workload usually aren't fully loaded for 
>> various reasons including disaster readiness. The machines running our 
>> interactive workloads (referred as main workload) have a lot of spare CPU 
>> cycles that we would like to use for optimistic side jobs like video 
>> encoding. However, our experiments show that the side workload has strong
>> impact on the latency of main workload:
>> 
>>  side-job   main-load-level   main-avg-latency
>> none  1.0  1.00
>> none  1.1  1.10
>> none  1.2  1.10 
>> none  1.3  1.10
>> none  1.4  1.15
>> none  1.5  1.24
>> none  1.6  1.74
>> 
>> ffmpeg1.0  1.82
>> ffmpeg1.1  2.74
>> 
>> Note: both the main-load-level and the main-avg-latency numbers are
>> _normalized_.
> 
> Could you reveal what level of utilization those main-load-level numbers
> correspond to? I'm trying to understand why the latency seems to
> increase rapidly once you hit 1.5. Is that the point where the system
> hits 100% utilization?

The load level above is measured as requests-per-second. 

When there is no side workload, the system has about 45% busy CPU with 
load level of 1.0; and about 75% busy CPU at load level of 1.5. 

The saturation starts before the system hitting 100% utilization. This is
true for many different resources: ALUs in SMT systems, cache lines, 
memory bandwidths, etc. 

> 
>> In these experiments, ffmpeg is put in a cgroup with cpu.weight of 1 
>> (lowest priority). However, it consumes all idle CPU cycles in the 
>> system and causes high latency for the main workload. Further experiments
>> and analysis (more details below) shows that, for the main workload to meet
>> its latency targets, it is necessary to limit the CPU usage of the side
>> workload so that there are some _idle_ CPU. There are various reasons
>> behind the need of idle CPU time. First, shared CPU resource saturation 
>> starts to happen way before time-measured utilization reaches 100%. 
>> Secondly, scheduling latency starts to impact the main workload as CPU 
>> reaches full utilization. 
>> 
>> Currently, the cpu controller provides two mechanisms to protect the main 
>> workload: cpu.weight and cpu.max. However, neither of them is sufficient 
>> in these use cases. As shown in the experiments above, side workload with 
>> cpu.weight of 1 (lowest priority) would still consume all idle CPU and add 
>> unacceptable latency to the main workload. cpu.max can throttle the CPU 
>> usage of the side workload and preserve some idle CPU. However, cpu.max 
>> cannot react to changes in load levels. For example, when the main 
>> workload uses 40% of CPU, cpu.max of 30% for the side workload would yield 
>> good latencies for the main workload. However, when the workload 
>> experiences higher load levels and uses more CPU, the same setting (cpu.max 
>> of 30%) would cause the interactive workload to miss its latency target. 
>> 
>> These experiments demonstrated the need for a mechanism to effectively 
>> throttle CPU usage of the side workload and preserve idle CPU cycles. 
>> The mechanism should be able to adjust the level of throttling based on
>> the load level of the main workload. 
>> 
>> This patchset introduces a new knob for cpu controller: cpu.headroom. 
>> cgroup of the main workload uses cpu.headroom to ensure side workload to 
>> use limited CPU cycles. For example, if a main workload has a cpu.headroom 
>> of 30%. The side workload will be throttled to give 30% overall idle CPU. 
>> If the main workload uses more than 70% of CPU, the side workload will only 
>> run with configurable minimal cycles. This configurable minimal cycles is
>> referred as "tolerance" of the main workload.
> 
> IIUC, you are proposing to basically apply dynamic bandwidth throttling to
> side-jobs to preserve a specific headroom of idle cycles.

This is accurate. The effect is similar to cpu.max, but more dynamic. 

> 
> The bit that isn't clear to me, is _why_ adding idle cycles helps your
> workload. I'm not convinced that adding headroom gives any latency
> improvements beyond watering down the impact of your side jobs. AFAIK,

We think the latency improvements actually come from watering down the 
impact of side jobs. It is not just statistically improving average 
latency numbers, but also reduces resource contention caused by the side
workload. I don't know whether it is from reducing contention of ALUs, 
memory bandwidth, CPU caches, or something else, but we saw reduced 
latencies when headroom is used. 

> the throttling mechanism effectively removes the throttled tasks from
> the schedule according to a specific duty cycle. When

Re: KASAN: use-after-free Read in path_lookupat

2019-04-10 Thread Linus Torvalds
On Wed, Apr 10, 2019 at 8:11 AM Al Viro  wrote:
>
> Both are in vfs.git#fixes.  Which way should that go - directly or
> via linux-security.git?

Just do it directly. I doubt you can trigger them for securityfs and
apparmourfs, since normal users have no way to remove any files from
them, so the race with final unlink sounds fairly irrelevant in
practice, no?

   Linus


Re: [PATCH V5 08/12] perf/x86/intel: Add Icelake support

2019-04-10 Thread Peter Zijlstra
On Wed, Apr 10, 2019 at 02:22:21PM -0400, Liang, Kan wrote:
> > > That is, are there really bits we want to mask in there?
> > 
> > For instruction event, right, we don't need mask it.
> > I will change it.
> > 
> 
> Actually, we have to mask some bits here, e.g. ARCH_PERFMON_EVENTSEL_INT,
> ARCH_PERFMON_EVENTSEL_USR and ARCH_PERFMON_EVENTSEL_OS. Those bits will be
> set in hw_config().

Ah, bah, You're right. I misread and though we were comparing against
the user provided raw config.

> 
> Also, other filds, e.g the INV, ANY, E, or CMASK fields are not allowed for
> reduced Skid PEBS.

Sure, those are actually forced 0 with the existing thing.

I'll go fold something like this back in. Thanks!

> 
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index dae3d84..3fa36c9 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -3463,6 +3463,9 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc,
> int idx,
>   return c;
>  }
> 
> +#define EVENT_CONFIG(config) \
> + (config & (X86_ALL_EVENT_FLAGS | INTEL_ARCH_EVENT_MASK))
> +
>  static struct event_constraint *
>  icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
> struct perf_event *event)
> @@ -3472,7 +3475,7 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc,
> int idx,
>* Force instruction:ppp in Fixed counter 0
>*/
>   if ((event->attr.precise_ip == 3) &&
> - (event->hw.config == X86_CONFIG(.event=0xc0)))
> + (EVENT_CONFIG(event->hw.config) == X86_CONFIG(.event=0xc0)))
>   return &fixed_counter0_constraint;
> 
>   return hsw_get_event_constraints(cpuc, idx, event);
> 
> Thanks,
> Kan


Re: [PATCH] Staging: rtl8723bs: Remove typedef in struct sdio_data

2019-04-10 Thread Madhumthia Prabakaran
On Wed, Apr 10, 2019 at 09:49:54AM +0300, Dan Carpenter wrote:
> On Tue, Apr 09, 2019 at 11:16:17AM -0500, Madhumitha Prabakaran wrote:
> > diff --git a/drivers/staging/rtl8723bs/include/drv_types.h 
> > b/drivers/staging/rtl8723bs/include/drv_types.h
> > index bafb2c30e7fb..b0623c936940 100644
> > --- a/drivers/staging/rtl8723bs/include/drv_types.h
> > +++ b/drivers/staging/rtl8723bs/include/drv_types.h
> > @@ -220,7 +220,7 @@ struct registry_priv
> >  #define BSSID_SZ(field)   sizeof(((struct wlan_bssid_ex *) 0)->field)
> >  
> >  #include 
> > -#define INTF_DATA SDIO_DATA
> > +#define INTF_DATA struct sdio_data
> >  
> 
> Just get rid of INTF_DATA data as well.  It's only used once a bit lower
> in the file.  (Get rid of the ifdef around it).

But, reference of INTF_DATA is also included in file 
./drivers/staging/rtl8723bs/include/drv_types.h:487:
return &dvobj->intf_data.func->dev;

thanks for the review
madhumitha


> 
> regards,
> dan carpenter
> 
> 
> 


Re: [PULL -- 5.1 REGRESSION] Bluetooth: btusb: request wake pin with NOAUTOEN

2019-04-10 Thread Linus Torvalds
On Wed, Apr 10, 2019 at 7:44 AM Brian Norris  wrote:
>
> I think our key difference here is in how much we trust the device:
> knowing the quality of the firmware running on some of these devices,
> I wouldn't totally trust that they get it right.

No.

You claim that IRQ_NOAUTOEN makes any difference, It doesn't.

I claim that you should get rid of the disable/enable_irq() games you
play, and replace them with just requesting the interrupt.

At which point the whole  IRQ_NOAUTOEN dance is entirely pointless.

Just don't do it.

This has nothing to do with trusting hardware, and everything to do
with "why do you request an interrupt that you aren't actually ready
to accept, and the hardware isn't even properly configured to generate
yet"?

See my point?

   Linus


Re: [PATCH] sparc: use struct_size() in kzalloc()

2019-04-10 Thread Gustavo A. R. Silva
Hi Dave,

I wonder if you can take this.

Thanks
--
Gustavo

On 1/8/19 10:13 AM, Gustavo A. R. Silva wrote:
> One of the more common cases of allocation size calculations is finding the
> size of a structure that has a zero-sized array at the end, along with memory
> for some number of elements for that array. For example:
> 
> struct foo {
> int stuff;
> void *entry[];
> };
> 
> instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL);
> 
> Instead of leaving these open-coded and prone to type mistakes, we can now
> use the new struct_size() helper:
> 
> instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL);
> 
> This code was detected with the help of Coccinelle.
> 
> Signed-off-by: Gustavo A. R. Silva 
> ---
>  arch/sparc/kernel/cpumap.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c
> index d1d52822603d..1cb62bfeaa1f 100644
> --- a/arch/sparc/kernel/cpumap.c
> +++ b/arch/sparc/kernel/cpumap.c
> @@ -194,8 +194,7 @@ static struct cpuinfo_tree *build_cpuinfo_tree(void)
>  
>   n = enumerate_cpuinfo_nodes(tmp_level);
>  
> - new_tree = kzalloc(sizeof(struct cpuinfo_tree) +
> -(sizeof(struct cpuinfo_node) * n), GFP_ATOMIC);
> + new_tree = kzalloc(struct_size(new_tree, nodes, n), GFP_ATOMIC);
>   if (!new_tree)
>   return NULL;
>  
> 


Re: [PATCH 18/22] watchdog: mt7621_wdt: Use 'dev' instead of dereferencing it repeatedly

2019-04-10 Thread Guenter Roeck
On Wed, Apr 10, 2019 at 11:46:24AM -0700, Joe Perches wrote:
> On Wed, 2019-04-10 at 09:27 -0700, Guenter Roeck wrote:
> > Introduce local variable 'struct device *dev' and use it instead of
> > dereferencing it repeatedly.
> > 
> > The conversion was done automatically with coccinelle using the
> > following semantic patches. The semantic patches and the scripts
> > used to generate this commit log are available at
> > https://github.com/groeck/coccinelle-patches
> 
> Interesting collection.  It would be useful to specify which
> particular script generated or enabled this patch.
> 

It is pdev-addvar.cocci, rule 'new'. deref.cocci wasn't used for the
watchdog patches. The script to apply the various rules is watchdog/make.sh.

Pointing to the actual scripts used is a good idea. I'll see if I can add
this for subsequent series. After all, the commit log is also auto-generated,
so this should be straightforward.

> Just scanning briefly, it might have been this one:
> https://github.com/groeck/coccinelle-patches/blob/master/common/deref.cocci
> But it looks like some manual bit might have been required too.

Not for this one. There were a couple of situations where I had to manually
split long lines to avoid checkpatch warnings, and I manually updated a few
of the commit logs, but not in this patch.

> 
> And trivially:
> 
> > diff --git a/drivers/watchdog/mt7621_wdt.c b/drivers/watchdog/mt7621_wdt.c
> []
> > @@ -133,18 +133,19 @@ static struct watchdog_device mt7621_wdt_dev = {
> []
> > watchdog_init_timeout(&mt7621_wdt_dev, mt7621_wdt_dev.max_timeout,
> > - &pdev->dev);
> > + dev);
> 
> This could be on one line.
> 
Coccinelle isn't perfect. The rule doesn't modify the entire argument list,
only the last argument, so coccinelle missed that it could have merged the
two lines into one.

A checkpatch rule suggesting that multiple extension lines can be merged
might be useful to help finding such situations. Just a thought.

Thanks,
Guenter


Re: [GIT PULL] Please pull RDMA subsystem changes

2019-04-10 Thread pr-tracker-bot
The pull request you sent on Wed, 10 Apr 2019 18:46:23 +:

> git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git tags/for-linus

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/582549e3fbe137eb6ce9be591aca25ca36b4

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker


Re: [PATCH] iio: adc: ti-ads7950: Fix build error without CONFIG_GPIOLIB

2019-04-10 Thread Justin Chen
On Wed, Apr 10, 2019 at 1:47 AM Yue Haibing  wrote:
>
> From: YueHaibing 
>
> When building with CONFIG_GPIOLIB is not set
> gcc warns this:
>
> drivers/iio/adc/ti-ads7950.c:75:19: error: field chip has incomplete type
>   struct gpio_chip chip;
>^~~~
> drivers/iio/adc/ti-ads7950.c: In function ti_ads7950_set:
> drivers/iio/adc/ti-ads7950.c:409:32: error: implicit declaration of function 
> gpiochip_get_data; did you mean acpi_get_data? 
> [-Werror=implicit-function-declaration]
>   struct ti_ads7950_state *st = gpiochip_get_data(chip);
> ^
> acpi_get_data
>
> Reported-by: Hulk Robot 
> Fixes: c97dce792dc8 ("iio: adc: ti-ads7950: add GPIO support")
> Signed-off-by: YueHaibing 
Reviewed-by: Justin Chen 

Thanks,
Justin
> ---
>  drivers/iio/adc/Kconfig | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig
> index 846c7ac..f760269 100644
> --- a/drivers/iio/adc/Kconfig
> +++ b/drivers/iio/adc/Kconfig
> @@ -968,7 +968,7 @@ config TI_ADS1015
>
>  config TI_ADS7950
> tristate "Texas Instruments ADS7950 ADC driver"
> -   depends on SPI
> +   depends on SPI && GPIOLIB
> select IIO_BUFFER
> select IIO_TRIGGERED_BUFFER
> help
> --
> 2.7.4
>
>


[PATCH v3 3/3] module: Make __tracepoints_ptrs as read-only

2019-04-10 Thread Joel Fernandes (Google)
This series hardens the tracepoints in modules by making the array of
pointers referring to the tracepoints as read-only. This array is needed
during module unloading to verify that the tracepoint is quiescent.
There is no reason for the array to be to be writable after init, and
can cause security or other hidden bugs. Mark these as ro_after_init.

Suggested-by: paul...@linux.vnet.ibm.com
Suggested-by: keesc...@chromium.org
Suggested-by: mathieu.desnoy...@efficios.com
Cc: rost...@goodmis.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/module.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index 8b9631e789f0..be980aaa8804 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3320,6 +3320,12 @@ static const char * const ro_after_init_sections[] = {
 * by the SRCU notifiers
 */
"___srcu_struct_ptrs",
+
+   /*
+* Array of tracepoint pointers used for checking if tracepoints are
+* quiescent during unloading.
+*/
+   "__tracepoints_ptrs",
 };
 
 static struct module *layout_and_allocate(struct load_info *info, int flags)
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v3 1/3] module: Prepare for addition of new ro_after_init sections

2019-04-10 Thread Joel Fernandes (Google)
For the purposes of hardening modules by adding sections to
ro_after_init sections, prepare for addition of new ro_after_init
entries which we do in future patches. Create a table to which new
entries could be added later. This makes it less error prone and reduces
code duplication.

Cc: paul...@linux.vnet.ibm.com
Cc: rost...@goodmis.org
Cc: mathieu.desnoy...@efficios.com
Cc: r...@vger.kernel.org
Cc: kernel-harden...@lists.openwall.com
Cc: kernel-t...@android.com
Suggested-by: keesc...@chromium.org
Reviewed-by: keesc...@chromium.org
Acked-by: rost...@goodmis.org
Signed-off-by: Joel Fernandes (Google) 

---
 kernel/module.c | 41 +++--
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 524da609c884..42e4e289d6c7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3300,11 +3300,27 @@ static bool blacklisted(const char *module_name)
 }
 core_param(module_blacklist, module_blacklist, charp, 0400);
 
+/*
+ * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ * layout_sections() can put it in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+static const char * const ro_after_init_sections[] = {
+   ".data..ro_after_init",
+
+   /*
+* __jump_table structures are never modified, with the exception of
+* entries that refer to code in the __init section, which are
+* annotated as such at module load time.
+*/
+   "__jump_table",
+};
+
 static struct module *layout_and_allocate(struct load_info *info, int flags)
 {
struct module *mod;
unsigned int ndx;
-   int err;
+   int err, i;
 
err = check_modinfo(info->mod, info, flags);
if (err)
@@ -3319,23 +3335,12 @@ static struct module *layout_and_allocate(struct 
load_info *info, int flags)
/* We will do a special allocation for per-cpu sections later. */
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
 
-   /*
-* Mark ro_after_init section with SHF_RO_AFTER_INIT so that
-* layout_sections() can put it in the right place.
-* Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
-*/
-   ndx = find_sec(info, ".data..ro_after_init");
-   if (ndx)
-   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-   /*
-* Mark the __jump_table section as ro_after_init as well: these data
-* structures are never modified, with the exception of entries that
-* refer to code in the __init section, which are annotated as such
-* at module load time.
-*/
-   ndx = find_sec(info, "__jump_table");
-   if (ndx)
-   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+   /* Set sh_flags for read-only after init sections */
+   for (i = 0; i < ARRAY_SIZE(ro_after_init_sections); i++) {
+   ndx = find_sec(info, ro_after_init_sections[i]);
+   if (ndx)
+   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+   }
 
/* Determine total sizes, and put offsets in sh_entsize.  For now
   this is done generically; there doesn't appear to be any
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v3 2/3] module: Make srcu_struct ptr array as read-only post init

2019-04-10 Thread Joel Fernandes (Google)
Since commit title ("srcu: Allocate per-CPU data for DEFINE_SRCU() in
modules"), modules that call DEFINE_{STATIC,}SRCU will have a new array
of srcu_struct pointers which is used by srcu code to initialize and
clean up these structures.

There is no reason for this array of pointers to be writable, and can
cause security or other hidden bugs. Mark these as read-only after the
module init has completed.

Suggested-by: paul...@linux.vnet.ibm.com
Suggested-by: keesc...@chromium.org
Acked-by: keesc...@chromium.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/module.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index 42e4e289d6c7..8b9631e789f0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3314,6 +3314,12 @@ static const char * const ro_after_init_sections[] = {
 * annotated as such at module load time.
 */
"__jump_table",
+
+   /*
+* Used for SRCU structures which need to be initialized/cleaned up
+* by the SRCU notifiers
+*/
+   "___srcu_struct_ptrs",
 };
 
 static struct module *layout_and_allocate(struct load_info *info, int flags)
-- 
2.21.0.392.gf8f6787159e-goog



Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

2019-04-10 Thread Vineeth Remanan Pillai
From: Vineeth Pillai 

> Well, I was promised someome else was going to carry all this, also

We are interested in this feature and have been actively testing, benchmarking
and working on fixes. If there is no v2 effort currently in progress, we are
willing to help consolidate all the changes discussed here and prepare a v2.
If there are any pending changes in pipeline, please post your ideas so that
we could include it in v2.

We hope to post the v2 with all the changes here in a week’s time rebased on
the latest tip.



[PATCH 1/2] mtd: nand: Kconfig: correct the MTD_NAND_ECC_SW_BCH select

2019-04-10 Thread Anders Roxell
Config fragments should not have the prefix 'CONFIG_'.

Rework to remove the prefix 'CONFIG_' from 'CONFIG_MTD_NAND_ECC_SW_BCH'.

Fixes: 51ef1d0b2095 ("mtd: nand: Clarify Kconfig entry for software BCH ECC 
algorithm")
Signed-off-by: Anders Roxell 
---
 drivers/mtd/devices/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index 7fcdaf6c279d..f9258d666846 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -207,7 +207,7 @@ comment "Disk-On-Chip Device Drivers"
 config MTD_DOCG3
tristate "M-Systems Disk-On-Chip G3"
select BCH
-   select BCH_CONST_PARAMS if !CONFIG_MTD_NAND_ECC_SW_BCH
+   select BCH_CONST_PARAMS if !MTD_NAND_ECC_SW_BCH
select BITREVERSE
help
  This provides an MTD device driver for the M-Systems DiskOnChip
-- 
2.20.1



[PATCH 2/2] mtd: nand: raw: fix build dependency

2019-04-10 Thread Anders Roxell
When enable CONFIG_MTD_NAND_ECC_SW_BCH as a module, the
MTD_NAND_ECC_SW_BCH depends on MTD_NAND, but the module controlled by
MTD_NAND links against the module controlled by MTD_NAND_ECC_SW_BCH.
This leads to the following link failure.

aarch64-linux-gnu-ld: drivers/mtd/nand/raw/nand_base.o: in function 
`nand_cleanup':
../drivers/mtd/nand/raw/nand_base.c:5886: undefined reference to `nand_bch_free'
aarch64-linux-gnu-ld: ../drivers/mtd/nand/raw/nand_base.c:5886:(.text+0x9928): 
relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol 
`nand_bch_free'
aarch64-linux-gnu-ld: drivers/mtd/nand/raw/nand_base.o: in function 
`nand_set_ecc_soft_ops':
../drivers/mtd/nand/raw/nand_base.c:5093: undefined reference to 
`nand_bch_calculate_ecc'
aarch64-linux-gnu-ld: ../drivers/mtd/nand/raw/nand_base.c:5093:(.text+0xe914): 
relocation truncated to fit: R_AARCH64_ADR_PREL_PG_HI21 against undefined 
symbol `nand_bch_calculate_ecc'
aarch64-linux-gnu-ld: ../drivers/mtd/nand/raw/nand_base.c:5093: undefined 
reference to `nand_bch_calculate_ecc'
aarch64-linux-gnu-ld: ../drivers/mtd/nand/raw/nand_base.c:5094: undefined 
reference to `nand_bch_correct_data'
aarch64-linux-gnu-ld: ../drivers/mtd/nand/raw/nand_base.c:5094:(.text+0xe934): 
relocation truncated to fit: R_AARCH64_ADR_PREL_PG_HI21 against undefined 
symbol `nand_bch_correct_data'
aarch64-linux-gnu-ld: ../drivers/mtd/nand/raw/nand_base.c:5094: undefined 
reference to `nand_bch_correct_data'
aarch64-linux-gnu-ld: ../drivers/mtd/nand/raw/nand_base.c:5148: undefined 
reference to `nand_bch_init'
aarch64-linux-gnu-ld: ../drivers/mtd/nand/raw/nand_base.c:5148:(.text+0xebbc): 
relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol 
`nand_bch_init'

Rework CONFIG_MTD_NAND_ECC_SW_BCH from tristate to bool,
and then link the nand_bch.o file into nand.ko if its enabled.

Fixes: 51ef1d0b2095 ("mtd: nand: Clarify Kconfig entry for software BCH ECC 
algorithm")
Signed-off-by: Anders Roxell 
---
 drivers/mtd/nand/raw/Kconfig  | 2 +-
 drivers/mtd/nand/raw/Makefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig
index 615d738be411..0500c42f31cb 100644
--- a/drivers/mtd/nand/raw/Kconfig
+++ b/drivers/mtd/nand/raw/Kconfig
@@ -22,7 +22,7 @@ menuconfig MTD_RAW_NAND
 if MTD_RAW_NAND
 
 config MTD_NAND_ECC_SW_BCH
-   tristate "Support software BCH ECC"
+   bool "Support software BCH ECC"
select BCH
default n
help
diff --git a/drivers/mtd/nand/raw/Makefile b/drivers/mtd/nand/raw/Makefile
index 8bc6faaa3bc7..efaf5cd25edc 100644
--- a/drivers/mtd/nand/raw/Makefile
+++ b/drivers/mtd/nand/raw/Makefile
@@ -2,7 +2,7 @@
 
 obj-$(CONFIG_MTD_RAW_NAND) += nand.o
 obj-$(CONFIG_MTD_NAND_ECC_SW_HAMMING)  += nand_ecc.o
-obj-$(CONFIG_MTD_NAND_ECC_SW_BCH)  += nand_bch.o
+nand-$(CONFIG_MTD_NAND_ECC_SW_BCH) += nand_bch.o
 obj-$(CONFIG_MTD_SM_COMMON)+= sm_common.o
 
 obj-$(CONFIG_MTD_NAND_CAFE)+= cafe_nand.o
-- 
2.20.1



Re: [PATCH RESEND 2/5] x86/MCE: Handle MCA controls in a per_cpu way

2019-04-10 Thread Borislav Petkov
On Wed, Apr 10, 2019 at 07:41:47PM +, Ghannam, Yazen wrote:
> So I'm thinking to add another patch to the set. This will set
> mce_bank.init=0 if we read MCA_CTL=0 from the hardware.

Ok.

> Then we check if mce_bank.init=0 in the set/show functions and give a
> message if the bank is not used.

Nah, not a message. return -ENODEV or -EINVAL or so.

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.


Re: [PATCH] clocksource/drivers/timer-ti-dm: Remove omap_dm_timer_set_load_start

2019-04-10 Thread Tony Lindgren
Hi,

* Daniel Lezcano  [190410 17:02]:
> can you ask for an acked-by before pulling a patch in your tree?

I certainly do ask and wait for acks where possible :)

Note that I have not applied this patch. I just added
Keerthy to Cc on this thread so maybe you misread the
message earlier. My comment "seems like no other
takers" was for Ladislav regarding somebody picking up
his earlier work, not for picking up this patch :)

In any case, I've been waiting for you guys to pick up
the patch, here's my ack if you're waiting for it:

Acked-by: Tony Lindgren 


> On 04/04/2019 16:17, Tony Lindgren wrote:
> > * Ladislav Michl  [190327 08:12]:
> >> Hello Nathan,
> >>
> >> On Tue, Mar 26, 2019 at 10:01:27PM -0700, Nathan Chancellor wrote:
> >>> Commit 008258d995a6 ("clocksource/drivers/timer-ti-dm: Make
> >>> omap_dm_timer_set_load_start() static") made omap_dm_time_set_load_start
> >>> static because its prototype was not defined in a header. Unfortunately,
> >>> this causes a build warning on multi_v7_defconfig because this function
> >>> is not used anywhere in this translation unit:
> >>>
> >>> drivers/clocksource/timer-ti-dm.c:589:12: error: unused function
> >>> 'omap_dm_timer_set_load_start' [-Werror,-Wunused-function]
> >>>
> >>> In fact, omap_dm_timer_set_load_start hasn't been used anywhere since
> >>> commit f190be7f39a5 ("staging: tidspbridge: remove driver") and the
> >>> prototype was removed in commit 592ea6bd1fad ("clocksource: timer-ti-dm:
> >>> Make unexported functions static"), which is probably where this should
> >>> have happened.
> >>
> >> Alternatively you might want to look at "clocksource: timer-ti-dm: Add 
> >> event
> >> capture": https://patchwork.kernel.org/patch/10237217/ (it makes use of
> >> function being removed here). It is a part of an attempt to add event 
> >> capture
> >> for OMAP. Of course I would like this functionality to be implemented, but
> >> as I do not have a time to continue, I cannot really object removing this
> >> function.
> >>
> >> Just in case you'd be interested in finishing this task ;-)
> > 
> > Well seems like no other takers :) We can always find the missing
> > function in git history when needed, so I suggest we apply this.
> > 
> > Adding Keerthy to Cc as he just posted a similar patch.
> > 
> > Regards,
> > 
> > Tony
> > 
> 
> 
> -- 
>   Linaro.org │ Open source software for ARM SoCs
> 
> Follow Linaro:   Facebook |
>  Twitter |
>  Blog
> 


Re: [PATCH v3 3/3] module: Make __tracepoints_ptrs as read-only

2019-04-10 Thread Steven Rostedt
On Wed, 10 Apr 2019 15:57:08 -0400
"Joel Fernandes (Google)"  wrote:

> This series hardens the tracepoints in modules by making the array of
> pointers referring to the tracepoints as read-only. This array is needed
> during module unloading to verify that the tracepoint is quiescent.
> There is no reason for the array to be to be writable after init, and
> can cause security or other hidden bugs. Mark these as ro_after_init.
> 
> Suggested-by: paul...@linux.vnet.ibm.com
> Suggested-by: keesc...@chromium.org
> Suggested-by: mathieu.desnoy...@efficios.com
> Cc: rost...@goodmis.org
> Signed-off-by: Joel Fernandes (Google) 
> ---
>  kernel/module.c | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/kernel/module.c b/kernel/module.c
> index 8b9631e789f0..be980aaa8804 100644
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -3320,6 +3320,12 @@ static const char * const ro_after_init_sections[] = {
>* by the SRCU notifiers
>*/
>   "___srcu_struct_ptrs",
> +
> + /*
> +  * Array of tracepoint pointers used for checking if tracepoints are
> +  * quiescent during unloading.
> +  */
> + "__tracepoints_ptrs",

Do we ever modify the __tracepoint_ptrs section? I know the jump_label
sections are sorted on load, which means they need to be writable
during init, but if __tracepoint_ptrs is not sorted or touched during
load, why not just put them in the rodata section to begin with?

-- Steve

>  };
>  
>  static struct module *layout_and_allocate(struct load_info *info, int flags)



[PATCH 00/11] asus-wmi: Support of ASUS TUF Gaming series laptops

2019-04-10 Thread Yurii Pavlovskyi
Hi,

I'm new to kernel development, so first I would like to apologize in
advance for any mistakes.

The support for this laptop series is currently non-existent, as the
asus-nb-wmi driver (which is essentially configuration for asus-wmi) fails
to load and multiple ACPI errors are logged in dmesg. This patch series
adds pretty comprehensive support for these relatively new laptops, adds
some code organization, and fixes a couple of bugs in the asus-wmi module.

I have FX565GM (FX505GM) model, but I would guess that the same will
likely apply to the complete FX505 / FX705 series. I've also tested 
this on an older K54C model to ensure that it doesn't break support.
Unfortunately I don't have capacity to test this on more devices.

The patches 1 and 2 are pure bug fixes, but I can not measure relevance
for stable.

OVERALL DESIGN DECISIONS
Besides this patch, I've written experimental separate driver [1] for this
series to make it usable on my system as a DKMS module for 4.18 kernel for
the time being. One might wonder if it is more reasonable to make a new
independent module. The things to consider are that: asus-nb-wmi is
currently loaded by the WMI GUID alias, whereas the original ASUS driver
does check for the ASUS7000 device in ACPI. One should then choose
appropriate base driver instead of asus-wmi when asus-nb-wmi is loaded,
about third of the code gets duplicated in this case and the whole ends up
ugly.

Another question, does it make sense to embed RGB keyboard backlight
support in kernel code? There was discussion [2] about exposing WMI to the
userspace. The same would apply for the fan boost mode support. As I
understand as of yet it is still preferrable to support hardware features
in kernel.

NOTE ON HWMON
One open issue with the result is that hwmon device gets loaded anyway,
but it does not do anything noticeable. The heavily reduced code for the
MFUN is present in DSDT, but it either really does nothing or possibly
call something unnoticeable via DMA.

I've managed to detect that the thermal sensor is not present, but the MFUN
for reading fan speed does return 0 and not an error. Taking this as 
condition for disabling hwmon might intermittently break some existing
devices if the RPM is really 0 (no idea if that is really possible). One
might ponder on the better way to detect the presence of manual fan
control.

NOTE ON QUIRKS
I would speculate that the queue might be really present in many more
devices, it just didn't get noticed. Anyway after this is merged one might
consider if it is reasonable to enable it always and fallback if flush
fails. The patch does enable the new quirks only for very new models.

Regarding the DSTS force quirk, as I understand the underlying issue is a
workaround for EEEPC devices and use of DSTS is more conventional. It
might be reasonable to find a way to detect specific DSTS device ID that
is present on EEEPC instead, apply same ACPI device detection approach or
just duplicate the relevant method calls. I don't have access to such
device or it's DSDT and can't evaluate any of these options myself.

NOTE ON KEYBOARD BACKLIGHT
When the keyboard backlight is set via 0x50021 DEVID the brightness drops
slightly compared to brightness after boot. I did not find any way to
revert this. The method does set some bit in EC RAM, but this address is
not used anywhere else.  Unfortunately I wiped original OS after two hours
after unpacking. If someone can verify whether it is identical to behavior
of the vendor driver it would be appreciated.

NOTE ON UPOWER DAEMON
If you're testing with GNOME, notice that UPower does hang pretty badly if
the module is removed at runtime at least on my device. Stop it with
'systemctl stop upower' before removing the module and then restart it
again.

[1] https://github.com/hackbnw/faustus
[2] https://lwn.net/Articles/725725/

Yurii Pavlovskyi (11):
  platform/x86: asus-wmi: Fix hwmon device cleanup
  platform/x86: asus-wmi: Fix preserving keyboard backlight intensity on
load
  platform/x86: asus-wmi: Increase input buffer size of WMI methods
  platform/x86: asus-wmi: Add quirk to force DSTS WMI method detection
  platform/x86: asus-wmi: Support queued WMI event codes
  platform/x86: asus-nb-wmi: Add microphone mute key code
  platform/x86: asus-wmi: Organize code into sections
  platform/x86: asus-wmi: Enhance detection of thermal data
  platform/x86: asus-wmi: Control RGB keyboard backlight
  platform/x86: asus-wmi: Switch fan boost mode
  platform/x86: asus-wmi: Do not disable keyboard backlight on unload

 .../ABI/testing/sysfs-platform-asus-wmi   |  71 ++
 drivers/platform/x86/asus-nb-wmi.c|   9 +-
 drivers/platform/x86/asus-wmi.c   | 755 +++---
 drivers/platform/x86/asus-wmi.h   |   7 +
 include/linux/platform_data/x86/asus-wmi.h|   3 +
 5 files changed, 742 insertions(+), 103 deletions(-)

-- 
2.17.1



[PATCH 1/2] s390: only build for new CPUs with clang

2019-04-10 Thread Arnd Bergmann
llvm does not understand -march=z9-109 and older target
specifiers, so disable the respective Kconfig settings and
the logic to make the boot code work on old systems when
building with clang.

Signed-off-by: Arnd Bergmann 
---
 arch/s390/Kconfig   | 6 ++
 arch/s390/boot/Makefile | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 8cd860cba4d1..1a2eec61196d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -240,6 +240,7 @@ choice
 
 config MARCH_Z900
bool "IBM zSeries model z800 and z900"
+   depends on !CC_IS_CLANG
select HAVE_MARCH_Z900_FEATURES
help
  Select this to enable optimizations for model z800/z900 (2064 and
@@ -248,6 +249,7 @@ config MARCH_Z900
 
 config MARCH_Z990
bool "IBM zSeries model z890 and z990"
+   depends on !CC_IS_CLANG
select HAVE_MARCH_Z990_FEATURES
help
  Select this to enable optimizations for model z890/z990 (2084 and
@@ -256,6 +258,7 @@ config MARCH_Z990
 
 config MARCH_Z9_109
bool "IBM System z9"
+   depends on !CC_IS_CLANG
select HAVE_MARCH_Z9_109_FEATURES
help
  Select this to enable optimizations for IBM System z9 (2094 and
@@ -347,12 +350,15 @@ config TUNE_DEFAULT
 
 config TUNE_Z900
bool "IBM zSeries model z800 and z900"
+   depends on !CC_IS_CLANG
 
 config TUNE_Z990
bool "IBM zSeries model z890 and z990"
+   depends on !CC_IS_CLANG
 
 config TUNE_Z9_109
bool "IBM System z9"
+   depends on !CC_IS_CLANG
 
 config TUNE_Z10
bool "IBM System z10"
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index c844eaf24ed7..953a74d04990 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -11,6 +11,7 @@ KASAN_SANITIZE := n
 KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR)
 KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR)
 
+ifndef CONFIG_CC_IS_CLANG
 #
 # Use -march=z900 for als.c to be able to print an error
 # message if the kernel is started on a machine which is too old
@@ -25,6 +26,7 @@ CFLAGS_als.o  += -march=z900
 CFLAGS_REMOVE_sclp_early_core.o+= $(CC_FLAGS_MARCH)
 CFLAGS_sclp_early_core.o   += -march=z900
 endif
+endif
 
 CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
 
-- 
2.20.0



[PATCH 2/2] s390: boot, purgatory: pass $(CLANG_FLAGS) where needed

2019-04-10 Thread Arnd Bergmann
The purgatory and boot Makefiles do not inherit the original cflags,
so clang falls back to the default target architecture when building it,
typically this would be x86 when cross-compiling.

Add $(CLANG_FLAGS) everywhere so we pass the correct --target=s390x-linux
option when cross-compiling.

Signed-off-by: Arnd Bergmann 
---
 arch/s390/Makefile   | 5 +++--
 arch/s390/purgatory/Makefile | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 9c079a506325..443990791099 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -17,12 +17,13 @@ KBUILD_CFLAGS_MODULE += -fPIC
 KBUILD_AFLAGS  += -m64
 KBUILD_CFLAGS  += -m64
 aflags_dwarf   := -Wa,-gdwarf-2
-KBUILD_AFLAGS_DECOMPRESSOR := -m64 -D__ASSEMBLY__
+KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
 KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf))
-KBUILD_CFLAGS_DECOMPRESSOR := -m64 -O2
+KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2
 KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY
 KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float
 KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables
+KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning,pointer-sign)
 KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-option,-ffreestanding)
 KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g)
 KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call 
cc-option, -gdwarf-4,))
diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile
index ce6a3f75065b..ecd0b3847fef 100644
--- a/arch/s390/purgatory/Makefile
+++ b/arch/s390/purgatory/Makefile
@@ -22,6 +22,7 @@ KBUILD_CFLAGS := -fno-strict-aliasing -Wall 
-Wstrict-prototypes
 KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare
 KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding
 KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common
+KBUILD_CFLAGS += $(CLANG_FLAGS)
 KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
 KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS))
 
-- 
2.20.0



[PATCH 01/11] platform/x86: asus-wmi: Fix hwmon device cleanup

2019-04-10 Thread Yurii Pavlovskyi
The asus-wmi driver does not clean up the hwmon device on exit or error.
To reproduce the bug, repeat rmmod, insmod to verify that device number
/sys/devices/platform/asus-nb-wmi/hwmon/hwmon?? grows every time. Add
pointer to the device in module state and call cleanup on error.

Signed-off-by: Yurii Pavlovskyi 
---
 drivers/platform/x86/asus-wmi.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index ee1fa93708ec..6b736a9375ef 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -145,6 +145,7 @@ struct asus_wmi {
 
struct input_dev *inputdev;
struct backlight_device *backlight_device;
+   struct device *hwmon_device;
struct platform_device *platform_device;
 
struct led_classdev wlan_led;
@@ -1432,9 +1433,19 @@ static int asus_wmi_hwmon_init(struct asus_wmi *asus)
pr_err("Could not register asus hwmon device\n");
return PTR_ERR(hwmon);
}
+
+   asus->hwmon_device = hwmon;
return 0;
 }
 
+static void asus_wmi_hwmon_exit(struct asus_wmi *asus)
+{
+   if (asus->hwmon_device) {
+   asus_hwmon_fan_set_auto(asus);
+   hwmon_device_unregister(asus->hwmon_device);
+   }
+}
+
 /*
  * Backlight
  */
@@ -2157,6 +2168,7 @@ static int asus_wmi_add(struct platform_device *pdev)
 fail_rfkill:
asus_wmi_led_exit(asus);
 fail_leds:
+   asus_wmi_hwmon_exit(asus);
 fail_hwmon:
asus_wmi_input_exit(asus);
 fail_input:
@@ -2178,7 +2190,7 @@ static int asus_wmi_remove(struct platform_device *device)
asus_wmi_rfkill_exit(asus);
asus_wmi_debugfs_exit(asus);
asus_wmi_platform_exit(asus);
-   asus_hwmon_fan_set_auto(asus);
+   asus_wmi_hwmon_exit(asus);
 
kfree(asus);
return 0;
-- 
2.17.1



[PATCH] cros_ec: Add trace event to trace EC commands

2019-04-10 Thread Raul E Rangel
This is useful to see which EC commands are being executed and when.

To enable:

echo 'cros_ec:*' >> /sys/kernel/debug/tracing/set_event

Example:

/* cros_ec_cmd: version: 0, command: GET_VERSION */
/* cros_ec_cmd: version: 0, command: GET_PROTOCOL_INFO */
/* cros_ec_cmd: version: 1, command: GET_CMD_VERSIONS */
/* cros_ec_cmd: version: 1, command: USB_PD_CONTROL */

Signed-off-by: Raul E Rangel 
---

 drivers/platform/chrome/Makefile|   4 +-
 drivers/platform/chrome/cros_ec_proto.c |   4 +
 drivers/platform/chrome/cros_ec_trace.c | 163 
 drivers/platform/chrome/cros_ec_trace.h |  51 
 4 files changed, 221 insertions(+), 1 deletion(-)
 create mode 100644 drivers/platform/chrome/cros_ec_trace.c
 create mode 100644 drivers/platform/chrome/cros_ec_trace.h

diff --git a/drivers/platform/chrome/Makefile b/drivers/platform/chrome/Makefile
index 1e2f0029b597..e542268454a4 100644
--- a/drivers/platform/chrome/Makefile
+++ b/drivers/platform/chrome/Makefile
@@ -3,12 +3,14 @@
 obj-$(CONFIG_CHROMEOS_LAPTOP)  += chromeos_laptop.o
 obj-$(CONFIG_CHROMEOS_PSTORE)  += chromeos_pstore.o
 obj-$(CONFIG_CHROMEOS_TBMC)+= chromeos_tbmc.o
+# tell define_trace.h where to find the cros ec trace header
+CFLAGS_cros_ec_trace.o:=   -I$(src)
 obj-$(CONFIG_CROS_EC_I2C)  += cros_ec_i2c.o
 obj-$(CONFIG_CROS_EC_SPI)  += cros_ec_spi.o
 cros_ec_lpcs-objs  := cros_ec_lpc.o cros_ec_lpc_reg.o
 cros_ec_lpcs-$(CONFIG_CROS_EC_LPC_MEC) += cros_ec_lpc_mec.o
 obj-$(CONFIG_CROS_EC_LPC)  += cros_ec_lpcs.o
-obj-$(CONFIG_CROS_EC_PROTO)+= cros_ec_proto.o
+obj-$(CONFIG_CROS_EC_PROTO)+= cros_ec_proto.o cros_ec_trace.o
 obj-$(CONFIG_CROS_KBD_LED_BACKLIGHT)   += cros_kbd_led_backlight.o
 obj-$(CONFIG_CROS_EC_LIGHTBAR) += cros_ec_lightbar.o
 obj-$(CONFIG_CROS_EC_VBC)  += cros_ec_vbc.o
diff --git a/drivers/platform/chrome/cros_ec_proto.c 
b/drivers/platform/chrome/cros_ec_proto.c
index 97a068dff192..3d02c8259ac6 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -10,6 +10,8 @@
 #include 
 #include 
 
+#include "cros_ec_trace.h"
+
 #define EC_COMMAND_RETRIES 50
 
 static int prepare_packet(struct cros_ec_device *ec_dev,
@@ -51,6 +53,8 @@ static int send_command(struct cros_ec_device *ec_dev,
int ret;
int (*xfer_fxn)(struct cros_ec_device *ec, struct cros_ec_command *msg);
 
+   trace_cros_ec_cmd(msg);
+
if (ec_dev->proto_version > 2)
xfer_fxn = ec_dev->pkt_xfer;
else
diff --git a/drivers/platform/chrome/cros_ec_trace.c 
b/drivers/platform/chrome/cros_ec_trace.c
new file mode 100644
index ..799c8e2bfd22
--- /dev/null
+++ b/drivers/platform/chrome/cros_ec_trace.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace events for the ChromeOS Embedded Controller
+ *
+ * Copyright 2019 Google LLC.
+ */
+
+#define ec_cmds \
+   {EC_CMD_PROTO_VERSION, "PROTO_VERSION"}, \
+   {EC_CMD_HELLO, "HELLO"}, \
+   {EC_CMD_GET_VERSION, "GET_VERSION"}, \
+   {EC_CMD_READ_TEST, "READ_TEST"}, \
+   {EC_CMD_GET_BUILD_INFO, "GET_BUILD_INFO"}, \
+   {EC_CMD_GET_CHIP_INFO, "GET_CHIP_INFO"}, \
+   {EC_CMD_GET_BOARD_VERSION, "GET_BOARD_VERSION"}, \
+   {EC_CMD_READ_MEMMAP, "READ_MEMMAP"}, \
+   {EC_CMD_GET_CMD_VERSIONS, "GET_CMD_VERSIONS"}, \
+   {EC_CMD_GET_COMMS_STATUS, "GET_COMMS_STATUS"}, \
+   {EC_CMD_TEST_PROTOCOL, "TEST_PROTOCOL"}, \
+   {EC_CMD_GET_PROTOCOL_INFO, "GET_PROTOCOL_INFO"}, \
+   {EC_CMD_GSV_PAUSE_IN_S5, "GSV_PAUSE_IN_S5"}, \
+   {EC_CMD_GET_FEATURES, "GET_FEATURES"}, \
+   {EC_CMD_GET_SKU_ID, "GET_SKU_ID"}, \
+   {EC_CMD_SET_SKU_ID, "SET_SKU_ID"}, \
+   {EC_CMD_FLASH_INFO, "FLASH_INFO"}, \
+   {EC_CMD_FLASH_READ, "FLASH_READ"}, \
+   {EC_CMD_FLASH_WRITE, "FLASH_WRITE"}, \
+   {EC_CMD_FLASH_ERASE, "FLASH_ERASE"}, \
+   {EC_CMD_FLASH_PROTECT, "FLASH_PROTECT"}, \
+   {EC_CMD_FLASH_REGION_INFO, "FLASH_REGION_INFO"}, \
+   {EC_CMD_VBNV_CONTEXT, "VBNV_CONTEXT"}, \
+   {EC_CMD_FLASH_SPI_INFO, "FLASH_SPI_INFO"}, \
+   {EC_CMD_FLASH_SELECT, "FLASH_SELECT"}, \
+   {EC_CMD_PWM_GET_FAN_TARGET_RPM, "PWM_GET_FAN_TARGET_RPM"}, \
+   {EC_CMD_PWM_SET_FAN_TARGET_RPM, "PWM_SET_FAN_TARGET_RPM"}, \
+   {EC_CMD_PWM_GET_KEYBOARD_BACKLIGHT, "PWM_GET_KEYBOARD_BACKLIGHT"}, \
+   {EC_CMD_PWM_SET_KEYBOARD_BACKLIGHT, "PWM_SET_KEYBOARD_BACKLIGHT"}, \
+   {EC_CMD_PWM_SET_FAN_DUTY, "PWM_SET_FAN_DUTY"}, \
+   {EC_CMD_PWM_SET_DUTY, "PWM_SET_DUTY"}, \
+   {EC_CMD_PWM_GET_DUTY, "PWM_GET_DUTY"}, \
+   {EC_CMD_LIGHTBAR_CMD, "LIGHTBAR_CMD"}, \
+   {EC_CMD_LED_CONTROL, "LED_CONTROL"}, \
+   {EC_CMD_VBOOT_HASH, "VBOOT_HASH"}, \
+   {EC_CMD_MOTION_SENSE_CMD, "MOTION_SENSE_CMD"}, \
+   {EC_CMD_FORCE_LID_OPEN, "FORCE_LID_OPEN"}, \
+

[PATCH 02/11] platform/x86: asus-wmi: Fix preserving keyboard, backlight intensity on load

2019-04-10 Thread Yurii Pavlovskyi
The error code and return value are mixed up. The intensity is always set
to 0 on load as kbd_led_read returns either 0 or negative value. To
reproduce set backlight to maximum, reload driver and try to increase it
using keyboard hotkey, the intensity will drop as a result. Correct the
implementation.

Signed-off-by: Yurii Pavlovskyi 
---
 drivers/platform/x86/asus-wmi.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 6b736a9375ef..0fbb947b07c4 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -591,8 +591,7 @@ static int asus_wmi_led_init(struct asus_wmi *asus)
goto error;
}
 
-   led_val = kbd_led_read(asus, NULL, NULL);
-   if (led_val >= 0) {
+   if (!kbd_led_read(asus, &led_val, NULL)) {
asus->kbd_led_wk = led_val;
asus->kbd_led.name = "asus::kbd_backlight";
asus->kbd_led.flags = LED_BRIGHT_HW_CHANGED;
-- 
2.17.1



[PATCH 03/11] platform/x86: asus-wmi: Increase input buffer size of WMI methods

2019-04-10 Thread Yurii Pavlovskyi
The asus-nb-wmi driver is matched by WMI alias but fails to load on TUF
Gaming series laptops producing multiple ACPI errors in kernel log. Patch
was tested on TUF Gaming FX505GM and older K54C model.

The input buffer for WMI method invocation size is 2 dwords, whereas
3 are expected by this model.

FX505GM:
..
Method (WMNB, 3, Serialized)
{
P8XH (Zero, 0x11)
CreateDWordField (Arg2, Zero, IIA0)
CreateDWordField (Arg2, 0x04, IIA1)
CreateDWordField (Arg2, 0x08, IIA2)
Local0 = (Arg1 & 0x)
...

Compare with older K54C:
...
Method (WMNB, 3, NotSerialized)
{
CreateDWordField (Arg2, 0x00, IIA0)
CreateDWordField (Arg2, 0x04, IIA1)
Local0 = (Arg1 & 0x)
...

Increase buffer size to 3 dwords. No negative consequences of this change
are expected, as input buffer size is not verified. The original function
is replaced by a wrapper for a new method passing value 0 for the last
parameter. The new function will be used to control RGB keyboard
backlight.

Signed-off-by: Yurii Pavlovskyi 
---
One of current kernel errors:
ACPI BIOS Error (bug): AE_AML_BUFFER_LIMIT, Field [IIA2] at bit offset/
length 64/32 exceeds size of target Buffer (64 bits)
(20190215/dsopcode-203)
[ 4528.573948] No Local Variables are initialized for Method [WMNB]
[ 4528.573949] Initialized Arguments for Method [WMNB]:  (3 arguments
defined for method invocation)
[ 4528.573950]   Arg0:   bd1bea5a 
Integer 
[ 4528.573952]   Arg1:   d414dc53 
Integer 4E464741
[ 4528.573954]   Arg2:   fcefea4b 
Buffer(8) F0 95 08 00 00 00 00 00
[ 4528.573959] ACPI Error: Aborting method \_SB.ATKD.WMNB due to previous
error (AE_AML_BUFFER_LIMIT) (20190215/psparse-531)
[ 4528.686425] asus-nb-wmi: probe of asus-nb-wmi failed with error -5
---
 drivers/platform/x86/asus-wmi.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 0fbb947b07c4..cfccfc0b8c2f 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -95,6 +95,7 @@ static bool ashs_present(void)
 struct bios_args {
u32 arg0;
u32 arg1;
+   u32 arg2; /* At least TUF Gaming series uses 3 dword input buffer. */
 } __packed;
 
 /*
@@ -220,11 +221,13 @@ static void asus_wmi_input_exit(struct asus_wmi *asus)
asus->inputdev = NULL;
 }
 
-int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval)
+static int asus_wmi_evaluate_method_3dw(u32 method_id, u32 arg0, u32 arg1,
+   u32 arg2, u32 *retval)
 {
struct bios_args args = {
.arg0 = arg0,
.arg1 = arg1,
+   .arg2 = arg2
};
struct acpi_buffer input = { (acpi_size) sizeof(args), &args };
struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -256,6 +259,11 @@ int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 
arg1, u32 *retval)
 
return 0;
 }
+
+int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval)
+{
+   return asus_wmi_evaluate_method_3dw(method_id, arg0, arg1, 0, retval);
+}
 EXPORT_SYMBOL_GPL(asus_wmi_evaluate_method);
 
 static int asus_wmi_evaluate_method_agfn(const struct acpi_buffer args)
-- 
2.17.1



Re: [PATCH-tip v3 04/14] locking/rwsem: Implement lock handoff to prevent lock starvation

2019-04-10 Thread Waiman Long
On 04/10/2019 03:38 PM, Peter Zijlstra wrote:
> Hurph, I was still looking at v2.. I suppose I'll go stare at this
> verison, I don't think you said there were many changes, right?
>
> This version seems to still suffer that HANDOFF issue I found on v2.

It is mainly minor adjustments. I was trying to add two more patches.
While at it, make some minor changes. I will address your concern in a
separate mail later today.

>
> On Wed, Apr 10, 2019 at 02:42:21PM -0400, Waiman Long wrote:
>> Because of writer lock stealing, it is possible that a constant
>> stream of incoming writers will cause a waiting writer or reader to
>> wait indefinitely leading to lock starvation.
>>
>> The mutex code has a lock handoff mechanism to prevent lock starvation.
>> This patch implements a similar lock handoff mechanism to disable
>> lock stealing and force lock handoff to the first waiter in the queue
>> after at least a 4ms waiting period unless it is a RT writer task which
>> doesn't need to wait. The waiting period is used to avoid discouraging
>> lock stealing too much to affect performance.
>>
>> A rwsem microbenchmark was run for 5 seconds on a 2-socket 40-core
>> 80-thread Skylake system with a v5.1 based kernel and 240 write_lock
>> threads with 5us sleep critical section.
>>
>> Before the patch, the min/mean/max numbers of locking operations for
>> the locking threads were 1/7,792/173,696. After the patch, the figures
>> became 5,842/6,542/7,458.  It can be seen that the rwsem became much
>> more fair, though there was a drop of about 16% in the mean locking
>> operations done which was a tradeoff of having better fairness.
>>
>> Making the waiter set the handoff bit right after the first wakeup can
> What does 'right after the first wakeup' mean? If that the top-waiter
> setting it if it fails to acquire the lock due to steals?
Yes. It is after the first sleep.

Cheers,
Longman


[PATCH 04/11] platform/x86: asus-wmi: Add quirk to force DSTS WMI method detection

2019-04-10 Thread Yurii Pavlovskyi
The DSTS method detection fails, as nothing is returned if method is not
defined in WMNB. As a result the control of keyboard backlight is not
functional for TUF Gaming series laptops (at the time the only
functionality of the driver on this model implemented with WMI methods).

Patch was tested on a newer TUF Gaming FX505GM and older K54C model.

FX505GM:
Method (WMNB, 3, Serialized)
{ ...
If ((Local0 == 0x53545344))
{
...
Return (Zero)
}
...
// No return
}

K54C:
Method (WMNB, 3, Serialized)
{ ...
If ((Local0 == 0x53545344))
{
...
Return (0x02)
}
...
Return (0xFFFE)
}

The non-existing method ASUS_WMI_METHODID_DSTS=0x53544344 (actually it is
DCTS in little endian ASCII) is selected in asus->dsts.

One way to fix this would be to call both for every known device ID until
some answers - this would increase module load time.

Another option is to check some device that is known to exist on every
model - none known at the time.

Last option, which is implemented, is to check for presence of the
ASUS7000 device in ACPI tree (it is a dummy device), which is the
condition used for loading the vendor driver for this model. This might
not fix every affected model ever produced, but it likely does not
introduce any regressions. The patch introduces a quirk that is enabled
when ASUS7000 is found.

Scope (_SB)
{
Device (ATK)
{
Name (_HID, "ASUS7000")  // _HID: Hardware ID
}
}

Signed-off-by: Yurii Pavlovskyi 
---
 drivers/platform/x86/asus-nb-wmi.c |  5 +
 drivers/platform/x86/asus-wmi.c| 16 +---
 drivers/platform/x86/asus-wmi.h|  5 +
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/platform/x86/asus-nb-wmi.c 
b/drivers/platform/x86/asus-nb-wmi.c
index b6f2ff95c3ed..cc5f0765a8d9 100644
--- a/drivers/platform/x86/asus-nb-wmi.c
+++ b/drivers/platform/x86/asus-nb-wmi.c
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "asus-wmi.h"
 
@@ -434,6 +435,10 @@ static void asus_nb_wmi_quirks(struct asus_wmi_driver 
*driver)
}
pr_info("Using i8042 filter function for receiving events\n");
}
+
+   if (acpi_dev_found("ASUS7000")) {
+   driver->quirks->force_dsts = true;
+   }
 }
 
 static const struct key_entry asus_nb_wmi_keymap[] = {
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index cfccfc0b8c2f..58890d87d50c 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -24,7 +24,7 @@
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define PR KBUILD_MODNAME ": "
 
 #include 
 #include 
@@ -1885,11 +1885,21 @@ static int asus_wmi_platform_init(struct asus_wmi *asus)
 * Note, on most Eeepc, there is no way to check if a method exist
 * or note, while on notebooks, they returns 0xFFFE on failure,
 * but once again, SPEC may probably be used for that kind of things.
+*
+* Additionally at least TUF Gaming series laptops return 0 for unknown
+* methods, so the detection in this way is not possible and method must
+* be forced. Likely the presence of ACPI device ASUS7000 indicates
+* this.
 */
-   if (!asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, 0, 0, NULL))
+   if (asus->driver->quirks->force_dsts) {
+   pr_info(PR "DSTS method forced\n");
+   asus->dsts_id = ASUS_WMI_METHODID_DSTS2;
+   } else if (!asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS,
+   0, 0, NULL)) {
asus->dsts_id = ASUS_WMI_METHODID_DSTS;
-   else
+   } else {
asus->dsts_id = ASUS_WMI_METHODID_DSTS2;
+   }
 
/* CWAP allow to define the behavior of the Fn+F2 key,
 * this method doesn't seems to be present on Eee PCs */
diff --git a/drivers/platform/x86/asus-wmi.h b/drivers/platform/x86/asus-wmi.h
index 6c1311f4b04d..94056da02fde 100644
--- a/drivers/platform/x86/asus-wmi.h
+++ b/drivers/platform/x86/asus-wmi.h
@@ -54,6 +54,11 @@ struct quirk_entry {
 */
int no_display_toggle;
u32 xusb2pr;
+   /**
+* Force DSTS instead of DSCS and skip detection. Useful if WMNB
+* returns nothing on unknown method call.
+*/
+   bool force_dsts;
 
bool (*i8042_filter)(unsigned char data, unsigned char str,
 struct serio *serio);
-- 
2.17.1



Re: [PATCH v3 3/3] module: Make __tracepoints_ptrs as read-only

2019-04-10 Thread Joel Fernandes
On Wed, Apr 10, 2019 at 04:11:12PM -0400, Steven Rostedt wrote:
> On Wed, 10 Apr 2019 15:57:08 -0400
> "Joel Fernandes (Google)"  wrote:
> 
> > This series hardens the tracepoints in modules by making the array of
> > pointers referring to the tracepoints as read-only. This array is needed
> > during module unloading to verify that the tracepoint is quiescent.
> > There is no reason for the array to be to be writable after init, and
> > can cause security or other hidden bugs. Mark these as ro_after_init.
> > 
> > Suggested-by: paul...@linux.vnet.ibm.com
> > Suggested-by: keesc...@chromium.org
> > Suggested-by: mathieu.desnoy...@efficios.com
> > Cc: rost...@goodmis.org
> > Signed-off-by: Joel Fernandes (Google) 
> > ---
> >  kernel/module.c | 6 ++
> >  1 file changed, 6 insertions(+)
> > 
> > diff --git a/kernel/module.c b/kernel/module.c
> > index 8b9631e789f0..be980aaa8804 100644
> > --- a/kernel/module.c
> > +++ b/kernel/module.c
> > @@ -3320,6 +3320,12 @@ static const char * const ro_after_init_sections[] = 
> > {
> >  * by the SRCU notifiers
> >  */
> > "___srcu_struct_ptrs",
> > +
> > +   /*
> > +* Array of tracepoint pointers used for checking if tracepoints are
> > +* quiescent during unloading.
> > +*/
> > +   "__tracepoints_ptrs",
> 
> Do we ever modify the __tracepoint_ptrs section? I know the jump_label
> sections are sorted on load, which means they need to be writable
> during init, but if __tracepoint_ptrs is not sorted or touched during
> load, why not just put them in the rodata section to begin with?
> 
> -- Steve

The srcu structure pointer array is modified at module load time because the
array is fixed up by the module loader at load-time with the final locations
of the tracepoints right?  Basically relocation fixups. At compile time, I
believe it is not know what the values in the ptr array are. I believe same
is true for the tracepoint ptrs array.

Also it needs to be in a separate __tracepoint_ptrs so that this code works:


#ifdef CONFIG_TRACEPOINTS
mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
 sizeof(*mod->tracepoints_ptrs),
 &mod->num_tracepoints);
#endif

Did I  miss some point? Thanks,

 - Joel



Re: rseq/arm32: choosing rseq code signature

2019-04-10 Thread Mathieu Desnoyers
- On Apr 9, 2019, at 3:32 PM, Mathieu Desnoyers 
mathieu.desnoy...@efficios.com wrote:

> Hi Will,
> 
> We are about to include the code signature required prior to restartable
> sequences abort handlers into glibc, which will make this ABI choice final.
> We need architecture maintainer input on that signature value.
> 
> That code signature is placed before each abort handler, so the kernel can
> validate that it is indeed jumping to an abort handler (and not some
> arbitrary attacker-chosen code). The signature is never executed.
> 
> The current discussion thread on the glibc mailing list leads us towards
> using a trap with uncommon immediate operand, which simplifies integration
> with disassemblers, emulators, makes it easier to debug if the control
> flow gets redirected there by mistake, and is nicer for some architecture's
> speculative execution.
> 
> We can have different signatures for each sub-architecture, as long as they
> don't have to co-exist within the same process. We can special-case with
> #ifdef for each sub-architecture and endianness if need be. If the 
> architecture
> has instruction set extensions that can co-exist with the architecture
> instruction set within the same process (e.g. thumb for arm), we need to take
> into account to which instruction the chosen signature value would map (and
> possibly decide if we need to extend rseq to support many signatures).
> 
> Here is an example of rseq signature definition template:
> 
> /*
> * TODO: document trap instruction objdump output on each sub-architecture
> * instruction sets, as well as instruction set extensions.
> */
> #define RSEQ_SIG 0x
> 
> Ideally we'd need a patch on top of the Linux kernel
> tools/testing/selftests/rseq/rseq-arm.h file that updates
> the signature value, so I can then pick it up for the glibc
> patchset.

Would the following diff work for you ? If so, can I get your
acked-by ?

diff --git a/tools/testing/selftests/rseq/rseq-arm.h 
b/tools/testing/selftests/rseq/rseq-arm.h
index 5f262c54364f..1f261ad2ac1b 100644
--- a/tools/testing/selftests/rseq/rseq-arm.h
+++ b/tools/testing/selftests/rseq/rseq-arm.h
@@ -5,7 +5,17 @@
  * (C) Copyright 2016-2018 - Mathieu Desnoyers 
  */
 
-#define RSEQ_SIG   0x53053053
+/*
+ * RSEQ_SIG uses the udf A32 instruction with an uncommon immediate operand
+ * value 0x5305. This traps if user-space reaches this instruction by mistake,
+ * and the uncommon operand ensures the kernel does not move the instruction
+ * pointer to attacker-controlled code on rseq abort.
+ *
+ * The instruction pattern is:
+ *
 * e7f530f5      udf    #21253  ; 0x5305
+ */
+#define RSEQ_SIG   0xe7f530f5
 
 #define rseq_smp_mb()  __asm__ __volatile__ ("dmb" ::: "memory", "cc")
 #define rseq_smp_rmb() __asm__ __volatile__ ("dmb" ::: "memory", "cc")
@@ -78,7 +88,8 @@ do {  
\
__rseq_str(table_label) ":\n\t" \
".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".word " __rseq_str(start_ip) ", 0x0, " 
__rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
-   ".word " __rseq_str(RSEQ_SIG) "\n\t"\
+   ".arm\n\t"  \
+   ".inst " __rseq_str(RSEQ_SIG) "\n\t"\
__rseq_str(label) ":\n\t"   \
teardown\
"b %l[" __rseq_str(abort_label) "]\n\t"


> 
> Thanks!
> 
> Mathieu
> 
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


[PATCH 05/11] platform/x86: asus-wmi: Support queued WMI event codes

2019-04-10 Thread Yurii Pavlovskyi
Event codes are expected to be polled from a queue on at least some
models.

The WMI event codes are pushed into a queue based on a circular buffer.
After the INIT method is called, ACPI code is allowed to push events into
this buffer; the INIT method can not be reverted. If the module is
unloaded and an event (such as a hotkey press) gets emitted before
inserting it back, the events get processed delayed by one or, if the
queue overflows, additionally delayed by about 3 seconds.

Patch was tested on a newer TUF Gaming FX505GM and older K54C model.

FX505GM
Device (ATKD)
{ ..
Name (ATKQ, Package (0x10)
{
0xFFFFFFFF, ..
}

Method (IANQ, 1, Serialized)
{
If ((AQNO >= 0x10))
{
Local0 = 0x64
While ((Local0 && (AQNO >= 0x10)))
{
Local0--
Sleep (0x0A)
}
...
..
AQTI++
AQTI &= 0x0F
ATKQ [AQTI] = Arg0
...
}

Method (GANQ, 0, Serialized)
{
..
If (AQNO)
{
...
Local0 = DerefOf (ATKQ [AQHI])
AQHI++
AQHI &= 0x0F
Return (Local0)
}

Return (One)
}

This code is almost identical to K54C, which does return Ones on empty
queue.

K54C:
Method (GANQ, 0, Serialized)
{
If (AQNO)
{
...
Return (Local0)
}

Return (Ones)
}

The fix flushes the old key codes out of the queue on load and after
receiving an event the queue is read until either 0xFFFFFFFF or 1 is
encountered.

It might be considered a minor issue and no normal user would likely to
observe this (there is little reason unloading the driver), but it does
significantly frustrate a developer who is unlucky enough to encounter
this.

Introduce functionality for flushing and processing queued codes, which is
enabled via quirk flag for ASUS7000. It might be considered if it is
reasonable to enable it everywhere (might introduce regressions) or always
try to flush the queue on module load and try to detect if this quirk is
present in the future.

This patch limits the effect to the specific hardware defined by ASUS7000
device that is used for driver detection by vendor driver of Fx505. The
fallback is also implemented in case initial flush fails.

Signed-off-by: Yurii Pavlovskyi 
---
 drivers/platform/x86/asus-nb-wmi.c |   1 +
 drivers/platform/x86/asus-wmi.c| 122 ++---
 drivers/platform/x86/asus-wmi.h|   2 +
 3 files changed, 97 insertions(+), 28 deletions(-)

diff --git a/drivers/platform/x86/asus-nb-wmi.c 
b/drivers/platform/x86/asus-nb-wmi.c
index cc5f0765a8d9..357d273ed336 100644
--- a/drivers/platform/x86/asus-nb-wmi.c
+++ b/drivers/platform/x86/asus-nb-wmi.c
@@ -438,6 +438,7 @@ static void asus_nb_wmi_quirks(struct asus_wmi_driver 
*driver)
 
if (acpi_dev_found("ASUS7000")) {
driver->quirks->force_dsts = true;
+   driver->quirks->wmi_event_queue = true;
}
 }
 
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 58890d87d50c..e0a710c64dea 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -80,6 +80,12 @@ MODULE_LICENSE("GPL");
 #define USB_INTEL_XUSB2PR  0xD0
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI  0x9c31
 
+#define WMI_EVENT_QUEUE_SIZE   0x10
+#define WMI_EVENT_QUEUE_END0x1
+#define WMI_EVENT_MASK 0xFFFF
+/* The event value is always the same. */
+#define WMI_EVENT_VALUE0xFF
+
 static const char * const ashs_ids[] = { "ATK4001", "ATK4002", NULL };
 
 static bool ashs_present(void)
@@ -143,6 +149,7 @@ struct asus_wmi {
int dsts_id;
int spec;
int sfun;
+   bool wmi_event_queue;
 
struct input_dev *inputdev;
struct backlight_device *backlight_device;
@@ -1637,77 +1644,126 @@ static int is_display_toggle(int code)
return 0;
 }
 
-static void asus_wmi_notify(u32 value, void *context)
+static int asus_poll_wmi_event(u32 value)
 {
-   struct asus_wmi *asus = context;
-   struct acpi_buffer response = { ACPI_ALLOCATE_BUFFER, NULL };
+   struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
union acpi_object *obj;
acpi_status status;
-   int code;
-   int orig_code;
-   unsigned int key_value = 1;
-   bool autorelease = 1;
+   int code = -EIO;
 
-   status = wmi_get_event_data(value, &response);
-   if (status != AE_OK) {
-   pr_err("bad event status 0x%x\n", status);
-   return;
+   status = wmi_get_event_data(value, &output);
+   if (ACPI_FAILURE(status)) {
+   pr_warn(PR "Failed to get WMI event code: %s\n",
+   acpi_format_exception(status));
+   return code;
}
 
-   obj = (union acpi_object *)response.pointer;
+   obj = (union acpi_object *)output.

[PATCH 06/11] platform/x86: asus-nb-wmi: Add microphone mute key code

2019-04-10 Thread Yurii Pavlovskyi
The microphone mute key that is present on FX505GM laptop and possibly
others is missing from sparse keymap. Add the missing code.

Also comment on the fan mode switch key that has the same code as the
already used key.

Signed-off-by: Yurii Pavlovskyi 
---
 drivers/platform/x86/asus-nb-wmi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/x86/asus-nb-wmi.c 
b/drivers/platform/x86/asus-nb-wmi.c
index 357d273ed336..39cf447198a9 100644
--- a/drivers/platform/x86/asus-nb-wmi.c
+++ b/drivers/platform/x86/asus-nb-wmi.c
@@ -474,6 +474,7 @@ static const struct key_entry asus_nb_wmi_keymap[] = {
{ KE_KEY, 0x6B, { KEY_TOUCHPAD_TOGGLE } },
{ KE_IGNORE, 0x6E, },  /* Low Battery notification */
{ KE_KEY, 0x7a, { KEY_ALS_TOGGLE } }, /* Ambient Light Sensor Toggle */
+   { KE_KEY, 0x7c, { KEY_MICMUTE } },
{ KE_KEY, 0x7D, { KEY_BLUETOOTH } }, /* Bluetooth Enable */
{ KE_KEY, 0x7E, { KEY_BLUETOOTH } }, /* Bluetooth Disable */
{ KE_KEY, 0x82, { KEY_CAMERA } },
@@ -488,7 +489,7 @@ static const struct key_entry asus_nb_wmi_keymap[] = {
{ KE_KEY, 0x92, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + TV + DVI */
{ KE_KEY, 0x93, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + CRT + TV + DVI 
*/
{ KE_KEY, 0x95, { KEY_MEDIA } },
-   { KE_KEY, 0x99, { KEY_PHONE } },
+   { KE_KEY, 0x99, { KEY_PHONE } }, /* Conflicts with fan mode switch */
{ KE_KEY, 0xA0, { KEY_SWITCHVIDEOMODE } }, /* SDSP HDMI only */
{ KE_KEY, 0xA1, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + HDMI */
{ KE_KEY, 0xA2, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + HDMI */
-- 
2.17.1



[PATCH 07/11] platform/x86: asus-wmi: Organize code into sections

2019-04-10 Thread Yurii Pavlovskyi
The driver has grown (and will more) pretty big which makes it hard to
navigate and understand. Add uniform comments to the code and ensure that
it is sorted into logical sections.

Signed-off-by: Yurii Pavlovskyi 
---
 drivers/platform/x86/asus-wmi.c | 94 -
 1 file changed, 46 insertions(+), 48 deletions(-)

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index e0a710c64dea..b9a6dc224e08 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -191,6 +191,8 @@ struct asus_wmi {
struct asus_wmi_driver *driver;
 };
 
+/* Input 
**/
+
 static int asus_wmi_input_init(struct asus_wmi *asus)
 {
int err;
@@ -228,6 +230,8 @@ static void asus_wmi_input_exit(struct asus_wmi *asus)
asus->inputdev = NULL;
 }
 
+/* WMI 
/
+
 static int asus_wmi_evaluate_method_3dw(u32 method_id, u32 arg0, u32 arg1,
u32 arg2, u32 *retval)
 {
@@ -246,7 +250,7 @@ static int asus_wmi_evaluate_method_3dw(u32 method_id, u32 
arg0, u32 arg1,
 &input, &output);
 
if (ACPI_FAILURE(status))
-   goto exit;
+   return -EIO;
 
obj = (union acpi_object *)output.pointer;
if (obj && obj->type == ACPI_TYPE_INTEGER)
@@ -257,10 +261,6 @@ static int asus_wmi_evaluate_method_3dw(u32 method_id, u32 
arg0, u32 arg1,
 
kfree(obj);
 
-exit:
-   if (ACPI_FAILURE(status))
-   return -EIO;
-
if (tmp == ASUS_WMI_UNSUPPORTED_METHOD)
return -ENODEV;
 
@@ -344,9 +344,8 @@ static int asus_wmi_get_devstate_simple(struct asus_wmi 
*asus, u32 dev_id)
  ASUS_WMI_DSTS_STATUS_BIT);
 }
 
-/*
- * LEDs
- */
+/* LEDs 
***/
+
 /*
  * These functions actually update the LED's, and are called from a
  * workqueue. By doing this as separate work rather than when the LED
@@ -656,6 +655,7 @@ static int asus_wmi_led_init(struct asus_wmi *asus)
return rv;
 }
 
+/* RF 
*/
 
 /*
  * PCI hotplug (for wlan rfkill)
@@ -1078,6 +1078,8 @@ static int asus_wmi_rfkill_init(struct asus_wmi *asus)
return result;
 }
 
+/* Quirks 
*/
+
 static void asus_wmi_set_xusb2pr(struct asus_wmi *asus)
 {
struct pci_dev *xhci_pdev;
@@ -1110,9 +1112,8 @@ static void asus_wmi_set_als(void)
asus_wmi_set_devstate(ASUS_WMI_DEVID_ALS_ENABLE, 1, NULL);
 }
 
-/*
- * Hwmon device
- */
+/* Hwmon device 
***/
+
 static int asus_hwmon_agfn_fan_speed_read(struct asus_wmi *asus, int fan,
  int *speed)
 {
@@ -1388,7 +1389,6 @@ static umode_t asus_hwmon_sysfs_is_visible(struct kobject 
*kobj,
else if (attr == &dev_attr_temp1_input.attr)
dev_id = ASUS_WMI_DEVID_THERMAL_CTRL;
 
-
if (attr == &dev_attr_fan1_input.attr
|| attr == &dev_attr_fan1_label.attr
|| attr == &dev_attr_pwm1.attr
@@ -1460,9 +1460,27 @@ static void asus_wmi_hwmon_exit(struct asus_wmi *asus)
}
 }
 
-/*
- * Backlight
- */
+static int asus_wmi_fan_init(struct asus_wmi *asus)
+{
+   int status;
+
+   asus->asus_hwmon_pwm = -1;
+   asus->asus_hwmon_num_fans = -1;
+   asus->asus_hwmon_fan_manual_mode = false;
+
+   status = asus_hwmon_get_fan_number(asus, &asus->asus_hwmon_num_fans);
+   if (status) {
+   asus->asus_hwmon_num_fans = 0;
+   pr_warn("Could not determine number of fans: %d\n", status);
+   return -ENXIO;
+   }
+
+   pr_info("Number of fans: %d\n", asus->asus_hwmon_num_fans);
+   return 0;
+}
+
+/* Backlight 
**/
+
 static int read_backlight_power(struct asus_wmi *asus)
 {
int ret;
@@ -1644,6 +1662,8 @@ static int is_display_toggle(int code)
return 0;
 }
 
+/* WMI events 
*/
+
 static int asus_poll_wmi_event(u32 value)
 {
struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -1766,9 +1786,8 @@ static int asus_wmi_notify_queue_flush(struct asus_wmi 
*asus)
return -EIO;
 }
 
-/*
- * Sys helpers
- */
+/* Sysfs 
**/
+
 static int parse_arg(const char *buf, unsigned long count, int *val)
 {
if (!count)
@@ -1907,9 +1926,8 @@ static int asus_wmi_sysfs_init(struct platform_device 
*device)
return sysfs_create_group(&device->dev.kobj, &platform_attribute_group);
 }
 
-/*
- * Platform device
- */
+/* Platform 

[PATCH 08/11] platform/x86: asus-wmi: Enhance detection of thermal data

2019-04-10 Thread Yurii Pavlovskyi
The obviously wrong value 1 for temperature device ID in this driver is
returned by at least some devices, including TUF Gaming series laptops,
instead of 0 as expected previously. Observable effect is that a
temp1_input in hwmon reads temperature near absolute zero.

* Consider 0.1 K as erroneous value in addition to 0 K.
* Refactor detection of thermal input availability to a separate function.

Signed-off-by: Yurii Pavlovskyi 
---
 drivers/platform/x86/asus-wmi.c | 46 -
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index b9a6dc224e08..175ecd5b7c51 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -176,6 +176,7 @@ struct asus_wmi {
struct asus_rfkill gps;
struct asus_rfkill uwb;
 
+   bool asus_hwmon_thermal_available;
bool asus_hwmon_fan_manual_mode;
int asus_hwmon_num_fans;
int asus_hwmon_pwm;
@@ -1373,6 +1374,32 @@ static struct attribute *hwmon_attributes[] = {
NULL
 };
 
+static int asus_hwmon_check_thermal_available(struct asus_wmi *asus)
+{
+   u32 value = ASUS_WMI_UNSUPPORTED_METHOD;
+   int err;
+
+   asus->asus_hwmon_thermal_available = false;
+   err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_THERMAL_CTRL, &value);
+
+   if (err < 0) {
+   if (err == -ENODEV)
+   return 0;
+
+   return err;
+   }
+
+   /*
+* If the temperature value in deci-Kelvin is near the absolute
+* zero temperature, something is clearly wrong.
+*/
+   if (!value || value == 1)
+   return 0;
+
+   asus->asus_hwmon_thermal_available = true;
+   return 0;
+}
+
 static umode_t asus_hwmon_sysfs_is_visible(struct kobject *kobj,
  struct attribute *attr, int idx)
 {
@@ -1386,8 +1413,6 @@ static umode_t asus_hwmon_sysfs_is_visible(struct kobject 
*kobj,
 
if (attr == &dev_attr_pwm1.attr)
dev_id = ASUS_WMI_DEVID_FAN_CTRL;
-   else if (attr == &dev_attr_temp1_input.attr)
-   dev_id = ASUS_WMI_DEVID_THERMAL_CTRL;
 
if (attr == &dev_attr_fan1_input.attr
|| attr == &dev_attr_fan1_label.attr
@@ -1412,15 +1437,13 @@ static umode_t asus_hwmon_sysfs_is_visible(struct 
kobject *kobj,
 * - reverved bits are non-zero
 * - sfun and presence bit are not set
 */
-   if (value == ASUS_WMI_UNSUPPORTED_METHOD || value & 0xFFF8
+   if (value == ASUS_WMI_UNSUPPORTED_METHOD || (value & 0xFFF8)
|| (!asus->sfun && !(value & ASUS_WMI_DSTS_PRESENCE_BIT)))
ok = false;
else
ok = fan_attr <= asus->asus_hwmon_num_fans;
-   } else if (dev_id == ASUS_WMI_DEVID_THERMAL_CTRL) {
-   /* If value is zero, something is clearly wrong */
-   if (!value)
-   ok = false;
+   } else if (attr == &dev_attr_temp1_input.attr) {
+   ok = asus->asus_hwmon_thermal_available;
} else if (fan_attr <= asus->asus_hwmon_num_fans && fan_attr != -1) {
ok = true;
} else {
@@ -1476,6 +1499,15 @@ static int asus_wmi_fan_init(struct asus_wmi *asus)
}
 
pr_info("Number of fans: %d\n", asus->asus_hwmon_num_fans);
+
+   status = asus_hwmon_check_thermal_available(asus);
+   if (status) {
+   pr_warn("Could not check if thermal available: %d\n", status);
+   return -ENXIO;
+   }
+
+   pr_info(PR "Thermal available: %d\n",
+   asus->asus_hwmon_thermal_available);
return 0;
 }
 
-- 
2.17.1



[PATCH 09/11] platform/x86: asus-wmi: Control RGB keyboard backlight

2019-04-10 Thread Yurii Pavlovskyi
The WMI exposes two methods for controlling RGB keyboard backlight which
allow to control:
* RGB components in range 00 - ff,
* Switch between 4 effects,
* Switch between 3 effect speed modes,
* Separately enable the backlight on boot, in awake state (after driver
  load), in sleep mode, and probably in something called shutdown mode
  (no observable effects of enabling it are known so far).

The configuration should be written to several sysfs parameter buffers
which are then written via WMI by writing either 1 or 2 to the "kbbl_set"
parameter. When reading the buffers the last written value is returned.

If the 2 is written to "kbbl_set", the parameters will be reset on reboot
(temporary mode), 1 is permanent mode, parameters are retained.

The calls use new 3-dword input buffer method call.

The functionality is only enabled if corresponding DSTS methods return
exact valid values.

The following script demonstrates usage:

echo Red [00 - ff]
echo 33 > /sys/devices/platform/asus-nb-wmi/kbbl/kbbl_red
echo Green [00 - ff]
echo ff > /sys/devices/platform/asus-nb-wmi/kbbl/kbbl_green
echo Blue [00 - ff]
echo 0 > /sys/devices/platform/asus-nb-wmi/kbbl/kbbl_blue
echo Mode: 0 - static color, 1 - blink, 2 - rainbow, 3 - strobe
echo 0 > /sys/devices/platform/asus-nb-wmi/kbbl/kbbl_mode
echo Speed for modes 1 and 2: 0 - slow, 1 - medium, 2 - fast
echo 0 > /sys/devices/platform/asus-nb-wmi/kbbl/kbbl_speed
echo Enable: 02 - on boot, before module load, 08 - awake, 20 - sleep,
echo 2a or ff to set all
echo 2a > /sys/devices/platform/asus-nb-wmi/kbbl/kbbl_flags
echo Save: 1 - permanently, 2 - temporarily, reset after reboot
echo 1 > /sys/devices/platform/asus-nb-wmi/kbbl/kbbl_set

Signed-off-by: Yurii Pavlovskyi 
---
 .../ABI/testing/sysfs-platform-asus-wmi   |  61 
 drivers/platform/x86/asus-wmi.c   | 329 ++
 include/linux/platform_data/x86/asus-wmi.h|   2 +
 3 files changed, 392 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi 
b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 019e1e29370e..300a40519695 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -36,3 +36,64 @@ KernelVersion:   3.5
 Contact:   "AceLan Kao" 
 Description:
Resume on lid open. 1 means on, 0 means off.
+
+What:  /sys/devices/platform//kbbl/kbbl_red
+Date:  Apr 2019
+KernelVersion: 5.1
+Contact:   "Yurii Pavlovskyi" 
+Description:
+   RGB keyboard backlight red component: 00 .. ff.
+
+What:  /sys/devices/platform//kbbl/kbbl_green
+Date:  Apr 2019
+KernelVersion: 5.1
+Contact:   "Yurii Pavlovskyi" 
+Description:
+   RGB keyboard backlight green component: 00 .. ff.
+
+What:  /sys/devices/platform//kbbl/kbbl_blue
+Date:  Apr 2019
+KernelVersion: 5.1
+Contact:   "Yurii Pavlovskyi" 
+Description:
+   RGB keyboard backlight blue component: 00 .. ff.
+
+What:  /sys/devices/platform//kbbl/kbbl_mode
+Date:  Apr 2019
+KernelVersion: 5.1
+Contact:   "Yurii Pavlovskyi" 
+Description:
+   RGB keyboard backlight mode:
+   * 0 - static color,
+   * 1 - blink,
+   * 2 - rainbow,
+   * 3 - strobe.
+
+What:  /sys/devices/platform//kbbl/kbbl_speed
+Date:  Apr 2019
+KernelVersion: 5.1
+Contact:   "Yurii Pavlovskyi" 
+Description:
+   RGB keyboard backlight speed for modes 1 and 2:
+   * 0 - slow,
+   * 1 - medium,
+   * 2 - fast.
+
+What:  /sys/devices/platform//kbbl/kbbl_flags
+Date:  Apr 2019
+KernelVersion: 5.1
+Contact:   "Yurii Pavlovskyi" 
+Description:
+   RGB keyboard backlight enable flags (2a to enable everything), 
OR of:
+   * 02 - on boot (until module load),
+   * 08 - awake,
+   * 20 - sleep.
+
+What:  /sys/devices/platform//kbbl/kbbl_set
+Date:  Apr 2019
+KernelVersion: 5.1
+Contact:   "Yurii Pavlovskyi" 
+Description:
+   Write changed RGB keyboard backlight parameters:
+   * 1 - permanently,
+   * 2 - temporarily.
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 175ecd5b7c51..f4323a57f22f 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -145,6 +145,21 @@ struct asus_rfkill {
u32 dev_id;
 };
 
+struct asus_kbbl_rgb {
+   u8 kbbl_red;
+   u8 kbbl_green;
+   u8 kbbl_blue;
+   u8 kbbl_mode;
+   u8 kbbl_speed;
+
+   u8 kbbl_set_red;
+   u8 kbbl_set_green;
+   u8 kbbl_set_blue;
+   u8 kbbl_set_mode;
+   u8 kbbl_set_speed;
+   u8 kbbl_set_flags;
+};
+
 struct asus_wmi {
int dsts_id;
int spec;
@

[PATCH 10/11] platform/x86: asus-wmi: Switch fan boost mode

2019-04-10 Thread Yurii Pavlovskyi
The WMI exposes a write-only device ID where three modes can be switched
on some laptops (TUF Gaming FX505GM). There is a hotkey combination Fn-F5
that does have a fan icon which is designed to toggle between these 3
modes.

Add a SysFS entry that reads the last written value and updates value in
WMI on write and a hotkey handler that toggles the modes. The
corresponding DEVS device handler does obviously take 3 possible
argument values.

Method (SFBM, 1, NotSerialized)
{
If ((Arg0 == Zero) { .. }
If ((Arg0 == One)) { .. }
If ((Arg0 == 0x02)) { .. }
}

... // DEVS
If ((IIA0 == 0x00110018))
{
   SFBM (IIA1)
   Return (One)
}

* 0x00 - is normal,
* 0x01 - is obviously turbo by the amount of noise, might be useful to
avoid CPU frequency throttling on high load,
* 0x02 - the meaning is unknown at the time as modes are not named
in the vendor documentation, but it does look like a quiet mode as CPU
temperature does increase about 10 degrees on maximum load.

Signed-off-by: Yurii Pavlovskyi 
---
 .../ABI/testing/sysfs-platform-asus-wmi   |  10 ++
 drivers/platform/x86/asus-wmi.c   | 119 --
 include/linux/platform_data/x86/asus-wmi.h|   1 +
 3 files changed, 117 insertions(+), 13 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi 
b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 300a40519695..2b3184e297a7 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -97,3 +97,13 @@ Description:
Write changed RGB keyboard backlight parameters:
* 1 - permanently,
* 2 - temporarily.
+
+What:  /sys/devices/platform//fan_mode
+Date:  Apr 2019
+KernelVersion: 5.1
+Contact:   "Yurii Pavlovskyi" 
+Description:
+   Fan boost mode:
+   * 0 - normal,
+   * 1 - turbo,
+   * 2 - quiet?
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index f4323a57f22f..941c628945ac 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -69,6 +69,7 @@ MODULE_LICENSE("GPL");
 #define NOTIFY_KBD_BRTUP   0xc4
 #define NOTIFY_KBD_BRTDWN  0xc5
 #define NOTIFY_KBD_BRTTOGGLE   0xc7
+#define NOTIFY_KBD_FBM 0x99
 
 #define ASUS_FAN_DESC  "cpu_fan"
 #define ASUS_FAN_MFUN  0x13
@@ -77,6 +78,8 @@ MODULE_LICENSE("GPL");
 #define ASUS_FAN_CTRL_MANUAL   1
 #define ASUS_FAN_CTRL_AUTO 2
 
+#define ASUS_FAN_MODE_COUNT3
+
 #define USB_INTEL_XUSB2PR  0xD0
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI  0x9c31
 
@@ -196,6 +199,9 @@ struct asus_wmi {
int asus_hwmon_num_fans;
int asus_hwmon_pwm;
 
+   bool fan_mode_available;
+   u8 fan_mode;
+
bool kbbl_rgb_available;
struct asus_kbbl_rgb kbbl_rgb;
 
@@ -1833,6 +1839,87 @@ static int asus_wmi_fan_init(struct asus_wmi *asus)
return 0;
 }
 
+/* Fan mode 
***/
+
+static int fan_mode_check_present(struct asus_wmi *asus)
+{
+   u32 result;
+   int err;
+
+   asus->fan_mode_available = false;
+
+   err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_MODE, &result);
+   if (err) {
+   if (err == -ENODEV)
+   return 0;
+   else
+   return err;
+   }
+
+   if (result & ASUS_WMI_DSTS_PRESENCE_BIT)
+   asus->fan_mode_available = true;
+
+   return 0;
+}
+
+static int fan_mode_write(struct asus_wmi *asus)
+{
+   int err;
+   u8 value;
+   u32 retval;
+
+   value = asus->fan_mode % ASUS_FAN_MODE_COUNT;
+   pr_info(PR "Set fan mode: %u\n", value);
+   err = asus_wmi_set_devstate(ASUS_WMI_DEVID_FAN_MODE, value, &retval);
+
+   if (err) {
+   pr_warn(PR "Failed to set fan mode: %d\n", err);
+   return err;
+   }
+
+   if (retval != 1) {
+   pr_warn(PR "Failed to set fan mode (retval): 0x%x\n", retval);
+   return -EIO;
+   }
+
+   return 0;
+}
+
+static int fan_mode_switch_next(struct asus_wmi *asus)
+{
+   asus->fan_mode = (asus->fan_mode + 1) % ASUS_FAN_MODE_COUNT;
+   return fan_mode_write(asus);
+}
+
+static ssize_t fan_mode_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct asus_wmi *asus = dev_get_drvdata(dev);
+
+   return show_u8(asus->fan_mode, buf);
+}
+
+static ssize_t fan_mode_store(struct device *dev,
+   struct device_attribute *attr, const char *buf, size_t count)
+{
+   int result;
+   u8 new_mode;
+
+   struct asus_wmi *asus = dev_get_drvdata(dev);
+
+   result = store_u8(&new_mode, buf, count);
+   if (result < 0)
+   return r

Re: [PATCH] watchdog: machzwd: Mark expected switch fall-through

2019-04-10 Thread Guenter Roeck
On Wed, Apr 10, 2019 at 01:49:05PM -0500, Gustavo A. R. Silva wrote:
> In preparation to enabling -Wimplicit-fallthrough, mark switch
> cases where we are expecting to fall through.
> 
> This patch fixes the following warnings:
> 
> drivers/watchdog/machzwd.c: In function ‘zf_set_timer’:
> ./arch/x86/include/asm/io.h:355:14: warning: this statement may fall through 
> [-Wimplicit-fallthrough=]
>  #define outw outw
> drivers/watchdog/machzwd.c:80:53: note: in expansion of macro ‘outw’
>  #define zf_writew(port, data)  { outb(port, INDEX); outw(data, DATA_W); }
>  ^~~~
> drivers/watchdog/machzwd.c:179:3: note: in expansion of macro ‘zf_writew’
>zf_writew(COUNTER_1, new);
>^
> drivers/watchdog/machzwd.c:180:2: note: here
>   case WD2:
>   ^~~~
> 
> Warning level 3 was used: -Wimplicit-fallthrough=3
> 
> This patch is part of the ongoing efforts to enable
> -Wimplicit-fallthrough.
> 
> Signed-off-by: Gustavo A. R. Silva 

Reviewed-by: Guenter Roeck 

> ---
>  drivers/watchdog/machzwd.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/watchdog/machzwd.c b/drivers/watchdog/machzwd.c
> index 88d823d87a4b..108928dbc754 100644
> --- a/drivers/watchdog/machzwd.c
> +++ b/drivers/watchdog/machzwd.c
> @@ -177,6 +177,7 @@ static inline void zf_set_timer(unsigned short new, 
> unsigned char n)
>   switch (n) {
>   case WD1:
>   zf_writew(COUNTER_1, new);
> + /* fall through */
>   case WD2:
>   zf_writeb(COUNTER_2, new > 0xff ? 0xff : new);
>   default:
> -- 
> 2.21.0
> 


[PATCH 11/11] platform/x86: asus-wmi: Do not disable keyboard backlight on unload

2019-04-10 Thread Yurii Pavlovskyi
The keyboard backlight is disabled when the module is unloaded, as it is
exposed as an LED device. Change this behavior to ignore setting 0 brightness
when the ledclass device is unloading.

Signed-off-by: Yurii Pavlovskyi 
---
 drivers/platform/x86/asus-wmi.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 941c628945ac..a0ffdd99eae2 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -475,6 +475,10 @@ static void do_kbd_led_set(struct led_classdev *led_cdev, 
int value)
 static void kbd_led_set(struct led_classdev *led_cdev,
enum led_brightness value)
 {
+   /* Prevent disabling keyboard backlight on module unregister */
+   if (led_cdev->flags & LED_UNREGISTERING)
+   return;
+
do_kbd_led_set(led_cdev, value);
 }
 
-- 
2.17.1



[tip:core/core] overflow.h: Add comment documenting __ab_c_size()

2019-04-10 Thread tip-bot for Rasmus Villemoes
Commit-ID:  899cbdfa8d147c873fe4e66c38d2cca3c1ac6286
Gitweb: https://git.kernel.org/tip/899cbdfa8d147c873fe4e66c38d2cca3c1ac6286
Author: Rasmus Villemoes 
AuthorDate: Wed, 10 Apr 2019 22:27:25 +0200
Committer:  Borislav Petkov 
CommitDate: Wed, 10 Apr 2019 22:35:47 +0200

overflow.h: Add comment documenting __ab_c_size()

__ab_c_size() is a somewhat opaque name. Document its purpose, and while
at it, rename the parameters to actually match the abc naming.

 [ bp: glued a complete patch from chunks on LKML. ]

Reported-by: Borislav Petkov 
Signed-off-by: Rasmus Villemoes 
Acked-by: Kees Cook 
Cc: Matthew Wilcox 
Link: https://lkml.kernel.org/r/20190405045711.30339-1...@alien8.de
---
 include/linux/overflow.h | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index 40b48e2133cb..6534a727cadb 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -278,11 +278,15 @@ static inline __must_check size_t array3_size(size_t a, 
size_t b, size_t c)
return bytes;
 }
 
-static inline __must_check size_t __ab_c_size(size_t n, size_t size, size_t c)
+/*
+ * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for
+ * struct_size() below.
+ */
+static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
 {
size_t bytes;
 
-   if (check_mul_overflow(n, size, &bytes))
+   if (check_mul_overflow(a, b, &bytes))
return SIZE_MAX;
if (check_add_overflow(bytes, c, &bytes))
return SIZE_MAX;


Re: [PATCH] cros_ec: Add trace event to trace EC commands

2019-04-10 Thread Ross Zwisler
On Wed, Apr 10, 2019 at 02:24:08PM -0600, Raul E Rangel wrote:
> This is useful to see which EC commands are being executed and when.
> 
> To enable:
> 
> echo 'cros_ec:*' >> /sys/kernel/debug/tracing/set_event
> 
> Example:
> 
> /* cros_ec_cmd: version: 0, command: GET_VERSION */
> /* cros_ec_cmd: version: 0, command: GET_PROTOCOL_INFO */
> /* cros_ec_cmd: version: 1, command: GET_CMD_VERSIONS */
> /* cros_ec_cmd: version: 1, command: USB_PD_CONTROL */
> 
> Signed-off-by: Raul E Rangel 

Reviewed-by: Ross Zwisler 


[tip:x86/microcode] x86/microcode/intel: Refactor Intel microcode blob loading

2019-04-10 Thread tip-bot for Jann Horn
Commit-ID:  7e94a7b659eefedda82cde97229a26f319fb1182
Gitweb: https://git.kernel.org/tip/7e94a7b659eefedda82cde97229a26f319fb1182
Author: Jann Horn 
AuthorDate: Thu, 4 Apr 2019 13:11:28 +0200
Committer:  Borislav Petkov 
CommitDate: Wed, 10 Apr 2019 22:40:25 +0200

x86/microcode/intel: Refactor Intel microcode blob loading

Change generic_load_microcode() to use the iov_iter API instead of a
clumsy open-coded version which has to pay attention to __user data
or kernel data, depending on the loading method. This allows to avoid
explicit casting between user and kernel pointers.

Because the iov_iter API makes it hard to read the same location twice,
as a side effect, also fix a double-read of the microcode header (which
could e.g. lead to out-of-bounds reads in microcode_sanity_check()).

Not that it matters much, only root is allowed to load microcode
anyway...

 [ bp: Massage a bit, sort function-local variables. ]

Signed-off-by: Jann Horn 
Signed-off-by: Borislav Petkov 
Reviewed-by: Thomas Gleixner 
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/2019040428.131157-1-ja...@google.com
---
 arch/x86/kernel/cpu/microcode/intel.c | 71 ++-
 1 file changed, 36 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/intel.c 
b/arch/x86/kernel/cpu/microcode/intel.c
index 16936a24795c..a44bdbe7c55e 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -861,32 +862,33 @@ out:
return ret;
 }
 
-static enum ucode_state generic_load_microcode(int cpu, void *data, size_t 
size,
-   int (*get_ucode_data)(void *, const void *, 
size_t))
+static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
 {
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-   u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
-   int new_rev = uci->cpu_sig.rev;
-   unsigned int leftover = size;
unsigned int curr_mc_size = 0, new_mc_size = 0;
-   unsigned int csig, cpf;
enum ucode_state ret = UCODE_OK;
+   int new_rev = uci->cpu_sig.rev;
+   u8 *new_mc = NULL, *mc = NULL;
+   unsigned int csig, cpf;
 
-   while (leftover) {
+   while (iov_iter_count(iter)) {
struct microcode_header_intel mc_header;
-   unsigned int mc_size;
+   unsigned int mc_size, data_size;
+   u8 *data;
 
-   if (leftover < sizeof(mc_header)) {
-   pr_err("error! Truncated header in microcode data 
file\n");
+   if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) {
+   pr_err("error! Truncated or inaccessible header in 
microcode data file\n");
break;
}
 
-   if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
-   break;
-
mc_size = get_totalsize(&mc_header);
-   if (!mc_size || mc_size > leftover) {
-   pr_err("error! Bad data in microcode data file\n");
+   if (mc_size < sizeof(mc_header)) {
+   pr_err("error! Bad data in microcode data file 
(totalsize too small)\n");
+   break;
+   }
+   data_size = mc_size - sizeof(mc_header);
+   if (data_size > iov_iter_count(iter)) {
+   pr_err("error! Bad data in microcode data file 
(truncated file?)\n");
break;
}
 
@@ -899,7 +901,9 @@ static enum ucode_state generic_load_microcode(int cpu, 
void *data, size_t size,
curr_mc_size = mc_size;
}
 
-   if (get_ucode_data(mc, ucode_ptr, mc_size) ||
+   memcpy(mc, &mc_header, sizeof(mc_header));
+   data = mc + sizeof(mc_header);
+   if (!copy_from_iter_full(data, data_size, iter) ||
microcode_sanity_check(mc, 1) < 0) {
break;
}
@@ -914,14 +918,11 @@ static enum ucode_state generic_load_microcode(int cpu, 
void *data, size_t size,
mc = NULL;  /* trigger new vmalloc */
ret = UCODE_NEW;
}
-
-   ucode_ptr += mc_size;
-   leftover  -= mc_size;
}
 
vfree(mc);
 
-   if (leftover) {
+   if (iov_iter_count(iter)) {
vfree(new_mc);
return UCODE_ERROR;
}
@@ -945,12 +946,6 @@ static enum ucode_state generic_load_microcode(int cpu, 
void *data, size_t size,
return ret;
 }
 
-static int get_ucode_fw(void *to, const void *from, size_t n)
-{
-   memcpy(to, from, n);
-   return 0;
-}
-
 static bool is_blacklisted(unsigned int cpu)
 {
struct cpuinfo_

<    1   2   3   4   5   6   7   8   9   >