Re: [PATCH v16 14/15] mtd: spi-nor: spansion: add support for Cypress Semper flash

2020-11-06 Thread Vignesh Raghavendra
Hi,

[...]

On 10/5/20 9:01 PM, Pratyush Yadav wrote:
> +static int spi_nor_cypress_octal_dtr_enable(struct spi_nor *nor, bool enable)
> +{
> + struct spi_mem_op op;
> + u8 *buf = nor->bouncebuf;
> + int ret;
> +
> + if (enable) {
> + /* Use 24 dummy cycles for memory array reads. */
> + ret = spi_nor_write_enable(nor);
> + if (ret)
> + return ret;
> +
> + *buf = SPINOR_REG_CYPRESS_CFR2V_MEMLAT_11_24;
> + op = (struct spi_mem_op)
> + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WR_ANY_REG, 1),
> +            SPI_MEM_OP_ADDR(3, SPINOR_REG_CYPRESS_CFR2V, 1),
> +            SPI_MEM_OP_NO_DUMMY,
> +            SPI_MEM_OP_DATA_OUT(1, buf, 1));
> +
> + ret = spi_mem_exec_op(nor->spimem, &op);
> + if (ret)
> + return ret;
> +
> + ret = spi_nor_wait_till_ready(nor);
> + if (ret)
> + return ret;
> +
> + nor->read_dummy = 24;
> + }
> +
> + /* Set/unset the octal and DTR enable bits. */
> + ret = spi_nor_write_enable(nor);
> + if (ret)
> + return ret;
> +
> + if (enable)
> + *buf = SPINOR_REG_CYPRESS_CFR5V_OCT_DTR_EN;
> + else
> + *buf = SPINOR_REG_CYPRESS_CFR5V_OCT_DTR_DS;
> +
> + op = (struct spi_mem_op)
> + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WR_ANY_REG, 1),
> +            SPI_MEM_OP_ADDR(enable ? 3 : 4, SPINOR_REG_CYPRESS_CFR5V, 1),
> +            SPI_MEM_OP_NO_DUMMY,
> +            SPI_MEM_OP_DATA_OUT(1, buf, 1));
> +
> + if (!enable)
> + spi_nor_spimem_setup_op(nor, &op, SNOR_PROTO_8_8_8_DTR);
> +
> + ret = spi_mem_exec_op(nor->spimem, &op);
> + if (ret)
> + return ret;
> +
> + /* Give some time for the mode change to take place. */
> + usleep_range(1000, 1500);
> +

This delay is no longer needed, right? I can drop it while applying, if
you confirm.

Tudor: Could you provide your R-by?

Regards
Vignesh


[PATCH] netfilter: conntrack: fix -Wformat

2020-11-06 Thread Nick Desaulniers
Clang is more aggressive about -Wformat warnings when the format flag
specifies a type smaller than the parameter. Fixes 8 instances of:

warning: format specifies type 'unsigned short' but the argument has
type 'int' [-Wformat]
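
For context, a minimal sketch of the pattern involved (illustrative
only; on configurations where ntohs() evaluates to int, pairing it with
%hu is what trips clang, and casting back to a 16-bit type silences
the warning):

	#include <linux/seq_file.h>
	#include <asm/byteorder.h>

	static void show_port(struct seq_file *s, __be16 port)
	{
		seq_printf(s, "sport=%hu ", ntohs(port));         /* -Wformat */
		seq_printf(s, "sport=%hu ", (__be16)ntohs(port)); /* clean */
	}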

Link: https://github.com/ClangBuiltLinux/linux/issues/378
Signed-off-by: Nick Desaulniers 
---
 net/netfilter/nf_conntrack_standalone.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/netfilter/nf_conntrack_standalone.c 
b/net/netfilter/nf_conntrack_standalone.c
index 46c5557c1fec..c5aa45c38eb2 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -50,38 +50,38 @@ print_tuple(struct seq_file *s, const struct 
nf_conntrack_tuple *tuple,
 
switch (l4proto->l4proto) {
case IPPROTO_ICMP:
-   seq_printf(s, "type=%u code=%u id=%u ",
+   seq_printf(s, "type=%u code=%u id=%hu ",
   tuple->dst.u.icmp.type,
   tuple->dst.u.icmp.code,
-  ntohs(tuple->src.u.icmp.id));
+  (__be16)ntohs(tuple->src.u.icmp.id));
break;
case IPPROTO_TCP:
seq_printf(s, "sport=%hu dport=%hu ",
-  ntohs(tuple->src.u.tcp.port),
-  ntohs(tuple->dst.u.tcp.port));
+  (__be16)ntohs(tuple->src.u.tcp.port),
+  (__be16)ntohs(tuple->dst.u.tcp.port));
break;
case IPPROTO_UDPLITE:
case IPPROTO_UDP:
seq_printf(s, "sport=%hu dport=%hu ",
-  ntohs(tuple->src.u.udp.port),
-  ntohs(tuple->dst.u.udp.port));
+  (__be16)ntohs(tuple->src.u.udp.port),
+  (__be16)ntohs(tuple->dst.u.udp.port));
 
break;
case IPPROTO_DCCP:
seq_printf(s, "sport=%hu dport=%hu ",
-  ntohs(tuple->src.u.dccp.port),
-  ntohs(tuple->dst.u.dccp.port));
+  (__be16)ntohs(tuple->src.u.dccp.port),
+  (__be16)ntohs(tuple->dst.u.dccp.port));
break;
case IPPROTO_SCTP:
seq_printf(s, "sport=%hu dport=%hu ",
-  ntohs(tuple->src.u.sctp.port),
-  ntohs(tuple->dst.u.sctp.port));
+  (__be16)ntohs(tuple->src.u.sctp.port),
+  (__be16)ntohs(tuple->dst.u.sctp.port));
break;
case IPPROTO_ICMPV6:
-   seq_printf(s, "type=%u code=%u id=%u ",
+   seq_printf(s, "type=%u code=%u id=%hu ",
   tuple->dst.u.icmp.type,
   tuple->dst.u.icmp.code,
-  ntohs(tuple->src.u.icmp.id));
+  (__be16)ntohs(tuple->src.u.icmp.id));
break;
case IPPROTO_GRE:
seq_printf(s, "srckey=0x%x dstkey=0x%x ",
-- 
2.29.2.222.g5d2a92d10f8-goog



Re: [PATCH] KVM: PPC: Book3S: Assign boolean values to a bool variable

2020-11-06 Thread Greg Kurz
On Sat,  7 Nov 2020 14:26:22 +0800
xiakaixu1...@gmail.com wrote:

> From: Kaixu Xia 
> 
> Fix the following coccinelle warnings:
> 
> ./arch/powerpc/kvm/book3s_xics.c:476:3-15: WARNING: Assignment of 0/1 to bool 
> variable
> ./arch/powerpc/kvm/book3s_xics.c:504:3-15: WARNING: Assignment of 0/1 to bool 
> variable
> 
> Reported-by: Tosk Robot 
> Signed-off-by: Kaixu Xia 
> ---

Reviewed-by: Greg Kurz 

>  arch/powerpc/kvm/book3s_xics.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
> index 5fee5a11550d..303e3cb096db 100644
> --- a/arch/powerpc/kvm/book3s_xics.c
> +++ b/arch/powerpc/kvm/book3s_xics.c
> @@ -473,7 +473,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, 
> struct kvmppc_icp *icp,
>   arch_spin_unlock(&ics->lock);
>   local_irq_restore(flags);
>   new_irq = reject;
> - check_resend = 0;
> + check_resend = false;
>   goto again;
>   }
>   } else {
> @@ -501,7 +501,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, 
> struct kvmppc_icp *icp,
>   state->resend = 0;
>   arch_spin_unlock(&ics->lock);
>   local_irq_restore(flags);
> - check_resend = 0;
> + check_resend = false;
>   goto again;
>   }
>   }



[PATCH v3 bpf] trace: bpf: Fix passing zero to PTR_ERR()

2020-11-06 Thread Wang Qing
There is a bug where zero can be passed to PTR_ERR() and returned:
PTR_ERR(NULL) evaluates to 0, which callers treat as success.
Fix the smatch error by returning -EINVAL for the NULL case.
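
For context, a minimal sketch of the bug class (PTR_ERR() is
essentially a cast, so a NULL pointer yields 0, which callers read as
success):

	#include <linux/err.h>

	static long check_ptr(void *p)
	{
		if (IS_ERR_OR_NULL(p))
			/* map NULL to a real errno instead of 0 */
			return IS_ERR(p) ? PTR_ERR(p) : -EINVAL;
		return 0;
	}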

Signed-off-by: Wang Qing 
---
 kernel/trace/bpf_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4517c8b..5113fd4
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1198,7 +1198,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, 
u32 btf_ptr_size,
*btf = bpf_get_btf_vmlinux();
 
if (IS_ERR_OR_NULL(*btf))
-   return PTR_ERR(*btf);
+   return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;
 
if (ptr->type_id > 0)
*btf_id = ptr->type_id;
-- 
2.7.4



Re: [PATCH v4 1/4] dt-bindings: usb: add rk3328 dwc3 docs

2020-11-06 Thread Felipe Balbi

Hi,

Lindsey Stanpoor  writes:
> On Wed, Sep 2, 2020 at 11:12 AM  wrote:
>>
>> From: Cameron Nemo 
>>
>> Document compatible for dwc3 on the Rockchip rk3328 platform.
>
> Hi all,
>
> Wanted to give this patch submission a gentle ping.
>
> Rob Herring acked the documentation changes, but I have not heard
> anything
> from the USB or Rockchip maintainers. This patchset would facilitate USB3
> support for Rockchip rk3328 devices like the Pine Rock64.
>
> If there is anything I can do to help move this along, please let me know.

Sorry, it had fallen through the cracks. It's now in my testing/next.

-- 
balbi



[GIT PULL] RISC-V Fixes for 5.10-rc3

2020-11-06 Thread Palmer Dabbelt
The following changes since commit 3650b228f83adda7e5ee532e2b90429c03f7b9ec:

  Linux 5.10-rc1 (2020-10-25 15:14:11 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git 
tags/riscv-for-linus-5.10-rc3

for you to fetch changes up to c2c81bb2f69138f902e1a58d3bef6ad97fb8a92c:

  RISC-V: Fix the VDSO symbol generaton for binutils-2.35+ (2020-11-06 00:03:48 
-0800)


RISC-V Fixes for 5.10-rc3

* An SPDX comment style fix.
* A fix to ignore memory that is unusable.
* A fix to avoid setting a kernel text offset for the !MMU kernels, where
  skipping the first page of memory is both unnecessary and costly.
* A fix to avoid passing the flag bits in satp to pfn_to_virt().
* A fix to __put_kernel_nofault, where we had the arguments to
  __put_user_nocheck reversed.
* A workaround for a bug in the FU540 to avoid triggering PMP issues during
  early boot.
* A change to how we pull symbols out of the vDSO.  The old mechanism was
  removed from binutils-2.35 (and has been backported to Debian's 2.34).


Anup Patel (1):
  RISC-V: Use non-PGD mappings for early DTB access

Atish Patra (1):
  RISC-V: Remove any memblock representing unusable memory area

Changbin Du (1):
  riscv: uaccess: fix __put_kernel_nofault()

Liu Shaohua (1):
  riscv: fix pfn_to_virt err in do_page_fault().

Palmer Dabbelt (1):
  RISC-V: Fix the VDSO symbol generaton for binutils-2.35+

Ryan Kosta (1):
  risc-v: kernel: ftrace: Fixes improper SPDX comment style

Sean Anderson (1):
  riscv: Set text_offset correctly for M-Mode

 arch/riscv/include/asm/uaccess.h  |  2 +-
 arch/riscv/kernel/ftrace.c|  2 +-
 arch/riscv/kernel/head.S  |  5 +
 arch/riscv/kernel/vdso/.gitignore |  1 +
 arch/riscv/kernel/vdso/Makefile   | 18 +-
 arch/riscv/kernel/vdso/so2s.sh|  6 ++
 arch/riscv/mm/fault.c |  4 +++-
 arch/riscv/mm/init.c  | 32 +---
 8 files changed, 47 insertions(+), 23 deletions(-)
 create mode 100755 arch/riscv/kernel/vdso/so2s.sh


Re: [V2] trace: Fix passing zero to PTR_ERR()

2020-11-06 Thread Yonghong Song
On 11/6/20 10:34 PM, Wang Qing wrote:

There is a bug where zero can be passed to PTR_ERR() and returned:
PTR_ERR(NULL) evaluates to 0, which callers treat as success.
Fix the smatch error by returning -EINVAL for the NULL case.

Signed-off-by: Wang Qing 


For clarity, the subject probably should be
  bpf: Fix passing zero to PTR_ERR()
to indicate this is a bpf related fix. The tag should
be something like
  [PATCH bpf v2] or [PATCH v2 bpf]
depending on your preference, to indicate this is for bpf tree.

If another version is sent, the above "v2" should change to "v3".


---
  kernel/trace/bpf_trace.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4517c8b..5113fd4
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1198,7 +1198,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, 
u32 btf_ptr_size,
*btf = bpf_get_btf_vmlinux();
  
  	if (IS_ERR_OR_NULL(*btf))

-   return PTR_ERR(*btf);
+   return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;
  
  	if (ptr->type_id > 0)

*btf_id = ptr->type_id;



[PATCH] Kbuild: enable -Wfallthrough for clang

2020-11-06 Thread Nick Desaulniers
Partial revert of commit e2079e93f562 ("kbuild: Do not enable
-Wimplicit-fallthrough for clang for now").

This has been fixed up over time thanks to the addition of the
"fallthrough" pseudo-keyword in commit 294f69e662d1
("compiler_attributes.h: Add 'fallthrough' pseudo keyword for
switch/case use")
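
For reference, a minimal example of the pseudo-keyword in use (a sketch
with hypothetical names; "fallthrough" expands to
__attribute__((__fallthrough__)) where supported, and to a no-op
otherwise):

	switch (event) {
	case EV_PREPARE:
		prepare();
		fallthrough;	/* intentional: prepare, then commit */
	case EV_COMMIT:
		commit();
		break;
	}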

Link: https://github.com/ClangBuiltLinux/linux/issues/236
Signed-off-by: Nick Desaulniers 
---
 Makefile | 5 -
 1 file changed, 5 deletions(-)

diff --git a/Makefile b/Makefile
index f353886dbf44..c1c61c276f60 100644
--- a/Makefile
+++ b/Makefile
@@ -777,11 +777,6 @@ else
 # These warnings generated too much noise in a regular build.
 # Use make W=1 to enable them (see scripts/Makefile.extrawarn)
 KBUILD_CFLAGS += -Wno-unused-but-set-variable
-
-# Warn about unmarked fall-throughs in switch statement.
-# Disabled for clang while comment to attribute conversion happens and
-# https://github.com/ClangBuiltLinux/linux/issues/636 is discussed.
-KBUILD_CFLAGS += $(call cc-option,-Wimplicit-fallthrough,)
 endif
 
 KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
-- 
2.29.2.222.g5d2a92d10f8-goog



[PATCH v7] mm/zswap: move to use crypto_acomp API for hardware acceleration

2020-11-06 Thread Barry Song
Right now, all new ZIP drivers are adapted to the crypto_acomp APIs
rather than the legacy crypto_comp APIs. Traditional ZIP drivers like
lz4, lzo etc. have also been wrapped into acomp via the scomp backend.
But zswap.c is still using the old APIs, which means zswap won't be
able to work with any new ZIP drivers in the kernel.

This patch moves to the crypto_acomp APIs to fix the disconnected
bridge between new ZIP drivers and zswap. It is probably the first real
user of acomp, but perhaps not a good example of how multiple acomp
requests can be executed in parallel on one acomp instance, since
frontswap loads and stores pages one by one synchronously:
swap_writepage() depends on the completion of frontswap_store() to
decide whether it should call __swap_writepage() to swap to disk.

However, this patch creates multiple acomp instances, so multiple
threads running on multiple different cpus can actually do
(de)compression in parallel, leveraging the power of multiple ZIP
hardware queues. This is also consistent with frontswap's page
management model.

The old zswap code ran in atomic context to avoid race conditions while
shared resources like zswap_dstmem were accessed. Since acomp can
sleep, a per-cpu mutex is used to replace the preemption-disable approach.
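
For reference, the general crypto_acomp calling pattern adopted here
looks roughly like this (a sketch, not the exact zswap code; src and
dst are assumed to be pre-initialized scatterlists):

	struct crypto_acomp *acomp = crypto_alloc_acomp("lzo", 0, 0);
	struct acomp_req *req = acomp_request_alloc(acomp);
	DECLARE_CRYPTO_WAIT(wait);
	int ret;

	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &wait);
	acomp_request_set_params(req, &src, &dst, src_len, dst_len);
	/* the request may sleep, hence the per-cpu mutex instead of the
	 * old preemption-disabled critical section */
	ret = crypto_wait_req(crypto_acomp_compress(req), &wait);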

While it is possible to make mm/page_io.c and mm/frontswap.c support
async (de)compression in some way, the entire design requires careful
thought and performance evaluation. As a first step, build the base
with a fixed connection between ZIP drivers and zswap.

Acked-by: Vitaly Wool 
Cc: Luis Claudio R. Goncalves 
Cc: Sebastian Andrzej Siewior 
Cc: Andrew Morton 
Cc: Herbert Xu 
Cc: David S. Miller 
Cc: Mahipal Challa 
Cc: Seth Jennings 
Cc: Dan Streetman 
Cc: Zhou Wang 
Cc: Colin Ian King 
Signed-off-by: Barry Song 
---
 -v7:
 1. Add Acked-by of Vitaly Wool, thanks!
 2. Address the issues pointed out by Sebastian Andrzej Siewior, thanks!
  * remove redundant kmap and move to use sg_set_page;
  * remove the warning if DEBUG_PREEMPTIBLE is enabled by using
raw_cpu_ptr(). 
  * Regarding another code refinement issue, I am still not a big fan of
 a. get_cpu_ptr() for the acomp_ctx   //lock preemption
 b. this_cpu_ptr() for the dstmem and mutex
 c. put_cpu_ptr() for the acomp_ctx  //unlock preemption
 It seems the code looks better if we put all the stuff in a struct
 and get the per-cpu struct once to access them all, rather than
 adding a preemption-disabled context and getting them one by one.

 mm/zswap.c | 183 +
 1 file changed, 137 insertions(+), 46 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index fbb7829..73f04de 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -24,8 +24,10 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -127,9 +129,17 @@ module_param_named(same_filled_pages_enabled, 
zswap_same_filled_pages_enabled,
 * data structures
 **/
 
+struct crypto_acomp_ctx {
+   struct crypto_acomp *acomp;
+   struct acomp_req *req;
+   struct crypto_wait wait;
+   u8 *dstmem;
+   struct mutex *mutex;
+};
+
 struct zswap_pool {
struct zpool *zpool;
-   struct crypto_comp * __percpu *tfm;
+   struct crypto_acomp_ctx __percpu *acomp_ctx;
struct kref kref;
struct list_head list;
struct work_struct release_work;
@@ -388,23 +398,43 @@ static struct zswap_entry *zswap_entry_find_get(struct 
rb_root *root,
 * per-cpu code
 **/
 static DEFINE_PER_CPU(u8 *, zswap_dstmem);
+/*
+ * If users dynamically change the zpool type and compressor at runtime, i.e.
+ * zswap is running, zswap can have more than one zpool on one cpu, but they
+ * are sharing dtsmem. So we need this mutex to be per-cpu.
+ */
+static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
 
 static int zswap_dstmem_prepare(unsigned int cpu)
 {
+   struct mutex *mutex;
u8 *dst;
 
dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
if (!dst)
return -ENOMEM;
 
+   mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
+   if (!mutex) {
+   kfree(dst);
+   return -ENOMEM;
+   }
+
+   mutex_init(mutex);
per_cpu(zswap_dstmem, cpu) = dst;
+   per_cpu(zswap_mutex, cpu) = mutex;
return 0;
 }
 
 static int zswap_dstmem_dead(unsigned int cpu)
 {
+   struct mutex *mutex;
u8 *dst;
 
+   mutex = per_cpu(zswap_mutex, cpu);
+   kfree(mutex);
+   per_cpu(zswap_mutex, cpu) = NULL;
+
dst = per_cpu(zswap_dstmem, cpu);
kfree(dst);
per_cpu(zswap_dstmem, cpu) = NULL;
@@ -415,30 +445,54 @@ static int zswap_dstmem_dead(unsigned int cpu)
 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 {
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
-   struct cryp

[PATCH] net/mlx4: Assign boolean values to a bool variable

2020-11-06 Thread xiakaixu1987
From: Kaixu Xia 

Fix the following coccinelle warnings:

./drivers/net/ethernet/mellanox/mlx4/en_rx.c:687:1-17: WARNING: Assignment of 
0/1 to bool variable

Reported-by: Tosk Robot 
Signed-off-by: Kaixu Xia 
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 502d1b97855c..b0f79a5151cf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -684,7 +684,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct 
mlx4_en_cq *cq, int bud
xdp_prog = rcu_dereference(ring->xdp_prog);
xdp.rxq = &ring->xdp_rxq;
xdp.frame_sz = priv->frag_info[0].frag_stride;
-   doorbell_pending = 0;
+   doorbell_pending = false;
 
/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
 * descriptor offset can be deduced from the CQE index instead of
-- 
2.20.0



[PATCH] fork: fix copy_process(CLONE_PARENT) race with the exiting ->real_parent

2020-11-06 Thread Eddy Wu
current->group_leader->exit_signal may change during copy_process() if
current->real_parent exits. Move the assignment inside tasklist_lock to
avoid the race.
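
For context, an illustrative interleaving of the race being closed (the
reparenting side is simplified):

	/*
	 *  copy_process()                      exiting real_parent
	 *  --------------                      -------------------
	 *  p->exit_signal =
	 *      current->group_leader->exit_signal;
	 *                                      reparenting updates
	 *                                      group_leader->exit_signal
	 *  write_lock_irq(&tasklist_lock);
	 *  p->real_parent = current->real_parent;  <-- stale exit_signal kept
	 *
	 *  Reading exit_signal under tasklist_lock closes this window.
	 */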

Signed-off-by: Eddy Wu 
---
 kernel/fork.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index da8d360fb032..7abda2a888a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2142,14 +2142,9 @@ static __latent_entropy struct task_struct *copy_process(
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
-   p->exit_signal = -1;
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
-   if (clone_flags & CLONE_PARENT)
-   p->exit_signal = current->group_leader->exit_signal;
-   else
-   p->exit_signal = args->exit_signal;
p->group_leader = p;
p->tgid = p->pid;
}
@@ -2193,9 +2188,14 @@ static __latent_entropy struct task_struct *copy_process(
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
+   if (clone_flags & CLONE_THREAD)
+   p->exit_signal = -1;
+   else
+   p->exit_signal = current->group_leader->exit_signal;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
+   p->exit_signal = args->exit_signal;
}
 
klp_copy_process(p);
-- 
2.17.1



[V2] trace: Fix passing zero to PTR_ERR()

2020-11-06 Thread Wang Qing
There is a bug where zero can be passed to PTR_ERR() and returned:
PTR_ERR(NULL) evaluates to 0, which callers treat as success.
Fix the smatch error by returning -EINVAL for the NULL case.

Signed-off-by: Wang Qing 
---
 kernel/trace/bpf_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4517c8b..5113fd4
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1198,7 +1198,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, 
u32 btf_ptr_size,
*btf = bpf_get_btf_vmlinux();
 
if (IS_ERR_OR_NULL(*btf))
-   return PTR_ERR(*btf);
+   return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;
 
if (ptr->type_id > 0)
*btf_id = ptr->type_id;
-- 
2.7.4



[PATCH] scsi: ses: Fix crash caused by kfree an invalid pointer

2020-11-06 Thread Ding Hui
We can get a crash when disconnecting the iSCSI session;
the call trace looks like this:

  [2a00fb70] kfree at 0830e224
  [2a00fba0] ses_intf_remove at 01f200e4
  [2a00fbd0] device_del at 086b6a98
  [2a00fc50] device_unregister at 086b6d58
  [2a00fc70] __scsi_remove_device at 0870608c
  [2a00fca0] scsi_remove_device at 08706134
  [2a00fcc0] __scsi_remove_target at 087062e4
  [2a00fd10] scsi_remove_target at 087064c0
  [2a00fd70] __iscsi_unbind_session at 01c872c4
  [2a00fdb0] process_one_work at 0810f35c
  [2a00fe00] worker_thread at 0810f648
  [2a00fe70] kthread at 08116e98

In ses_intf_add, the components count can be 0, in which case a 0-size
scomp is kcalloc()ed but never saved to edev->component[i].scratch,
because the loop that stores it never runs.

In this situation edev->component[0].scratch is an invalid pointer, and
kfree()ing it in ses_intf_remove_enclosure causes a crash like the one
above. The call trace can also vary in other random cases where kfree
cannot detect the invalid pointer.

We should not use the edev->component[] array when the components count
is 0. We also need to check the index when using the edev->component[]
array in ses_enclosure_data_process.
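
For context, a sketch of the failing pattern (names from
drivers/scsi/ses.c; the loop that saves the scratch pointers never runs
when components == 0):

	scomp = kcalloc(components, sizeof(struct ses_component), GFP_KERNEL);
	...
	for (i = 0; i < components; i++)	/* skipped if components == 0 */
		edev->component[i].scratch = scomp + i;

	/* later, on removal: */
	kfree(edev->component[0].scratch);	/* never assigned -> bad free */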

Tested-by: Zeng Zhicong 
Cc: stable  # 2.6.25+
Signed-off-by: Ding Hui 
---
 drivers/scsi/ses.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c
index c2afba2a5414..f5ef0a91f0eb 100644
--- a/drivers/scsi/ses.c
+++ b/drivers/scsi/ses.c
@@ -477,9 +477,6 @@ static int ses_enclosure_find_by_addr(struct 
enclosure_device *edev,
int i;
struct ses_component *scomp;
 
-   if (!edev->component[0].scratch)
-   return 0;
-
for (i = 0; i < edev->components; i++) {
scomp = edev->component[i].scratch;
if (scomp->addr != efd->addr)
@@ -565,8 +562,10 @@ static void ses_enclosure_data_process(struct 
enclosure_device *edev,
components++,
type_ptr[0],
name);
-   else
+   else if (components < edev->components)
ecomp = &edev->component[components++];
+   else
+   ecomp = ERR_PTR(-EINVAL);
 
if (!IS_ERR(ecomp)) {
if (addl_desc_ptr)
@@ -731,9 +730,11 @@ static int ses_intf_add(struct device *cdev,
buf = NULL;
}
 page2_not_supported:
-   scomp = kcalloc(components, sizeof(struct ses_component), GFP_KERNEL);
-   if (!scomp)
-   goto err_free;
+   if (components > 0) {
+   scomp = kcalloc(components, sizeof(struct ses_component), 
GFP_KERNEL);
+   if (!scomp)
+   goto err_free;
+   }
 
edev = enclosure_register(cdev->parent, dev_name(&sdev->sdev_gendev),
  components, &ses_enclosure_callbacks);
@@ -813,7 +814,8 @@ static void ses_intf_remove_enclosure(struct scsi_device 
*sdev)
kfree(ses_dev->page2);
kfree(ses_dev);
 
-   kfree(edev->component[0].scratch);
+   if (edev->components > 0)
+   kfree(edev->component[0].scratch);
 
put_device(&edev->edev);
enclosure_unregister(edev);
-- 
2.17.1



[PATCH net-next 06/11] net: hns3: add ethtool priv-flag for DIM

2020-11-06 Thread Huazhong Tan
Add a control private flag in ethtool for enable/disable
DIM feature.

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h|  7 +++
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c|  1 +
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 71 ++
 3 files changed, 79 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h 
b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index f9d4d23..18b3e43 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -716,6 +716,11 @@ struct hnae3_roce_private_info {
 #define HNAE3_UPE  (HNAE3_USER_UPE | HNAE3_OVERFLOW_UPE)
 #define HNAE3_MPE  (HNAE3_USER_MPE | HNAE3_OVERFLOW_MPE)
 
+enum hnae3_pflag {
+   HNAE3_PFLAG_DIM_ENABLE,
+   HNAE3_PFLAG_MAX
+};
+
 struct hnae3_handle {
struct hnae3_client *client;
struct pci_dev *pdev;
@@ -738,6 +743,8 @@ struct hnae3_handle {
 
/* Network interface message level enabled bits */
u32 msg_enable;
+
+   unsigned long priv_flags;
 };
 
 #define hnae3_set_field(origin, mask, shift, val) \
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 9e895b9..a567557 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -4246,6 +4246,7 @@ static int hns3_client_init(struct hnae3_handle *handle)
 
set_bit(HNS3_NIC_STATE_INITED, &priv->state);
set_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state);
+   handle->priv_flags |= BIT(HNAE3_PFLAG_DIM_ENABLE);
 
if (netif_msg_drv(handle))
hns3_info_show(priv);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 30ffaaf..427b72c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -18,6 +18,11 @@ struct hns3_sfp_type {
u8 ext_type;
 };
 
+struct hns3_pflag_desc {
+   char name[ETH_GSTRING_LEN];
+   void (*handler)(struct net_device *netdev, bool enable);
+};
+
 /* tqp related stats */
 #define HNS3_TQP_STAT(_string, _member){   \
.stats_string = _string,\
@@ -59,6 +64,8 @@ static const struct hns3_stats hns3_rxq_stats[] = {
HNS3_TQP_STAT("non_reuse_pg", non_reuse_pg),
 };
 
+#define HNS3_PRIV_FLAGS_LEN ARRAY_SIZE(hns3_priv_flags)
+
 #define HNS3_RXQ_STATS_COUNT ARRAY_SIZE(hns3_rxq_stats)
 
 #define HNS3_TQP_STATS_COUNT (HNS3_TXQ_STATS_COUNT + HNS3_RXQ_STATS_COUNT)
@@ -394,6 +401,26 @@ static void hns3_self_test(struct net_device *ndev,
netif_dbg(h, drv, ndev, "self test end\n");
 }
 
+static void hns3_update_state(struct net_device *netdev,
+ enum hns3_nic_state state, bool enable)
+{
+   struct hns3_nic_priv *priv = netdev_priv(netdev);
+
+   if (enable)
+   set_bit(state, &priv->state);
+   else
+   clear_bit(state, &priv->state);
+}
+
+static void hns3_update_dim_state(struct net_device *netdev, bool enable)
+{
+   hns3_update_state(netdev, HNS3_NIC_STATE_DIM_ENABLE, enable);
+}
+
+static const struct hns3_pflag_desc hns3_priv_flags[HNAE3_PFLAG_MAX] = {
+   { "dim_enable", hns3_update_dim_state },
+};
+
 static int hns3_get_sset_count(struct net_device *netdev, int stringset)
 {
struct hnae3_handle *h = hns3_get_handle(netdev);
@@ -410,6 +437,9 @@ static int hns3_get_sset_count(struct net_device *netdev, 
int stringset)
case ETH_SS_TEST:
return ops->get_sset_count(h, stringset);
 
+   case ETH_SS_PRIV_FLAGS:
+   return HNAE3_PFLAG_MAX;
+
default:
return -EOPNOTSUPP;
}
@@ -463,6 +493,7 @@ static void hns3_get_strings(struct net_device *netdev, u32 
stringset, u8 *data)
struct hnae3_handle *h = hns3_get_handle(netdev);
const struct hnae3_ae_ops *ops = h->ae_algo->ops;
char *buff = (char *)data;
+   int i;
 
if (!ops->get_strings)
return;
@@ -475,6 +506,13 @@ static void hns3_get_strings(struct net_device *netdev, 
u32 stringset, u8 *data)
case ETH_SS_TEST:
ops->get_strings(h, stringset, data);
break;
+   case ETH_SS_PRIV_FLAGS:
+   for (i = 0; i < HNS3_PRIV_FLAGS_LEN; i++) {
+   snprintf(buff, ETH_GSTRING_LEN, "%s",
+hns3_priv_flags[i].name);
+   buff += ETH_GSTRING_LEN;
+   }
+   break;
default:
break;
}
@@ -1516,6 +1554,35 @@ static int hns3_get_module_eeprom(struct net_device 
*netdev,
return ops->get_module_eeprom(handle, ee->offset, ee->len, data);
 }
 
+static u32 hns3_get_priv_flags(struct net_device *netdev)
+{
+

[PATCH net-next 04/11] net: hns3: rename gl_adapt_enable in struct hns3_enet_coalesce

2020-11-06 Thread Huazhong Tan
Besides GL (Gap Limiting), QL (Quantity Limiting) can also be modified
dynamically when DIM is supported, so rename gl_adapt_enable to
adapt_enable in struct hns3_enet_coalesce.

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 12 ++--
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h|  2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c |  8 
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 2813fe5..999a2aa 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -211,8 +211,8 @@ void hns3_set_vector_coalesce_rl(struct 
hns3_enet_tqp_vector *tqp_vector,
 * GL and RL(Rate Limiter) are 2 ways to acheive interrupt coalescing
 */
 
-   if (rl_reg > 0 && !tqp_vector->tx_group.coal.gl_adapt_enable &&
-   !tqp_vector->rx_group.coal.gl_adapt_enable)
+   if (rl_reg > 0 && !tqp_vector->tx_group.coal.adapt_enable &&
+   !tqp_vector->rx_group.coal.adapt_enable)
/* According to the hardware, the range of rl_reg is
 * 0-59 and the unit is 4.
 */
@@ -273,8 +273,8 @@ static void hns3_vector_coalesce_init(struct 
hns3_enet_tqp_vector *tqp_vector,
 *
 * Default: enable interrupt coalescing self-adaptive and GL
 */
-   tx_coal->gl_adapt_enable = 1;
-   rx_coal->gl_adapt_enable = 1;
+   tx_coal->adapt_enable = 1;
+   rx_coal->adapt_enable = 1;
 
tx_coal->int_gl = HNS3_INT_GL_50K;
rx_coal->int_gl = HNS3_INT_GL_50K;
@@ -3384,14 +3384,14 @@ static void hns3_update_new_int_gl(struct 
hns3_enet_tqp_vector *tqp_vector)
tqp_vector->last_jiffies + msecs_to_jiffies(1000)))
return;
 
-   if (rx_group->coal.gl_adapt_enable) {
+   if (rx_group->coal.adapt_enable) {
rx_update = hns3_get_new_int_gl(rx_group);
if (rx_update)
hns3_set_vector_coalesce_rx_gl(tqp_vector,
   rx_group->coal.int_gl);
}
 
-   if (tx_group->coal.gl_adapt_enable) {
+   if (tx_group->coal.adapt_enable) {
tx_update = hns3_get_new_int_gl(tx_group);
if (tx_update)
hns3_set_vector_coalesce_tx_gl(tqp_vector,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 4651ad1..8d33652 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -436,7 +436,7 @@ struct hns3_enet_coalesce {
u16 int_gl;
u16 int_ql;
u16 int_ql_max;
-   u8 gl_adapt_enable:1;
+   u8 adapt_enable:1;
u8 ql_enable:1;
u8 unit_1us:1;
enum hns3_flow_level_range flow_level;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 8d5c194..30ffaaf 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -1105,9 +1105,9 @@ static int hns3_get_coalesce_per_queue(struct net_device 
*netdev, u32 queue,
rx_vector = priv->ring[queue_num + queue].tqp_vector;
 
cmd->use_adaptive_tx_coalesce =
-   tx_vector->tx_group.coal.gl_adapt_enable;
+   tx_vector->tx_group.coal.adapt_enable;
cmd->use_adaptive_rx_coalesce =
-   rx_vector->rx_group.coal.gl_adapt_enable;
+   rx_vector->rx_group.coal.adapt_enable;
 
cmd->tx_coalesce_usecs = tx_vector->tx_group.coal.int_gl;
cmd->rx_coalesce_usecs = rx_vector->rx_group.coal.int_gl;
@@ -1268,9 +1268,9 @@ static void hns3_set_coalesce_per_queue(struct net_device 
*netdev,
tx_vector = priv->ring[queue].tqp_vector;
rx_vector = priv->ring[queue_num + queue].tqp_vector;
 
-   tx_vector->tx_group.coal.gl_adapt_enable =
+   tx_vector->tx_group.coal.adapt_enable =
cmd->use_adaptive_tx_coalesce;
-   rx_vector->rx_group.coal.gl_adapt_enable =
+   rx_vector->rx_group.coal.adapt_enable =
cmd->use_adaptive_rx_coalesce;
 
tx_vector->tx_group.coal.int_gl = cmd->tx_coalesce_usecs;
-- 
2.7.4



[PATCH net-next 08/11] net: hns3: add a check for ethtool priv-flag interface

2020-11-06 Thread Huazhong Tan
Add a check to hns3_set_priv_flags(), since a private flag should not
be modified when the corresponding capability is unsupported.

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h|  1 +
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c|  1 +
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 19 +++
 3 files changed, 21 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h 
b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 18b3e43..3642740 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -744,6 +744,7 @@ struct hnae3_handle {
/* Network interface message level enabled bits */
u32 msg_enable;
 
+   unsigned long supported_pflags;
unsigned long priv_flags;
 };
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index f686723..c30cf9e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -4152,6 +4152,7 @@ static void hns3_state_init(struct hnae3_handle *handle)
set_bit(HNS3_NIC_STATE_INITED, &priv->state);
set_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state);
handle->priv_flags |= BIT(HNAE3_PFLAG_DIM_ENABLE);
+   set_bit(HNAE3_PFLAG_DIM_ENABLE, &handle->supported_pflags);
 }
 
 static int hns3_client_init(struct hnae3_handle *handle)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 427b72c..6904c0a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -1561,12 +1561,31 @@ static u32 hns3_get_priv_flags(struct net_device 
*netdev)
return handle->priv_flags;
 }
 
+static int hns3_check_priv_flags(struct hnae3_handle *h, u32 changed)
+{
+   u32 i;
+
+   for (i = 0; i < HNAE3_PFLAG_MAX; i++)
+   if ((changed & BIT(i)) && !test_bit(i, &h->supported_pflags)) {
+   netdev_err(h->netdev, "%s is unsupported\n",
+  hns3_priv_flags[i].name);
+   return -EOPNOTSUPP;
+   }
+
+   return 0;
+}
+
 static int hns3_set_priv_flags(struct net_device *netdev, u32 pflags)
 {
struct hnae3_handle *handle = hns3_get_handle(netdev);
u32 changed = pflags ^ handle->priv_flags;
+   int ret;
u32 i;
 
+   ret = hns3_check_priv_flags(handle, changed);
+   if (ret)
+   return ret;
+
for (i = 0; i < HNAE3_PFLAG_MAX; i++) {
if (changed & BIT(i)) {
bool enable = !(handle->priv_flags & BIT(i));
-- 
2.7.4



[PATCH net-next 10/11] net: hns3: add ethtool priv-flag for EQ/CQ

2020-11-06 Thread Huazhong Tan
Add a control private flag in ethtool for switching the EQ/CQ mode.

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h|  2 ++
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 19 --
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h|  2 ++
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 23 ++
 4 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h 
b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 345e8a4..a452874 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -719,6 +719,8 @@ struct hnae3_roce_private_info {
 
 enum hnae3_pflag {
HNAE3_PFLAG_DIM_ENABLE,
+   HNAE3_PFLAG_TX_CQE_MODE,
+   HNAE3_PFLAG_RX_CQE_MODE,
HNAE3_PFLAG_MAX
 };
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index d1243ea..93f7731 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -4144,6 +4144,7 @@ static void hns3_info_show(struct hns3_nic_priv *priv)
 
 static void hns3_state_init(struct hnae3_handle *handle)
 {
+   struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev);
struct net_device *netdev = handle->kinfo.netdev;
struct hns3_nic_priv *priv = netdev_priv(netdev);
 
@@ -4151,10 +4152,24 @@ static void hns3_state_init(struct hnae3_handle *handle)
set_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state);
handle->priv_flags |= BIT(HNAE3_PFLAG_DIM_ENABLE);
set_bit(HNAE3_PFLAG_DIM_ENABLE, &handle->supported_pflags);
+
+   /* device version above V3(include V3), GL can switch CQ/EQ period
+* mode.
+*/
+   if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) {
+   set_bit(HNAE3_PFLAG_TX_CQE_MODE, &handle->supported_pflags);
+   set_bit(HNAE3_PFLAG_RX_CQE_MODE, &handle->supported_pflags);
+   }
+
+   if (priv->tx_cqe_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE)
+   handle->priv_flags |= BIT(HNAE3_PFLAG_TX_CQE_MODE);
+
+   if (priv->rx_cqe_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE)
+   handle->priv_flags |= BIT(HNAE3_PFLAG_RX_CQE_MODE);
 }
 
-static void hns3_set_cq_period_mode(struct hns3_nic_priv *priv,
-   enum dim_cq_period_mode mode, bool is_tx)
+void hns3_set_cq_period_mode(struct hns3_nic_priv *priv,
+enum dim_cq_period_mode mode, bool is_tx)
 {
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev);
struct hnae3_handle *handle = priv->ae_handle;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index c6c082a..ecdb544 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -635,4 +635,6 @@ void hns3_dbg_uninit(struct hnae3_handle *handle);
 void hns3_dbg_register_debugfs(const char *debugfs_dir_name);
 void hns3_dbg_unregister_debugfs(void);
 void hns3_shinfo_pack(struct skb_shared_info *shinfo, __u32 *size);
+void hns3_set_cq_period_mode(struct hns3_nic_priv *priv,
+enum dim_cq_period_mode mode, bool is_tx);
 #endif
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 6904c0a..8de2789 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -417,8 +417,31 @@ static void hns3_update_dim_state(struct net_device 
*netdev, bool enable)
hns3_update_state(netdev, HNS3_NIC_STATE_DIM_ENABLE, enable);
 }
 
+static void hns3_update_cqe_mode(struct net_device *netdev, bool enable, bool 
is_tx)
+{
+   struct hns3_nic_priv *priv = netdev_priv(netdev);
+   enum dim_cq_period_mode mode;
+
+   mode = enable ? DIM_CQ_PERIOD_MODE_START_FROM_CQE :
+   DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+
+   hns3_set_cq_period_mode(priv, mode, is_tx);
+}
+
+static void hns3_update_tx_cqe_mode(struct net_device *netdev, bool enable)
+{
+   hns3_update_cqe_mode(netdev, enable, true);
+}
+
+static void hns3_update_rx_cqe_mode(struct net_device *netdev, bool enable)
+{
+   hns3_update_cqe_mode(netdev, enable, false);
+}
+
 static const struct hns3_pflag_desc hns3_priv_flags[HNAE3_PFLAG_MAX] = {
{ "dim_enable", hns3_update_dim_state },
+   { "tx_cqe_mode",hns3_update_tx_cqe_mode },
+   { "rx_cqe_mode",hns3_update_rx_cqe_mode },
 };
 
 static int hns3_get_sset_count(struct net_device *netdev, int stringset)
-- 
2.7.4



[PATCH net-next 01/11] net: hns3: add support for configuring interrupt quantity limiting

2020-11-06 Thread Huazhong Tan
QL (quantity limiting) means that the hardware supports interrupt
coalescing based on the frame quantity. QL can be configured when
int_ql_max in the device's specification is non-zero, so add support
for configuring it. Also, rename the two coalesce init functions to
fit their purpose.

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 65 --
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h| 13 -
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 43 +-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c|  1 +
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c  |  1 +
 5 files changed, 105 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index a362516..6e08719 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -237,35 +237,68 @@ void hns3_set_vector_coalesce_tx_gl(struct 
hns3_enet_tqp_vector *tqp_vector,
writel(tx_gl_reg, tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET);
 }
 
-static void hns3_vector_gl_rl_init(struct hns3_enet_tqp_vector *tqp_vector,
-  struct hns3_nic_priv *priv)
+void hns3_set_vector_coalesce_tx_ql(struct hns3_enet_tqp_vector *tqp_vector,
+   u32 ql_value)
 {
+   writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_TX_QL_OFFSET);
+}
+
+void hns3_set_vector_coalesce_rx_ql(struct hns3_enet_tqp_vector *tqp_vector,
+   u32 ql_value)
+{
+   writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_RX_QL_OFFSET);
+}
+
+static void hns3_vector_coalesce_init(struct hns3_enet_tqp_vector *tqp_vector,
+ struct hns3_nic_priv *priv)
+{
+   struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev);
+   struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal;
+   struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal;
+
/* initialize the configuration for interrupt coalescing.
 * 1. GL (Interrupt Gap Limiter)
 * 2. RL (Interrupt Rate Limiter)
+* 3. QL (Interrupt Quantity Limiter)
 *
 * Default: enable interrupt coalescing self-adaptive and GL
 */
-   tqp_vector->tx_group.coal.gl_adapt_enable = 1;
-   tqp_vector->rx_group.coal.gl_adapt_enable = 1;
+   tx_coal->gl_adapt_enable = 1;
+   rx_coal->gl_adapt_enable = 1;
+
+   tx_coal->int_gl = HNS3_INT_GL_50K;
+   rx_coal->int_gl = HNS3_INT_GL_50K;
 
-   tqp_vector->tx_group.coal.int_gl = HNS3_INT_GL_50K;
-   tqp_vector->rx_group.coal.int_gl = HNS3_INT_GL_50K;
+   rx_coal->flow_level = HNS3_FLOW_LOW;
+   tx_coal->flow_level = HNS3_FLOW_LOW;
 
-   tqp_vector->rx_group.coal.flow_level = HNS3_FLOW_LOW;
-   tqp_vector->tx_group.coal.flow_level = HNS3_FLOW_LOW;
+   if (ae_dev->dev_specs.int_ql_max) {
+   tx_coal->ql_enable = 1;
+   rx_coal->ql_enable = 1;
+   tx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max;
+   rx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max;
+   tx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG;
+   rx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG;
+   }
 }
 
-static void hns3_vector_gl_rl_init_hw(struct hns3_enet_tqp_vector *tqp_vector,
- struct hns3_nic_priv *priv)
+static void
+hns3_vector_coalesce_init_hw(struct hns3_enet_tqp_vector *tqp_vector,
+struct hns3_nic_priv *priv)
 {
+   struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal;
+   struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal;
struct hnae3_handle *h = priv->ae_handle;
 
-   hns3_set_vector_coalesce_tx_gl(tqp_vector,
-  tqp_vector->tx_group.coal.int_gl);
-   hns3_set_vector_coalesce_rx_gl(tqp_vector,
-  tqp_vector->rx_group.coal.int_gl);
+   hns3_set_vector_coalesce_tx_gl(tqp_vector, tx_coal->int_gl);
+   hns3_set_vector_coalesce_rx_gl(tqp_vector, rx_coal->int_gl);
hns3_set_vector_coalesce_rl(tqp_vector, h->kinfo.int_rl_setting);
+
+   if (tx_coal->ql_enable)
+   hns3_set_vector_coalesce_tx_ql(tqp_vector, tx_coal->int_ql);
+
+   if (rx_coal->ql_enable)
+   hns3_set_vector_coalesce_rx_ql(tqp_vector, rx_coal->int_ql);
 }
 
 static int hns3_nic_set_real_num_queue(struct net_device *netdev)
@@ -3536,7 +3569,7 @@ static int hns3_nic_init_vector_data(struct hns3_nic_priv 
*priv)
 
for (i = 0; i < priv->vector_num; i++) {
tqp_vector = &priv->tqp_vector[i];
-   hns3_vector_gl_rl_init_hw(tqp_vector, priv);
+   hns3_vector_coalesce_init_hw(tqp_vector, priv);
tqp_vector->num_tqps = 0;
}
 
@@ -3632,7 +3665,7 @@ static i

[PATCH net-next 00/11] net: hns3: updates for -next

2020-11-06 Thread Huazhong Tan
There are several updates relating to interrupt coalescing for
the HNS3 ethernet driver.

#1 adds support for QL (quantity limiting, interrupt coalescing
   based on the frame quantity).
#2 adds support for 1us unit GL (gap limiting, interrupt coalescing
   based on the gap time).
#3 queries the maximum value of GL from the firmware instead of
   using a fixed value in the code.
#4 renames gl_adapt_enable in struct hns3_enet_coalesce to fit
   its new usage.
#5 & #6 add support for dynamic interrupt moderation
   and add a control private flag in ethtool.
#7 adds a wrapper function for state initialization.
#8 adds a check for the read-only private flag.
#9 & #10 add support for EQ/CQ configuration and add a control
   private flag in ethtool.
#11 adds debugfs support for interrupt coalescing.

Huazhong Tan (11):
  net: hns3: add support for configuring interrupt quantity limiting
  net: hns3: add support for 1us unit GL configuration
  net: hns3: add support for querying maximum value of GL
  net: hns3: rename gl_adapt_enable in struct hns3_enet_coalesce
  net: hns3: add support for dynamic interrupt moderation
  net: hns3: add ethtool priv-flag for DIM
  net: hns3: add hns3_state_init() to do state initialization
  net: hns3: add a check for ethtool priv-flag interface
  net: hns3: add support for EQ/CQ mode configuration
  net: hns3: add ethtool priv-flag for EQ/CQ
  net: hns3: add debugfs support for interrupt coalesce

 drivers/net/ethernet/hisilicon/Kconfig |   1 +
 drivers/net/ethernet/hisilicon/hns3/hnae3.h|  12 +
 drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 125 ++
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 258 ++---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h|  31 ++-
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 184 ++-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |   8 +
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c|   8 +
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h   |   8 +
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c  |   8 +
 10 files changed, 604 insertions(+), 39 deletions(-)

-- 
2.7.4



[PATCH net-next 07/11] net: hns3: add hns3_state_init() to do state initialization

2020-11-06 Thread Huazhong Tan
To improve the readability and maintainability, add hns3_state_init()
to initialize the state.

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index a567557..f686723 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -4144,6 +4144,16 @@ static void hns3_info_show(struct hns3_nic_priv *priv)
dev_info(priv->dev, "Max mtu size: %u\n", priv->netdev->max_mtu);
 }
 
+static void hns3_state_init(struct hnae3_handle *handle)
+{
+   struct net_device *netdev = handle->kinfo.netdev;
+   struct hns3_nic_priv *priv = netdev_priv(netdev);
+
+   set_bit(HNS3_NIC_STATE_INITED, &priv->state);
+   set_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state);
+   handle->priv_flags |= BIT(HNAE3_PFLAG_DIM_ENABLE);
+}
+
 static int hns3_client_init(struct hnae3_handle *handle)
 {
struct pci_dev *pdev = handle->pdev;
@@ -4244,9 +4254,7 @@ static int hns3_client_init(struct hnae3_handle *handle)
/* MTU range: (ETH_MIN_MTU(kernel default) - 9702) */
netdev->max_mtu = HNS3_MAX_MTU;
 
-   set_bit(HNS3_NIC_STATE_INITED, &priv->state);
-   set_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state);
-   handle->priv_flags |= BIT(HNAE3_PFLAG_DIM_ENABLE);
+   hns3_state_init(handle);
 
if (netif_msg_drv(handle))
hns3_info_show(priv);
-- 
2.7.4



[PATCH net-next 11/11] net: hns3: add debugfs support for interrupt coalesce

2020-11-06 Thread Huazhong Tan
Since the user may need to check the current interrupt coalescing
configuration, add debugfs support for querying this info, which
includes the DIM profile and the coalescing configuration of both
software and hardware.

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 124 +
 1 file changed, 124 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
index a5ebca8..1efeed6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
@@ -12,6 +12,91 @@
 
 static struct dentry *hns3_dbgfs_root;
 
+static ssize_t hns3_dbg_coal_write(struct file *filp, const char __user 
*buffer,
+  size_t count, loff_t *ppos)
+{
+   struct hnae3_handle *h = filp->private_data;
+   struct hns3_nic_priv *priv  = h->priv;
+   struct hns3_enet_tqp_vector *tqp_vector;
+   struct hns3_enet_coalesce *coal;
+   int uncopied_bytes;
+   unsigned int idx;
+   struct dim *dim;
+   char *cmd_buf;
+
+   if (*ppos != 0)
+   return 0;
+
+   if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state)) {
+   dev_err(&h->pdev->dev, "device is not initialized\n");
+   return -EFAULT;
+   }
+
+   cmd_buf = kzalloc(count + 1, GFP_KERNEL);
+   if (!cmd_buf)
+   return -ENOMEM;
+
+   uncopied_bytes = copy_from_user(cmd_buf, buffer, count);
+   if (uncopied_bytes) {
+   kfree(cmd_buf);
+   return -EFAULT;
+   }
+
+   cmd_buf[count] = '\0';
+
+   if (kstrtouint(cmd_buf, 0, &idx))
+   idx = 0;
+
+   if (idx >= priv->vector_num) {
+   dev_err(&h->pdev->dev,
+   "vector index(%u) is out of range(0-%u)\n", idx,
+   priv->vector_num - 1);
+   kfree(cmd_buf);
+   return -EINVAL;
+   }
+
+   tqp_vector = &priv->tqp_vector[idx];
+   coal = &tqp_vector->tx_group.coal;
+   dim = &tqp_vector->tx_group.dim;
+
+   dev_info(&h->pdev->dev, "vector[%u] interrupt coalesce info:\n", idx);
+   dev_info(&h->pdev->dev,
+"TX DIM info state = %d profile_ix = %d mode = %d tune_state = 
%d steps_right = %d steps_left = %d tired = %d\n",
+dim->state, dim->profile_ix, dim->mode, dim->tune_state,
+dim->steps_right, dim->steps_left, dim->tired);
+
+   dev_info(&h->pdev->dev, "TX GL info sw_gl = %u, hw_gl = %u\n",
+coal->int_gl,
+readl(tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET));
+
+   if (coal->ql_enable)
+   dev_info(&h->pdev->dev, "TX QL info sw_ql = %u, hw_ql = %u\n",
+coal->int_ql,
+readl(tqp_vector->mask_addr + 
HNS3_VECTOR_TX_QL_OFFSET));
+
+   coal = &tqp_vector->rx_group.coal;
+   dim = &tqp_vector->rx_group.dim;
+
+   dev_info(&h->pdev->dev,
+"RX dim_info state = %d profile_ix = %d mode = %d tune_state = 
%d steps_right = %d steps_left = %d tired = %d\n",
+dim->state, dim->profile_ix, dim->mode, dim->tune_state,
+dim->steps_right, dim->steps_left, dim->tired);
+
+   dev_info(&h->pdev->dev, "RX GL info sw_gl = %u, hw_gl = %u\n",
+coal->int_gl,
+readl(tqp_vector->mask_addr + HNS3_VECTOR_GL0_OFFSET));
+
+   if (coal->ql_enable)
+   dev_info(&h->pdev->dev, "RX QL info sw_ql = %u, hw_ql = %u\n",
+coal->int_ql,
+readl(tqp_vector->mask_addr + 
HNS3_VECTOR_RX_QL_OFFSET));
+
+   kfree(cmd_buf);
+   cmd_buf = NULL;
+
+   return count;
+}
+
 static int hns3_dbg_queue_info(struct hnae3_handle *h,
   const char *cmd_buf)
 {
@@ -352,6 +437,35 @@ static void hns3_dbg_dev_specs(struct hnae3_handle *h)
dev_info(priv->dev, "MAX INT GL: %u\n", dev_specs->max_int_gl);
 }
 
+static ssize_t hns3_dbg_coal_read(struct file *filp, char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+   int uncopy_bytes;
+   char *buf;
+   int len;
+
+   if (*ppos != 0)
+   return 0;
+
+   if (count < HNS3_DBG_READ_LEN)
+   return -ENOSPC;
+
+   buf = kzalloc(HNS3_DBG_READ_LEN, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   len = scnprintf(buf, HNS3_DBG_READ_LEN, "%s\n",
+   "Please echo index to coal");
+   uncopy_bytes = copy_to_user(buffer, buf, len);
+
+   kfree(buf);
+
+   if (uncopy_bytes)
+   return -EFAULT;
+
+   return (*ppos = len);
+}
+
 static ssize_t hns3_dbg_cmd_read(struct file *filp, char __user *buffer,
 size_t count, loff_t *ppos)
 {
@@ -452,6 +566,13 @@ static const struct file_operations hns3_dbg_c

[PATCH net-next 09/11] net: hns3: add support for EQ/CQ mode configuration

2020-11-06 Thread Huazhong Tan
For devices whose version is V3 or above, the GL can select EQ or CQ
mode, so add support for it.

In CQ mode, the coalescing timer restarts upon a new completion, while
in EQ mode the timer does not restart.
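
For context, an illustrative timeline of the two modes (not code from
the patch):

	/*
	 * EQE mode: the coalescing timer starts at the first event and is
	 *           not restarted by later completions:
	 *               ev --- ev --- ev ------> interrupt after fixed gap
	 * CQE mode: every new completion restarts the timer, so the
	 *           interrupt fires only after a quiet period:
	 *               ev - ev - ev ...(idle)--> interrupt
	 */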

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h|  1 +
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 49 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h|  8 
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c|  1 +
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c  |  1 +
 5 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h 
b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 3642740..345e8a4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -684,6 +684,7 @@ struct hnae3_knic_private_info {
 
u16 int_rl_setting;
enum pkt_hash_types rss_type;
+   void __iomem *io_base;
 };
 
 struct hnae3_roce_private_info {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index c30cf9e..d1243ea 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3653,9 +3653,7 @@ static void hns3_tx_dim_work(struct work_struct *work)
 static void hns3_nic_init_dim(struct hns3_enet_tqp_vector *tqp_vector)
 {
INIT_WORK(&tqp_vector->rx_group.dim.work, hns3_rx_dim_work);
-   tqp_vector->rx_group.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
INIT_WORK(&tqp_vector->tx_group.dim.work, hns3_tx_dim_work);
-   tqp_vector->tx_group.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 }
 
 static int hns3_nic_init_vector_data(struct hns3_nic_priv *priv)
@@ -4155,6 +4153,48 @@ static void hns3_state_init(struct hnae3_handle *handle)
set_bit(HNAE3_PFLAG_DIM_ENABLE, &handle->supported_pflags);
 }
 
+static void hns3_set_cq_period_mode(struct hns3_nic_priv *priv,
+   enum dim_cq_period_mode mode, bool is_tx)
+{
+   struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev);
+   struct hnae3_handle *handle = priv->ae_handle;
+   int i;
+
+   if (is_tx) {
+   priv->tx_cqe_mode = mode;
+
+   for (i = 0; i < priv->vector_num; i++)
+   priv->tqp_vector[i].tx_group.dim.mode = mode;
+   } else {
+   priv->rx_cqe_mode = mode;
+
+   for (i = 0; i < priv->vector_num; i++)
+   priv->tqp_vector[i].rx_group.dim.mode = mode;
+   }
+
+   /* only device version above V3(include V3), GL can switch CQ/EQ
+* period mode.
+*/
+   if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) {
+   u32 new_mode;
+   u64 reg;
+
+   new_mode = (mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE) ?
+   HNS3_CQ_MODE_CQE : HNS3_CQ_MODE_EQE;
+   reg = is_tx ? HNS3_GL1_CQ_MODE_REG : HNS3_GL0_CQ_MODE_REG;
+
+   writel(new_mode, handle->kinfo.io_base + reg);
+   }
+}
+
+static void hns3_cq_period_mode_init(struct hns3_nic_priv *priv,
+enum dim_cq_period_mode tx_mode,
+enum dim_cq_period_mode rx_mode)
+{
+   hns3_set_cq_period_mode(priv, tx_mode, true);
+   hns3_set_cq_period_mode(priv, rx_mode, false);
+}
+
 static int hns3_client_init(struct hnae3_handle *handle)
 {
struct pci_dev *pdev = handle->pdev;
@@ -4220,6 +4260,9 @@ static int hns3_client_init(struct hnae3_handle *handle)
goto out_init_ring;
}
 
+   hns3_cq_period_mode_init(priv, DIM_CQ_PERIOD_MODE_START_FROM_EQE,
+DIM_CQ_PERIOD_MODE_START_FROM_EQE);
+
ret = hns3_init_phy(netdev);
if (ret)
goto out_init_phy;
@@ -4580,6 +4623,8 @@ static int hns3_reset_notify_init_enet(struct 
hnae3_handle *handle)
if (ret)
goto err_uninit_vector;
 
+   hns3_cq_period_mode_init(priv, priv->tx_cqe_mode, priv->rx_cqe_mode);
+
/* the device can work without cpu rmap, only aRFS needs it */
ret = hns3_set_rx_cpu_rmap(netdev);
if (ret)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index eb4e7ef..c6c082a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -188,6 +188,12 @@ enum hns3_nic_state {
 
 #define HNS3_RING_EN_B 0
 
+#define HNS3_GL0_CQ_MODE_REG   0x20d00
+#define HNS3_GL1_CQ_MODE_REG   0x20d04
+#define HNS3_GL2_CQ_MODE_REG   0x20d08
+#define HNS3_CQ_MODE_EQE   1U
+#define HNS3_CQ_MODE_CQE   0U
+
 enum hns3_pkt_l2t_type {
HNS3_L2_TYPE_UNICAST,
  

[PATCH net-next 05/11] net: hns3: add support for dynamic interrupt moderation

2020-11-06 Thread Huazhong Tan
Add dynamic interrupt moderation support for the HNS3 driver.

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/Kconfig  |  1 +
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 87 -
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h |  4 ++
 3 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/Kconfig 
b/drivers/net/ethernet/hisilicon/Kconfig
index 44f9279..fa6025d 100644
--- a/drivers/net/ethernet/hisilicon/Kconfig
+++ b/drivers/net/ethernet/hisilicon/Kconfig
@@ -130,6 +130,7 @@ config HNS3_ENET
default m
depends on 64BIT && PCI
depends on INET
+   select DIMLIB
help
  This selects the Ethernet Driver for Hisilicon Network Subsystem 3 
for hip08
  family of SoCs. This module depends upon HNAE3 driver to access the 
HNAE3
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 999a2aa..9e895b9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -96,6 +96,7 @@ static irqreturn_t hns3_irq_handle(int irq, void *vector)
struct hns3_enet_tqp_vector *tqp_vector = vector;
 
napi_schedule_irqoff(&tqp_vector->napi);
+   tqp_vector->event_cnt++;
 
return IRQ_HANDLED;
 }
@@ -199,6 +200,8 @@ static void hns3_vector_disable(struct hns3_enet_tqp_vector 
*tqp_vector)
 
disable_irq(tqp_vector->vector_irq);
napi_disable(&tqp_vector->napi);
+   cancel_work_sync(&tqp_vector->rx_group.dim.work);
+   cancel_work_sync(&tqp_vector->tx_group.dim.work);
 }
 
 void hns3_set_vector_coalesce_rl(struct hns3_enet_tqp_vector *tqp_vector,
@@ -3401,6 +3404,32 @@ static void hns3_update_new_int_gl(struct 
hns3_enet_tqp_vector *tqp_vector)
tqp_vector->last_jiffies = jiffies;
 }
 
+static void hns3_update_rx_int_coalesce(struct hns3_enet_tqp_vector 
*tqp_vector)
+{
+   struct hns3_enet_ring_group *rx_group = &tqp_vector->rx_group;
+   struct dim_sample sample = {};
+
+   if (!rx_group->coal.adapt_enable)
+   return;
+
+   dim_update_sample(tqp_vector->event_cnt, rx_group->total_packets,
+ rx_group->total_bytes, &sample);
+   net_dim(&rx_group->dim, sample);
+}
+
+static void hns3_update_tx_int_coalesce(struct hns3_enet_tqp_vector 
*tqp_vector)
+{
+   struct hns3_enet_ring_group *tx_group = &tqp_vector->tx_group;
+   struct dim_sample sample = {};
+
+   if (!tx_group->coal.adapt_enable)
+   return;
+
+   dim_update_sample(tqp_vector->event_cnt, tx_group->total_packets,
+ tx_group->total_bytes, &sample);
+   net_dim(&tx_group->dim, sample);
+}
+
 static int hns3_nic_common_poll(struct napi_struct *napi, int budget)
 {
struct hns3_nic_priv *priv = netdev_priv(napi->dev);
@@ -3444,7 +3473,13 @@ static int hns3_nic_common_poll(struct napi_struct 
*napi, int budget)
 
if (napi_complete(napi) &&
likely(!test_bit(HNS3_NIC_STATE_DOWN, &priv->state))) {
-   hns3_update_new_int_gl(tqp_vector);
+   if (test_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state)) {
+   hns3_update_rx_int_coalesce(tqp_vector);
+   hns3_update_tx_int_coalesce(tqp_vector);
+   } else {
+   hns3_update_new_int_gl(tqp_vector);
+   }
+
hns3_mask_vector_irq(tqp_vector, 1);
}
 
@@ -3575,6 +3610,54 @@ static void hns3_nic_set_cpumask(struct hns3_nic_priv 
*priv)
}
 }
 
+static void hns3_rx_dim_work(struct work_struct *work)
+{
+   struct dim *dim = container_of(work, struct dim, work);
+   struct hns3_enet_ring_group *group = container_of(dim,
+   struct hns3_enet_ring_group, dim);
+   struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector;
+   struct dim_cq_moder cur_moder =
+   net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
+
+   hns3_set_vector_coalesce_rx_gl(group->ring->tqp_vector, cur_moder.usec);
+   tqp_vector->rx_group.coal.int_gl = cur_moder.usec;
+
+   if (cur_moder.pkts < tqp_vector->rx_group.coal.int_ql_max) {
+   hns3_set_vector_coalesce_rx_ql(tqp_vector, cur_moder.pkts);
+   tqp_vector->rx_group.coal.int_ql = cur_moder.pkts;
+   }
+
+   dim->state = DIM_START_MEASURE;
+}
+
+static void hns3_tx_dim_work(struct work_struct *work)
+{
+   struct dim *dim = container_of(work, struct dim, work);
+   struct hns3_enet_ring_group *group = container_of(dim,
+   struct hns3_enet_ring_group, dim);
+   struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector;
+   struct dim_cq_moder cur_moder =
+   net_dim_get_tx_moderation(dim->mode, dim->profile_ix);
+
+   hns3_set_vector_coalesce_tx_gl(tqp_vector, cur_moder.usec);
+   tqp_vector->tx_group.coal.int_gl = cur_moder.usec;

[PATCH net-next 03/11] net: hns3: add support for querying maximum value of GL

2020-11-06 Thread Huazhong Tan
For maintainability and compatibility, add support for querying
the maximum value of GL.

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h   |  1 +
 drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c|  1 +
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h   |  1 -
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c| 14 --
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h|  8 
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c   |  6 ++
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h  |  8 
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c |  6 ++
 8 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h 
b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 912c51e..f9d4d23 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -278,6 +278,7 @@ struct hnae3_dev_specs {
u16 rss_ind_tbl_size;
u16 rss_key_size;
u16 int_ql_max; /* max value of interrupt coalesce based on INT_QL */
+   u16 max_int_gl; /* max value of interrupt coalesce based on INT_GL */
u8 max_non_tso_bd_num; /* max BD number of one non-TSO packet */
 };
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
index dc9a857..a5ebca8 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
@@ -349,6 +349,7 @@ static void hns3_dbg_dev_specs(struct hnae3_handle *h)
dev_info(priv->dev, "Desc num per RX queue: %u\n", kinfo->num_rx_desc);
dev_info(priv->dev, "Total number of enabled TCs: %u\n", kinfo->num_tc);
dev_info(priv->dev, "MAX INT QL: %u\n", dev_specs->int_ql_max);
+   dev_info(priv->dev, "MAX INT GL: %u\n", dev_specs->max_int_gl);
 }
 
 static ssize_t hns3_dbg_cmd_read(struct file *filp, char __user *buffer,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index b37635d..4651ad1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -420,7 +420,6 @@ enum hns3_flow_level_range {
HNS3_FLOW_ULTRA = 3,
 };
 
-#define HNS3_INT_GL_MAX0x1FE0
 #define HNS3_INT_GL_50K0x0014
 #define HNS3_INT_GL_20K0x0032
 #define HNS3_INT_GL_18K0x0036
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 128e9ec..8d5c194 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -1130,19 +1130,21 @@ static int hns3_get_coalesce(struct net_device *netdev,
 static int hns3_check_gl_coalesce_para(struct net_device *netdev,
   struct ethtool_coalesce *cmd)
 {
+   struct hnae3_handle *handle = hns3_get_handle(netdev);
+   struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev);
u32 rx_gl, tx_gl;
 
-   if (cmd->rx_coalesce_usecs > HNS3_INT_GL_MAX) {
+   if (cmd->rx_coalesce_usecs > ae_dev->dev_specs.max_int_gl) {
netdev_err(netdev,
-  "Invalid rx-usecs value, rx-usecs range is 0-%d\n",
-  HNS3_INT_GL_MAX);
+  "invalid rx-usecs value, rx-usecs range is 0-%u\n",
+  ae_dev->dev_specs.max_int_gl);
return -EINVAL;
}
 
-   if (cmd->tx_coalesce_usecs > HNS3_INT_GL_MAX) {
+   if (cmd->tx_coalesce_usecs > ae_dev->dev_specs.max_int_gl) {
netdev_err(netdev,
-  "Invalid tx-usecs value, tx-usecs range is 0-%d\n",
-  HNS3_INT_GL_MAX);
+  "invalid tx-usecs value, tx-usecs range is 0-%u\n",
+  ae_dev->dev_specs.max_int_gl);
return -EINVAL;
}
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index 096e26a..5b7967c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -1103,6 +1103,14 @@ struct hclge_dev_specs_0_cmd {
__le32 max_tm_rate;
 };
 
+#define HCLGE_DEF_MAX_INT_GL   0x1FE0U
+
+struct hclge_dev_specs_1_cmd {
+   __le32 rsv0;
+   __le16 max_int_gl;
+   u8 rsv1[18];
+};
+
 int hclge_cmd_init(struct hclge_dev *hdev);
 static inline void hclge_write_reg(void __iomem *base, u32 reg, u32 value)
 {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 8bcdb28..7102001 100644
--- a/drivers/net/ethernet/hisilicon/h

[PATCH net-next 02/11] net: hns3: add support for 1us unit GL configuration

2020-11-06 Thread Huazhong Tan
For devices whose version is V3 or above, the GL configuration
can be set in units of 1us, so add support for configuring
this field.

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 26 ++
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h|  3 +++
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c |  6 +
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 6e08719..2813fe5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -224,17 +224,27 @@ void hns3_set_vector_coalesce_rl(struct 
hns3_enet_tqp_vector *tqp_vector,
 void hns3_set_vector_coalesce_rx_gl(struct hns3_enet_tqp_vector *tqp_vector,
u32 gl_value)
 {
-   u32 rx_gl_reg = hns3_gl_usec_to_reg(gl_value);
+   u32 new_val;
 
-   writel(rx_gl_reg, tqp_vector->mask_addr + HNS3_VECTOR_GL0_OFFSET);
+   if (tqp_vector->rx_group.coal.unit_1us)
+   new_val = gl_value | HNS3_INT_GL_1US;
+   else
+   new_val = hns3_gl_usec_to_reg(gl_value);
+
+   writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL0_OFFSET);
 }
 
 void hns3_set_vector_coalesce_tx_gl(struct hns3_enet_tqp_vector *tqp_vector,
u32 gl_value)
 {
-   u32 tx_gl_reg = hns3_gl_usec_to_reg(gl_value);
+   u32 new_val;
+
+   if (tqp_vector->tx_group.coal.unit_1us)
+   new_val = gl_value | HNS3_INT_GL_1US;
+   else
+   new_val = hns3_gl_usec_to_reg(gl_value);
 
-   writel(tx_gl_reg, tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET);
+   writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET);
 }
 
 void hns3_set_vector_coalesce_tx_ql(struct hns3_enet_tqp_vector *tqp_vector,
@@ -272,6 +282,14 @@ static void hns3_vector_coalesce_init(struct 
hns3_enet_tqp_vector *tqp_vector,
rx_coal->flow_level = HNS3_FLOW_LOW;
tx_coal->flow_level = HNS3_FLOW_LOW;
 
+   /* For devices of version V3 or above, GL can be configured in
+* 1us units, so use the 1us unit.
+*/
+   if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) {
+   tx_coal->unit_1us = 1;
+   rx_coal->unit_1us = 1;
+   }
+
if (ae_dev->dev_specs.int_ql_max) {
tx_coal->ql_enable = 1;
rx_coal->ql_enable = 1;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 10990bd..b37635d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -426,6 +426,8 @@ enum hns3_flow_level_range {
 #define HNS3_INT_GL_18K0x0036
 #define HNS3_INT_GL_8K 0x007C
 
+#define HNS3_INT_GL_1USBIT(31)
+
 #define HNS3_INT_RL_MAX0x00EC
 #define HNS3_INT_RL_ENABLE_MASK0x40
 
@@ -437,6 +439,7 @@ struct hns3_enet_coalesce {
u16 int_ql_max;
u8 gl_adapt_enable:1;
u8 ql_enable:1;
+   u8 unit_1us:1;
enum hns3_flow_level_range flow_level;
 };
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 9af7cb9..128e9ec 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -1146,6 +1146,12 @@ static int hns3_check_gl_coalesce_para(struct net_device 
*netdev,
return -EINVAL;
}
 
+   /* For devices of version V3 or above, GL uses the 1us unit,
+* so the round down is not needed.
+*/
+   if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3)
+   return 0;
+
rx_gl = hns3_gl_round_down(cmd->rx_coalesce_usecs);
if (rx_gl != cmd->rx_coalesce_usecs) {
netdev_info(netdev,
-- 
2.7.4



[PATCH] KVM: PPC: Book3S: Assign boolean values to a bool variable

2020-11-06 Thread xiakaixu1987
From: Kaixu Xia 

Fix the following coccinelle warnings:

./arch/powerpc/kvm/book3s_xics.c:476:3-15: WARNING: Assignment of 0/1 to bool 
variable
./arch/powerpc/kvm/book3s_xics.c:504:3-15: WARNING: Assignment of 0/1 to bool 
variable

Reported-by: Tosk Robot 
Signed-off-by: Kaixu Xia 
---
 arch/powerpc/kvm/book3s_xics.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 5fee5a11550d..303e3cb096db 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -473,7 +473,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, 
struct kvmppc_icp *icp,
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
new_irq = reject;
-   check_resend = 0;
+   check_resend = false;
goto again;
}
} else {
@@ -501,7 +501,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, 
struct kvmppc_icp *icp,
state->resend = 0;
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
-   check_resend = 0;
+   check_resend = false;
goto again;
}
}
-- 
2.20.0



Re: [PATCH v2 1/1] Fonts: Replace discarded const qualifier

2020-11-06 Thread Peilin Ye
Hi all,

On Tue, Nov 03, 2020 at 10:55:23AM +, Lee Jones wrote:
> Would you be kind enough to let us know when this lands in Mainline
> please?  We'll need to back-port it to start fixing up our Stable
> kernels ASAP.

Patch is in mainline now:

9522750c66c689b739e151fcdf895420dc81efc0 Fonts: Replace discarded const 
qualifier

Thank you,
Peilin Ye



Re: [PATCH v4] checkpatch: improve email parsing

2020-11-06 Thread Joe Perches
On Sat, 2020-11-07 at 10:11 +0530, Dwaipayan Ray wrote:
> On Sat, Nov 7, 2020 at 3:34 AM Joe Perches  wrote:
> > 
> > On Sat, 2020-11-07 at 03:15 +0530, Dwaipayan Ray wrote:
> > > checkpatch doesn't report warnings for many common mistakes
> > > in emails, some of which are trailing commas and incorrect
> > > use of email comments.
> > 
> > Assuming it all works, this looks good.  I haven't tested it.
> > 
> > How did you test the $fix bits?
> > 
> Hi,
> I actually dumped about 17k unique emails from git log, put it in one of
> my previous patches, and ran checkpatch with --fix on it.
> I checked the diff and most of the cases looked pretty good to me.
> I could send the diff output if you like?

Please.  Likely just to me as I imagine it's not interesting to most.
 
> > Trivial notes:
> > 
> > > diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
> > []
> > > + # sta...@vger.kernel.org or sta...@kernel.org shouldn't
> > > + # have an email name. In addition, comments should strictly
> > > + # begin with a #
> > > + if ($email =~ 
> > > /^.*stable\@(?:vger\.)?kernel\.org/) {
> > 
> > Likely better to test with a case insensitive match so
> > sta...@vger.kernel.org and such are still warned.
> 
> Sure, I will do that.
> > 
> > if ($email =~ 
> > /\bstable\@(?:vger\.)?kernel\.org\b/i) {
> > 
> > > + if ($sign_off =~ /cc:$/i && 
> > > (($comment ne "" && $comment !~ /^#.+/) ||
> > > + ($email_name ne ""))) {
> > 
> > || $sign_off !~ /^cc:/i ?
> 
> I actually had a doubt about that one. Only the stable address with
> Cc: should be checked right? Or something else?

yes.
 
> What about those stable addresses with tags other than Cc: ? Should
> a change be suggested?

Ideally yes, but there were very few of those in the git commit
history so it's probably not a big deal one way or another.
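
For illustration, the kinds of Cc: lines under discussion (hypothetical
examples; the stable address is written out in full here):

	Cc: stable@vger.kernel.org                <- fine: bare address
	Cc: Someone <stable@vger.kernel.org>      <- warned: email name present
	Cc: stable@vger.kernel.org # 4.4.x        <- fine: comment begins with '#'
	Cc: stable@vger.kernel.org (for 4.4.x)    <- warned: comment lacks '#'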




Re: [RFC] proc: get_wchan() stack unwind only makes sense for sleeping/non-self tasks

2020-11-06 Thread Andrew Morton
On Thu,  5 Nov 2020 15:11:32 -0800 Vineet Gupta  
wrote:

> Most architectures currently check this in their get_wchan() implementation
> (ARC doesn't hence this patch). However doing this in core code shows
> the semantics better so move the check one level up (eventually remove
> the boiler-plate code from arches)

It would be nice to clean up the arch callees in the same patch, at
least so it doesn't get forgotten about.  Are you prepared to propose
such a change?



Re: [PATCH v6 2/2] fs: ext4: Modify inode-test.c to use KUnit parameterized testing feature

2020-11-06 Thread David Gow
On Sat, Nov 7, 2020 at 3:23 AM Arpitha Raghunandan <98.a...@gmail.com> wrote:
>
> Modify fs/ext4/inode-test.c to use the parameterized testing
> feature of KUnit.
>
> Signed-off-by: Arpitha Raghunandan <98.a...@gmail.com>
> ---

This looks good to me. Thanks!

Reviewed-by: David Gow 

-- David


Re: [PATCH v6 1/2] kunit: Support for Parameterized Testing

2020-11-06 Thread David Gow
On Sat, Nov 7, 2020 at 3:22 AM Arpitha Raghunandan <98.a...@gmail.com> wrote:
>
> Implementation of support for parameterized testing in KUnit.
> This approach requires the creation of a test case using the
> KUNIT_CASE_PARAM macro that accepts a generator function as input.
> This generator function should return the next parameter given the
> previous parameter in parameterized tests. It also provides
> a macro to generate common-case generators.
>
> Signed-off-by: Arpitha Raghunandan <98.a...@gmail.com>
> Co-developed-by: Marco Elver 
> Signed-off-by: Marco Elver 
> ---

This looks good to me! A couple of minor thoughts about the output
format below, but I'm quite happy to have this as-is regardless.

Reviewed-by: David Gow 

Cheers,
-- David

> Changes v5->v6:
> - Fix alignment to maintain consistency
> Changes v4->v5:
> - Update kernel-doc comments.
> - Use const void* for generator return and prev value types.
> - Add kernel-doc comment for KUNIT_ARRAY_PARAM.
> - Rework parameterized test case execution strategy: each parameter is 
> executed
>   as if it was its own test case, with its own test initialization and cleanup
>   (init and exit are called, etc.). However, we cannot add new test cases per 
> TAP
>   protocol once we have already started execution. Instead, log the result of
>   each parameter run as a diagnostic comment.
> Changes v3->v4:
> - Rename kunit variables
> - Rename generator function helper macro
> - Add documentation for generator approach
> - Display test case name in case of failure along with param index
> Changes v2->v3:
> - Modifictaion of generator macro and method
> Changes v1->v2:
> - Use of a generator method to access test case parameters
>
>  include/kunit/test.h | 36 ++
>  lib/kunit/test.c | 46 +++-
>  2 files changed, 69 insertions(+), 13 deletions(-)
>
> diff --git a/include/kunit/test.h b/include/kunit/test.h
> index db1b0ae666c4..16616d3974f9 100644
> --- a/include/kunit/test.h
> +++ b/include/kunit/test.h
> @@ -107,6 +107,7 @@ struct kunit;
>   *
>   * @run_case: the function representing the actual test case.
>   * @name: the name of the test case.
> + * @generate_params: the generator function for parameterized tests.
>   *
>   * A test case is a function with the signature,
>   * ``void (*)(struct kunit *)``
> @@ -141,6 +142,7 @@ struct kunit;
>  struct kunit_case {
> void (*run_case)(struct kunit *test);
> const char *name;
> +   const void* (*generate_params)(const void *prev);
>
> /* private: internal use only. */
> bool success;
> @@ -163,6 +165,22 @@ static inline char *kunit_status_to_string(bool status)
>   */
>  #define KUNIT_CASE(test_name) { .run_case = test_name, .name = #test_name }
>
> +/**
> + * KUNIT_CASE_PARAM - A helper for creation a parameterized &struct 
> kunit_case
> + *
> + * @test_name: a reference to a test case function.
> + * @gen_params: a reference to a parameter generator function.
> + *
> + * The generator function ``const void* gen_params(const void *prev)`` is 
> used
> + * to lazily generate a series of arbitrarily typed values that fit into a
> + * void*. The argument @prev is the previously returned value, which should 
> be
> + * used to derive the next value; @prev is set to NULL on the initial 
> generator
> + * call.  When no more values are available, the generator must return NULL.
> + */
> +#define KUNIT_CASE_PARAM(test_name, gen_params)\
> +   { .run_case = test_name, .name = #test_name,\
> + .generate_params = gen_params }
> +
>  /**
>   * struct kunit_suite - describes a related collection of &struct kunit_case
>   *
> @@ -208,6 +226,10 @@ struct kunit {
> const char *name; /* Read only after initialization! */
> char *log; /* Points at case log after initialization */
> struct kunit_try_catch try_catch;
> +   /* param_value is the current parameter value for a test case. */
> +   const void *param_value;
> +   /* param_index stores the index of the parameter in parameterized 
> tests. */
> +   int param_index;
> /*
>  * success starts as true, and may only be set to false during a
>  * test case; thus, it is safe to update this across multiple
> @@ -1742,4 +1764,18 @@ do {   
>  \
> fmt,  
>  \
> ##__VA_ARGS__)
>
> +/**
> + * KUNIT_ARRAY_PARAM() - Define test parameter generator from an array.
> + * @name:  prefix for the test parameter generator function.
> + * @array: array of test parameters.
> + *
> + * Define function @name_gen_params which uses @array to generate parameters.
> + */
> +#define KUNIT_ARRAY_PARAM(name, array)   
>   \
> 
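
For readers following the thread, a minimal sketch of how the pieces
described above fit together (hypothetical test code, not part of the
patch itself):

	static const int example_params[] = { 1, 2, 3 };

	/* Defines example_gen_params(), which steps through example_params[]. */
	KUNIT_ARRAY_PARAM(example, example_params);

	static void example_test(struct kunit *test)
	{
		const int *param = test->param_value;

		KUNIT_EXPECT_GT(test, *param, 0);
	}

	static struct kunit_case example_test_cases[] = {
		KUNIT_CASE_PARAM(example_test, example_gen_params),
		{}
	};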

Re: [PATCH v4] checkpatch: improve email parsing

2020-11-06 Thread Dwaipayan Ray
On Sat, Nov 7, 2020 at 3:34 AM Joe Perches  wrote:
>
> On Sat, 2020-11-07 at 03:15 +0530, Dwaipayan Ray wrote:
> > checkpatch doesn't report warnings for many common mistakes
> > in emails, some of which are trailing commas and incorrect
> > use of email comments.
>
> Assuming it all works, this looks good.  I haven't tested it.
>
> How did you test the $fix bits?
>
Hi,
I actually dumped about 17k unique emails from git log, put it in one of
my previous patches, and ran checkpatch with --fix on it.
I checked the diff and most of the cases looked pretty good to me.
I could send the diff output if you like?

> Trivial notes:
>
> > diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
> []
> > + # sta...@vger.kernel.org or sta...@kernel.org shouldn't
> > + # have an email name. In addition, comments should strictly
> > + # begin with a #
> > + if ($email =~ 
> > /^.*stable\@(?:vger\.)?kernel\.org/) {
>
> Likely better to test with a case insensitive match so
> sta...@vger.kernel.org and such are still warned.

Sure, I will do that.
>
> if ($email =~ 
> /\bstable\@(?:vger\.)?kernel\.org\b/i) {
>
> > + if ($sign_off =~ /cc:$/i && 
> > (($comment ne "" && $comment !~ /^#.+/) ||
> > + ($email_name ne ""))) {
>
> || $sign_off !~ /^cc:/i ?

I actually had a doubt about that one. Only the stable address with
Cc: should be checked right? Or something else?

What about those stable addresses with tags other than Cc: ? Should
a change be suggested?

Thanks,
Dwaipayan.


Re: [PATCH 1/3 v4] ftrace: Have the callbacks receive a struct ftrace_regs instead of pt_regs

2020-11-06 Thread Masami Hiramatsu
On Fri, 06 Nov 2020 16:42:35 -0500
Steven Rostedt  wrote:

> From: "Steven Rostedt (VMware)" 
> 
> In preparation to have arguments of a function passed to callbacks attached
> to functions as default, change the default callback prototype to receive a
> struct ftrace_regs as the forth parameter instead of a pt_regs.
> 
> For callbacks that set the FL_SAVE_REGS flag in their ftrace_ops flags, they
> will now need to get the pt_regs via a ftrace_get_regs() helper call. If
> this is called by a callback that their ftrace_ops did not have a
> FL_SAVE_REGS flag set, it that helper function will return NULL.
> 
> This will allow the ftrace_regs to hold enough just to get the parameters
> and stack pointer, but without the worry that callbacks may have a pt_regs
> that is not completely filled.
> 

This looks good to me.

Reviewed-by: Masami Hiramatsu 

Thank you,

> Signed-off-by: Steven Rostedt (VMware) 
> ---
>  arch/x86/kernel/kprobes/ftrace.c  |  3 ++-
>  include/linux/ftrace.h| 16 ++--
>  include/linux/kprobes.h   |  2 +-
>  kernel/livepatch/patch.c  |  3 ++-
>  kernel/trace/ftrace.c | 27 +++
>  kernel/trace/trace_event_perf.c   |  2 +-
>  kernel/trace/trace_functions.c|  9 -
>  kernel/trace/trace_irqsoff.c  |  2 +-
>  kernel/trace/trace_sched_wakeup.c |  2 +-
>  kernel/trace/trace_selftest.c | 20 +++-
>  kernel/trace/trace_stack.c|  2 +-
>  11 files changed, 53 insertions(+), 35 deletions(-)
> 
> diff --git a/arch/x86/kernel/kprobes/ftrace.c 
> b/arch/x86/kernel/kprobes/ftrace.c
> index 954d930a7127..373e5fa3ce1f 100644
> --- a/arch/x86/kernel/kprobes/ftrace.c
> +++ b/arch/x86/kernel/kprobes/ftrace.c
> @@ -14,8 +14,9 @@
>  
>  /* Ftrace callback handler for kprobes -- called under preepmt disabed */
>  void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
> -struct ftrace_ops *ops, struct pt_regs *regs)
> +struct ftrace_ops *ops, struct ftrace_regs *fregs)
>  {
> + struct pt_regs *regs = ftrace_get_regs(fregs);
>   struct kprobe *p;
>   struct kprobe_ctlblk *kcb;
>   int bit;
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index 8dde9c17aaa5..24e1fa52337d 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -90,8 +90,20 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
>  
>  struct ftrace_ops;
>  
> +struct ftrace_regs {
> + struct pt_regs  regs;
> +};
> +
> +static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs 
> *fregs)
> +{
> + if (!fregs)
> + return NULL;
> +
> + return &fregs->regs;
> +}
> +
>  typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
> -   struct ftrace_ops *op, struct pt_regs *regs);
> +   struct ftrace_ops *op, struct ftrace_regs *fregs);
>  
>  ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
>  
> @@ -259,7 +271,7 @@ int register_ftrace_function(struct ftrace_ops *ops);
>  int unregister_ftrace_function(struct ftrace_ops *ops);
>  
>  extern void ftrace_stub(unsigned long a0, unsigned long a1,
> - struct ftrace_ops *op, struct pt_regs *regs);
> + struct ftrace_ops *op, struct ftrace_regs *fregs);
>  
>  #else /* !CONFIG_FUNCTION_TRACER */
>  /*
> diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
> index 629abaf25681..be73350955e4 100644
> --- a/include/linux/kprobes.h
> +++ b/include/linux/kprobes.h
> @@ -345,7 +345,7 @@ static inline void wait_for_kprobe_optimizer(void) { }
>  #endif /* CONFIG_OPTPROBES */
>  #ifdef CONFIG_KPROBES_ON_FTRACE
>  extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
> -   struct ftrace_ops *ops, struct pt_regs *regs);
> +   struct ftrace_ops *ops, struct ftrace_regs 
> *fregs);
>  extern int arch_prepare_kprobe_ftrace(struct kprobe *p);
>  #endif
>  
> diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
> index 875c5dbbdd33..f89f9e7e9b07 100644
> --- a/kernel/livepatch/patch.c
> +++ b/kernel/livepatch/patch.c
> @@ -40,8 +40,9 @@ struct klp_ops *klp_find_ops(void *old_func)
>  static void notrace klp_ftrace_handler(unsigned long ip,
>  unsigned long parent_ip,
>  struct ftrace_ops *fops,
> -struct pt_regs *regs)
> +struct ftrace_regs *fregs)
>  {
> + struct pt_regs *regs = ftrace_get_regs(fregs);
>   struct klp_ops *ops;
>   struct klp_func *func;
>   int patch_state;
> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index 3db64fb0cce8..67888311784e 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -121,7 +121,7 @@ struct ftrace_ops global_ops;
> 

Re: [PATCH] Documentation: kunit: provide guidance for testing many inputs

2020-11-06 Thread David Gow
On Tue, Nov 3, 2020 at 5:37 AM Daniel Latypov  wrote:
>
> usage.rst goes into a detailed about faking out classes, but currently

Nit: a detailed what?

> lacks wording about how one might idiomatically test a range of inputs.
>
> Give an example of how one might test a hash function via macros/helper
> funcs and a table-driven test and very briefly discuss pros and cons.
>
> Also highlight the KUNIT_EXPECT_*_MSG() variants (that aren't mentioned
> elsewhere [1]) which are particularly useful in these situations.
>
> It is also criminally underused at the moment, only appearing in 2
> tests (both written by people involved in KUnit).
>
> [1] not even on
> https://www.kernel.org/doc/html/latest/dev-tools/kunit/api/test.html

I suspect we'll eventually want to document the _MSG variants here as
well, though it will bloat the page somewhat. In any case, it can be
left to a separate patch.

>
> Signed-off-by: Daniel Latypov 
> ---

Thanks for writing this -- it's definitely a common test pattern which
it'd be nice to encourage and explain a bit better.

Cheers,
-- David

>  Documentation/dev-tools/kunit/usage.rst | 66 +
>  1 file changed, 66 insertions(+)
>
> diff --git a/Documentation/dev-tools/kunit/usage.rst 
> b/Documentation/dev-tools/kunit/usage.rst
> index 62142a47488c..317390df2b96 100644
> --- a/Documentation/dev-tools/kunit/usage.rst
> +++ b/Documentation/dev-tools/kunit/usage.rst
> @@ -451,6 +451,72 @@ We can now use it to test ``struct eeprom_buffer``:
> destroy_eeprom_buffer(ctx->eeprom_buffer);
> }
>
> +Testing various inputs
> +--
Nit: "various" isn't hugely descriptive here. Maybe something like
"Testing against multiple inputs" would be better?

> +
> +Testing just a few inputs might not be enough to have confidence that the 
> code
> +works correctly, e.g. for a hash function.
> +
> +In such cases, it can be helpful to have a helper macro or function, e.g. 
> this
> +fictitious example for ``md5sum(1)``
> +
> +.. code-block:: c
> +
> +   /* Note: the cast is to satisfy overly strict type-checking. */
> +   #define TEST_MD5(in, want) \
> +   md5sum(in, out); \
> +   KUNIT_EXPECT_STREQ_MSG(test, (char *)out, want, "md5sum(%s)", 
> in);
> +
> +   char out[16];
> +   TEST_MD5("hello world",   "5eb63bbbe01eeed093cb22bb8f5acdc3");
> +   TEST_MD5("hello world!",  "fc3ff98e8c6a0d3087d515c0473f8677");
> +
> +Note the use of ``KUNIT_EXPECT_STREQ_MSG`` to give more context when it fails
> +and make it easier to track down. (Yes, in this example, ``want`` is likely
> +going to be unique enough on its own).
> +
> +The ``_MSG`` variants are even more useful when the same expectation is 
> called
> +multiple times (in a loop or helper function) and thus the line number isn't
> +enough to identify what failed, like below.
> +
> +In some cases, it can be helpful to write a *table-driven test* instead, e.g.
> +
> +.. code-block:: c
> +
> +   int i;
> +   char out[16];
> +
> +   struct md5_test_case {
> +   const char *str;
> +   const char *md5;
> +   };
> +
> +   struct md5_test_case cases[] = {
> +   {
> +   .str = "hello world",
> +   .md5 = "5eb63bbbe01eeed093cb22bb8f5acdc3",
> +   },
> +   {
> +   .str = "hello world!",
> +   .md5 = "fc3ff98e8c6a0d3087d515c0473f8677",
> +   },
> +   };
> +   for (i = 0; i < ARRAY_SIZE(cases); ++i) {
> +   md5sum(cases[i].str, out);
> +   KUNIT_EXPECT_STREQ_MSG(test, (char *)out, cases[i].md5,
> + "md5sum(%s)", cases[i].str);
> +   }
> +
> +
> +There's more boilerplate involved, but it can:
> +
> +* be more readable when there are multiple inputs/outputs thanks to field 
> names,
> +
> +  * E.g. see ``fs/ext4/inode-test.c`` for an example of both.
> +* reduce duplication if test cases can be shared across multiple tests.
> +
> +  * E.g. if we had a magical ``undo_md5sum`` function, we could reuse 
> ``cases``.
> +

This is a bit of a nitpick, but I don't think this is quite conveying
the usefulness of table-based testing. Maybe it's that a hypothetical
"undo_md5sum" is too unrealistic an example? Maybe, instead of having
both the macro-based and table-driven examples based around md5sum(),
the table-based one could use something more obviously invertible /
reusable, and include both in the example code. E.g., something akin to
toupper() and tolower() or some other conversion function. I think
having a better example here is probably more useful than having both
the table- and macro- driven examples test the same thing.
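
For instance, a reusable table for a hypothetical str_toupper()/
str_tolower() pair might look like this (a sketch only; the function
names and signatures are assumed, not taken from the patch):

	struct case_conv_test_case {
		const char *lower;
		const char *upper;
	};

	static const struct case_conv_test_case conv_cases[] = {
		{ .lower = "hello", .upper = "HELLO" },
		{ .lower = "kunit", .upper = "KUNIT" },
	};

	static void str_toupper_test(struct kunit *test)
	{
		char out[16];
		int i;

		/* str_toupper() is an assumed helper under test. */
		for (i = 0; i < ARRAY_SIZE(conv_cases); ++i) {
			str_toupper(conv_cases[i].lower, out);
			KUNIT_EXPECT_STREQ_MSG(test, out, conv_cases[i].upper,
					       "str_toupper(%s)", conv_cases[i].lower);
		}
	}

	/* The same table, driven in the opposite direction. */
	static void str_tolower_test(struct kunit *test)
	{
		char out[16];
		int i;

		for (i = 0; i < ARRAY_SIZE(conv_cases); ++i) {
			str_tolower(conv_cases[i].upper, out);
			KUNIT_EXPECT_STREQ_MSG(test, out, conv_cases[i].lower,
					       "str_tolower(%s)", conv_cases[i].upper);
		}
	}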


>  .. _kunit-on-non-uml:
>
>  KUnit on non-UML architectures
>
> base-commit: 77c8473edf7f7664137f555cfcdc8c460bbd947d
> --
> 2.29.1.341.ge80a0c044ae-goog
>


Re: [PATCH v1] kunit: tool: unmark test_data as binary blobs

2020-11-06 Thread David Gow
On Fri, Nov 6, 2020 at 7:24 AM Brendan Higgins
 wrote:
>
> The tools/testing/kunit/test_data/ directory was marked as binary
> because some of the test_data files cause checkpatch warnings. Fix this
> by dropping the .gitattributes file.
>
> Fixes: afc63da64f1e ("kunit: kunit_parser: make parser more robust")
> Signed-off-by: Brendan Higgins 
> ---
Reviewed-by: David Gow 

Thanks. I wasn't able to find any issues which required those files to
be binary.

For the record, a link to the original issue, which appeared to be
with whitespace (spaces before tabs) in git apply:
https://lkml.org/lkml/2020/3/13/920

Cheers,
-- David


[PATCH 0/2] drivers/tty: delete break after return or goto

2020-11-06 Thread Bernard Zhao
Hi, Greg:

This patch series deletes code which never runs:
{
case XXX:
return XXX;
break; // The break is meaningless, so just delete it.
case YYY:
goto YYY;
break; // The break is meaningless, so just delete it.
..
}

Bernard Zhao (2):
  tty/serial: delete break after return
  drivers/tty: delete break after goto/return

 drivers/tty/nozomi.c | 4 
 drivers/tty/serial/imx.c | 5 -
 2 files changed, 9 deletions(-)

-- 
2.29.0



[PATCH 1/2] tty/serial: delete break after return

2020-11-06 Thread Bernard Zhao
Delete the break statements after return, which will never run.

Signed-off-by: Bernard Zhao 
---
 drivers/tty/serial/imx.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index 1731d9728865..09703079db7b 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -320,7 +320,6 @@ static u32 imx_uart_readl(struct imx_port *sport, u32 
offset)
switch (offset) {
case UCR1:
return sport->ucr1;
-   break;
case UCR2:
/*
 * UCR2_SRST is the only bit in the cached registers that might
@@ -331,16 +330,12 @@ static u32 imx_uart_readl(struct imx_port *sport, u32 
offset)
if (!(sport->ucr2 & UCR2_SRST))
sport->ucr2 = readl(sport->port.membase + offset);
return sport->ucr2;
-   break;
case UCR3:
return sport->ucr3;
-   break;
case UCR4:
return sport->ucr4;
-   break;
case UFCR:
return sport->ufcr;
-   break;
default:
return readl(sport->port.membase + offset);
}
-- 
2.29.0



[PATCH 2/2] drivers/tty: delete break after goto/return

2020-11-06 Thread Bernard Zhao
Delete the break statements after goto/return, which will never run.

Signed-off-by: Bernard Zhao 
---
 drivers/tty/nozomi.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c
index d42b854cb7df..946cc16827aa 100644
--- a/drivers/tty/nozomi.c
+++ b/drivers/tty/nozomi.c
@@ -414,11 +414,9 @@ static void read_mem32(u32 *buf, const void __iomem 
*mem_addr_start,
buf16 = (u16 *) buf;
*buf16 = __le16_to_cpu(readw(ptr));
goto out;
-   break;
case 4: /* 4 bytes */
*(buf) = __le32_to_cpu(readl(ptr));
goto out;
-   break;
}
 
while (i < size_bytes) {
@@ -460,7 +458,6 @@ static u32 write_mem32(void __iomem *mem_addr_start, const 
u32 *buf,
buf16 = (const u16 *)buf;
writew(__cpu_to_le16(*buf16), ptr);
return 2;
-   break;
case 1: /*
 * also needs to write 4 bytes in this case
 * so falling through..
@@ -468,7 +465,6 @@ static u32 write_mem32(void __iomem *mem_addr_start, const 
u32 *buf,
case 4: /* 4 bytes */
writel(__cpu_to_le32(*buf), ptr);
return 4;
-   break;
}
 
while (i < size_bytes) {
-- 
2.29.0



[V2] [PATCH] net/ethernet: update ret when ptp_clock is ERROR

2020-11-06 Thread Wang Qing
We always have to update the value of ret, otherwise the error value
may be a stale one from a previous call. And ptp_clock_register() never
returns NULL when PTP_1588_CLOCK is enabled, so we use IS_ERR() here.

Signed-off-by: Wang Qing 
---
 drivers/net/ethernet/ti/am65-cpts.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ti/am65-cpts.c 
b/drivers/net/ethernet/ti/am65-cpts.c
index 75056c1..ec8e56d
--- a/drivers/net/ethernet/ti/am65-cpts.c
+++ b/drivers/net/ethernet/ti/am65-cpts.c
@@ -998,11 +998,10 @@ struct am65_cpts *am65_cpts_create(struct device *dev, 
void __iomem *regs,
am65_cpts_settime(cpts, ktime_to_ns(ktime_get_real()));
 
cpts->ptp_clock = ptp_clock_register(&cpts->ptp_info, cpts->dev);
-   if (IS_ERR_OR_NULL(cpts->ptp_clock)) {
+   if (IS_ERR(cpts->ptp_clock)) {
dev_err(dev, "Failed to register ptp clk %ld\n",
PTR_ERR(cpts->ptp_clock));
-   if (!cpts->ptp_clock)
-   ret = -ENODEV;
+   ret = PTR_ERR(cpts->ptp_clock);
goto refclk_disable;
}
cpts->phc_index = ptp_clock_index(cpts->ptp_clock);
-- 
2.7.4



[PATCH] ptp_clock: return EOPNOTSUPP if !CONFIG_PTP_1588_CLOCK

2020-11-06 Thread Wang Qing
ptp_clock_register() is checked with IS_ERR() by its callers, which will
crash if !PTP since the stub returns NULL. Change the stub's return value
to ERR_PTR(-EOPNOTSUPP) for the !CONFIG_PTP_1588_CLOCK case so the
problem is resolved.

Signed-off-by: Wang Qing 
---
 include/linux/ptp_clock_kernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index d3e8ba5..05db40c
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -276,7 +276,7 @@ void ptp_cancel_worker_sync(struct ptp_clock *ptp);
 #else
 static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
   struct device *parent)
-{ return NULL; }
+{ return ERR_PTR(-EOPNOTSUPP); }
 static inline int ptp_clock_unregister(struct ptp_clock *ptp)
 { return 0; }
 static inline void ptp_clock_event(struct ptp_clock *ptp,
-- 
2.7.4
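
With this change, the usual caller pattern works unmodified whether or
not PTP is built in (a hypothetical driver sketch; am65-cpts above is a
real instance of the same pattern):

	ptp = ptp_clock_register(&info, dev);
	if (IS_ERR(ptp))
		return PTR_ERR(ptp);	/* now -EOPNOTSUPP when PTP is disabled */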



Re: [PATCH] MAINTAINERS: add missing file in ext4 entry

2020-11-06 Thread Theodore Y. Ts'o
On Fri, Oct 30, 2020 at 10:24:35AM +0800, Chao Yu wrote:
> include/trace/events/ext4.h belongs to ext4 module, add the file path into
> ext4 entry in MAINTAINERS.
> 
> Signed-off-by: Chao Yu 

Thanks, applied.

- Ted


[V2] drm: msm: adreno: use IS_ERR() instead of null pointer check

2020-11-06 Thread Wang Qing
a6xx_gmu_get_mmio() never returns NULL in case of error, only ERR_PTR(),
so we should use IS_ERR() instead of a NULL pointer check or
IS_ERR_OR_NULL().

Signed-off-by: Wang Qing 
---
 drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
index 491fee4..82420f7
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
@@ -492,7 +492,7 @@ static void a6xx_gmu_rpmh_init(struct a6xx_gmu *gmu)
void __iomem *seqptr = a6xx_gmu_get_mmio(pdev, "gmu_pdc_seq");
uint32_t pdc_address_offset;
 
-   if (!pdcptr || !seqptr)
+   if (IS_ERR(pdcptr) || IS_ERR(seqptr))
goto err;
 
if (adreno_is_a618(adreno_gpu) || adreno_is_a640(adreno_gpu))
@@ -580,9 +580,9 @@ static void a6xx_gmu_rpmh_init(struct a6xx_gmu *gmu)
wmb();
 
 err:
-   if (!IS_ERR_OR_NULL(pdcptr))
+   if (!IS_ERR(pdcptr))
iounmap(pdcptr);
-   if (!IS_ERR_OR_NULL(seqptr))
+   if (!IS_ERR(seqptr))
iounmap(seqptr);
 }
 
-- 
2.7.4



Re: [PATCH memory-model 5/8] tools/memory-model: Add a glossary of LKMM terms

2020-11-06 Thread Boqun Feng
On Fri, Nov 06, 2020 at 10:01:02AM -0800, Paul E. McKenney wrote:
> On Fri, Nov 06, 2020 at 09:47:22AM +0800, Boqun Feng wrote:
> > On Thu, Nov 05, 2020 at 02:00:14PM -0800, paul...@kernel.org wrote:
> > > From: "Paul E. McKenney" 
> > > 
> > > Signed-off-by: Paul E. McKenney 
> > > ---
> > >  tools/memory-model/Documentation/glossary.txt | 155 
> > > ++
> > >  1 file changed, 155 insertions(+)
> > >  create mode 100644 tools/memory-model/Documentation/glossary.txt
> > > 
> > > diff --git a/tools/memory-model/Documentation/glossary.txt 
> > > b/tools/memory-model/Documentation/glossary.txt
> > > new file mode 100644
> > > index 000..036fa28
> > > --- /dev/null
> > > +++ b/tools/memory-model/Documentation/glossary.txt
> > > @@ -0,0 +1,155 @@
> > > +This document contains brief definitions of LKMM-related terms.  Like 
> > > most
> > > +glossaries, it is not intended to be read front to back (except perhaps
> > > +as a way of confirming a diagnosis of OCD), but rather to be searched
> > > +for specific terms.
> > > +
> > > +
> > > +Address Dependency:  When the address of a later memory access is 
> > > computed
> > > + based on the value returned by an earlier load, an "address
> > > + dependency" extends from that load extending to the later access.
> > > + Address dependencies are quite common in RCU read-side critical
> > > + sections:
> > > +
> > > +  1 rcu_read_lock();
> > > +  2 p = rcu_dereference(gp);
> > > +  3 do_something(p->a);
> > > +  4 rcu_read_unlock();
> > > +
> > > +  In this case, because the address of "p->a" on line 3 is computed
> > > +  from the value returned by the rcu_dereference() on line 2, the
> > > +  address dependency extends from that rcu_dereference() to that
> > > +  "p->a".  In rare cases, optimizing compilers can destroy address
> > > +  dependencies.  Please see Documentation/RCU/rcu_dereference.txt
> > > +  for more information.
> > > +
> > > +  See also "Control Dependency".
> > > +
> > > +Acquire:  With respect to a lock, acquiring that lock, for example,
> > > + using spin_lock().  With respect to a non-lock shared variable,
> > > + a special operation that includes a load and which orders that
> > > + load before later memory references running on that same CPU.
> > > + An example special acquire operation is smp_load_acquire(),
> > > + but atomic_read_acquire() and atomic_xchg_acquire() also include
> > > + acquire loads.
> > > +
> > > + When an acquire load returns the value stored by a release store
> > > + to that same variable, then all operations preceding that store
> > 
> > Change this to:
> > 
> > When an acquire load reads-from a release store
> > 
> > , and put a reference to "Reads-from"? I think this makes the document
> > more consistent in that it makes clear "an acquire load returns the
> > value stored by a release store to the same variable" is not a special
> > case, it's simple a "Reads-from".
> > 
> > > + happen before any operations following that load acquire.
> > 
> > Add a reference to the definition of "happen before" in explanation.txt?
> 
> How about as shown below?  I currently am carrying this as a separate
> commit, but I might merge it into this one later on.
> 

Looks good to me, thanks!

Regards,
Boqun

>   Thanx, Paul
> 
> 
> 
> commit 774a52cd3d80d6b657ae6c14c10bd9fc437068f3
> Author: Paul E. McKenney 
> Date:   Fri Nov 6 09:58:01 2020 -0800
> 
> tools/memory-model: Tie acquire loads to reads-from
> 
> This commit explicitly makes the connection between acquire loads and
> the reads-from relation.  It also adds an entry for happens-before,
> and refers to the corresponding section of explanation.txt.
> 
> Reported-by: Boqun Feng 
> Signed-off-by: Paul E. McKenney 
> 
> diff --git a/tools/memory-model/Documentation/glossary.txt 
> b/tools/memory-model/Documentation/glossary.txt
> index 3924aca..383151b 100644
> --- a/tools/memory-model/Documentation/glossary.txt
> +++ b/tools/memory-model/Documentation/glossary.txt
> @@ -33,10 +33,11 @@ Acquire:  With respect to a lock, acquiring that lock, 
> for example,
>   acquire loads.
>  
>   When an acquire load returns the value stored by a release store
> - to that same variable, then all operations preceding that store
> - happen before any operations following that load acquire.
> + to that same variable, (in other words, the acquire load "reads
> + from" the release store), then all operations preceding that
> + store "happen before" any operations following that load acquire.
>  
> - See also "Relaxed" and "Release".
> + See also "Happens-Before", "Reads-From", "Relaxed", and "Release".
>  
>  Coherence (co):  When one CPU's store to a given variable overwrites
>   either the value from another CPU's store or some later value,
> @@ -102,6 +103,11 @@ Fully Orde
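
To make the acquire/release wording above concrete, a minimal sketch of
the pairing (illustrative only, not part of the glossary patch):

	int data;
	int flag;

	/* CPU 0 */
	void writer(void)
	{
		WRITE_ONCE(data, 42);
		smp_store_release(&flag, 1);
	}

	/* CPU 1 */
	void reader(void)
	{
		/* If this acquire load reads-from the release store above,
		 * CPU 0's write to data happens before CPU 1's read of it.
		 */
		if (smp_load_acquire(&flag))
			BUG_ON(READ_ONCE(data) != 42);	/* cannot fire */
	}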

Re: [RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()

2020-11-06 Thread Logan Gunthorpe



On 2020-11-06 5:14 p.m., Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 01:03:26PM -0700, Logan Gunthorpe wrote:
>> I don't think a function like that will work for the p2pmem use case. In
>> order to implement proper page freeing I expect I'll need a loop around
>> the allocator and vm_insert_mixed()... Something roughly like:
>>
>> for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
>> vaddr = pci_alloc_p2pmem(pdev, PAGE_SIZE);
>>  ret = vmf_insert_mixed(vma, addr,
>> __pfn_to_pfn_t(virt_to_pfn(vaddr), PFN_DEV | PFN_MAP));
>> }
>>
>> That way we can call pci_free_p2pmem() when a page's ref count goes to
>> zero. I suspect your use case will need to do something similar.
> 
> Yes, but I would say the pci_alloc_p2pmem() layer should be able to
> free pages on a page-by-page basis so you don't have to do stuff like
> the above.
> 
> There is often a lot of value in having physical contiguous addresses,
> so allocating page by page as well seems poor.

Agreed. But I'll have to dig to see if genalloc supports freeing blocks
in different sizes than the allocations.

Logan
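
For reference, the genalloc calls in question (a sketch; whether the
partial free below is actually supported is exactly what needs digging):

	unsigned long vaddr;

	/* One physically contiguous four-page allocation... */
	vaddr = gen_pool_alloc(pool, 4 * PAGE_SIZE);

	/* ...and the open question: handing it back one page at a time as
	 * individual page refcounts drop to zero.
	 */
	gen_pool_free(pool, vaddr + PAGE_SIZE, PAGE_SIZE);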


Re: [PATCH net-next v4 0/5] bonding: rename bond components

2020-11-06 Thread Jakub Kicinski
On Fri,  6 Nov 2020 15:04:31 -0500 Jarod Wilson wrote:
> The bonding driver's use of master and slave, while largely understood
> in technical circles, poses a barrier for inclusion to some potential
> members of the development and user community, due to the historical
> context of masters and slaves, particularly in the United States. This
> is a first full pass at replacing those phrases with more socially
> inclusive ones, opting for bond to replace master and port to
> replace slave, which is congruent with the bridge and team drivers.

If we decide to go ahead with this, we should probably also use it as
an opportunity to clean up the more egregious checkpatch warnings, WDYT?

Plan minimum - don't add new ones ;)


Re: [PATCH] ASoC: fsl_aud2htx: Remove dev_err() usage after platform_get_irq()

2020-11-06 Thread Nicolin Chen
On Sat, Nov 07, 2020 at 10:20:43AM +0800, Shengjiu Wang wrote:
> platform_get_irq() already prints an error message internally, so a
> dev_err() after platform_get_irq() is not needed.
> 
> Signed-off-by: Shengjiu Wang 

Acked-by: Nicolin Chen 


Re: [PATCH v4 1/4] dt-bindings: usb: add rk3328 dwc3 docs

2020-11-06 Thread Lindsey Stanpoor
On Wed, Sep 2, 2020 at 11:12 AM  wrote:
>
> From: Cameron Nemo 
>
> Document compatible for dwc3 on the Rockchip rk3328 platform.

Hi all,

Wanted to give this patch submission a gentle ping.

Rob Herring acked the documentation changes, but I have not heard anything
from the USB or Rockchip maintainers. This patchset would facilitate USB3
support for Rockchip rk3328 devices like the Pine Rock64.

If there is anything I can do to help move this along, please let me know.

Thank you,
Cameron

>
> Signed-off-by: Cameron Nemo 
> ---
>  Documentation/devicetree/bindings/usb/dwc3.txt  | 1 +
>  Documentation/devicetree/bindings/usb/rockchip,dwc3.txt | 3 ++-
>  2 files changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/devicetree/bindings/usb/dwc3.txt 
> b/Documentation/devicetree/bindings/usb/dwc3.txt
> index d03edf9d3935..d625cd5966e9 100644
> --- a/Documentation/devicetree/bindings/usb/dwc3.txt
> +++ b/Documentation/devicetree/bindings/usb/dwc3.txt
> @@ -25,6 +25,7 @@ Exception for clocks:
>  "ti,am437x-dwc3"
>  "ti,dwc3"
>  "ti,keystone-dwc3"
> +"rockchip,rk3328-dwc3"
>  "rockchip,rk3399-dwc3"
>  "xlnx,zynqmp-dwc3"
>
> diff --git a/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt 
> b/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt
> index 94520493233b..b41f30a61be6 100644
> --- a/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt
> +++ b/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt
> @@ -1,7 +1,8 @@
>  Rockchip SuperSpeed DWC3 USB SoC controller
>
>  Required properties:
> -- compatible:  should contain "rockchip,rk3399-dwc3" for rk3399 SoC
> +- compatible:  should contain "rockchip,rk3328-dwc3" for rk3328 SoC
> +   or "rockchip,rk3399-dwc3" for rk3399 SoC
>  - clocks:  A list of phandle + clock-specifier pairs for the
> clocks listed in clock-names
>  - clock-names: Should contain the following:
> --
> 2.28.0
>


Re: [PATCH memory-model 5/8] tools/memory-model: Add a glossary of LKMM terms

2020-11-06 Thread Alan Stern
On Fri, Nov 06, 2020 at 01:04:13PM -0800, Paul E. McKenney wrote:
> On Fri, Nov 06, 2020 at 03:40:08PM -0500, Alan Stern wrote:
> > Is it really true that data dependencies are so easily destroyed?  I 
> > would expect that a true "semantic" dependency (i.e., one where the 
> > value written really does vary according to the value read) would be 
> > rather hard to second guess.
> 
> The usual optimizations apply, for but one example:
> 
>   r1 = READ_ONCE(x);
>   WRITE_ONCE(y, (r1 + 1) % MAX_ELEMENTS);
> 
> If MAX_ELEMENTS is 1, so long, data dependency!

Sure, but if MAX_ELEMENTS is 1 then the value written will always be 0 
no matter what value r1 has, so it isn't a semantic dependency.  
Presumably a semantic data dependency would be much more robust.

I wonder if it's worth pointing out this distinction to the reader.

> With pointers, the compiler has fewer optimization opportunities,
> but there are still cases where it can break the dependency.
> Or transform it to a control dependency.

Transforming a data dependency into a control dependency wouldn't make 
any important difference; the hardware would still provide the desired 
ordering.

Alan
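
To make the distinction concrete (a sketch of the MAX_ELEMENTS == 1 case
discussed above):

	r1 = READ_ONCE(x);
	/* With MAX_ELEMENTS == 1, (r1 + 1) % 1 is 0 for any r1, so the
	 * compiler may simply emit WRITE_ONCE(y, 0): the store takes no
	 * input from the load, and the syntactic dependency evaporates;
	 * it was never a semantic dependency to begin with.
	 */
	WRITE_ONCE(y, (r1 + 1) % MAX_ELEMENTS);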


[PATCH] ASoC: fsl_aud2htx: Remove dev_err() usage after platform_get_irq()

2020-11-06 Thread Shengjiu Wang
platform_get_irq() already prints an error message internally, so a
dev_err() after platform_get_irq() is not needed.

Signed-off-by: Shengjiu Wang 
---
 sound/soc/fsl/fsl_aud2htx.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/sound/soc/fsl/fsl_aud2htx.c b/sound/soc/fsl/fsl_aud2htx.c
index 124aeb70f24e..4091ccc7c3e9 100644
--- a/sound/soc/fsl/fsl_aud2htx.c
+++ b/sound/soc/fsl/fsl_aud2htx.c
@@ -211,11 +211,8 @@ static int fsl_aud2htx_probe(struct platform_device *pdev)
}
 
irq = platform_get_irq(pdev, 0);
-   if (irq < 0) {
-   dev_err(&pdev->dev, "no irq for node %s\n",
-   dev_name(&pdev->dev));
+   if (irq < 0)
return irq;
-   }
 
ret = devm_request_irq(&pdev->dev, irq, fsl_aud2htx_isr, 0,
   dev_name(&pdev->dev), aud2htx);
-- 
2.27.0



[PATCH RT 3/6] mm/memcontrol: Disable preemption in __mod_memcg_lruvec_state()

2020-11-06 Thread Steven Rostedt
5.4.74-rt42-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Sebastian Andrzej Siewior 

The callers expect disabled preemption/interrupts while invoking
__mod_memcg_lruvec_state(). This works in mainline because a lock of
some kind is acquired.

Use preempt_disable_rt() where per-CPU variables are accessed and a
stable pointer is expected. This is also done in __mod_zone_page_state()
for the same reason.

Cc: stable...@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Steven Rostedt (VMware) 
---
 mm/memcontrol.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9bdb75ef6d62..c9d02e2272e1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -752,6 +752,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum 
node_stat_item idx,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
 
+   preempt_disable_rt();
/* Update memcg */
__mod_memcg_state(memcg, idx, val);
 
@@ -767,6 +768,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum 
node_stat_item idx,
x = 0;
}
__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+   preempt_enable_rt();
 }
 
 void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
-- 
2.28.0




[PATCH RT 1/6] net: Properly annotate the try-lock for the seqlock

2020-11-06 Thread Steven Rostedt
5.4.74-rt42-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Sebastian Andrzej Siewior 

In patch
   ("net/Qdisc: use a seqlock instead seqcount")

the seqcount has been replaced with a seqlock to allow the reader to
boost the preempted writer.
The try_write_seqlock() acquired the lock with a try-lock but the
seqcount annotation was "lock".

Opencode write_seqcount_t_begin() and use the try-lock annotation for
lockdep.

Reported-by: Mike Galbraith 
Cc: stable...@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Steven Rostedt (VMware) 
---
 include/linux/seqlock.h   |  9 -
 include/net/sch_generic.h | 10 +-
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index e5207897c33e..f390293974ea 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -489,15 +489,6 @@ static inline void write_seqlock(seqlock_t *sl)
__raw_write_seqcount_begin(&sl->seqcount);
 }
 
-static inline int try_write_seqlock(seqlock_t *sl)
-{
-   if (spin_trylock(&sl->lock)) {
-   __raw_write_seqcount_begin(&sl->seqcount);
-   return 1;
-   }
-   return 0;
-}
-
 static inline void write_sequnlock(seqlock_t *sl)
 {
__raw_write_seqcount_end(&sl->seqcount);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6afb4b9cede..112d2dca8b08 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -168,8 +168,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
return false;
}
 #ifdef CONFIG_PREEMPT_RT
-   if (try_write_seqlock(&qdisc->running))
+   if (spin_trylock(&qdisc->running.lock)) {
+   seqcount_t *s = &qdisc->running.seqcount;
+   /*
+* Variant of write_seqcount_t_begin() telling lockdep that a
+* trylock was attempted.
+*/
+   __raw_write_seqcount_begin(s);
+   seqcount_acquire(&s->dep_map, 0, 1, _RET_IP_);
return true;
+   }
return false;
 #else
/* Variant of write_seqcount_begin() telling lockdep a trylock
-- 
2.28.0




[PATCH RT 6/6] Linux 5.4.74-rt42-rc1

2020-11-06 Thread Steven Rostedt
5.4.74-rt42-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: "Steven Rostedt (VMware)" 

---
 localversion-rt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/localversion-rt b/localversion-rt
index 629e0b4384b8..31c892a05e4d 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt41
+-rt42-rc1
-- 
2.28.0




[PATCH RT 5/6] timers: Don't block on ->expiry_lock for TIMER_IRQSAFE

2020-11-06 Thread Steven Rostedt
5.4.74-rt42-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Sebastian Andrzej Siewior 

PREEMPT_RT does not spin and wait until a running timer completes its
callback but instead it blocks on a sleeping lock to prevent a deadlock.

This blocking cannot be done for the workqueue's IRQSAFE timer, which
will be canceled in an IRQ-off region. It has to happen in an IRQ-off
region because changing the PENDING bit and clearing the timer must not
be interrupted, to avoid a busy-loop.

The callback invocation of IRQSAFE timer is not preempted on PREEMPT_RT
so there is no need to synchronize on timer_base::expiry_lock.

Don't acquire the timer_base::expiry_lock for TIMER_IRQSAFE flagged
timer.
Add a lockdep annotation to ensure that this function is always invoked
in preemptible context on PREEMPT_RT.

Reported-by: Mike Galbraith 
Signed-off-by: Sebastian Andrzej Siewior 
Cc: stable...@vger.kernel.org
Signed-off-by: Steven Rostedt (VMware) 
---
 kernel/time/timer.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 89078fd848b9..3e9d7f227a5c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1289,7 +1289,7 @@ static void del_timer_wait_running(struct timer_list 
*timer)
u32 tf;
 
tf = READ_ONCE(timer->flags);
-   if (!(tf & TIMER_MIGRATING)) {
+   if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
struct timer_base *base = get_timer_base(tf);
 
/*
@@ -1373,6 +1373,13 @@ int del_timer_sync(struct timer_list *timer)
 */
WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
 
+   /*
+* Must be able to sleep on PREEMPT_RT because of the slowpath in
+* del_timer_wait_running().
+*/
+   if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
+   might_sleep();
+
do {
ret = try_to_del_timer_sync(timer);
 
-- 
2.28.0




[PATCH RT 4/6] ptrace: fix ptrace_unfreeze_traced() race with rt-lock

2020-11-06 Thread Steven Rostedt
5.4.74-rt42-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Oleg Nesterov 

The patch "ptrace: fix ptrace vs tasklist_lock race" changed
ptrace_freeze_traced() to take task->saved_state into account, but
ptrace_unfreeze_traced() has the same problem and needs a similar fix:
it should check/update both ->state and ->saved_state.

Reported-by: Luis Claudio R. Goncalves 
Fixes: "ptrace: fix ptrace vs tasklist_lock race"
Signed-off-by: Oleg Nesterov 
Signed-off-by: Sebastian Andrzej Siewior 
Cc: stable...@vger.kernel.org
Signed-off-by: Steven Rostedt (VMware) 
---
 kernel/ptrace.c | 23 +++
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 3075006d720e..3f7156f06b6c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -197,8 +197,8 @@ static bool ptrace_freeze_traced(struct task_struct *task)
 
 static void ptrace_unfreeze_traced(struct task_struct *task)
 {
-   if (task->state != __TASK_TRACED)
-   return;
+   unsigned long flags;
+   bool frozen = true;
 
WARN_ON(!task->ptrace || task->parent != current);
 
@@ -207,12 +207,19 @@ static void ptrace_unfreeze_traced(struct task_struct 
*task)
 * Recheck state under the lock to close this race.
 */
spin_lock_irq(&task->sighand->siglock);
-   if (task->state == __TASK_TRACED) {
-   if (__fatal_signal_pending(task))
-   wake_up_state(task, __TASK_TRACED);
-   else
-   task->state = TASK_TRACED;
-   }
+
+   raw_spin_lock_irqsave(&task->pi_lock, flags);
+   if (task->state == __TASK_TRACED)
+   task->state = TASK_TRACED;
+   else if (task->saved_state == __TASK_TRACED)
+   task->saved_state = TASK_TRACED;
+   else
+   frozen = false;
+   raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+   if (frozen && __fatal_signal_pending(task))
+   wake_up_state(task, __TASK_TRACED);
+
spin_unlock_irq(&task->sighand->siglock);
 }
 
-- 
2.28.0




[PATCH RT 0/6] Linux 5.4.74-rt42-rc1

2020-11-06 Thread Steven Rostedt


Dear RT Folks,

This is the RT stable review cycle of patch 5.4.74-rt42-rc1.

Please scream at me if I messed something up. Please test the patches too.

The -rc release will be uploaded to kernel.org and will be deleted when
the final release is out. This is just a review release (or release candidate).

The pre-releases will not be pushed to the git repository, only the
final release is.

If all goes well, this patch will be converted to the next main release
on 11/10/2020.

Enjoy,

-- Steve


To build 5.4.74-rt42-rc1 directly, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v5.x/linux-5.4.tar.xz

  http://www.kernel.org/pub/linux/kernel/v5.x/patch-5.4.74.xz

  
http://www.kernel.org/pub/linux/kernel/projects/rt/5.4/patch-5.4.74-rt42-rc1.patch.xz

You can also build from 5.4.74-rt41 by applying the incremental patch:

http://www.kernel.org/pub/linux/kernel/projects/rt/5.4/incr/patch-5.4.74-rt41-rt42-rc1.patch.xz


Changes from 5.4.74-rt41:

---


Oleg Nesterov (1):
  ptrace: fix ptrace_unfreeze_traced() race with rt-lock

Sebastian Andrzej Siewior (4):
  net: Properly annotate the try-lock for the seqlock
  tcp: Remove superfluous BH-disable around listening_hash
  mm/memcontrol: Disable preemption in __mod_memcg_lruvec_state()
  timers: Don't block on ->expiry_lock for TIMER_IRQSAFE

Steven Rostedt (VMware) (1):
  Linux 5.4.74-rt42-rc1


 include/linux/seqlock.h |  9 -
 include/net/sch_generic.h   | 10 +-
 kernel/ptrace.c | 23 +++
 kernel/time/timer.c |  9 -
 localversion-rt |  2 +-
 mm/memcontrol.c |  2 ++
 net/ipv4/inet_hashtables.c  | 19 ---
 net/ipv6/inet6_hashtables.c |  5 +
 8 files changed, 48 insertions(+), 31 deletions(-)


[PATCH RT 2/6] tcp: Remove superfluous BH-disable around listening_hash

2020-11-06 Thread Steven Rostedt
5.4.74-rt42-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Sebastian Andrzej Siewior 

Commit
   9652dc2eb9e40 ("tcp: relax listening_hash operations")

removed the need to disable bottom half while acquiring
listening_hash.lock. There are still two callers left which disable
bottom half before the lock is acquired.

Drop local_bh_disable() around __inet_hash() which acquires
listening_hash->lock, and invoke inet_ehash_nolisten() with BH disabled.
inet_unhash() conditionally acquires listening_hash->lock.

Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Steven Rostedt (VMware) 
---
 net/ipv4/inet_hashtables.c  | 19 ---
 net/ipv6/inet6_hashtables.c |  5 +
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 006a34b18537..4c8565d6624c 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -543,7 +543,9 @@ int __inet_hash(struct sock *sk, struct sock *osk)
int err = 0;
 
if (sk->sk_state != TCP_LISTEN) {
+   local_bh_disable();
inet_ehash_nolisten(sk, osk);
+   local_bh_enable();
return 0;
}
WARN_ON(!sk_unhashed(sk));
@@ -575,11 +577,8 @@ int inet_hash(struct sock *sk)
 {
int err = 0;
 
-   if (sk->sk_state != TCP_CLOSE) {
-   local_bh_disable();
+   if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
-   local_bh_enable();
-   }
 
return err;
 }
@@ -590,17 +589,20 @@ void inet_unhash(struct sock *sk)
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb = NULL;
spinlock_t *lock;
+   bool state_listen;
 
if (sk_unhashed(sk))
return;
 
if (sk->sk_state == TCP_LISTEN) {
+   state_listen = true;
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-   lock = &ilb->lock;
+   spin_lock(&ilb->lock);
} else {
+   state_listen = false;
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+   spin_lock_bh(lock);
}
-   spin_lock_bh(lock);
if (sk_unhashed(sk))
goto unlock;
 
@@ -613,7 +615,10 @@ void inet_unhash(struct sock *sk)
__sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 unlock:
-   spin_unlock_bh(lock);
+   if (state_listen)
+   spin_unlock(&ilb->lock);
+   else
+   spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
 
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index fbe9d4295eac..5d1c1c6967cb 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -287,11 +287,8 @@ int inet6_hash(struct sock *sk)
 {
int err = 0;
 
-   if (sk->sk_state != TCP_CLOSE) {
-   local_bh_disable();
+   if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
-   local_bh_enable();
-   }
 
return err;
 }
-- 
2.28.0




Re: [PATCH] Revert "mm/vunmap: add cond_resched() in vunmap_pmd_range"

2020-11-06 Thread Andrew Morton
On Thu,  5 Nov 2020 09:02:49 -0800 Minchan Kim  wrote:

> This reverts commit e47110e90584a22e9980510b00d0dfad3a83354e.
> 
> While I was doing zram testing, I found that decompression sometimes
> failed because the compression buffer was corrupted. On investigation,
> I found that the commit below calls cond_resched() unconditionally, so
> it can cause a problem in atomic context if the task is rescheduled.
> 
> Revert the original commit for now.
> 
> [   55.109012] BUG: sleeping function called from invalid context at 
> mm/vmalloc.c:108
> [   55.110774] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 946, 
> name: memhog
> [   55.111973] 3 locks held by memhog/946:
> [   55.112807]  #0: 9d01d4b193e8 (&mm->mmap_lock#2){}-{4:4}, at: 
> __mm_populate+0x103/0x160
> [   55.114151]  #1: a3d53de0 (fs_reclaim){+.+.}-{0:0}, at: 
> __alloc_pages_slowpath.constprop.0+0xa98/0x1160
> [   55.115848]  #2: 9d01d56b8110 (&zspage->lock){.+.+}-{3:3}, at: 
> zs_map_object+0x8e/0x1f0
> [   55.118947] CPU: 0 PID: 946 Comm: memhog Not tainted 
> 5.9.3-00011-gc5bfc0287345-dirty #316
> [   55.121265] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 1.13.0-1 04/01/2014
> [   55.122540] Call Trace:
> [   55.122974]  dump_stack+0x8b/0xb8
> [   55.123588]  ___might_sleep.cold+0xb6/0xc6
> [   55.124328]  unmap_kernel_range_noflush+0x2eb/0x350
> [   55.125198]  unmap_kernel_range+0x14/0x30
> [   55.125920]  zs_unmap_object+0xd5/0xe0
> [   55.126604]  zram_bvec_rw.isra.0+0x38c/0x8e0
> [   55.127462]  zram_rw_page+0x90/0x101
> [   55.128199]  bdev_write_page+0x92/0xe0
> [   55.128957]  ? swap_slot_free_notify+0xb0/0xb0
> [   55.129841]  __swap_writepage+0x94/0x4a0
> [   55.130636]  ? do_raw_spin_unlock+0x4b/0xa0
> [   55.131462]  ? _raw_spin_unlock+0x1f/0x30
> [   55.132261]  ? page_swapcount+0x6c/0x90
> [   55.133038]  pageout+0xe3/0x3a0
> [   55.133702]  shrink_page_list+0xb94/0xd60
> [   55.134626]  shrink_inactive_list+0x158/0x460
>
> ...
>
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -102,8 +102,6 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long 
> addr, unsigned long end,
>   if (pmd_none_or_clear_bad(pmd))
>   continue;
>   vunmap_pte_range(pmd, addr, next, mask);
> -
> - cond_resched();
>   } while (pmd++, addr = next, addr != end);
>  }

If this is triggering a warning then why isn't the might_sleep() in
remove_vm_area() also triggering?

Sigh.  I also cannot remember why these vfree() functions have to be so
awkward.  The mutex_trylock(&vmap_purge_lock) isn't permitted in
interrupt context because mutex_trylock() is stupid, but what was the
issue with non-interrupt atomic code?




Re: [PATCH RFC] driver core: Ensure DT devices always have fwnode set

2020-11-06 Thread Saravana Kannan
On Fri, Nov 6, 2020 at 11:23 AM Mark Brown  wrote:
>
> On Fri, Nov 06, 2020 at 11:09:17AM -0800, Saravana Kannan wrote:
>
> > If you want to do this in "one common place", then I think the way to
> > do this is have include/linux/of.h provide something like:
>
> > void of_set_device_of_node(dev, of_node)
> > {
> > dev->of_node = of_node;
> > dev->fw_node = &of_node->fwnode;
> >/* bunch of other housekeeping like setting OF_POPULATED and doing
> > proper of_node_get() */
> >// Passing NULL for of_node could undo all the above for dev->of_node.
> > }
>
> That also sounds good, particularly if we have a coccinelle spatch

I've never used coccinelle. But I can fix 5-10 easily findable ones
like i2c, platform, spi, slimbus, spmi, etc.

> or
> some other mechanism that enforced the usage of the function where
> appropriate, my main concern is making sure that we do something which
> ensures that the boilerplate stuff is handled.

Rob/Frank,

I spent an hour or more looking at this and there are many ways of
doing this. Wanted to know how much you wanted to do inside these
boilerplate functions.

This is the minimum we should do in these helper functions.

+/**
+ * of_unset_dev_of_node - Unset a device's of_node
+ * @dev: device to unset the of_node for
+ *
+ * Use this when you delete a device on which you had called
+ * of_set_dev_of_node() before.
+ */
+static inline void of_unset_dev_of_node(struct device *dev)
+{
+   struct device_node *node = dev->of_node;
+
+   if (!node)
+   return;
+
+   dev->fwnode = NULL;
+   dev->of_node = NULL;
+   of_node_put(node);
+}
+
+/**
+ * of_set_dev_of_node - Set a device's of_node
+ * @dev: device to set the of_node for
+ * @node: the device_node that the device was constructed from
+ *
+ * Use this when you create a new device from a device_node. It takes care of some
+ * of the housekeeping work that's necessary when you set a device's of_node.
+ *
+ * Use of_unset_dev_of_node() before you delete the device.
+ *
+ * Returns an error if the device already has its of_node set.
+ */
+static inline int of_set_dev_of_node(struct device *dev,
+				     struct device_node *node)
+{
+   if (!node)
+   return 0;
+
+   if (WARN_ON(dev->of_node))
+   return -EBUSY;
+
+   of_node_get(node);
+   dev->of_node = node;
+   dev->fwnode = of_fwnode_handle(node);
+
+   return 0;
+}

But I also had another version that set/cleared OF_POPULATED. But that
meant of_device_alloc() will allocate the device before checking if
the node has already been populated (because I'd delete the check
that's already there and use the one rolled into these helper
functions). I think that inefficiency is okay because I don't think
"trying to populate an already populated node" would be a common
occurrence. But I wasn't sure how you'd feel about it.
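
Roughly, that variant would be something like this (sketch only; the
OF_POPULATED claiming below is exactly the part I wasn't sure about):

static inline int of_set_dev_of_node(struct device *dev,
				     struct device_node *node)
{
	if (!node)
		return 0;

	if (WARN_ON(dev->of_node))
		return -EBUSY;

	/* Sketch: also claim the node so it can't be populated twice. */
	if (of_node_test_and_set_flag(node, OF_POPULATED))
		return -EBUSY;

	of_node_get(node);
	dev->of_node = node;
	dev->fwnode = of_fwnode_handle(node);

	return 0;
}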

Any preferences? Thoughts?

Additional context:
https://lore.kernel.org/lkml/20201104205431.3795207-2-sarava...@google.com/

-Saravana


Re: [PATCH v2 bpf-next 1/5] bpf: add in-kernel split BTF support

2020-11-06 Thread Andrii Nakryiko
On Fri, Nov 6, 2020 at 5:28 PM Song Liu  wrote:
>
>
>
> > On Nov 6, 2020, at 3:02 PM, Andrii Nakryiko  wrote:
> >
> > Adjust in-kernel BTF implementation to support a split BTF mode of 
> > operation.
> > Changes are mostly mirroring libbpf split BTF changes, with the exception of
> > start_id being 0 for in-kernel implementation due to simpler read-only mode.
> >
> > Otherwise, for split BTF logic, most of the logic of jumping to base BTF,
> > where necessary, is encapsulated in a few helper functions. Type numbering and
> > string offset in a split BTF are logically continuing where base BTF ends, 
> > so
> > most of the high-level logic is kept without changes.
> >
> > Type verification and size resolution is only doing an added resolution of 
> > new
> > split BTF types and relies on already cached size and type resolution 
> > results
> > in the base BTF.
> >
> > Signed-off-by: Andrii Nakryiko 
>
> [...]
>
> >
> > @@ -600,8 +618,15 @@ static const struct btf_kind_operations 
> > *btf_type_ops(const struct btf_type *t)
> >
> > static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
> > {
> > - return BTF_STR_OFFSET_VALID(offset) &&
> > - offset < btf->hdr.str_len;
> > + if (!BTF_STR_OFFSET_VALID(offset))
> > + return false;
> > +again:
> > + if (offset < btf->start_str_off) {
> > + btf = btf->base_btf;
> > + goto again;
>
> Can we do a while loop instead of "goto again;"?

yep, not sure why I went with goto...

while (offset < btf->start_str_off)
btf = btf->base_btf;

Shorter.

>
> > + }
> > + offset -= btf->start_str_off;
> > + return offset < btf->hdr.str_len;
> > }
> >
> > static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
> > @@ -615,10 +640,25 @@ static bool __btf_name_char_ok(char c, bool first, 
> > bool dot_ok)
> >   return true;
> > }
> >
> > +static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
> > +{
> > +again:
> > + if (offset < btf->start_str_off) {
> > + btf = btf->base_btf;
> > + goto again;
> > + }
>
> Maybe add a btf_find_base_btf(btf, offset) helper for this logic?

No strong feelings about this, but given it's a two-line loop, it might
not be worth it. I'd also need a pretty verbose
btf_find_base_btf_for_str_offset() and
btf_find_base_btf_for_type_id(). I feel like the loop might be less
distracting, actually.
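
(If we did add one, it would presumably just wrap the loop -- sketch:)

static const struct btf *btf_find_base_btf_for_str_offset(const struct btf *btf,
							  u32 offset)
{
	while (offset < btf->start_str_off)
		btf = btf->base_btf;
	return btf;
}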

>
> > +
> > + offset -= btf->start_str_off;
> > + if (offset < btf->hdr.str_len)
> > + return &btf->strings[offset];
> > +
> > + return NULL;
> > +}
> > +
>
> [...]
>
> > }
> >
> > const char *btf_name_by_offset(const struct btf *btf, u32 offset)
> > {
> > - if (offset < btf->hdr.str_len)
> > - return &btf->strings[offset];
> > -
> > - return NULL;
> > + return btf_str_by_offset(btf, offset);
> > }
>
> IIUC, btf_str_by_offset() and btf_name_by_offset() are identical. Can we
> just keep btf_name_by_offset()?

btf_str_by_offset() is static, so should be inlinable, while
btf_name_by_offset() is a global function; I was worried about
regressing performance for __btf_name_valid() and
__btf_name_by_offset(). Premature optimization you think?

>
> >
> > const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
> > {
> > - if (type_id > btf->nr_types)
> > - return NULL;
> > +again:
> > + if (type_id < btf->start_id) {
> > + btf = btf->base_btf;
> > + goto again;
> > + }
>
> ditto, goto again..
>
> [...]
>
>


Re: [PATCH v4 06/17] PCI: add SIOV and IMS capability detection

2020-11-06 Thread Dan Williams
On Fri, Nov 6, 2020 at 4:12 PM Jason Gunthorpe  wrote:
>
> On Fri, Nov 06, 2020 at 03:47:00PM -0800, Dan Williams wrote:
[..]
> The only sane way to implement this generically is for the VMM to
> provide a hypercall to obtain a real *working* addr/data pair(s) and
> then have the platform hand those out from
> pci_subdevice_msi_create_irq_domain().

Yeah, that seems a logical attach point for this magic. Appreciate you
taking the time to lay it out.


Re: [PATCH] mm/memcontrol:rewrite mem_cgroup_page_lruvec()

2020-11-06 Thread Andrew Morton
On Wed, 4 Nov 2020 22:25:16 +0800 Hui Su  wrote:

> mem_cgroup_page_lruvec() in memcontrol.c and
> mem_cgroup_lruvec() in memcontrol.h are very similar
> except for the parameter (page vs. memcg), and the two
> can be converted to each other.
> 
> So rewrite mem_cgroup_page_lruvec() with mem_cgroup_lruvec().

Alex Shi's "mm/memcg: warning on !memcg after readahead page charged"
(https://lkml.kernel.org/r/1604283436-18880-3-git-send-email-alex@linux.alibaba.com)
changes mem_cgroup_page_lruvec() thusly:

--- a/mm/memcontrol.c~mm-memcg-warning-on-memcg-after-readahead-page-charged
+++ a/mm/memcontrol.c
@@ -1325,10 +1325,7 @@ struct lruvec *mem_cgroup_page_lruvec(st
}
 
memcg = page_memcg(page);
-   /*
-* Swapcache readahead pages are added to the LRU - and
-* possibly migrated - before they are charged.
-*/
+   VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg)
memcg = root_mem_cgroup;
 
So the patch didn't apply.

That's easily fixed, but it does make one wonder whether this:

> -struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
> +/**
> + * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
> + * @page: the page
> + * @pgdat: pgdat of the page
> + *
> + * This function relies on page->mem_cgroup being stable.
> + */
> +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
> + struct pglist_data *pgdat)
> +{
> + struct mem_cgroup *memcg = page->mem_cgroup;
> +
> + return mem_cgroup_lruvec(memcg, pgdat);
> +}

Should be using page_memcg()?
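
I.e., presumably something like (a sketch, assuming the page_memcg()
accessor from Alex's series):

static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
						    struct pglist_data *pgdat)
{
	return mem_cgroup_lruvec(page_memcg(page), pgdat);
}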



Re: [PATCH v2 bpf-next 1/5] bpf: add in-kernel split BTF support

2020-11-06 Thread Song Liu



> On Nov 6, 2020, at 3:02 PM, Andrii Nakryiko  wrote:
> 
> Adjust in-kernel BTF implementation to support a split BTF mode of operation.
> Changes are mostly mirroring libbpf split BTF changes, with the exception of
> start_id being 0 for in-kernel implementation due to simpler read-only mode.
> 
> Otherwise, for split BTF logic, most of the logic of jumping to base BTF,
> where necessary, is encapsulated in a few helper functions. Type numbering and
> string offset in a split BTF are logically continuing where base BTF ends, so
> most of the high-level logic is kept without changes.
> 
> Type verification and size resolution is only doing an added resolution of new
> split BTF types and relies on already cached size and type resolution results
> in the base BTF.
> 
> Signed-off-by: Andrii Nakryiko 

[...]

> 
> @@ -600,8 +618,15 @@ static const struct btf_kind_operations 
> *btf_type_ops(const struct btf_type *t)
> 
> static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
> {
> - return BTF_STR_OFFSET_VALID(offset) &&
> - offset < btf->hdr.str_len;
> + if (!BTF_STR_OFFSET_VALID(offset))
> + return false;
> +again:
> + if (offset < btf->start_str_off) {
> + btf = btf->base_btf;
> + goto again;

Can we do a while loop instead of "goto again;"?

> + }
> + offset -= btf->start_str_off;
> + return offset < btf->hdr.str_len;
> }
> 
> static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
> @@ -615,10 +640,25 @@ static bool __btf_name_char_ok(char c, bool first, bool 
> dot_ok)
>   return true;
> }
> 
> +static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
> +{
> +again:
> + if (offset < btf->start_str_off) {
> + btf = btf->base_btf;
> + goto again;
> + }

Maybe add a btf_find_base_btf(btf, offset) helper for this logic?

> +
> + offset -= btf->start_str_off;
> + if (offset < btf->hdr.str_len)
> + return &btf->strings[offset];
> +
> + return NULL;
> +}
> +

[...]

> }
> 
> const char *btf_name_by_offset(const struct btf *btf, u32 offset)
> {
> - if (offset < btf->hdr.str_len)
> - return &btf->strings[offset];
> -
> - return NULL;
> + return btf_str_by_offset(btf, offset);
> }

IIUC, btf_str_by_offset() and btf_name_by_offset() are identical. Can we
just keep btf_name_by_offset()?

> 
> const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
> {
> - if (type_id > btf->nr_types)
> - return NULL;
> +again:
> + if (type_id < btf->start_id) {
> + btf = btf->base_btf;
> + goto again;
> + }

ditto, goto again..

[...]




Re: [PATCH 4/9 next] fs/io_uring Don't use the return value from import_iovec().

2020-11-06 Thread Pavel Begunkov
On 15/09/2020 15:55, David Laight wrote:
> 
> This is the only code that relies on import_iovec() returning
> iter.count on success.
> This allows a better interface to import_iovec().

Seems this got nowhere. I'll pick it and send with some other
patches to Jens.

> Signed-off-by: David Laight 
> ---
>  fs/io_uring.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 3790c7fe9fee..0df43882e4b3 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2824,7 +2824,7 @@ static ssize_t __io_import_iovec(int rw, struct 
> io_kiocb *req,
>  
>   ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
>   *iovec = NULL;
> - return ret < 0 ? ret : sqe_len;
> + return ret;
>   }
>  
>   if (req->flags & REQ_F_BUFFER_SELECT) {
> @@ -2853,7 +2853,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb 
> *req,
>   if (!req->io)
>   return __io_import_iovec(rw, req, iovec, iter, needs_lock);
>   *iovec = NULL;
> - return iov_iter_count(&req->io->rw.iter);
> + return 0;
>  }
>  
>  static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
> @@ -3123,7 +3123,7 @@ static int io_read(struct io_kiocb *req, bool 
> force_nonblock,
>   if (ret < 0)
>   return ret;
>   iov_count = iov_iter_count(iter);
> - io_size = ret;
> + io_size = iov_count;
>   req->result = io_size;
>   ret = 0;
>  
> @@ -3246,7 +3246,7 @@ static int io_write(struct io_kiocb *req, bool 
> force_nonblock,
>   if (ret < 0)
>   return ret;
>   iov_count = iov_iter_count(iter);
> - io_size = ret;
> + io_size = iov_count;
>   req->result = io_size;
>  
>   /* Ensure we clear previously set non-block flag */
> 

-- 
Pavel Begunkov


[PATCH v2] x86/xen: don't unbind uninitialized lock_kicker_irq

2020-11-06 Thread Brian Masney
When booting a hyperthreaded system with the kernel parameter
'mitigations=auto,nosmt', the following warning occurs:

WARNING: CPU: 0 PID: 1 at drivers/xen/events/events_base.c:1112 
unbind_from_irqhandler+0x4e/0x60
...
Hardware name: Xen HVM domU, BIOS 4.2.amazon 08/24/2006
...
Call Trace:
 xen_uninit_lock_cpu+0x28/0x62
 xen_hvm_cpu_die+0x21/0x30
 takedown_cpu+0x9c/0xe0
 ? trace_suspend_resume+0x60/0x60
 cpuhp_invoke_callback+0x9a/0x530
 _cpu_up+0x11a/0x130
 cpu_up+0x7e/0xc0
 bringup_nonboot_cpus+0x48/0x50
 smp_init+0x26/0x79
 kernel_init_freeable+0xea/0x229
 ? rest_init+0xaa/0xaa
 kernel_init+0xa/0x106
 ret_from_fork+0x35/0x40

The secondary CPUs are not activated with the nosmt mitigations and only
the primary thread on each CPU core is used. In this situation,
xen_hvm_smp_prepare_cpus(), and more importantly xen_init_lock_cpu(), are
not called, so the lock_kicker_irq is not initialized for the secondary
CPUs. Let's fix this by exiting early in xen_uninit_lock_cpu() if the
irq is not set to avoid the warning from above for each secondary CPU.

Signed-off-by: Brian Masney 
---
Changes since v1:
- Remove duplicate per_cpu() call and pass in irq variable.
- Changed subject from 'x86/xen: fix warning when running with nosmt
  mitigations'
- Shorten code comment

 arch/x86/xen/spinlock.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 799f4eba0a62..043c73dfd2c9 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -93,10 +93,20 @@ void xen_init_lock_cpu(int cpu)
 
 void xen_uninit_lock_cpu(int cpu)
 {
+   int irq;
+
if (!xen_pvspin)
return;
 
-   unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
+   /*
+* When booting the kernel with 'mitigations=auto,nosmt', the secondary
+* CPUs are not activated, and lock_kicker_irq is not initialized.
+*/
+   irq = per_cpu(lock_kicker_irq, cpu);
+   if (irq == -1)
+   return;
+
+   unbind_from_irqhandler(irq, NULL);
per_cpu(lock_kicker_irq, cpu) = -1;
kfree(per_cpu(irq_name, cpu));
per_cpu(irq_name, cpu) = NULL;
-- 
2.26.2



Re: [PATCH v4 2/5] arm64, numa: Change the numa init functions name to be generic

2020-11-06 Thread Atish Patra
On Fri, Nov 6, 2020 at 11:08 AM Catalin Marinas  wrote:
>
> On Fri, Nov 06, 2020 at 09:33:14AM -0800, Atish Patra wrote:
> > On Fri, Nov 6, 2020 at 9:14 AM Catalin Marinas  
> > wrote:
> > > On Mon, Oct 05, 2020 at 05:17:49PM -0700, Atish Patra wrote:
> > > > diff --git a/arch/arm64/kernel/acpi_numa.c 
> > > > b/arch/arm64/kernel/acpi_numa.c
> > > > index 7ff800045434..96502ff92af5 100644
> > > > --- a/arch/arm64/kernel/acpi_numa.c
> > > > +++ b/arch/arm64/kernel/acpi_numa.c
> > > > @@ -117,16 +117,3 @@ void __init acpi_numa_gicc_affinity_init(struct 
> > > > acpi_srat_gicc_affinity *pa)
> > > >
> > > >   node_set(node, numa_nodes_parsed);
> > > >  }
> > > > -
> > > > -int __init arm64_acpi_numa_init(void)
> > > > -{
> > > > - int ret;
> > > > -
> > > > - ret = acpi_numa_init();
> > > > - if (ret) {
> > > > - pr_info("Failed to initialise from firmware\n");
> > > > - return ret;
> > > > - }
> > > > -
> > > > - return srat_disabled() ? -EINVAL : 0;
> > > > -}
> > >
> > > I think it's better if arm64_acpi_numa_init() and arm64_numa_init()
> > > remained in the arm64 code. It's not really much code to be shared.
> >
> > RISC-V will probably support ACPI one day. The idea is to avoid having
> > to do this exercise again in the future.
> > Moreover, there will be arch_numa_init which will be used by RISC-V
> > and there will be arm64_numa_init
> > used by arm64. However, if you feel strongly about it, I am happy to
> > move back those two functions to arm64.
>
> I don't have a strong view on this, only if there's a risk at some point
> of the implementations diverging (e.g. quirks). We can revisit it if
> that happens.
>

Sure. I seriously hope we don't have to deal with arch specific quirks
in future.

> It may be worth swapping patches 1 and 2 so that you don't have an
> arm64_* function in the core code after the first patch (more of a
> nitpick). Either way, feel free to add my ack on both patches:
>

Sure. I will swap 1 & 2 and resend the series.

> Acked-by: Catalin Marinas 

Thanks.

-- 
Regards,
Atish


Re: [PATCH v9 2/7] rcu/segcblist: Add counters to segcblist datastructure

2020-11-06 Thread Paul E. McKenney
On Fri, Nov 06, 2020 at 07:18:47PM -0500, Joel Fernandes wrote:
> On Fri, Nov 06, 2020 at 07:01:57PM -0500, Joel Fernandes wrote:
> > On Wed, Nov 04, 2020 at 09:01:33AM -0800, Paul E. McKenney wrote:
> > 
> > > A casual reader might be forgiven for being confused by the combination
> > > of "Return" in the above comment and the "void" function type below.
> > > So shouldn't this comment be something like "Add the specified number
> > > of callbacks to the specified segment..."?
> > 
> > You are right, sorry and will fix it.
> > 
> > > > @@ -330,11 +342,16 @@ void rcu_segcblist_extract_pend_cbs(struct 
> > > > rcu_segcblist *rsclp,
> > > >  
> > > > if (!rcu_segcblist_pend_cbs(rsclp))
> > > > return; /* Nothing to do. */
> > > > +   rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_WAIT_TAIL) +
> > > > +   rcu_segcblist_get_seglen(rsclp, 
> > > > RCU_NEXT_READY_TAIL) +
> > > > +   rcu_segcblist_get_seglen(rsclp, RCU_NEXT_TAIL);
> > > 
> > > This should be a "for" loop.  Yes, the number and names of the segments
> > > hasn't changed for a good long time, but nothing like code as above to
> > > inspire Murphy to more mischief.  :-/
> > > 
> > > Actually, why not put the summation in the existing "for" loop below?
> > > That would save a line of code in addition to providing less inspiration
> > > for Mr. Murphy.
> > 
> > I can do that. Actually Frederic suggested the same thing but I was 
> > reluctant
> > as I felt it did not give much LOC benefit. Will revisit it.
> 
> It reduces 1 line of code :) I changed it to the below, will update the patch:

Thank you!  And yes, I am much more concerned about the constraints on
Mr. Murphy than on the lines of code.  ;-)

Thanx, Paul

> ---8<---
> 
> diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> index 9b43d686b1f3..bff9b2253e50 100644
> --- a/kernel/rcu/rcu_segcblist.c
> +++ b/kernel/rcu/rcu_segcblist.c
> @@ -101,7 +101,7 @@ static void rcu_segcblist_set_seglen(struct rcu_segcblist 
> *rsclp, int seg, long
>   WRITE_ONCE(rsclp->seglen[seg], v);
>  }
>  
> -/* Return number of callbacks in a segment of the segmented callback list. */
> +/* Increase the numeric length of a segment by a specified amount. */
>  static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, 
> long v)
>  {
>   WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v);
> @@ -406,13 +406,12 @@ void rcu_segcblist_extract_pend_cbs(struct 
> rcu_segcblist *rsclp,
>  
>   if (!rcu_segcblist_pend_cbs(rsclp))
>   return; /* Nothing to do. */
> - rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_WAIT_TAIL) +
> - rcu_segcblist_get_seglen(rsclp, RCU_NEXT_READY_TAIL) +
> - rcu_segcblist_get_seglen(rsclp, RCU_NEXT_TAIL);
> + rclp->len = 0;
>   *rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
>   rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
>   WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
>   for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) {
> + rclp->len += rcu_segcblist_get_seglen(rsclp, i);
>   WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
>   rcu_segcblist_set_seglen(rsclp, i, 0);
>   }


Re: [Y2038][time namespaces] Question regarding CLOCK_REALTIME support plans in Linux time namespaces

2020-11-06 Thread Thomas Gleixner
On Thu, Nov 05 2020 at 12:25, Carlos O'Donell wrote:
> On 10/30/20 9:38 PM, Thomas Gleixner wrote:
> If kata grows up quickly perhaps this entire problem becomes solved, but until
> then I continue to have a testing need for a distinct CLOCK_REALTIME in a
> time namespace (and it need not be unconditional, if I have to engage magic
> then I'm happy to do that).

Conditional, that might be a way to go.

Would CONFIG_DEBUG_DISTORTED_CLOCK_REALTIME be a way to go? IOW,
something which is clearly in the debug section of the kernel which won't
get turned on by distros (*cough*) and comes with a description that any
bug reports against it vs. time correctness are going to be ignored.
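
Something like the below, say (naming, dependencies and wording all
illustrative only):

config DEBUG_DISTORTED_CLOCK_REALTIME
	bool "Allow distorted CLOCK_REALTIME in time namespaces (testing only)"
	depends on TIME_NS && DEBUG_KERNEL
	help
	  Allow a time namespace to apply an offset to CLOCK_REALTIME for
	  testing purposes. This deliberately violates the correctness
	  guarantees of CLOCK_REALTIME. Bug reports about time behaviour
	  with this option enabled will be ignored.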

> * Adding CLOCK_REALTIME to the kernel is a lot of work given the expected
>   guarantees for a local system.

Correct.

> * CLOCK_REALTIME is an expensive resource to maintain, even more expensive
>   than other resources where the kernel can balance their usage.

Correct.

> * On balance it would be better to use vm or vm+containers e.g. kata as a
>   solution to having CLOCK_REALTIME distinct in the container.

That'd be the optimal solution, but the above might be a middle ground.

Thanks,

tglx


Re: [PATCH] x86/xen: fix warning when running with nosmt mitigations

2020-11-06 Thread boris . ostrovsky


On 11/5/20 7:47 PM, Brian Masney wrote:
> On Thu, Nov 05, 2020 at 07:35:29PM -0500, Brian Masney wrote:
>> diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
>> index 799f4eba0a62..4a052459a08e 100644
>> --- a/arch/x86/xen/spinlock.c
>> +++ b/arch/x86/xen/spinlock.c
>> @@ -93,9 +93,24 @@ void xen_init_lock_cpu(int cpu)
>>  
>>  void xen_uninit_lock_cpu(int cpu)
>>  {
>> +int irq;
>> +
>>  if (!xen_pvspin)
>>  return;
>>  
>> +/*
>> + * When booting the kernel with 'mitigations=auto,nosmt', the secondary
>> + * CPUs are not activated and only the primary thread on each CPU core
>> + * is used. In this situation, xen_hvm_smp_prepare_cpus(), and more
>> + * importantly xen_init_lock_cpu(), is not called, so the
>> + * lock_kicker_irq is not initialized for the secondary CPUs. Let's
>> + * exit early if the irq is not set to avoid a warning in the console
>> + * log.
>> + */
>> +irq = per_cpu(lock_kicker_irq, cpu);
>> +if (irq == -1)
>> +return;
>> +
>>  unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
> As soon as I saw this on lore, I saw that I should have passed the irq
> variable to unbind_from_irqhandler() rather than doing another per_cpu()
> lookup. I'll wait for feedback about the general approach before posting
> a v2.


This looks good. I'd shorten the comment though: your commit message already 
describes the scenario. And change the subject to something like "Don't unbind 
uninitialized lock_kicker_irq".


-boris



Re: [PATCH] interconnect: qcom: msm8974: Prevent integer overflow in rate

2020-11-06 Thread Brian Masney
On Fri, Nov 06, 2020 at 04:48:47PM +0200, Georgi Djakov wrote:
> When sync_state support got introduced recently, by default we try to
> set the NoCs to run initially at maximum rate. But as these values are
> aggregated, we may end up with a really big clock rate value, which is
> then converted from "u64" to "long" during the clock rate rounding.
> But on 32-bit platforms this may result in an overflow. Fix it by making
> sure that the rate is within range.
> 
> Reported-by: Luca Weiss 
> Fixes: b1d681d8d324 ("interconnect: Add sync state support")
> Signed-off-by: Georgi Djakov 

Reviewed-by: Brian Masney 
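
For readers following along, the shape of the fix is a clamp on the
aggregated u64 before it reaches the clk framework -- a sketch, with
agg_avg/agg_peak as assumed names, not the literal patch:

	u64 rate = max(agg_avg, agg_peak);

	/* clk_round_rate() rounds through a long; clamp to avoid
	 * wrapping on 32-bit. */
	rate = min_t(u64, rate, LONG_MAX);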



RE: [PATCH v4 06/17] PCI: add SIOV and IMS capability detection

2020-11-06 Thread Thomas Gleixner
On Fri, Nov 06 2020 at 09:48, Kevin Tian wrote:
>> From: Jason Gunthorpe 
>> On Wed, Nov 04, 2020 at 01:34:08PM +, Tian, Kevin wrote:
>> The interrupt controller is responsible to create an addr/data pair
>> for an interrupt message. It sets the message format and ensures it
>> routes to the proper CPU interrupt handler. Everything about the
>> addr/data pair is owned by the platform interrupt controller.
>> 
>> Devices do not create interrupts. They only trigger the addr/data pair
>> the platform gives them.
>
> I guess that we may just view it from different angles. On x86 platform,
> an MSI/IMS capable device directly composes interrupt messages, with
> the addr/data pair filled by the OS. If there is no IOMMU remapping enabled in
> the middle, the message just hits the CPU. Your description possibly
> is from software side, e.g. describing the hierarchical IRQ domain
> concept?

No. The device composes nothing. If the interrupt is raised in the
device then the MSI block sends the message which was composed by the OS
and stored in the device's message store. For PCI/MSI that's the MSI or
MSIX table and for IMS that's either on device memory (as IDXD uses) or
some completely different location which Jason described.

This has absolutely nothing to do with the X86 platform. MSI is an
architecture-independent mechanism: Send whatever the OS put into the
storage to raise an interrupt in the CPU. The device does not know
whether that message is going to be intercepted by an interrupt
remapping unit or not.

Stop claiming that any of this has anything to do with x86. It has
absolutely nothing to do with x86 and looking at MSI from an x86
perspective instead of looking at it from the architecture agnostic
technical reality of MSI is the reason why we have this discussion at
all.

We had a similar discussion vs. the way how IMS interrupts have to be
dealt with in terms of irq domains. Can you finally stop looking at
everything as a big x86/intel/platform lump and understand that things
are very well structured and separated both at the hardware and at the
software level? 

> Do you mind providing the link? There were lots of discussions between
> you and Thomas. I failed to locate the exact mail when searching above
> keywords. 

In this thread: 20200821002424.119492...@linutronix.de and you were on
Cc

Thanks,

tglx




Re: [PATCH v9 5/7] rcu/segcblist: Remove useless rcupdate.h include

2020-11-06 Thread Joel Fernandes
On Thu, Nov 05, 2020 at 06:28:10AM -0800, Paul E. McKenney wrote:
> On Wed, Nov 04, 2020 at 07:48:23PM -0800, Paul E. McKenney wrote:
> > On Tue, Nov 03, 2020 at 09:26:01AM -0500, Joel Fernandes (Google) wrote:
> > > Signed-off-by: Joel Fernandes (Google) 
> > 
> > This one looks fine, but depends on the earlier "rcu/segcblist: Add
> > counters to segcblist datastructure" patch, which also changes the list
> > of #include directives for this file.  It manages to avoid conflicting
> > with "rcu/trace: Add tracing for how segcb list changes", despite this
> > one also changing the #include directives.
> > 
> > I am testing it just out of curiosity, but it might make more sense
> > to fold this one into "rcu/segcblist: Add counters to segcblist
> > datastructure".
> 
> And it does pass light rcutorture.  ;-)

Cool, I squashed it into 2/7 and updated my tree.

thanks,

 - Joel

>   Thanx, Paul
> 
> > > ---
> > >  kernel/rcu/rcu_segcblist.c | 1 -
> > >  1 file changed, 1 deletion(-)
> > > 
> > > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> > > index 2a03949d0b82..e9e72d72f7a6 100644
> > > --- a/kernel/rcu/rcu_segcblist.c
> > > +++ b/kernel/rcu/rcu_segcblist.c
> > > @@ -10,7 +10,6 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > -#include 
> > >  #include 
> > >  
> > >  #include "rcu_segcblist.h"
> > > -- 
> > > 2.29.1.341.ge80a0c044ae-goog
> > > 


Re: [PATCH v9 6/7] rcu/tree: segcblist: Remove redundant smp_mb()s

2020-11-06 Thread Joel Fernandes
On Wed, Nov 04, 2020 at 07:57:13PM -0800, Paul E. McKenney wrote:
> On Tue, Nov 03, 2020 at 09:26:02AM -0500, Joel Fernandes (Google) wrote:
> > This memory barrier is not needed as rcu_segcblist_add_len() already
> > includes a memory barrier *before* and *after* the length of the list is
> > updated.
> > 
> > Same reasoning for rcu_segcblist_enqueue().
> 
> I suggest a commit log like the following:
> 
> 
> 
> The full memory barriers in rcu_segcblist_enqueue() and in rcu_do_batch()
> are not needed because rcu_segcblist_add_len(), and thus also
> rcu_segcblist_inc_len(), already includes a memory barrier *before*
> and *after* the length of the list is updated.
> 
> This commit therefore removes these redundant smp_mb() invocations.
> 
> 
> 
> Other than that, looks good!  I could hand-apply it, but that
> would just cause more churn with the addition of the call to
> rcu_segcblist_inc_seglen().  So could you please update the commit log
> when you repost, whether to the mailing list or from your git tree?

Done, I updated it in my tree. I will share the link to tree on IRC.

thanks,

 - Joel

> 
>   Thanx, Paul
> 
> > Reviewed-by: Frederic Weisbecker 
> > Signed-off-by: Joel Fernandes (Google) 
> > ---
> >  kernel/rcu/rcu_segcblist.c | 1 -
> >  kernel/rcu/tree.c  | 1 -
> >  2 files changed, 2 deletions(-)
> > 
> > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> > index e9e72d72f7a6..d96272e8d604 100644
> > --- a/kernel/rcu/rcu_segcblist.c
> > +++ b/kernel/rcu/rcu_segcblist.c
> > @@ -268,7 +268,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
> >struct rcu_head *rhp)
> >  {
> > rcu_segcblist_inc_len(rsclp);
> > -   smp_mb(); /* Ensure counts are updated before callback is enqueued. */
> > rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL);
> > rhp->next = NULL;
> > WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index f6c6653b3ec2..fb2a5ac4a59c 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -2525,7 +2525,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
> >  
> > /* Update counts and requeue any remaining callbacks. */
> > rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
> > -   smp_mb(); /* List handling before counting for rcu_barrier(). */
> > rcu_segcblist_add_len(&rdp->cblist, -count);
> >  
> > /* Reinstate batch limit if we have worked down the excess. */
> > -- 
> > 2.29.1.341.ge80a0c044ae-goog
> > 


Re: [PATCH v22 12/23] LSM: Specify which LSM to display

2020-11-06 Thread Casey Schaufler
On 11/5/2020 1:22 AM, Greg KH wrote:
> On Wed, Nov 04, 2020 at 03:41:03PM -0800, Casey Schaufler wrote:
>> Create a new entry "display" in the procfs attr directory for
>> controlling which LSM security information is displayed for a
>> process. A process can only read or write its own display value.
>>
>> The name of an active LSM that supplies hooks for
>> human readable data may be written to "display" to set the
>> value. The name of the LSM currently in use can be read from
>> "display". At this point there can only be one LSM capable
>> of display active. A helper function lsm_task_display() is
>> provided to get the display slot for a task_struct.
>>
>> Setting the "display" requires that all security modules using
>> setprocattr hooks allow the action. Each security module is
>> responsible for defining its policy.
>>
>> AppArmor hook provided by John Johansen 
>> SELinux hook provided by Stephen Smalley 
>>
>> Reviewed-by: Kees Cook 
>> Acked-by: Stephen Smalley 
>> Acked-by: Paul Moore 
>> Signed-off-by: Casey Schaufler 
>> Cc: linux-...@vger.kernel.org
>> ---
>>  fs/proc/base.c   |   1 +
>>  include/linux/lsm_hooks.h|  17 +++
>>  security/apparmor/include/apparmor.h |   3 +-
>>  security/apparmor/lsm.c  |  32 +
>>  security/security.c  | 169 ---
>>  security/selinux/hooks.c |  11 ++
>>  security/selinux/include/classmap.h  |   2 +-
>>  security/smack/smack_lsm.c   |   7 ++
>>  8 files changed, 223 insertions(+), 19 deletions(-)
>>
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index 0f707003dda5..7432f24f0132 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -2806,6 +2806,7 @@ static const struct pid_entry attr_dir_stuff[] = {
>>  ATTR(NULL, "fscreate",  0666),
>>  ATTR(NULL, "keycreate", 0666),
>>  ATTR(NULL, "sockcreate",0666),
>> +ATTR(NULL, "display",   0666),
> That's a vague name, any chance it can be more descriptive?

Sure. How about lsm_display, or display_lsm? I wouldn't say that
any of the files in /proc/*/attr have especially descriptive names,
but that's hardly an excuse.

> And where is the Documentation/ABI/ entries for all of this, how does
> userspace know what these things are, and how to use them?

I'll add ABI descriptions and move some of the lsm.rst up from where it
is later in the patchset.

>
> thanks,
>
> greg k-h


Re: [PATCH v9 2/7] rcu/segcblist: Add counters to segcblist datastructure

2020-11-06 Thread Joel Fernandes
On Fri, Nov 06, 2020 at 07:01:57PM -0500, Joel Fernandes wrote:
> On Wed, Nov 04, 2020 at 09:01:33AM -0800, Paul E. McKenney wrote:
> 
> > A casual reader might be forgiven for being confused by the combination
> > of "Return" in the above comment and the "void" function type below.
> > So shouldn't this comment be something like "Add the specified number
> > of callbacks to the specified segment..."?
> 
> You are right, sorry and will fix it.
> 
> > > @@ -330,11 +342,16 @@ void rcu_segcblist_extract_pend_cbs(struct 
> > > rcu_segcblist *rsclp,
> > >  
> > >   if (!rcu_segcblist_pend_cbs(rsclp))
> > >   return; /* Nothing to do. */
> > > + rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_WAIT_TAIL) +
> > > + rcu_segcblist_get_seglen(rsclp, RCU_NEXT_READY_TAIL) +
> > > + rcu_segcblist_get_seglen(rsclp, RCU_NEXT_TAIL);
> > 
> > This should be a "for" loop.  Yes, the number and names of the segments
> > hasn't changed for a good long time, but nothing like code as above to
> > inspire Murphy to more mischief.  :-/
> > 
> > Actually, why not put the summation in the existing "for" loop below?
> > That would save a line of code in addition to providing less inspiration
> > for Mr. Murphy.
> 
> I can do that. Actually Frederic suggested the same thing but I was reluctant
> as I felt it did not give much LOC benefit. Will revisit it.

It reduces 1 line of code :) I changed it to the below, will update the patch:

---8<---

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 9b43d686b1f3..bff9b2253e50 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -101,7 +101,7 @@ static void rcu_segcblist_set_seglen(struct rcu_segcblist 
*rsclp, int seg, long
WRITE_ONCE(rsclp->seglen[seg], v);
 }
 
-/* Return number of callbacks in a segment of the segmented callback list. */
+/* Increase the numeric length of a segment by a specified amount. */
 static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, 
long v)
 {
WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v);
@@ -406,13 +406,12 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist 
*rsclp,
 
if (!rcu_segcblist_pend_cbs(rsclp))
return; /* Nothing to do. */
-   rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_WAIT_TAIL) +
-   rcu_segcblist_get_seglen(rsclp, RCU_NEXT_READY_TAIL) +
-   rcu_segcblist_get_seglen(rsclp, RCU_NEXT_TAIL);
+   rclp->len = 0;
*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) {
+   rclp->len += rcu_segcblist_get_seglen(rsclp, i);
WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
rcu_segcblist_set_seglen(rsclp, i, 0);
}


Re: [PATCH v8 17/18] scsi: megaraid_sas: Added support for shared host tagset for cpuhotplug

2020-11-06 Thread Qian Cai
On Sat, 2020-11-07 at 00:55 +0530, Sumit Saxena wrote:
> I am able to hit the boot hang and similar kind of stack traces as
> reported by Qian with shared .config on x86 machine.
> In my case the system boots after a hang of 40-45 mins. Qian, is it
> true for you as well?
I don't know. I had never waited for that long.



Re: [RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()

2020-11-06 Thread Jason Gunthorpe
On Fri, Nov 06, 2020 at 01:03:26PM -0700, Logan Gunthorpe wrote:
> I don't think a function like that will work for the p2pmem use case. In
> order to implement proper page freeing I expect I'll need a loop around
> the allocator and vm_insert_mixed()... Something roughly like:
> 
> for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
> 	vaddr = pci_alloc_p2pmem(pdev, PAGE_SIZE);
> 	ret = vmf_insert_mixed(vma, addr,
> 			__pfn_to_pfn_t(virt_to_pfn(vaddr), PFN_DEV | PFN_MAP));
> }
> 
> That way we can call pci_free_p2pmem() when a page's ref count goes to
> zero. I suspect your use case will need to do something similar.

Yes, but I would say the pci_alloc_p2pmem() layer should be able to
free pages on a page-by-page basis so you don't have to do stuff like
the above.

There is often a lot of value in having physical contiguous addresses,
so allocating page by page as well seems poor.
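
I.e., something closer to (rough sketch; error handling and freeing on
VMA close elided):

	unsigned long addr;
	size_t size = vma->vm_end - vma->vm_start;
	void *vaddr = pci_alloc_p2pmem(pdev, size);
	vm_fault_t ret;

	for (addr = vma->vm_start; addr < vma->vm_end;
	     addr += PAGE_SIZE, vaddr += PAGE_SIZE)
		ret = vmf_insert_mixed(vma, addr,
				       __pfn_to_pfn_t(virt_to_pfn(vaddr),
						      PFN_DEV | PFN_MAP));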

Jason


Re: [PATCH] mm/gup_benchmark: GUP_BENCHMARK depends on DEBUG_FS

2020-11-06 Thread John Hubbard

On 11/4/20 2:05 AM, Barry Song wrote:

Without DEBUG_FS, all the code in gup_benchmark becomes meaningless.
To be sure, the kernel provides debugfs stubs while DEBUG_FS is disabled, but
the point here is that GUP_BENCHMARK can do nothing without DEBUG_FS.

Cc: John Hubbard 
Cc: Ralph Campbell 
Inspired-by: John Garry 
Signed-off-by: Barry Song 
---
  * inspired by John's comment in this patch:
  
https://lore.kernel.org/linux-iommu/184797b8-512e-e3da-fae7-25c7d6626...@huawei.com/

  mm/Kconfig | 1 +
  1 file changed, 1 insertion(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index d42423f..91fa923 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -836,6 +836,7 @@ config PERCPU_STATS
  
 config GUP_BENCHMARK
 	bool "Enable infrastructure for get_user_pages() and related calls benchmarking"
+	depends on DEBUG_FS



I think "select DEBUG_FS" is better here. "depends on" has the obnoxious 
behavior
of hiding the choice from you, if the dependencies aren't already met. Whereas 
what
the developer *really* wants is a no-nonsense activation of the choice: "enable
GUP_BENCHMARK and the debug fs that it requires".

So "depends on" is really better for things that you just can't control,
such as the CPU arch you're on, etc.
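
I.e., something like this (sketch):

config GUP_BENCHMARK
	bool "Enable infrastructure for get_user_pages() and related calls benchmarking"
	select DEBUG_FS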

Also note that this will have a minor merge conflict with mmotm, due to
the renaming to GUP_TEST. No big deal though.


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v4 06/17] PCI: add SIOV and IMS capability detection

2020-11-06 Thread Jason Gunthorpe
On Fri, Nov 06, 2020 at 03:47:00PM -0800, Dan Williams wrote:

> Also feel free to straighten me out (Jason or Ashok) if I've botched
> the understanding of this.

It is pretty simple when you get down to it.

We have a new kernel API that Thomas added:

  pci_subdevice_msi_create_irq_domain()

This creates an IRQ domain that hands out addr/data pairs that
trigger interrupts.

On bare metal the addr/data pairs from the IRQ domain are programmed
into the HW in some HW specific way by the device driver that calls
the above function.
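
(Purely illustrative -- the FOO_IMS_* register offsets are made up, since
every IMS device defines its own message store:)

	/* msg is the struct msi_msg composed by the IRQ domain;
	 * slot_base is the device's per-slot message store mapping. */
	writel(msg->address_lo, slot_base + FOO_IMS_ADDR_LO);
	writel(msg->address_hi, slot_base + FOO_IMS_ADDR_HI);
	writel(msg->data,       slot_base + FOO_IMS_DATA);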

On (kvm) virtualization the addr/data pair the IRQ domain hands out
doesn't work. It is some fake thing.

To make this work on normal MSI/MSI-X the VMM implements emulation of
the standard MSI/MSI-X programming and swaps the fake addr/data pair
for a real one obtained from the hypervisor IRQ domain.

To "deal" with this issue the SIOV spec suggests to add a per-device
PCI Capability that says "IMS works". Which means either:
 - This is bare metal, so of course it works
 - The VMM is trapping and emulating whatever the device specific IMS
   programming is.

The idea being that a VMM can never advertise the IMS cap flag to the
guest unless the VMM provides a device specific driver that does device
specific emulation to capture the addr/data pair. Remember IMS doesn't
say how to program the addr/data pair! Every device is unique!

On something like IDXD this emulation is not so hard; on something
like mlx5 this is completely unworkable. Further we never do
emulation on our devices, they always pass native hardware through,
even for SIOV-like cases.

In the end pci_subdevice_msi_create_irq_domain() is a platform
function. Either it should work completely on every device with no
device-specific emulation required in the VMM, or it should not work
at all and return -EOPNOTSUPP.

The only sane way to implement this generically is for the VMM to
provide a hypercall to obtain a real *working* addr/data pair(s) and
then have the platform hand those out from
pci_subdevice_msi_create_irq_domain(). 

All IMS device drivers will work correctly. No VMM device emulation is
ever needed to translate addr/data pairs.

Earlier in this thread Kevin said hyper-v is already working this way,
even for MSI/MSI-X. To me this says it is fundamentally a KVM platform
problem and it should not be solved by PCI capability flags.

Jason


[PATCH v2] Make iwmmxt.S support Clang's integrated assembler

2020-11-06 Thread Jian Cai
This patch uses macros to replace six IWMMXT instructions in iwmmxt.S
that Clang's integrated assembler does not support, while making sure the
GNU assembler still emits the same instructions. This should be easier than
providing full IWMMXT support in Clang.

"Intel Wireless MMX Technology - Developer Guide - August, 2002" should
be referenced for the encoding schemes of these extensions.

Link: https://github.com/ClangBuiltLinux/linux/issues/975

Suggested-by: Nick Desaulniers 
Suggested-by: Ard Biesheuvel 
Signed-off-by: Jian Cai 
---
 arch/arm/kernel/iwmmxt.S | 89 
 arch/arm/kernel/iwmmxt.h | 47 +
 2 files changed, 92 insertions(+), 44 deletions(-)
 create mode 100644 arch/arm/kernel/iwmmxt.h

diff --git a/arch/arm/kernel/iwmmxt.S b/arch/arm/kernel/iwmmxt.S
index 0dcae787b004..d2b4ac06e4ed 100644
--- a/arch/arm/kernel/iwmmxt.S
+++ b/arch/arm/kernel/iwmmxt.S
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include "iwmmxt.h"
 
 #if defined(CONFIG_CPU_PJ4) || defined(CONFIG_CPU_PJ4B)
 #define PJ4(code...)   code
@@ -113,33 +114,33 @@ concan_save:
 
 concan_dump:
 
-   wstrw   wCSSF, [r1, #MMX_WCSSF]
-   wstrw   wCASF, [r1, #MMX_WCASF]
-   wstrw   wCGR0, [r1, #MMX_WCGR0]
-   wstrw   wCGR1, [r1, #MMX_WCGR1]
-   wstrw   wCGR2, [r1, #MMX_WCGR2]
-   wstrw   wCGR3, [r1, #MMX_WCGR3]
+   wstrw   wCSSF, r1, MMX_WCSSF
+   wstrw   wCASF, r1, MMX_WCASF
+   wstrw   wCGR0, r1, MMX_WCGR0
+   wstrw   wCGR1, r1, MMX_WCGR1
+   wstrw   wCGR2, r1, MMX_WCGR2
+   wstrw   wCGR3, r1, MMX_WCGR3
 
 1: @ MUP? wRn
tst r2, #0x2
beq 2f
 
-   wstrd   wR0,  [r1, #MMX_WR0]
-   wstrd   wR1,  [r1, #MMX_WR1]
-   wstrd   wR2,  [r1, #MMX_WR2]
-   wstrd   wR3,  [r1, #MMX_WR3]
-   wstrd   wR4,  [r1, #MMX_WR4]
-   wstrd   wR5,  [r1, #MMX_WR5]
-   wstrd   wR6,  [r1, #MMX_WR6]
-   wstrd   wR7,  [r1, #MMX_WR7]
-   wstrd   wR8,  [r1, #MMX_WR8]
-   wstrd   wR9,  [r1, #MMX_WR9]
-   wstrd   wR10, [r1, #MMX_WR10]
-   wstrd   wR11, [r1, #MMX_WR11]
-   wstrd   wR12, [r1, #MMX_WR12]
-   wstrd   wR13, [r1, #MMX_WR13]
-   wstrd   wR14, [r1, #MMX_WR14]
-   wstrd   wR15, [r1, #MMX_WR15]
+   wstrd   wR0,  r1, MMX_WR0
+   wstrd   wR1,  r1, MMX_WR1
+   wstrd   wR2,  r1, MMX_WR2
+   wstrd   wR3,  r1, MMX_WR3
+   wstrd   wR4,  r1, MMX_WR4
+   wstrd   wR5,  r1, MMX_WR5
+   wstrd   wR6,  r1, MMX_WR6
+   wstrd   wR7,  r1, MMX_WR7
+   wstrd   wR8,  r1, MMX_WR8
+   wstrd   wR9,  r1, MMX_WR9
+   wstrd   wR10, r1, MMX_WR10
+   wstrd   wR11, r1, MMX_WR11
+   wstrd   wR12, r1, MMX_WR12
+   wstrd   wR13, r1, MMX_WR13
+   wstrd   wR14, r1, MMX_WR14
+   wstrd   wR15, r1, MMX_WR15
 
 2: teq r0, #0  @ anything to load?
reteq   lr  @ if not, return
@@ -147,30 +148,30 @@ concan_dump:
 concan_load:
 
@ Load wRn
-   wldrd   wR0,  [r0, #MMX_WR0]
-   wldrd   wR1,  [r0, #MMX_WR1]
-   wldrd   wR2,  [r0, #MMX_WR2]
-   wldrd   wR3,  [r0, #MMX_WR3]
-   wldrd   wR4,  [r0, #MMX_WR4]
-   wldrd   wR5,  [r0, #MMX_WR5]
-   wldrd   wR6,  [r0, #MMX_WR6]
-   wldrd   wR7,  [r0, #MMX_WR7]
-   wldrd   wR8,  [r0, #MMX_WR8]
-   wldrd   wR9,  [r0, #MMX_WR9]
-   wldrd   wR10, [r0, #MMX_WR10]
-   wldrd   wR11, [r0, #MMX_WR11]
-   wldrd   wR12, [r0, #MMX_WR12]
-   wldrd   wR13, [r0, #MMX_WR13]
-   wldrd   wR14, [r0, #MMX_WR14]
-   wldrd   wR15, [r0, #MMX_WR15]
+   wldrd   wR0,  r0, MMX_WR0
+   wldrd   wR1,  r0, MMX_WR1
+   wldrd   wR2,  r0, MMX_WR2
+   wldrd   wR3,  r0, MMX_WR3
+   wldrd   wR4,  r0, MMX_WR4
+   wldrd   wR5,  r0, MMX_WR5
+   wldrd   wR6,  r0, MMX_WR6
+   wldrd   wR7,  r0, MMX_WR7
+   wldrd   wR8,  r0, MMX_WR8
+   wldrd   wR9,  r0, MMX_WR9
+   wldrd   wR10, r0, MMX_WR10
+   wldrd   wR11, r0, MMX_WR11
+   wldrd   wR12, r0, MMX_WR12
+   wldrd   wR13, r0, MMX_WR13
+   wldrd   wR14, r0, MMX_WR14
+   wldrd   wR15, r0, MMX_WR15
 
@ Load wCx
-   wldrw   wCSSF, [r0, #MMX_WCSSF]
-   wldrw   wCASF, [r0, #MMX_WCASF]
-   wldrw   wCGR0, [r0, #MMX_WCGR0]
-   wldrw   wCGR1, [r0, #MMX_WCGR1]
-   wldrw   wCGR2, [r0, #MMX_WCGR2]
-   wldrw   wCGR3, [r0, #MMX_WCGR3]
+   wldrw   wCSSF, r0, MMX_WCSSF
+   wldrw   wCASF, r0, MMX_WCASF
+   wldrw   wCGR0, r0, MMX_WCGR0
+   wldrw   wCGR1, r0, MMX_WCGR1
+   wldrw   wCGR2, r0, MMX_WCGR2
+   wldrw   wCGR3, r0, MMX_WCGR3
 
@ clear CUP/MUP (only if r1 != 0)
teq r1, #0
diff --git a/arch/arm/kernel/iwmmxt.h b/arch/arm/kernel/iwmmxt.h
new file mode 100644
index ..fb627286f5bb
--- /dev/null
+++ b/arch/arm/kernel/iwmmxt.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __IWMMXT_H__
+#define __IWMMXT_H__
+
+.irp b,

Re: [PATCH v9 4/7] rcu/trace: Add tracing for how segcb list changes

2020-11-06 Thread Joel Fernandes
On Wed, Nov 04, 2020 at 03:33:14PM +0100, Frederic Weisbecker wrote:
> On Wed, Nov 04, 2020 at 06:08:07AM -0800, Paul E. McKenney wrote:
> > On Tue, Nov 03, 2020 at 04:17:31PM +0100, Frederic Weisbecker wrote:
> > > On Tue, Nov 03, 2020 at 09:26:00AM -0500, Joel Fernandes (Google) wrote:
> > > > +/*
> > > > + * Return how many CBs each segment along with their gp_seq values.
> > > > + *
> > > > + * This function is O(N) where N is the number of segments. Only used 
> > > > from
> > > > + * tracing code which is usually disabled in production.
> > > > + */
> > > > +#ifdef CONFIG_RCU_TRACE
> > > > +static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
> > > > +int cbcount[RCU_CBLIST_NSEGS],
> > > > +unsigned long gpseq[RCU_CBLIST_NSEGS])
> > > > +{
> > > > +   int i;
> > > > +
> > > > +   for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
> > > > +   cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
> > > > +   gpseq[i] = rsclp->gp_seq[i];
> > > > +   }
> > > > +}
> > > > +
> > > > +void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char 
> > > > *context)
> > > > +{
> > > > +   int cbs[RCU_CBLIST_NSEGS];
> > > > +   unsigned long gps[RCU_CBLIST_NSEGS];
> > > > +
> > > > +   if (!trace_rcu_segcb_stats_enabled())
> > > > +   return;
> > > 
> > > Yes, very good!
> > > 
> > > Paul just told me that RCU_TRACE can be used in production so that 
> > > confirms that we
> > > wanted to avoid this loop of 8 iterations when tracing is disabled.
> > 
> > RCU's "don't try this in production" Kconfig option is PROVE_RCU.
> > 
> > I would be looking for checks that the sum of the segment lengths
> > match the overall ->len or checks that all of the segment lengths
> > are zero when ->cblist is empty to be guarded by something like
> > IS_ENABLED(CONFIG_PROVE_RCU).  Of course, checks of this sort need to
> > be confined to those portions of rcu_do_batch() that are excluding other
> > accesses to ->cblist.
> 
> Right.
> 
> > 
> > But if rcu_segcblist_countseq() is invoked only when a specific trace
> > event is enabled, it should be OK to have it guarded only by RCU_TRACE.
> 
> Indeed, so I think we are good.

Thanks, so the only changes are to patches 2/7 and 4/7 which I will work on.
1/7 was already taken by Paul. For 7/7, it sounds like I did not understand
Paul's reworks on the comments and we're still discussing it; but some
comments are better than none, so I am OK with Paul's version of it.

thanks,

 - Joel



Re: [PATCH v9 2/7] rcu/segcblist: Add counters to segcblist datastructure

2020-11-06 Thread Joel Fernandes
On Wed, Nov 04, 2020 at 09:01:33AM -0800, Paul E. McKenney wrote:

> A casual reader might be forgiven for being confused by the combination
> of "Return" in the above comment and the "void" function type below.
> So shouldn't this comment be something like "Add the specified number
> of callbacks to the specified segment..."?

You are right, sorry and will fix it.

> > @@ -330,11 +342,16 @@ void rcu_segcblist_extract_pend_cbs(struct 
> > rcu_segcblist *rsclp,
> >  
> > if (!rcu_segcblist_pend_cbs(rsclp))
> > return; /* Nothing to do. */
> > +   rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_WAIT_TAIL) +
> > +   rcu_segcblist_get_seglen(rsclp, RCU_NEXT_READY_TAIL) +
> > +   rcu_segcblist_get_seglen(rsclp, RCU_NEXT_TAIL);
> 
> This should be a "for" loop.  Yes, the number and names of the segments
> hasn't changed for a good long time, but nothing like code as above to
> inspire Murphy to more mischief.  :-/
> 
> Actually, why not put the summation in the existing "for" loop below?
> That would save a line of code in addition to providing less inspiration
> for Mr. Murphy.

I can do that. Actually Frederic suggested the same thing but I was reluctant
as I felt it did not give much LOC benefit. Will revisit it.

> 
> > *rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
> > rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
> > WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
> > -   for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
> > +   for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) {
> > WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
> > +   rcu_segcblist_set_seglen(rsclp, i, 0);
> > +   }
> >  }
> >  
> >  /*
> > @@ -345,7 +362,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist 
> > *rsclp,
> > struct rcu_cblist *rclp)
> >  {
> > rcu_segcblist_add_len(rsclp, rclp->len);
> > -   rclp->len = 0;
> 
> You audited the callers, correct?

Yep.

thanks,

 - Joel



Re: [PATCH v5 6/7] IMA: add critical_data to the built-in policy rules

2020-11-06 Thread Lakshmi Ramasubramanian

On 11/6/20 7:37 AM, Lakshmi Ramasubramanian wrote:

Hi Mimi,

Hi Lakshmi, Tushar,

This patch defines a new critical_data builtin policy.  Please update
the Subject line.

On Sun, 2020-11-01 at 14:26 -0800, Tushar Sugandhi wrote:

From: Lakshmi Ramasubramanian 

The IMA hook to measure kernel critical data, namely
ima_measure_critical_data(), could be called before a custom IMA policy
is loaded. For example, SELinux calls ima_measure_critical_data() to
measure its state and policy when they are initialized. This occurs
before a custom IMA policy is loaded, and hence the IMA hook will not
measure the data. A built-in policy is therefore needed to measure
critical data provided by callers before a custom IMA policy is loaded.


^Define a new critical data builtin policy to allow measuring early
kernel integrity critical data before a custom IMA policy is loaded.


I will add the above line in the patch description.

Either remove the references to SELinux or move this patch after the
subsequent patch which measures SELinux critical data.


I will remove the reference to SELinux.
I think it would be better to have this patch before the SELinux 
measurement patch.

Add CRITICAL_DATA to built-in IMA rules if the kernel command line
contains "ima_policy=critical_data". Set the IMA template for this rule
to "ima-buf" since ima_measure_critical_data() measures a buffer.

Signed-off-by: Lakshmi Ramasubramanian 

---
  security/integrity/ima/ima_policy.c | 32 +
  1 file changed, 32 insertions(+)

diff --git a/security/integrity/ima/ima_policy.c 
b/security/integrity/ima/ima_policy.c

index ec99e0bb6c6f..dc8fe969d3fe 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c

@@ -875,6 +884,29 @@ void __init ima_init_policy(void)
    ARRAY_SIZE(default_appraise_rules),
    IMA_DEFAULT_POLICY);
+    if (ima_use_critical_data) {
+    template = lookup_template_desc("ima-buf");
+    if (!template) {
+    ret = -EINVAL;
+    goto out;
+    }
+
+    ret = template_desc_init_fields(template->fmt,
+    &(template->fields),
+    &(template->num_fields));


The default IMA template when measuring buffer data is "ima_buf".   Is
there a reason for allocating and initializing it here and not
deferring it until process_buffer_measurement()?

You are right - good catch.
I will remove the above and validate.

process_buffer_measurement() allocates and initializes "ima-buf" 
template only when the parameter "func" is NONE. Currently, only 
ima_check_blacklist() passes NONE for func when calling 
process_buffer_measurement().


If "func" is anything other than NONE, ima_match_policy() picks
the default IMA template if the IMA policy rule does not specify a template.

We need to add "ima-buf" in the built-in policy for critical_data so 
that the default template is not used for buffer measurement.


Please let me know if I am missing something.
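
For reference, the built-in rule itself is just a one-entry table whose
template pointer gets filled in at init time. Modeled on the other
built-in rule tables in ima_policy.c, it would look roughly like this
(a sketch; the exact flags may differ):

    static struct ima_rule_entry critical_data_rules[] __ro_after_init = {
            {.action = MEASURE, .func = CRITICAL_DATA, .flags = IMA_FUNC},
    };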

thanks,
 -lakshmi

+    if (ret)
+    goto out;
+
+    critical_data_rules[0].template = template;
+    add_rules(critical_data_rules,
+  ARRAY_SIZE(critical_data_rules),
+  IMA_DEFAULT_POLICY);
+    }
+
+out:
+    if (ret)
+    pr_err("%s failed, result: %d\n", __func__, ret);
+
  ima_update_policy_flag();
  }


Re: [GIT PULL] Kselftest fixes update for Linux 5.10-rc3

2020-11-06 Thread pr-tracker-bot
The pull request you sent on Fri, 6 Nov 2020 11:16:07 -0700:

> git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest 
> tags/linux-kselftest-fixes-5.10-rc3

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/03f0f5ad58479ba1374f10680fc836aa21abe8f9

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html


[PATCH] include/linux/huge_mm.h: remove extern keyword

2020-11-06 Thread Ralph Campbell
The external function declarations don't need the "extern" keyword.
Remove it so future changes don't copy the same declaration style.
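
For function declarations, "extern" is implicit, so the two forms below
are equivalent; only object declarations (e.g. "extern int sysctl_foo;")
actually need the keyword:

    extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
    void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);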

Signed-off-by: Ralph Campbell 
---

This applies cleanly to linux-mm 5.10.0-rc2 and is for Andrew's tree.

 include/linux/huge_mm.h | 93 ++---
 1 file changed, 41 insertions(+), 52 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 0365aa97f8e7..6a19f35f836b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -7,43 +7,37 @@
 
 #include  /* only for vma_is_dax() */
 
-extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
-extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
-struct vm_area_struct *vma);
-extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
-extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
-struct vm_area_struct *vma);
+vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
+int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *vma);
+void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
+int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+ struct vm_area_struct *vma);
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
+void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
 #else
 static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 {
 }
 #endif
 
-extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
-extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
- unsigned long addr,
- pmd_t *pmd,
- unsigned int flags);
-extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
-   struct vm_area_struct *vma,
-   pmd_t *pmd, unsigned long addr, unsigned long next);
-extern int zap_huge_pmd(struct mmu_gather *tlb,
-   struct vm_area_struct *vma,
-   pmd_t *pmd, unsigned long addr);
-extern int zap_huge_pud(struct mmu_gather *tlb,
-   struct vm_area_struct *vma,
-   pud_t *pud, unsigned long addr);
-extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
-unsigned long new_addr,
-pmd_t *old_pmd, pmd_t *new_pmd);
-extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-   unsigned long addr, pgprot_t newprot,
-   unsigned long cp_flags);
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+  unsigned long addr, pmd_t *pmd,
+  unsigned int flags);
+bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+  pmd_t *pmd, unsigned long addr, unsigned long next);
+int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t 
*pmd,
+unsigned long addr);
+int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t 
*pud,
+unsigned long addr);
+bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd);
+int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
+   pgprot_t newprot, unsigned long cp_flags);
 vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
   pgprot_t pgprot, bool write);
 
@@ -100,13 +94,13 @@ enum transparent_hugepage_flag {
 struct kobject;
 struct kobj_attribute;
 
-extern ssize_t single_hugepage_flag_store(struct kobject *kobj,
-struct kobj_attribute *attr,
-const char *buf, size_t count,
-enum transparent_hugepage_flag flag);
-extern ssize_t single_hugepage_flag_show(struct kobject *kobj,
-   struct kobj_attribute *attr, char *buf,
-   enum transparent_hugepage_flag flag);
+ssize_t single_hugepage_flag_store(struct kobject *kobj,
+  struct kobj_attribute *attr,
+  const char *buf, size_t count,
+   

Re: [GIT PULL] Ceph fix for 5.10-rc3

2020-11-06 Thread pr-tracker-bot
The pull request you sent on Fri,  6 Nov 2020 20:27:50 +0100:

> https://github.com/ceph/ceph-client.git tags/ceph-for-5.10-rc3

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/659caaf65dc9c7150aa3e80225ec6e66b25ab3ce

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html


Re: [PATCH v4 06/17] PCI: add SIOV and IMS capability detection

2020-11-06 Thread Dan Williams
On Fri, Nov 6, 2020 at 9:51 AM Jason Gunthorpe  wrote:
[..]
> > This is true for IMS as well. But probably not implemented in the kernel as
> > such. From a HW point of view (take idxd for instance) the facility is
> > available to native OS as well. The early RFC supported this for native.
>
> I can't follow what you are trying to say here.

I'm having a hard time following the technical cruxes of this debate.
I grokked your feedback on the original IMS proposal way back at the
beginning of this effort (pre-COVID even!), so maybe I can mediate
here as well. Although, SIOV is that much harder for me to spell than
IMS, so bear with me.

> Dave said the IMS cap was to indicate that the VMM supported emulation
> of IMS so that the VMM can do the MSI addr/data translation as part of
> the emulation.
>
> I'm saying emulation will be too horrible for our devices that don't
> require *any* emulation.

This part I think I understand, i.e. why spend any logic emulating IMS
as MSI since the IMS capability can be a paravirtualized interface
from guest to VMM with none of the compromises that MSI would enforce.
Did I get that right?

> It is a bad architecture. The platform needs to handle this globally
> for all devices, not special hacky emulations things custom made for
> every device out there.

I confess I don't quite understand the shape of what "platform needs
to handle this globally" means, but I understand the desired end
result of "no emulation added where not needed". However, would this
mean that the bare-metal idxd driver can not be used directly in the
guest without modification? For example, as I understand from talking
to Ashok, idxd has some device events like error notification hard
wired to MSI while data path interrupts are IMS. So even if the IMS
side does not hook up MSI emulation doesn't idxd still need MSI
emulation to reuse the bare metal driver directly?

> > Native devices can have both MSIx and IMS capability. But as I
> > understand this isn't how we have partitioned things in SW today. We
> > left IMS only for mdev's. And I agree this would be very useful.
>
> That split is just some decision idxd did, we are thinking about doing
> other things in our devices.

Where does the collision happen between what you need for a clean
implementation of an IMS-like capability (/me misses his "dev-msi"
name that got thrown out in the Thomas rewrite), and emulation needed
to not have VF special casing in the idxd driver?

Also feel free to straighten me out (Jason or Ashok) if I've botched
the understanding of this.


Re: [GIT PULL] SCSI fixes for 5.10-rc2

2020-11-06 Thread pr-tracker-bot
The pull request you sent on Fri, 06 Nov 2020 14:26:05 -0800:

> git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git scsi-fixes

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/d4fc96832f0131c8f2fb067fb01c3007df6d4c9f

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html


Re: [PATCH 0/8] simplify ep_poll

2020-11-06 Thread Linus Torvalds
On Fri, Nov 6, 2020 at 3:17 PM Soheil Hassas Yeganeh
 wrote:
>
> The first patch in the series is a fix for the epoll race in
> presence of timeouts, so that it can be cleanly backported to all
> affected stable kernels.
>
> The rest of the patch series simplify the ep_poll() implementation.
> Some of these simplifications result in minor performance enhancements
> as well.  We have kept these changes under self tests and internal
> benchmarks for a few days, and there are minor (1-2%) performance
> enhancements as a result.

From just looking at the patches (not the end result - I didn't
actually apply them), it looks sane to me.

 Linus


[PATCH 2/2] drm/nouveau/kms/nv50-: Fix clock checking algorithm in nv50_dp_mode_valid()

2020-11-06 Thread Lyude Paul
While I thought I had this correct (since it actually did reject modes
like I expected during testing), Ville Syrjälä from Intel pointed out
that the logic here isn't correct. max_clock refers to the max data rate
supported by the DP encoder, so limiting it to the output of ds_clock
(which refers to the maximum dotclock of the downstream DP device)
doesn't make any sense. Additionally, since we're using the connector's
bpc as the canonical BPC, we should use this in mode_valid until we
support dynamically setting the bpp based on bandwidth constraints.

See
https://lists.freedesktop.org/archives/dri-devel/2020-September/280276.html
for more info.

So, let's rewrite this using Ville's advice.
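
As a worked example of the new check, using the units already in the
driver: a 1920x1080@60 mode has a 148500 kHz dotclock, so at the
connector's 8 bpc (bpp = 24) we get mode_rate = DIV_ROUND_UP(148500 * 24,
8) = 445500, and the mode is rejected only if that exceeds max_rate =
outp->dp.link_nr * outp->dp.link_bw.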

Changes made for stable backport:
* 5.9 didn't use drm_dp_downstream_max_dotclock() yet, so remove that (the
  fix is still important regardless)

v2:
* Ville pointed out I mixed up the dotclock and the link rate. So fix that...
* ...and also rename all the variables in this function to be more appropriately
  labeled so I stop mixing them up.
* Reuse the bpp from the connector for now until we have dynamic bpp selection.
* Use use DIV_ROUND_UP for calculating the mode rate like i915 does, which we
  should also have been doing from the start

Signed-off-by: Lyude Paul 
Fixes: 409d38139b42 ("drm/nouveau/kms/nv50-: Use downstream DP clock limits for 
mode validation")
Cc: Ville Syrjälä 
Cc: Lyude Paul 
Cc: Ben Skeggs 
Signed-off-by: Ben Skeggs 
---
 drivers/gpu/drm/nouveau/nouveau_dp.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dp.c 
b/drivers/gpu/drm/nouveau/nouveau_dp.c
index 40683e1244c3f..9c06d1cc43905 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dp.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dp.c
@@ -114,7 +114,8 @@ nv50_dp_mode_valid(struct drm_connector *connector,
   unsigned *out_clock)
 {
const unsigned min_clock = 25000;
-   unsigned max_clock, clock = mode->clock;
+   unsigned int max_rate, mode_rate, clock = mode->clock;
+   const u8 bpp = connector->display_info.bpc * 3;
 
if (mode->flags & DRM_MODE_FLAG_INTERLACE && !outp->caps.dp_interlace)
return MODE_NO_INTERLACE;
@@ -122,12 +123,13 @@ nv50_dp_mode_valid(struct drm_connector *connector,
if ((mode->flags & DRM_MODE_FLAG_3D_MASK) == 
DRM_MODE_FLAG_3D_FRAME_PACKING)
clock *= 2;
 
-   max_clock = outp->dp.link_nr * outp->dp.link_bw;
-   clock = mode->clock * (connector->display_info.bpc * 3) / 10;
+   max_rate = outp->dp.link_nr * outp->dp.link_bw;
+   mode_rate = DIV_ROUND_UP(clock * bpp, 8);
+   if (mode_rate > max_rate)
+   return MODE_CLOCK_HIGH;
+
if (clock < min_clock)
return MODE_CLOCK_LOW;
-   if (clock > max_clock)
-   return MODE_CLOCK_HIGH;
 
if (out_clock)
*out_clock = clock;
-- 
2.28.0



[PATCH V3 00/10] PKS: Add Protection Keys Supervisor (PKS) support V3

2020-11-06 Thread ira . weiny
From: Ira Weiny 

Changes from V2 [4]
Rebased on tip-tree/core/entry
From Thomas Gleixner
Address bisectability
Drop Patch:
x86/entry: Move nmi entry/exit into common code
From Greg KH
Remove WARN_ON's
From Dan Williams
Add __must_check to pks_key_alloc()
New patch: x86/pks: Add PKS defines and config options
Split from Enable patch to build on through the series
Fix compile errors

Changes from V1
Rebase to TIP master; resolve conflicts and test
Clean up some kernel docs updates missed in V1
Add irqentry_state_t kernel doc for PKRS field
Removed redundant irq_state->pkrs
This is only needed when we add the global state and somehow
ended up in this patch series.  That will come back when we add
the global functionality in.
From Thomas Gleixner
Update commit messages
Add kernel doc for struct irqentry_state_t
From Dave Hansen add flags to pks_key_alloc()

Changes from RFC V3[3]
Rebase to TIP master
Update test error output
Standardize on 'irq_state' for state variables
From Dave Hansen
Update commit messages
Add/clean up comments
Add X86_FEATURE_PKS to disabled-features.h and remove some
explicit CONFIG checks
Move saved_pkrs member of thread_struct
Remove superfluous preempt_disable()
s/irq_save_pks/irq_save_set_pks/
Ensure PKRS is not seen in faults if not configured or not
supported
s/pks_mknoaccess/pks_mk_noaccess/
s/pks_mkread/pks_mk_readonly/
s/pks_mkrdwr/pks_mk_readwrite/
Change pks_key_alloc return to -EOPNOTSUPP when not supported
From Peter Zijlstra
Clean up Attribution
Remove superfluous preempt_disable()
Add union to differentiate exit_rcu/lockdep use in
irqentry_state_t
From Thomas Gleixner
Add preliminary clean up patch and adjust series as needed


Introduce a new page protection mechanism for supervisor pages, Protection Key
Supervisor (PKS).

2 use cases for PKS are being developed, trusted keys and PMEM.  Trusted keys
is a newer use case which is still being explored.  PMEM was submitted as part
of the RFC (v2) series[1].  However, since then it was found that some callers
of kmap() require a global implementation of PKS.  Specifically some users of
kmap() expect mappings to be available to all kernel threads.  While global use
of PKS is rare it needs to be included for correctness.  Unfortunately the
kmap() updates required a large patch series to make the needed changes at the
various kmap() call sites so that patch set has been split out.  Because the
global PKS feature is only required for that use case it will be deferred to
that set as well.[2]  This patch set is being submitted as a precursor to both
of the use cases.

For an overview of the entire PKS ecosystem, a git tree including this series
and 2 proposed use cases can be found here:


https://lore.kernel.org/lkml/20201009195033.3208459-1-ira.we...@intel.com/

https://lore.kernel.org/lkml/20201009201410.3209180-1-ira.we...@intel.com/


PKS enables protections on 'domains' of supervisor pages to limit supervisor
mode access to those pages beyond the normal paging protections.  PKS works in
a similar fashion to user space pkeys, PKU.  As with PKU, supervisor pkeys are
checked in addition to normal paging protections and Access or Writes can be
disabled via a MSR update without TLB flushes when permissions change.  Also
like PKU, a page mapping is assigned to a domain by setting pkey bits in the
page table entry for that mapping.

Access is controlled through a PKRS register which is updated via WRMSR/RDMSR.

XSAVE is not supported for the PKRS MSR.  Therefore the implementation
saves/restores the MSR across context switches and during exceptions.  Nested
exceptions are supported by each exception getting a new PKS state.

For consistent behavior with current paging protections, pkey 0 is reserved and
configured to allow full access via the pkey mechanism, thus preserving the
default paging protections on mappings with the default pkey value of 0.

Other keys (1-15) are allocated by an allocator which prepares us for key
contention from day one.  Kernel users should be prepared for the allocator to
fail either because of key exhaustion or due to PKS not being supported on the
arch and/or CPU instance.

The following are key attributes of PKS.

   1) Fast switching of permissions
1a) Prevents access without page table manipulations
1b) No TLB flushes required
   2) Works on a per thread basis
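
To make the intended usage concrete, here is a minimal sketch of a
hypothetical user ("foo") of the API from patch 08/10; the flags argument
to pks_key_alloc() is assumed to be 0 for "no special behavior":

    static int foo_pkey = -1;

    static int __init foo_init(void)
    {
            foo_pkey = pks_key_alloc("foo", 0);
            if (foo_pkey < 0)
                    return foo_pkey;        /* exhaustion or -EOPNOTSUPP */

            /* map foo's pages with PAGE_KERNEL_PKEY(foo_pkey) ... */

            pks_mk_noaccess(foo_pkey);      /* default: no access */
            return 0;
    }

    static void foo_write(void *prot, const void *src, size_t n)
    {
            pks_mk_readwrite(foo_pkey);     /* open the window ... */
            memcpy(prot, src, n);
            pks_mk_noaccess(foo_pkey);      /* ... and close it again */
    }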

[PATCH 1/2] drm/nouveau/kms/nv50-: Get rid of bogus nouveau_conn_mode_valid()

2020-11-06 Thread Lyude Paul
Ville also pointed out that I got a lot of the logic here wrong as well, whoops.
While I don't think anyone's likely using 3D output with nouveau, the next patch
will make nouveau_conn_mode_valid() make a lot less sense. So, let's just get
rid of it and open-code it like before, while taking care to move the 3D frame
packing calculations on the dot clock into the right place.

Signed-off-by: Lyude Paul 
Fixes: d6a9efece724 ("drm/nouveau/kms/nv50-: Share DP SST mode_valid() handling 
with MST")
Cc: Ville Syrjälä 
Cc:  # v5.8+
Signed-off-by: Ben Skeggs 
---
 drivers/gpu/drm/nouveau/nouveau_connector.c | 36 ++---
 drivers/gpu/drm/nouveau/nouveau_dp.c| 15 ++---
 2 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c 
b/drivers/gpu/drm/nouveau/nouveau_connector.c
index 7674025a4bfe8..1d91d52ee5083 100644
--- a/drivers/gpu/drm/nouveau/nouveau_connector.c
+++ b/drivers/gpu/drm/nouveau/nouveau_connector.c
@@ -1035,29 +1035,6 @@ get_tmds_link_bandwidth(struct drm_connector *connector)
return 112000 * duallink_scale;
 }
 
-enum drm_mode_status
-nouveau_conn_mode_clock_valid(const struct drm_display_mode *mode,
- const unsigned min_clock,
- const unsigned max_clock,
- unsigned int *clock_out)
-{
-   unsigned int clock = mode->clock;
-
-   if ((mode->flags & DRM_MODE_FLAG_3D_MASK) ==
-   DRM_MODE_FLAG_3D_FRAME_PACKING)
-   clock *= 2;
-
-   if (clock < min_clock)
-   return MODE_CLOCK_LOW;
-   if (clock > max_clock)
-   return MODE_CLOCK_HIGH;
-
-   if (clock_out)
-   *clock_out = clock;
-
-   return MODE_OK;
-}
-
 static enum drm_mode_status
 nouveau_connector_mode_valid(struct drm_connector *connector,
 struct drm_display_mode *mode)
@@ -1065,7 +1042,7 @@ nouveau_connector_mode_valid(struct drm_connector 
*connector,
struct nouveau_connector *nv_connector = nouveau_connector(connector);
struct nouveau_encoder *nv_encoder = nv_connector->detected_encoder;
struct drm_encoder *encoder = to_drm_encoder(nv_encoder);
-   unsigned min_clock = 25000, max_clock = min_clock;
+   unsigned int min_clock = 25000, max_clock = min_clock, clock = 
mode->clock;
 
switch (nv_encoder->dcb->type) {
case DCB_OUTPUT_LVDS:
@@ -1094,8 +1071,15 @@ nouveau_connector_mode_valid(struct drm_connector 
*connector,
return MODE_BAD;
}
 
-   return nouveau_conn_mode_clock_valid(mode, min_clock, max_clock,
-NULL);
+   if ((mode->flags & DRM_MODE_FLAG_3D_MASK) == 
DRM_MODE_FLAG_3D_FRAME_PACKING)
+   clock *= 2;
+
+   if (clock < min_clock)
+   return MODE_CLOCK_LOW;
+   if (clock > max_clock)
+   return MODE_CLOCK_HIGH;
+
+   return MODE_OK;
 }
 
 static struct drm_encoder *
diff --git a/drivers/gpu/drm/nouveau/nouveau_dp.c 
b/drivers/gpu/drm/nouveau/nouveau_dp.c
index 8a0f7994e1aeb..40683e1244c3f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dp.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dp.c
@@ -114,18 +114,23 @@ nv50_dp_mode_valid(struct drm_connector *connector,
   unsigned *out_clock)
 {
const unsigned min_clock = 25000;
-   unsigned max_clock, clock;
-   enum drm_mode_status ret;
+   unsigned max_clock, clock = mode->clock;
 
if (mode->flags & DRM_MODE_FLAG_INTERLACE && !outp->caps.dp_interlace)
return MODE_NO_INTERLACE;
 
+   if ((mode->flags & DRM_MODE_FLAG_3D_MASK) == 
DRM_MODE_FLAG_3D_FRAME_PACKING)
+   clock *= 2;
+
max_clock = outp->dp.link_nr * outp->dp.link_bw;
clock = mode->clock * (connector->display_info.bpc * 3) / 10;
+   if (clock < min_clock)
+   return MODE_CLOCK_LOW;
+   if (clock > max_clock)
+   return MODE_CLOCK_HIGH;
 
-   ret = nouveau_conn_mode_clock_valid(mode, min_clock, max_clock,
-   &clock);
if (out_clock)
*out_clock = clock;
-   return ret;
+
+   return MODE_OK;
 }
-- 
2.28.0



Re: [PATCH 2/3] vfio/virqfd: Drain events from eventfd in virqfd_wakeup()

2020-11-06 Thread Alex Williamson
On Tue, 27 Oct 2020 13:55:22 +
David Woodhouse  wrote:

> From: David Woodhouse 
> 
> Don't allow the events to accumulate in the eventfd counter, drain them
> as they are handled.
> 
> Signed-off-by: David Woodhouse 
> ---

Acked-by: Alex Williamson 

Paolo, I assume you'll add this to your queue.  Thanks,

Alex

>  drivers/vfio/virqfd.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
> index 997cb5d0a657..414e98d82b02 100644
> --- a/drivers/vfio/virqfd.c
> +++ b/drivers/vfio/virqfd.c
> @@ -46,6 +46,9 @@ static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned 
> mode, int sync, void
>   __poll_t flags = key_to_poll(key);
>  
>   if (flags & EPOLLIN) {
> + u64 cnt;
> + eventfd_ctx_do_read(virqfd->eventfd, &cnt);
> +
>   /* An event has been signaled, call function */
>   if ((!virqfd->handler ||
>virqfd->handler(virqfd->opaque, virqfd->data)) &&



[PATCH V3 06/10] x86/entry: Preserve PKRS MSR across exceptions

2020-11-06 Thread ira . weiny
From: Ira Weiny 

The PKRS MSR is not managed by XSAVE.  It is preserved through a context
switch but this support leaves exception handling code open to memory
accesses during exceptions.

2 possible places for preserving this state were considered,
irqentry_state_t or pt_regs.[1]  pt_regs was much more complicated and
was potentially fraught with unintended consequences.[2]
irqentry_state_t was already an object being used in the exception
handling and is straightforward.  It also allows any number of nested
states to be tracked, and can eventually be enhanced to store the
reference counting required to support PKS through kmap reentry.

Preserve the current task's PKRS value in irqentry_state_t on exception
entry and restore it on exception exit.

Each nested exception saves and restores its own copy, allowing any
number of levels of exception handling.
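
Schematically, the resulting flow looks roughly like this (a sketch of
the idtentry expansion, not the literal generated code):

    __visible noinstr void some_idt_handler(struct pt_regs *regs)
    {
            irqentry_state_t irq_state;

            /* irq_save_set_pkrs(): save current->thread.saved_pkrs,
             * then write_pkrs(INIT_PKRS_VALUE) */
            irqentry_enter(regs, &irq_state);
            instrumentation_begin();
            __some_idt_handler(regs);       /* handler body */
            instrumentation_end();
            /* irq_restore_pkrs(): write_pkrs(irq_state.thread_pkrs) */
            irqentry_exit(regs, &irq_state);
    }

A nested exception repeats the same save/restore with its own on-stack
irq_state, so each level sees a freshly reset PKRS and restores the
interrupted value on the way out.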

Peter and Thomas both suggested parts of the patch, IDT and NMI respectively.

[1] 
https://lore.kernel.org/lkml/calcetrve1i5jdyzd_bcctxqjn+ze3t38efpgjxn1f577m36...@mail.gmail.com/
[2] https://lore.kernel.org/lkml/874kpxx4jf@nanos.tec.linutronix.de/#t

Cc: Dave Hansen 
Cc: Andy Lutomirski 
Suggested-by: Peter Zijlstra 
Suggested-by: Thomas Gleixner 
Signed-off-by: Ira Weiny 

---
Changes from V1
remove redundant irq_state->pkrs
This value is only needed for the global tracking.  So
it should be included in that patch and not in this one.

Changes from RFC V3
Standardize on 'irq_state' variable name
Per Dave Hansen
irq_save_pkrs() -> irq_save_set_pkrs()
Rebased based on clean up patch by Thomas Gleixner
This includes moving irq_[save_set|restore]_pkrs() to
the core as well.
---
 arch/x86/entry/common.c | 38 +
 arch/x86/include/asm/pkeys_common.h |  5 ++--
 arch/x86/mm/pkeys.c |  2 +-
 include/linux/entry-common.h| 13 ++
 kernel/entry/common.c   | 14 +--
 5 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 87dea56a15d2..1b6a419a6fac 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_XEN_PV
 #include 
@@ -209,6 +210,41 @@ SYSCALL_DEFINE0(ni_syscall)
return -ENOSYS;
 }
 
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+/*
+ * PKRS is a per-logical-processor MSR which overlays additional protection for
+ * pages which have been mapped with a protection key.
+ *
+ * The register is not maintained with XSAVE so we have to maintain the MSR
+ * value in software during context switch and exception handling.
+ *
+ * Context switches save the MSR in the task struct thus taking that value to
+ * other processors if necessary.
+ *
+ * To protect against exceptions having access to this memory we save the
+ * current running value and set the PKRS value for the duration of the
+ * exception.  Thus preventing exception handlers from having the elevated
+ * access of the interrupted task.
+ */
+noinstr void irq_save_set_pkrs(irqentry_state_t *irq_state, u32 val)
+{
+   if (!cpu_feature_enabled(X86_FEATURE_PKS))
+   return;
+
+   irq_state->thread_pkrs = current->thread.saved_pkrs;
+   write_pkrs(INIT_PKRS_VALUE);
+}
+
+noinstr void irq_restore_pkrs(irqentry_state_t *irq_state)
+{
+   if (!cpu_feature_enabled(X86_FEATURE_PKS))
+   return;
+
+   write_pkrs(irq_state->thread_pkrs);
+   current->thread.saved_pkrs = irq_state->thread_pkrs;
+}
+#endif /* CONFIG_ARCH_HAS_SUPERVISOR_PKEYS */
+
 #ifdef CONFIG_XEN_PV
 #ifndef CONFIG_PREEMPTION
 /*
@@ -272,6 +308,8 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct 
pt_regs *regs)
 
inhcall = get_and_clear_inhcall();
if (inhcall && !WARN_ON_ONCE(irq_state.exit_rcu)) {
+   /* Normally called by irqentry_exit, we must restore pkrs here 
*/
+   irq_restore_pkrs(&irq_state);
instrumentation_begin();
irqentry_exit_cond_resched();
instrumentation_end();
diff --git a/arch/x86/include/asm/pkeys_common.h 
b/arch/x86/include/asm/pkeys_common.h
index 801a75615209..11a95e6efd2d 100644
--- a/arch/x86/include/asm/pkeys_common.h
+++ b/arch/x86/include/asm/pkeys_common.h
@@ -27,9 +27,10 @@
 PKR_AD_KEY(13) | PKR_AD_KEY(14) | PKR_AD_KEY(15))
 
 #ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
-void write_pkrs(u32 new_pkrs);
+DECLARE_PER_CPU(u32, pkrs_cache);
+noinstr void write_pkrs(u32 new_pkrs);
 #else
-static inline void write_pkrs(u32 new_pkrs) { }
+static __always_inline void write_pkrs(u32 new_pkrs) { }
 #endif
 
 #endif /*_ASM_X86_PKEYS_INTERNAL_H */
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 76a62419c446..6892d4524868 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -248,7 +248,7 @@ DEFINE_PER_

[PATCH V3 09/10] x86/pks: Enable Protection Keys Supervisor (PKS)

2020-11-06 Thread ira . weiny
From: Fenghua Yu 

Protection Keys for Supervisor pages (PKS) enables fast, hardware thread
specific, manipulation of permission restrictions on supervisor page
mappings.  It uses the same mechanism of Protection Keys as those on
User mappings but applies that mechanism to supervisor mappings using a
supervisor specific MSR.

Kernel users can thus define 'domains' of page mappings which have an
extra level of protection beyond those specified in the supervisor page
table entries.

Enable PKS on supported CPUs.

Co-developed-by: Ira Weiny 
Signed-off-by: Ira Weiny 
Signed-off-by: Fenghua Yu 

---
Changes from V2
From Thomas: Make this patch last so PKS is not enabled until
all the PKS mechanisms are in place.  Specifically:
1) Modify setup_pks() to call write_pkrs() to properly
   set up the initial value when enabled.

2) Split this patch into two. 1) a precursor patch with
   the required defines/config options and 2) this patch
   which actually enables feature on CPUs which support
   it.

Changes since RFC V3
Per Dave Hansen
Update comment
Add X86_FEATURE_PKS to disabled-features.h
Rebase based on latest TIP tree
---
 arch/x86/include/asm/disabled-features.h |  6 +-
 arch/x86/kernel/cpu/common.c | 15 +++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/disabled-features.h 
b/arch/x86/include/asm/disabled-features.h
index 164587177152..82540f0c5b6c 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -44,7 +44,11 @@
 # define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
 #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
 
-#define DISABLE_PKS   (1<<(X86_FEATURE_PKS & 31))
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+# define DISABLE_PKS   0
+#else
+# define DISABLE_PKS   (1<<(X86_FEATURE_PKS & 31))
+#endif
 
 #ifdef CONFIG_X86_5LEVEL
 # define DISABLE_LA57  0
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..f8929a557d72 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "cpu.h"
 
@@ -1494,6 +1495,19 @@ static void validate_apic_and_package_id(struct 
cpuinfo_x86 *c)
 #endif
 }
 
+/*
+ * PKS is independent of PKU and either or both may be supported on a CPU.
+ * Configure PKS if the CPU supports the feature.
+ */
+static void setup_pks(void)
+{
+   if (!cpu_feature_enabled(X86_FEATURE_PKS))
+   return;
+
+   write_pkrs(INIT_PKRS_VALUE);
+   cr4_set_bits(X86_CR4_PKS);
+}
+
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
@@ -1591,6 +1605,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 
x86_init_rdrand(c);
setup_pku(c);
+   setup_pks();
 
/*
 * Clear/Set all flags overridden by options, need do it
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH V3 08/10] x86/pks: Add PKS kernel API

2020-11-06 Thread ira . weiny
From: Fenghua Yu 

PKS allows kernel users to define domains of page mappings which have
additional protections beyond the paging protections.

Add an API to allocate, use, and free a protection key which identifies
such a domain.  Export 5 new symbols: pks_key_alloc(), pks_mk_noaccess(),
pks_mk_readonly(), pks_mk_readwrite(), and pks_key_free().  Add 2 new
macros: PAGE_KERNEL_PKEY(key) and _PAGE_PKEY(pkey).

Update the protection key documentation to cover pkeys on supervisor
pages.

Co-developed-by: Ira Weiny 
Signed-off-by: Ira Weiny 
Signed-off-by: Fenghua Yu 

---
Changes from V2
From Greg KH
Replace all WARN_ON_ONCE() uses with pr_err()
From Dan Williams
Add __must_check to pks_key_alloc() to help ensure users
are using the API correctly

Changes from V1
Per Dave Hansen
Add flags to pks_key_alloc() to help future proof the
interface if/when the key space is exhausted.

Changes from RFC V3
Per Dave Hansen
Put WARN_ON_ONCE in pks_key_free()
s/pks_mknoaccess/pks_mk_noaccess/
s/pks_mkread/pks_mk_readonly/
s/pks_mkrdwr/pks_mk_readwrite/
Change return pks_key_alloc() to EOPNOTSUPP when not
supported or configured
Per Peter Zijlstra
Remove unneeded preempt disable/enable
---
 Documentation/core-api/protection-keys.rst | 102 +---
 arch/x86/include/asm/pgtable_types.h   |  12 ++
 arch/x86/include/asm/pkeys.h   |  11 ++
 arch/x86/include/asm/pkeys_common.h|   4 +
 arch/x86/mm/pkeys.c| 128 +
 include/linux/pgtable.h|   4 +
 include/linux/pkeys.h  |  24 
 7 files changed, 267 insertions(+), 18 deletions(-)

diff --git a/Documentation/core-api/protection-keys.rst 
b/Documentation/core-api/protection-keys.rst
index ec575e72d0b2..c4e6c480562f 100644
--- a/Documentation/core-api/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
@@ -4,25 +4,33 @@
 Memory Protection Keys
 ==
 
-Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature
-which is found on Intel's Skylake (and later) "Scalable Processor"
-Server CPUs. It will be available in future non-server Intel parts
-and future AMD processors.
-
-For anyone wishing to test or use this feature, it is available in
-Amazon's EC2 C5 instances and is known to work there using an Ubuntu
-17.04 image.
-
 Memory Protection Keys provides a mechanism for enforcing page-based
 protections, but without requiring modification of the page tables
-when an application changes protection domains.  It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
+when an application changes protection domains.
+
+PKeys Userspace (PKU) is a feature which is found on Intel's Skylake "Scalable
+Processor" Server CPUs and later.  It will be available in future
+non-server Intel parts and future AMD processors.
+
+Future Intel processors will support Protection Keys for Supervisor pages
+(PKS).
+
+For anyone wishing to test or use user space pkeys, it is available in Amazon's
+EC2 C5 instances and is known to work there using an Ubuntu 17.04 image.
+
+pkeys work by dedicating 4 previously Reserved bits in each page table entry to
+a "protection key", giving 16 possible keys.  User and Supervisor pages are
+treated separately.
+
+Protections for each page are controlled with per-CPU registers for each type
+of page (User and Supervisor).  Each of these 32-bit registers stores two
+separate bits (Access Disable and Write Disable) for each key.
 
-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key.  Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
-thread a different set of protections from every other thread.
+For Userspace the register is user-accessible (rdpkru/wrpkru).  For
+Supervisor, the register (MSR_IA32_PKRS) is accessible only to the kernel.
+
+Being a CPU register, pkeys are inherently thread-local, potentially giving
+each thread an independent set of protections from every other thread.
 
 There are two new instructions (RDPKRU/WRPKRU) for reading and writing
 to the new register.  The feature is only available in 64-bit mode,
@@ -30,8 +38,11 @@ even though there is theoretically space in the PAE PTEs.  
These
 permissions are enforced on data access only and have no effect on
 instruction fetches.
 
-Syscalls
-
+For kernel space rdmsr/wrmsr are used to access the kernel MSRs.
+
+
+Syscalls for user space keys
+
 
 There are 3 system calls which directly interact with pkeys::
 
@@ -98,3 +109,58 @@ with a read()::
 The kernel will send a SIGSEGV in both cases, but si_code will be set
 to SEGV

[PATCH V3 10/10] x86/pks: Add PKS test code

2020-11-06 Thread ira . weiny
From: Ira Weiny 

The core PKS functionality provides an interface for kernel users to
reserve keys for their domains, set up the page tables with those keys,
and control access to those domains when needed.

Define test code which exercises the core functionality of PKS via a
debugfs entry.  Basic checks can be triggered on boot with a kernel
command line option while both basic and preemption checks can be
triggered with separate debugfs values.

debugfs controls are:

'0' -- Run access tests with a single pkey
'1' -- Set up the pkey register with no access for the pkey allocated to
   this fd
'2' -- Check that the pkey register updated in '1' is still the same.
   (To be used after a forced context switch.)
'3' -- Allocate all pkeys possible and run tests on each pkey allocated.
   DEFAULT when run at boot.

Closing the fd will clean up and release the pkey; therefore, to exercise
context switch testing, a user space program is provided in:

.../tools/testing/selftests/x86/test_pks.c

Reviewed-by: Dave Hansen 
Co-developed-by: Fenghua Yu 
Signed-off-by: Fenghua Yu 
Signed-off-by: Ira Weiny 

---
Changes for V2
Fix compilation errors

Changes for V1
Update for new pks_key_alloc()

Changes from RFC V3
Comments from Dave Hansen
clean up whitespace damage
Clean up Kconfig help
Clean up user test error output
s/pks_mknoaccess/pks_mk_noaccess/
s/pks_mkread/pks_mk_readonly/
s/pks_mkrdwr/pks_mk_readwrite/
Comments from Jing Han
Remove duplicate stdio.h
---
 Documentation/core-api/protection-keys.rst |   1 +
 arch/x86/mm/fault.c|  23 +
 lib/Kconfig.debug  |  12 +
 lib/Makefile   |   3 +
 lib/pks/Makefile   |   3 +
 lib/pks/pks_test.c | 692 +
 tools/testing/selftests/x86/Makefile   |   3 +-
 tools/testing/selftests/x86/test_pks.c |  66 ++
 8 files changed, 802 insertions(+), 1 deletion(-)
 create mode 100644 lib/pks/Makefile
 create mode 100644 lib/pks/pks_test.c
 create mode 100644 tools/testing/selftests/x86/test_pks.c

diff --git a/Documentation/core-api/protection-keys.rst 
b/Documentation/core-api/protection-keys.rst
index c4e6c480562f..8ffdfbff013c 100644
--- a/Documentation/core-api/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
@@ -164,3 +164,4 @@ of WRPKRU.  So to quote from the WRPKRU text:
until all prior executions of WRPKRU have completed execution
and updated the PKRU register.
 
+Example code can be found in lib/pks/pks_test.c
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 90029ce9b0da..916b2d18ed57 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -18,6 +18,7 @@
 #include  /* faulthandler_disabled()  */
 #include  /* efi_recover_from_page_fault()*/
 #include 
+#include 
 
 #include /* boot_cpu_has, ...*/
 #include  /* dotraplinkage, ...   */
@@ -1149,6 +1150,25 @@ bool fault_in_kernel_space(unsigned long address)
return address >= TASK_SIZE_MAX;
 }
 
+#ifdef CONFIG_PKS_TESTING
+bool pks_test_callback(irqentry_state_t *irq_state);
+static bool handle_pks_testing(unsigned long hw_error_code, irqentry_state_t 
*irq_state)
+{
+   /*
+* If we get a protection key exception it could be because we
+* are running the PKS test.  If so, pks_test_callback() will
+* clear the protection mechanism and return true to indicate
+* the fault was handled.
+*/
+   return (hw_error_code & X86_PF_PK) && pks_test_callback(irq_state);
+}
+#else
+static bool handle_pks_testing(unsigned long hw_error_code, irqentry_state_t 
*irq_state)
+{
+   return false;
+}
+#endif
+
 /*
  * Called for all faults where 'address' is part of the kernel address
  * space.  Might get called for faults that originate from *code* that
@@ -1165,6 +1185,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long 
hw_error_code,
if (!cpu_feature_enabled(X86_FEATURE_PKS))
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
 
+   if (handle_pks_testing(hw_error_code, irq_state))
+   return;
+
 #ifdef CONFIG_X86_32
/*
 * We can fault-in kernel-space virtual memory on-demand. The
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c789b39ed527..e90e06f5a3b9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2444,6 +2444,18 @@ config HYPERV_TESTING
help
  Select this option to enable Hyper-V vmbus testing.
 
+config PKS_TESTING
+   bool "PKey (S)upervisor testing"
+   default n
+   depends on ARCH_HAS_SUPERVISOR_PKEYS
+   help
+ Select this option to enable testing of PKS core software and
+ hardware.  The PKS core provides a mechanism to al

[PATCH V3 05/10] x86/entry: Pass irqentry_state_t by reference

2020-11-06 Thread ira . weiny
From: Ira Weiny 

Currently struct irqentry_state_t only contains a single bool value,
which makes passing it by value reasonable.  However, future patches
propose to add information to this struct, for example the PKRS
register/thread state.

Adding information to irqentry_state_t makes passing by value less
efficient.  Therefore, change the entry/exit calls to pass irq_state by
reference.

While at it, make the code easier to follow by changing all the usage
sites to consistently use the variable name 'irq_state'.

Signed-off-by: Ira Weiny 

---
Changes from V1
From Thomas: Update commit message
Further clean up Kernel doc and comments
Missed some 'return' comments which are no longer valid

Changes from RFC V3
Clean up @irq_state comments
Standardize on 'irq_state' for the state variable name
Refactor based on new patch from Thomas Gleixner
Also addresses Peter Zijlstra's comment
---
 arch/x86/entry/common.c |  8 
 arch/x86/include/asm/idtentry.h | 25 ++--
 arch/x86/kernel/cpu/mce/core.c  |  4 ++--
 arch/x86/kernel/kvm.c   |  6 +++---
 arch/x86/kernel/nmi.c   |  4 ++--
 arch/x86/kernel/traps.c | 21 
 arch/x86/mm/fault.c |  6 +++---
 include/linux/entry-common.h| 18 +
 kernel/entry/common.c   | 34 +
 9 files changed, 65 insertions(+), 61 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 18d8f17f755c..87dea56a15d2 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -259,9 +259,9 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct 
pt_regs *regs)
 {
struct pt_regs *old_regs;
bool inhcall;
-   irqentry_state_t state;
+   irqentry_state_t irq_state;
 
-   state = irqentry_enter(regs);
+   irqentry_enter(regs, &irq_state);
old_regs = set_irq_regs(regs);
 
instrumentation_begin();
@@ -271,13 +271,13 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct 
pt_regs *regs)
set_irq_regs(old_regs);
 
inhcall = get_and_clear_inhcall();
-   if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
+   if (inhcall && !WARN_ON_ONCE(irq_state.exit_rcu)) {
instrumentation_begin();
irqentry_exit_cond_resched();
instrumentation_end();
restore_inhcall(inhcall);
} else {
-   irqentry_exit(regs, state);
+   irqentry_exit(regs, &irq_state);
}
 }
 #endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 247a60a47331..282d2413b6a1 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -49,12 +49,13 @@ static __always_inline void __##func(struct pt_regs *regs); 
\
\
 __visible noinstr void func(struct pt_regs *regs)  \
 {  \
-   irqentry_state_t state = irqentry_enter(regs);  \
+   irqentry_state_t irq_state; 
\
\
+   irqentry_enter(regs, &irq_state);   
\
instrumentation_begin();\
__##func (regs);\
instrumentation_end();  \
-   irqentry_exit(regs, state); \
+   irqentry_exit(regs, &irq_state);
\
 }  \
\
 static __always_inline void __##func(struct pt_regs *regs)
@@ -96,12 +97,13 @@ static __always_inline void __##func(struct pt_regs *regs,  
\
 __visible noinstr void func(struct pt_regs *regs,  \
unsigned long error_code)   \
 {  \
-   irqentry_state_t state = irqentry_enter(regs);  \
+   irqentry_state_t irq_state; 
\
\
+   irqentry_enter(regs, &irq_state);   
\
instrumentation_begin();\
__##func (regs, error_code);\
instrumentation_end();  \
-   irqentry_exit(regs, state); \
+   irqentry_exit(regs,

[PATCH V3 07/10] x86/fault: Report the PKRS state on fault

2020-11-06 Thread ira . weiny
From: Ira Weiny 

When only user space pkeys are enabled, faulting within the kernel was
an unexpected condition which should never happen, so a WARN_ON in the
kernel fault handler would detect if it ever did.  That is no longer
the case if PKS is enabled and supported.

Report a pkey fault with a normal splat and add the PKRS state to the
fault splat text.  Note that the PKS register is reset during an
exception; therefore the saved PKRS value from before the beginning of
the exception is passed down.

If PKS is not enabled, or not active, maintain the WARN_ON_ONCE() from
before.

Because each fault has its own state, the PKRS information will be
correctly reported even if a fault 'faults'.

Suggested-by: Andy Lutomirski 
Signed-off-by: Ira Weiny 

---
Changes from V2
Fix compilation error

Changes from RFC V3
Update commit message
Per Dave Hansen
Don't print PKRS if !cpu_feature_enabled(X86_FEATURE_PKS)
Fix comment
Remove check on CONFIG_ARCH_HAS_SUPERVISOR_PKEYS in favor of
disabled-features.h
---
 arch/x86/mm/fault.c | 58 ++---
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8d20c4c13abf..90029ce9b0da 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -504,7 +504,8 @@ static void show_ldttss(const struct desc_ptr *gdt, const 
char *name, u16 index)
 }
 
 static void
-show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long 
address)
+show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long 
address,
+   irqentry_state_t *irq_state)
 {
if (!oops_may_print())
return;
@@ -548,6 +549,11 @@ show_fault_oops(struct pt_regs *regs, unsigned long 
error_code, unsigned long ad
 (error_code & X86_PF_PK)? "protection keys violation" :
   "permissions violation");
 
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+   if (cpu_feature_enabled(X86_FEATURE_PKS) && irq_state && (error_code & 
X86_PF_PK))
+   pr_alert("PKRS: 0x%x\n", irq_state->thread_pkrs);
+#endif
+
if (!(error_code & X86_PF_USER) && user_mode(regs)) {
struct desc_ptr idt, gdt;
u16 ldtr, tr;
@@ -626,7 +632,8 @@ static void set_signal_archinfo(unsigned long address,
 
 static noinline void
 no_context(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, int signal, int si_code)
+  unsigned long address, int signal, int si_code,
+  irqentry_state_t *irq_state)
 {
struct task_struct *tsk = current;
unsigned long flags;
@@ -732,7 +739,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 */
flags = oops_begin();
 
-   show_fault_oops(regs, error_code, address);
+   show_fault_oops(regs, error_code, address, irq_state);
 
if (task_stack_end_corrupted(tsk))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
@@ -785,7 +792,8 @@ static bool is_vsyscall_vaddr(unsigned long vaddr)
 
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, u32 pkey, int si_code)
+  unsigned long address, u32 pkey, int si_code,
+  irqentry_state_t *irq_state)
 {
struct task_struct *tsk = current;
 
@@ -832,14 +840,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned 
long error_code,
if (is_f00f_bug(regs, address))
return;
 
-   no_context(regs, error_code, address, SIGSEGV, si_code);
+   no_context(regs, error_code, address, SIGSEGV, si_code, irq_state);
 }
 
 static noinline void
 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-unsigned long address)
+unsigned long address, irqentry_state_t *irq_state)
 {
-   __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
+   __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR, 
irq_state);
 }
 
 static void
@@ -853,7 +861,7 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 */
mmap_read_unlock(mm);
 
-   __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
+   __bad_area_nosemaphore(regs, error_code, address, pkey, si_code, NULL);
 }
 
 static noinline void
@@ -923,7 +931,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, 
unsigned long address,
 {
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & X86_PF_USER)) {
-   no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+   no_context(regs, error_code, address, SIGBUS, BUS_ADRERR, NULL);
return;
}
 
@@ -957,7 +965,7 @@ mm_fault_error(struct pt_regs *regs, unsigned 

[PATCH V3 03/10] x86/pks: Add PKS defines and Kconfig options

2020-11-06 Thread ira . weiny
From: Ira Weiny 

Protection Keys for Supervisor pages (PKS) enables fast, hardware thread
specific, manipulation of permission restrictions on supervisor page
mappings.  It uses the same mechanism of Protection Keys as those on
User mappings but applies that mechanism to supervisor mappings using a
supervisor specific MSR.

Kernel users can thus define 'domains' of page mappings which have an
extra level of protection beyond those specified in the supervisor page
table entries.

Add the Kconfig ARCH_HAS_SUPERVISOR_PKEYS to indicate to core code that
an architecture supports pkeys.  Select it for x86.

Define the CPU features bit needed but leave DISABLE_PKS set to disable
the feature until the implementation can be completed and enabled in a
final patch.

Co-developed-by: Fenghua Yu 
Signed-off-by: Fenghua Yu 
Signed-off-by: Ira Weiny 

---
Changes from V2
New patch for V3:  Split this off from the enable patch to be
able to create cleaner bisectability
---
 arch/x86/Kconfig| 1 +
 arch/x86/include/asm/cpufeatures.h  | 1 +
 arch/x86/include/asm/disabled-features.h| 4 +++-
 arch/x86/include/uapi/asm/processor-flags.h | 2 ++
 mm/Kconfig  | 2 ++
 5 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f6946b81f74a..78c4c749c6a9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1876,6 +1876,7 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
depends on X86_64 && (CPU_SUP_INTEL || CPU_SUP_AMD)
select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_HAS_PKEYS
+   select ARCH_HAS_SUPERVISOR_PKEYS
help
  Memory Protection Keys provides a mechanism for enforcing
  page-based protections, but without requiring modification of the
diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index dad350d42ecf..4deb580324e8 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -356,6 +356,7 @@
 #define X86_FEATURE_MOVDIRI(16*32+27) /* MOVDIRI instruction */
 #define X86_FEATURE_MOVDIR64B  (16*32+28) /* MOVDIR64B instruction */
 #define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS 
instructions */
+#define X86_FEATURE_PKS(16*32+31) /* Protection Keys 
for Supervisor pages */
 
 /* AMD-defined CPU features, CPUID level 0x8007 (EBX), word 17 */
 #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery 
support */
diff --git a/arch/x86/include/asm/disabled-features.h 
b/arch/x86/include/asm/disabled-features.h
index 5861d34f9771..164587177152 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -44,6 +44,8 @@
 # define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
 #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
 
+#define DISABLE_PKS   (1<<(X86_FEATURE_PKS & 31))
+
 #ifdef CONFIG_X86_5LEVEL
 # define DISABLE_LA57  0
 #else
@@ -82,7 +84,7 @@
 #define DISABLED_MASK140
 #define DISABLED_MASK150
 #define DISABLED_MASK16
(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
-DISABLE_ENQCMD)
+DISABLE_ENQCMD|DISABLE_PKS)
 #define DISABLED_MASK170
 #define DISABLED_MASK180
 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
diff --git a/arch/x86/include/uapi/asm/processor-flags.h 
b/arch/x86/include/uapi/asm/processor-flags.h
index bcba3c643e63..191c574b2390 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -130,6 +130,8 @@
 #define X86_CR4_SMAP   _BITUL(X86_CR4_SMAP_BIT)
 #define X86_CR4_PKE_BIT22 /* enable Protection Keys support */
 #define X86_CR4_PKE_BITUL(X86_CR4_PKE_BIT)
+#define X86_CR4_PKS_BIT24 /* enable Protection Keys for 
Supervisor */
+#define X86_CR4_PKS_BITUL(X86_CR4_PKS_BIT)
 
 /*
  * x86-64 Task Priority Register, CR8
diff --git a/mm/Kconfig b/mm/Kconfig
index d42423f884a7..fc9ce7f65683 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -826,6 +826,8 @@ config ARCH_USES_HIGH_VMA_FLAGS
bool
 config ARCH_HAS_PKEYS
bool
+config ARCH_HAS_SUPERVISOR_PKEYS
+   bool
 
 config PERCPU_STATS
bool "Collect percpu memory statistics"
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH V3 02/10] x86/fpu: Refactor arch_set_user_pkey_access() for PKS support

2020-11-06 Thread ira . weiny
From: Ira Weiny 

Define a helper, update_pkey_val(), which will be used to support both
Protection Key User (PKU) and the new Protection Key for Supervisor
(PKS) in subsequent patches.
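
For example, with PKR_BITS_PER_PKEY = 2, update_pkey_val(pkru, 1,
PKEY_DISABLE_ACCESS) computes pkey_shift = 2, masks out bits 3:2 of the
old value, and ORs in PKR_AD_BIT << 2, leaving every other key's bits
untouched.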

Co-developed-by: Peter Zijlstra 
Signed-off-by: Peter Zijlstra 
Signed-off-by: Ira Weiny 

---
Changes from RFC V3:
Per Dave Hansen
Update and add comments per Dave's review
Per Peter
Correct attribution
---
 arch/x86/include/asm/pkeys.h |  2 ++
 arch/x86/kernel/fpu/xstate.c | 22 --
 arch/x86/mm/pkeys.c  | 23 +++
 3 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index f9feba80894b..4526245b03e5 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -136,4 +136,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
 }
 
+u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags);
+
 #endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index a99afc70cc0a..a3bca3211eba 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -994,9 +994,7 @@ const void *get_xsave_field_ptr(int xfeature_nr)
 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val)
 {
-   u32 old_pkru;
-   int pkey_shift = (pkey * PKR_BITS_PER_PKEY);
-   u32 new_pkru_bits = 0;
+   u32 pkru;
 
/*
 * This check implies XSAVE support.  OSPKE only gets
@@ -1012,21 +1010,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, 
int pkey,
 */
WARN_ON_ONCE(pkey >= arch_max_pkey());
 
-   /* Set the bits we need in PKRU:  */
-   if (init_val & PKEY_DISABLE_ACCESS)
-   new_pkru_bits |= PKR_AD_BIT;
-   if (init_val & PKEY_DISABLE_WRITE)
-   new_pkru_bits |= PKR_WD_BIT;
-
-   /* Shift the bits in to the correct place in PKRU for pkey: */
-   new_pkru_bits <<= pkey_shift;
-
-   /* Get old PKRU and mask off any old bits in place: */
-   old_pkru = read_pkru();
-   old_pkru &= ~((PKR_AD_BIT|PKR_WD_BIT) << pkey_shift);
-
-   /* Write old part along with new part: */
-   write_pkru(old_pkru | new_pkru_bits);
+   pkru = read_pkru();
+   pkru = update_pkey_val(pkru, pkey, init_val);
+   write_pkru(pkru);
 
return 0;
 }
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index f5efb4007e74..d1dfe743e79f 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -208,3 +208,26 @@ static __init int setup_init_pkru(char *opt)
return 1;
 }
 __setup("init_pkru=", setup_init_pkru);
+
+/*
+ * Replace disable bits for @pkey with values from @flags
+ *
+ * Kernel users use the same flags as user space:
+ * PKEY_DISABLE_ACCESS
+ * PKEY_DISABLE_WRITE
+ */
+u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags)
+{
+   int pkey_shift = pkey * PKR_BITS_PER_PKEY;
+
+   /*  Mask out old bit values */
+   pk_reg &= ~(((1 << PKR_BITS_PER_PKEY) - 1) << pkey_shift);
+
+   /*  Or in new values */
+   if (flags & PKEY_DISABLE_ACCESS)
+   pk_reg |= PKR_AD_BIT << pkey_shift;
+   if (flags & PKEY_DISABLE_WRITE)
+   pk_reg |= PKR_WD_BIT << pkey_shift;
+
+   return pk_reg;
+}
-- 
2.28.0.rc0.12.gb6a658bd00c9


