Re: [PATCH v16 14/15] mtd: spi-nor: spansion: add support for Cypress Semper flash
Hi, [...] On 10/5/20 9:01 PM, Pratyush Yadav wrote: > +static int spi_nor_cypress_octal_dtr_enable(struct spi_nor *nor, bool enable) > +{ > + struct spi_mem_op op; > + u8 *buf = nor->bouncebuf; > + int ret; > + > + if (enable) { > + /* Use 24 dummy cycles for memory array reads. */ > + ret = spi_nor_write_enable(nor); > + if (ret) > + return ret; > + > + *buf = SPINOR_REG_CYPRESS_CFR2V_MEMLAT_11_24; > + op = (struct spi_mem_op) > + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WR_ANY_REG, 1), > +SPI_MEM_OP_ADDR(3, SPINOR_REG_CYPRESS_CFR2V, > +1), > +SPI_MEM_OP_NO_DUMMY, > +SPI_MEM_OP_DATA_OUT(1, buf, 1)); > + > + ret = spi_mem_exec_op(nor->spimem, &op); > + if (ret) > + return ret; > + > + ret = spi_nor_wait_till_ready(nor); > + if (ret) > + return ret; > + > + nor->read_dummy = 24; > + } > + > + /* Set/unset the octal and DTR enable bits. */ > + ret = spi_nor_write_enable(nor); > + if (ret) > + return ret; > + > + if (enable) > + *buf = SPINOR_REG_CYPRESS_CFR5V_OCT_DTR_EN; > + else > + *buf = SPINOR_REG_CYPRESS_CFR5V_OCT_DTR_DS; > + > + op = (struct spi_mem_op) > + SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_WR_ANY_REG, 1), > +SPI_MEM_OP_ADDR(enable ? 3 : 4, > +SPINOR_REG_CYPRESS_CFR5V, > +1), > +SPI_MEM_OP_NO_DUMMY, > +SPI_MEM_OP_DATA_OUT(1, buf, 1)); > + > + if (!enable) > + spi_nor_spimem_setup_op(nor, &op, SNOR_PROTO_8_8_8_DTR); > + > + ret = spi_mem_exec_op(nor->spimem, &op); > + if (ret) > + return ret; > + > + /* Give some time for the mode change to take place. */ > + usleep_range(1000, 1500); > + This delay is no longer needed right? I can drop it while applying, if you confirm. Tudor: Could you provide your R-by? Regards Vignesh
[PATCH] netfilter: conntrack: fix -Wformat
Clang is more aggressive about -Wformat warnings when the format flag specifies a type smaller than the parameter. Fixes 8 instances of: warning: format specifies type 'unsigned short' but the argument has type 'int' [-Wformat] Link: https://github.com/ClangBuiltLinux/linux/issues/378 Signed-off-by: Nick Desaulniers --- net/netfilter/nf_conntrack_standalone.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 46c5557c1fec..c5aa45c38eb2 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -50,38 +50,38 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, switch (l4proto->l4proto) { case IPPROTO_ICMP: - seq_printf(s, "type=%u code=%u id=%u ", + seq_printf(s, "type=%u code=%u id=%hu ", tuple->dst.u.icmp.type, tuple->dst.u.icmp.code, - ntohs(tuple->src.u.icmp.id)); + (__be16)ntohs(tuple->src.u.icmp.id)); break; case IPPROTO_TCP: seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.tcp.port), - ntohs(tuple->dst.u.tcp.port)); + (__be16)ntohs(tuple->src.u.tcp.port), + (__be16)ntohs(tuple->dst.u.tcp.port)); break; case IPPROTO_UDPLITE: case IPPROTO_UDP: seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.udp.port), - ntohs(tuple->dst.u.udp.port)); + (__be16)ntohs(tuple->src.u.udp.port), + (__be16)ntohs(tuple->dst.u.udp.port)); break; case IPPROTO_DCCP: seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.dccp.port), - ntohs(tuple->dst.u.dccp.port)); + (__be16)ntohs(tuple->src.u.dccp.port), + (__be16)ntohs(tuple->dst.u.dccp.port)); break; case IPPROTO_SCTP: seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.sctp.port), - ntohs(tuple->dst.u.sctp.port)); + (__be16)ntohs(tuple->src.u.sctp.port), + (__be16)ntohs(tuple->dst.u.sctp.port)); break; case IPPROTO_ICMPV6: - seq_printf(s, "type=%u code=%u id=%u ", + seq_printf(s, "type=%u code=%u id=%hu ", tuple->dst.u.icmp.type, 
tuple->dst.u.icmp.code, - ntohs(tuple->src.u.icmp.id)); + (__be16)ntohs(tuple->src.u.icmp.id)); break; case IPPROTO_GRE: seq_printf(s, "srckey=0x%x dstkey=0x%x ", -- 2.29.2.222.g5d2a92d10f8-goog
Re: [PATCH] KVM: PPC: Book3S: Assign boolean values to a bool variable
On Sat, 7 Nov 2020 14:26:22 +0800 xiakaixu1...@gmail.com wrote: > From: Kaixu Xia > > Fix the following coccinelle warnings: > > ./arch/powerpc/kvm/book3s_xics.c:476:3-15: WARNING: Assignment of 0/1 to bool > variable > ./arch/powerpc/kvm/book3s_xics.c:504:3-15: WARNING: Assignment of 0/1 to bool > variable > > Reported-by: Tosk Robot > Signed-off-by: Kaixu Xia > --- Reviewed-by: Greg Kurz > arch/powerpc/kvm/book3s_xics.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c > index 5fee5a11550d..303e3cb096db 100644 > --- a/arch/powerpc/kvm/book3s_xics.c > +++ b/arch/powerpc/kvm/book3s_xics.c > @@ -473,7 +473,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, > struct kvmppc_icp *icp, > arch_spin_unlock(&ics->lock); > local_irq_restore(flags); > new_irq = reject; > - check_resend = 0; > + check_resend = false; > goto again; > } > } else { > @@ -501,7 +501,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, > struct kvmppc_icp *icp, > state->resend = 0; > arch_spin_unlock(&ics->lock); > local_irq_restore(flags); > - check_resend = 0; > + check_resend = false; > goto again; > } > }
[PATCH v3 bpf] trace: bpf: Fix passing zero to PTR_ERR()
There is a bug when passing zero to PTR_ERR() and return. Fix smatch err. Signed-off-by: Wang Qing --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b..5113fd4 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1198,7 +1198,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, *btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(*btf)) - return PTR_ERR(*btf); + return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL; if (ptr->type_id > 0) *btf_id = ptr->type_id; -- 2.7.4
Re: [PATCH v4 1/4] dt-bindings: usb: add rk3328 dwc3 docs
Hi, Lindsey Stanpoor writes: > On Wed, Sep 2, 2020 at 11:12 AM wrote: >> >> From: Cameron Nemo >> >> Document compatible for dwc3 on the Rockchip rk3328 platform. > > Hi all, > > Wanted to give this patch submission a gentle ping. > > Rob Herring acked the documentation changes, but I have not heard > anything > from the USB or Rockchip maintainers. This patchset would facilitate USB3 > support for Rockchip rk3328 devices like the Pine Rock64. > > If there is anything I can do to help move this along, please let me know. Sorry, it had fallen through the cracks. It's now in my testing/next. -- balbi signature.asc Description: PGP signature
[GIT PULL] RISC-V Fixes for 5.10-rc3
The following changes since commit 3650b228f83adda7e5ee532e2b90429c03f7b9ec: Linux 5.10-rc1 (2020-10-25 15:14:11 -0700) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git tags/riscv-for-linus-5.10-rc3 for you to fetch changes up to c2c81bb2f69138f902e1a58d3bef6ad97fb8a92c: RISC-V: Fix the VDSO symbol generaton for binutils-2.35+ (2020-11-06 00:03:48 -0800) RISC-V Fixes for 5.10-rc3 * An SPDX comment style fix. * A fix to ignore memory that is unusable. * A fix to avoid setting a kernel text offset for the !MMU kernels, where skipping the first page of memory is both unnecessary and costly. * A fix to avoid passing the flag bits in satp to pfn_to_virt(). * A fix to __put_kernel_nofault, where we had the arguments to __put_user_nocheck reversed. * A workaround for a bug in the FU540 to avoid triggering PMP issues during early boot. * A change to how we pull symbols out of the vDSO. The old mechanism was removed from binutils-2.35 (and has been backported to Debian's 2.34). Anup Patel (1): RISC-V: Use non-PGD mappings for early DTB access Atish Patra (1): RISC-V: Remove any memblock representing unusable memory area Changbin Du (1): riscv: uaccess: fix __put_kernel_nofault() Liu Shaohua (1): riscv: fix pfn_to_virt err in do_page_fault(). Palmer Dabbelt (1): RISC-V: Fix the VDSO symbol generaton for binutils-2.35+ Ryan Kosta (1): risc-v: kernel: ftrace: Fixes improper SPDX comment style Sean Anderson (1): riscv: Set text_offset correctly for M-Mode arch/riscv/include/asm/uaccess.h | 2 +- arch/riscv/kernel/ftrace.c| 2 +- arch/riscv/kernel/head.S | 5 + arch/riscv/kernel/vdso/.gitignore | 1 + arch/riscv/kernel/vdso/Makefile | 18 +- arch/riscv/kernel/vdso/so2s.sh| 6 ++ arch/riscv/mm/fault.c | 4 +++- arch/riscv/mm/init.c | 32 +--- 8 files changed, 47 insertions(+), 23 deletions(-) create mode 100755 arch/riscv/kernel/vdso/so2s.sh
Re: [V2] trace: Fix passing zero to PTR_ERR()
On 11/6/20 10:34 PM, Wang Qing wrote: There is a bug when passing zero to PTR_ERR() and return. Fix smatch err. Signed-off-by: Wang Qing For clarity, the subject probably should be bpf: Fix passing zero to PTR_ERR() to indicate this is a bpf related fix. The tag should be something like [PATCH bpf v2] or [PATCH v2 bpf] depending on your preference, to indicate this is for bpf tree. If another version is sent, the above "v2" should change to "v3". --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b..5113fd4 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1198,7 +1198,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, *btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(*btf)) - return PTR_ERR(*btf); + return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL; if (ptr->type_id > 0) *btf_id = ptr->type_id;
[PATCH] Kbuild: enable -Wfallthrough for clang
Partial revert of commit e2079e93f562 ("kbuild: Do not enable -Wimplicit-fallthrough for clang for now") This has been fixed up over time thanks to the addition of "fallthrough" pseudo-keyword in commit 294f69e662d1 ("compiler_attributes.h: Add 'fallthrough' pseudo keyword for switch/case use") Link: https://github.com/ClangBuiltLinux/linux/issues/236 Signed-off-by: Nick Desaulniers --- Makefile | 5 - 1 file changed, 5 deletions(-) diff --git a/Makefile b/Makefile index f353886dbf44..c1c61c276f60 100644 --- a/Makefile +++ b/Makefile @@ -777,11 +777,6 @@ else # These warnings generated too much noise in a regular build. # Use make W=1 to enable them (see scripts/Makefile.extrawarn) KBUILD_CFLAGS += -Wno-unused-but-set-variable - -# Warn about unmarked fall-throughs in switch statement. -# Disabled for clang while comment to attribute conversion happens and -# https://github.com/ClangBuiltLinux/linux/issues/636 is discussed. -KBUILD_CFLAGS += $(call cc-option,-Wimplicit-fallthrough,) endif KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable) -- 2.29.2.222.g5d2a92d10f8-goog
[PATCH v7] mm/zswap: move to use crypto_acomp API for hardware acceleration
Right now, all new ZIP drivers are adapted to crypto_acomp APIs rather than legacy crypto_comp APIs. Traditional ZIP drivers like lz4,lzo etc have been also wrapped into acomp via scomp backend. But zswap.c is still using the old APIs. That means zswap won't be able to work on any new ZIP drivers in kernel. This patch moves to use crypto_acomp APIs to fix the disconnected bridge between new ZIP drivers and zswap. It is probably the first real user to use acomp but perhaps not a good example to demonstrate how multiple acomp requests can be executed in parallel in one acomp instance. frontswap is doing page load and store page by page synchronously. swap_writepage() depends on the completion of frontswap_store() to decide if it should call __swap_writepage() to swap to disk. However this patch creates multiple acomp instances, so multiple threads running on multiple different cpus can actually do (de)compression in parallel, leveraging the power of multiple ZIP hardware queues. This is also consistent with frontswap's page management model. The old zswap code uses atomic context and avoids the race conditions while shared resources like zswap_dstmem are accessed. Here since acomp can sleep, per-cpu mutex is used to replace preemption-disable. While it is possible to make mm/page_io.c and mm/frontswap.c support async (de)compression in some way, the entire design requires careful thinking and performance evaluation. For the first step, the base with fixed connection between ZIP drivers and zswap should be built. Acked-by: Vitaly Wool Cc: Luis Claudio R. Goncalves Cc: Sebastian Andrzej Siewior Cc: Andrew Morton Cc: Herbert Xu Cc: David S. Miller Cc: Mahipal Challa Cc: Seth Jennings Cc: Dan Streetman Cc: Zhou Wang Cc: Colin Ian King Signed-off-by: Barry Song --- -v7: 1. Add Acked-by of Vitaly Wool, thanks! 2. Address the issues pointed out by Sebastian Andrzej Siewior, thanks! 
* remove redundant kmap and move to use sg_set_page; * remove the warning if DEBUG_PREEMPTIBLE is enabled by using raw_cpu_ptr(). * Regarding another code refinement issue, I am still not a big fan of a. get_cpu_ptr() for the acomp_ctx //lock preemption b. this_cpu_ptr() for the dstmem and mutex c. put_cpu_ptr() for the acomp_ctx //unlock preemption It seems the code is better looking to put all stuff in a struct, and get the per_cpu struct to get them all rather than adding a preemption-disabled context and getting them one by one. mm/zswap.c | 183 + 1 file changed, 137 insertions(+), 46 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index fbb7829..73f04de 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -24,8 +24,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -127,9 +129,17 @@ module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, * data structures **/ +struct crypto_acomp_ctx { + struct crypto_acomp *acomp; + struct acomp_req *req; + struct crypto_wait wait; + u8 *dstmem; + struct mutex *mutex; +}; + struct zswap_pool { struct zpool *zpool; - struct crypto_comp * __percpu *tfm; + struct crypto_acomp_ctx __percpu *acomp_ctx; struct kref kref; struct list_head list; struct work_struct release_work; @@ -388,23 +398,43 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, * per-cpu code **/ static DEFINE_PER_CPU(u8 *, zswap_dstmem); +/* + * If users dynamically change the zpool type and compressor at runtime, i.e. + * zswap is running, zswap can have more than one zpool on one cpu, but they + * are sharing dtsmem. So we need this mutex to be per-cpu. 
+ */ +static DEFINE_PER_CPU(struct mutex *, zswap_mutex); static int zswap_dstmem_prepare(unsigned int cpu) { + struct mutex *mutex; u8 *dst; dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!dst) return -ENOMEM; + mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu)); + if (!mutex) { + kfree(dst); + return -ENOMEM; + } + + mutex_init(mutex); per_cpu(zswap_dstmem, cpu) = dst; + per_cpu(zswap_mutex, cpu) = mutex; return 0; } static int zswap_dstmem_dead(unsigned int cpu) { + struct mutex *mutex; u8 *dst; + mutex = per_cpu(zswap_mutex, cpu); + kfree(mutex); + per_cpu(zswap_mutex, cpu) = NULL; + dst = per_cpu(zswap_dstmem, cpu); kfree(dst); per_cpu(zswap_dstmem, cpu) = NULL; @@ -415,30 +445,54 @@ static int zswap_dstmem_dead(unsigned int cpu) static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct cryp
[PATCH] net/mlx4: Assign boolean values to a bool variable
From: Kaixu Xia Fix the following coccinelle warnings: ./drivers/net/ethernet/mellanox/mlx4/en_rx.c:687:1-17: WARNING: Assignment of 0/1 to bool variable Reported-by: Tosk Robot Signed-off-by: Kaixu Xia --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 502d1b97855c..b0f79a5151cf 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -684,7 +684,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud xdp_prog = rcu_dereference(ring->xdp_prog); xdp.rxq = &ring->xdp_rxq; xdp.frame_sz = priv->frag_info[0].frag_stride; - doorbell_pending = 0; + doorbell_pending = false; /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx * descriptor offset can be deduced from the CQE index instead of -- 2.20.0
[PATCH] fork: fix copy_process(CLONE_PARENT) race with the exiting ->real_parent
current->group_leader->exit_signal may change during copy_process() if current->real_parent exits, move the assignment inside tasklist_lock to avoid the race. Signed-off-by: Eddy Wu --- kernel/fork.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index da8d360fb032..7abda2a888a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2142,14 +2142,9 @@ static __latent_entropy struct task_struct *copy_process( /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { - p->exit_signal = -1; p->group_leader = current->group_leader; p->tgid = current->tgid; } else { - if (clone_flags & CLONE_PARENT) - p->exit_signal = current->group_leader->exit_signal; - else - p->exit_signal = args->exit_signal; p->group_leader = p; p->tgid = p->pid; } @@ -2193,9 +2188,14 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else + p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; + p->exit_signal = args->exit_signal; } klp_copy_process(p); -- 2.17.1
[V2] trace: Fix passing zero to PTR_ERR()
There is a bug when passing zero to PTR_ERR() and return. Fix smatch err. Signed-off-by: Wang Qing --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b..5113fd4 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1198,7 +1198,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, *btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(*btf)) - return PTR_ERR(*btf); + return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL; if (ptr->type_id > 0) *btf_id = ptr->type_id; -- 2.7.4
[PATCH] scsi: ses: Fix crash caused by kfree an invalid pointer
We can get a crash when disconnecting the iSCSI session, the call trace like this: [2a00fb70] kfree at 0830e224 [2a00fba0] ses_intf_remove at 01f200e4 [2a00fbd0] device_del at 086b6a98 [2a00fc50] device_unregister at 086b6d58 [2a00fc70] __scsi_remove_device at 0870608c [2a00fca0] scsi_remove_device at 08706134 [2a00fcc0] __scsi_remove_target at 087062e4 [2a00fd10] scsi_remove_target at 087064c0 [2a00fd70] __iscsi_unbind_session at 01c872c4 [2a00fdb0] process_one_work at 0810f35c [2a00fe00] worker_thread at 0810f648 [2a00fe70] kthread at 08116e98 In ses_intf_add, components count can be 0, and kcalloc 0 size scomp, but not saved at edev->component[i].scratch In this situation, edev->component[0].scratch is an invalid pointer, when kfree it in ses_intf_remove_enclosure, a crash like above would happen The call trace also could be other random cases when kfree cannot detect the invalid pointer We should not use edev->component[] array when we get components count is 0 We also need check index when use edev->component[] array in ses_enclosure_data_process Tested-by: Zeng Zhicong Cc: stable # 2.6.25+ Signed-off-by: Ding Hui --- drivers/scsi/ses.c | 18 ++ 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index c2afba2a5414..f5ef0a91f0eb 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -477,9 +477,6 @@ static int ses_enclosure_find_by_addr(struct enclosure_device *edev, int i; struct ses_component *scomp; - if (!edev->component[0].scratch) - return 0; - for (i = 0; i < edev->components; i++) { scomp = edev->component[i].scratch; if (scomp->addr != efd->addr) @@ -565,8 +562,10 @@ static void ses_enclosure_data_process(struct enclosure_device *edev, components++, type_ptr[0], name); - else + else if (components < edev->components) ecomp = &edev->component[components++]; + else + ecomp = ERR_PTR(-EINVAL); if (!IS_ERR(ecomp)) { if (addl_desc_ptr) @@ -731,9 +730,11 @@ static int ses_intf_add(struct device *cdev, 
buf = NULL; } page2_not_supported: - scomp = kcalloc(components, sizeof(struct ses_component), GFP_KERNEL); - if (!scomp) - goto err_free; + if (components > 0) { + scomp = kcalloc(components, sizeof(struct ses_component), GFP_KERNEL); + if (!scomp) + goto err_free; + } edev = enclosure_register(cdev->parent, dev_name(&sdev->sdev_gendev), components, &ses_enclosure_callbacks); @@ -813,7 +814,8 @@ static void ses_intf_remove_enclosure(struct scsi_device *sdev) kfree(ses_dev->page2); kfree(ses_dev); - kfree(edev->component[0].scratch); + if (edev->components > 0) + kfree(edev->component[0].scratch); put_device(&edev->edev); enclosure_unregister(edev); -- 2.17.1
[PATCH net-next 06/11] net: hns3: add ethtool priv-flag for DIM
Add a control private flag in ethtool for enable/disable DIM feature. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hnae3.h| 7 +++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 1 + drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 71 ++ 3 files changed, 79 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index f9d4d23..18b3e43 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -716,6 +716,11 @@ struct hnae3_roce_private_info { #define HNAE3_UPE (HNAE3_USER_UPE | HNAE3_OVERFLOW_UPE) #define HNAE3_MPE (HNAE3_USER_MPE | HNAE3_OVERFLOW_MPE) +enum hnae3_pflag { + HNAE3_PFLAG_DIM_ENABLE, + HNAE3_PFLAG_MAX +}; + struct hnae3_handle { struct hnae3_client *client; struct pci_dev *pdev; @@ -738,6 +743,8 @@ struct hnae3_handle { /* Network interface message level enabled bits */ u32 msg_enable; + + unsigned long priv_flags; }; #define hnae3_set_field(origin, mask, shift, val) \ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 9e895b9..a567557 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4246,6 +4246,7 @@ static int hns3_client_init(struct hnae3_handle *handle) set_bit(HNS3_NIC_STATE_INITED, &priv->state); set_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state); + handle->priv_flags |= BIT(HNAE3_PFLAG_DIM_ENABLE); if (netif_msg_drv(handle)) hns3_info_show(priv); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 30ffaaf..427b72c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -18,6 +18,11 @@ struct hns3_sfp_type { u8 ext_type; }; +struct hns3_pflag_desc { + char name[ETH_GSTRING_LEN]; + void (*handler)(struct net_device *netdev, bool enable); +}; 
+ /* tqp related stats */ #define HNS3_TQP_STAT(_string, _member){ \ .stats_string = _string,\ @@ -59,6 +64,8 @@ static const struct hns3_stats hns3_rxq_stats[] = { HNS3_TQP_STAT("non_reuse_pg", non_reuse_pg), }; +#define HNS3_PRIV_FLAGS_LEN ARRAY_SIZE(hns3_priv_flags) + #define HNS3_RXQ_STATS_COUNT ARRAY_SIZE(hns3_rxq_stats) #define HNS3_TQP_STATS_COUNT (HNS3_TXQ_STATS_COUNT + HNS3_RXQ_STATS_COUNT) @@ -394,6 +401,26 @@ static void hns3_self_test(struct net_device *ndev, netif_dbg(h, drv, ndev, "self test end\n"); } +static void hns3_update_state(struct net_device *netdev, + enum hns3_nic_state state, bool enable) +{ + struct hns3_nic_priv *priv = netdev_priv(netdev); + + if (enable) + set_bit(state, &priv->state); + else + clear_bit(state, &priv->state); +} + +static void hns3_update_dim_state(struct net_device *netdev, bool enable) +{ + hns3_update_state(netdev, HNS3_NIC_STATE_DIM_ENABLE, enable); +} + +static const struct hns3_pflag_desc hns3_priv_flags[HNAE3_PFLAG_MAX] = { + { "dim_enable", hns3_update_dim_state }, +}; + static int hns3_get_sset_count(struct net_device *netdev, int stringset) { struct hnae3_handle *h = hns3_get_handle(netdev); @@ -410,6 +437,9 @@ static int hns3_get_sset_count(struct net_device *netdev, int stringset) case ETH_SS_TEST: return ops->get_sset_count(h, stringset); + case ETH_SS_PRIV_FLAGS: + return HNAE3_PFLAG_MAX; + default: return -EOPNOTSUPP; } @@ -463,6 +493,7 @@ static void hns3_get_strings(struct net_device *netdev, u32 stringset, u8 *data) struct hnae3_handle *h = hns3_get_handle(netdev); const struct hnae3_ae_ops *ops = h->ae_algo->ops; char *buff = (char *)data; + int i; if (!ops->get_strings) return; @@ -475,6 +506,13 @@ static void hns3_get_strings(struct net_device *netdev, u32 stringset, u8 *data) case ETH_SS_TEST: ops->get_strings(h, stringset, data); break; + case ETH_SS_PRIV_FLAGS: + for (i = 0; i < HNS3_PRIV_FLAGS_LEN; i++) { + snprintf(buff, ETH_GSTRING_LEN, "%s", +hns3_priv_flags[i].name); + buff += 
ETH_GSTRING_LEN; + } + break; default: break; } @@ -1516,6 +1554,35 @@ static int hns3_get_module_eeprom(struct net_device *netdev, return ops->get_module_eeprom(handle, ee->offset, ee->len, data); } +static u32 hns3_get_priv_flags(struct net_device *netdev) +{ +
[PATCH net-next 04/11] net: hns3: rename gl_adapt_enable in struct hns3_enet_coalesce
Besides GL(Gap Limiting), QL(Quantity Limiting) can be modified dynamically when DIM is supported. So rename gl_adapt_enable as adapt_enable in struct hns3_enet_coalesce. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 12 ++-- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h| 2 +- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 8 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 2813fe5..999a2aa 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -211,8 +211,8 @@ void hns3_set_vector_coalesce_rl(struct hns3_enet_tqp_vector *tqp_vector, * GL and RL(Rate Limiter) are 2 ways to acheive interrupt coalescing */ - if (rl_reg > 0 && !tqp_vector->tx_group.coal.gl_adapt_enable && - !tqp_vector->rx_group.coal.gl_adapt_enable) + if (rl_reg > 0 && !tqp_vector->tx_group.coal.adapt_enable && + !tqp_vector->rx_group.coal.adapt_enable) /* According to the hardware, the range of rl_reg is * 0-59 and the unit is 4. 
*/ @@ -273,8 +273,8 @@ static void hns3_vector_coalesce_init(struct hns3_enet_tqp_vector *tqp_vector, * * Default: enable interrupt coalescing self-adaptive and GL */ - tx_coal->gl_adapt_enable = 1; - rx_coal->gl_adapt_enable = 1; + tx_coal->adapt_enable = 1; + rx_coal->adapt_enable = 1; tx_coal->int_gl = HNS3_INT_GL_50K; rx_coal->int_gl = HNS3_INT_GL_50K; @@ -3384,14 +3384,14 @@ static void hns3_update_new_int_gl(struct hns3_enet_tqp_vector *tqp_vector) tqp_vector->last_jiffies + msecs_to_jiffies(1000))) return; - if (rx_group->coal.gl_adapt_enable) { + if (rx_group->coal.adapt_enable) { rx_update = hns3_get_new_int_gl(rx_group); if (rx_update) hns3_set_vector_coalesce_rx_gl(tqp_vector, rx_group->coal.int_gl); } - if (tx_group->coal.gl_adapt_enable) { + if (tx_group->coal.adapt_enable) { tx_update = hns3_get_new_int_gl(tx_group); if (tx_update) hns3_set_vector_coalesce_tx_gl(tqp_vector, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 4651ad1..8d33652 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -436,7 +436,7 @@ struct hns3_enet_coalesce { u16 int_gl; u16 int_ql; u16 int_ql_max; - u8 gl_adapt_enable:1; + u8 adapt_enable:1; u8 ql_enable:1; u8 unit_1us:1; enum hns3_flow_level_range flow_level; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 8d5c194..30ffaaf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1105,9 +1105,9 @@ static int hns3_get_coalesce_per_queue(struct net_device *netdev, u32 queue, rx_vector = priv->ring[queue_num + queue].tqp_vector; cmd->use_adaptive_tx_coalesce = - tx_vector->tx_group.coal.gl_adapt_enable; + tx_vector->tx_group.coal.adapt_enable; cmd->use_adaptive_rx_coalesce = - rx_vector->rx_group.coal.gl_adapt_enable; + 
rx_vector->rx_group.coal.adapt_enable; cmd->tx_coalesce_usecs = tx_vector->tx_group.coal.int_gl; cmd->rx_coalesce_usecs = rx_vector->rx_group.coal.int_gl; @@ -1268,9 +1268,9 @@ static void hns3_set_coalesce_per_queue(struct net_device *netdev, tx_vector = priv->ring[queue].tqp_vector; rx_vector = priv->ring[queue_num + queue].tqp_vector; - tx_vector->tx_group.coal.gl_adapt_enable = + tx_vector->tx_group.coal.adapt_enable = cmd->use_adaptive_tx_coalesce; - rx_vector->rx_group.coal.gl_adapt_enable = + rx_vector->rx_group.coal.adapt_enable = cmd->use_adaptive_rx_coalesce; tx_vector->tx_group.coal.int_gl = cmd->tx_coalesce_usecs; -- 2.7.4
[PATCH net-next 08/11] net: hns3: add a check for ethtool priv-flag interface
Add a check for hns3_set_priv_flags() since if the capability is unsupported its private flags should not be modified as well. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hnae3.h| 1 + drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 1 + drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 19 +++ 3 files changed, 21 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 18b3e43..3642740 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -744,6 +744,7 @@ struct hnae3_handle { /* Network interface message level enabled bits */ u32 msg_enable; + unsigned long supported_pflags; unsigned long priv_flags; }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index f686723..c30cf9e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4152,6 +4152,7 @@ static void hns3_state_init(struct hnae3_handle *handle) set_bit(HNS3_NIC_STATE_INITED, &priv->state); set_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state); handle->priv_flags |= BIT(HNAE3_PFLAG_DIM_ENABLE); + set_bit(HNAE3_PFLAG_DIM_ENABLE, &handle->supported_pflags); } static int hns3_client_init(struct hnae3_handle *handle) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 427b72c..6904c0a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1561,12 +1561,31 @@ static u32 hns3_get_priv_flags(struct net_device *netdev) return handle->priv_flags; } +static int hns3_check_priv_flags(struct hnae3_handle *h, u32 changed) +{ + u32 i; + + for (i = 0; i < HNAE3_PFLAG_MAX; i++) + if ((changed & BIT(i)) && !test_bit(i, &h->supported_pflags)) { + netdev_err(h->netdev, "%s is unsupported\n", + hns3_priv_flags[i].name); + 
return -EOPNOTSUPP; + } + + return 0; +} + static int hns3_set_priv_flags(struct net_device *netdev, u32 pflags) { struct hnae3_handle *handle = hns3_get_handle(netdev); u32 changed = pflags ^ handle->priv_flags; + int ret; u32 i; + ret = hns3_check_priv_flags(handle, changed); + if (ret) + return ret; + for (i = 0; i < HNAE3_PFLAG_MAX; i++) { if (changed & BIT(i)) { bool enable = !(handle->priv_flags & BIT(i)); -- 2.7.4
[PATCH net-next 10/11] net: hns3: add ethtool priv-flag for EQ/CQ
Add a control private flag in ethtool for switching EQ/CQ mode. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hnae3.h| 2 ++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 19 -- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h| 2 ++ drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 23 ++ 4 files changed, 44 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 345e8a4..a452874 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -719,6 +719,8 @@ struct hnae3_roce_private_info { enum hnae3_pflag { HNAE3_PFLAG_DIM_ENABLE, + HNAE3_PFLAG_TX_CQE_MODE, + HNAE3_PFLAG_RX_CQE_MODE, HNAE3_PFLAG_MAX }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index d1243ea..93f7731 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4144,6 +4144,7 @@ static void hns3_info_show(struct hns3_nic_priv *priv) static void hns3_state_init(struct hnae3_handle *handle) { + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev); struct net_device *netdev = handle->kinfo.netdev; struct hns3_nic_priv *priv = netdev_priv(netdev); @@ -4151,10 +4152,24 @@ static void hns3_state_init(struct hnae3_handle *handle) set_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state); handle->priv_flags |= BIT(HNAE3_PFLAG_DIM_ENABLE); set_bit(HNAE3_PFLAG_DIM_ENABLE, &handle->supported_pflags); + + /* device version above V3(include V3), GL can switch CQ/EQ period +* mode. 
+*/ + if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) { + set_bit(HNAE3_PFLAG_TX_CQE_MODE, &handle->supported_pflags); + set_bit(HNAE3_PFLAG_RX_CQE_MODE, &handle->supported_pflags); + } + + if (priv->tx_cqe_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE) + handle->priv_flags |= BIT(HNAE3_PFLAG_TX_CQE_MODE); + + if (priv->rx_cqe_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE) + handle->priv_flags |= BIT(HNAE3_PFLAG_RX_CQE_MODE); } -static void hns3_set_cq_period_mode(struct hns3_nic_priv *priv, - enum dim_cq_period_mode mode, bool is_tx) +void hns3_set_cq_period_mode(struct hns3_nic_priv *priv, +enum dim_cq_period_mode mode, bool is_tx) { struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); struct hnae3_handle *handle = priv->ae_handle; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index c6c082a..ecdb544 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -635,4 +635,6 @@ void hns3_dbg_uninit(struct hnae3_handle *handle); void hns3_dbg_register_debugfs(const char *debugfs_dir_name); void hns3_dbg_unregister_debugfs(void); void hns3_shinfo_pack(struct skb_shared_info *shinfo, __u32 *size); +void hns3_set_cq_period_mode(struct hns3_nic_priv *priv, +enum dim_cq_period_mode mode, bool is_tx); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 6904c0a..8de2789 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -417,8 +417,31 @@ static void hns3_update_dim_state(struct net_device *netdev, bool enable) hns3_update_state(netdev, HNS3_NIC_STATE_DIM_ENABLE, enable); } +static void hns3_update_cqe_mode(struct net_device *netdev, bool enable, bool is_tx) +{ + struct hns3_nic_priv *priv = netdev_priv(netdev); + enum dim_cq_period_mode mode; + + mode = enable ? 
DIM_CQ_PERIOD_MODE_START_FROM_CQE : + DIM_CQ_PERIOD_MODE_START_FROM_EQE; + + hns3_set_cq_period_mode(priv, mode, is_tx); +} + +static void hns3_update_tx_cqe_mode(struct net_device *netdev, bool enable) +{ + hns3_update_cqe_mode(netdev, enable, true); +} + +static void hns3_update_rx_cqe_mode(struct net_device *netdev, bool enable) +{ + hns3_update_cqe_mode(netdev, enable, false); +} + static const struct hns3_pflag_desc hns3_priv_flags[HNAE3_PFLAG_MAX] = { { "dim_enable", hns3_update_dim_state }, + { "tx_cqe_mode",hns3_update_tx_cqe_mode }, + { "rx_cqe_mode",hns3_update_rx_cqe_mode }, }; static int hns3_get_sset_count(struct net_device *netdev, int stringset) -- 2.7.4
[PATCH net-next 01/11] net: hns3: add support for configuring interrupt quantity limiting
QL(quantity limiting) means that hardware supports the interrupt coalesce based on the frame quantity. QL can be configured when int_ql_max in device's specification is non-zero, so add support to configure it. Also, rename two coalesce init function to fit their purpose. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 65 -- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h| 13 - drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 43 +- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 1 + .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 1 + 5 files changed, 105 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index a362516..6e08719 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -237,35 +237,68 @@ void hns3_set_vector_coalesce_tx_gl(struct hns3_enet_tqp_vector *tqp_vector, writel(tx_gl_reg, tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET); } -static void hns3_vector_gl_rl_init(struct hns3_enet_tqp_vector *tqp_vector, - struct hns3_nic_priv *priv) +void hns3_set_vector_coalesce_tx_ql(struct hns3_enet_tqp_vector *tqp_vector, + u32 ql_value) { + writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_TX_QL_OFFSET); +} + +void hns3_set_vector_coalesce_rx_ql(struct hns3_enet_tqp_vector *tqp_vector, + u32 ql_value) +{ + writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_RX_QL_OFFSET); +} + +static void hns3_vector_coalesce_init(struct hns3_enet_tqp_vector *tqp_vector, + struct hns3_nic_priv *priv) +{ + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); + struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal; + struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal; + /* initialize the configuration for interrupt coalescing. * 1. GL (Interrupt Gap Limiter) * 2. RL (Interrupt Rate Limiter) +* 3. 
QL (Interrupt Quantity Limiter) * * Default: enable interrupt coalescing self-adaptive and GL */ - tqp_vector->tx_group.coal.gl_adapt_enable = 1; - tqp_vector->rx_group.coal.gl_adapt_enable = 1; + tx_coal->gl_adapt_enable = 1; + rx_coal->gl_adapt_enable = 1; + + tx_coal->int_gl = HNS3_INT_GL_50K; + rx_coal->int_gl = HNS3_INT_GL_50K; - tqp_vector->tx_group.coal.int_gl = HNS3_INT_GL_50K; - tqp_vector->rx_group.coal.int_gl = HNS3_INT_GL_50K; + rx_coal->flow_level = HNS3_FLOW_LOW; + tx_coal->flow_level = HNS3_FLOW_LOW; - tqp_vector->rx_group.coal.flow_level = HNS3_FLOW_LOW; - tqp_vector->tx_group.coal.flow_level = HNS3_FLOW_LOW; + if (ae_dev->dev_specs.int_ql_max) { + tx_coal->ql_enable = 1; + rx_coal->ql_enable = 1; + tx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max; + rx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max; + tx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG; + rx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG; + } } -static void hns3_vector_gl_rl_init_hw(struct hns3_enet_tqp_vector *tqp_vector, - struct hns3_nic_priv *priv) +static void +hns3_vector_coalesce_init_hw(struct hns3_enet_tqp_vector *tqp_vector, +struct hns3_nic_priv *priv) { + struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal; + struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal; struct hnae3_handle *h = priv->ae_handle; - hns3_set_vector_coalesce_tx_gl(tqp_vector, - tqp_vector->tx_group.coal.int_gl); - hns3_set_vector_coalesce_rx_gl(tqp_vector, - tqp_vector->rx_group.coal.int_gl); + hns3_set_vector_coalesce_tx_gl(tqp_vector, tx_coal->int_gl); + hns3_set_vector_coalesce_rx_gl(tqp_vector, rx_coal->int_gl); hns3_set_vector_coalesce_rl(tqp_vector, h->kinfo.int_rl_setting); + + if (tx_coal->ql_enable) + hns3_set_vector_coalesce_tx_ql(tqp_vector, tx_coal->int_ql); + + if (rx_coal->ql_enable) + hns3_set_vector_coalesce_rx_ql(tqp_vector, rx_coal->int_ql); } static int hns3_nic_set_real_num_queue(struct net_device *netdev) @@ -3536,7 +3569,7 @@ static int hns3_nic_init_vector_data(struct 
hns3_nic_priv *priv) for (i = 0; i < priv->vector_num; i++) { tqp_vector = &priv->tqp_vector[i]; - hns3_vector_gl_rl_init_hw(tqp_vector, priv); + hns3_vector_coalesce_init_hw(tqp_vector, priv); tqp_vector->num_tqps = 0; } @@ -3632,7 +3665,7 @@ static i
[PATCH net-next 00/11] net: hns3: updates for -next
There are several updates relating to the interrupt coalesce for the HNS3 ethernet driver. #1 adds support for QL(quantity limiting, interrupt coalesce based on the frame quantity). #2 adds support for 1us unit GL(gap limiting, interrupt coalesce based on the gap time). #3 queries the maximum value of GL from the firmware instead of a fixed value in code. #4 renames gl_adapt_enable in struct hns3_enet_coalesce to fit its new usage. #5 & #6 adds support for the dynamic interrupt moderation, and adds a control private flag in ethtool. #7 adds wrapper function for state initialization. #8 adds a check for the read-only private flag. #9 & #10 adds support for EQ/CQ configuration, and adds a control private flag in ethtool. #11 adds debugfs support for interrupt coalesce. Huazhong Tan (11): net: hns3: add support for configuring interrupt quantity limiting net: hns3: add support for 1us unit GL configuration net: hns3: add support for querying maximum value of GL net: hns3: rename gl_adapt_enable in struct hns3_enet_coalesce net: hns3: add support for dynamic interrupt moderation net: hns3: add ethtool priv-flag for DIM net: hns3: add hns3_state_init() to do state initialization net: hns3: add a check for ethtool priv-flag interface net: hns3: add support for EQ/CQ mode configuration net: hns3: add ethtool priv-flag for EQ/CQ net: hns3: add debugfs support for interrupt coalesce drivers/net/ethernet/hisilicon/Kconfig | 1 + drivers/net/ethernet/hisilicon/hns3/hnae3.h| 12 + drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 125 ++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 258 ++--- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h| 31 ++- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 184 ++- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 8 + .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 8 + .../ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h | 8 + .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 8 + 10 files changed, 604 insertions(+), 39 
deletions(-) -- 2.7.4
[PATCH net-next 07/11] net: hns3: add hns3_state_init() to do state initialization
To improve the readability and maintainability, add hns3_state_init() to initialize the state. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index a567557..f686723 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4144,6 +4144,16 @@ static void hns3_info_show(struct hns3_nic_priv *priv) dev_info(priv->dev, "Max mtu size: %u\n", priv->netdev->max_mtu); } +static void hns3_state_init(struct hnae3_handle *handle) +{ + struct net_device *netdev = handle->kinfo.netdev; + struct hns3_nic_priv *priv = netdev_priv(netdev); + + set_bit(HNS3_NIC_STATE_INITED, &priv->state); + set_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state); + handle->priv_flags |= BIT(HNAE3_PFLAG_DIM_ENABLE); +} + static int hns3_client_init(struct hnae3_handle *handle) { struct pci_dev *pdev = handle->pdev; @@ -4244,9 +4254,7 @@ static int hns3_client_init(struct hnae3_handle *handle) /* MTU range: (ETH_MIN_MTU(kernel default) - 9702) */ netdev->max_mtu = HNS3_MAX_MTU; - set_bit(HNS3_NIC_STATE_INITED, &priv->state); - set_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state); - handle->priv_flags |= BIT(HNAE3_PFLAG_DIM_ENABLE); + hns3_state_init(handle); if (netif_msg_drv(handle)) hns3_info_show(priv); -- 2.7.4
[PATCH net-next 11/11] net: hns3: add debugfs support for interrupt coalesce
Since user may need to check the current configuration of the interrupt coalesce, so add debugfs support for query this info, which includes DIM profile, coalesce configuration of both software and hardware. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 124 + 1 file changed, 124 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index a5ebca8..1efeed6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -12,6 +12,91 @@ static struct dentry *hns3_dbgfs_root; +static ssize_t hns3_dbg_coal_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct hnae3_handle *h = filp->private_data; + struct hns3_nic_priv *priv = h->priv; + struct hns3_enet_tqp_vector *tqp_vector; + struct hns3_enet_coalesce *coal; + int uncopied_bytes; + unsigned int idx; + struct dim *dim; + char *cmd_buf; + + if (*ppos != 0) + return 0; + + if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state)) { + dev_err(&h->pdev->dev, "device is not initialized\n"); + return -EFAULT; + } + + cmd_buf = kzalloc(count + 1, GFP_KERNEL); + if (!cmd_buf) + return -ENOMEM; + + uncopied_bytes = copy_from_user(cmd_buf, buffer, count); + if (uncopied_bytes) { + kfree(cmd_buf); + return -EFAULT; + } + + cmd_buf[count] = '\0'; + + if (kstrtouint(cmd_buf, 0, &idx)) + idx = 0; + + if (idx >= priv->vector_num) { + dev_err(&h->pdev->dev, + "vector index(%u) is out of range(0-%u)\n", idx, + priv->vector_num - 1); + kfree(cmd_buf); + return -EINVAL; + } + + tqp_vector = &priv->tqp_vector[idx]; + coal = &tqp_vector->tx_group.coal; + dim = &tqp_vector->tx_group.dim; + + dev_info(&h->pdev->dev, "vector[%u] interrupt coalesce info:\n", idx); + dev_info(&h->pdev->dev, +"TX DIM info state = %d profile_ix = %d mode = %d tune_state = %d steps_right = %d steps_left = %d tired = %d\n", +dim->state, dim->profile_ix, 
dim->mode, dim->tune_state, +dim->steps_right, dim->steps_left, dim->tired); + + dev_info(&h->pdev->dev, "TX GL info sw_gl = %u, hw_gl = %u\n", +coal->int_gl, +readl(tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET)); + + if (coal->ql_enable) + dev_info(&h->pdev->dev, "TX QL info sw_ql = %u, hw_ql = %u\n", +coal->int_ql, +readl(tqp_vector->mask_addr + HNS3_VECTOR_TX_QL_OFFSET)); + + coal = &tqp_vector->rx_group.coal; + dim = &tqp_vector->rx_group.dim; + + dev_info(&h->pdev->dev, +"RX dim_info state = %d profile_ix = %d mode = %d tune_state = %d steps_right = %d steps_left = %d tired = %d\n", +dim->state, dim->profile_ix, dim->mode, dim->tune_state, +dim->steps_right, dim->steps_left, dim->tired); + + dev_info(&h->pdev->dev, "RX GL info sw_gl = %u, hw_gl = %u\n", +coal->int_gl, +readl(tqp_vector->mask_addr + HNS3_VECTOR_GL0_OFFSET)); + + if (coal->ql_enable) + dev_info(&h->pdev->dev, "RX QL info sw_ql = %u, hw_ql = %u\n", +coal->int_ql, +readl(tqp_vector->mask_addr + HNS3_VECTOR_RX_QL_OFFSET)); + + kfree(cmd_buf); + cmd_buf = NULL; + + return count; +} + static int hns3_dbg_queue_info(struct hnae3_handle *h, const char *cmd_buf) { @@ -352,6 +437,35 @@ static void hns3_dbg_dev_specs(struct hnae3_handle *h) dev_info(priv->dev, "MAX INT GL: %u\n", dev_specs->max_int_gl); } +static ssize_t hns3_dbg_coal_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + int uncopy_bytes; + char *buf; + int len; + + if (*ppos != 0) + return 0; + + if (count < HNS3_DBG_READ_LEN) + return -ENOSPC; + + buf = kzalloc(HNS3_DBG_READ_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + len = scnprintf(buf, HNS3_DBG_READ_LEN, "%s\n", + "Please echo index to coal"); + uncopy_bytes = copy_to_user(buffer, buf, len); + + kfree(buf); + + if (uncopy_bytes) + return -EFAULT; + + return (*ppos = len); +} + static ssize_t hns3_dbg_cmd_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { @@ -452,6 +566,13 @@ static const struct file_operations hns3_dbg_c
[PATCH net-next 09/11] net: hns3: add support for EQ/CQ mode configuration
For device whose version is above V3(include V3), the GL can select EQ or CQ mode, so adds support for it. In CQ mode, the coalesced timer will restart upon new completion, while in EQ mode, the timer will not restart. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hnae3.h| 1 + drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 49 +- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h| 8 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 1 + .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 1 + 5 files changed, 58 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 3642740..345e8a4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -684,6 +684,7 @@ struct hnae3_knic_private_info { u16 int_rl_setting; enum pkt_hash_types rss_type; + void __iomem *io_base; }; struct hnae3_roce_private_info { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index c30cf9e..d1243ea 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3653,9 +3653,7 @@ static void hns3_tx_dim_work(struct work_struct *work) static void hns3_nic_init_dim(struct hns3_enet_tqp_vector *tqp_vector) { INIT_WORK(&tqp_vector->rx_group.dim.work, hns3_rx_dim_work); - tqp_vector->rx_group.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; INIT_WORK(&tqp_vector->tx_group.dim.work, hns3_tx_dim_work); - tqp_vector->tx_group.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; } static int hns3_nic_init_vector_data(struct hns3_nic_priv *priv) @@ -4155,6 +4153,48 @@ static void hns3_state_init(struct hnae3_handle *handle) set_bit(HNAE3_PFLAG_DIM_ENABLE, &handle->supported_pflags); } +static void hns3_set_cq_period_mode(struct hns3_nic_priv *priv, + enum dim_cq_period_mode mode, bool is_tx) +{ + struct hnae3_ae_dev *ae_dev = 
pci_get_drvdata(priv->ae_handle->pdev); + struct hnae3_handle *handle = priv->ae_handle; + int i; + + if (is_tx) { + priv->tx_cqe_mode = mode; + + for (i = 0; i < priv->vector_num; i++) + priv->tqp_vector[i].tx_group.dim.mode = mode; + } else { + priv->rx_cqe_mode = mode; + + for (i = 0; i < priv->vector_num; i++) + priv->tqp_vector[i].rx_group.dim.mode = mode; + } + + /* only device version above V3(include V3), GL can switch CQ/EQ +* period mode. +*/ + if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) { + u32 new_mode; + u64 reg; + + new_mode = (mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE) ? + HNS3_CQ_MODE_CQE : HNS3_CQ_MODE_EQE; + reg = is_tx ? HNS3_GL1_CQ_MODE_REG : HNS3_GL0_CQ_MODE_REG; + + writel(new_mode, handle->kinfo.io_base + reg); + } +} + +static void hns3_cq_period_mode_init(struct hns3_nic_priv *priv, +enum dim_cq_period_mode tx_mode, +enum dim_cq_period_mode rx_mode) +{ + hns3_set_cq_period_mode(priv, tx_mode, true); + hns3_set_cq_period_mode(priv, rx_mode, false); +} + static int hns3_client_init(struct hnae3_handle *handle) { struct pci_dev *pdev = handle->pdev; @@ -4220,6 +4260,9 @@ static int hns3_client_init(struct hnae3_handle *handle) goto out_init_ring; } + hns3_cq_period_mode_init(priv, DIM_CQ_PERIOD_MODE_START_FROM_EQE, +DIM_CQ_PERIOD_MODE_START_FROM_EQE); + ret = hns3_init_phy(netdev); if (ret) goto out_init_phy; @@ -4580,6 +4623,8 @@ static int hns3_reset_notify_init_enet(struct hnae3_handle *handle) if (ret) goto err_uninit_vector; + hns3_cq_period_mode_init(priv, priv->tx_cqe_mode, priv->rx_cqe_mode); + /* the device can work without cpu rmap, only aRFS needs it */ ret = hns3_set_rx_cpu_rmap(netdev); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index eb4e7ef..c6c082a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -188,6 +188,12 @@ enum hns3_nic_state { #define HNS3_RING_EN_B 0 +#define 
HNS3_GL0_CQ_MODE_REG 0x20d00 +#define HNS3_GL1_CQ_MODE_REG 0x20d04 +#define HNS3_GL2_CQ_MODE_REG 0x20d08 +#define HNS3_CQ_MODE_EQE 1U +#define HNS3_CQ_MODE_CQE 0U + enum hns3_pkt_l2t_type { HNS3_L2_TYPE_UNICAST,
[PATCH net-next 05/11] net: hns3: add support for dynamic interrupt moderation
Add dynamic interrupt moderation support for the HNS3 driver. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/Kconfig | 1 + drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 87 - drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 4 ++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/Kconfig b/drivers/net/ethernet/hisilicon/Kconfig index 44f9279..fa6025d 100644 --- a/drivers/net/ethernet/hisilicon/Kconfig +++ b/drivers/net/ethernet/hisilicon/Kconfig @@ -130,6 +130,7 @@ config HNS3_ENET default m depends on 64BIT && PCI depends on INET + select DIMLIB help This selects the Ethernet Driver for Hisilicon Network Subsystem 3 for hip08 family of SoCs. This module depends upon HNAE3 driver to access the HNAE3 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 999a2aa..9e895b9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -96,6 +96,7 @@ static irqreturn_t hns3_irq_handle(int irq, void *vector) struct hns3_enet_tqp_vector *tqp_vector = vector; napi_schedule_irqoff(&tqp_vector->napi); + tqp_vector->event_cnt++; return IRQ_HANDLED; } @@ -199,6 +200,8 @@ static void hns3_vector_disable(struct hns3_enet_tqp_vector *tqp_vector) disable_irq(tqp_vector->vector_irq); napi_disable(&tqp_vector->napi); + cancel_work_sync(&tqp_vector->rx_group.dim.work); + cancel_work_sync(&tqp_vector->tx_group.dim.work); } void hns3_set_vector_coalesce_rl(struct hns3_enet_tqp_vector *tqp_vector, @@ -3401,6 +3404,32 @@ static void hns3_update_new_int_gl(struct hns3_enet_tqp_vector *tqp_vector) tqp_vector->last_jiffies = jiffies; } +static void hns3_update_rx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) +{ + struct hns3_enet_ring_group *rx_group = &tqp_vector->rx_group; + struct dim_sample sample = {}; + + if (!rx_group->coal.adapt_enable) + return; + + dim_update_sample(tqp_vector->event_cnt, 
rx_group->total_packets, + rx_group->total_bytes, &sample); + net_dim(&rx_group->dim, sample); +} + +static void hns3_update_tx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) +{ + struct hns3_enet_ring_group *tx_group = &tqp_vector->tx_group; + struct dim_sample sample = {}; + + if (!tx_group->coal.adapt_enable) + return; + + dim_update_sample(tqp_vector->event_cnt, tx_group->total_packets, + tx_group->total_bytes, &sample); + net_dim(&tx_group->dim, sample); +} + static int hns3_nic_common_poll(struct napi_struct *napi, int budget) { struct hns3_nic_priv *priv = netdev_priv(napi->dev); @@ -3444,7 +3473,13 @@ static int hns3_nic_common_poll(struct napi_struct *napi, int budget) if (napi_complete(napi) && likely(!test_bit(HNS3_NIC_STATE_DOWN, &priv->state))) { - hns3_update_new_int_gl(tqp_vector); + if (test_bit(HNS3_NIC_STATE_DIM_ENABLE, &priv->state)) { + hns3_update_rx_int_coalesce(tqp_vector); + hns3_update_tx_int_coalesce(tqp_vector); + } else { + hns3_update_new_int_gl(tqp_vector); + } + hns3_mask_vector_irq(tqp_vector, 1); } @@ -3575,6 +3610,54 @@ static void hns3_nic_set_cpumask(struct hns3_nic_priv *priv) } } +static void hns3_rx_dim_work(struct work_struct *work) +{ + struct dim *dim = container_of(work, struct dim, work); + struct hns3_enet_ring_group *group = container_of(dim, + struct hns3_enet_ring_group, dim); + struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector; + struct dim_cq_moder cur_moder = + net_dim_get_rx_moderation(dim->mode, dim->profile_ix); + + hns3_set_vector_coalesce_rx_gl(group->ring->tqp_vector, cur_moder.usec); + tqp_vector->rx_group.coal.int_gl = cur_moder.usec; + + if (cur_moder.pkts < tqp_vector->rx_group.coal.int_ql_max) { + hns3_set_vector_coalesce_rx_ql(tqp_vector, cur_moder.pkts); + tqp_vector->rx_group.coal.int_ql = cur_moder.pkts; + } + + dim->state = DIM_START_MEASURE; +} + +static void hns3_tx_dim_work(struct work_struct *work) +{ + struct dim *dim = container_of(work, struct dim, work); + struct 
hns3_enet_ring_group *group = container_of(dim, + struct hns3_enet_ring_group, dim); + struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector; + struct dim_cq_moder cur_moder = + net_dim_get_tx_moderation(dim->mode, dim->profile_ix); + + hns3_set_vector_coalesce_tx_gl(tqp_vector, cur_moder.usec); + tqp_vector->
[PATCH net-next 03/11] net: hns3: add support for querying maximum value of GL
For maintainability and compatibility, add support for querying the maximum value of GL. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 + drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c| 1 + drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 1 - drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c| 14 -- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h| 8 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 ++ drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h | 8 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 6 ++ 8 files changed, 38 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 912c51e..f9d4d23 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -278,6 +278,7 @@ struct hnae3_dev_specs { u16 rss_ind_tbl_size; u16 rss_key_size; u16 int_ql_max; /* max value of interrupt coalesce based on INT_QL */ + u16 max_int_gl; /* max value of interrupt coalesce based on INT_GL */ u8 max_non_tso_bd_num; /* max BD number of one non-TSO packet */ }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index dc9a857..a5ebca8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -349,6 +349,7 @@ static void hns3_dbg_dev_specs(struct hnae3_handle *h) dev_info(priv->dev, "Desc num per RX queue: %u\n", kinfo->num_rx_desc); dev_info(priv->dev, "Total number of enabled TCs: %u\n", kinfo->num_tc); dev_info(priv->dev, "MAX INT QL: %u\n", dev_specs->int_ql_max); + dev_info(priv->dev, "MAX INT GL: %u\n", dev_specs->max_int_gl); } static ssize_t hns3_dbg_cmd_read(struct file *filp, char __user *buffer, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index b37635d..4651ad1 100644 --- 
a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -420,7 +420,6 @@ enum hns3_flow_level_range { HNS3_FLOW_ULTRA = 3, }; -#define HNS3_INT_GL_MAX0x1FE0 #define HNS3_INT_GL_50K0x0014 #define HNS3_INT_GL_20K0x0032 #define HNS3_INT_GL_18K0x0036 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 128e9ec..8d5c194 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1130,19 +1130,21 @@ static int hns3_get_coalesce(struct net_device *netdev, static int hns3_check_gl_coalesce_para(struct net_device *netdev, struct ethtool_coalesce *cmd) { + struct hnae3_handle *handle = hns3_get_handle(netdev); + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev); u32 rx_gl, tx_gl; - if (cmd->rx_coalesce_usecs > HNS3_INT_GL_MAX) { + if (cmd->rx_coalesce_usecs > ae_dev->dev_specs.max_int_gl) { netdev_err(netdev, - "Invalid rx-usecs value, rx-usecs range is 0-%d\n", - HNS3_INT_GL_MAX); + "invalid rx-usecs value, rx-usecs range is 0-%u\n", + ae_dev->dev_specs.max_int_gl); return -EINVAL; } - if (cmd->tx_coalesce_usecs > HNS3_INT_GL_MAX) { + if (cmd->tx_coalesce_usecs > ae_dev->dev_specs.max_int_gl) { netdev_err(netdev, - "Invalid tx-usecs value, tx-usecs range is 0-%d\n", - HNS3_INT_GL_MAX); + "invalid tx-usecs value, tx-usecs range is 0-%u\n", + ae_dev->dev_specs.max_int_gl); return -EINVAL; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 096e26a..5b7967c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -1103,6 +1103,14 @@ struct hclge_dev_specs_0_cmd { __le32 max_tm_rate; }; +#define HCLGE_DEF_MAX_INT_GL 0x1FE0U + +struct hclge_dev_specs_1_cmd { + __le32 rsv0; + __le16 max_int_gl; + u8 rsv1[18]; +}; + int 
hclge_cmd_init(struct hclge_dev *hdev); static inline void hclge_write_reg(void __iomem *base, u32 reg, u32 value) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 8bcdb28..7102001 100644 --- a/drivers/net/ethernet/hisilicon/h
[PATCH net-next 02/11] net: hns3: add support for 1us unit GL configuration
For device whose version is above V3(include V3), the GL configuration can set as 1us unit, so adds support for configuring this field. Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 26 ++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.h| 3 +++ drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 6 + 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 6e08719..2813fe5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -224,17 +224,27 @@ void hns3_set_vector_coalesce_rl(struct hns3_enet_tqp_vector *tqp_vector, void hns3_set_vector_coalesce_rx_gl(struct hns3_enet_tqp_vector *tqp_vector, u32 gl_value) { - u32 rx_gl_reg = hns3_gl_usec_to_reg(gl_value); + u32 new_val; - writel(rx_gl_reg, tqp_vector->mask_addr + HNS3_VECTOR_GL0_OFFSET); + if (tqp_vector->rx_group.coal.unit_1us) + new_val = gl_value | HNS3_INT_GL_1US; + else + new_val = hns3_gl_usec_to_reg(gl_value); + + writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL0_OFFSET); } void hns3_set_vector_coalesce_tx_gl(struct hns3_enet_tqp_vector *tqp_vector, u32 gl_value) { - u32 tx_gl_reg = hns3_gl_usec_to_reg(gl_value); + u32 new_val; + + if (tqp_vector->tx_group.coal.unit_1us) + new_val = gl_value | HNS3_INT_GL_1US; + else + new_val = hns3_gl_usec_to_reg(gl_value); - writel(tx_gl_reg, tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET); + writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET); } void hns3_set_vector_coalesce_tx_ql(struct hns3_enet_tqp_vector *tqp_vector, @@ -272,6 +282,14 @@ static void hns3_vector_coalesce_init(struct hns3_enet_tqp_vector *tqp_vector, rx_coal->flow_level = HNS3_FLOW_LOW; tx_coal->flow_level = HNS3_FLOW_LOW; + /* device version above V3(include V3), GL can configure 1us +* unit, so uses 1us unit. 
+*/ + if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) { + tx_coal->unit_1us = 1; + rx_coal->unit_1us = 1; + } + if (ae_dev->dev_specs.int_ql_max) { tx_coal->ql_enable = 1; rx_coal->ql_enable = 1; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 10990bd..b37635d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -426,6 +426,8 @@ enum hns3_flow_level_range { #define HNS3_INT_GL_18K0x0036 #define HNS3_INT_GL_8K 0x007C +#define HNS3_INT_GL_1USBIT(31) + #define HNS3_INT_RL_MAX0x00EC #define HNS3_INT_RL_ENABLE_MASK0x40 @@ -437,6 +439,7 @@ struct hns3_enet_coalesce { u16 int_ql_max; u8 gl_adapt_enable:1; u8 ql_enable:1; + u8 unit_1us:1; enum hns3_flow_level_range flow_level; }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 9af7cb9..128e9ec 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1146,6 +1146,12 @@ static int hns3_check_gl_coalesce_para(struct net_device *netdev, return -EINVAL; } + /* device version above V3(include V3), GL uses 1us unit, +* so the round down is not needed. +*/ + if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) + return 0; + rx_gl = hns3_gl_round_down(cmd->rx_coalesce_usecs); if (rx_gl != cmd->rx_coalesce_usecs) { netdev_info(netdev, -- 2.7.4
[PATCH] KVM: PPC: Book3S: Assign boolean values to a bool variable
From: Kaixu Xia Fix the following coccinelle warnings: ./arch/powerpc/kvm/book3s_xics.c:476:3-15: WARNING: Assignment of 0/1 to bool variable ./arch/powerpc/kvm/book3s_xics.c:504:3-15: WARNING: Assignment of 0/1 to bool variable Reported-by: Tosk Robot Signed-off-by: Kaixu Xia --- arch/powerpc/kvm/book3s_xics.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 5fee5a11550d..303e3cb096db 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -473,7 +473,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, arch_spin_unlock(&ics->lock); local_irq_restore(flags); new_irq = reject; - check_resend = 0; + check_resend = false; goto again; } } else { @@ -501,7 +501,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, state->resend = 0; arch_spin_unlock(&ics->lock); local_irq_restore(flags); - check_resend = 0; + check_resend = false; goto again; } } -- 2.20.0
Re: [PATCH v2 1/1] Fonts: Replace discarded const qualifier
Hi all, On Tue, Nov 03, 2020 at 10:55:23AM +, Lee Jones wrote: > Would you be kind enough to let us know when this lands in Mainline > please? We'll need to back-port it to start fixing up our Stable > kernels ASAP. Patch is in mainline now: 9522750c66c689b739e151fcdf895420dc81efc0 Fonts: Replace discarded const qualifier Thank you, Peilin Ye
Re: [PATCH v4] checkpatch: improve email parsing
On Sat, 2020-11-07 at 10:11 +0530, Dwaipayan Ray wrote: > On Sat, Nov 7, 2020 at 3:34 AM Joe Perches wrote: > > > > On Sat, 2020-11-07 at 03:15 +0530, Dwaipayan Ray wrote: > > > checkpatch doesn't report warnings for many common mistakes > > > in emails. Some of which are trailing commas and incorrect > > > use of email comments. > > > > Assuming it all works, this looks good. I haven't tested it. > > > > How did you test the $fix bits? > > > Hi, > I actually dumped about 17k unique emails from git log, put it in one of > my previous patches, and ran checkpatch with --fix on it. > I checked the diff and most of the cases looked pretty good to me. > I could send the diff output if you like? Please. Likely just to me as I imagine it's not interesting to most. > > Trivial notes: > > > > > diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl > > [] > > > + # sta...@vger.kernel.org or > > > sta...@kernel.org shouldn't > > > + # have an email name. In addition commments > > > should strictly > > > + # begin with a # > > > + if ($email =~ > > > /^.*stable\@(?:vger\.)?kernel\.org/) { > > > > Likely better to test with a case insensitive match so > > sta...@vger.kernel.org and such are still warned. > > Sure, I will do that. > > > > if ($email =~ > > /\bstable\@(?:vger\.)?kernel\.org\b/i) { > > > > > + if ($sign_off =~ /cc:$/i && > > > (($comment ne "" && $comment !~ /^#.+/) || > > > + ($email_name ne ""))) { > > > > > > $sign_off !~ /^cc:/i ? > > I actually had a doubt about that one. Only the stable address with > Cc: should be checked right? Or something else? yes. > What about those stable addresses with tags other than Cc: ? Should > a change be suggested? Ideally yes, but there were very few of those in the git commit history so it's probably not a big deal one way or another.
Re: [RFC] proc: get_wchan() stack unwind only makes sense for sleeping/non-self tasks
On Thu, 5 Nov 2020 15:11:32 -0800 Vineet Gupta wrote: > Most architectures currently check this in their get_wchan() implementation > (ARC doesn't hence this patch). However doing this in core code shows > the semantics better so move the check one level up (eventually remove > the boiler-plate code from arches) It would be nice to clean up the arch callees in the same patch, at least so it doesn't get forgotten about. Are you prepared to propose such a change?
Re: [PATCH v6 2/2] fs: ext4: Modify inode-test.c to use KUnit parameterized testing feature
On Sat, Nov 7, 2020 at 3:23 AM Arpitha Raghunandan <98.a...@gmail.com> wrote: > > Modify fs/ext4/inode-test.c to use the parameterized testing > feature of KUnit. > > Signed-off-by: Arpitha Raghunandan <98.a...@gmail.com> > --- This looks good to me. Thanks! Reviewed-by: David Gow -- David
Re: [PATCH v6 1/2] kunit: Support for Parameterized Testing
On Sat, Nov 7, 2020 at 3:22 AM Arpitha Raghunandan <98.a...@gmail.com> wrote: > > Implementation of support for parameterized testing in KUnit. > This approach requires the creation of a test case using the > KUNIT_CASE_PARAM macro that accepts a generator function as input. > This generator function should return the next parameter given the > previous parameter in parameterized tests. It also provides > a macro to generate common-case generators. > > Signed-off-by: Arpitha Raghunandan <98.a...@gmail.com> > Co-developed-by: Marco Elver > Signed-off-by: Marco Elver > --- This looks good to me! A couple of minor thoughts about the output format below, but I'm quite happy to have this as-is regardless. Reviewed-by: David Gow Cheers, -- David > Changes v5->v6: > - Fix alignment to maintain consistency > Changes v4->v5: > - Update kernel-doc comments. > - Use const void* for generator return and prev value types. > - Add kernel-doc comment for KUNIT_ARRAY_PARAM. > - Rework parameterized test case execution strategy: each parameter is > executed > as if it was its own test case, with its own test initialization and cleanup > (init and exit are called, etc.). However, we cannot add new test cases per > TAP > protocol once we have already started execution. Instead, log the result of > each parameter run as a diagnostic comment. 
> Changes v3->v4: > - Rename kunit variables > - Rename generator function helper macro > - Add documentation for generator approach > - Display test case name in case of failure along with param index > Changes v2->v3: > - Modifictaion of generator macro and method > Changes v1->v2: > - Use of a generator method to access test case parameters > > include/kunit/test.h | 36 ++ > lib/kunit/test.c | 46 +++- > 2 files changed, 69 insertions(+), 13 deletions(-) > > diff --git a/include/kunit/test.h b/include/kunit/test.h > index db1b0ae666c4..16616d3974f9 100644 > --- a/include/kunit/test.h > +++ b/include/kunit/test.h > @@ -107,6 +107,7 @@ struct kunit; > * > * @run_case: the function representing the actual test case. > * @name: the name of the test case. > + * @generate_params: the generator function for parameterized tests. > * > * A test case is a function with the signature, > * ``void (*)(struct kunit *)`` > @@ -141,6 +142,7 @@ struct kunit; > struct kunit_case { > void (*run_case)(struct kunit *test); > const char *name; > + const void* (*generate_params)(const void *prev); > > /* private: internal use only. */ > bool success; > @@ -163,6 +165,22 @@ static inline char *kunit_status_to_string(bool status) > */ > #define KUNIT_CASE(test_name) { .run_case = test_name, .name = #test_name } > > +/** > + * KUNIT_CASE_PARAM - A helper for creation a parameterized &struct > kunit_case > + * > + * @test_name: a reference to a test case function. > + * @gen_params: a reference to a parameter generator function. > + * > + * The generator function ``const void* gen_params(const void *prev)`` is > used > + * to lazily generate a series of arbitrarily typed values that fit into a > + * void*. The argument @prev is the previously returned value, which should > be > + * used to derive the next value; @prev is set to NULL on the initial > generator > + * call. When no more values are available, the generator must return NULL. 
> + */ > +#define KUNIT_CASE_PARAM(test_name, gen_params)\ > + { .run_case = test_name, .name = #test_name,\ > + .generate_params = gen_params } > + > /** > * struct kunit_suite - describes a related collection of &struct kunit_case > * > @@ -208,6 +226,10 @@ struct kunit { > const char *name; /* Read only after initialization! */ > char *log; /* Points at case log after initialization */ > struct kunit_try_catch try_catch; > + /* param_value is the current parameter value for a test case. */ > + const void *param_value; > + /* param_index stores the index of the parameter in parameterized > tests. */ > + int param_index; > /* > * success starts as true, and may only be set to false during a > * test case; thus, it is safe to update this across multiple > @@ -1742,4 +1764,18 @@ do { > \ > fmt, > \ > ##__VA_ARGS__) > > +/** > + * KUNIT_ARRAY_PARAM() - Define test parameter generator from an array. > + * @name: prefix for the test parameter generator function. > + * @array: array of test parameters. > + * > + * Define function @name_gen_params which uses @array to generate parameters. > + */ > +#define KUNIT_ARRAY_PARAM(name, array) > \ >
Re: [PATCH v4] checkpatch: improve email parsing
On Sat, Nov 7, 2020 at 3:34 AM Joe Perches wrote: > > On Sat, 2020-11-07 at 03:15 +0530, Dwaipayan Ray wrote: > > checkpatch doesn't report warnings for many common mistakes > > in emails. Some of which are trailing commas and incorrect > > use of email comments. > > Assuming it all works, this looks good. I haven't tested it. > > How did you test the $fix bits? > Hi, I actually dumped about 17k unique emails from git log, put it in one of my previous patches, and ran checkpatch with --fix on it. I checked the diff and most of the cases looked pretty good to me. I could send the diff output if you like? > Trivial notes: > > > diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl > [] > > + # sta...@vger.kernel.org or sta...@kernel.org > > shouldn't > > + # have an email name. In addition commments > > should strictly > > + # begin with a # > > + if ($email =~ > > /^.*stable\@(?:vger\.)?kernel\.org/) { > > Likely better to test with a case insensitive match so > sta...@vger.kernel.org and such are still warned. Sure, I will do that. > > if ($email =~ > /\bstable\@(?:vger\.)?kernel\.org\b/i) { > > > + if ($sign_off =~ /cc:$/i && > > (($comment ne "" && $comment !~ /^#.+/) || > > + ($email_name ne ""))) { > > || $sign_off !~ /^cc:/i ? I actually had a doubt about that one. Only the stable address with Cc: should be checked right? Or something else? What about those stable addresses with tags other than Cc: ? Should a change be suggested? Thanks, Dwaipayan.
Re: [PATCH 1/3 v4] ftrace: Have the callbacks receive a struct ftrace_regs instead of pt_regs
On Fri, 06 Nov 2020 16:42:35 -0500 Steven Rostedt wrote: > From: "Steven Rostedt (VMware)" > > In preparation to have arguments of a function passed to callbacks attached > to functions as default, change the default callback prototype to receive a > struct ftrace_regs as the forth parameter instead of a pt_regs. > > For callbacks that set the FL_SAVE_REGS flag in their ftrace_ops flags, they > will now need to get the pt_regs via a ftrace_get_regs() helper call. If > this is called by a callback that their ftrace_ops did not have a > FL_SAVE_REGS flag set, it that helper function will return NULL. > > This will allow the ftrace_regs to hold enough just to get the parameters > and stack pointer, but without the worry that callbacks may have a pt_regs > that is not completely filled. > This looks good to me. Reviewed-by: Masami Hiramatsu Thank you, > Signed-off-by: Steven Rostedt (VMware) > --- > arch/x86/kernel/kprobes/ftrace.c | 3 ++- > include/linux/ftrace.h| 16 ++-- > include/linux/kprobes.h | 2 +- > kernel/livepatch/patch.c | 3 ++- > kernel/trace/ftrace.c | 27 +++ > kernel/trace/trace_event_perf.c | 2 +- > kernel/trace/trace_functions.c| 9 - > kernel/trace/trace_irqsoff.c | 2 +- > kernel/trace/trace_sched_wakeup.c | 2 +- > kernel/trace/trace_selftest.c | 20 +++- > kernel/trace/trace_stack.c| 2 +- > 11 files changed, 53 insertions(+), 35 deletions(-) > > diff --git a/arch/x86/kernel/kprobes/ftrace.c > b/arch/x86/kernel/kprobes/ftrace.c > index 954d930a7127..373e5fa3ce1f 100644 > --- a/arch/x86/kernel/kprobes/ftrace.c > +++ b/arch/x86/kernel/kprobes/ftrace.c > @@ -14,8 +14,9 @@ > > /* Ftrace callback handler for kprobes -- called under preepmt disabed */ > void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, > -struct ftrace_ops *ops, struct pt_regs *regs) > +struct ftrace_ops *ops, struct ftrace_regs *fregs) > { > + struct pt_regs *regs = ftrace_get_regs(fregs); > struct kprobe *p; > struct kprobe_ctlblk *kcb; > int bit; > diff --git 
a/include/linux/ftrace.h b/include/linux/ftrace.h > index 8dde9c17aaa5..24e1fa52337d 100644 > --- a/include/linux/ftrace.h > +++ b/include/linux/ftrace.h > @@ -90,8 +90,20 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, > > struct ftrace_ops; > > +struct ftrace_regs { > + struct pt_regs regs; > +}; > + > +static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs > *fregs) > +{ > + if (!fregs) > + return NULL; > + > + return &fregs->regs; > +} > + > typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, > - struct ftrace_ops *op, struct pt_regs *regs); > + struct ftrace_ops *op, struct ftrace_regs *fregs); > > ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); > > @@ -259,7 +271,7 @@ int register_ftrace_function(struct ftrace_ops *ops); > int unregister_ftrace_function(struct ftrace_ops *ops); > > extern void ftrace_stub(unsigned long a0, unsigned long a1, > - struct ftrace_ops *op, struct pt_regs *regs); > + struct ftrace_ops *op, struct ftrace_regs *fregs); > > #else /* !CONFIG_FUNCTION_TRACER */ > /* > diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h > index 629abaf25681..be73350955e4 100644 > --- a/include/linux/kprobes.h > +++ b/include/linux/kprobes.h > @@ -345,7 +345,7 @@ static inline void wait_for_kprobe_optimizer(void) { } > #endif /* CONFIG_OPTPROBES */ > #ifdef CONFIG_KPROBES_ON_FTRACE > extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, > - struct ftrace_ops *ops, struct pt_regs *regs); > + struct ftrace_ops *ops, struct ftrace_regs > *fregs); > extern int arch_prepare_kprobe_ftrace(struct kprobe *p); > #endif > > diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c > index 875c5dbbdd33..f89f9e7e9b07 100644 > --- a/kernel/livepatch/patch.c > +++ b/kernel/livepatch/patch.c > @@ -40,8 +40,9 @@ struct klp_ops *klp_find_ops(void *old_func) > static void notrace klp_ftrace_handler(unsigned long ip, > unsigned long parent_ip, > struct ftrace_ops *fops, > 
-struct pt_regs *regs) > +struct ftrace_regs *fregs) > { > + struct pt_regs *regs = ftrace_get_regs(fregs); > struct klp_ops *ops; > struct klp_func *func; > int patch_state; > diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c > index 3db64fb0cce8..67888311784e 100644 > --- a/kernel/trace/ftrace.c > +++ b/kernel/trace/ftrace.c > @@ -121,7 +121,7 @@ struct ftrace_ops global_ops; >
Re: [PATCH] Documentation: kunit: provide guidance for testing many inputs
On Tue, Nov 3, 2020 at 5:37 AM Daniel Latypov wrote: > > usage.rst goes into a detailed about faking out classes, but currently Nit: a detailed what? > lacks wording about how one might idiomatically test a range of inputs. > > Give an example of how one might test a hash function via macros/helper > funcs and a table-driven test and very briefly discuss pros and cons. > > Also highlight the KUNIT_EXPECT_*_MSG() variants (that aren't mentioned > elsewhere [1]) which are particularly useful in these situations. > > It is also criminally underused at the moment, only appearing in 2 > tests (both written by people involved in KUnit). > > [1] not even on > https://www.kernel.org/doc/html/latest/dev-tools/kunit/api/test.html I suspect we'll eventually want to document the _MSG variants here as well, though it will bloat the page somewhat. In any case, it can be left to a separate patch. > > Signed-off-by: Daniel Latypov > --- Thanks for writing this -- it's definitely a common test pattern which it'd be nice to encourage and explain a bit better. Cheers, -- David > Documentation/dev-tools/kunit/usage.rst | 66 + > 1 file changed, 66 insertions(+) > > diff --git a/Documentation/dev-tools/kunit/usage.rst > b/Documentation/dev-tools/kunit/usage.rst > index 62142a47488c..317390df2b96 100644 > --- a/Documentation/dev-tools/kunit/usage.rst > +++ b/Documentation/dev-tools/kunit/usage.rst > @@ -451,6 +451,72 @@ We can now use it to test ``struct eeprom_buffer``: > destroy_eeprom_buffer(ctx->eeprom_buffer); > } > > +Testing various inputs > +-- Nit: "various" isn't hugely descriptive here. Maybe something like "Testing against multiple inputs" would be better? > + > +Testing just a few inputs might not be enough to have confidence that the > code > +works correctly, e.g. for a hash function. > + > +In such cases, it can be helpful to have a helper macro or function, e.g. > this > +fictitious example for ``md5sum(1)`` > + > +.. 
code-block:: c > + > + /* Note: the cast is to satisfy overly strict type-checking. */ > + #define TEST_MD5(in, want) \ > + md5sum(in, out); \ > + KUNIT_EXPECT_STREQ_MSG(test, (char *)out, want, "md5sum(%s)", > in); > + > + char out[16]; > + TEST_MD5("hello world", "5eb63bbbe01eeed093cb22bb8f5acdc3"); > + TEST_MD5("hello world!", "fc3ff98e8c6a0d3087d515c0473f8677"); > + > +Note the use of ``KUNIT_EXPECT_STREQ_MSG`` to give more context when it fails > +and make it easier to track down. (Yes, in this example, ``want`` is likely > +going to be unique enough on its own). > + > +The ``_MSG`` variants are even more useful when the same expectation is > called > +multiple times (in a loop or helper function) and thus the line number isn't > +enough to identify what failed, like below. > + > +In some cases, it can be helpful to write a *table-driven test* instead, e.g. > + > +.. code-block:: c > + > + int i; > + char out[16]; > + > + struct md5_test_case { > + const char *str; > + const char *md5; > + }; > + > + struct md5_test_case cases[] = { > + { > + .str = "hello world", > + .md5 = "5eb63bbbe01eeed093cb22bb8f5acdc3", > + }, > + { > + .str = "hello world!", > + .md5 = "fc3ff98e8c6a0d3087d515c0473f8677", > + }, > + }; > + for (i = 0; i < ARRAY_SIZE(cases); ++i) { > + md5sum(cases[i].str, out); > + KUNIT_EXPECT_STREQ_MSG(test, (char *)out, cases[i].md5, > + "md5sum(%s)", cases[i].str); > + } > + > + > +There's more boilerplate involved, but it can: > + > +* be more readable when there are multiple inputs/outputs thanks to field > names, > + > + * E.g. see ``fs/ext4/inode-test.c`` for an example of both. > +* reduce duplication if test cases can be shared across multiple tests. > + > + * E.g. if we had a magical ``undo_md5sum`` function, we could reuse > ``cases``. > + This is a bit of a nitpick, but I don't think this is quite conveying the usefulness of table-based testing. Maybe it's that a hypothetical "undo_md5sum" is too unrealistic an example? 
Maybe, instead of having both the macro-based and table-driven examples based around md5sum(), the table-based one could use something more obviously invertible / reusable, and include both in the example code. E.g, something akin to toupper() and tolower() or some other conversion function. I think having a better example here is probably more useful than having both the table- and macro- driven examples test the same thing. > .. _kunit-on-non-uml: > > KUnit on non-UML architectures > > base-commit: 77c8473edf7f7664137f555cfcdc8c460bbd947d > -- > 2.29.1.341.ge80a0c044ae-goog >
Re: [PATCH v1] kunit: tool: unmark test_data as binary blobs
On Fri, Nov 6, 2020 at 7:24 AM Brendan Higgins wrote: > > The tools/testing/kunit/test_data/ directory was marked as binary > because some of the test_data files cause checkpatch warnings. Fix this > by dropping the .gitattributes file. > > Fixes: afc63da64f1e ("kunit: kunit_parser: make parser more robust") > Signed-off-by: Brendan Higgins > --- Reviewed-by: David Gow Thanks. I wasn't able to find any issues which required those files to be binary. For the record, a link to the original issue, which appeared to be with whitespace (spaces before tabs) in git apply: https://lkml.org/lkml/2020/3/13/920 Cheers, -- David
[PATCH 0/2] drivers/tty: delete break after return or goto
Hi, Greg: This patch series deletes code which never runs: { case XXX: return XXX; break; //The break is meaningless, so just delete it. case YYY: goto YYY; break; //The break is meaningless, so just delete it. .. } Bernard Zhao (2): tty/serial: delete break after return drivers/tty: delete break after goto/return drivers/tty/nozomi.c | 4 drivers/tty/serial/imx.c | 5 - 2 files changed, 9 deletions(-) -- 2.29.0
[PATCH 1/2] tty/serial: delete break after return
Delete break after return, which will never run. Signed-off-by: Bernard Zhao --- drivers/tty/serial/imx.c | 5 - 1 file changed, 5 deletions(-) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 1731d9728865..09703079db7b 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -320,7 +320,6 @@ static u32 imx_uart_readl(struct imx_port *sport, u32 offset) switch (offset) { case UCR1: return sport->ucr1; - break; case UCR2: /* * UCR2_SRST is the only bit in the cached registers that might @@ -331,16 +330,12 @@ static u32 imx_uart_readl(struct imx_port *sport, u32 offset) if (!(sport->ucr2 & UCR2_SRST)) sport->ucr2 = readl(sport->port.membase + offset); return sport->ucr2; - break; case UCR3: return sport->ucr3; - break; case UCR4: return sport->ucr4; - break; case UFCR: return sport->ufcr; - break; default: return readl(sport->port.membase + offset); } -- 2.29.0
[PATCH 2/2] drivers/tty: delete break after goto/return
Delete break after goto/return, which will never run. Signed-off-by: Bernard Zhao --- drivers/tty/nozomi.c | 4 1 file changed, 4 deletions(-) diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c index d42b854cb7df..946cc16827aa 100644 --- a/drivers/tty/nozomi.c +++ b/drivers/tty/nozomi.c @@ -414,11 +414,9 @@ static void read_mem32(u32 *buf, const void __iomem *mem_addr_start, buf16 = (u16 *) buf; *buf16 = __le16_to_cpu(readw(ptr)); goto out; - break; case 4: /* 4 bytes */ *(buf) = __le32_to_cpu(readl(ptr)); goto out; - break; } while (i < size_bytes) { @@ -460,7 +458,6 @@ static u32 write_mem32(void __iomem *mem_addr_start, const u32 *buf, buf16 = (const u16 *)buf; writew(__cpu_to_le16(*buf16), ptr); return 2; - break; case 1: /* * also needs to write 4 bytes in this case * so falling through.. @@ -468,7 +465,6 @@ static u32 write_mem32(void __iomem *mem_addr_start, const u32 *buf, case 4: /* 4 bytes */ writel(__cpu_to_le32(*buf), ptr); return 4; - break; } while (i < size_bytes) { -- 2.29.0
[V2] [PATCH] net/ethernet: update ret when ptp_clock is ERROR
We always have to update the value of ret, otherwise the error value may be the previous one. And ptp_clock_register() never returns NULL when PTP_1588_CLOCK is enabled, so we use IS_ERR here. Signed-off-by: Wang Qing --- drivers/net/ethernet/ti/am65-cpts.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpts.c b/drivers/net/ethernet/ti/am65-cpts.c index 75056c1..ec8e56d --- a/drivers/net/ethernet/ti/am65-cpts.c +++ b/drivers/net/ethernet/ti/am65-cpts.c @@ -998,11 +998,10 @@ struct am65_cpts *am65_cpts_create(struct device *dev, void __iomem *regs, am65_cpts_settime(cpts, ktime_to_ns(ktime_get_real())); cpts->ptp_clock = ptp_clock_register(&cpts->ptp_info, cpts->dev); - if (IS_ERR_OR_NULL(cpts->ptp_clock)) { + if (IS_ERR(cpts->ptp_clock)) { dev_err(dev, "Failed to register ptp clk %ld\n", PTR_ERR(cpts->ptp_clock)); - if (!cpts->ptp_clock) - ret = -ENODEV; + ret = PTR_ERR(cpts->ptp_clock); goto refclk_disable; } cpts->phc_index = ptp_clock_index(cpts->ptp_clock); -- 2.7.4
[PATCH] ptp_clock: return EOPNOTSUPP if !CONFIG_PTP_1588_CLOCK
ptp_clock_register() is checked with IS_ERR(), and will crash if !PTP, change return value to ERR_PTR(-EOPNOTSUPP) for the !CONFIG_PTP_1588_CLOCK and so question resolved. Signed-off-by: Wang Qing --- include/linux/ptp_clock_kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index d3e8ba5..05db40c --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -276,7 +276,7 @@ void ptp_cancel_worker_sync(struct ptp_clock *ptp); #else static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, struct device *parent) -{ return NULL; } +{ return ERR_PTR(-EOPNOTSUPP); } static inline int ptp_clock_unregister(struct ptp_clock *ptp) { return 0; } static inline void ptp_clock_event(struct ptp_clock *ptp, -- 2.7.4
Re: [PATCH] MAINTAINERS: add missing file in ext4 entry
On Fri, Oct 30, 2020 at 10:24:35AM +0800, Chao Yu wrote: > include/trace/events/ext4.h belongs to ext4 module, add the file path into > ext4 entry in MAINTAINERS. > > Signed-off-by: Chao Yu Thanks, applied. - Ted
[V2] drm: msm: adreno: use IS_ERR() instead of null pointer check
a6xx_gmu_get_mmio() never returns NULL in case of error, but ERR_PTR(), so we should use IS_ERR() instead of null pointer check and IS_ERR_OR_NULL(). Signed-off-by: Wang Qing --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 491fee4..82420f7 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -492,7 +492,7 @@ static void a6xx_gmu_rpmh_init(struct a6xx_gmu *gmu) void __iomem *seqptr = a6xx_gmu_get_mmio(pdev, "gmu_pdc_seq"); uint32_t pdc_address_offset; - if (!pdcptr || !seqptr) + if (IS_ERR(pdcptr) || IS_ERR(seqptr)) goto err; if (adreno_is_a618(adreno_gpu) || adreno_is_a640(adreno_gpu)) @@ -580,9 +580,9 @@ static void a6xx_gmu_rpmh_init(struct a6xx_gmu *gmu) wmb(); err: - if (!IS_ERR_OR_NULL(pdcptr)) + if (!IS_ERR(pdcptr)) iounmap(pdcptr); - if (!IS_ERR_OR_NULL(seqptr)) + if (!IS_ERR(seqptr)) iounmap(seqptr); } -- 2.7.4
Re: [PATCH memory-model 5/8] tools/memory-model: Add a glossary of LKMM terms
On Fri, Nov 06, 2020 at 10:01:02AM -0800, Paul E. McKenney wrote: > On Fri, Nov 06, 2020 at 09:47:22AM +0800, Boqun Feng wrote: > > On Thu, Nov 05, 2020 at 02:00:14PM -0800, paul...@kernel.org wrote: > > > From: "Paul E. McKenney" > > > > > > Signed-off-by: Paul E. McKenney > > > --- > > > tools/memory-model/Documentation/glossary.txt | 155 > > > ++ > > > 1 file changed, 155 insertions(+) > > > create mode 100644 tools/memory-model/Documentation/glossary.txt > > > > > > diff --git a/tools/memory-model/Documentation/glossary.txt > > > b/tools/memory-model/Documentation/glossary.txt > > > new file mode 100644 > > > index 000..036fa28 > > > --- /dev/null > > > +++ b/tools/memory-model/Documentation/glossary.txt > > > @@ -0,0 +1,155 @@ > > > +This document contains brief definitions of LKMM-related terms. Like > > > most > > > +glossaries, it is not intended to be read front to back (except perhaps > > > +as a way of confirming a diagnosis of OCD), but rather to be searched > > > +for specific terms. > > > + > > > + > > > +Address Dependency: When the address of a later memory access is > > > computed > > > + based on the value returned by an earlier load, an "address > > > + dependency" extends from that load extending to the later access. > > > + Address dependencies are quite common in RCU read-side critical > > > + sections: > > > + > > > + 1 rcu_read_lock(); > > > + 2 p = rcu_dereference(gp); > > > + 3 do_something(p->a); > > > + 4 rcu_read_unlock(); > > > + > > > + In this case, because the address of "p->a" on line 3 is computed > > > + from the value returned by the rcu_dereference() on line 2, the > > > + address dependency extends from that rcu_dereference() to that > > > + "p->a". In rare cases, optimizing compilers can destroy address > > > + dependencies. Please see Documentation/RCU/rcu_dereference.txt > > > + for more information. > > > + > > > + See also "Control Dependency". 
> > > + > > > +Acquire: With respect to a lock, acquiring that lock, for example, > > > + using spin_lock(). With respect to a non-lock shared variable, > > > + a special operation that includes a load and which orders that > > > + load before later memory references running on that same CPU. > > > + An example special acquire operation is smp_load_acquire(), > > > + but atomic_read_acquire() and atomic_xchg_acquire() also include > > > + acquire loads. > > > + > > > + When an acquire load returns the value stored by a release store > > > + to that same variable, then all operations preceding that store > > > > Change this to: > > > > When an acquire load reads-from a release store > > > > , and put a reference to "Reads-from"? I think this makes the document > > more consistent in that it makes clear "an acquire load returns the > > value stored by a release store to the same variable" is not a special > > case, it's simple a "Reads-from". > > > > > + happen before any operations following that load acquire. > > > > Add a reference to the definition of "happen before" in explanation.txt? > > How about as shown below? I currently am carrying this as a separate > commit, but I might merge it into this one later on. > Looks good to me, thanks! Regards, Boqun > Thanx, Paul > > > > commit 774a52cd3d80d6b657ae6c14c10bd9fc437068f3 > Author: Paul E. McKenney > Date: Fri Nov 6 09:58:01 2020 -0800 > > tools/memory-model: Tie acquire loads to reads-from > > This commit explicitly makes the connection between acquire loads and > the reads-from relation. It also adds an entry for happens-before, > and refers to the corresponding section of explanation.txt. > > Reported-by: Boqun Feng > Signed-off-by: Paul E. 
McKenney > > diff --git a/tools/memory-model/Documentation/glossary.txt > b/tools/memory-model/Documentation/glossary.txt > index 3924aca..383151b 100644 > --- a/tools/memory-model/Documentation/glossary.txt > +++ b/tools/memory-model/Documentation/glossary.txt > @@ -33,10 +33,11 @@ Acquire: With respect to a lock, acquiring that lock, > for example, > acquire loads. > > When an acquire load returns the value stored by a release store > - to that same variable, then all operations preceding that store > - happen before any operations following that load acquire. > + to that same variable, (in other words, the acquire load "reads > + from" the release store), then all operations preceding that > + store "happen before" any operations following that load acquire. > > - See also "Relaxed" and "Release". > + See also "Happens-Before", "Reads-From", "Relaxed", and "Release". > > Coherence (co): When one CPU's store to a given variable overwrites > either the value from another CPU's store or some later value, > @@ -102,6 +103,11 @@ Fully Orde
Re: [RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()
On 2020-11-06 5:14 p.m., Jason Gunthorpe wrote: > On Fri, Nov 06, 2020 at 01:03:26PM -0700, Logan Gunthorpe wrote: >> I don't think a function like that will work for the p2pmem use case. In >> order to implement proper page freeing I expect I'll need a loop around >> the allocator and vm_insert_mixed()... Something roughly like: >> >> for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { >> vaddr = pci_alloc_p2pmem(pdev, PAGE_SIZE); >> ret = vmf_insert_mixed(vma, addr, >> __pfn_to_pfn_t(virt_to_pfn(vaddr), PFN_DEV | PFN_MAP)); >> } >> >> That way we can call pci_free_p2pmem() when a page's ref count goes to >> zero. I suspect your use case will need to do something similar. > > Yes, but I would say the pci_alloc_p2pmem() layer should be able to > free pages on a page-by-page basis so you don't have to do stuff like > the above. > > There is often a lot of value in having physical contiguous addresses, > so allocating page by page as well seems poor. Agreed. But I'll have to dig to see if genalloc supports freeing blocks in different sizes than the allocations. Logan
Re: [PATCH net-next v4 0/5] bonding: rename bond components
On Fri, 6 Nov 2020 15:04:31 -0500 Jarod Wilson wrote: > The bonding driver's use of master and slave, while largely understood > in technical circles, poses a barrier for inclusion to some potential > members of the development and user community, due to the historical > context of masters and slaves, particularly in the United States. This > is a first full pass at replacing those phrases with more socially > inclusive ones, opting for bond to replace master and port to > replace slave, which is congruent with the bridge and team drivers. If we decide to go ahead with this, we should probably also use it as an opportunity to clean up the more egregious checkpatch warnings, WDYT? Plan minimum - don't add new ones ;)
Re: [PATCH] ASoC: fsl_aud2htx: Remove dev_err() usage after platform_get_irq()
On Sat, Nov 07, 2020 at 10:20:43AM +0800, Shengjiu Wang wrote: > platform_get_irq() would print error message internally, so dev_err() > after platform_get_irq() is not needed > > Signed-off-by: Shengjiu Wang Acked-by: Nicolin Chen
Re: [PATCH v4 1/4] dt-bindings: usb: add rk3328 dwc3 docs
On Wed, Sep 2, 2020 at 11:12 AM wrote: > > From: Cameron Nemo > > Document compatible for dwc3 on the Rockchip rk3328 platform. Hi all, Wanted to give this patch submission a gentle ping. Rob Herring acked the documentation changes, but I have not heard anything from the USB or Rockchip maintainers. This patchset would facilitate USB3 support for Rockchip rk3328 devices like the Pine Rock64. If there is anything I can do to help move this along, please let me know. Thank you, Cameron > > Signed-off-by: Cameron Nemo > --- > Documentation/devicetree/bindings/usb/dwc3.txt | 1 + > Documentation/devicetree/bindings/usb/rockchip,dwc3.txt | 3 ++- > 2 files changed, 3 insertions(+), 1 deletion(-) > > diff --git a/Documentation/devicetree/bindings/usb/dwc3.txt > b/Documentation/devicetree/bindings/usb/dwc3.txt > index d03edf9d3935..d625cd5966e9 100644 > --- a/Documentation/devicetree/bindings/usb/dwc3.txt > +++ b/Documentation/devicetree/bindings/usb/dwc3.txt > @@ -25,6 +25,7 @@ Exception for clocks: > "ti,am437x-dwc3" > "ti,dwc3" > "ti,keystone-dwc3" > +"rockchip,rk3328-dwc3" > "rockchip,rk3399-dwc3" > "xlnx,zynqmp-dwc3" > > diff --git a/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt > b/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt > index 94520493233b..b41f30a61be6 100644 > --- a/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt > +++ b/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt > @@ -1,7 +1,8 @@ > Rockchip SuperSpeed DWC3 USB SoC controller > > Required properties: > -- compatible: should contain "rockchip,rk3399-dwc3" for rk3399 SoC > +- compatible: should contain "rockchip,rk3328-dwc3" for rk3328 SoC > + or "rockchip,rk3399-dwc3" for rk3399 SoC > - clocks: A list of phandle + clock-specifier pairs for the > clocks listed in clock-names > - clock-names: Should contain the following: > -- > 2.28.0 >
Re: [PATCH memory-model 5/8] tools/memory-model: Add a glossary of LKMM terms
On Fri, Nov 06, 2020 at 01:04:13PM -0800, Paul E. McKenney wrote: > On Fri, Nov 06, 2020 at 03:40:08PM -0500, Alan Stern wrote: > > Is it really true that data dependencies are so easily destroyed? I > > would expect that a true "semantic" dependency (i.e., one where the > > value written really does vary according to the value read) would be > > rather hard to second guess. > > The usual optimizations apply, for but one example: > > r1 = READ_ONCE(x); > WRITE_ONCE(y, (r1 + 1) % MAX_ELEMENTS); > > If MAX_ELEMENTS is 1, so long, data dependency! Sure, but if MAX_ELEMENTS is 1 then the value written will always be 0 no matter what value r1 has, so it isn't a semantic dependency. Presumably a semantic data dependency would be much more robust. I wonder if it's worth pointing out this distinction to the reader. > With pointers, the compiler has fewer optimization opportunities, > but there are still cases where it can break the dependency. > Or transform it to a control dependency. Transforming a data dependency into a control dependency wouldn't make any important difference; the hardware would still provide the desired ordering. Alan
[PATCH] ASoC: fsl_aud2htx: Remove dev_err() usage after platform_get_irq()
platform_get_irq() would print error message internally, so dev_err() after platform_get_irq() is not needed Signed-off-by: Shengjiu Wang --- sound/soc/fsl/fsl_aud2htx.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sound/soc/fsl/fsl_aud2htx.c b/sound/soc/fsl/fsl_aud2htx.c index 124aeb70f24e..4091ccc7c3e9 100644 --- a/sound/soc/fsl/fsl_aud2htx.c +++ b/sound/soc/fsl/fsl_aud2htx.c @@ -211,11 +211,8 @@ static int fsl_aud2htx_probe(struct platform_device *pdev) } irq = platform_get_irq(pdev, 0); - if (irq < 0) { - dev_err(&pdev->dev, "no irq for node %s\n", - dev_name(&pdev->dev)); + if (irq < 0) return irq; - } ret = devm_request_irq(&pdev->dev, irq, fsl_aud2htx_isr, 0, dev_name(&pdev->dev), aud2htx); -- 2.27.0
[PATCH RT 3/6] mm/memcontrol: Disable preemption in __mod_memcg_lruvec_state()
5.4.74-rt42-rc1 stable review patch. If anyone has any objections, please let me know. -- From: Sebastian Andrzej Siewior The callers expect disabled preemption/interrupts while invoking __mod_memcg_lruvec_state(). This works on mainline because a lock of some kind is acquired. Use preempt_disable_rt() where per-CPU variables are accessed and a stable pointer is expected. This is also done in __mod_zone_page_state() for the same reason. Cc: stable...@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9bdb75ef6d62..c9d02e2272e1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -752,6 +752,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; + preempt_disable_rt(); /* Update memcg */ __mod_memcg_state(memcg, idx, val); @@ -767,6 +768,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, x = 0; } __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); + preempt_enable_rt(); } void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) -- 2.28.0
[PATCH RT 1/6] net: Properly annotate the try-lock for the seqlock
5.4.74-rt42-rc1 stable review patch. If anyone has any objections, please let me know. -- From: Sebastian Andrzej Siewior In patch ("net/Qdisc: use a seqlock instead seqcount") the seqcount has been replaced with a seqlock to allow the reader to boost the preempted writer. The try_write_seqlock() acquired the lock with a try-lock but the seqcount annotation was "lock". Opencode write_seqcount_t_begin() and use the try-lock annotation for lockdep. Reported-by: Mike Galbraith Cc: stable...@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- include/linux/seqlock.h | 9 - include/net/sch_generic.h | 10 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index e5207897c33e..f390293974ea 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -489,15 +489,6 @@ static inline void write_seqlock(seqlock_t *sl) __raw_write_seqcount_begin(&sl->seqcount); } -static inline int try_write_seqlock(seqlock_t *sl) -{ - if (spin_trylock(&sl->lock)) { - __raw_write_seqcount_begin(&sl->seqcount); - return 1; - } - return 0; -} - static inline void write_sequnlock(seqlock_t *sl) { __raw_write_seqcount_end(&sl->seqcount); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e6afb4b9cede..112d2dca8b08 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -168,8 +168,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) return false; } #ifdef CONFIG_PREEMPT_RT - if (try_write_seqlock(&qdisc->running)) + if (spin_trylock(&qdisc->running.lock)) { + seqcount_t *s = &qdisc->running.seqcount; + /* +* Variant of write_seqcount_t_begin() telling lockdep that a +* trylock was attempted. +*/ + __raw_write_seqcount_begin(s); + seqcount_acquire(&s->dep_map, 0, 1, _RET_IP_); return true; + } return false; #else /* Variant of write_seqcount_begin() telling lockdep a trylock -- 2.28.0
[PATCH RT 6/6] Linux 5.4.74-rt42-rc1
5.4.74-rt42-rc1 stable review patch. If anyone has any objections, please let me know. -- From: "Steven Rostedt (VMware)" --- localversion-rt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/localversion-rt b/localversion-rt index 629e0b4384b8..31c892a05e4d 100644 --- a/localversion-rt +++ b/localversion-rt @@ -1 +1 @@ --rt41 +-rt42-rc1 -- 2.28.0
[PATCH RT 5/6] timers: Dont block on ->expiry_lock for TIMER_IRQSAFE
5.4.74-rt42-rc1 stable review patch. If anyone has any objections, please let me know. -- From: Sebastian Andrzej Siewior PREEMPT_RT does not spin and wait until a running timer completes its callback but instead it blocks on a sleeping lock to prevent a deadlock. This blocking can not be done for workqueue's IRQ_SAFE timer which will be canceled in an IRQ-off region. It has to happen in an IRQ-off region because changing the PENDING bit and clearing the timer must not be interrupted to avoid a busy-loop. The callback invocation of IRQSAFE timer is not preempted on PREEMPT_RT so there is no need to synchronize on timer_base::expiry_lock. Don't acquire the timer_base::expiry_lock for TIMER_IRQSAFE flagged timer. Add a lockdep annotation to ensure that this function is always invoked in preemptible context on PREEMPT_RT. Reported-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior Cc: stable...@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) --- kernel/time/timer.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 89078fd848b9..3e9d7f227a5c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1289,7 +1289,7 @@ static void del_timer_wait_running(struct timer_list *timer) u32 tf; tf = READ_ONCE(timer->flags); - if (!(tf & TIMER_MIGRATING)) { + if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { struct timer_base *base = get_timer_base(tf); /* @@ -1373,6 +1373,13 @@ int del_timer_sync(struct timer_list *timer) */ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); + /* +* Must be able to sleep on PREEMPT_RT because of the slowpath in +* del_timer_wait_running(). +*/ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) + might_sleep(); + do { ret = try_to_del_timer_sync(timer); -- 2.28.0
[PATCH RT 4/6] ptrace: fix ptrace_unfreeze_traced() race with rt-lock
5.4.74-rt42-rc1 stable review patch. If anyone has any objections, please let me know. -- From: Oleg Nesterov The patch "ptrace: fix ptrace vs tasklist_lock race" changed ptrace_freeze_traced() to take task->saved_state into account, but ptrace_unfreeze_traced() has the same problem and needs a similar fix: it should check/update both ->state and ->saved_state. Reported-by: Luis Claudio R. Goncalves Fixes: "ptrace: fix ptrace vs tasklist_lock race" Signed-off-by: Oleg Nesterov Signed-off-by: Sebastian Andrzej Siewior Cc: stable...@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) --- kernel/ptrace.c | 23 +++ 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 3075006d720e..3f7156f06b6c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -197,8 +197,8 @@ static bool ptrace_freeze_traced(struct task_struct *task) static void ptrace_unfreeze_traced(struct task_struct *task) { - if (task->state != __TASK_TRACED) - return; + unsigned long flags; + bool frozen = true; WARN_ON(!task->ptrace || task->parent != current); @@ -207,12 +207,19 @@ static void ptrace_unfreeze_traced(struct task_struct *task) * Recheck state under the lock to close this race. */ spin_lock_irq(&task->sighand->siglock); - if (task->state == __TASK_TRACED) { - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - task->state = TASK_TRACED; - } + + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (task->state == __TASK_TRACED) + task->state = TASK_TRACED; + else if (task->saved_state == __TASK_TRACED) + task->saved_state = TASK_TRACED; + else + frozen = false; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + if (frozen && __fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + spin_unlock_irq(&task->sighand->siglock); } -- 2.28.0
[PATCH RT 0/6] Linux 5.4.74-rt42-rc1
Dear RT Folks, This is the RT stable review cycle of patch 5.4.74-rt42-rc1. Please scream at me if I messed something up. Please test the patches too. The -rc release will be uploaded to kernel.org and will be deleted when the final release is out. This is just a review release (or release candidate). The pre-releases will not be pushed to the git repository, only the final release is. If all goes well, this patch will be converted to the next main release on 11/10/2020. Enjoy, -- Steve To build 5.4.74-rt42-rc1 directly, the following patches should be applied: http://www.kernel.org/pub/linux/kernel/v5.x/linux-5.4.tar.xz http://www.kernel.org/pub/linux/kernel/v5.x/patch-5.4.74.xz http://www.kernel.org/pub/linux/kernel/projects/rt/5.4/patch-5.4.74-rt42-rc1.patch.xz You can also build from 5.4.74-rt41 by applying the incremental patch: http://www.kernel.org/pub/linux/kernel/projects/rt/5.4/incr/patch-5.4.74-rt41-rt42-rc1.patch.xz Changes from 5.4.74-rt41: --- Oleg Nesterov (1): ptrace: fix ptrace_unfreeze_traced() race with rt-lock Sebastian Andrzej Siewior (4): net: Properly annotate the try-lock for the seqlock tcp: Remove superfluous BH-disable around listening_hash mm/memcontrol: Disable preemption in __mod_memcg_lruvec_state() timers: Don't block on ->expiry_lock for TIMER_IRQSAFE Steven Rostedt (VMware) (1): Linux 5.4.74-rt42-rc1 include/linux/seqlock.h | 9 - include/net/sch_generic.h | 10 +- kernel/ptrace.c | 23 +++ kernel/time/timer.c | 9 - localversion-rt | 2 +- mm/memcontrol.c | 2 ++ net/ipv4/inet_hashtables.c | 19 --- net/ipv6/inet6_hashtables.c | 5 + 8 files changed, 48 insertions(+), 31 deletions(-)
[PATCH RT 2/6] tcp: Remove superfluous BH-disable around listening_hash
5.4.74-rt42-rc1 stable review patch. If anyone has any objections, please let me know. -- From: Sebastian Andrzej Siewior Commit 9652dc2eb9e40 ("tcp: relax listening_hash operations") removed the need to disable bottom half while acquiring listening_hash.lock. There are still two callers left which disable bottom half before the lock is acquired. Drop local_bh_disable() around __inet_hash() which acquires listening_hash->lock, invoke inet_ehash_nolisten() with disabled BH. inet_unhash() conditionally acquires listening_hash->lock. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- net/ipv4/inet_hashtables.c | 19 --- net/ipv6/inet6_hashtables.c | 5 + 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 006a34b18537..4c8565d6624c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -543,7 +543,9 @@ int __inet_hash(struct sock *sk, struct sock *osk) int err = 0; if (sk->sk_state != TCP_LISTEN) { + local_bh_disable(); inet_ehash_nolisten(sk, osk); + local_bh_enable(); return 0; } WARN_ON(!sk_unhashed(sk)); @@ -575,11 +577,8 @@ int inet_hash(struct sock *sk) { int err = 0; - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); + if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); - local_bh_enable(); - } return err; } @@ -590,17 +589,20 @@ void inet_unhash(struct sock *sk) struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb = NULL; spinlock_t *lock; + bool state_listen; if (sk_unhashed(sk)) return; if (sk->sk_state == TCP_LISTEN) { + state_listen = true; ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &ilb->lock; + spin_lock(&ilb->lock); } else { + state_listen = false; lock = inet_ehash_lockp(hashinfo, sk->sk_hash); + spin_lock_bh(lock); } - spin_lock_bh(lock); if (sk_unhashed(sk)) goto unlock; @@ -613,7 +615,10 @@ void inet_unhash(struct sock *sk) 
__sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); unlock: - spin_unlock_bh(lock); + if (state_listen) + spin_unlock(&ilb->lock); + else + spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(inet_unhash); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index fbe9d4295eac..5d1c1c6967cb 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -287,11 +287,8 @@ int inet6_hash(struct sock *sk) { int err = 0; - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); + if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); - local_bh_enable(); - } return err; } -- 2.28.0
Re: [PATCH] Revert "mm/vunmap: add cond_resched() in vunmap_pmd_range"
On Thu, 5 Nov 2020 09:02:49 -0800 Minchan Kim wrote: > This reverts commit e47110e90584a22e9980510b00d0dfad3a83354e. > > While I was doing zram testing, I found sometimes decompression failed > since the compression buffer was corrupted. With investigation, > I found below commit calls cond_resched unconditionally so it could > make a problem in atomic context if the task is reschedule. > > Revert the original commit for now. > > [ 55.109012] BUG: sleeping function called from invalid context at > mm/vmalloc.c:108 > [ 55.110774] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 946, > name: memhog > [ 55.111973] 3 locks held by memhog/946: > [ 55.112807] #0: 9d01d4b193e8 (&mm->mmap_lock#2){}-{4:4}, at: > __mm_populate+0x103/0x160 > [ 55.114151] #1: a3d53de0 (fs_reclaim){+.+.}-{0:0}, at: > __alloc_pages_slowpath.constprop.0+0xa98/0x1160 > [ 55.115848] #2: 9d01d56b8110 (&zspage->lock){.+.+}-{3:3}, at: > zs_map_object+0x8e/0x1f0 > [ 55.118947] CPU: 0 PID: 946 Comm: memhog Not tainted > 5.9.3-00011-gc5bfc0287345-dirty #316 > [ 55.121265] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS > 1.13.0-1 04/01/2014 > [ 55.122540] Call Trace: > [ 55.122974] dump_stack+0x8b/0xb8 > [ 55.123588] ___might_sleep.cold+0xb6/0xc6 > [ 55.124328] unmap_kernel_range_noflush+0x2eb/0x350 > [ 55.125198] unmap_kernel_range+0x14/0x30 > [ 55.125920] zs_unmap_object+0xd5/0xe0 > [ 55.126604] zram_bvec_rw.isra.0+0x38c/0x8e0 > [ 55.127462] zram_rw_page+0x90/0x101 > [ 55.128199] bdev_write_page+0x92/0xe0 > [ 55.128957] ? swap_slot_free_notify+0xb0/0xb0 > [ 55.129841] __swap_writepage+0x94/0x4a0 > [ 55.130636] ? do_raw_spin_unlock+0x4b/0xa0 > [ 55.131462] ? _raw_spin_unlock+0x1f/0x30 > [ 55.132261] ? page_swapcount+0x6c/0x90 > [ 55.133038] pageout+0xe3/0x3a0 > [ 55.133702] shrink_page_list+0xb94/0xd60 > [ 55.134626] shrink_inactive_list+0x158/0x460 > > ... 
> > --- a/mm/vmalloc.c > +++ b/mm/vmalloc.c > @@ -102,8 +102,6 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long > addr, unsigned long end, > if (pmd_none_or_clear_bad(pmd)) > continue; > vunmap_pte_range(pmd, addr, next, mask); > - > - cond_resched(); > } while (pmd++, addr = next, addr != end); > } If this is triggering a warning then why isn't the might_sleep() in remove_vm_area() also triggering? Sigh. I also cannot remember why these vfree() functions have to be so awkward. The mutex_trylock(&vmap_purge_lock) isn't permitted in interrupt context because mutex_trylock() is stupid, but what was the issue with non-interrupt atomic code?
Re: [PATCH RFC] driver core: Ensure DT devices always have fwnode set
On Fri, Nov 6, 2020 at 11:23 AM Mark Brown wrote: > > On Fri, Nov 06, 2020 at 11:09:17AM -0800, Saravana Kannan wrote: > > > If you want to do this in "one common place", then I think the way to > > do this is have include/linux/of.h provide something like: > > > void of_set_device_of_node(dev, of_node) > > { > > dev->of_node = of_node; > > dev->fw_node = &of_node->fwnode; > >/* bunch of other housekeeping like setting OF_POPULATED and doing > > proper of_node_get() */ > >// Passing NULL for of_node could undo all the above for dev->of_node. > > } > > That also sounds good, particularly if we have a coccinelle spatch I've never used coccinelle. But I can fix 5-10 easily findable ones like i2c, platform, spi, slimbus, spmi, etc. > or > some other mechanism that enforced the usage of the function where > appropriate, my main concern is making sure that we do something which > ensures that the boilerplate stuff is handled. Rob/Frank, I spent an hour or more looking at this and there are many ways of doing this. Wanted to know how much you wanted to do inside these boilerplate functions. This is the minimum we should do in these helper functions. +/** + * of_unset_dev_of_node - Unset a device's of_node + * @dev - device to unset the of_node for + * + * Use this when you delete a device on which you had called + * of_set_dev_of_node() before. + */ +static inline void of_unset_dev_of_node(struct device *dev) +{ + struct device_node *node = dev->of_node + + if (!node) + return; + + dev->fwnode = NULL; + dev->of_node = NULL; + of_node_put(node); +} + +/** + * of_set_dev_of_node - Set a device's of_node + * @dev - device to set the of_node for + * @node - the device_node that the device was constructed from + * + * Use this when you create a new device from a device_node. It takes care some + * of the housekeeping work that's necessary when you set a device's of_node. + * + * Use of_unset_dev_of_node() before you delete the device. 
+ * + * Returns an error if the device already has its of_node set. + */ +static inline int of_set_dev_of_node(struct device *dev, struct device_node *node) +{ + if (!node) + return 0; + + if (WARN_ON(dev->of_node)) + return -EBUSY; + + of_node_get(node); + dev->of_node = node; + dev->fwnode = of_fwnode_handle(node); + + return 0; +} But I also had another version that set/cleared OF_POPULATED. But that meant of_device_alloc() will allocate the device before checking if the node has already been populated (because I'd delete the check that's already there and use the one rolled into these helper functions). I think that inefficiency is okay because I don't think "trying to populate an already populated node" would be a common occurrence. But I wasn't sure how you'd feel about it. Any preferences? Thoughts? Additional context: https://lore.kernel.org/lkml/20201104205431.3795207-2-sarava...@google.com/ -Saravana
Re: [PATCH v2 bpf-next 1/5] bpf: add in-kernel split BTF support
On Fri, Nov 6, 2020 at 5:28 PM Song Liu wrote: > > > > > On Nov 6, 2020, at 3:02 PM, Andrii Nakryiko wrote: > > > > Adjust in-kernel BTF implementation to support a split BTF mode of > > operation. > > Changes are mostly mirroring libbpf split BTF changes, with the exception of > > start_id being 0 for in-kernel implementation due to simpler read-only mode. > > > > Otherwise, for split BTF logic, most of the logic of jumping to base BTF, > > where necessary, is encapsulated in few helper functions. Type numbering and > > string offset in a split BTF are logically continuing where base BTF ends, > > so > > most of the high-level logic is kept without changes. > > > > Type verification and size resolution is only doing an added resolution of > > new > > split BTF types and relies on already cached size and type resolution > > results > > in the base BTF. > > > > Signed-off-by: Andrii Nakryiko > > [...] > > > > > @@ -600,8 +618,15 @@ static const struct btf_kind_operations > > *btf_type_ops(const struct btf_type *t) > > > > static bool btf_name_offset_valid(const struct btf *btf, u32 offset) > > { > > - return BTF_STR_OFFSET_VALID(offset) && > > - offset < btf->hdr.str_len; > > + if (!BTF_STR_OFFSET_VALID(offset)) > > + return false; > > +again: > > + if (offset < btf->start_str_off) { > > + btf = btf->base_btf; > > + goto again; > > Can we do a while loop instead of "goto again;"? yep, not sure why I went with goto... while (offset < btf->start_str_off) btf = btf->base_btf; Shorter. 
> > > + } > > + offset -= btf->start_str_off; > > + return offset < btf->hdr.str_len; > > } > > > > static bool __btf_name_char_ok(char c, bool first, bool dot_ok) > > @@ -615,10 +640,25 @@ static bool __btf_name_char_ok(char c, bool first, > > bool dot_ok) > > return true; > > } > > > > +static const char *btf_str_by_offset(const struct btf *btf, u32 offset) > > +{ > > +again: > > + if (offset < btf->start_str_off) { > > + btf = btf->base_btf; > > + goto again; > > + } > > Maybe add a btf_find_base_btf(btf, offset) helper for this logic? No strong feelings about this, but given it's a two-line loop might not be worth it. I'd also need a pretty verbose btf_find_base_btf_for_str_offset() and btf_find_base_btf_for_type_id(). I feel like loop might be less distracting actually. > > > + > > + offset -= btf->start_str_off; > > + if (offset < btf->hdr.str_len) > > + return &btf->strings[offset]; > > + > > + return NULL; > > +} > > + > > [...] > > > } > > > > const char *btf_name_by_offset(const struct btf *btf, u32 offset) > > { > > - if (offset < btf->hdr.str_len) > > - return &btf->strings[offset]; > > - > > - return NULL; > > + return btf_str_by_offset(btf, offset); > > } > > IIUC, btf_str_by_offset() and btf_name_by_offset() are identical. Can we > just keep btf_name_by_offset()? btf_str_by_offset() is static, so should be inlinable, while btf_name_by_offset() is a global function, I was worrying about regressing performance for __btf_name_valid() and __btf_name_by_offset(). Premature optimization you think? > > > > > const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) > > { > > - if (type_id > btf->nr_types) > > - return NULL; > > +again: > > + if (type_id < btf->start_id) { > > + btf = btf->base_btf; > > + goto again; > > + } > > ditto, goto again.. > > [...] > >
Re: [PATCH v4 06/17] PCI: add SIOV and IMS capability detection
On Fri, Nov 6, 2020 at 4:12 PM Jason Gunthorpe wrote: > > On Fri, Nov 06, 2020 at 03:47:00PM -0800, Dan Williams wrote: [..] > The only sane way to implement this generically is for the VMM to > provide a hypercall to obtain a real *working* addr/data pair(s) and > then have the platform hand those out from > pci_subdevice_msi_create_irq_domain(). Yeah, that seems a logical attach point for this magic. Appreciate you taking the time to lay it out.
Re: [PATCH] mm/memcontrol:rewrite mem_cgroup_page_lruvec()
On Wed, 4 Nov 2020 22:25:16 +0800 Hui Su wrote: > mem_cgroup_page_lruvec() in memcontrol.c and > mem_cgroup_lruvec() in memcontrol.h is very similar > except for the param(page and memcg) which also can be > convert to each other. > > So rewrite mem_cgroup_page_lruvec() with mem_cgroup_lruvec(). Alex Shi's "mm/memcg: warning on !memcg after readahead page charged" (https://lkml.kernel.org/r/1604283436-18880-3-git-send-email-alex@linux.alibaba.com) changes mem_cgroup_page_lruvec() thusly: --- a/mm/memcontrol.c~mm-memcg-warning-on-memcg-after-readahead-page-charged +++ a/mm/memcontrol.c @@ -1325,10 +1325,7 @@ struct lruvec *mem_cgroup_page_lruvec(st } memcg = page_memcg(page); - /* -* Swapcache readahead pages are added to the LRU - and -* possibly migrated - before they are charged. -*/ + VM_WARN_ON_ONCE_PAGE(!memcg, page); if (!memcg) memcg = root_mem_cgroup; So the patch didn't apply. That's easily fixed, but it does make one wonder whether this: > -struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); > +/** > + * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page > + * @page: the page > + * @pgdat: pgdat of the page > + * > + * This function relies on page->mem_cgroup being stable. > + */ > +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, > + struct pglist_data *pgdat) > +{ > + struct mem_cgroup *memcg = page->mem_cgroup; > + > + return mem_cgroup_lruvec(memcg, pgdat); > +} Should be using page_memcg()?
Re: [PATCH v2 bpf-next 1/5] bpf: add in-kernel split BTF support
> On Nov 6, 2020, at 3:02 PM, Andrii Nakryiko wrote: > > Adjust in-kernel BTF implementation to support a split BTF mode of operation. > Changes are mostly mirroring libbpf split BTF changes, with the exception of > start_id being 0 for in-kernel implementation due to simpler read-only mode. > > Otherwise, for split BTF logic, most of the logic of jumping to base BTF, > where necessary, is encapsulated in few helper functions. Type numbering and > string offset in a split BTF are logically continuing where base BTF ends, so > most of the high-level logic is kept without changes. > > Type verification and size resolution is only doing an added resolution of new > split BTF types and relies on already cached size and type resolution results > in the base BTF. > > Signed-off-by: Andrii Nakryiko [...] > > @@ -600,8 +618,15 @@ static const struct btf_kind_operations > *btf_type_ops(const struct btf_type *t) > > static bool btf_name_offset_valid(const struct btf *btf, u32 offset) > { > - return BTF_STR_OFFSET_VALID(offset) && > - offset < btf->hdr.str_len; > + if (!BTF_STR_OFFSET_VALID(offset)) > + return false; > +again: > + if (offset < btf->start_str_off) { > + btf = btf->base_btf; > + goto again; Can we do a while loop instead of "goto again;"? > + } > + offset -= btf->start_str_off; > + return offset < btf->hdr.str_len; > } > > static bool __btf_name_char_ok(char c, bool first, bool dot_ok) > @@ -615,10 +640,25 @@ static bool __btf_name_char_ok(char c, bool first, bool > dot_ok) > return true; > } > > +static const char *btf_str_by_offset(const struct btf *btf, u32 offset) > +{ > +again: > + if (offset < btf->start_str_off) { > + btf = btf->base_btf; > + goto again; > + } Maybe add a btf_find_base_btf(btf, offset) helper for this logic? > + > + offset -= btf->start_str_off; > + if (offset < btf->hdr.str_len) > + return &btf->strings[offset]; > + > + return NULL; > +} > + [...] 
> } > > const char *btf_name_by_offset(const struct btf *btf, u32 offset) > { > - if (offset < btf->hdr.str_len) > - return &btf->strings[offset]; > - > - return NULL; > + return btf_str_by_offset(btf, offset); > } IIUC, btf_str_by_offset() and btf_name_by_offset() are identical. Can we just keep btf_name_by_offset()? > > const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) > { > - if (type_id > btf->nr_types) > - return NULL; > +again: > + if (type_id < btf->start_id) { > + btf = btf->base_btf; > + goto again; > + } ditto, goto again.. [...]
Re: [PATCH 4/9 next] fs/io_uring Don't use the return value from import_iovec().
On 15/09/2020 15:55, David Laight wrote: > > This is the only code that relies on import_iovec() returning > iter.count on success. > This allows a better interface to import_iovec(). Seems this got nowhere. I'll pick it and send with some other patches to Jens. > Signed-off-by: David Laight > --- > fs/io_uring.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/fs/io_uring.c b/fs/io_uring.c > index 3790c7fe9fee..0df43882e4b3 100644 > --- a/fs/io_uring.c > +++ b/fs/io_uring.c > @@ -2824,7 +2824,7 @@ static ssize_t __io_import_iovec(int rw, struct > io_kiocb *req, > > ret = import_single_range(rw, buf, sqe_len, *iovec, iter); > *iovec = NULL; > - return ret < 0 ? ret : sqe_len; > + return ret; > } > > if (req->flags & REQ_F_BUFFER_SELECT) { > @@ -2853,7 +2853,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb > *req, > if (!req->io) > return __io_import_iovec(rw, req, iovec, iter, needs_lock); > *iovec = NULL; > - return iov_iter_count(&req->io->rw.iter); > + return 0; > } > > static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) > @@ -3123,7 +3123,7 @@ static int io_read(struct io_kiocb *req, bool > force_nonblock, > if (ret < 0) > return ret; > iov_count = iov_iter_count(iter); > - io_size = ret; > + io_size = iov_count; > req->result = io_size; > ret = 0; > > @@ -3246,7 +3246,7 @@ static int io_write(struct io_kiocb *req, bool > force_nonblock, > if (ret < 0) > return ret; > iov_count = iov_iter_count(iter); > - io_size = ret; > + io_size = iov_count; > req->result = io_size; > > /* Ensure we clear previously set non-block flag */ > -- Pavel Begunkov
[PATCH v2] x86/xen: don't unbind uninitialized lock_kicker_irq
When booting a hyperthreaded system with the kernel parameter 'mitigations=auto,nosmt', the following warning occurs: WARNING: CPU: 0 PID: 1 at drivers/xen/events/events_base.c:1112 unbind_from_irqhandler+0x4e/0x60 ... Hardware name: Xen HVM domU, BIOS 4.2.amazon 08/24/2006 ... Call Trace: xen_uninit_lock_cpu+0x28/0x62 xen_hvm_cpu_die+0x21/0x30 takedown_cpu+0x9c/0xe0 ? trace_suspend_resume+0x60/0x60 cpuhp_invoke_callback+0x9a/0x530 _cpu_up+0x11a/0x130 cpu_up+0x7e/0xc0 bringup_nonboot_cpus+0x48/0x50 smp_init+0x26/0x79 kernel_init_freeable+0xea/0x229 ? rest_init+0xaa/0xaa kernel_init+0xa/0x106 ret_from_fork+0x35/0x40 The secondary CPUs are not activated with the nosmt mitigations and only the primary thread on each CPU core is used. In this situation, xen_hvm_smp_prepare_cpus(), and more importantly xen_init_lock_cpu(), is not called, so the lock_kicker_irq is not initialized for the secondary CPUs. Let's fix this by exiting early in xen_uninit_lock_cpu() if the irq is not set to avoid the warning from above for each secondary CPU. Signed-off-by: Brian Masney --- Changes since v1: - Remove duplicate per_cpu() call and pass in irq variable. - Changed subject from 'x86/xen: fix warning when running with nosmt mitigations' - Shorten code comment arch/x86/xen/spinlock.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 799f4eba0a62..043c73dfd2c9 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -93,10 +93,20 @@ void xen_init_lock_cpu(int cpu) void xen_uninit_lock_cpu(int cpu) { + int irq; + if (!xen_pvspin) return; - unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); + /* +* When booting the kernel with 'mitigations=auto,nosmt', the secondary +* CPUs are not activated, and lock_kicker_irq is not initialized. 
+*/ + irq = per_cpu(lock_kicker_irq, cpu); + if (irq == -1) + return; + + unbind_from_irqhandler(irq, NULL); per_cpu(lock_kicker_irq, cpu) = -1; kfree(per_cpu(irq_name, cpu)); per_cpu(irq_name, cpu) = NULL; -- 2.26.2
Re: [PATCH v4 2/5] arm64, numa: Change the numa init functions name to be generic
On Fri, Nov 6, 2020 at 11:08 AM Catalin Marinas wrote: > > On Fri, Nov 06, 2020 at 09:33:14AM -0800, Atish Patra wrote: > > On Fri, Nov 6, 2020 at 9:14 AM Catalin Marinas > > wrote: > > > On Mon, Oct 05, 2020 at 05:17:49PM -0700, Atish Patra wrote: > > > > diff --git a/arch/arm64/kernel/acpi_numa.c > > > > b/arch/arm64/kernel/acpi_numa.c > > > > index 7ff800045434..96502ff92af5 100644 > > > > --- a/arch/arm64/kernel/acpi_numa.c > > > > +++ b/arch/arm64/kernel/acpi_numa.c > > > > @@ -117,16 +117,3 @@ void __init acpi_numa_gicc_affinity_init(struct > > > > acpi_srat_gicc_affinity *pa) > > > > > > > > node_set(node, numa_nodes_parsed); > > > > } > > > > - > > > > -int __init arm64_acpi_numa_init(void) > > > > -{ > > > > - int ret; > > > > - > > > > - ret = acpi_numa_init(); > > > > - if (ret) { > > > > - pr_info("Failed to initialise from firmware\n"); > > > > - return ret; > > > > - } > > > > - > > > > - return srat_disabled() ? -EINVAL : 0; > > > > -} > > > > > > I think it's better if arm64_acpi_numa_init() and arm64_numa_init() > > > remained in the arm64 code. It's not really much code to be shared. > > > > RISC-V will probably support ACPI one day. The idea is to not to do > > exercise again in future. > > Moreover, there will be arch_numa_init which will be used by RISC-V > > and there will be arm64_numa_init > > used by arm64. However, if you feel strongly about it, I am happy to > > move back those two functions to arm64. > > I don't have a strong view on this, only if there's a risk at some point > of the implementations diverging (e.g. quirks). We can revisit it if > that happens. > Sure. I seriously hope we don't have to deal with arch specific quirks in future. > It may be worth swapping patches 1 and 2 so that you don't have an > arm64_* function in the core code after the first patch (more of a > nitpick). Either way, feel free to add my ack on both patches: > Sure. I will swap 1 & 2 and resend the series. > Acked-by: Catalin Marinas Thanks. 
-- Regards, Atish
Re: [PATCH v9 2/7] rcu/segcblist: Add counters to segcblist datastructure
On Fri, Nov 06, 2020 at 07:18:47PM -0500, Joel Fernandes wrote: > On Fri, Nov 06, 2020 at 07:01:57PM -0500, Joel Fernandes wrote: > > On Wed, Nov 04, 2020 at 09:01:33AM -0800, Paul E. McKenney wrote: > > > > > A casual reader might be forgiven for being confused by the combination > > > of "Return" in the above comment and the "void" function type below. > > > So shouldn't this comment be something like "Add the specified number > > > of callbacks to the specified segment..."? > > > > You are right, sorry and will fix it. > > > > > > @@ -330,11 +342,16 @@ void rcu_segcblist_extract_pend_cbs(struct > > > > rcu_segcblist *rsclp, > > > > > > > > if (!rcu_segcblist_pend_cbs(rsclp)) > > > > return; /* Nothing to do. */ > > > > + rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_WAIT_TAIL) + > > > > + rcu_segcblist_get_seglen(rsclp, > > > > RCU_NEXT_READY_TAIL) + > > > > + rcu_segcblist_get_seglen(rsclp, RCU_NEXT_TAIL); > > > > > > This should be a "for" loop. Yes, the number and names of the segments > > > hasn't changed for a good long time, but nothing like code as above to > > > inspire Murphy to more mischief. :-/ > > > > > > Actually, why not put the summation in the existing "for" loop below? > > > That would save a line of code in addition to providing less inspiration > > > for Mr. Murphy. > > > > I can do that. Actually Frederic suggested the same thing but I was > > reluctant > > as I felt it did not give much LOC benefit. Will revisit it. > > It reduces 1 line of code :) I changed it to the below, will update the patch: Thank you! And yes, I am much more concerned about the constraints on Mr. Murphy than on the lines of code. 
;-) Thanx, Paul > ---8<--- > > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c > index 9b43d686b1f3..bff9b2253e50 100644 > --- a/kernel/rcu/rcu_segcblist.c > +++ b/kernel/rcu/rcu_segcblist.c > @@ -101,7 +101,7 @@ static void rcu_segcblist_set_seglen(struct rcu_segcblist > *rsclp, int seg, long > WRITE_ONCE(rsclp->seglen[seg], v); > } > > -/* Return number of callbacks in a segment of the segmented callback list. */ > +/* Increase the numeric length of a segment by a specified amount. */ > static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, > long v) > { > WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v); > @@ -406,13 +406,12 @@ void rcu_segcblist_extract_pend_cbs(struct > rcu_segcblist *rsclp, > > if (!rcu_segcblist_pend_cbs(rsclp)) > return; /* Nothing to do. */ > - rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_WAIT_TAIL) + > - rcu_segcblist_get_seglen(rsclp, RCU_NEXT_READY_TAIL) + > - rcu_segcblist_get_seglen(rsclp, RCU_NEXT_TAIL); > + rclp->len = 0; > *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; > rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; > WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); > for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) { > + rclp->len += rcu_segcblist_get_seglen(rsclp, i); > WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); > rcu_segcblist_set_seglen(rsclp, i, 0); > }
Re: [Y2038][time namespaces] Question regarding CLOCK_REALTIME support plans in Linux time namespaces
On Thu, Nov 05 2020 at 12:25, Carlos O'Donell wrote: > On 10/30/20 9:38 PM, Thomas Gleixner wrote: > If kata grows up quickly perhaps this entire problem becomes solved, but until > then I continue to have a testing need for a distinct CLOCK_REALTIME in a > time namespace (and it need not be unconditional, if I have to engage magic > then I'm happy to do that). Conditional, that might be a way to go. Would CONFIG_DEBUG_DISTORTED_CLOCK_REALTIME be a way to go? IOW, something which is clearly in the debug section of the kernel which won't get turned on by distros (*cough*) and comes with a description that any bug reports against it vs. time correctness are going to be ignored. > * Adding CLOCK_REALTIME to the kernel is a lot of work given the expected > guarantees for a local system. Correct. > * CLOCK_REALTIME is an expensive resource to maintain, even more expensive > than other resources where the kernel can balance their usage. Correct. > * On balance it would be better to use vm or vm+containers e.g. kata as a > solution to having CLOCK_REALTIME distinct in the container. That'd be the optimal solution, but the above might be a middle ground. Thanks, tglx
Re: [PATCH] x86/xen: fix warning when running with nosmt mitigations
On 11/5/20 7:47 PM, Brian Masney wrote: > On Thu, Nov 05, 2020 at 07:35:29PM -0500, Brian Masney wrote: >> diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c >> index 799f4eba0a62..4a052459a08e 100644 >> --- a/arch/x86/xen/spinlock.c >> +++ b/arch/x86/xen/spinlock.c >> @@ -93,9 +93,24 @@ void xen_init_lock_cpu(int cpu) >> >> void xen_uninit_lock_cpu(int cpu) >> { >> +int irq; >> + >> if (!xen_pvspin) >> return; >> >> +/* >> + * When booting the kernel with 'mitigations=auto,nosmt', the secondary >> + * CPUs are not activated and only the primary thread on each CPU core >> + * is used. In this situation, xen_hvm_smp_prepare_cpus(), and more >> + * importantly xen_init_lock_cpu(), is not called, so the >> + * lock_kicker_irq is not initialized for the secondary CPUs. Let's >> + * exit early if the irq is not set to avoid a warning in the console >> + * log. >> + */ >> +irq = per_cpu(lock_kicker_irq, cpu); >> +if (irq == -1) >> +return; >> + >> unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); > As soon as I saw this on lore, I saw that I should have passed the irq > variable to unbind_from_irqhandler() rather than doing another per_cpu() > lookup. I'll wait for feedback about the general approach before posting > a v2. This looks good. I'd shorten the comment though: your commit message already describes the scenario. And change the subject to something like "Don't unbind uninitialized lock_kicker_irq". -boris
Re: [PATCH] interconnect: qcom: msm8974: Prevent integer overflow in rate
On Fri, Nov 06, 2020 at 04:48:47PM +0200, Georgi Djakov wrote: > When sync_state support got introduced recently, by default we try to > set the NoCs to run initially at maximum rate. But as these values are > aggregated, we may end with a really big clock rate value, which is > then converted from "u64" to "long" during the clock rate rounding. > But on 32bit platforms this may result an overflow. Fix it by making > sure that the rate is within range. > > Reported-by: Luca Weiss > Fixes: b1d681d8d324 ("interconnect: Add sync state support") > Signed-off-by: Georgi Djakov Reviewed-by: Brian Masney
RE: [PATCH v4 06/17] PCI: add SIOV and IMS capability detection
On Fri, Nov 06 2020 at 09:48, Kevin Tian wrote: >> From: Jason Gunthorpe >> On Wed, Nov 04, 2020 at 01:34:08PM +, Tian, Kevin wrote: >> The interrupt controller is responsible to create an addr/data pair >> for an interrupt message. It sets the message format and ensures it >> routes to the proper CPU interrupt handler. Everything about the >> addr/data pair is owned by the platform interrupt controller. >> >> Devices do not create interrupts. They only trigger the addr/data pair >> the platform gives them. > > I guess that we may just view it from different angles. On x86 platform, > a MSI/IMS capable device directly composes interrupt messages, with > addr/data pair filled by OS. If there is no IOMMU remapping enabled in > the middle, the message just hits the CPU. Your description possibly > is from software side, e.g. describing the hierarchical IRQ domain > concept? No. The device composes nothing. If the interrupt is raised in the device then the MSI block sends the message which was composed by the OS and stored in the device's message store. For PCI/MSI that's the MSI or MSIX table and for IMS that's either on device memory (as IDXD uses) or some completely different location which Jason described. This has absolutely nothing to do with the X86 platform. MSI is a architecture independent mechanism: Send whatever the OS put into the storage to raise an interrupt in the CPU. The device does neither know whether that message is going to be intercepted by an interrupt remapping unit or not. Stop claiming that any of this has anything to do with x86. It has absolutely nothing to do with x86 and looking at MSI from an x86 perspective instead of looking at it from the architecture agnostic technical reality of MSI is the reason why we have this discussion at all. We had a similar discussion vs. the way how IMS interrupts have to be dealt with in terms of irq domains. 
Can you finally stop looking at everything as a big x86/intel/platform lump and understand that things are very well structured and separated both at the hardware and at the software level? > Do you mind providing the link? There were lots of discussions between > you and Thomas. I failed to locate the exact mail when searching above > keywords. In this thread: 20200821002424.119492...@linutronix.de and you were on Cc Thanks, tglx
Re: [PATCH v9 5/7] rcu/segcblist: Remove useless rcupdate.h include
On Thu, Nov 05, 2020 at 06:28:10AM -0800, Paul E. McKenney wrote: > On Wed, Nov 04, 2020 at 07:48:23PM -0800, Paul E. McKenney wrote: > > On Tue, Nov 03, 2020 at 09:26:01AM -0500, Joel Fernandes (Google) wrote: > > > Signed-off-by: Joel Fernandes (Google) > > > > This one looks fine, but depends on the earlier "rcu/segcblist: Add > > counters to segcblist datastructure" patch, which also changes the list > > of #include directives for this file. It manages to avoid conflicting > > with "rcu/trace: Add tracing for how segcb list changes", despite this > > one also changing the #include directives. > > > > I am testing it just out of curiosity, but it might make more sense > > to fold this one into "rcu/segcblist: Add counters to segcblist > > datastructure". > > And it does pass light rcutorture. ;-) Cool, I squashed it into 2/7 and updated my tree. thanks, - Joel > Thanx, Paul > > > > --- > > > kernel/rcu/rcu_segcblist.c | 1 - > > > 1 file changed, 1 deletion(-) > > > > > > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c > > > index 2a03949d0b82..e9e72d72f7a6 100644 > > > --- a/kernel/rcu/rcu_segcblist.c > > > +++ b/kernel/rcu/rcu_segcblist.c > > > @@ -10,7 +10,6 @@ > > > #include > > > #include > > > #include > > > -#include > > > #include > > > > > > #include "rcu_segcblist.h" > > > -- > > > 2.29.1.341.ge80a0c044ae-goog > > >
Re: [PATCH v9 6/7] rcu/tree: segcblist: Remove redundant smp_mb()s
On Wed, Nov 04, 2020 at 07:57:13PM -0800, Paul E. McKenney wrote: > On Tue, Nov 03, 2020 at 09:26:02AM -0500, Joel Fernandes (Google) wrote: > > This memory barrier is not needed as rcu_segcblist_add_len() already > > includes a memory barrier *before* and *after* the length of the list is > > updated. > > > > Same reasoning for rcu_segcblist_enqueue(). > > I suggest a commit log like the following: > > > > The full memory barriers in rcu_segcblist_enqueue() and in rcu_do_batch() > are not needed because rcu_segcblist_add_len(), and thus also > rcu_segcblist_inc_len(), already includes a memory barrier *before* > and *after* the length of the list is updated. > > This commit therefore removes these redundant smp_mb() invocations. > > > > Other than that, looks good! I could hand-apply it, but that > would just cause more churn with the addition of the call to > rcu_segcblist_inc_seglen(). So could you please update the commit log > when you repost, whether to the mailing list or from your git tree? Done, I updated it in my tree. I will share the link to tree on IRC. thanks, - Joel > > Thanx, Paul > > > Reviewed-by: Frederic Weisbecker > > Signed-off-by: Joel Fernandes (Google) > > --- > > kernel/rcu/rcu_segcblist.c | 1 - > > kernel/rcu/tree.c | 1 - > > 2 files changed, 2 deletions(-) > > > > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c > > index e9e72d72f7a6..d96272e8d604 100644 > > --- a/kernel/rcu/rcu_segcblist.c > > +++ b/kernel/rcu/rcu_segcblist.c > > @@ -268,7 +268,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, > >struct rcu_head *rhp) > > { > > rcu_segcblist_inc_len(rsclp); > > - smp_mb(); /* Ensure counts are updated before callback is enqueued. 
*/ > > rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL); > > rhp->next = NULL; > > WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > index f6c6653b3ec2..fb2a5ac4a59c 100644 > > --- a/kernel/rcu/tree.c > > +++ b/kernel/rcu/tree.c > > @@ -2525,7 +2525,6 @@ static void rcu_do_batch(struct rcu_data *rdp) > > > > /* Update counts and requeue any remaining callbacks. */ > > rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); > > - smp_mb(); /* List handling before counting for rcu_barrier(). */ > > rcu_segcblist_add_len(&rdp->cblist, -count); > > > > /* Reinstate batch limit if we have worked down the excess. */ > > -- > > 2.29.1.341.ge80a0c044ae-goog > >
Re: [PATCH v22 12/23] LSM: Specify which LSM to display
On 11/5/2020 1:22 AM, Greg KH wrote: > On Wed, Nov 04, 2020 at 03:41:03PM -0800, Casey Schaufler wrote: >> Create a new entry "display" in the procfs attr directory for >> controlling which LSM security information is displayed for a >> process. A process can only read or write its own display value. >> >> The name of an active LSM that supplies hooks for >> human readable data may be written to "display" to set the >> value. The name of the LSM currently in use can be read from >> "display". At this point there can only be one LSM capable >> of display active. A helper function lsm_task_display() is >> provided to get the display slot for a task_struct. >> >> Setting the "display" requires that all security modules using >> setprocattr hooks allow the action. Each security module is >> responsible for defining its policy. >> >> AppArmor hook provided by John Johansen >> SELinux hook provided by Stephen Smalley >> >> Reviewed-by: Kees Cook >> Acked-by: Stephen Smalley >> Acked-by: Paul Moore >> Signed-off-by: Casey Schaufler >> Cc: linux-...@vger.kernel.org >> --- >> fs/proc/base.c | 1 + >> include/linux/lsm_hooks.h| 17 +++ >> security/apparmor/include/apparmor.h | 3 +- >> security/apparmor/lsm.c | 32 + >> security/security.c | 169 --- >> security/selinux/hooks.c | 11 ++ >> security/selinux/include/classmap.h | 2 +- >> security/smack/smack_lsm.c | 7 ++ >> 8 files changed, 223 insertions(+), 19 deletions(-) >> >> diff --git a/fs/proc/base.c b/fs/proc/base.c >> index 0f707003dda5..7432f24f0132 100644 >> --- a/fs/proc/base.c >> +++ b/fs/proc/base.c >> @@ -2806,6 +2806,7 @@ static const struct pid_entry attr_dir_stuff[] = { >> ATTR(NULL, "fscreate", 0666), >> ATTR(NULL, "keycreate", 0666), >> ATTR(NULL, "sockcreate",0666), >> +ATTR(NULL, "display", 0666), > That's a vague name, any chance it can be more descriptive? Sure. How about lsm_display, or display_lsm? 
I wouldn't say that any of the files in /proc/*/attr have especially descriptive names, but that's hardly an excuse. > And where is the Documentation/ABI/ entries for all of this, how does > userspace know what these things are, and how to use them? I'll add ABI descriptions and move some of the lsm.rst up from where it is later in the patchset. > > thanks, > > greg k-h
Re: [PATCH v9 2/7] rcu/segcblist: Add counters to segcblist datastructure
On Fri, Nov 06, 2020 at 07:01:57PM -0500, Joel Fernandes wrote: > On Wed, Nov 04, 2020 at 09:01:33AM -0800, Paul E. McKenney wrote: > > > A casual reader might be forgiven for being confused by the combination > > of "Return" in the above comment and the "void" function type below. > > So shouldn't this comment be something like "Add the specified number > > of callbacks to the specified segment..."? > > You are right, sorry and will fix it. > > > > @@ -330,11 +342,16 @@ void rcu_segcblist_extract_pend_cbs(struct > > > rcu_segcblist *rsclp, > > > > > > if (!rcu_segcblist_pend_cbs(rsclp)) > > > return; /* Nothing to do. */ > > > + rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_WAIT_TAIL) + > > > + rcu_segcblist_get_seglen(rsclp, RCU_NEXT_READY_TAIL) + > > > + rcu_segcblist_get_seglen(rsclp, RCU_NEXT_TAIL); > > > > This should be a "for" loop. Yes, the number and names of the segments > > hasn't changed for a good long time, but nothing like code as above to > > inspire Murphy to more mischief. :-/ > > > > Actually, why not put the summation in the existing "for" loop below? > > That would save a line of code in addition to providing less inspiration > > for Mr. Murphy. > > I can do that. Actually Frederic suggested the same thing but I was reluctant > as I felt it did not give much LOC benefit. Will revisit it. It reduces 1 line of code :) I changed it to the below, will update the patch: ---8<--- diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9b43d686b1f3..bff9b2253e50 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -101,7 +101,7 @@ static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long WRITE_ONCE(rsclp->seglen[seg], v); } -/* Return number of callbacks in a segment of the segmented callback list. */ +/* Increase the numeric length of a segment by a specified amount. 
*/ static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v) { WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v); @@ -406,13 +406,12 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_pend_cbs(rsclp)) return; /* Nothing to do. */ - rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_WAIT_TAIL) + - rcu_segcblist_get_seglen(rsclp, RCU_NEXT_READY_TAIL) + - rcu_segcblist_get_seglen(rsclp, RCU_NEXT_TAIL); + rclp->len = 0; *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) { + rclp->len += rcu_segcblist_get_seglen(rsclp, i); WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); rcu_segcblist_set_seglen(rsclp, i, 0); }
Re: [PATCH v8 17/18] scsi: megaraid_sas: Added support for shared host tagset for cpuhotplug
On Sat, 2020-11-07 at 00:55 +0530, Sumit Saxena wrote: > I am able to hit the boot hang and similar kind of stack traces as > reported by Qian with shared .config on x86 machine. > In my case the system boots after a hang of 40-45 mins. Qian, is it > true for you as well ? I don't know. I had never waited for that long.
Re: [RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()
On Fri, Nov 06, 2020 at 01:03:26PM -0700, Logan Gunthorpe wrote: > I don't think a function like that will work for the p2pmem use case. In > order to implement proper page freeing I expect I'll need a loop around > the allocator and vm_insert_mixed()... Something roughly like: > > for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { > vaddr = pci_alloc_p2pmem(pdev, PAGE_SIZE); > ret = vmf_insert_mixed(vma, addr, > __pfn_to_pfn_t(virt_to_pfn(vaddr), PFN_DEV | PFN_MAP)); > } > > That way we can call pci_free_p2pmem() when a page's ref count goes to > zero. I suspect your use case will need to do something similar. Yes, but I would say the pci_alloc_p2pmem() layer should be able to free pages on a page-by-page basis so you don't have to do stuff like the above. There is often a lot of value in having physical contiguous addresses, so allocating page by page as well seems poor. Jason
Re: [PATCH] mm/gup_benchmark: GUP_BENCHMARK depends on DEBUG_FS
On 11/4/20 2:05 AM, Barry Song wrote: Without DEBUG_FS, all the code in gup_benchmark becomes meaningless. For sure kernel provides debugfs stub while DEBUG_FS is disabled, but the point here is that GUP_BENCHMARK can do nothing without DEBUG_FS. Cc: John Hubbard Cc: Ralph Campbell Inspired-by: John Garry Signed-off-by: Barry Song --- * inspired by John's comment in this patch: https://lore.kernel.org/linux-iommu/184797b8-512e-e3da-fae7-25c7d6626...@huawei.com/ mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index d42423f..91fa923 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -836,6 +836,7 @@ config PERCPU_STATS config GUP_BENCHMARK bool "Enable infrastructure for get_user_pages() and related calls benchmarking" + depends on DEBUG_FS I think "select DEBUG_FS" is better here. "depends on" has the obnoxious behavior of hiding the choice from you, if the dependencies aren't already met. Whereas what the developer *really* wants is a no-nonsense activation of the choice: "enable GUP_BENCHMARK and the debug fs that it requires". So "depends on" really is better for things that you just can't control, such as the cpu arch you're on, etc. Also note that this will have some minor merge conflict with mmotm, due to renaming to GUP_TEST. No big deal though. thanks, -- John Hubbard NVIDIA
Re: [PATCH v4 06/17] PCI: add SIOV and IMS capability detection
On Fri, Nov 06, 2020 at 03:47:00PM -0800, Dan Williams wrote: > Also feel free to straighten me out (Jason or Ashok) if I've botched > the understanding of this. It is pretty simple when you get down to it. We have a new kernel API that Thomas added: pci_subdevice_msi_create_irq_domain() This creates an IRQ domain that hands out addr/data pairs that trigger interrupts. On bare metal the addr/data pairs from the IRQ domain are programmed into the HW in some HW specific way by the device driver that calls the above function. On (kvm) virtualization the addr/data pair the IRQ domain hands out doesn't work. It is some fake thing. To make this work on normal MSI/MSI-X the VMM implements emulation of the standard MSI/MSI-X programming and swaps the fake addr/data pair for a real one obtained from the hypervisor IRQ domain. To "deal" with this issue the SIOV spec suggests to add a per-device PCI Capability that says "IMS works". Which means either: - This is bare metal, so of course it works - The VMM is trapping and emulating whatever the device specific IMS programming is. The idea being that a VMM can never advertise the IMS cap flag to the guest unless the VMM provides a device specific driver that does device specific emulation to capture the addr/data pair. Remember IMS doesn't say how to program the addr/data pair! Every device is unique! On something like IDXD this emulation is not so hard, on something like mlx5 this is completely unworkable. Further we never do emulation on our devices, they always pass native hardware through, even for SIOV-like cases. In the end pci_subdevice_msi_create_irq_domain() is a platform function. Either it should work completely on every device with no device-specific emulation required in the VMM, or it should not work at all and return -EOPNOTSUPP.
The only sane way to implement this generically is for the VMM to provide a hypercall to obtain a real *working* addr/data pair(s) and then have the platform hand those out from pci_subdevice_msi_create_irq_domain(). All IMS device drivers will work correctly. No VMM device emulation is ever needed to translate addr/data pairs. Earlier in this thread Kevin said hyper-v is already working this way, even for MSI/MSI-X. To me this says it is fundamentally a KVM platform problem and it should not be solved by PCI capability flags. Jason
[PATCH v2] Make iwmmxt.S support Clang's integrated assembler
This patch replaces 6 IWMMXT instructions Clang's integrated assembler does not support in iwmmxt.S using macros, while making sure GNU assembler still emit the same instructions. This should be easier than providing full IWMMXT support in Clang. "Intel Wireless MMX Technology - Developer Guide - August, 2002" should be referenced for the encoding schemes of these extensions. Link: https://github.com/ClangBuiltLinux/linux/issues/975 Suggested-by: Nick Desaulniers Suggested-by: Ard Biesheuvel Signed-off-by: Jian Cai --- arch/arm/kernel/iwmmxt.S | 89 arch/arm/kernel/iwmmxt.h | 47 + 2 files changed, 92 insertions(+), 44 deletions(-) create mode 100644 arch/arm/kernel/iwmmxt.h diff --git a/arch/arm/kernel/iwmmxt.S b/arch/arm/kernel/iwmmxt.S index 0dcae787b004..d2b4ac06e4ed 100644 --- a/arch/arm/kernel/iwmmxt.S +++ b/arch/arm/kernel/iwmmxt.S @@ -16,6 +16,7 @@ #include #include #include +#include "iwmmxt.h" #if defined(CONFIG_CPU_PJ4) || defined(CONFIG_CPU_PJ4B) #define PJ4(code...) code @@ -113,33 +114,33 @@ concan_save: concan_dump: - wstrw wCSSF, [r1, #MMX_WCSSF] - wstrw wCASF, [r1, #MMX_WCASF] - wstrw wCGR0, [r1, #MMX_WCGR0] - wstrw wCGR1, [r1, #MMX_WCGR1] - wstrw wCGR2, [r1, #MMX_WCGR2] - wstrw wCGR3, [r1, #MMX_WCGR3] + wstrw wCSSF, r1, MMX_WCSSF + wstrw wCASF, r1, MMX_WCASF + wstrw wCGR0, r1, MMX_WCGR0 + wstrw wCGR1, r1, MMX_WCGR1 + wstrw wCGR2, r1, MMX_WCGR2 + wstrw wCGR3, r1, MMX_WCGR3 1: @ MUP? 
wRn tst r2, #0x2 beq 2f - wstrd wR0, [r1, #MMX_WR0] - wstrd wR1, [r1, #MMX_WR1] - wstrd wR2, [r1, #MMX_WR2] - wstrd wR3, [r1, #MMX_WR3] - wstrd wR4, [r1, #MMX_WR4] - wstrd wR5, [r1, #MMX_WR5] - wstrd wR6, [r1, #MMX_WR6] - wstrd wR7, [r1, #MMX_WR7] - wstrd wR8, [r1, #MMX_WR8] - wstrd wR9, [r1, #MMX_WR9] - wstrd wR10, [r1, #MMX_WR10] - wstrd wR11, [r1, #MMX_WR11] - wstrd wR12, [r1, #MMX_WR12] - wstrd wR13, [r1, #MMX_WR13] - wstrd wR14, [r1, #MMX_WR14] - wstrd wR15, [r1, #MMX_WR15] + wstrd wR0, r1, MMX_WR0 + wstrd wR1, r1, MMX_WR1 + wstrd wR2, r1, MMX_WR2 + wstrd wR3, r1, MMX_WR3 + wstrd wR4, r1, MMX_WR4 + wstrd wR5, r1, MMX_WR5 + wstrd wR6, r1, MMX_WR6 + wstrd wR7, r1, MMX_WR7 + wstrd wR8, r1, MMX_WR8 + wstrd wR9, r1, MMX_WR9 + wstrd wR10, r1, MMX_WR10 + wstrd wR11, r1, MMX_WR11 + wstrd wR12, r1, MMX_WR12 + wstrd wR13, r1, MMX_WR13 + wstrd wR14, r1, MMX_WR14 + wstrd wR15, r1, MMX_WR15 2: teq r0, #0 @ anything to load? reteq lr @ if not, return @@ -147,30 +148,30 @@ concan_dump: concan_load: @ Load wRn - wldrd wR0, [r0, #MMX_WR0] - wldrd wR1, [r0, #MMX_WR1] - wldrd wR2, [r0, #MMX_WR2] - wldrd wR3, [r0, #MMX_WR3] - wldrd wR4, [r0, #MMX_WR4] - wldrd wR5, [r0, #MMX_WR5] - wldrd wR6, [r0, #MMX_WR6] - wldrd wR7, [r0, #MMX_WR7] - wldrd wR8, [r0, #MMX_WR8] - wldrd wR9, [r0, #MMX_WR9] - wldrd wR10, [r0, #MMX_WR10] - wldrd wR11, [r0, #MMX_WR11] - wldrd wR12, [r0, #MMX_WR12] - wldrd wR13, [r0, #MMX_WR13] - wldrd wR14, [r0, #MMX_WR14] - wldrd wR15, [r0, #MMX_WR15] + wldrd wR0, r0, MMX_WR0 + wldrd wR1, r0, MMX_WR1 + wldrd wR2, r0, MMX_WR2 + wldrd wR3, r0, MMX_WR3 + wldrd wR4, r0, MMX_WR4 + wldrd wR5, r0, MMX_WR5 + wldrd wR6, r0, MMX_WR6 + wldrd wR7, r0, MMX_WR7 + wldrd wR8, r0, MMX_WR8 + wldrd wR9, r0, MMX_WR9 + wldrd wR10, r0, MMX_WR10 + wldrd wR11, r0, MMX_WR11 + wldrd wR12, r0, MMX_WR12 + wldrd wR13, r0, MMX_WR13 + wldrd wR14, r0, MMX_WR14 + wldrd wR15, r0, MMX_WR15 @ Load wCx - wldrw wCSSF, [r0, #MMX_WCSSF] - wldrw wCASF, [r0, #MMX_WCASF] - wldrw wCGR0, [r0, #MMX_WCGR0] - 
wldrw wCGR1, [r0, #MMX_WCGR1] - wldrw wCGR2, [r0, #MMX_WCGR2] - wldrw wCGR3, [r0, #MMX_WCGR3] + wldrw wCSSF, r0, MMX_WCSSF + wldrw wCASF, r0, MMX_WCASF + wldrw wCGR0, r0, MMX_WCGR0 + wldrw wCGR1, r0, MMX_WCGR1 + wldrw wCGR2, r0, MMX_WCGR2 + wldrw wCGR3, r0, MMX_WCGR3 @ clear CUP/MUP (only if r1 != 0) teq r1, #0 diff --git a/arch/arm/kernel/iwmmxt.h b/arch/arm/kernel/iwmmxt.h new file mode 100644 index ..fb627286f5bb --- /dev/null +++ b/arch/arm/kernel/iwmmxt.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __IWMMXT_H__ +#define __IWMMXT_H__ + +.irp b,
Re: [PATCH v9 4/7] rcu/trace: Add tracing for how segcb list changes
On Wed, Nov 04, 2020 at 03:33:14PM +0100, Frederic Weisbecker wrote: > On Wed, Nov 04, 2020 at 06:08:07AM -0800, Paul E. McKenney wrote: > > On Tue, Nov 03, 2020 at 04:17:31PM +0100, Frederic Weisbecker wrote: > > > On Tue, Nov 03, 2020 at 09:26:00AM -0500, Joel Fernandes (Google) wrote: > > > > +/* > > > > + * Return how many CBs each segment along with their gp_seq values. > > > > + * > > > > + * This function is O(N) where N is the number of segments. Only used > > > > from > > > > + * tracing code which is usually disabled in production. > > > > + */ > > > > +#ifdef CONFIG_RCU_TRACE > > > > +static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp, > > > > +int cbcount[RCU_CBLIST_NSEGS], > > > > +unsigned long gpseq[RCU_CBLIST_NSEGS]) > > > > +{ > > > > + int i; > > > > + > > > > + for (i = 0; i < RCU_CBLIST_NSEGS; i++) { > > > > + cbcount[i] = rcu_segcblist_get_seglen(rsclp, i); > > > > + gpseq[i] = rsclp->gp_seq[i]; > > > > + } > > > > +} > > > > + > > > > +void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char > > > > *context) > > > > +{ > > > > + int cbs[RCU_CBLIST_NSEGS]; > > > > + unsigned long gps[RCU_CBLIST_NSEGS]; > > > > + > > > > + if (!trace_rcu_segcb_stats_enabled()) > > > > + return; > > > > > > Yes, very good! > > > > > > Paul just told me that RCU_TRACE can be used in production so that > > > confirms that we > > > wanted to avoid this loop of 8 iterations when tracing is disabled. > > > > RCU's "don't try this in production" Kconfig option is PROVE_RCU. > > > > I would be looking for checks that the sum of the segment lengths > > match the overall ->len or checks that all of the segment lengths > > are zero when ->cblist is empty to be guarded by something like > > IS_ENABLED(CONFIG_PROVE_RCU). Of course, checks of this sort need to > > be confined to those portions of rcu_do_batch() that are excluding other > > accesses to ->cblist. > > Right. 
> > > > > But if rcu_segcblist_countseq() is invoked only when a specific trace > > event is enabled, it should be OK to have it guarded only by RCU_TRACE. > > Indeed, so I think we are good. Thanks, so the only changes are to patches 2/7 and 4/7 which I will work on. 1/7 was already taken by Paul. For 7/7, it sounds like I did not understand Paul's reworks on the comments and we're still discussing it; but some comments are better than none, so I am Ok with Pauls version of it. thanks, - Joel
Re: [PATCH v9 2/7] rcu/segcblist: Add counters to segcblist datastructure
On Wed, Nov 04, 2020 at 09:01:33AM -0800, Paul E. McKenney wrote: > A casual reader might be forgiven for being confused by the combination > of "Return" in the above comment and the "void" function type below. > So shouldn't this comment be something like "Add the specified number > of callbacks to the specified segment..."? You are right, sorry and will fix it. > > @@ -330,11 +342,16 @@ void rcu_segcblist_extract_pend_cbs(struct > > rcu_segcblist *rsclp, > > > > if (!rcu_segcblist_pend_cbs(rsclp)) > > return; /* Nothing to do. */ > > + rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_WAIT_TAIL) + > > + rcu_segcblist_get_seglen(rsclp, RCU_NEXT_READY_TAIL) + > > + rcu_segcblist_get_seglen(rsclp, RCU_NEXT_TAIL); > > This should be a "for" loop. Yes, the number and names of the segments > hasn't changed for a good long time, but nothing like code as above to > inspire Murphy to more mischief. :-/ > > Actually, why not put the summation in the existing "for" loop below? > That would save a line of code in addition to providing less inspiration > for Mr. Murphy. I can do that. Actually Frederic suggested the same thing but I was reluctant as I felt it did not give much LOC benefit. Will revisit it. > > > *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; > > rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; > > WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); > > - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) > > + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) { > > WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); > > + rcu_segcblist_set_seglen(rsclp, i, 0); > > + } > > } > > > > /* > > @@ -345,7 +362,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist > > *rsclp, > > struct rcu_cblist *rclp) > > { > > rcu_segcblist_add_len(rsclp, rclp->len); > > - rclp->len = 0; > > You audited the callers, correct? Yep. thanks, - Joel
Re: [PATCH v5 6/7] IMA: add critical_data to the built-in policy rules
On 11/6/20 7:37 AM, Lakshmi Ramasubramanian wrote: Hi Mimi, Hi Lakshmi, Tushar, This patch defines a new critical_data builtin policy. Please update the Subject line. On Sun, 2020-11-01 at 14:26 -0800, Tushar Sugandhi wrote: From: Lakshmi Ramasubramanian The IMA hook to measure kernel critical data, namely ima_measure_critical_data(), could be called before a custom IMA policy is loaded. For example, SELinux calls ima_measure_critical_data() to measure its state and policy when they are initialized. This occurs before a custom IMA policy is loaded, and hence IMA hook will not measure the data. A built-in policy is therefore needed to measure critical data provided by callers before a custom IMA policy is loaded. ^Define a new critical data builtin policy to allow measuring early kernel integrity critical data before a custom IMA policy is loaded. I will add the above line in the patch description. Either remove the references to SELinux or move this patch after the subsequent patch which measures SELinux critical data. I will remove the reference to SELinux. I think it would be better to have this patch before the SELinux measurement patch. Add CRITICAL_DATA to built-in IMA rules if the kernel command line contains "ima_policy=critical_data". Set the IMA template for this rule to "ima-buf" since ima_measure_critical_data() measures a buffer. 
Signed-off-by: Lakshmi Ramasubramanian --- security/integrity/ima/ima_policy.c | 32 + 1 file changed, 32 insertions(+) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index ec99e0bb6c6f..dc8fe969d3fe 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -875,6 +884,29 @@ void __init ima_init_policy(void) ARRAY_SIZE(default_appraise_rules), IMA_DEFAULT_POLICY); + if (ima_use_critical_data) { + template = lookup_template_desc("ima-buf"); + if (!template) { + ret = -EINVAL; + goto out; + } + + ret = template_desc_init_fields(template->fmt, + &(template->fields), + &(template->num_fields)); The default IMA template when measuring buffer data is "ima_buf". Is there a reason for allocating and initializing it here and not deferring it until process_buffer_measurement()? You are right - good catch. I will remove the above and validate. process_buffer_measurement() allocates and initializes "ima-buf" template only when the parameter "func" is NONE. Currently, only ima_check_blacklist() passes NONE for func when calling process_buffer_measurement(). If "func" is anything other than NONE, ima_match_policy() picks the default IMA template if the IMA policy rule does not specify a template. We need to add "ima-buf" in the built-in policy for critical_data so that the default template is not used for buffer measurement. Please let me know if I am missing something. thanks, -lakshmi + if (ret) + goto out; + + critical_data_rules[0].template = template; + add_rules(critical_data_rules, + ARRAY_SIZE(critical_data_rules), + IMA_DEFAULT_POLICY); + } + +out: + if (ret) + pr_err("%s failed, result: %d\n", __func__, ret); + ima_update_policy_flag(); }
Re: [GIT PULL] Kselftest fixes update for Linux 5.10-rc3
The pull request you sent on Fri, 6 Nov 2020 11:16:07 -0700: > git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest > tags/linux-kselftest-fixes-5.10-rc3 has been merged into torvalds/linux.git: https://git.kernel.org/torvalds/c/03f0f5ad58479ba1374f10680fc836aa21abe8f9 Thank you! -- Deet-doot-dot, I am a bot. https://korg.docs.kernel.org/prtracker.html
[PATCH] include/linux/huge_mm.h: remove extern keyword
The external function definitions don't need the "extern" keyword. Remove them so future changes don't copy the function definition style. Signed-off-by: Ralph Campbell --- This applies cleanly to linux-mm 5.10.0-rc2 and is for Andrew's tree. include/linux/huge_mm.h | 93 ++--- 1 file changed, 41 insertions(+), 52 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0365aa97f8e7..6a19f35f836b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -7,43 +7,37 @@ #include /* only for vma_is_dax() */ -extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); -extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, -pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, -struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); -extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, -pud_t *dst_pud, pud_t *src_pud, unsigned long addr, -struct vm_area_struct *vma); +vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); +int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma); +void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); +int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); +void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif -extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); -extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, - unsigned long addr, - pmd_t *pmd, - unsigned int flags); -extern bool madvise_free_huge_pmd(struct mmu_gather 
*tlb, - struct vm_area_struct *vma, - pmd_t *pmd, unsigned long addr, unsigned long next); -extern int zap_huge_pmd(struct mmu_gather *tlb, - struct vm_area_struct *vma, - pmd_t *pmd, unsigned long addr); -extern int zap_huge_pud(struct mmu_gather *tlb, - struct vm_area_struct *vma, - pud_t *pud, unsigned long addr); -extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, -unsigned long new_addr, -pmd_t *old_pmd, pmd_t *new_pmd); -extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, - unsigned long cp_flags); +vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmd, + unsigned int flags); +bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next); +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, +unsigned long addr); +int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, +unsigned long addr); +bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + pgprot_t newprot, unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); @@ -100,13 +94,13 @@ enum transparent_hugepage_flag { struct kobject; struct kobj_attribute; -extern ssize_t single_hugepage_flag_store(struct kobject *kobj, -struct kobj_attribute *attr, -const char *buf, size_t count, -enum transparent_hugepage_flag flag); -extern ssize_t single_hugepage_flag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf, - enum transparent_hugepage_flag flag); +ssize_t single_hugepage_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, 
size_t count, +
Re: [GIT PULL] Ceph fix for 5.10-rc3
The pull request you sent on Fri, 6 Nov 2020 20:27:50 +0100: > https://github.com/ceph/ceph-client.git tags/ceph-for-5.10-rc3 has been merged into torvalds/linux.git: https://git.kernel.org/torvalds/c/659caaf65dc9c7150aa3e80225ec6e66b25ab3ce Thank you! -- Deet-doot-dot, I am a bot. https://korg.docs.kernel.org/prtracker.html
Re: [PATCH v4 06/17] PCI: add SIOV and IMS capability detection
On Fri, Nov 6, 2020 at 9:51 AM Jason Gunthorpe wrote: [..] > > This is true for IMS as well. But probably not implemented in the kernel as > > such. From a HW point of view (take idxd for instance) the facility is > > available to native OS as well. The early RFC supported this for native. > > I can't follow what you are trying to say here. I'm having a hard time following the technical cruxes of this debate. I grokked your feedback on the original IMS proposal way back at the beginning of this effort (pre-COVID even!), so maybe I can mediate here as well. Although, SIOV is that much harder for me to spell than IMS, so bear with me. > Dave said the IMS cap was to indicate that the VMM supported emulation > of IMS so that the VMM can do the MSI addr/data translation as part of > the emulation. > > I'm saying emulation will be too horrible for our devices that don't > require *any* emulation. This part I think I understand, i.e. why spend any logic emulating IMS as MSI since the IMS capability can be a paravirtualized interface from guest to VMM with none of the compromises that MSI would enforce. Did I get that right? > It is a bad architecture. The platform needs to handle this globally > for all devices, not special hacky emulations things custom made for > every device out there. I confess I don't quite understand the shape of what "platform needs to handle this globally" means, but I understand the desired end result of "no emulation added where not needed". However, would this mean that the bare-metal idxd driver can not be used directly in the guest without modification? For example, as I understand from talking to Ashok, idxd has some device events like error notification hard wired to MSI while data path interrupts are IMS. So even if the IMS side does not hook up MSI emulation doesn't idxd still need MSI emulation to reuse the bare metal driver directly? > > Native devices can have both MSIx and IMS capability. 
But as I > > understand this isn't how we have partitioned things in SW today. We > > left IMS only for mdev's. And I agree this would be very useful. > > That split is just some decision idxd did, we are thinking about doing > other things in our devices. Where does the collision happen between what you need for a clean implementation of an IMS-like capability (/me misses his "dev-msi" name that got thrown out in the Thomas rewrite), and emulation needed to not have VF special casing in the idxd driver. Also feel free to straighten me out (Jason or Ashok) if I've botched the understanding of this.
Re: [GIT PULL] SCSI fixes for 5.10-rc2
The pull request you sent on Fri, 06 Nov 2020 14:26:05 -0800: > git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git scsi-fixes has been merged into torvalds/linux.git: https://git.kernel.org/torvalds/c/d4fc96832f0131c8f2fb067fb01c3007df6d4c9f Thank you! -- Deet-doot-dot, I am a bot. https://korg.docs.kernel.org/prtracker.html
Re: [PATCH 0/8] simplify ep_poll
On Fri, Nov 6, 2020 at 3:17 PM Soheil Hassas Yeganeh wrote: > > The first patch in the series is a fix for the epoll race in > presence of timeouts, so that it can be cleanly backported to all > affected stable kernels. > > The rest of the patch series simplify the ep_poll() implementation. > Some of these simplifications result in minor performance enhancements > as well. We have kept these changes under self tests and internal > benchmarks for a few days, and there are minor (1-2%) performance > enhancements as a result. From just looking at the patches (not the end result - I didn't actually apply them), it looks sane to me. Linus
[PATCH 2/2] drm/nouveau/kms/nv50-: Fix clock checking algorithm in nv50_dp_mode_valid()
While I thought I had this correct (since it actually did reject modes like I expected during testing), Ville Syrjala from Intel pointed out that the logic here isn't correct. max_clock refers to the max data rate supported by the DP encoder. So, limiting it to the output of ds_clock (which refers to the maximum dotclock of the downstream DP device) doesn't make any sense. Additionally, since we're using the connector's bpc as the canonical BPC we should use this in mode_valid until we support dynamically setting the bpp based on bandwidth constraints. https://lists.freedesktop.org/archives/dri-devel/2020-September/280276.html For more info. So, let's rewrite this using Ville's advice. Changes made for stable backport: * 5.9 didn't use drm_dp_downstream_max_dotclock() yet, so remove that (the fix is still important regardless) v2: * Ville pointed out I mixed up the dotclock and the link rate. So fix that... * ...and also rename all the variables in this function to be more appropriately labeled so I stop mixing them up. * Reuse the bpp from the connector for now until we have dynamic bpp selection. 
* Use use DIV_ROUND_UP for calculating the mode rate like i915 does, which we should also have been doing from the start Signed-off-by: Lyude Paul Fixes: 409d38139b42 ("drm/nouveau/kms/nv50-: Use downstream DP clock limits for mode validation") Cc: Ville Syrjälä Cc: Lyude Paul Cc: Ben Skeggs Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_dp.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dp.c b/drivers/gpu/drm/nouveau/nouveau_dp.c index 40683e1244c3f..9c06d1cc43905 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dp.c +++ b/drivers/gpu/drm/nouveau/nouveau_dp.c @@ -114,7 +114,8 @@ nv50_dp_mode_valid(struct drm_connector *connector, unsigned *out_clock) { const unsigned min_clock = 25000; - unsigned max_clock, clock = mode->clock; + unsigned int max_rate, mode_rate, clock = mode->clock; + const u8 bpp = connector->display_info.bpc * 3; if (mode->flags & DRM_MODE_FLAG_INTERLACE && !outp->caps.dp_interlace) return MODE_NO_INTERLACE; @@ -122,12 +123,13 @@ nv50_dp_mode_valid(struct drm_connector *connector, if ((mode->flags & DRM_MODE_FLAG_3D_MASK) == DRM_MODE_FLAG_3D_FRAME_PACKING) clock *= 2; - max_clock = outp->dp.link_nr * outp->dp.link_bw; - clock = mode->clock * (connector->display_info.bpc * 3) / 10; + max_rate = outp->dp.link_nr * outp->dp.link_bw; + mode_rate = DIV_ROUND_UP(clock * bpp, 8); + if (mode_rate > max_rate) + return MODE_CLOCK_HIGH; + if (clock < min_clock) return MODE_CLOCK_LOW; - if (clock > max_clock) - return MODE_CLOCK_HIGH; if (out_clock) *out_clock = clock; -- 2.28.0
[PATCH V3 00/10] PKS: Add Protection Keys Supervisor (PKS) support V3
From: Ira Weiny Changes from V2 [4] Rebased on tip-tree/core/entry From Thomas Gleixner Address bisectability Drop Patch: x86/entry: Move nmi entry/exit into common code From Greg KH Remove WARN_ON's From Dan Williams Add __must_check to pks_key_alloc() New patch: x86/pks: Add PKS defines and config options Split from Enable patch to build on through the series Fix compile errors Changes from V1 Rebase to TIP master; resolve conflicts and test Clean up some kernel docs updates missed in V1 Add irqentry_state_t kernel doc for PKRS field Removed redundant irq_state->pkrs This is only needed when we add the global state and somehow ended up in this patch series. That will come back when we add the global functionality in. From Thomas Gleixner Update commit messages Add kernel doc for struct irqentry_state_t From Dave Hansen add flags to pks_key_alloc() Changes from RFC V3[3] Rebase to TIP master Update test error output Standardize on 'irq_state' for state variables From Dave Hansen Update commit messages Add/clean up comments Add X86_FEATURE_PKS to disabled-features.h and remove some explicit CONFIG checks Move saved_pkrs member of thread_struct Remove superfluous preempt_disable() s/irq_save_pks/irq_save_set_pks/ Ensure PKRS is not seen in faults if not configured or not supported s/pks_mknoaccess/pks_mk_noaccess/ s/pks_mkread/pks_mk_readonly/ s/pks_mkrdwr/pks_mk_readwrite/ Change pks_key_alloc return to -EOPNOTSUPP when not supported From Peter Zijlstra Clean up Attribution Remove superfluous preempt_disable() Add union to differentiate exit_rcu/lockdep use in irqentry_state_t From Thomas Gleixner Add preliminary clean up patch and adjust series as needed Introduce a new page protection mechanism for supervisor pages, Protection Key Supervisor (PKS). 2 use cases for PKS are being developed, trusted keys and PMEM. Trusted keys is a newer use case which is still being explored. PMEM was submitted as part of the RFC (v2) series[1]. 
However, since then it was found that some callers of kmap() require a global implementation of PKS. Specifically some users of kmap() expect mappings to be available to all kernel threads. While global use of PKS is rare it needs to be included for correctness. Unfortunately the kmap() updates required a large patch series to make the needed changes at the various kmap() call sites so that patch set has been split out. Because the global PKS feature is only required for that use case it will be deferred to that set as well.[2] This patch set is being submitted as a precursor to both of the use cases. For an overview of the entire PKS ecosystem, a git tree including this series and 2 proposed use cases can be found here: https://lore.kernel.org/lkml/20201009195033.3208459-1-ira.we...@intel.com/ https://lore.kernel.org/lkml/20201009201410.3209180-1-ira.we...@intel.com/ PKS enables protections on 'domains' of supervisor pages to limit supervisor mode access to those pages beyond the normal paging protections. PKS works in a similar fashion to user space pkeys, PKU. As with PKU, supervisor pkeys are checked in addition to normal paging protections and Access or Writes can be disabled via a MSR update without TLB flushes when permissions change. Also like PKU, a page mapping is assigned to a domain by setting pkey bits in the page table entry for that mapping. Access is controlled through a PKRS register which is updated via WRMSR/RDMSR. XSAVE is not supported for the PKRS MSR. Therefore the implementation saves/restores the MSR across context switches and during exceptions. Nested exceptions are supported by each exception getting a new PKS state. For consistent behavior with current paging protections, pkey 0 is reserved and configured to allow full access via the pkey mechanism, thus preserving the default paging protections on mappings with the default pkey value of 0. 
Other keys, (1-15) are allocated by an allocator which prepares us for key contention from day one. Kernel users should be prepared for the allocator to fail either because of key exhaustion or due to PKS not being supported on the arch and/or CPU instance. The following are key attributes of PKS. 1) Fast switching of permissions 1a) Prevents access without page table manipulations 1b) No TLB flushes required 2) Works
[PATCH 1/2] drm/nouveau/kms/nv50-: Get rid of bogus nouveau_conn_mode_valid()
Ville also pointed out that I got a lot of the logic here wrong as well, whoops. While I don't think anyone's likely using 3D output with nouveau, the next patch will make nouveau_conn_mode_valid() make a lot less sense. So, let's just get rid of it and open-code it like before, while taking care to move the 3D frame packing calculations on the dot clock into the right place. Signed-off-by: Lyude Paul Fixes: d6a9efece724 ("drm/nouveau/kms/nv50-: Share DP SST mode_valid() handling with MST") Cc: Ville Syrjälä Cc: # v5.8+ Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_connector.c | 36 ++--- drivers/gpu/drm/nouveau/nouveau_dp.c| 15 ++--- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c b/drivers/gpu/drm/nouveau/nouveau_connector.c index 7674025a4bfe8..1d91d52ee5083 100644 --- a/drivers/gpu/drm/nouveau/nouveau_connector.c +++ b/drivers/gpu/drm/nouveau/nouveau_connector.c @@ -1035,29 +1035,6 @@ get_tmds_link_bandwidth(struct drm_connector *connector) return 112000 * duallink_scale; } -enum drm_mode_status -nouveau_conn_mode_clock_valid(const struct drm_display_mode *mode, - const unsigned min_clock, - const unsigned max_clock, - unsigned int *clock_out) -{ - unsigned int clock = mode->clock; - - if ((mode->flags & DRM_MODE_FLAG_3D_MASK) == - DRM_MODE_FLAG_3D_FRAME_PACKING) - clock *= 2; - - if (clock < min_clock) - return MODE_CLOCK_LOW; - if (clock > max_clock) - return MODE_CLOCK_HIGH; - - if (clock_out) - *clock_out = clock; - - return MODE_OK; -} - static enum drm_mode_status nouveau_connector_mode_valid(struct drm_connector *connector, struct drm_display_mode *mode) @@ -1065,7 +1042,7 @@ nouveau_connector_mode_valid(struct drm_connector *connector, struct nouveau_connector *nv_connector = nouveau_connector(connector); struct nouveau_encoder *nv_encoder = nv_connector->detected_encoder; struct drm_encoder *encoder = to_drm_encoder(nv_encoder); - unsigned min_clock = 25000, max_clock = 
min_clock; + unsigned int min_clock = 25000, max_clock = min_clock, clock = mode->clock; switch (nv_encoder->dcb->type) { case DCB_OUTPUT_LVDS: @@ -1094,8 +1071,15 @@ nouveau_connector_mode_valid(struct drm_connector *connector, return MODE_BAD; } - return nouveau_conn_mode_clock_valid(mode, min_clock, max_clock, -NULL); + if ((mode->flags & DRM_MODE_FLAG_3D_MASK) == DRM_MODE_FLAG_3D_FRAME_PACKING) + clock *= 2; + + if (clock < min_clock) + return MODE_CLOCK_LOW; + if (clock > max_clock) + return MODE_CLOCK_HIGH; + + return MODE_OK; } static struct drm_encoder * diff --git a/drivers/gpu/drm/nouveau/nouveau_dp.c b/drivers/gpu/drm/nouveau/nouveau_dp.c index 8a0f7994e1aeb..40683e1244c3f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dp.c +++ b/drivers/gpu/drm/nouveau/nouveau_dp.c @@ -114,18 +114,23 @@ nv50_dp_mode_valid(struct drm_connector *connector, unsigned *out_clock) { const unsigned min_clock = 25000; - unsigned max_clock, clock; - enum drm_mode_status ret; + unsigned max_clock, clock = mode->clock; if (mode->flags & DRM_MODE_FLAG_INTERLACE && !outp->caps.dp_interlace) return MODE_NO_INTERLACE; + if ((mode->flags & DRM_MODE_FLAG_3D_MASK) == DRM_MODE_FLAG_3D_FRAME_PACKING) + clock *= 2; + max_clock = outp->dp.link_nr * outp->dp.link_bw; clock = mode->clock * (connector->display_info.bpc * 3) / 10; + if (clock < min_clock) + return MODE_CLOCK_LOW; + if (clock > max_clock) + return MODE_CLOCK_HIGH; - ret = nouveau_conn_mode_clock_valid(mode, min_clock, max_clock, - &clock); if (out_clock) *out_clock = clock; - return ret; + + return MODE_OK; } -- 2.28.0
Re: [PATCH 2/3] vfio/virqfd: Drain events from eventfd in virqfd_wakeup()
On Tue, 27 Oct 2020 13:55:22 + David Woodhouse wrote: > From: David Woodhouse > > Don't allow the events to accumulate in the eventfd counter, drain them > as they are handled. > > Signed-off-by: David Woodhouse > --- Acked-by: Alex Williamson Paolo, I assume you'll add this to your queue. Thanks, Alex > drivers/vfio/virqfd.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c > index 997cb5d0a657..414e98d82b02 100644 > --- a/drivers/vfio/virqfd.c > +++ b/drivers/vfio/virqfd.c > @@ -46,6 +46,9 @@ static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned > mode, int sync, void > __poll_t flags = key_to_poll(key); > > if (flags & EPOLLIN) { > + u64 cnt; > + eventfd_ctx_do_read(virqfd->eventfd, &cnt); > + > /* An event has been signaled, call function */ > if ((!virqfd->handler || >virqfd->handler(virqfd->opaque, virqfd->data)) &&
[PATCH V3 06/10] x86/entry: Preserve PKRS MSR across exceptions
From: Ira Weiny The PKRS MSR is not managed by XSAVE. It is preserved through a context switch but this support leaves exception handling code open to memory accesses during exceptions. 2 possible places for preserving this state were considered, irqentry_state_t or pt_regs.[1] pt_regs was much more complicated and was potentially fraught with unintended consequences.[2] irqentry_state_t was already an object being used in the exception handling and is straightforward. It is also easy for any number of nested states to be tracked and eventually can be enhanced to store the reference counting required to support PKS through kmap reentry Preserve the current task's PKRS values in irqentry_state_t on exception entry and restoring them on exception exit. Each nested exception is further saved allowing for any number of levels of exception handling. Peter and Thomas both suggested parts of the patch, IDT and NMI respectively. [1] https://lore.kernel.org/lkml/calcetrve1i5jdyzd_bcctxqjn+ze3t38efpgjxn1f577m36...@mail.gmail.com/ [2] https://lore.kernel.org/lkml/874kpxx4jf@nanos.tec.linutronix.de/#t Cc: Dave Hansen Cc: Andy Lutomirski Suggested-by: Peter Zijlstra Suggested-by: Thomas Gleixner Signed-off-by: Ira Weiny --- Changes from V1 remove redundant irq_state->pkrs This value is only needed for the global tracking. So it should be included in that patch and not in this one. Changes from RFC V3 Standardize on 'irq_state' variable name Per Dave Hansen irq_save_pkrs() -> irq_save_set_pkrs() Rebased based on clean up patch by Thomas Gleixner This includes moving irq_[save_set|restore]_pkrs() to the core as well. 
--- arch/x86/entry/common.c | 38 + arch/x86/include/asm/pkeys_common.h | 5 ++-- arch/x86/mm/pkeys.c | 2 +- include/linux/entry-common.h| 13 ++ kernel/entry/common.c | 14 +-- 5 files changed, 67 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 87dea56a15d2..1b6a419a6fac 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_XEN_PV #include @@ -209,6 +210,41 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } +#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS +/* + * PKRS is a per-logical-processor MSR which overlays additional protection for + * pages which have been mapped with a protection key. + * + * The register is not maintained with XSAVE so we have to maintain the MSR + * value in software during context switch and exception handling. + * + * Context switches save the MSR in the task struct thus taking that value to + * other processors if necessary. + * + * To protect against exceptions having access to this memory we save the + * current running value and set the PKRS value for the duration of the + * exception. Thus preventing exception handlers from having the elevated + * access of the interrupted task. 
+ */ +noinstr void irq_save_set_pkrs(irqentry_state_t *irq_state, u32 val) +{ + if (!cpu_feature_enabled(X86_FEATURE_PKS)) + return; + + irq_state->thread_pkrs = current->thread.saved_pkrs; + write_pkrs(INIT_PKRS_VALUE); +} + +noinstr void irq_restore_pkrs(irqentry_state_t *irq_state) +{ + if (!cpu_feature_enabled(X86_FEATURE_PKS)) + return; + + write_pkrs(irq_state->thread_pkrs); + current->thread.saved_pkrs = irq_state->thread_pkrs; +} +#endif /* CONFIG_ARCH_HAS_SUPERVISOR_PKEYS */ + #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* @@ -272,6 +308,8 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) inhcall = get_and_clear_inhcall(); if (inhcall && !WARN_ON_ONCE(irq_state.exit_rcu)) { + /* Normally called by irqentry_exit, we must restore pkrs here */ + irq_restore_pkrs(&irq_state); instrumentation_begin(); irqentry_exit_cond_resched(); instrumentation_end(); diff --git a/arch/x86/include/asm/pkeys_common.h b/arch/x86/include/asm/pkeys_common.h index 801a75615209..11a95e6efd2d 100644 --- a/arch/x86/include/asm/pkeys_common.h +++ b/arch/x86/include/asm/pkeys_common.h @@ -27,9 +27,10 @@ PKR_AD_KEY(13) | PKR_AD_KEY(14) | PKR_AD_KEY(15)) #ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS -void write_pkrs(u32 new_pkrs); +DECLARE_PER_CPU(u32, pkrs_cache); +noinstr void write_pkrs(u32 new_pkrs); #else -static inline void write_pkrs(u32 new_pkrs) { } +static __always_inline void write_pkrs(u32 new_pkrs) { } #endif #endif /*_ASM_X86_PKEYS_INTERNAL_H */ diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 76a62419c446..6892d4524868 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -248,7 +248,7 @@ DEFINE_PER_
[PATCH V3 09/10] x86/pks: Enable Protection Keys Supervisor (PKS)
From: Fenghua Yu Protection Keys for Supervisor pages (PKS) enables fast, hardware thread specific, manipulation of permission restrictions on supervisor page mappings. It uses the same mechanism of Protection Keys as those on User mappings but applies that mechanism to supervisor mappings using a supervisor specific MSR. Kernel users can thus defines 'domains' of page mappings which have an extra level of protection beyond those specified in the supervisor page table entries. Enable PKS on supported CPUS. Co-developed-by: Ira Weiny Signed-off-by: Ira Weiny Signed-off-by: Fenghua Yu --- Changes from V2 From Thomas: Make this patch last so PKS is not enabled until all the PKS mechanisms are in place. Specifically: 1) Modify setup_pks() to call write_pkrs() to properly set up the initial value when enabled. 2) Split this patch into two. 1) a precursor patch with the required defines/config options and 2) this patch which actually enables feature on CPUs which support it. Changes since RFC V3 Per Dave Hansen Update comment Add X86_FEATURE_PKS to disabled-features.h Rebase based on latest TIP tree --- arch/x86/include/asm/disabled-features.h | 6 +- arch/x86/kernel/cpu/common.c | 15 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 164587177152..82540f0c5b6c 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -44,7 +44,11 @@ # define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ -#define DISABLE_PKS (1<<(X86_FEATURE_PKS & 31)) +#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS +# define DISABLE_PKS 0 +#else +# define DISABLE_PKS (1<<(X86_FEATURE_PKS & 31)) +#endif #ifdef CONFIG_X86_5LEVEL # define DISABLE_LA57 0 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 35ad8480c464..f8929a557d72 100644 --- a/arch/x86/kernel/cpu/common.c +++ 
b/arch/x86/kernel/cpu/common.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "cpu.h" @@ -1494,6 +1495,19 @@ static void validate_apic_and_package_id(struct cpuinfo_x86 *c) #endif } +/* + * PKS is independent of PKU and either or both may be supported on a CPU. + * Configure PKS if the CPU supports the feature. + */ +static void setup_pks(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_PKS)) + return; + + write_pkrs(INIT_PKRS_VALUE); + cr4_set_bits(X86_CR4_PKS); +} + /* * This does the hard work of actually picking apart the CPU stuff... */ @@ -1591,6 +1605,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) x86_init_rdrand(c); setup_pku(c); + setup_pks(); /* * Clear/Set all flags overridden by options, need do it -- 2.28.0.rc0.12.gb6a658bd00c9
[PATCH V3 08/10] x86/pks: Add PKS kernel API
From: Fenghua Yu PKS allows kernel users to define domains of page mappings which have additional protections beyond the paging protections. Add an API to allocate, use, and free a protection key which identifies such a domain. Export 5 new symbols pks_key_alloc(), pks_mknoaccess(), pks_mkread(), pks_mkrdwr(), and pks_key_free(). Add 2 new macros; PAGE_KERNEL_PKEY(key) and _PAGE_PKEY(pkey). Update the protection key documentation to cover pkeys on supervisor pages. Co-developed-by: Ira Weiny Signed-off-by: Ira Weiny Signed-off-by: Fenghua Yu --- Changes from V2 From Greg KH Replace all WARN_ON_ONCE() uses with pr_err() From Dan Williams Add __must_check to pks_key_alloc() to help ensure users are using the API correctly Changes from V1 Per Dave Hansen Add flags to pks_key_alloc() to help future proof the interface if/when the key space is exhausted. Changes from RFC V3 Per Dave Hansen Put WARN_ON_ONCE in pks_key_free() s/pks_mknoaccess/pks_mk_noaccess/ s/pks_mkread/pks_mk_readonly/ s/pks_mkrdwr/pks_mk_readwrite/ Change return pks_key_alloc() to EOPNOTSUPP when not supported or configured Per Peter Zijlstra Remove unneeded preempt disable/enable --- Documentation/core-api/protection-keys.rst | 102 +--- arch/x86/include/asm/pgtable_types.h | 12 ++ arch/x86/include/asm/pkeys.h | 11 ++ arch/x86/include/asm/pkeys_common.h| 4 + arch/x86/mm/pkeys.c| 128 + include/linux/pgtable.h| 4 + include/linux/pkeys.h | 24 7 files changed, 267 insertions(+), 18 deletions(-) diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst index ec575e72d0b2..c4e6c480562f 100644 --- a/Documentation/core-api/protection-keys.rst +++ b/Documentation/core-api/protection-keys.rst @@ -4,25 +4,33 @@ Memory Protection Keys == -Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature -which is found on Intel's Skylake (and later) "Scalable Processor" -Server CPUs. It will be available in future non-server Intel parts -and future AMD processors. 
- -For anyone wishing to test or use this feature, it is available in -Amazon's EC2 C5 instances and is known to work there using an Ubuntu -17.04 image. - Memory Protection Keys provides a mechanism for enforcing page-based protections, but without requiring modification of the page tables -when an application changes protection domains. It works by -dedicating 4 previously ignored bits in each page table entry to a -"protection key", giving 16 possible keys. +when an application changes protection domains. + +PKeys Userspace (PKU) is a feature which is found on Intel's Skylake "Scalable +Processor" Server CPUs and later. And It will be available in future +non-server Intel parts and future AMD processors. + +Future Intel processors will support Protection Keys for Supervisor pages +(PKS). + +For anyone wishing to test or use user space pkeys, it is available in Amazon's +EC2 C5 instances and is known to work there using an Ubuntu 17.04 image. + +pkeys work by dedicating 4 previously Reserved bits in each page table entry to +a "protection key", giving 16 possible keys. User and Supervisor pages are +treated separately. + +Protections for each page are controlled with per CPU registers for each type +of page User and Supervisor. Each of these 32 bit register stores two separate +bits (Access Disable and Write Disable) for each key. -There is also a new user-accessible register (PKRU) with two separate -bits (Access Disable and Write Disable) for each key. Being a CPU -register, PKRU is inherently thread-local, potentially giving each -thread a different set of protections from every other thread. +For Userspace the register is user-accessible (rdpkru/wrpkru). For +Supervisor, the register (MSR_IA32_PKRS) is accessible only to the kernel. + +Being a CPU register, pkeys are inherently thread-local, potentially giving +each thread an independent set of protections from every other thread. 
There are two new instructions (RDPKRU/WRPKRU) for reading and writing to the new register. The feature is only available in 64-bit mode, @@ -30,8 +38,11 @@ even though there is theoretically space in the PAE PTEs. These permissions are enforced on data access only and have no effect on instruction fetches. -Syscalls - +For kernel space rdmsr/wrmsr are used to access the kernel MSRs. + + +Syscalls for user space keys + There are 3 system calls which directly interact with pkeys:: @@ -98,3 +109,58 @@ with a read():: The kernel will send a SIGSEGV in both cases, but si_code will be set to SEGV
[PATCH V3 10/10] x86/pks: Add PKS test code
From: Ira Weiny The core PKS functionality provides an interface for kernel users to reserve keys to their domains set up the page tables with those keys and control access to those domains when needed. Define test code which exercises the core functionality of PKS via a debugfs entry. Basic checks can be triggered on boot with a kernel command line option while both basic and preemption checks can be triggered with separate debugfs values. debugfs controls are: '0' -- Run access tests with a single pkey '1' -- Set up the pkey register with no access for the pkey allocated to this fd '2' -- Check that the pkey register updated in '1' is still the same. (To be used after a forced context switch.) '3' -- Allocate all pkeys possible and run tests on each pkey allocated. DEFAULT when run at boot. Closing the fd will cleanup and release the pkey, therefore to exercise context switch testing a user space program is provided in: .../tools/testing/selftests/x86/test_pks.c Reviewed-by: Dave Hansen Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Ira Weiny --- Changes for V2 Fix compilation errors Changes for V1 Update for new pks_key_alloc() Changes from RFC V3 Comments from Dave Hansen clean up whitespace dmanage Clean up Kconfig help Clean up user test error output s/pks_mknoaccess/pks_mk_noaccess/ s/pks_mkread/pks_mk_readonly/ s/pks_mkrdwr/pks_mk_readwrite/ Comments from Jing Han Remove duplicate stdio.h --- Documentation/core-api/protection-keys.rst | 1 + arch/x86/mm/fault.c| 23 + lib/Kconfig.debug | 12 + lib/Makefile | 3 + lib/pks/Makefile | 3 + lib/pks/pks_test.c | 692 + tools/testing/selftests/x86/Makefile | 3 +- tools/testing/selftests/x86/test_pks.c | 66 ++ 8 files changed, 802 insertions(+), 1 deletion(-) create mode 100644 lib/pks/Makefile create mode 100644 lib/pks/pks_test.c create mode 100644 tools/testing/selftests/x86/test_pks.c diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst index 
c4e6c480562f..8ffdfbff013c 100644 --- a/Documentation/core-api/protection-keys.rst +++ b/Documentation/core-api/protection-keys.rst @@ -164,3 +164,4 @@ of WRPKRU. So to quote from the WRPKRU text: until all prior executions of WRPKRU have completed execution and updated the PKRU register. +Example code can be found in lib/pks/pks_test.c diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 90029ce9b0da..916b2d18ed57 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,6 +18,7 @@ #include /* faulthandler_disabled() */ #include /* efi_recover_from_page_fault()*/ #include +#include #include /* boot_cpu_has, ...*/ #include /* dotraplinkage, ... */ @@ -1149,6 +1150,25 @@ bool fault_in_kernel_space(unsigned long address) return address >= TASK_SIZE_MAX; } +#ifdef CONFIG_PKS_TESTING +bool pks_test_callback(irqentry_state_t *irq_state); +static bool handle_pks_testing(unsigned long hw_error_code, irqentry_state_t *irq_state) +{ + /* +* If we get a protection key exception it could be because we +* are running the PKS test. If so, pks_test_callback() will +* clear the protection mechanism and return true to indicate +* the fault was handled. +*/ + return (hw_error_code & X86_PF_PK) && pks_test_callback(irq_state); +} +#else +static bool handle_pks_testing(unsigned long hw_error_code, irqentry_state_t *irq_state) +{ + return false; +} +#endif + /* * Called for all faults where 'address' is part of the kernel address * space. Might get called for faults that originate from *code* that @@ -1165,6 +1185,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, if (!cpu_feature_enabled(X86_FEATURE_PKS)) WARN_ON_ONCE(hw_error_code & X86_PF_PK); + if (handle_pks_testing(hw_error_code, irq_state)) + return; + #ifdef CONFIG_X86_32 /* * We can fault-in kernel-space virtual memory on-demand. 
The diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c789b39ed527..e90e06f5a3b9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2444,6 +2444,18 @@ config HYPERV_TESTING help Select this option to enable Hyper-V vmbus testing. +config PKS_TESTING + bool "PKey (S)upervisor testing" + default n + depends on ARCH_HAS_SUPERVISOR_PKEYS + help + Select this option to enable testing of PKS core software and + hardware. The PKS core provides a mechanism to al
[PATCH V3 05/10] x86/entry: Pass irqentry_state_t by reference
From: Ira Weiny Currently struct irqentry_state_t only contains a single bool value which makes passing it by value is reasonable. However, future patches propose to add information to this struct, for example the PKRS register/thread state. Adding information to irqentry_state_t makes passing by value less efficient. Therefore, change the entry/exit calls to pass irq_state by reference. While at it, make the code easier to follow by changing all the usage sites to consistently use the variable name 'irq_state'. Signed-off-by: Ira Weiny --- Changes from V1 From Thomas: Update commit message Further clean up Kernel doc and comments Missed some 'return' comments which are no longer valid Changes from RFC V3 Clean up @irq_state comments Standardize on 'irq_state' for the state variable name Refactor based on new patch from Thomas Gleixner Also addresses Peter Zijlstra's comment --- arch/x86/entry/common.c | 8 arch/x86/include/asm/idtentry.h | 25 ++-- arch/x86/kernel/cpu/mce/core.c | 4 ++-- arch/x86/kernel/kvm.c | 6 +++--- arch/x86/kernel/nmi.c | 4 ++-- arch/x86/kernel/traps.c | 21 arch/x86/mm/fault.c | 6 +++--- include/linux/entry-common.h| 18 + kernel/entry/common.c | 34 + 9 files changed, 65 insertions(+), 61 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 18d8f17f755c..87dea56a15d2 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -259,9 +259,9 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) { struct pt_regs *old_regs; bool inhcall; - irqentry_state_t state; + irqentry_state_t irq_state; - state = irqentry_enter(regs); + irqentry_enter(regs, &irq_state); old_regs = set_irq_regs(regs); instrumentation_begin(); @@ -271,13 +271,13 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) set_irq_regs(old_regs); inhcall = get_and_clear_inhcall(); - if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { + if (inhcall && !WARN_ON_ONCE(irq_state.exit_rcu)) { instrumentation_begin(); 
irqentry_exit_cond_resched(); instrumentation_end(); restore_inhcall(inhcall); } else { - irqentry_exit(regs, state); + irqentry_exit(regs, &irq_state); } } #endif /* CONFIG_XEN_PV */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 247a60a47331..282d2413b6a1 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -49,12 +49,13 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - irqentry_state_t state = irqentry_enter(regs); \ + irqentry_state_t irq_state; \ \ + irqentry_enter(regs, &irq_state); \ instrumentation_begin();\ __##func (regs);\ instrumentation_end(); \ - irqentry_exit(regs, state); \ + irqentry_exit(regs, &irq_state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) @@ -96,12 +97,13 @@ static __always_inline void __##func(struct pt_regs *regs, \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - irqentry_state_t state = irqentry_enter(regs); \ + irqentry_state_t irq_state; \ \ + irqentry_enter(regs, &irq_state); \ instrumentation_begin();\ __##func (regs, error_code);\ instrumentation_end(); \ - irqentry_exit(regs, state); \ + irqentry_exit(regs,
[PATCH V3 07/10] x86/fault: Report the PKRS state on fault
From: Ira Weiny When only user space pkeys are enabled faulting within the kernel was an unexpected condition which should never happen. Therefore a WARN_ON in the kernel fault handler would detect if it ever did. Now this is no longer the case if PKS is enabled and supported. Report a Pkey fault with a normal splat and add the PKRS state to the fault splat text. Note the PKS register is reset during an exception therefore the saved PKRS value from before the beginning of the exception is passed down. If PKS is not enabled, or not active, maintain the WARN_ON_ONCE() from before. Because each fault has its own state the pkrs information will be correctly reported even if a fault 'faults'. Suggested-by: Andy Lutomirski Signed-off-by: Ira Weiny --- Changes from V2 Fix compilation error Changes from RFC V3 Update commit message Per Dave Hansen Don't print PKRS if !cpu_feature_enabled(X86_FEATURE_PKS) Fix comment Remove check on CONFIG_ARCH_HAS_SUPERVISOR_PKEYS in favor of disabled-features.h --- arch/x86/mm/fault.c | 58 ++--- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8d20c4c13abf..90029ce9b0da 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -504,7 +504,8 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) } static void -show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) +show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address, + irqentry_state_t *irq_state) { if (!oops_may_print()) return; @@ -548,6 +549,11 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad (error_code & X86_PF_PK)? 
"protection keys violation" : "permissions violation"); +#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS + if (cpu_feature_enabled(X86_FEATURE_PKS) && irq_state && (error_code & X86_PF_PK)) + pr_alert("PKRS: 0x%x\n", irq_state->thread_pkrs); +#endif + if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; @@ -626,7 +632,8 @@ static void set_signal_archinfo(unsigned long address, static noinline void no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) + unsigned long address, int signal, int si_code, + irqentry_state_t *irq_state) { struct task_struct *tsk = current; unsigned long flags; @@ -732,7 +739,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, */ flags = oops_begin(); - show_fault_oops(regs, error_code, address); + show_fault_oops(regs, error_code, address, irq_state); if (task_stack_end_corrupted(tsk)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); @@ -785,7 +792,8 @@ static bool is_vsyscall_vaddr(unsigned long vaddr) static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, u32 pkey, int si_code) + unsigned long address, u32 pkey, int si_code, + irqentry_state_t *irq_state) { struct task_struct *tsk = current; @@ -832,14 +840,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_f00f_bug(regs, address)) return; - no_context(regs, error_code, address, SIGSEGV, si_code); + no_context(regs, error_code, address, SIGSEGV, si_code, irq_state); } static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, -unsigned long address) +unsigned long address, irqentry_state_t *irq_state) { - __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR, irq_state); } static void @@ -853,7 +861,7 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ 
mmap_read_unlock(mm); - __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); + __bad_area_nosemaphore(regs, error_code, address, pkey, si_code, NULL); } static noinline void @@ -923,7 +931,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, { /* Kernel mode? Handle exceptions or die: */ if (!(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR, NULL); return; } @@ -957,7 +965,7 @@ mm_fault_error(struct pt_regs *regs, unsigned
[PATCH V3 03/10] x86/pks: Add PKS defines and Kconfig options
From: Ira Weiny Protection Keys for Supervisor pages (PKS) enables fast, hardware thread specific, manipulation of permission restrictions on supervisor page mappings. It uses the same mechanism of Protection Keys as those on User mappings but applies that mechanism to supervisor mappings using a supervisor specific MSR. Kernel users can thus defines 'domains' of page mappings which have an extra level of protection beyond those specified in the supervisor page table entries. Add the Kconfig ARCH_HAS_SUPERVISOR_PKEYS to indicate to core code that an architecture support pkeys. Select it for x86. Define the CPU features bit needed but leave DISABLE_PKS set to disable the feature until the implementation can be completed and enabled in a final patch. Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Ira Weiny --- Changes from V2 New patch for V3: Split this off from the enable patch to be able to create cleaner bisectability --- arch/x86/Kconfig| 1 + arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/disabled-features.h| 4 +++- arch/x86/include/uapi/asm/processor-flags.h | 2 ++ mm/Kconfig | 2 ++ 5 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f6946b81f74a..78c4c749c6a9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1876,6 +1876,7 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS depends on X86_64 && (CPU_SUP_INTEL || CPU_SUP_AMD) select ARCH_USES_HIGH_VMA_FLAGS select ARCH_HAS_PKEYS + select ARCH_HAS_SUPERVISOR_PKEYS help Memory Protection Keys provides a mechanism for enforcing page-based protections, but without requiring modification of the diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index dad350d42ecf..4deb580324e8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -356,6 +356,7 @@ #define X86_FEATURE_MOVDIRI(16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* 
MOVDIR64B instruction */ #define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ +#define X86_FEATURE_PKS(16*32+31) /* Protection Keys for Supervisor pages */ /* AMD-defined CPU features, CPUID level 0x8007 (EBX), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5861d34f9771..164587177152 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -44,6 +44,8 @@ # define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ +#define DISABLE_PKS (1<<(X86_FEATURE_PKS & 31)) + #ifdef CONFIG_X86_5LEVEL # define DISABLE_LA57 0 #else @@ -82,7 +84,7 @@ #define DISABLED_MASK140 #define DISABLED_MASK150 #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ -DISABLE_ENQCMD) +DISABLE_ENQCMD|DISABLE_PKS) #define DISABLED_MASK170 #define DISABLED_MASK180 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index bcba3c643e63..191c574b2390 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -130,6 +130,8 @@ #define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT) #define X86_CR4_PKE_BIT22 /* enable Protection Keys support */ #define X86_CR4_PKE_BITUL(X86_CR4_PKE_BIT) +#define X86_CR4_PKS_BIT24 /* enable Protection Keys for Supervisor */ +#define X86_CR4_PKS_BITUL(X86_CR4_PKS_BIT) /* * x86-64 Task Priority Register, CR8 diff --git a/mm/Kconfig b/mm/Kconfig index d42423f884a7..fc9ce7f65683 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -826,6 +826,8 @@ config ARCH_USES_HIGH_VMA_FLAGS bool config ARCH_HAS_PKEYS bool +config ARCH_HAS_SUPERVISOR_PKEYS + bool config PERCPU_STATS bool "Collect percpu memory statistics" -- 2.28.0.rc0.12.gb6a658bd00c9
[PATCH V3 02/10] x86/fpu: Refactor arch_set_user_pkey_access() for PKS support
From: Ira Weiny Define a helper, update_pkey_val(), which will be used to support both Protection Key User (PKU) and the new Protection Key for Supervisor (PKS) in subsequent patches. Co-developed-by: Peter Zijlstra Signed-off-by: Peter Zijlstra Signed-off-by: Ira Weiny --- Changes from RFC V3: Per Dave Hansen Update and add comments per Dave's review Per Peter Correct attribution --- arch/x86/include/asm/pkeys.h | 2 ++ arch/x86/kernel/fpu/xstate.c | 22 -- arch/x86/mm/pkeys.c | 23 +++ 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index f9feba80894b..4526245b03e5 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -136,4 +136,6 @@ static inline int vma_pkey(struct vm_area_struct *vma) return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; } +u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags); + #endif /*_ASM_X86_PKEYS_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a99afc70cc0a..a3bca3211eba 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -994,9 +994,7 @@ const void *get_xsave_field_ptr(int xfeature_nr) int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { - u32 old_pkru; - int pkey_shift = (pkey * PKR_BITS_PER_PKEY); - u32 new_pkru_bits = 0; + u32 pkru; /* * This check implies XSAVE support. 
OSPKE only gets @@ -1012,21 +1010,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ WARN_ON_ONCE(pkey >= arch_max_pkey()); - /* Set the bits we need in PKRU: */ - if (init_val & PKEY_DISABLE_ACCESS) - new_pkru_bits |= PKR_AD_BIT; - if (init_val & PKEY_DISABLE_WRITE) - new_pkru_bits |= PKR_WD_BIT; - - /* Shift the bits in to the correct place in PKRU for pkey: */ - new_pkru_bits <<= pkey_shift; - - /* Get old PKRU and mask off any old bits in place: */ - old_pkru = read_pkru(); - old_pkru &= ~((PKR_AD_BIT|PKR_WD_BIT) << pkey_shift); - - /* Write old part along with new part: */ - write_pkru(old_pkru | new_pkru_bits); + pkru = read_pkru(); + pkru = update_pkey_val(pkru, pkey, init_val); + write_pkru(pkru); return 0; } diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index f5efb4007e74..d1dfe743e79f 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -208,3 +208,26 @@ static __init int setup_init_pkru(char *opt) return 1; } __setup("init_pkru=", setup_init_pkru); + +/* + * Replace disable bits for @pkey with values from @flags + * + * Kernel users use the same flags as user space: + * PKEY_DISABLE_ACCESS + * PKEY_DISABLE_WRITE + */ +u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags) +{ + int pkey_shift = pkey * PKR_BITS_PER_PKEY; + + /* Mask out old bit values */ + pk_reg &= ~(((1 << PKR_BITS_PER_PKEY) - 1) << pkey_shift); + + /* Or in new values */ + if (flags & PKEY_DISABLE_ACCESS) + pk_reg |= PKR_AD_BIT << pkey_shift; + if (flags & PKEY_DISABLE_WRITE) + pk_reg |= PKR_WD_BIT << pkey_shift; + + return pk_reg; +} -- 2.28.0.rc0.12.gb6a658bd00c9