[PATCH v3 5/7] xfrm/compat: Add 32=>64-bit messages translator
Provide the user-to-kernel translator under XFRM_USER_COMPAT, that creates for 32-bit xfrm-user message a 64-bit translation. The translation is afterwards reused by xfrm_user code just as if userspace had sent 64-bit message. Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 6 + net/xfrm/Kconfig | 3 +- net/xfrm/xfrm_compat.c | 274 + net/xfrm/xfrm_user.c | 57 ++--- 4 files changed, 321 insertions(+), 19 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 5b6cc62c9354..fa18cb6bb3f7 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2001,11 +2001,17 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, } extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; +extern const struct nla_policy xfrma_policy[XFRMA_MAX+1]; struct xfrm_translator { /* Allocate frag_list and put compat translation there */ int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src); + /* Allocate nlmsg with 64-bit translaton of received 32-bit message */ + struct nlmsghdr *(*rcv_msg_compat)(const struct nlmsghdr *nlh, + int maxtype, const struct nla_policy *policy, + struct netlink_ext_ack *extack); + struct module *owner; }; diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index e79b48dab61b..3adf31a83a79 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -30,7 +30,8 @@ config XFRM_USER config XFRM_USER_COMPAT tristate "Compatible ABI support" - depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT + depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT && \ + HAVE_EFFICIENT_UNALIGNED_ACCESS select WANT_COMPAT_NETLINK_MESSAGES help Transformation(XFRM) user configuration interface like IPsec diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index aece41b44ff2..b1b5f972538d 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -96,6 +96,39 @@ static const int compat_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping) }; +static const struct nla_policy compat_policy[XFRMA_MAX+1] = { + [XFRMA_SA] = { .len = XMSGSIZE(compat_xfrm_usersa_info)}, + [XFRMA_POLICY] = { .len = XMSGSIZE(compat_xfrm_userpolicy_info)}, + [XFRMA_LASTUSED]= { .type = NLA_U64}, + [XFRMA_ALG_AUTH_TRUNC] = { .len = sizeof(struct xfrm_algo_auth)}, + [XFRMA_ALG_AEAD]= { .len = sizeof(struct xfrm_algo_aead) }, + [XFRMA_ALG_AUTH]= { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_CRYPT] = { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_COMP]= { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) }, + [XFRMA_TMPL]= { .len = sizeof(struct xfrm_user_tmpl) }, + [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) }, + [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) }, + [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) }, + [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 }, + [XFRMA_ETIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SRCADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_COADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_POLICY_TYPE] = { .len = sizeof(struct xfrm_userpolicy_type)}, + [XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) }, + [XFRMA_KMADDRESS] = { .len = sizeof(struct xfrm_user_kmaddress) }, + [XFRMA_MARK]= { .len = sizeof(struct xfrm_mark) }, + [XFRMA_TFCPAD] = { .type = NLA_U32 }, + [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) }, + [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, + [XFRMA_PROTO] = { .type = NLA_U8 }, + [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, + 
[XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, + [XFRMA_SET_MARK]= { .type = NLA_U32 }, + [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, + [XFRMA_IF_ID] = { .type = NLA_U32 }, +}; + static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src, u16 type) { @@ -303,9 +336,250 @@ static int xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src return 0; } +/* Calculates len of translated 64-bit message. */ +static size_t xfrm_user_rcv_calculate_len64(const struct nlmsghdr *src, + struct nlattr *attrs[XFRMA_MAX+1]) +{ + size_t len = nlmsg_len(src); + + switch (src->nlmsg_type) { + case XFRM_MSG_NEWSA: + case XFRM_MSG_NEWP
[PATCH v3 4/7] netlink/compat: Append NLMSG_DONE/extack to frag_list
Modules those use netlink may supply a 2nd skb, (via frag_list) that contains an alternative data set meant for applications using 32bit compatibility mode. In such a case, netlink_recvmsg will use this 2nd skb instead of the original one. Without this patch, such compat applications will retrieve all netlink dump data, but will then get an unexpected EOF. Cc: Johannes Berg Signed-off-by: Florian Westphal Signed-off-by: Dmitry Safonov Reviewed-by: Johannes Berg --- net/netlink/af_netlink.c | 47 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d2d1448274f5..de12dd3136f9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2186,13 +2186,35 @@ EXPORT_SYMBOL(__nlmsg_put); * It would be better to create kernel thread. */ +static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, +struct netlink_callback *cb, +struct netlink_ext_ack *extack) +{ + struct nlmsghdr *nlh; + + nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), + NLM_F_MULTI | cb->answer_flags); + if (WARN_ON(!nlh)) + return -ENOBUFS; + + nl_dump_check_consistent(cb, nlh); + memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); + + if (extack->_msg && nlk->flags & NETLINK_F_EXT_ACK) { + nlh->nlmsg_flags |= NLM_F_ACK_TLVS; + if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)) + nlmsg_end(skb, nlh); + } + + return 0; +} + static int netlink_dump(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; - struct nlmsghdr *nlh; struct module *module; int err = -ENOBUFS; int alloc_min_size; @@ -2258,22 +2280,19 @@ static int netlink_dump(struct sock *sk) return 0; } - nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, - sizeof(nlk->dump_done_errno), - NLM_F_MULTI | cb->answer_flags); - if (WARN_ON(!nlh)) + if (netlink_dump_done(nlk, skb, cb, &extack)) goto errout_skb; - nl_dump_check_consistent(cb, nlh); - - memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, - sizeof(nlk->dump_done_errno)); - - if (extack._msg && nlk->flags & NETLINK_F_EXT_ACK) { - nlh->nlmsg_flags |= NLM_F_ACK_TLVS; - if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack._msg)) - nlmsg_end(skb, nlh); +#ifdef CONFIG_COMPAT_NETLINK_MESSAGES + /* frag_list skb's data is used for compat tasks +* and the regular skb's data for normal (non-compat) tasks. +* See netlink_recvmsg(). +*/ + if (unlikely(skb_shinfo(skb)->frag_list)) { + if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack)) + goto errout_skb; } +#endif if (sk_filter(sk, skb)) kfree_skb(skb); -- 2.28.0
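For illustration of the failure mode described above, here is a minimal, hypothetical dump-reading loop (not taken from the kernel or from the selftest; the netlink socket setup and the dump request are assumed to happen elsewhere). A dump consumer keeps reading until it sees NLMSG_DONE, so if the compat (frag_list) copy of the dump is never terminated, a 32-bit reader like this treats the end of data as an unexpected EOF:

/* Hypothetical 32-bit netlink dump consumer: nlsk is an AF_NETLINK
 * socket on which a dump request has already been sent. */
#include <linux/netlink.h>
#include <stdbool.h>
#include <sys/socket.h>

static int read_dump(int nlsk)
{
	char buf[16384];
	bool done = false;

	while (!done) {
		ssize_t len = recv(nlsk, buf, sizeof(buf), 0);
		struct nlmsghdr *nh;

		if (len <= 0)
			return -1;	/* "unexpected EOF": NLMSG_DONE never arrived */

		for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
		     nh = NLMSG_NEXT(nh, len)) {
			if (nh->nlmsg_type == NLMSG_DONE) {
				done = true;
				break;
			}
			if (nh->nlmsg_type == NLMSG_ERROR)
				return -1;
			/* process one dumped object here */
		}
	}
	return 0;
}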
[PATCH v3 2/7] xfrm/compat: Add 64=>32-bit messages translator
Provide the kernel-to-user translator under XFRM_USER_COMPAT, that creates for 64-bit xfrm-user message a 32-bit translation and puts it in skb's frag_list. net/compat.c layer provides MSG_CMSG_COMPAT to decide if the message should be taken from skb or frag_list. (used by wext-core which has also an ABI difference) Kernel sends 64-bit xfrm messages to the userspace for: - multicast (monitor events) - netlink dumps Wire up the translator to xfrm_nlmsg_multicast(). Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 5 + net/xfrm/xfrm_compat.c | 296 + net/xfrm/xfrm_user.c | 15 ++- 3 files changed, 315 insertions(+), 1 deletion(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fe2e3717da14..5b6cc62c9354 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2000,7 +2000,12 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, return 0; } +extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; + struct xfrm_translator { + /* Allocate frag_list and put compat translation there */ + int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src); + struct module *owner; }; diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index f01d9af41c55..aece41b44ff2 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -8,8 +8,304 @@ #include #include +struct compat_xfrm_lifetime_cfg { + compat_u64 soft_byte_limit, hard_byte_limit; + compat_u64 soft_packet_limit, hard_packet_limit; + compat_u64 soft_add_expires_seconds, hard_add_expires_seconds; + compat_u64 soft_use_expires_seconds, hard_use_expires_seconds; +}; /* same size on 32bit, but only 4 byte alignment required */ + +struct compat_xfrm_lifetime_cur { + compat_u64 bytes, packets, add_time, use_time; +}; /* same size on 32bit, but only 4 byte alignment required */ + +struct compat_xfrm_userpolicy_info { + struct xfrm_selector sel; + struct compat_xfrm_lifetime_cfg lft; + struct compat_xfrm_lifetime_cur curlft; + __u32 priority, index; + u8 dir, action, flags, share; + /* 4 bytes additional padding on 64bit */ +}; + +struct compat_xfrm_usersa_info { + struct xfrm_selector sel; + struct xfrm_id id; + xfrm_address_t saddr; + struct compat_xfrm_lifetime_cfg lft; + struct compat_xfrm_lifetime_cur curlft; + struct xfrm_stats stats; + __u32 seq, reqid; + u16 family; + u8 mode, replay_window, flags; + /* 4 bytes additional padding on 64bit */ +}; + +struct compat_xfrm_user_acquire { + struct xfrm_id id; + xfrm_address_t saddr; + struct xfrm_selector sel; + struct compat_xfrm_userpolicy_info policy; + /* 4 bytes additional padding on 64bit */ + __u32 aalgos, ealgos, calgos, seq; +}; + +struct compat_xfrm_userspi_info { + struct compat_xfrm_usersa_info info; + /* 4 bytes additional padding on 64bit */ + __u32 min, max; +}; + +struct compat_xfrm_user_expire { + struct compat_xfrm_usersa_info state; + /* 8 bytes additional padding on 64bit */ + u8 hard; +}; + +struct compat_xfrm_user_polexpire { + struct compat_xfrm_userpolicy_info pol; + /* 8 bytes additional padding on 64bit */ + u8 hard; +}; + +#define XMSGSIZE(type) sizeof(struct type) + +static const int compat_msg_min[XFRM_NR_MSGTYPES] = { + [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_usersa_info), + [XFRM_MSG_DELSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), + [XFRM_MSG_GETSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), + [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userpolicy_info), + [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = 
XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_ALLOCSPI- XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userspi_info), + [XFRM_MSG_ACQUIRE - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_acquire), + [XFRM_MSG_EXPIRE - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_expire), + [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userpolicy_info), + [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_usersa_info), + [XFRM_MSG_POLEXPIRE - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_polexpire), + [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush), + [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = 0, + [XFRM_MSG_NEWAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id), + [XFRM_MSG_GETAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id), + [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), + [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_NEWSADINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_GETSA
[PATCH v3 7/7] selftest/net/xfrm: Add test for ipsec tunnel
It's an exhaustive testing for ipsec: covering all encryption/ authentication/compression algorithms. The tests are run in two network namespaces, connected by veth interfaces. To make exhaustive testing less time-consuming, the tests are run in parallel tasks, specified by parameter to the selftest. As the patches set adds support for xfrm in compatible tasks, there are tests to check structures that differ in size between 64-bit and 32-bit applications. The selftest doesn't use libnl so that it can be easily compiled as compatible application and don't require compatible .so. Here is a diagram of the selftest: --- | selftest | | (parent) | --- || | (pipe) | -- / | | \ /- /\ -\ | /- -\ | -|--||--|- | - -- - | | | child | | child | NS A | child | | child | | | - -- - | ---|||-|-- veth0veth1veth2 vethN -|||-|-- | | | | gr.child | | gr.child | NS B | gr.child | | gr.child | | | | The parent sends the description of a test (xfrm parameters) to the child, the child and grand child setup a tunnel over veth interface and test it by sending udp packets. Cc: Shuah Khan Cc: linux-kselft...@vger.kernel.org Signed-off-by: Dmitry Safonov --- MAINTAINERS|1 + tools/testing/selftests/net/.gitignore |1 + tools/testing/selftests/net/Makefile |1 + tools/testing/selftests/net/ipsec.c| 2195 4 files changed, 2198 insertions(+) create mode 100644 tools/testing/selftests/net/ipsec.c diff --git a/MAINTAINERS b/MAINTAINERS index d746519253c3..b50782df6264 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12134,6 +12134,7 @@ F: net/ipv6/ipcomp6.c F: net/ipv6/xfrm* F: net/key/ F: net/xfrm/ +F: tools/testing/selftests/net/ipsec.c NETWORKING [IPv4/IPv6] M: "David S. Miller" diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 742c499328b2..61ae899cfc17 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +ipsec msg_zerocopy socket psock_fanout diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 9491bbaa0831..edd4ac632dc8 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -29,6 +29,7 @@ TEST_GEN_FILES += tcp_fastopen_backup_key TEST_GEN_FILES += fin_ack_lat TEST_GEN_FILES += reuseaddr_ports_exhausted TEST_GEN_FILES += hwtstamp_config rxtimestamp timestamping txtimestamp +TEST_GEN_FILES += ipsec TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c new file mode 100644 index ..17ced7d6ce25 --- /dev/null +++ b/tools/testing/selftests/net/ipsec.c @@ -0,0 +1,2195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ipsec.c - Check xfrm on veth inside a net-ns. + * Copyright (c) 2018 Dmitry Safonov + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define printk(fmt, ...) \ + ksft_print_msg("%d[%u] " fmt "\n", getpid(), __LINE__, ##__VA_ARGS__) + +#define pr_err(fmt, ...) 
printk(fmt ": %m", ##__VA_ARGS__) + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +#define IPV4_STR_SZ16 /* xxx.xxx.xxx.xxx is longest + \0 */ +#define MAX_PAYLOAD2048 +#define XFRM_ALGO_KEY_BUF_SIZE 512 +#define MAX_PROCESSES (1 << 14) /* /16 mask divided by /30 subnets */ +#define INADDR_A ((in_addr_t) 0x0
Re: [PATCH v2 1/6] xfrm/compat: Add 64=>32-bit messages translator
On 9/7/20 12:24 PM, Steffen Klassert wrote:
[..]
> One comment on this. Looks like the above is the same in all
> commit messages. Please provide that generic information
> with the patch 0/n and remove it from the other patches.

Yeah, I think I've gotten used to that from x86/core submissions - they
prefer having the general information copied from the cover letter into
every patch, so that commits in `git log` or `git show` preserve it.
Probably one of the small differences in style between contributions to
different subsystems.

Will do, no problem.

Thanks,
Dmitry
Re: [PATCH v2 0/6] xfrm: Add compat layer
On 9/7/20 10:43 AM, Steffen Klassert wrote:
> On Wed, Aug 26, 2020 at 02:49:43AM +0100, Dmitry Safonov wrote:
[..]
>
> Thanks for the patches, looks good!
>
> Please fix the issue reported from 'kernel test robot' and resend.

Thanks, will do!

--
          Dmitry
Re: [PATCH v2 3/6] netlink/compat: Append NLMSG_DONE/extack to frag_list
On 8/26/20 8:19 AM, Johannes Berg wrote:
> On Wed, 2020-08-26 at 02:49 +0100, Dmitry Safonov wrote:
[..]
>> +nl_dump_check_consistent(cb, nlh);
>> +memcpy(nlmsg_data(nlh), &nlk->dump_done_errno,
>> +sizeof(nlk->dump_done_errno));
>
> nit: indentation here looks odd.
>
> Other than that, looks reasonable to me.
>
> Reviewed-by: Johannes Berg

Thank you for the review!

--
          Dmitry
Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
Hello,

On Wed, 26 Aug 2020 at 15:39, Michael Ellerman wrote:
> Christophe Leroy writes:
[..]
> > arch_remap() gets replaced by vdso_remap()
> >
> > For arch_unmap(), I'm wondering how/what other architectures do, because
> > powerpc seems to be the only one to erase the vdso context pointer when
> > unmapping the vdso.
>
> Yeah. The original unmap/remap stuff was added for CRIU, which I thought
> people tested on other architectures (more than powerpc even).
>
> Possibly no one really cares about vdso unmap though, vs just moving the
> vdso.
>
> We added a test for vdso unmap recently because it happened to trigger a
> KAUP failure, and someone actually hit it & reported it.

You're right, CRIU cares much more about moving the vDSO. It's done for
each restoree, and as on most setups the vDSO is premapped and used by the
application, it's actively tested. Speaking about vDSO unmap - that's a
concern only for heterogeneous C/R, i.e. when an application is migrated
from a system that uses vDSO to one which doesn't - a much rarer scenario.
(for arm it's !CONFIG_VDSO, for x86 it's the `vdso=0` boot parameter)

Looking at the code, it seems quite easy to provide/maintain .close() for
vm_special_mapping. A bit harder to add a test on the CRIU side (as glibc
won't know on restore that it can't use the vdso anymore), but totally not
impossible.

> Running that test on arm64 segfaults:
>
> # ./sigreturn_vdso
> VDSO is at 0x8191f000-0x8191 (4096 bytes)
> Signal delivered OK with VDSO mapped
> VDSO moved to 0x8191a000-0x8191afff (4096 bytes)
> Signal delivered OK with VDSO moved
> Unmapped VDSO
> Remapped the stack executable
> [ 48.556191] potentially unexpected fatal signal 11.
> [ 48.556752] CPU: 0 PID: 140 Comm: sigreturn_vdso Not tainted
> 5.9.0-rc2-00057-g2ac69819ba9e #190
> [ 48.556990] Hardware name: linux,dummy-virt (DT)
> [ 48.557336] pstate: 60001000 (nZCv daif -PAN -UAO BTYPE=--)
> [ 48.557475] pc : 8191a7bc
> [ 48.557603] lr : 8191a7bc
> [ 48.557697] sp : c13c9e90
> [ 48.557873] x29: c13cb0e0 x28:
> [ 48.558201] x27: x26:
> [ 48.558337] x25: x24:
> [ 48.558754] x23: x22:
> [ 48.558893] x21: 004009b0 x20:
> [ 48.559046] x19: 00400ff0 x18:
> [ 48.559180] x17: 817da300 x16: 00412010
> [ 48.559312] x15: x14: 001c
> [ 48.559443] x13: 656c626174756365 x12: 7865206b63617473
> [ 48.559625] x11: 0003 x10: 0101010101010101
> [ 48.559828] x9 : 818afda8 x8 : 0081
> [ 48.559973] x7 : 6174732065687420 x6 : 64657070616d6552
> [ 48.560115] x5 : 0e0388bd x4 : 0040135d
> [ 48.560270] x3 : x2 : 0001
> [ 48.560412] x1 : 0003 x0 : 004120b8
> Segmentation fault
> #
>
> So I think we need to keep the unmap hook. Maybe it should be handled by
> the special_mapping stuff generically.

I'll cook a patch for vm_special_mapping if you don't mind :-)

Thanks,
          Dmitry
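To make the ".close() for vm_special_mapping" idea above concrete, here is a rough sketch of what it could look like; the new field and the forwarding helper are assumptions modelled on the existing fault/mremap callbacks, not the patch that was eventually sent:

struct vm_special_mapping {
	const char *name;
	struct page **pages;
	vm_fault_t (*fault)(const struct vm_special_mapping *sm,
			    struct vm_area_struct *vma,
			    struct vm_fault *vmf);
	int (*mremap)(const struct vm_special_mapping *sm,
		      struct vm_area_struct *new_vma);
	/* new: invoked when the special mapping (e.g. vDSO) is unmapped,
	 * so an architecture can clear its per-mm vdso pointer */
	void (*close)(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma);
};

/* mm/mmap.c already installs a ->close for special mappings;
 * it would simply forward to the new callback: */
static void special_mapping_close(struct vm_area_struct *vma)
{
	const struct vm_special_mapping *sm = vma->vm_private_data;

	if (sm->close)
		sm->close(sm, vma);
}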
[PATCH v2 0/6] xfrm: Add compat layer
Changes since v1:
- reworked the patches set to use a translator
- separated the compat layer into xfrm_compat.c, compiled under the
  XFRM_USER_COMPAT config
- 32-bit messages are now sent in frag_list (like wext-core does)
- instead of __packed, add compat_u64 members in compat structures
- selftest reworked to the kselftest lib API
- added netlink dump testing to the selftest

XFRM is disabled for compat users because of the UABI difference: the
structures have different paddings and, as a result, the sizes of the
netlink messages differ. The ability for compat applications to manage
xfrm tunnels was disabled by commit 19d7df69fdb2 ("xfrm: Refuse to insert
32 bit userspace socket policies on 64 bit systems") and commit
74005991b78a ("xfrm: Do not parse 32bits compiled xfrm netlink msg on
64bits host").

This is my second attempt to resolve the xfrm/compat problem by adding
64=>32 and 32=>64 bit translators that, invisibly to the user, translate
between compat userspace and the kernel. The previous attempt was to
interpret the message ABI according to the syscall in xfrm_user, which
resulted in over-complicated code [1].

Florian Westphal provided the idea of the translator and some draft
patches in the discussion. In these patches his idea is reused and some
of his initial code is also present.

There were a couple of earlier attempts to solve the xfrm compat problem:
https://lkml.org/lkml/2017/1/20/733
https://patchwork.ozlabs.org/patch/44600/
http://netdev.vger.kernel.narkive.com/2Gesykj6/patch-net-next-xfrm-correctly-parse-netlink-msg-from-32bits-ip-command-on-64bits-host

All the discussions end in the conclusion that xfrm should have a full
compat layer to work correctly with 32-bit applications on 64-bit kernels:
https://lkml.org/lkml/2017/1/23/413
https://patchwork.ozlabs.org/patch/433279/

In a recent lkml discussion, Linus said that it's worth fixing this
problem rather than giving people an excuse to stay on a 32-bit kernel:
https://lkml.org/lkml/2018/2/13/752

There is also a selftest for ipsec tunnels. It doesn't depend on any
library, and a compat version can be easily built with:
make CFLAGS=-m32 net/ipsec

Patches as a .git branch:
https://github.com/0x7f454c46/linux/tree/xfrm-compat-v2

[1]: https://lkml.kernel.org/r/20180726023144.31066-1-d...@arista.com

Cc: "David S. Miller"
Cc: Florian Westphal
Cc: Herbert Xu
Cc: Jakub Kicinski
Cc: Steffen Klassert
Cc: Stephen Suryaputra
Cc: Dmitry Safonov <0x7f454...@gmail.com>
Cc: net...@vger.kernel.org

Dmitry Safonov (6):
  xfrm/compat: Add 64=>32-bit messages translator
  xfrm/compat: Attach xfrm dumps to 64=>32 bit translator
  netlink/compat: Append NLMSG_DONE/extack to frag_list
  xfrm/compat: Add 32=>64-bit messages translator
  xfrm/compat: Translate 32-bit user_policy from sockptr
  selftest/net/xfrm: Add test for ipsec tunnel

 MAINTAINERS                            |    1 +
 include/net/xfrm.h                     |   32 +
 net/netlink/af_netlink.c               |   48 +-
 net/xfrm/Kconfig                       |   11 +
 net/xfrm/Makefile                      |    1 +
 net/xfrm/xfrm_compat.c                 |  609 +++
 net/xfrm/xfrm_state.c                  |   11 +-
 net/xfrm/xfrm_user.c                   |   79 +-
 tools/testing/selftests/net/.gitignore |    1 +
 tools/testing/selftests/net/Makefile   |    1 +
 tools/testing/selftests/net/ipsec.c    | 2195
 11 files changed, 2953 insertions(+), 36 deletions(-)
 create mode 100644 net/xfrm/xfrm_compat.c
 create mode 100644 tools/testing/selftests/net/ipsec.c

--
2.27.0
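The "structures paddings" difference mentioned above comes from __u64 alignment: it is 8 bytes on x86-64 but only 4 bytes on i386, which is what COMPAT_FOR_U64_ALIGNMENT captures. A self-contained illustration with a made-up structure (not an actual xfrm UAPI struct) shows the effect; build it with -m64 and with -m32 to see the two sizes:

#include <stdint.h>
#include <stdio.h>

struct xfrm_like_example {		/* made-up stand-in, not UAPI */
	uint64_t soft_byte_limit;	/* forces 8-byte struct alignment on x86-64 */
	uint32_t priority;
	/* x86-64: 4 bytes of tail padding; i386: none */
};

int main(void)
{
	/* prints "size=16 align=8" with -m64 and "size=12 align=4" with -m32 */
	printf("size=%zu align=%zu\n",
	       sizeof(struct xfrm_like_example),
	       _Alignof(struct xfrm_like_example));
	return 0;
}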
[PATCH v2 4/6] xfrm/compat: Add 32=>64-bit messages translator
XFRM is disabled for compatible users because of the UABI difference. The difference is in structures paddings and in the result the size of netlink messages differ. Possibility for compatible application to manage xfrm tunnels was disabled by: the commmit 19d7df69fdb2 ("xfrm: Refuse to insert 32 bit userspace socket policies on 64 bit systems") and the commit 74005991b78a ("xfrm: Do not parse 32bits compiled xfrm netlink msg on 64bits host"). This is my second attempt to resolve the xfrm/compat problem by adding the 64=>32 and 32=>64 bit translators those non-visibly to a user provide translation between compatible user and kernel. Previous attempt was to interrupt the message ABI according to a syscall by xfrm_user, which resulted in over-complicated code [1]. Florian Westphal provided the idea of translator and some draft patches in the discussion. Here his idea is reused and some of his initial code is also present. Provide the user-to-kernel translator under XFRM_USER_COMPAT, that creates for 32-bit xfrm-user message a 64-bit translation. The translation is afterwards reused by xfrm_user code just as if userspace had sent 64-bit message. [1]: https://lkml.kernel.org/r/20180726023144.31066-1-d...@arista.com Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 11 ++ net/xfrm/Kconfig | 3 +- net/xfrm/xfrm_compat.c | 276 + net/xfrm/xfrm_user.c | 50 +--- 4 files changed, 321 insertions(+), 19 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9febf4f5d2ea..242e690674c6 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2000,10 +2000,15 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, return 0; } +extern const struct nla_policy xfrma_policy[XFRMA_MAX+1]; + #ifdef CONFIG_XFRM_USER_COMPAT extern int xfrm_alloc_compat(struct sk_buff *skb); extern int __xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh); extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; +extern struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *nlh, + int maxtype, const struct nla_policy *policy, + struct netlink_ext_ack *extack); #else static inline int xfrm_alloc_compat(struct sk_buff *skb) { @@ -2014,6 +2019,12 @@ static inline int __xfrm_alloc_compat(struct sk_buff *skb, { return 0; } +static inline struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *nlh, + int maxtype, const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index e79b48dab61b..3adf31a83a79 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -30,7 +30,8 @@ config XFRM_USER config XFRM_USER_COMPAT tristate "Compatible ABI support" - depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT + depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT && \ + HAVE_EFFICIENT_UNALIGNED_ACCESS select WANT_COMPAT_NETLINK_MESSAGES help Transformation(XFRM) user configuration interface like IPsec diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index b34c8b56a571..79daa7f47d5a 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -96,6 +96,39 @@ static const int compat_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping) }; +static const struct nla_policy compat_policy[XFRMA_MAX+1] = { + [XFRMA_SA] = { .len = XMSGSIZE(compat_xfrm_usersa_info)}, + [XFRMA_POLICY] = { .len = XMSGSIZE(compat_xfrm_userpolicy_info)}, + [XFRMA_LASTUSED]= { .type = NLA_U64}, + [XFRMA_ALG_AUTH_TRUNC] = { 
.len = sizeof(struct xfrm_algo_auth)}, + [XFRMA_ALG_AEAD]= { .len = sizeof(struct xfrm_algo_aead) }, + [XFRMA_ALG_AUTH]= { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_CRYPT] = { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_COMP]= { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) }, + [XFRMA_TMPL]= { .len = sizeof(struct xfrm_user_tmpl) }, + [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) }, + [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) }, + [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) }, + [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 }, + [XFRMA_ETIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SRCADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_COADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_POLICY_TYPE] = { .len = sizeof(struct xfrm_userpolicy_type
[PATCH v2 1/6] xfrm/compat: Add 64=>32-bit messages translator
XFRM is disabled for compatible users because of the UABI difference. The difference is in structures paddings and in the result the size of netlink messages differ. Possibility for compatible application to manage xfrm tunnels was disabled by: the commmit 19d7df69fdb2 ("xfrm: Refuse to insert 32 bit userspace socket policies on 64 bit systems") and the commit 74005991b78a ("xfrm: Do not parse 32bits compiled xfrm netlink msg on 64bits host"). This is my second attempt to resolve the xfrm/compat problem by adding the 64=>32 and 32=>64 bit translators those non-visibly to a user provide translation between compatible user and kernel. Previous attempt was to interrupt the message ABI according to a syscall by xfrm_user, which resulted in over-complicated code [1]. Florian Westphal provided the idea of translator and some draft patches in the discussion. In these patches, his idea is reused and some of his initial code is also present. Provide the kernel-to-user translator under XFRM_USER_COMPAT, that creates for 64-bit xfrm-user message a 32-bit translation and puts it in skb's frag_list. net/compat.c layer provides MSG_CMSG_COMPAT to decide if the message should be taken from skb or frag_list. (used by wext-core which has also an ABI difference) Kernel sends 64-bit xfrm messages to the userspace for: - multicast (monitor events) - netlink dumps Wire up the translator to xfrm_nlmsg_multicast(). [1]: https://lkml.kernel.org/r/20180726023144.31066-1-d...@arista.com Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 10 ++ net/xfrm/Kconfig | 10 ++ net/xfrm/Makefile | 1 + net/xfrm/xfrm_compat.c | 302 + net/xfrm/xfrm_user.c | 9 +- 5 files changed, 331 insertions(+), 1 deletion(-) create mode 100644 net/xfrm/xfrm_compat.c diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2737d24ec244..9810b5090338 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2000,6 +2000,16 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, return 0; } +#ifdef CONFIG_XFRM_USER_COMPAT +extern int xfrm_alloc_compat(struct sk_buff *skb); +extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; +#else +static inline int xfrm_alloc_compat(struct sk_buff *skb) +{ + return 0; +} +#endif + #if IS_ENABLED(CONFIG_IPV6) static inline bool xfrm6_local_dontfrag(const struct sock *sk) { diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 5b9a5ab48111..e79b48dab61b 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -28,6 +28,16 @@ config XFRM_USER If unsure, say Y. +config XFRM_USER_COMPAT + tristate "Compatible ABI support" + depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT + select WANT_COMPAT_NETLINK_MESSAGES + help + Transformation(XFRM) user configuration interface like IPsec + used by compatible Linux applications. + + If unsure, say N. 
+ config XFRM_INTERFACE tristate "Transformation virtual interface" depends on XFRM && IPV6 diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 2d4bb4b9f75e..494aa744bfb9 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o +obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c new file mode 100644 index ..b9eb65dde0db --- /dev/null +++ b/net/xfrm/xfrm_compat.c @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XFRM compat layer + * Author: Dmitry Safonov + * Based on code and translator idea by: Florian Westphal + */ +#include +#include +#include + +struct compat_xfrm_lifetime_cfg { + compat_u64 soft_byte_limit, hard_byte_limit; + compat_u64 soft_packet_limit, hard_packet_limit; + compat_u64 soft_add_expires_seconds, hard_add_expires_seconds; + compat_u64 soft_use_expires_seconds, hard_use_expires_seconds; +}; /* same size on 32bit, but only 4 byte alignment required */ + +struct compat_xfrm_lifetime_cur { + compat_u64 bytes, packets, add_time, use_time; +}; /* same size on 32bit, but only 4 byte alignment required */ + +struct compat_xfrm_userpolicy_info { + struct xfrm_selector sel; + struct compat_xfrm_lifetime_cfg lft; + struct compat_xfrm_lifetime_cur curlft; + __u32 priority, index; + u8 dir, action, flags, share; + /* 4 bytes additional padding on 64bit */ +}; + +struct compat_xfrm_usersa_info { + struct xfrm_selector sel; + struct xfrm_id id; + xfrm_address_t saddr; + struct compat_xfrm_lifetime_cfg
[PATCH v2 5/6] xfrm/compat: Translate 32-bit user_policy from sockptr
XFRM is disabled for compatible users because of the UABI difference. The difference is in structures paddings and in the result the size of netlink messages differ. Possibility for compatible application to manage xfrm tunnels was disabled by: the commmit 19d7df69fdb2 ("xfrm: Refuse to insert 32 bit userspace socket policies on 64 bit systems") and the commit 74005991b78a ("xfrm: Do not parse 32bits compiled xfrm netlink msg on 64bits host"). This is my second attempt to resolve the xfrm/compat problem by adding the 64=>32 and 32=>64 bit translators those non-visibly to a user provide translation between compatible user and kernel. Previous attempt was to interrupt the message ABI according to a syscall by xfrm_user, which resulted in over-complicated code [1]. Florian Westphal provided the idea of translator and some draft patches in the discussion. In these patches, his idea is reused and some of his initial code is also present. Provide compat_xfrm_userpolicy_info translation for xfrm setsocketopt(). Reallocate buffer and put the missing padding for 64-bit message. [1]: https://lkml.kernel.org/r/20180726023144.31066-1-d...@arista.com Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 5 + net/xfrm/xfrm_compat.c | 25 + net/xfrm/xfrm_state.c | 11 --- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 242e690674c6..633c210bd2dd 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2009,6 +2009,7 @@ extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; extern struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *nlh, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack); +extern int xfrm_user_policy_compat(u8 **pdata32, int optlen); #else static inline int xfrm_alloc_compat(struct sk_buff *skb) { @@ -2025,6 +2026,10 @@ static inline struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *n { return ERR_PTR(-EOPNOTSUPP); } +static inline int xfrm_user_policy_compat(u8 **pdata32, int optlen) +{ + return -EOPNOTSUPP; +} #endif #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 79daa7f47d5a..990eecfc4c0e 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -582,3 +582,28 @@ struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, return h64; } + +int xfrm_user_policy_compat(u8 **pdata32, int optlen) +{ + struct compat_xfrm_userpolicy_info *p = (void *)*pdata32; + u8 *src_templates, *dst_templates; + u8 *data64; + + if (optlen < sizeof(*p)) + return -EINVAL; + + data64 = kmalloc_track_caller(optlen + 4, GFP_USER | __GFP_NOWARN); + if (!data64) + return -ENOMEM; + + memcpy(data64, *pdata32, sizeof(*p)); + memset(data64 + sizeof(*p), 0, 4); + + src_templates = *pdata32 + sizeof(*p); + dst_templates = data64 + sizeof(*p) + 4; + memcpy(dst_templates, src_templates, optlen - sizeof(*p)); + + kfree(*pdata32); + *pdata32 = data64; + return 0; +} diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 69520ad3d83b..053e6fe6ea7a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2271,9 +2271,6 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; - if (in_compat_syscall()) - return -EOPNOTSUPP; - if (sockptr_is_null(optval) && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); @@ -2288,6 +2285,14 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t 
optval, int optlen) if (IS_ERR(data)) return PTR_ERR(data); + if (in_compat_syscall()) { + err = xfrm_user_policy_compat(&data, optlen); + if (err) { + kfree(data); + return err; + } + } + err = -EINVAL; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { -- 2.27.0
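For context, the userspace side of the path this patch re-enables for 32-bit tasks looks roughly like the sketch below (a hypothetical, minimal example, not code from the selftest): the task attaches a per-socket policy with setsockopt(), passing struct xfrm_userpolicy_info optionally followed by xfrm_user_tmpl entries, and the kernel now reallocates that buffer and inserts the 4 bytes of padding the 64-bit layout expects before the xfrm code parses it as if a 64-bit task had sent it.

#include <linux/xfrm.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

/* Attach a trivial per-socket output policy; hypothetical values. */
static int set_sock_policy(int sk)
{
	struct xfrm_userpolicy_info pol;

	memset(&pol, 0, sizeof(pol));
	pol.dir = XFRM_POLICY_OUT;
	pol.action = XFRM_POLICY_ALLOW;
	pol.sel.family = AF_INET;

	/* IPPROTO_IP level; IPv6 sockets use IPV6_XFRM_POLICY instead */
	return setsockopt(sk, IPPROTO_IP, IP_XFRM_POLICY, &pol, sizeof(pol));
}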
[PATCH v2 3/6] netlink/compat: Append NLMSG_DONE/extack to frag_list
Modules those use netlink may supply a 2nd skb, (via frag_list) that contains an alternative data set meant for applications using 32bit compatibility mode. In such a case, netlink_recvmsg will use this 2nd skb instead of the original one. Without this patch, such compat applications will retrieve all netlink dump data, but will then get an unexpected EOF. Cc: Johannes Berg Signed-off-by: Florian Westphal Signed-off-by: Dmitry Safonov --- net/netlink/af_netlink.c | 48 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b5f30d7d30d0..b096f2b4a50d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2186,13 +2186,36 @@ EXPORT_SYMBOL(__nlmsg_put); * It would be better to create kernel thread. */ +static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, +struct netlink_callback *cb, +struct netlink_ext_ack *extack) +{ + struct nlmsghdr *nlh; + + nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), + NLM_F_MULTI | cb->answer_flags); + if (WARN_ON(!nlh)) + return -ENOBUFS; + + nl_dump_check_consistent(cb, nlh); + memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, + sizeof(nlk->dump_done_errno)); + + if (extack->_msg && nlk->flags & NETLINK_F_EXT_ACK) { + nlh->nlmsg_flags |= NLM_F_ACK_TLVS; + if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)) + nlmsg_end(skb, nlh); + } + + return 0; +} + static int netlink_dump(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; - struct nlmsghdr *nlh; struct module *module; int err = -ENOBUFS; int alloc_min_size; @@ -2258,22 +2281,19 @@ static int netlink_dump(struct sock *sk) return 0; } - nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, - sizeof(nlk->dump_done_errno), - NLM_F_MULTI | cb->answer_flags); - if (WARN_ON(!nlh)) + if (netlink_dump_done(nlk, skb, cb, &extack)) goto errout_skb; - nl_dump_check_consistent(cb, nlh); - - memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, - sizeof(nlk->dump_done_errno)); - - if (extack._msg && nlk->flags & NETLINK_F_EXT_ACK) { - nlh->nlmsg_flags |= NLM_F_ACK_TLVS; - if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack._msg)) - nlmsg_end(skb, nlh); +#ifdef CONFIG_COMPAT_NETLINK_MESSAGES + /* frag_list skb's data is used for compat tasks +* and the regular skb's data for normal (non-compat) tasks. +* See netlink_recvmsg(). +*/ + if (unlikely(skb_shinfo(skb)->frag_list)) { + if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack)) + goto errout_skb; } +#endif if (sk_filter(sk, skb)) kfree_skb(skb); -- 2.27.0
[PATCH v2 2/6] xfrm/compat: Attach xfrm dumps to 64=>32 bit translator
XFRM is disabled for compatible users because of the UABI difference. The difference is in structures paddings and in the result the size of netlink messages differ. Possibility for compatible application to manage xfrm tunnels was disabled by: the commmit 19d7df69fdb2 ("xfrm: Refuse to insert 32 bit userspace socket policies on 64 bit systems") and the commit 74005991b78a ("xfrm: Do not parse 32bits compiled xfrm netlink msg on 64bits host"). This is my second attempt to resolve the xfrm/compat problem by adding the 64=>32 and 32=>64 bit translators those non-visibly to a user provide translation between compatible user and kernel. Previous attempt was to interrupt the message ABI according to a syscall by xfrm_user, which resulted in over-complicated code [1]. Florian Westphal provided the idea of translator and some draft patches in the discussion. In these patches, his idea is reused and some of his initial code is also present. Currently nlmsg_unicast() is used by functions that dump structures that can be different in size for compat tasks, see dump_one_state() and dump_one_policy(). The following nlmsg_unicast() users exist today in xfrm: Function |Message can be different | in size on compat ---|-- xfrm_get_spdinfo() | N xfrm_get_sadinfo() | N xfrm_get_sa() | Y xfrm_alloc_userspi() | Y xfrm_get_policy() | Y xfrm_get_ae() | N Besides, dump_one_state() and dump_one_policy() can be used by filtered netlink dump for XFRM_MSG_GETSA, XFRM_MSG_GETPOLICY. Just as for xfrm multicast, allocate frag_list for compat skb journey down to recvmsg() which will give user the desired skb according to syscall bitness. [1]: https://lkml.kernel.org/r/20180726023144.31066-1-d...@arista.com Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 6 ++ net/xfrm/xfrm_compat.c | 10 -- net/xfrm/xfrm_user.c | 20 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9810b5090338..9febf4f5d2ea 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2002,12 +2002,18 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, #ifdef CONFIG_XFRM_USER_COMPAT extern int xfrm_alloc_compat(struct sk_buff *skb); +extern int __xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh); extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; #else static inline int xfrm_alloc_compat(struct sk_buff *skb) { return 0; } +static inline int __xfrm_alloc_compat(struct sk_buff *skb, + const struct nlmsghdr *nlh) +{ + return 0; +} #endif #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index b9eb65dde0db..b34c8b56a571 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -272,9 +272,8 @@ static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src) return 0; } -int xfrm_alloc_compat(struct sk_buff *skb) +int __xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src) { - const struct nlmsghdr *nlh_src = nlmsg_hdr(skb); u16 type = nlh_src->nlmsg_type - XFRM_MSG_BASE; struct sk_buff *new = NULL; int err; @@ -300,3 +299,10 @@ int xfrm_alloc_compat(struct sk_buff *skb) return 0; } + +int xfrm_alloc_compat(struct sk_buff *skb) +{ + const struct nlmsghdr *nlh_src = nlmsg_hdr(skb); + + return __xfrm_alloc_compat(skb, nlh_src); +} diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 90c57d4a0b47..d135c6949336 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -992,6 +992,13 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) return 
err; } nlmsg_end(skb, nlh); + + err = __xfrm_alloc_compat(skb, nlh); + if (err) { + nlmsg_cancel(skb, nlh); + return err; + } + return 0; } @@ -1365,6 +1372,12 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } + err = xfrm_alloc_compat(skb); + if (err) { + kfree_skb(resp_skb); + goto out; + } + err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); out: @@ -1795,6 +1808,13 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr return err; } nlmsg_end(skb, nlh); + + err = __xfrm_alloc_compat(skb, nlh); + if (err) { +
[PATCH v2 6/6] selftest/net/xfrm: Add test for ipsec tunnel
It's an exhaustive testing for ipsec: covering all encryption/ authentication/compression algorithms. The tests are run in two network namespaces, connected by veth interfaces. To make exhaustive testing less time-consuming, the tests are run in parallel tasks, specified by parameter to the selftest. As the patches set adds support for xfrm in compatible tasks, there are tests to check structures that differ in size between 64-bit and 32-bit applications. The selftest doesn't use libnl so that it can be easily compiled as compatible application and don't require compatible .so. Here is a diagram of the selftest: --- | selftest | | (parent) | --- || | (pipe) | -- / | | \ /- /\ -\ | /- -\ | -|--||--|- | - -- - | | | child | | child | NS A | child | | child | | | - -- - | ---|||-|-- veth0veth1veth2 vethN -|||-|-- | | | | gr.child | | gr.child | NS B | gr.child | | gr.child | | | | The parent sends the description of a test (xfrm parameters) to the child, the child and grand child setup a tunnel over veth interface and test it by sending udp packets. Cc: Shuah Khan Cc: linux-kselft...@vger.kernel.org Signed-off-by: Dmitry Safonov --- MAINTAINERS|1 + tools/testing/selftests/net/.gitignore |1 + tools/testing/selftests/net/Makefile |1 + tools/testing/selftests/net/ipsec.c| 2195 4 files changed, 2198 insertions(+) create mode 100644 tools/testing/selftests/net/ipsec.c diff --git a/MAINTAINERS b/MAINTAINERS index 3b186ade3597..f485d551bd1e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12110,6 +12110,7 @@ F: net/ipv6/ipcomp6.c F: net/ipv6/xfrm* F: net/key/ F: net/xfrm/ +F: tools/testing/selftests/net/ipsec.c NETWORKING [IPv4/IPv6] M: "David S. Miller" diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 742c499328b2..61ae899cfc17 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +ipsec msg_zerocopy socket psock_fanout diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 9491bbaa0831..edd4ac632dc8 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -29,6 +29,7 @@ TEST_GEN_FILES += tcp_fastopen_backup_key TEST_GEN_FILES += fin_ack_lat TEST_GEN_FILES += reuseaddr_ports_exhausted TEST_GEN_FILES += hwtstamp_config rxtimestamp timestamping txtimestamp +TEST_GEN_FILES += ipsec TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c new file mode 100644 index ..17ced7d6ce25 --- /dev/null +++ b/tools/testing/selftests/net/ipsec.c @@ -0,0 +1,2195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ipsec.c - Check xfrm on veth inside a net-ns. + * Copyright (c) 2018 Dmitry Safonov + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define printk(fmt, ...) \ + ksft_print_msg("%d[%u] " fmt "\n", getpid(), __LINE__, ##__VA_ARGS__) + +#define pr_err(fmt, ...) 
printk(fmt ": %m", ##__VA_ARGS__) + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +#define IPV4_STR_SZ16 /* xxx.xxx.xxx.xxx is longest + \0 */ +#define MAX_PAYLOAD2048 +#define XFRM_ALGO_KEY_BUF_SIZE 512 +#define MAX_PROCESSES (1 << 14) /* /16 mask divided by /30 subnets */ +#define INADDR_A ((in_addr_t) 0x0
[tip: x86/core] x86/dumpstack: Add log_lvl to show_iret_regs()
The following commit has been merged into the x86/core branch of tip: Commit-ID: fd07f802a70935fbbfb9cc2d11e1d8ac95f28e44 Gitweb: https://git.kernel.org/tip/fd07f802a70935fbbfb9cc2d11e1d8ac95f28e44 Author:Dmitry Safonov AuthorDate:Mon, 29 Jun 2020 15:48:45 +01:00 Committer: Thomas Gleixner CommitterDate: Wed, 22 Jul 2020 23:56:53 +02:00 x86/dumpstack: Add log_lvl to show_iret_regs() show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, there is a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level). Add log_lvl parameter to show_iret_regs() as a preparation to add it to __show_regs() and show_regs_if_on_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Acked-by: Petr Mladek Link: https://lkml.kernel.org/r/20200629144847.492794-2-d...@arista.com --- arch/x86/include/asm/kdebug.h | 2 +- arch/x86/kernel/dumpstack.c | 8 arch/x86/kernel/process_64.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 247ab14..da024bb 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -37,7 +37,7 @@ void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, enum show_regs_mode); -extern void show_iret_regs(struct pt_regs *regs); +extern void show_iret_regs(struct pt_regs *regs, const char *log_lvl); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b037cfa..c36d629 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -126,10 +126,10 @@ void show_ip(struct pt_regs *regs, const char *loglvl) show_opcodes(regs, loglvl); } -void show_iret_regs(struct pt_regs *regs) +void show_iret_regs(struct pt_regs *regs, const char *log_lvl) { - show_ip(regs, KERN_DEFAULT); - printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + show_ip(regs, log_lvl); + printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss, regs->sp, regs->flags); } @@ -155,7 +155,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * full pt_regs might not have been saved yet. In that case * just print the iret frame. 
*/ - show_iret_regs(regs); + show_iret_regs(regs, KERN_DEFAULT); } } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9a97415..09bcb29 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -69,7 +69,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) unsigned int fsindex, gsindex; unsigned int ds, es; - show_iret_regs(regs); + show_iret_regs(regs, KERN_DEFAULT); if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
[tip: x86/core] x86/dumpstack: Add log_lvl to __show_regs()
The following commit has been merged into the x86/core branch of tip: Commit-ID: 44e215352cf17333992d56941b5bf4af60a67609 Gitweb: https://git.kernel.org/tip/44e215352cf17333992d56941b5bf4af60a67609 Author:Dmitry Safonov AuthorDate:Mon, 29 Jun 2020 15:48:46 +01:00 Committer: Thomas Gleixner CommitterDate: Wed, 22 Jul 2020 23:56:53 +02:00 x86/dumpstack: Add log_lvl to __show_regs() show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, there is a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level). Add log_lvl parameter to __show_regs(). Keep the used log level intact to separate visible change. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Acked-by: Petr Mladek Link: https://lkml.kernel.org/r/20200629144847.492794-3-d...@arista.com --- arch/x86/include/asm/kdebug.h | 3 +- arch/x86/kernel/dumpstack.c | 9 -- arch/x86/kernel/process_32.c | 29 +-- arch/x86/kernel/process_64.c | 51 +- 4 files changed, 49 insertions(+), 43 deletions(-) diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index da024bb..d1514e7 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -36,7 +36,8 @@ extern void die(const char *, struct pt_regs *,long); void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); -extern void __show_regs(struct pt_regs *regs, enum show_regs_mode); +extern void __show_regs(struct pt_regs *regs, enum show_regs_mode, + const char *log_lvl); extern void show_iret_regs(struct pt_regs *regs, const char *log_lvl); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c36d629..4954d66 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -146,7 +146,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. 
*/ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, SHOW_REGS_SHORT); + __show_regs(regs, SHOW_REGS_SHORT, KERN_DEFAULT); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -345,7 +345,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) oops_exit(); /* Executive summary in case the oops scrolled away */ - __show_regs(&exec_summary_regs, SHOW_REGS_ALL); + __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT); if (!signr) return; @@ -437,9 +437,12 @@ void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr) void show_regs(struct pt_regs *regs) { + enum show_regs_mode print_kernel_regs; + show_regs_print_info(KERN_DEFAULT); - __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL); + print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL; + __show_regs(regs, print_kernel_regs, KERN_DEFAULT); /* * When in-kernel, we also print out the stack at the time of the fault.. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index acfd6d2..4f2f54e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,8 @@ #include "process.h" -void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, +const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; @@ -67,14 +68,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) else
[tip: x86/core] x86/dumpstack: Show registers dump with trace's log level
The following commit has been merged into the x86/core branch of tip: Commit-ID: ef2ff0f5d6008d325c9a068e20981c0d0acc4d6b Gitweb: https://git.kernel.org/tip/ef2ff0f5d6008d325c9a068e20981c0d0acc4d6b Author:Dmitry Safonov AuthorDate:Mon, 29 Jun 2020 15:48:47 +01:00 Committer: Thomas Gleixner CommitterDate: Wed, 22 Jul 2020 23:56:54 +02:00 x86/dumpstack: Show registers dump with trace's log level show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, there is a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level). After all preparations are done, provide log_lvl parameter for show_regs_if_on_stack() and wire up to actual log level used as an argument for show_trace_log_lvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Acked-by: Petr Mladek Link: https://lkml.kernel.org/r/20200629144847.492794-4-d...@arista.com --- arch/x86/kernel/dumpstack.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4954d66..f9a3526 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -134,7 +134,7 @@ void show_iret_regs(struct pt_regs *regs, const char *log_lvl) } static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, - bool partial) + bool partial, const char *log_lvl) { /* * These on_stack() checks aren't strictly necessary: the unwind code @@ -146,7 +146,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. */ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, SHOW_REGS_SHORT, KERN_DEFAULT); + __show_regs(regs, SHOW_REGS_SHORT, log_lvl); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -155,7 +155,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * full pt_regs might not have been saved yet. In that case * just print the iret frame. */ - show_iret_regs(regs, KERN_DEFAULT); + show_iret_regs(regs, log_lvl); } } @@ -210,7 +210,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%s <%s>\n", log_lvl, stack_name); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); /* * Scan the stack, printing any text addresses we find. At the @@ -271,7 +271,7 @@ next: /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state, &partial); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); } if (stack_name)
Re: [PATCH v2 0/3] x86/dumpstack: Print registers with the same log level as the backtrace
Hi Ingo, Thomas, Could you take these to x86/tip tree? (in case they are good enough) On 6/29/20 3:48 PM, Dmitry Safonov wrote: > Changes since v1 [3]: > - Use (enum show_regs_mode) instead of (int) [nit by Jann, thanks!] > - Add acks from Petr > > show_trace_log_lvl() provides x86 platform-specific way to unwind > backtrace with a given log level. Unfortunately, registers dump(s) are > not printed with the same log level - instead, KERN_DEFAULT is always > used. > > Arista's switches uses quite common setup with rsyslog, where only > urgent messages goes to console (console_log_level=KERN_ERR), everything > else goes into /var/log/ as the console baud-rate often is indecently > slow (9600 bps). > > Backtrace dumps without registers printed have proven to be as useful as > morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED > (which I believe is still the most elegant way to fix raciness of sysrq[1]) > the log level should be passed down the stack to register dumping > functions. Besides, I have a potential use-case for printing traces > with KERN_DEBUG level [2] (where registers dump shouldn't appear with > higher log level than the backtrace). > > Cc: Andy Lutomirski > Cc: Borislav Petkov > Cc: "H. Peter Anvin" > Cc: Ingo Molnar > Cc: Jann Horn > Cc: Petr Mladek > Cc: Sergey Senozhatsky > Cc: Steven Rostedt > Cc: Tetsuo Handa > Cc: Thomas Gleixner > Cc: x...@kernel.org > [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ > [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ > [3]: https://lore.kernel.org/lkml/20200623162958.331051-1-d...@arista.com/ > Dmitry Safonov (3): > x86/dumpstack: Add log_lvl to show_iret_regs() > x86/dumpstack: Add log_lvl to __show_regs() > x86/dumpstack: Show registers dump with trace's log level > > arch/x86/include/asm/kdebug.h | 5 ++-- > arch/x86/kernel/dumpstack.c | 23 +--- > arch/x86/kernel/process_32.c | 29 ++-- > arch/x86/kernel/process_64.c | 51 ++- > 4 files changed, 57 insertions(+), 51 deletions(-) > Thanks, Dmitry
[PATCH v2 3/3] x86/dumpstack: Show registers dump with trace's log level
show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, I have a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level). After all preparations are done, provide log_lvl parameter for show_regs_if_on_stack() and wire up to actual log level used as an argument for show_trace_log_lvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ Acked-by: Petr Mladek Signed-off-by: Dmitry Safonov --- arch/x86/kernel/dumpstack.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4954d6678cef..f9a3526af15d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -134,7 +134,7 @@ void show_iret_regs(struct pt_regs *regs, const char *log_lvl) } static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, - bool partial) + bool partial, const char *log_lvl) { /* * These on_stack() checks aren't strictly necessary: the unwind code @@ -146,7 +146,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. */ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, SHOW_REGS_SHORT, KERN_DEFAULT); + __show_regs(regs, SHOW_REGS_SHORT, log_lvl); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -155,7 +155,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * full pt_regs might not have been saved yet. In that case * just print the iret frame. */ - show_iret_regs(regs, KERN_DEFAULT); + show_iret_regs(regs, log_lvl); } } @@ -210,7 +210,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%s <%s>\n", log_lvl, stack_name); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); /* * Scan the stack, printing any text addresses we find. At the @@ -271,7 +271,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state, &partial); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); } if (stack_name) -- 2.27.0
[PATCH v2 1/3] x86/dumpstack: Add log_lvl to show_iret_regs()
show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, I have a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level). Add log_lvl parameter to show_iret_regs() as a preparation to add it to __show_regs() and show_regs_if_on_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ Acked-by: Petr Mladek Signed-off-by: Dmitry Safonov --- arch/x86/include/asm/kdebug.h | 2 +- arch/x86/kernel/dumpstack.c | 8 arch/x86/kernel/process_64.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 247ab14c6309..da024bbda6f4 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -37,7 +37,7 @@ void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, enum show_regs_mode); -extern void show_iret_regs(struct pt_regs *regs); +extern void show_iret_regs(struct pt_regs *regs, const char *log_lvl); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b037cfa7c0c5..c36d629a25a8 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -126,10 +126,10 @@ void show_ip(struct pt_regs *regs, const char *loglvl) show_opcodes(regs, loglvl); } -void show_iret_regs(struct pt_regs *regs) +void show_iret_regs(struct pt_regs *regs, const char *log_lvl) { - show_ip(regs, KERN_DEFAULT); - printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + show_ip(regs, log_lvl); + printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss, regs->sp, regs->flags); } @@ -155,7 +155,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * full pt_regs might not have been saved yet. In that case * just print the iret frame. */ - show_iret_regs(regs); + show_iret_regs(regs, KERN_DEFAULT); } } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9a97415b2139..09bcb296cda6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -69,7 +69,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) unsigned int fsindex, gsindex; unsigned int ds, es; - show_iret_regs(regs); + show_iret_regs(regs, KERN_DEFAULT); if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); -- 2.27.0
[PATCH v2 0/3] x86/dumpstack: Print registers with the same log level as the backtrace
Changes since v1 [3]: - Use (enum show_regs_mode) instead of (int) [nit by Jann, thanks!] - Add acks from Petr show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, I have a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level than the backtrace). Cc: Andy Lutomirski Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: x...@kernel.org [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ [3]: https://lore.kernel.org/lkml/20200623162958.331051-1-d...@arista.com/ Dmitry Safonov (3): x86/dumpstack: Add log_lvl to show_iret_regs() x86/dumpstack: Add log_lvl to __show_regs() x86/dumpstack: Show registers dump with trace's log level arch/x86/include/asm/kdebug.h | 5 ++-- arch/x86/kernel/dumpstack.c | 23 +--- arch/x86/kernel/process_32.c | 29 ++-- arch/x86/kernel/process_64.c | 51 ++- 4 files changed, 57 insertions(+), 51 deletions(-) -- 2.27.0
[PATCH v2 2/3] x86/dumpstack: Add log_lvl to __show_regs()
show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, I have a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level). Add log_lvl parameter to __show_regs(). Keep the used log level intact to separate visible change. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ Acked-by: Petr Mladek Signed-off-by: Dmitry Safonov --- arch/x86/include/asm/kdebug.h | 3 ++- arch/x86/kernel/dumpstack.c | 9 --- arch/x86/kernel/process_32.c | 29 ++-- arch/x86/kernel/process_64.c | 51 ++- 4 files changed, 49 insertions(+), 43 deletions(-) diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index da024bbda6f4..d1514e70477b 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -36,7 +36,8 @@ extern void die(const char *, struct pt_regs *,long); void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); -extern void __show_regs(struct pt_regs *regs, enum show_regs_mode); +extern void __show_regs(struct pt_regs *regs, enum show_regs_mode, + const char *log_lvl); extern void show_iret_regs(struct pt_regs *regs, const char *log_lvl); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c36d629a25a8..4954d6678cef 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -146,7 +146,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. */ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, SHOW_REGS_SHORT); + __show_regs(regs, SHOW_REGS_SHORT, KERN_DEFAULT); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -345,7 +345,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) oops_exit(); /* Executive summary in case the oops scrolled away */ - __show_regs(&exec_summary_regs, SHOW_REGS_ALL); + __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT); if (!signr) return; @@ -437,9 +437,12 @@ void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr) void show_regs(struct pt_regs *regs) { + enum show_regs_mode print_kernel_regs; + show_regs_print_info(KERN_DEFAULT); - __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL); + print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL; + __show_regs(regs, print_kernel_regs, KERN_DEFAULT); /* * When in-kernel, we also print out the stack at the time of the fault.. 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index acfd6d2a0cbf..4f2f54e1281c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,8 @@ #include "process.h" -void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, +const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; @@ -67,14 +68,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) else savesegment(gs, gs); - show_ip(regs, KERN_DEFAULT); + show_ip(regs, log_lvl); - printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->ax, regs->bx, regs->cx, regs->dx); - printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->si, regs->di, regs->bp, regs->sp); - printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04
Re: [PATCH 2/3] x86/dumpstack: Add log_lvl to __show_regs()
On 6/23/20 5:45 PM, Jann Horn wrote: > On Tue, Jun 23, 2020 at 6:30 PM Dmitry Safonov wrote: >> show_trace_log_lvl() provides x86 platform-specific way to unwind >> backtrace with a given log level. Unfortunately, registers dump(s) are >> not printed with the same log level - instead, KERN_DEFAULT is always >> used. >> >> Arista's switches uses quite common setup with rsyslog, where only >> urgent messages goes to console (console_log_level=KERN_ERR), everything >> else goes into /var/log/ as the console baud-rate often is indecently >> slow (9600 bps). >> >> Backtrace dumps without registers printed have proven to be as useful as >> morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED >> (which I believe is still the most elegant way to fix raciness of sysrq[1]) >> the log level should be passed down the stack to register dumping >> functions. Besides, I have a potential use-case for printing traces >> with KERN_DEBUG level [2] (where registers dump shouldn't appear with >> higher log level). >> >> Add log_lvl parameter to __show_regs(). >> Keep the used log level intact to separate visible change. > > This change seems like a good idea to me; just one small nit: > > [...] >> void show_regs(struct pt_regs *regs) >> { >> + int print_kernel_regs; >> + >> show_regs_print_info(KERN_DEFAULT); >> >> - __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL); >> + print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL; >> + __show_regs(regs, print_kernel_regs, KERN_DEFAULT); > > print_kernel_regs should probably have type "enum show_regs_mode"? Makes sense, will fix in v2. Thanks, Dmitry
Re: [PATCH 0/3] x86/dumpstack: Print registers with the same log level as the backtrace
On 6/24/20 7:50 AM, Petr Mladek wrote: > On Tue 2020-06-23 17:29:55, Dmitry Safonov wrote: >> show_trace_log_lvl() provides x86 platform-specific way to unwind >> backtrace with a given log level. Unfortunately, registers dump(s) are >> not printed with the same log level - instead, KERN_DEFAULT is always >> used. >> >> Arista's switches uses quite common setup with rsyslog, where only >> urgent messages goes to console (console_log_level=KERN_ERR), everything >> else goes into /var/log/ as the console baud-rate often is indecently >> slow (9600 bps). >> >> Backtrace dumps without registers printed have proven to be as useful as >> morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED >> (which I believe is still the most elegant way to fix raciness of sysrq[1]) >> the log level should be passed down the stack to register dumping >> functions. Besides, I have a potential use-case for printing traces >> with KERN_DEBUG level [2] (where registers dump shouldn't appear with >> higher log level than the backtrace). >> >> Dmitry Safonov (3): >> x86/dumpstack: Add log_lvl to show_iret_regs() >> x86/dumpstack: Add log_lvl to __show_regs() >> x86/dumpstack: Show registers dump with trace's log level > > The change makes sense. It is natural next step after adding log_lvl > parameter for printing stack traces. For the entire patchset: > > Acked-by: Petr Mladek I'll address the nit by Jann and resend v2 with your Ack, thanks! > Are there any plans to add this also for other architectures, please? Yes, I'll look into that. Thanks, Dmitry
[PATCH 3/3] x86/dumpstack: Show registers dump with trace's log level
show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, I have a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level). After all preparations are done, provide log_lvl parameter for show_regs_if_on_stack() and wire up to actual log level used as an argument for show_trace_log_lvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ Signed-off-by: Dmitry Safonov --- arch/x86/kernel/dumpstack.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ec90d71979f4..6b00964d5873 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -134,7 +134,7 @@ void show_iret_regs(struct pt_regs *regs, const char *log_lvl) } static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, - bool partial) + bool partial, const char *log_lvl) { /* * These on_stack() checks aren't strictly necessary: the unwind code @@ -146,7 +146,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. */ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, SHOW_REGS_SHORT, KERN_DEFAULT); + __show_regs(regs, SHOW_REGS_SHORT, log_lvl); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -155,7 +155,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * full pt_regs might not have been saved yet. In that case * just print the iret frame. */ - show_iret_regs(regs, KERN_DEFAULT); + show_iret_regs(regs, log_lvl); } } @@ -210,7 +210,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%s <%s>\n", log_lvl, stack_name); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); /* * Scan the stack, printing any text addresses we find. At the @@ -271,7 +271,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state, &partial); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); } if (stack_name) -- 2.27.0
[PATCH 0/3] x86/dumpstack: Print registers with the same log level as the backtrace
show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, I have a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level than the backtrace). Cc: Andy Lutomirski Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: x...@kernel.org [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ Dmitry Safonov (3): x86/dumpstack: Add log_lvl to show_iret_regs() x86/dumpstack: Add log_lvl to __show_regs() x86/dumpstack: Show registers dump with trace's log level arch/x86/include/asm/kdebug.h | 5 ++-- arch/x86/kernel/dumpstack.c | 23 +--- arch/x86/kernel/process_32.c | 29 ++-- arch/x86/kernel/process_64.c | 51 ++- 4 files changed, 57 insertions(+), 51 deletions(-) -- 2.27.0
[PATCH 2/3] x86/dumpstack: Add log_lvl to __show_regs()
show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, I have a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level). Add log_lvl parameter to __show_regs(). Keep the used log level intact to separate visible change. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ Signed-off-by: Dmitry Safonov --- arch/x86/include/asm/kdebug.h | 3 ++- arch/x86/kernel/dumpstack.c | 9 --- arch/x86/kernel/process_32.c | 29 ++-- arch/x86/kernel/process_64.c | 51 ++- 4 files changed, 49 insertions(+), 43 deletions(-) diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index da024bbda6f4..d1514e70477b 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -36,7 +36,8 @@ extern void die(const char *, struct pt_regs *,long); void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); -extern void __show_regs(struct pt_regs *regs, enum show_regs_mode); +extern void __show_regs(struct pt_regs *regs, enum show_regs_mode, + const char *log_lvl); extern void show_iret_regs(struct pt_regs *regs, const char *log_lvl); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c36d629a25a8..ec90d71979f4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -146,7 +146,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. */ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, SHOW_REGS_SHORT); + __show_regs(regs, SHOW_REGS_SHORT, KERN_DEFAULT); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -345,7 +345,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) oops_exit(); /* Executive summary in case the oops scrolled away */ - __show_regs(&exec_summary_regs, SHOW_REGS_ALL); + __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT); if (!signr) return; @@ -437,9 +437,12 @@ void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr) void show_regs(struct pt_regs *regs) { + int print_kernel_regs; + show_regs_print_info(KERN_DEFAULT); - __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL); + print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL; + __show_regs(regs, print_kernel_regs, KERN_DEFAULT); /* * When in-kernel, we also print out the stack at the time of the fault.. 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index acfd6d2a0cbf..4f2f54e1281c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,8 @@ #include "process.h" -void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, +const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; @@ -67,14 +68,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) else savesegment(gs, gs); - show_ip(regs, KERN_DEFAULT); + show_ip(regs, log_lvl); - printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->ax, regs->bx, regs->cx, regs->dx); - printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->si, regs->di, regs->bp, regs->sp); - printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", -
[PATCH 1/3] x86/dumpstack: Add log_lvl to show_iret_regs()
show_trace_log_lvl() provides x86 platform-specific way to unwind backtrace with a given log level. Unfortunately, registers dump(s) are not printed with the same log level - instead, KERN_DEFAULT is always used. Arista's switches uses quite common setup with rsyslog, where only urgent messages goes to console (console_log_level=KERN_ERR), everything else goes into /var/log/ as the console baud-rate often is indecently slow (9600 bps). Backtrace dumps without registers printed have proven to be as useful as morning standups. Furthermore, in order to introduce KERN_UNSUPPRESSED (which I believe is still the most elegant way to fix raciness of sysrq[1]) the log level should be passed down the stack to register dumping functions. Besides, I have a potential use-case for printing traces with KERN_DEBUG level [2] (where registers dump shouldn't appear with higher log level). Add log_lvl parameter to show_iret_regs() as a preparation to add it to __show_regs() and show_regs_if_on_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-d...@arista.com/ [2]: https://lore.kernel.org/linux-doc/20190724170249.9644-1-d...@arista.com/ Signed-off-by: Dmitry Safonov --- arch/x86/include/asm/kdebug.h | 2 +- arch/x86/kernel/dumpstack.c | 8 arch/x86/kernel/process_64.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 247ab14c6309..da024bbda6f4 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -37,7 +37,7 @@ void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, enum show_regs_mode); -extern void show_iret_regs(struct pt_regs *regs); +extern void show_iret_regs(struct pt_regs *regs, const char *log_lvl); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b037cfa7c0c5..c36d629a25a8 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -126,10 +126,10 @@ void show_ip(struct pt_regs *regs, const char *loglvl) show_opcodes(regs, loglvl); } -void show_iret_regs(struct pt_regs *regs) +void show_iret_regs(struct pt_regs *regs, const char *log_lvl) { - show_ip(regs, KERN_DEFAULT); - printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + show_ip(regs, log_lvl); + printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss, regs->sp, regs->flags); } @@ -155,7 +155,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * full pt_regs might not have been saved yet. In that case * just print the iret frame. */ - show_iret_regs(regs); + show_iret_regs(regs, KERN_DEFAULT); } } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9a97415b2139..09bcb296cda6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -69,7 +69,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) unsigned int fsindex, gsindex; unsigned int ds, es; - show_iret_regs(regs); + show_iret_regs(regs, KERN_DEFAULT); if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); -- 2.27.0
Re: [PATCH 2/6] arm64/vdso: Zap vvar pages when switching to a time namespace
Hi Mark, On 6/16/20 12:24 PM, Mark Rutland wrote: > On Tue, Jun 16, 2020 at 12:55:41AM -0700, Andrei Vagin wrote: [..] >> Whenever a task changes its namespace, the VVAR >> page tables are cleared and then they will be re-faulted with a >> corresponding layout. > > How does this work for multi-threaded applications? Are there any > concerns w.r.t. atomicity of the change? Multi-threaded applications can't setns() to a time namespace, see timens_install(): : if (!current_is_single_threaded()) : return -EUSERS; Thanks, Dmitry
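For completeness, a hedged userspace sketch of that restriction (the target pid is hypothetical; CLONE_NEWTIME is assumed to come from the uapi headers): a process that has already spawned threads gets EUSERS back from setns().

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME	0x00000080	/* assumed value, from the linux/sched.h uapi header */
#endif

int main(void)
{
	int fd = open("/proc/1234/ns/time", O_RDONLY);	/* hypothetical target task */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Fails with EUSERS if the calling process is multi-threaded. */
	if (setns(fd, CLONE_NEWTIME) < 0)
		perror("setns");
	close(fd);
	return 0;
}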
Re: [PATCH v2 3/3] serial: core: drop redundant sysrq checks
On 6/10/20 4:22 PM, Johan Hovold wrote: > The sysrq timestamp will never be set unless port->has_sysrq is set (see > uart_handle_break()) so drop the redundant checks that were added by > commit 1997e9dfdc84 ("serial_core: Un-ifdef sysrq SUPPORT_SYSRQ"). > > Signed-off-by: Johan Hovold Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> Thanks, Dmitry
Re: [PATCH v2 2/3] serial: core: fix sysrq overhead regression
On 6/10/20 4:22 PM, Johan Hovold wrote: > Commit 8e20fc391711 ("serial_core: Move sysrq functions from header > file") converted the inline sysrq helpers to exported functions which > are now called for every received character, interrupt and break signal > also on systems without CONFIG_MAGIC_SYSRQ_SERIAL instead of being > optimised away by the compiler. > > Inlining these helpers again also avoids the function call overhead when > CONFIG_MAGIC_SYSRQ_SERIAL is enabled (e.g. when the port is not used as > a console). > > Fixes: 8e20fc391711 ("serial_core: Move sysrq functions from header file") > Cc: Dmitry Safonov <0x7f454...@gmail.com> > Signed-off-by: Johan Hovold Thanks for sending this and for the numbers. It's a bit of a pity that we need to move them back to the header, but since it matters for your setup: Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> Thanks, Dmitry
Re: [PATCH v2 2/3] serial: core: fix sysrq overhead regression
On 6/12/20 4:29 PM, Johan Hovold wrote: > On Wed, Jun 10, 2020 at 05:24:57PM +0100, Dmitry Safonov wrote: >> Hi Johan, >> >> On 6/10/20 4:22 PM, Johan Hovold wrote: >>> Commit 8e20fc391711 ("serial_core: Move sysrq functions from header >>> file") converted the inline sysrq helpers to exported functions which >>> are now called for every received character, interrupt and break signal >>> also on systems without CONFIG_MAGIC_SYSRQ_SERIAL instead of being >>> optimised away by the compiler. >> >> The part with ifdeffing looks good to me. >> >>> Inlining these helpers again also avoids the function call overhead when >>> CONFIG_MAGIC_SYSRQ_SERIAL is enabled (e.g. when the port is not used as >>> a console). >> >> But for this one, could you add measurements? (it will also help to >> understand whether it's stable material). > > Interrupt processing takes 2-3% longer without the inlining with > 8250_omap on a beagleboard for example. I think the number justifies moving them back to the header. > >> If one function call actually matters here, then should >> uart_insert_char() also go into the header? > > Good question, it actually was originally intended to be inlined as all > other per-character processing. Separate discussion though. Fair enough. > The point is that we don't want a rarely used debugging feature to incur > unnecessary additional overhead that can easily be avoided. Well, it wasn't related to the debug feature; rather, I wanted to clean up the header by moving out somewhat over-grown functions that carry implementation details. And I couldn't foresee that a function call would matter for some setup. Thanks, Dmitry
Re: vdso_join_timens() question
Hi Christian, On 6/11/20 12:02 PM, Christian Brauner wrote: > Hey, > > I'm about to finish a patch to add CLONE_NEWTIME support to setns(). > Since setns() now allows to attach to a multiple namespaces at the same > time I've also reworked it to be atomic (already upstream). Either all > namespaces are switched or no namespace is switched. All namespaces > basically now have a commit mode after which installation should ideally > not fail anymore. That could work for CLONE_NEWTIME too, I think. The > only blocker to this is vdso_join_timens() which can fail due to > mmap_write_lock_killable(). > > Is it possible to change this to mmap_write_lock()? So sm like: > > diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c > index ea7c1f0b79df..5c5b4cc61fce 100644 > --- a/arch/x86/entry/vdso/vma.c > +++ b/arch/x86/entry/vdso/vma.c > @@ -144,8 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct > time_namespace *ns) > struct mm_struct *mm = task->mm; > struct vm_area_struct *vma; > > - if (mmap_write_lock_killable(mm)) > - return -EINTR; > + mmap_write_lock(mm); > > for (vma = mm->mmap; vma; vma = vma->vm_next) { > unsigned long size = vma->vm_end - vma->vm_start; I think, it should be fine. I'm thinking if it actually could be downgraded to mmap_read_lock().. Probably, write lock was being over-cautious. > vdso_join_timens() is called in two places. Once during fork() and once > during timens_install(). I would only need the mmap_write_lock() change > for the latter. So alternatively we could have: > > __vdso_join_timens_unlocked() > > and then have/expose: > > vdso_join_timens_fork() > { > if (mmap_write_lock_killable(mm)) > return -EINTR; > __vdso_join_timens_unlocked() > mmap_write_unlock(mm); > } > > and > > vdso_join_timens_install() > { > mmap_write_lock(mm); > __vdso_join_timens_unlocked() > mmap_write_unlock(mm); > } I think it's not needed. On fork() it's called on creation of new timens: : if (nsproxy->time_ns == nsproxy->time_ns_for_children) : return 0; So the vdso_join_timens() is called on setns() or on creation of new namespace, which both are quite heavy operations themselves (in sense of locking). Thanks, Dmitry
Re: [PATCH v2 2/3] serial: core: fix sysrq overhead regression
Hi Johan, On 6/10/20 4:22 PM, Johan Hovold wrote: > Commit 8e20fc391711 ("serial_core: Move sysrq functions from header > file") converted the inline sysrq helpers to exported functions which > are now called for every received character, interrupt and break signal > also on systems without CONFIG_MAGIC_SYSRQ_SERIAL instead of being > optimised away by the compiler. The part with ifdeffing looks good to me. > Inlining these helpers again also avoids the function call overhead when > CONFIG_MAGIC_SYSRQ_SERIAL is enabled (e.g. when the port is not used as > a console). But for this one, could you add measurements? (it will also help to understand whether it's stable material). If one function call actually matters here, then should uart_insert_char() also go into the header? I see a quite common pattern in drivers: : if (!uart_handle_sysrq_char(&up->port, ch)) : uart_insert_char(&up->port, byte, 0, ch, TTY_NORMAL); Don't misunderstand me, but I would prefer keeping headers cleaner, without implementation details, except where function calls actually hurt performance. Probably a comment like /* * Keeping these functions in the header improves performance by X% on * YYY platform by letting the compiler inline them. */ would also help. Thanks for working on this, Dmitry
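To make the suggestion concrete, here is a rough sketch of how such a comment could sit next to an inlined helper in the header (shape only; the helper body is an approximation, not the exact mainline code, and the !CONFIG_MAGIC_SYSRQ_SERIAL stub follows the ifdef part of the series):

#ifdef CONFIG_MAGIC_SYSRQ_SERIAL
/*
 * Kept in the header on purpose: inlining avoids a function call for
 * every received character, which is measurable on slow platforms.
 */
static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch)
{
	if (port->sysrq) {
		if (ch && time_before(jiffies, port->sysrq)) {
			handle_sysrq(ch);
			port->sysrq = 0;
			return 1;
		}
		port->sysrq = 0;
	}
	return 0;
}
#else
static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch)
{
	return 0;
}
#endif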
Re: [PATCH RESEND v3 0/6] arm64: add the time namespace support
Hi Andrei, On 6/2/20 7:02 PM, Andrei Vagin wrote: > Allocate the time namespace page among VVAR pages and add the logic > to handle faults on VVAR properly. > > If a task belongs to a time namespace then the VVAR page which contains > the system wide VDSO data is replaced with a namespace specific page > which has the same layout as the VVAR page. That page has vdso_data->seq > set to 1 to enforce the slow path and vdso_data->clock_mode set to > VCLOCK_TIMENS to enforce the time namespace handling path. > > The extra check in the case that vdso_data->seq is odd, e.g. a concurrent > update of the VDSO data is in progress, is not really affecting regular > tasks which are not part of a time namespace as the task is spin waiting > for the update to finish and vdso_data->seq to become even again. > > If a time namespace task hits that code path, it invokes the corresponding > time getter function which retrieves the real VVAR page, reads host time > and then adds the offset for the requested clock which is stored in the > special VVAR page. > > v2: Code cleanups suggested by Vincenzo. > v3: add a comment in __arch_get_timens_vdso_data. > > Reviewed-by: Vincenzo Frascino > Cc: Thomas Gleixner > Cc: Dmitry Safonov > > v3 on github (if someone prefers `git pull` to `git am`): > https://github.com/avagin/linux-task-diag/tree/arm64/timens-v3 Thanks for adding arm64 support, I've looked through patches and don't see any major problems. Reviewed-by: Dmitry Safonov > > Andrei Vagin (6): > arm64/vdso: use the fault callback to map vvar pages > arm64/vdso: Zap vvar pages when switching to a time namespace > arm64/vdso: Add time napespace page > arm64/vdso: Handle faults on timens page > arm64/vdso: Restrict splitting VVAR VMA > arm64: enable time namespace support > > arch/arm64/Kconfig| 1 + > .../include/asm/vdso/compat_gettimeofday.h| 11 ++ > arch/arm64/include/asm/vdso/gettimeofday.h| 8 ++ > arch/arm64/kernel/vdso.c | 134 -- > arch/arm64/kernel/vdso/vdso.lds.S | 3 +- > arch/arm64/kernel/vdso32/vdso.lds.S | 3 +- > include/vdso/datapage.h | 1 + > 7 files changed, 147 insertions(+), 14 deletions(-) > Thanks, Dmitry
Re: [PATCH 3/6] arm64/vdso: Add time namespace page
Hi Andrei, On 6/2/20 7:02 PM, Andrei Vagin wrote: [..] > --- a/arch/arm64/include/asm/vdso.h > +++ b/arch/arm64/include/asm/vdso.h > @@ -12,6 +12,12 @@ > */ > #define VDSO_LBASE 0x0 > > +#ifdef CONFIG_TIME_NS > +#define __VVAR_PAGES 2 > +#else > +#define __VVAR_PAGES 1 > +#endif > + > #ifndef __ASSEMBLY__ Not an issue as-is, but: on x86 vdso+vvar is always the same size with or without CONFIG_TIME_NS. The timens page isn't allocated with !CONFIG_TIME_NS, but the vma stays the same size, which simplifies CRIU vdso migration between different kernel configs. Nothing critical, just noting it. Thanks, Dima
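In other words, one way to mirror the x86 convention would be to keep the vvar area a fixed size and simply leave the extra page unpopulated when the option is off (a sketch of the alternative, not what the posted series does):

/* Fixed-size vvar area regardless of CONFIG_TIME_NS; the time namespace
 * page is simply never populated when time namespaces are compiled out. */
#define __VVAR_PAGES	2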
Re: [PATCH 2/4] serial: core: fix broken sysrq port unlock
On 6/2/20 3:48 PM, Andy Shevchenko wrote: > On Tue, Jun 2, 2020 at 5:03 PM Johan Hovold wrote: >> >> Commit d6e1935819db ("serial: core: Allow processing sysrq at port >> unlock time") worked around a circular locking dependency by adding >> helpers used to defer sysrq processing to when the port lock was >> released. >> >> A later commit unfortunately converted these inline helpers to exported >> functions despite the fact that the unlock helper was restoring irq >> flags, something which needs to be done in the same function that saved >> them (e.g. on SPARC). > > I'm not familiar with sparc, can you elaborate a bit what is ABI / > architecture lock implementation background? I remember there was a limitation a while ago about saving/restoring flags from the same function, though I only vaguely remember the reason. I don't see this limitation in Documentation/*. Google suggests that it's related to the storage location: https://stackoverflow.com/a/34279032 Which is definitely a non-issue with tty drivers: they call spin_lock_irqsave() with local flags and pass them to uart_unlock_and_check_sysrq(). Looking into arch/sparc, I also can't tell whether it's still a limitation. Also, looking around, xa_unlock_irqrestore() is called from a different function than the one that saved the flags. Maybe this issue is history? Johan, is it a theoretical problem or something you have observed? Also, some comments near the functions in the header would be nice. Thanks, Dmitry
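For readers following along, a sketch of the calling pattern under discussion, assuming the pre-fix helper signature uart_unlock_and_check_sysrq(port, flags): the flags are saved in the driver's interrupt handler but restored inside the helper, i.e. in a different function, which is exactly the point being questioned.

/* Hypothetical driver interrupt handler, for illustration only. */
static irqreturn_t example_uart_irq(int irq, void *dev_id)
{
	struct uart_port *port = dev_id;
	unsigned long flags;

	spin_lock_irqsave(&port->lock, flags);
	/* ... read characters, possibly noting a pending sysrq ... */
	uart_unlock_and_check_sysrq(port, flags);	/* unlocks and restores 'flags' here */

	return IRQ_HANDLED;
}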
Re: [PATCHv3 12/50] csky: Add show_stack_loglvl()
Hi Andrew, On 5/15/20 8:23 PM, Andrew Morton wrote: > On Sat, 18 Apr 2020 21:19:06 +0100 Dmitry Safonov wrote: > >> Currently, the log-level of show_stack() depends on a platform >> realization. It creates situations where the headers are printed with >> lower log level or higher than the stacktrace (depending on >> a platform or user). >> >> Furthermore, it forces the logic decision from user to an architecture >> side. In result, some users as sysrq/kdb/etc are doing tricks with >> temporary rising console_loglevel while printing their messages. >> And in result it not only may print unwanted messages from other CPUs, >> but also omit printing at all in the unlucky case where the printk() >> was deferred. >> >> Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems >> an easier approach than introducing more printk buffers. >> Also, it will consolidate printings with headers. >> >> Introduce show_stack_loglvl(), that eventually will substitute >> show_stack(). > > The csky code has changed a lot in linux-next due to 18c07d23da5a > ("csky: Fixup calltrace panic"). I redid this patch as below. Can we > please review and test? The result looks good to me, thanks for the rebase! Thanks, Dmitry
Re: [PATCHv3 42/50] xtensa: Add loglvl to show_trace()
Hi Mike, On 5/11/20 8:45 PM, Mike Rapoport wrote: [..] >> @@ -511,7 +515,7 @@ void show_stack(struct task_struct *task, unsigned long >> *sp) >> print_hex_dump(KERN_INFO, " ", DUMP_PREFIX_NONE, >> STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, >> sp, len, false); >> -show_trace(task, sp); >> +show_trace(task, stack, KERN_INFO); > > it should have been > > show_trace(task, sp, KERN_INFO); Thank you for noticing it! > > Andrew, can you fold the following patch as a fixup please: > > > diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c > index f9217b6b45c8..efc3a29cde80 100644 > --- a/arch/xtensa/kernel/traps.c > +++ b/arch/xtensa/kernel/traps.c > @@ -515,7 +515,7 @@ void show_stack(struct task_struct *task, unsigned long > *sp, const char *loglvl) > print_hex_dump(loglvl, " ", DUMP_PREFIX_NONE, > STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, > sp, len, false); > - show_trace(task, stack, loglvl); > + show_trace(task, sp, loglvl); > } > > DEFINE_SPINLOCK(die_lock); > Thanks, Dmitry
Re: [PATCH -next RESEND] Documentation/admin-guide: fix sysctl Sphinx warning
Hi Randy, On 10/21/19 3:44 AM, Randy Dunlap wrote: > From: Randy Dunlap > > Fix Sphinx warning when building Documentation: > > Documentation/admin-guide/sysctl/kernel.rst:397: WARNING: Title underline too > short. > > hung_task_interval_warnings: > === Thanks for the patch! I'm in the process of reworking the patch in akpm's tree according to the reviews; I will incorporate your change in the next version. > > Signed-off-by: Randy Dunlap > Cc: Dmitry Safonov > Cc: Andrew Morton > Cc: Ingo Molnar > --- > Documentation/admin-guide/sysctl/kernel.rst |2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > --- linux-next-20191018.orig/Documentation/admin-guide/sysctl/kernel.rst > +++ linux-next-20191018/Documentation/admin-guide/sysctl/kernel.rst > @@ -394,7 +394,7 @@ This file shows up if CONFIG_DETECT_HUNG > > > hung_task_interval_warnings: > -=== > + > > The same as hung_task_warnings, but set the number of interval > warnings to be issued about detected hung tasks during check > > Thanks, Dmitry
Re: [PATCHv7 01/33] ns: Introduce Time Namespace
On 10/16/19 11:44 AM, Vincenzo Frascino wrote: > On 10/16/19 11:39 AM, Thomas Gleixner wrote: [..] >> config TIME_NS >> bool "TIME namespace" >> depends on GENERIC_VDSO_TIME_NS >> default y >> >> and in lib/vdso/Kconfig >> >> config GENERIC_VDSO_TIME_NS >> bool >> >> and let architectures which have support for the VDSO bits select it. >> > > Agreed, this is even better. Thanks, will fix in v8, Dmitry
[PATCHv7 02/33] time: Add timens_offsets to be used for tasks in timens
From: Andrei Vagin Introduce offsets for time namespace. They will contain an adjustment needed to convert clocks to/from host's. A new namespace is created with the same offsets as the time namespace of the current process. Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- include/linux/time_namespace.h | 22 ++ kernel/time/namespace.c| 2 ++ 2 files changed, 24 insertions(+) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 873b908c9ba8..3d429c7ecca5 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -12,11 +12,17 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct timens_offsets { + struct timespec64 monotonic; + struct timespec64 boottime; +}; + struct time_namespace { struct kref kref; struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; + struct timens_offsets offsets; } __randomize_layout; extern struct time_namespace init_time_ns; @@ -37,6 +43,20 @@ static inline void put_time_ns(struct time_namespace *ns) kref_put(&ns->kref, free_time_ns); } +static inline void timens_add_monotonic(struct timespec64 *ts) +{ + struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; + + *ts = timespec64_add(*ts, ns_offsets->monotonic); +} + +static inline void timens_add_boottime(struct timespec64 *ts) +{ + struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; + + *ts = timespec64_add(*ts, ns_offsets->boottime); +} + #else static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { @@ -61,6 +81,8 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *ts return 0; } +static inline void timens_add_monotonic(struct timespec64 *ts) {} +static inline void timens_add_boottime(struct timespec64 *ts) {} #endif #endif /* _LINUX_TIMENS_H */ diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 2662a69e0382..c2a58e45fc4b 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { @@ -60,6 +61,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, ns->ucounts = ucounts; ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); + ns->offsets = old_ns->offsets; return ns; fail_free: -- 2.23.0
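As a pointer to where these offsets end up being consumed: a sketch of how a clock getter is expected to use the new helper (the actual wiring is done by later patches in this series):

static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
	ktime_get_ts64(tp);
	/* Shift the host clock reading into the task's time namespace. */
	timens_add_monotonic(tp);
	return 0;
}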
[PATCHv7 11/33] timerfd: Make timerfd_settime() time namespace aware
From: Andrei Vagin timerfd_settime() accepts an absolute value of the expiration time if TFD_TIMER_ABSTIME is specified. This value is in task's time namespace and has to be converted to the host's time namespace. Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- fs/timerfd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/timerfd.c b/fs/timerfd.c index 48305ba41e3c..f9da5752a79e 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -26,6 +26,7 @@ #include #include #include +#include struct timerfd_ctx { union { @@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, } if (texp != 0) { + if (flags & TFD_TIMER_ABSTIME) + texp = timens_ktime_to_host(clockid, texp); if (isalarm(ctx)) { if (flags & TFD_TIMER_ABSTIME) alarm_start(&ctx->t.alarm, texp); -- 2.23.0
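From the userspace side, the reason for the conversion looks roughly like this (hedged sketch): inside a time namespace an absolute expiry is expressed on the namespace's clock, so the kernel has to translate it back to the host clock before arming the underlying hrtimer.

#include <stdio.h>
#include <sys/timerfd.h>
#include <time.h>

int main(void)
{
	struct itimerspec its = { 0 };
	int fd = timerfd_create(CLOCK_MONOTONIC, 0);

	if (fd < 0) {
		perror("timerfd_create");
		return 1;
	}
	/* Absolute expiry on the namespace's CLOCK_MONOTONIC: fire in ~10s. */
	clock_gettime(CLOCK_MONOTONIC, &its.it_value);
	its.it_value.tv_sec += 10;
	if (timerfd_settime(fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
		perror("timerfd_settime");
	return 0;
}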
[PATCHv7 31/33] selftests/timens: Add timer offsets test
From: Andrei Vagin Check that timer_create() takes into account clock offsets. Output on success: 1..3 ok 1 clockid=7 ok 2 clockid=1 ok 3 clockid=9 # Pass 3 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output with lack of permissions: 1..3 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..3 not ok 1 # SKIP Time namespaces are not supported Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 3 +- tools/testing/selftests/timens/timer.c| 118 ++ 3 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/timer.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 94ffdd9cead7..3b7eda8f35ce 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,4 +1,5 @@ clock_nanosleep procfs timens +timer timerfd diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index e4913f2991d4..08164548a49d 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,5 +1,6 @@ -TEST_GEN_PROGS := timens timerfd clock_nanosleep procfs +TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs CFLAGS := -Wall -Werror -pthread +LDFLAGS := -lrt include ../lib.mk diff --git a/tools/testing/selftests/timens/timer.c b/tools/testing/selftests/timens/timer.c new file mode 100644 index ..5164cafd408d --- /dev/null +++ b/tools/testing/selftests/timens/timer.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +int run_test(int clockid, struct timespec now) +{ + struct itimerspec new_value; + long long elapsed; + timer_t fd; + int i; + + for (i = 0; i < 2; i++) { + struct sigevent sevp = {.sigev_notify = SIGEV_NONE}; + int flags = 0; + + new_value.it_value.tv_sec = 3600; + new_value.it_value.tv_nsec = 0; + new_value.it_interval.tv_sec = 1; + new_value.it_interval.tv_nsec = 0; + + if (i == 1) { + new_value.it_value.tv_sec += now.tv_sec; + new_value.it_value.tv_nsec += now.tv_nsec; + } + + if (timer_create(clockid, &sevp, &fd) == -1) + return pr_perror("timerfd_create"); + + if (i == 1) + flags |= TIMER_ABSTIME; + if (timer_settime(fd, flags, &new_value, NULL) == -1) + return pr_perror("timerfd_settime"); + + if (timer_gettime(fd, &new_value) == -1) + return pr_perror("timerfd_gettime"); + + elapsed = new_value.it_value.tv_sec; + if (abs(elapsed - 3600) > 60) { + ksft_test_result_fail("clockid: %d elapsed: %lld\n", + clockid, elapsed); + return 1; + } + } + + ksft_test_result_pass("clockid=%d\n", clockid); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret, status, len, fd; + char buf[4096]; + pid_t pid; + struct timespec btime_now, mtime_now; + + nscheck(); + + ksft_set_plan(3); + + clock_gettime(CLOCK_MONOTONIC, &mtime_now); + clock_gettime(CLOCK_BOOTTIME, &btime_now); + + if (unshare_timens()) + return 1; + + len = snprintf(buf, sizeof(buf), "%d %d 0\n%d %d 0", + CLOCK_MONOTONIC, 70 * 24 * 3600, + CLOCK_BOOTTIME, 9 * 24 * 3600); + fd = open("/proc/self/timens_offsets", O_WRONLY); + if (fd < 0) + return pr_perror("/proc/self/timens_offsets"); + + if (write(fd, buf, len) != len) + return pr_perror("/proc/self/timens_offsets"); + + close(fd); + 
mtime_now.tv_sec += 70 * 24 * 3600; + btime_now.tv_sec += 9 * 24 * 3600; + + pid = fork(); + if (pid < 0) + return pr_perror("Unable to fork"); + if (pid == 0) { + ret = 0; + ret |= run_test(CLOCK_BOOTTIME, btime_now); + ret |= run_test(CLOCK_MONOTONIC, mtime_now); + ret |= run_test(CLOCK_BOOTTIME_ALARM, btime_now); + + if (ret) + ksft_exit_fail(); + ksft_exit_pass(); + return ret; +
[PATCHv7 07/33] posix-clocks: Introduce clock_get_ktime() callback
From: Andrei Vagin The callsite in common_timer_get() has already a comment: /* * The timespec64 based conversion is suboptimal, but it's not * worth to implement yet another callback. */ kc->clock_get(timr->it_clock, &ts64); now = timespec64_to_ktime(ts64); The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() That adds a valid reason to finally implement a separate callback which returns the time in ktime_t format. Suggested-by: Thomas Gleixner Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/alarmtimer.c | 19 ++- kernel/time/posix-timers.c | 26 +- kernel/time/posix-timers.h | 3 +++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 73a5458194c7..9415c83f8cca 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -664,7 +664,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp * @which_clock: clockid * @tp: timespec to fill. * - * Provides the underlying alarm base time. + * Provides the underlying alarm base time in a tasks time namespace. */ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) { @@ -676,6 +676,22 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp return base->get_timespec(base->base_clockid, tp); } +/** + * alarm_clock_get_ktime - posix clock_get_ktime interface + * @which_clock: clockid + * + * Provides the underlying alarm base time in the root namespace. + */ +static ktime_t alarm_clock_get_ktime(clockid_t which_clock) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + return base->get_ktime(); +} + /** * alarm_timer_create - posix timer_create interface * @new_timer: k_itimer pointer to manage @@ -839,6 +855,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, + .clock_get_ktime= alarm_clock_get_ktime, .clock_get_timespec = alarm_clock_get_timespec, .timer_create = alarm_timer_create, .timer_set = common_timer_set, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e65241a46038..1d7329e8425f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -171,6 +171,11 @@ int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) return 0; } +static ktime_t posix_get_realtime_ktime(clockid_t which_clock) +{ + return ktime_get_real(); +} + /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) @@ -193,6 +198,11 @@ static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 return 0; } +static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) +{ + return ktime_get(); +} + /* * Get monotonic-raw time for posix timers */ @@ -228,12 +238,22 @@ int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 * return 0; } +static ktime_t posix_get_boottime_ktime(const clockid_t which_clock) +{ + return ktime_get_boottime(); +} + static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; } +static ktime_t posix_get_tai_ktime(clockid_t which_clock) +{ + return ktime_get_clocktai(); +} + static int 
posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; @@ -781,7 +801,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the * hood. See hrtimer_init(). Update timr->kclock, so the generic -* functions which use timr->kclock->clock_get_timespec() work. +* functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might * use ABSTIME, so it needs to switch back. @@ -1262,6 +1282,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_realtime_timespec, + .clock_get_ktime= posix_get_realtime_ktime, .clock_set
[PATCHv7 18/33] lib/vdso: Add unlikely() hint into vdso_read_begin()
From: Andrei Vagin

Place the branch handling the uncontended case (no concurrent write in progress) before the contended case.

Performance numbers for Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz (more clock_gettime() cycles - the better):

        | before    | after
--------|-----------|-----------
        | 150252214 | 153242367
        | 150301112 | 153324800
        | 150392773 | 153125401
        | 150373957 | 153399355
        | 150303157 | 153489417
        | 150365237 | 153494270
--------|-----------|-----------
avg     | 150331408 | 153345935
diff %  |         2 |         0
--------|-----------|-----------
stdev % |       0.3 |       0.1

Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- include/vdso/helpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 01641dbb68ef..9a2af9fca45e 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -10,7 +10,7 @@ static __always_inline u32 vdso_read_begin(const struct vdso_data *vd) { u32 seq; - while ((seq = READ_ONCE(vd->seq)) & 1) + while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) cpu_relax(); smp_rmb(); -- 2.23.0
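A minimal userspace sketch (not part of the patch) of the same reader pattern using GCC's __builtin_expect(); the sample_data/sample_read_begin names are made up for illustration. The sequence word is odd only while a writer is mid-update, so hinting that branch as unlikely keeps the common, uncontended read on the straight-line path.

#include <stdatomic.h>

struct sample_data {
	atomic_uint seq;
	unsigned long value;
};

static unsigned int sample_read_begin(struct sample_data *d)
{
	unsigned int seq;

	/* Only a writer in progress keeps the sequence odd - the rare case. */
	while (__builtin_expect((seq = atomic_load_explicit(&d->seq,
					memory_order_acquire)) & 1, 0))
		;	/* a cpu_relax()-style pause would go here */

	return seq;
}

int main(void)
{
	struct sample_data d = { .seq = 0, .value = 42 };

	/* With no concurrent writer the loop body never runs. */
	return sample_read_begin(&d) & 1;
}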
[PATCHv7 12/33] posix-timers: Make timer_settime() time namespace aware
From: Andrei Vagin Wire the timer_settime() syscall into time namespace virtualization. sys_timer_settime() calls the k_clock->timer_set() callback. Right now, common_timer_set() is the only implementation of that callback. There the user-supplied timer value is converted from timespec64 to ktime, and then timens_ktime_to_host() can be used to convert the namespace's time to the host time. Inside a time namespace the kernel's time differs by a fixed offset from the user-supplied time, but only absolute values (TIMER_ABSTIME) must be converted. Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/posix-timers.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d8b5bd4cbae1..6e350cc8f600 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -885,6 +885,8 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); + if (flags & TIMER_ABSTIME) + expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); -- 2.23.0
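A minimal userspace sketch (not part of the series) of the case this patch covers: an absolute expiry computed from a clock read inside a time namespace is namespace-relative, so the kernel has to shift it by the namespace offset before arming the underlying hrtimer. Only standard POSIX timer calls are used; linking with -lrt may be needed on older glibc.

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGALRM,
	};
	struct itimerspec its = { 0 };
	struct timespec now;
	timer_t timer;

	if (timer_create(CLOCK_MONOTONIC, &sev, &timer))
		return perror("timer_create"), 1;

	/* Absolute expiry: "now + 5s" as observed inside the caller's time namespace. */
	clock_gettime(CLOCK_MONOTONIC, &now);
	its.it_value.tv_sec  = now.tv_sec + 5;
	its.it_value.tv_nsec = now.tv_nsec;

	/* TIMER_ABSTIME is the path that now goes through timens_ktime_to_host(). */
	if (timer_settime(timer, TIMER_ABSTIME, &its, NULL))
		return perror("timer_settime"), 1;

	pause();	/* default SIGALRM action terminates the program */
	return 0;
}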
[PATCHv7 20/33] x86/vdso: Provide vdso_data offset on vvar_page
VDSO support for Time namespace needs to set up a page with the same layout as VVAR. That timens page will be placed on position of VVAR page inside namespace. That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. To prepare timens page kernel needs to know the vdso_data offset. Provide arch_get_vdso_data() helper for locating vdso_data on VVAR page. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vdso-layout.lds.S | 2 -- arch/x86/entry/vdso/vma.c | 11 +++ arch/x86/include/asm/vvar.h | 8 arch/x86/kernel/vmlinux.lds.S | 4 +--- include/linux/time_namespace.h| 1 + 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 93c6dc7812d0..2330daad67c3 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -21,9 +21,7 @@ SECTIONS /* Place all vvars at the offsets in asm/vvar.h. */ #define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; -#define __VVAR_KERNEL_LDS #include -#undef __VVAR_KERNEL_LDS #undef EMIT_VVAR pvclock_page = vvar_start + PAGE_SIZE; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 000db8282cc8..5dab706aca2e 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -24,6 +24,17 @@ #include #include +#undef _ASM_X86_VVAR_H +#define EMIT_VVAR(name, offset)\ + const size_t name ## _offset = offset; +#include + +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return (struct vdso_data *)(vvar_page + _vdso_data_offset); +} +#undef EMIT_VVAR + #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; #endif diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 32f5d9a0b90e..ff2de3025388 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -19,10 +19,10 @@ #ifndef _ASM_X86_VVAR_H #define _ASM_X86_VVAR_H -#if defined(__VVAR_KERNEL_LDS) - -/* The kernel linker script defines its own magic to put vvars in the - * right place. +#ifdef EMIT_VVAR +/* + * EMIT_VVAR() is used by the kernel linker script to put vvars in the + * right place. Also, it's used by kernel code to import offsets values. */ #define DECLARE_VVAR(offset, type, name) \ EMIT_VVAR(name, offset) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e2feacf921a0..ca02d0d301cd 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -195,12 +195,10 @@ SECTIONS __vvar_beginning_hack = .; /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset)\ +#define EMIT_VVAR(name, offset)\ . = __vvar_beginning_hack + offset; \ *(.vvar_ ## name) -#define __VVAR_KERNEL_LDS #include -#undef __VVAR_KERNEL_LDS #undef EMIT_VVAR /* diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 9a77d3854830..772911945944 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -37,6 +37,7 @@ extern struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns); extern void free_time_ns(struct kref *kref); extern int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); +extern struct vdso_data *arch_get_vdso_data(void *vvar_page); static inline void put_time_ns(struct time_namespace *ns) { -- 2.23.0
[PATCHv7 15/33] posix-timers: Make clock_nanosleep() time namespace aware
From: Andrei Vagin clock_nanosleep() accepts absolute values of expiration time, if the TIMER_ABSTIME flag is set. This value is in the task time namespace, which has to be converted to the host time namespace. Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/posix-stubs.c | 12 ++-- kernel/time/posix-timers.c | 17 +++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 2ccefc9ce184..47ee2684d250 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -129,6 +129,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct __kernel_timespec __user *, rmtp) { struct timespec64 t; + ktime_t texp; switch (which_clock) { case CLOCK_REALTIME: @@ -147,7 +148,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(clockid, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -215,6 +219,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, struct old_timespec32 __user *, rmtp) { struct timespec64 t; + ktime texp; switch (which_clock) { case CLOCK_REALTIME: @@ -233,7 +238,10 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(clockid, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index c0ae1f6d2add..ccae99a645e1 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1228,6 +1228,19 @@ static int common_nsleep(const clockid_t which_clock, int flags, which_clock); } +static int common_nsleep_timens(const clockid_t which_clock, int flags, +const struct timespec64 *rqtp) +{ + ktime_t texp = timespec64_to_ktime(*rqtp); + + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? 
+HRTIMER_MODE_ABS : HRTIMER_MODE_REL, +which_clock); +} + SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) @@ -1305,7 +1318,7 @@ static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_timespec, .clock_get_ktime= posix_get_monotonic_ktime, - .nsleep = common_nsleep, + .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1354,7 +1367,7 @@ static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, .clock_get_ktime= posix_get_boottime_ktime, .clock_get_timespec = posix_get_boottime_timespec, - .nsleep = common_nsleep, + .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, -- 2.23.0
[PATCHv7 08/33] posix-timers: Use clock_get_ktime() in common_timer_get()
From: Andrei Vagin Now, when the clock_get_ktime() callback exists, the suboptimal timespec64-based conversion can be removed from common_timer_get(). Suggested-by: Thomas Gleixner Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/posix-timers.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1d7329e8425f..47a8d43fe1c6 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -665,7 +665,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; - struct timespec64 ts64; bool sig_none; sig_none = timr->it_sigev_notify == SIGEV_NONE; @@ -683,12 +682,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) return; } - /* -* The timespec64 based conversion is suboptimal, but it's not -* worth to implement yet another callback. -*/ - kc->clock_get_timespec(timr->it_clock, &ts64); - now = timespec64_to_ktime(ts64); + now = kc->clock_get_ktime(timr->it_clock); /* * When a requeue is pending or this is a SIGEV_NONE timer move the -- 2.23.0
[PATCHv7 24/33] x86/vdso: On timens page fault prefault also VVAR page
As the timens page only carries clock offsets and refers to data on the VVAR page, the VVAR page is going to be accessed shortly after a timens page fault. Prefault the VVAR page together with the timens page in a single page fault as an optimization. Suggested-by: Thomas Gleixner Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index f6e13ab29d94..d6cb8a16f368 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -169,8 +169,23 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, * offset. * See also the comment near timens_setup_vdso_data(). */ - if (timens_page) + if (timens_page) { + unsigned long addr; + vm_fault_t err; + + /* +* Optimization: inside time namespace pre-fault +* VVAR page too. As on timens page there are only +* offsets for clocks on VVAR, it'll be faulted +* shortly by VDSO code. +*/ + addr = vmf->address + (image->sym_timens_page - sym_offset); + err = vmf_insert_pfn(vma, addr, pfn); + if (unlikely(err & VM_FAULT_ERROR)) + return err; + pfn = page_to_pfn(timens_page); + } return vmf_insert_pfn(vma, vmf->address, pfn); } else if (sym_offset == image->sym_pvclock_page) { -- 2.23.0
[PATCHv7 17/33] x86/vdso: Restrict splitting VVAR VMA
Forbid splitting the VVAR VMA. This results in a stricter ABI and reduces the number of corner cases to consider while working further on the VDSO. As the offset from the timens page to the VVAR page is computed at compile time, the pages of the VVAR area have to stay together and must not be partially mremap()'ed. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 13 + 1 file changed, 13 insertions(+) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index f5937742b290..000db8282cc8 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -84,6 +84,18 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } +static int vvar_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + const struct vdso_image *image = new_vma->vm_mm->context.vdso_image; + + if (new_size != -image->sym_vvar_start) + return -EINVAL; + + return 0; +} + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -136,6 +148,7 @@ static const struct vm_special_mapping vdso_mapping = { static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", .fault = vvar_fault, + .mremap = vvar_mremap, }; /* -- 2.23.0
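A userspace sketch (not part of the patch) of what the new vvar_mremap() callback is expected to enforce: resizing the [vvar] mapping now fails, since any new size different from -image->sym_vvar_start is rejected. The sketch parses /proc/self/maps to find [vvar] and tries to shrink it to one page; the hard-coded 4096-byte page size is an assumption for brevity.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	unsigned long start, end;
	char line[256];
	FILE *maps = fopen("/proc/self/maps", "r");

	if (!maps)
		return 1;

	while (fgets(line, sizeof(line), maps)) {
		if (!strstr(line, "[vvar]"))
			continue;
		if (sscanf(line, "%lx-%lx", &start, &end) != 2)
			break;

		/* Try to shrink the whole VVAR area down to a single page in place. */
		if (mremap((void *)start, end - start, 4096, 0) == MAP_FAILED)
			perror("mremap([vvar]) failed as expected");
		else
			printf("mremap([vvar]) unexpectedly succeeded\n");
		break;
	}

	fclose(maps);
	return 0;
}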
[PATCHv7 19/33] lib/vdso: Prepare for time namespace support
From: Thomas Gleixner To support time namespaces in the vdso with a minimal impact on regular non time namespace affected tasks, the namespace handling needs to be hidden in a slow path. The most obvious place is vdso_seq_begin(). If a task belongs to a time namespace then the VVAR page which contains the system wide vdso data is replaced with a namespace specific page which has the same layout as the VVAR page. That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. The extra check in the case that vdso_data->seq is odd, e.g. a concurrent update of the vdso data is in progress, is not really affecting regular tasks which are not part of a time namespace as the task is spin waiting for the update to finish and vdso_data->seq to become even again. If a time namespace task hits that code path, it invokes the corresponding time getter function which retrieves the real VVAR page, reads host time and then adds the offset for the requested clock which is stored in the special VVAR page. If VDSO time namespace support is disabled the whole magic is compiled out. Initial testing shows that the disabled case is almost identical to the host case which does not take the slow timens path. With the special timens page installed the performance hit is constant time and in the range of 5-7%. For the vdso functions which are not using the sequence count an unconditional check for vdso_data->clock_mode is added which switches to the real vdso when the clock_mode is VCLOCK_TIMENS. Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- include/linux/time.h| 6 ++ include/vdso/datapage.h | 19 +- lib/vdso/Kconfig| 6 ++ lib/vdso/gettimeofday.c | 128 +++- 4 files changed, 155 insertions(+), 4 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index 27d83fd2ae61..b1a592638d7d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -96,4 +96,10 @@ static inline bool itimerspec64_valid(const struct itimerspec64 *its) */ #define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) #define time_before32(b, a)time_after32(a, b) + +struct timens_offset { + s64 sec; + u64 nsec; +}; + #endif diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 2e302c0f41f7..65a38acce27e 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -21,6 +21,8 @@ #define CS_RAW 1 #define CS_BASES (CS_RAW + 1) +#define VCLOCK_TIMENS UINT_MAX + /** * struct vdso_timestamp - basetime per clock_id * @sec: seconds @@ -48,6 +50,7 @@ struct vdso_timestamp { * @mult: clocksource multiplier * @shift: clocksource shift * @basetime[clock_id]:basetime per clock_id + * @offset[clock_id]: time namespace offset per clock_id * @tz_minuteswest:minutes west of Greenwich * @tz_dsttime:type of DST correction * @hrtimer_res: hrtimer resolution @@ -55,6 +58,17 @@ struct vdso_timestamp { * * vdso_data will be accessed by 64 bit and compat code at the same time * so we should be careful before modifying this structure. + * + * @basetime is used to store the base time for the system wide time getter + * VVAR page. + * + * @offset is used by the special time namespace VVAR pages which are + * installed instead of the real VVAR page. These namespace pages must set + * @seq to 1 and @clock_mode to VLOCK_TIMENS to force the code into the + * time namespace slow path. 
The namespace aware functions retrieve the + * real system wide VVAR page, read host time and add the per clock offset. + * For clocks which are not affected by time namespace adjustement the + * offset must be zero. */ struct vdso_data { u32 seq; @@ -65,7 +79,10 @@ struct vdso_data { u32 mult; u32 shift; - struct vdso_timestamp basetime[VDSO_BASES]; + union { + struct vdso_timestamp basetime[VDSO_BASES]; + struct timens_offsetoffset[VDSO_BASES]; + }; s32 tz_minuteswest; s32 tz_dsttime; diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 9fe698ff62ec..85276de70dba 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -24,4 +24,10 @@ config GENERIC_COMPAT_VDSO help This config option enables the compat VDSO layer. +config VDSO_TIMENS + bool + help + Selected by architectures which support time namespaces in the + VDSO + endif diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index e630e7ff57f1..25244b677823 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -38,6 +38,51 @@ u64 vdso_calc_delta(u6
[PATCHv7 28/33] selftests/timens: Add a test for timerfd
From: Andrei Vagin Check that timerfd_create() takes into account clock offsets. Output on success: 1..3 ok 1 clockid=7 ok 2 clockid=1 ok 3 clockid=9 # Pass 3 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output on failure: 1..3 not ok 1 clockid: 7 elapsed: 0 not ok 2 clockid: 1 elapsed: 0 not ok 3 clockid: 9 elapsed: 0 Bail out! Output with lack of permissions: 1..3 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..3 not ok 1 # SKIP Time namespaces are not supported Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 2 +- tools/testing/selftests/timens/timerfd.c | 129 ++ 3 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/timerfd.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 27a693229ce1..b609f6ee9fb9 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1 +1,2 @@ timens +timerfd diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index b877efb78974..66b90cd28e5c 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,4 +1,4 @@ -TEST_GEN_PROGS := timens +TEST_GEN_PROGS := timens timerfd CFLAGS := -Wall -Werror diff --git a/tools/testing/selftests/timens/timerfd.c b/tools/testing/selftests/timens/timerfd.c new file mode 100644 index ..619b096b7fe5 --- /dev/null +++ b/tools/testing/selftests/timens/timerfd.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +static int tclock_gettime(clock_t clockid, struct timespec *now) +{ + if (clockid == CLOCK_BOOTTIME_ALARM) + clockid = CLOCK_BOOTTIME; + return clock_gettime(clockid, now); +} + +int run_test(int clockid, struct timespec now) +{ + struct itimerspec new_value; + long long elapsed; + int fd, i; + + if (tclock_gettime(clockid, &now)) + return pr_perror("clock_gettime(%d)", clockid); + + for (i = 0; i < 2; i++) { + int flags = 0; + + new_value.it_value.tv_sec = 3600; + new_value.it_value.tv_nsec = 0; + new_value.it_interval.tv_sec = 1; + new_value.it_interval.tv_nsec = 0; + + if (i == 1) { + new_value.it_value.tv_sec += now.tv_sec; + new_value.it_value.tv_nsec += now.tv_nsec; + } + + fd = timerfd_create(clockid, 0); + if (fd == -1) + return pr_perror("timerfd_create(%d)", clockid); + + if (i == 1) + flags |= TFD_TIMER_ABSTIME; + + if (timerfd_settime(fd, flags, &new_value, NULL)) + return pr_perror("timerfd_settime(%d)", clockid); + + if (timerfd_gettime(fd, &new_value)) + return pr_perror("timerfd_gettime(%d)", clockid); + + elapsed = new_value.it_value.tv_sec; + if (abs(elapsed - 3600) > 60) { + ksft_test_result_fail("clockid: %d elapsed: %lld\n", + clockid, elapsed); + return 1; + } + + close(fd); + } + + ksft_test_result_pass("clockid=%d\n", clockid); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret, status, len, fd; + char buf[4096]; + pid_t pid; + struct timespec btime_now, mtime_now; + + nscheck(); + + ksft_set_plan(3); + + clock_gettime(CLOCK_MONOTONIC, &mtime_now); + clock_gettime(CLOCK_BOOTTIME, &btime_now); + + if (unshare_timens()) + return 1; + + len = snprintf(buf, sizeof(buf), "%d %d 0\n%d %d 0", + CLOCK_MONOTONIC, 70 * 24 * 3600, + 
CLOCK_BOOTTIME, 9 * 24 * 3600); + fd = open("/proc/self/timens_offsets", O_WRONLY); + if (fd < 0) + return pr_perror("/proc/self/timens_offsets"); + + if (write(fd, buf, len) != len) + return pr_perror("/proc/self/timens_offsets"); + + close(fd); + mtime_now.tv_sec += 70 * 24 * 3600; + btime_now.tv_sec += 9 * 24 * 3600; + + pid = fork(); + if (pid < 0) + return pr_perror("Unable to fork"); + if (pid == 0) { + ret = 0; +
[PATCHv7 25/33] x86/vdso: Zap vvar pages when switching to a time namespace
The VVAR page layout depends on whether a task belongs to the root or non-root time namespace. Whenever a task changes its namespace, the VVAR page tables are cleared and then they will re-faulted with a corresponding layout. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 27 +++ include/linux/time_namespace.h | 3 +++ kernel/time/namespace.c| 10 ++ 3 files changed, 40 insertions(+) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index d6cb8a16f368..57ada3e95f8d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -50,6 +50,7 @@ void __init init_vdso_image(const struct vdso_image *image) image->alt_len)); } +static const struct vm_special_mapping vvar_mapping; struct linux_binprm; static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, @@ -127,6 +128,32 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma) return NULL; } + +/* + * The vvar page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_data() for details. + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long size = vma->vm_end - vma->vm_start; + + if (vma_is_special_mapping(vma, &vvar_mapping)) + zap_page_range(vma, vma->vm_start, size); + } + + up_write(&mm->mmap_sem); + return 0; +} #else static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) { diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index c479cfda2c3e..dcf3dbf2836b 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -30,6 +30,9 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +extern int vdso_join_timens(struct task_struct *task, + struct time_namespace *ns); + static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { kref_get(&ns->kref); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index e14cd1ca387d..0dc0742ed1ee 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,6 +280,7 @@ static void timens_put(struct ns_common *ns) static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) { struct time_namespace *ns = to_time_ns(new); + int err; if (!current_is_single_threaded()) return -EUSERS; @@ -290,6 +291,10 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) timens_set_vvar_page(current, ns); + err = vdso_join_timens(current, ns); + if (err) + return err; + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -304,6 +309,7 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); + int err; /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) @@ -311,6 +317,10 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) timens_set_vvar_page(tsk, ns); + err = vdso_join_timens(tsk, ns); + if (err) + return err; + get_time_ns(ns); put_time_ns(nsproxy->time_ns); 
nsproxy->time_ns = ns; -- 2.23.0
[PATCHv7 22/33] time: Allocate per-timens vvar page
VDSO support for Time namespace needs to set up a page with the same layout as VVAR. That timens page will be placed on position of VVAR page inside namespace. That page contains time namespace clock offsets and it has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. Allocate the timens page during namespace creation. Setup the offsets when the first task enters the ns and freeze them to guarantee the pace of monotonic/boottime clocks and to avoid breakage of applications. The design decision is to have a global offset_lock which is used during namespace offsets set up and to freeze offsets when first task joins the new ns. That is better in terms of memory usage comparing to having per-ns mutex that's used only during the set up period. Suggested-by: Andy Lutomirski Based-on-work-by: Thomas Gleixner Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- include/linux/time_namespace.h | 3 + kernel/time/namespace.c| 103 - 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 772911945944..c479cfda2c3e 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -23,6 +23,9 @@ struct time_namespace { struct ucounts *ucounts; struct ns_common ns; struct timens_offsets offsets; + struct page *vvar_page; + /* Disallow changing offsets after any task joined namespace. */ + bool frozen_offsets; } __randomize_layout; extern struct time_namespace init_time_ns; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 1a0fbaa5d2d4..e14cd1ca387d 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -15,6 +15,7 @@ #include #include #include +#include ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) @@ -90,16 +91,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, kref_init(&ns->kref); + ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!ns->vvar_page) + goto fail_free; + err = ns_alloc_inum(&ns->ns); if (err) - goto fail_free; + goto fail_free_page; ns->ucounts = ucounts; ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; + ns->frozen_offsets = false; return ns; +fail_free_page: + __free_page(ns->vvar_page); fail_free: kfree(ns); fail_dec: @@ -128,6 +136,93 @@ struct time_namespace *copy_time_ns(unsigned long flags, return clone_time_ns(user_ns, old_ns); } +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_data->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check + * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. 
+ * + * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces + * the time namespace handling path. + */ +static void timens_setup_vdso_data(struct vdso_data *vdata, + struct time_namespace *ns) +{ + struct timens_offset *offset = vdata->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vdata->seq = 1; + vdata->clock_mode = VCLOCK_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM]= boottime; +} + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. + */ +static DEFINE_MUTEX(o
[PATCHv7 29/33] selftests/timens: Add a test for clock_nanosleep()
From: Andrei Vagin Check that clock_nanosleep() takes into account clock offsets. Output on success: 1..4 ok 1 clockid: 1 abs:0 ok 2 clockid: 1 abs:1 ok 3 clockid: 9 abs:0 ok 4 clockid: 9 abs:1 Output with lack of permissions: 1..4 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..4 not ok 1 # SKIP Time namespaces are not supported Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 4 +- .../selftests/timens/clock_nanosleep.c| 143 ++ 3 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/timens/clock_nanosleep.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index b609f6ee9fb9..9b6c8ddac2c8 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,2 +1,3 @@ +clock_nanosleep timens timerfd diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 66b90cd28e5c..801e7ab2f8bf 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,5 +1,5 @@ -TEST_GEN_PROGS := timens timerfd +TEST_GEN_PROGS := timens timerfd clock_nanosleep -CFLAGS := -Wall -Werror +CFLAGS := -Wall -Werror -pthread include ../lib.mk diff --git a/tools/testing/selftests/timens/clock_nanosleep.c b/tools/testing/selftests/timens/clock_nanosleep.c new file mode 100644 index ..0f4eab6e4669 --- /dev/null +++ b/tools/testing/selftests/timens/clock_nanosleep.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +void test_sig(int sig) { + if (sig == SIGUSR2) + pthread_exit(NULL); +} + +struct thread_args { + struct timespec *now, *rem; + pthread_mutex_t *lock; + int clockid; + int abs; +}; + +void *call_nanosleep(void *_args) +{ + struct thread_args *args = _args; + clock_nanosleep(args->clockid, args->abs ? TIMER_ABSTIME : 0, args->now, args->rem); + pthread_mutex_unlock(args->lock); + return NULL; +} + +int run_test(int clockid, int abs) +{ + struct timespec now = {}, rem; + struct thread_args args = { .now = &now, .rem = &rem, .clockid = clockid}; + struct timespec start; + pthread_mutex_t lock; + pthread_t thread; + int j, ok, ret; + + signal(SIGUSR1, test_sig); + signal(SIGUSR2, test_sig); + + pthread_mutex_init(&lock, NULL); + pthread_mutex_lock(&lock); + + if (clock_gettime(clockid, &start) == -1) + return pr_perror("clock_gettime"); + + + if (abs) { + now.tv_sec = start.tv_sec; + now.tv_nsec = start.tv_nsec; + } + + now.tv_sec += 3600; + args.abs = abs; + args.lock = &lock; + ret = pthread_create(&thread, NULL, call_nanosleep, &args); + if (ret != 0) { + pr_err("Unable to create a thread: %s", strerror(ret)); + return 1; + } + + /* Wait when the thread will call clock_nanosleep(). */ + ok = 0; + for (j = 0; j < 8; j++) { + /* The maximum timeout is about 5 seconds. */ + usleep(1 << j); + + /* Try to interrupt clock_nanosleep(). */ + pthread_kill(thread, SIGUSR1); + + usleep(1 << j); + /* Check whether clock_nanosleep() has been interrupted or not. 
*/ + if (pthread_mutex_trylock(&lock) == 0) { + /**/ + ok = 1; + break; + } + } + if (!ok) + pthread_kill(thread, SIGUSR2); + pthread_join(thread, NULL); + pthread_mutex_destroy(&lock); + + if (!ok) { + ksft_test_result_pass("clockid: %d abs:%d timeout\n", clockid, abs); + return 1; + } + + if (rem.tv_sec < 3300 || rem.tv_sec > 3900) { + pr_fail("clockid: %d abs: %d remain: %ld\n", + clockid, abs, rem.tv_sec); + return 1; + } + ksft_test_result_pass("clockid: %d abs:%d\n", clockid, abs); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret, nsfd; + + nscheck(); + + ksft_set_plan(4); + + if (unshare_timens()) + return 1; + + if (_settime(CLOCK_MONOTONIC, 7 * 24 * 3600)) + return 1; + if (_settime(CLOCK_BOOTTIME, 9 * 24 *
[PATCHv7 32/33] selftests/timens: Add a simple perf test for clock_gettime()
From: Andrei Vagin Output on success: 1..4 ok 1 host: clock: monotonic cycles: 148323947 ok 2 host: clock: boottime cycles: 148577503 ok 3 ns: clock: monotonic cycles: 137659217 ok 4 ns: clock: boottime cycles: 137959154 # Pass 4 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output with lack of permissions: 1..4 ok 1 host: clock: monotonic cycles: 145671139 ok 2 host: clock: boottime cycles: 146958357 not ok 3 # SKIP need to run as root Output without support of time namespaces: 1..4 ok 1 host: clock: monotonic cycles: 145671139 ok 2 host: clock: boottime cycles: 146958357 not ok 3 # SKIP Time namespaces are not supported Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- tools/testing/selftests/timens/.gitignore | 2 + tools/testing/selftests/timens/Makefile | 3 +- tools/testing/selftests/timens/gettime_perf.c | 91 +++ 3 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/gettime_perf.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 3b7eda8f35ce..16292e4d08a5 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,4 +1,6 @@ clock_nanosleep +gettime_perf +gettime_perf_cold procfs timens timer diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 08164548a49d..6aefcaccb8f4 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,6 +1,7 @@ TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs +TEST_GEN_PROGS_EXTENDED := gettime_perf CFLAGS := -Wall -Werror -pthread -LDFLAGS := -lrt +LDFLAGS := -lrt -ldl include ../lib.mk diff --git a/tools/testing/selftests/timens/gettime_perf.c b/tools/testing/selftests/timens/gettime_perf.c new file mode 100644 index ..3a6d9c485de5 --- /dev/null +++ b/tools/testing/selftests/timens/gettime_perf.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +typedef int (*vgettime_t)(clockid_t, struct timespec *); + +vgettime_t vdso_clock_gettime; + +static void fill_function_pointers(void) +{ + void *vdso = dlopen("linux-vdso.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-gate.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) { + pr_err("[WARN]\tfailed to find vDSO\n"); + return; + } + + vdso_clock_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); + if (!vdso_clock_gettime) + pr_err("Warning: failed to find clock_gettime in vDSO\n"); + +} + +static void test(clock_t clockid, char *clockstr, bool in_ns) +{ + struct timespec tp, start; + long i = 0; + const int timeout = 3; + + vdso_clock_gettime(clockid, &start); + tp = start; + for (tp = start; start.tv_sec + timeout > tp.tv_sec || +(start.tv_sec + timeout == tp.tv_sec && + start.tv_nsec > tp.tv_nsec); i++) { + vdso_clock_gettime(clockid, &tp); + } + + ksft_test_result_pass("%s:\tclock: %10s\tcycles:\t%10ld\n", + in_ns ? 
"ns" : "host", clockstr, i); +} + +int main(int argc, char *argv[]) +{ + time_t offset = 10; + int nsfd; + + ksft_set_plan(4); + + fill_function_pointers(); + + test(CLOCK_MONOTONIC, "monotonic", false); + test(CLOCK_BOOTTIME, "boottime", false); + + nscheck(); + + if (unshare_timens()) + return 1; + + nsfd = open("/proc/self/ns/time_for_children", O_RDONLY); + if (nsfd < 0) + return pr_perror("Can't open a time namespace"); + + if (_settime(CLOCK_MONOTONIC, offset)) + return 1; + if (_settime(CLOCK_BOOTTIME, offset)) + return 1; + + if (setns(nsfd, CLONE_NEWTIME)) + return pr_perror("setns"); + + test(CLOCK_MONOTONIC, "monotonic", true); + test(CLOCK_BOOTTIME, "boottime", true); + + ksft_exit_pass(); + return 0; +} -- 2.23.0
[PATCHv7 33/33] selftests/timens: Check for right timens offsets after fork and exec
From: Andrei Vagin Output on success: 1..1 ok 1 exec # Pass 1 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output on failure: 1..1 not ok 1 36016 16 Bail out! Output with lack of permissions: 1..1 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..1 not ok 1 # SKIP Time namespaces are not supported Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 2 +- tools/testing/selftests/timens/exec.c | 94 +++ 3 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/exec.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 16292e4d08a5..789f21e81028 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,4 +1,5 @@ clock_nanosleep +exec gettime_perf gettime_perf_cold procfs diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 6aefcaccb8f4..e9fb30bd8aeb 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,4 +1,4 @@ -TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs +TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec TEST_GEN_PROGS_EXTENDED := gettime_perf CFLAGS := -Wall -Werror -pthread diff --git a/tools/testing/selftests/timens/exec.c b/tools/testing/selftests/timens/exec.c new file mode 100644 index ..87b47b557a7a --- /dev/null +++ b/tools/testing/selftests/timens/exec.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +#define OFFSET (36000) + +int main(int argc, char *argv[]) +{ + struct timespec now, tst; + int status, i; + pid_t pid; + + if (argc > 1) { + if (sscanf(argv[1], "%ld", &now.tv_sec) != 1) + return pr_perror("sscanf"); + + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec) > 5) + return pr_fail("%ld %ld\n", now.tv_sec, tst.tv_sec); + } + return 0; + } + + nscheck(); + + ksft_set_plan(1); + + clock_gettime(CLOCK_MONOTONIC, &now); + + if (unshare_timens()) + return 1; + + if (_settime(CLOCK_MONOTONIC, OFFSET)) + return 1; + + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec) > 5) + return pr_fail("%ld %ld\n", + now.tv_sec, tst.tv_sec); + } + + if (argc > 1) + return 0; + + pid = fork(); + if (pid < 0) + return pr_perror("fork"); + + if (pid == 0) { + char now_str[64]; + char *cargv[] = {"exec", now_str, NULL}; + char *cenv[] = {NULL}; + + /* Check that a child process is in the new timens. */ + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec - OFFSET) > 5) + return pr_fail("%ld %ld\n", + now.tv_sec + OFFSET, tst.tv_sec); + } + + /* Check for proper vvar offsets after execve. */ + snprintf(now_str, sizeof(now_str), "%ld", now.tv_sec + OFFSET); + execve("/proc/self/exe", cargv, cenv); + return pr_perror("execve"); + } + + if (waitpid(pid, &status, 0) != pid) + return pr_perror("waitpid"); + + if (status) + ksft_exit_fail(); + + ksft_test_result_pass("exec\n"); + ksft_exit_pass(); + return 0; +} -- 2.23.0
[PATCHv7 30/33] selftests/timens: Add procfs selftest
Check that /proc/uptime is correct inside a new time namespace. Output on success: 1..1 ok 1 Passed for /proc/uptime # Pass 1 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output with lack of permissions: 1..1 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..1 not ok 1 # SKIP Time namespaces are not supported Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 2 +- tools/testing/selftests/timens/procfs.c | 144 ++ 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/procfs.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 9b6c8ddac2c8..94ffdd9cead7 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,3 +1,4 @@ clock_nanosleep +procfs timens timerfd diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 801e7ab2f8bf..e4913f2991d4 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,4 +1,4 @@ -TEST_GEN_PROGS := timens timerfd clock_nanosleep +TEST_GEN_PROGS := timens timerfd clock_nanosleep procfs CFLAGS := -Wall -Werror -pthread diff --git a/tools/testing/selftests/timens/procfs.c b/tools/testing/selftests/timens/procfs.c new file mode 100644 index ..43d93f4006b9 --- /dev/null +++ b/tools/testing/selftests/timens/procfs.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +/* + * Test shouldn't be run for a day, so add 10 days to child + * time and check parent's time to be in the same day. 
+ */ +#define MAX_TEST_TIME_SEC (60*5) +#define DAY_IN_SEC (60*60*24) +#define TEN_DAYS_IN_SEC(10*DAY_IN_SEC) + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +static int child_ns, parent_ns; + +static int switch_ns(int fd) +{ + if (setns(fd, CLONE_NEWTIME)) + return pr_perror("setns()"); + + return 0; +} + +static int init_namespaces(void) +{ + char path[] = "/proc/self/ns/time_for_children"; + struct stat st1, st2; + + parent_ns = open(path, O_RDONLY); + if (parent_ns <= 0) + return pr_perror("Unable to open %s", path); + + if (fstat(parent_ns, &st1)) + return pr_perror("Unable to stat the parent timens"); + + if (unshare_timens()) + return -1; + + child_ns = open(path, O_RDONLY); + if (child_ns <= 0) + return pr_perror("Unable to open %s", path); + + if (fstat(child_ns, &st2)) + return pr_perror("Unable to stat the timens"); + + if (st1.st_ino == st2.st_ino) + return pr_err("The same child_ns after CLONE_NEWTIME"); + + if (_settime(CLOCK_BOOTTIME, TEN_DAYS_IN_SEC)) + return -1; + + return 0; +} + +static int read_proc_uptime(struct timespec *uptime) +{ + unsigned long up_sec, up_nsec; + FILE *proc; + + proc = fopen("/proc/uptime", "r"); + if (proc == NULL) { + pr_perror("Unable to open /proc/uptime"); + return -1; + } + + if (fscanf(proc, "%lu.%02lu", &up_sec, &up_nsec) != 2) { + if (errno) { + pr_perror("fscanf"); + return -errno; + } + pr_err("failed to parse /proc/uptime"); + return -1; + } + fclose(proc); + + uptime->tv_sec = up_sec; + uptime->tv_nsec = up_nsec; + return 0; +} + +static int check_uptime(void) +{ + struct timespec uptime_new, uptime_old; + time_t uptime_expected; + double prec = MAX_TEST_TIME_SEC; + + if (switch_ns(parent_ns)) + return pr_err("switch_ns(%d)", parent_ns); + + if (read_proc_uptime(&uptime_old)) + return 1; + + if (switch_ns(child_ns)) + return pr_err("switch_ns(%d)", child_ns); + + if (read_proc_uptime(&uptime_new)) + return 1; + + uptime_expected = uptime_old.tv_sec + TEN_DAYS_IN_SEC; + if (fabs(difftime(uptime_new.tv_sec, uptime_expected)) > prec) { + pr_fail("uptime in /proc/uptime: old %ld, new %ld [%ld]", + uptime_old.tv_sec, uptime_new.tv_sec, + uptim
[PATCHv7 14/33] hrtimers: Prepare hrtimer_nanosleep() for time namespaces
From: Andrei Vagin clock_nanosleep() accepts absolute values of expiration time when TIMER_ABSTIME flag is set. This absolute value is inside the task's time namespace, and has to be converted to the host's time. There is timens_ktime_to_host() helper for converting time, but it accepts ktime argument. As a preparation, make hrtimer_nanosleep() accept a clock value in ktime instead of timespec64. Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- include/linux/hrtimer.h| 2 +- kernel/time/hrtimer.c | 8 kernel/time/posix-stubs.c | 4 ++-- kernel/time/posix-timers.c | 4 +++- tools/perf/examples/bpf/5sec.c | 6 -- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1b9a51a1bccb..cdcb2e9cd54a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -502,7 +502,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, /* Precise sleep: */ extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); -extern long hrtimer_nanosleep(const struct timespec64 *rqtp, +extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0d4dc241c0fb..19cc504bd7cc 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1907,7 +1907,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) return ret; } -long hrtimer_nanosleep(const struct timespec64 *rqtp, +long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; @@ -1920,7 +1920,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); + hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@ -1955,7 +1955,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif @@ -1975,7 +1975,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index edaf075d1ee4..2ccefc9ce184 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -147,7 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -233,7 +233,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? 
TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 6e350cc8f600..c0ae1f6d2add 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1221,7 +1221,9 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { - return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? + ktime_t texp = timespec64_to_ktime(*rqtp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_R
[PATCHv7 06/33] alarmtimer: Provide get_timespec() callback
From: Andrei Vagin The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() Wire up alarm bases with get_timespec(). Suggested-by: Thomas Gleixner Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- include/linux/posix-timers.h | 3 +++ kernel/time/alarmtimer.c | 8 ++-- kernel/time/posix-timers.c | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 3d10c84a97a9..d535d52eb3ca 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -230,4 +230,7 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); void posixtimer_rearm(struct kernel_siginfo *info); + +int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp); +int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp); #endif diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 22b6f9b133b2..73a5458194c7 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -37,12 +37,15 @@ * @lock: Lock for syncrhonized access to the base * @timerqueue:Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base + * @get_timespec: Function to read the namespace time correlating to the base * @base_clockid: clockid for the base */ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; ktime_t (*get_ktime)(void); + int (*get_timespec)(const clockid_t which_clock, + struct timespec64 *tp); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -670,8 +673,7 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec64(base->get_ktime()); - return 0; + return base->get_timespec(base->base_clockid, tp); } /** @@ -883,8 +885,10 @@ static int __init alarmtimer_init(void) /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; + alarm_bases[ALARM_REALTIME].get_timespec = posix_get_realtime_timespec, alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; + alarm_bases[ALARM_BOOTTIME].get_timespec = posix_get_boottime_timespec; for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 68d4690cc225..e65241a46038 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -165,7 +165,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) +int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); return 0; @@ -222,7 +222,7 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) +int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); return 0; -- 2.23.0
[PATCHv7 10/33] kernel: Add do_timens_ktime_to_host() helper
From: Andrei Vagin The helper subtracts namespace's clock offset from the given time and checks that the result is in [0, KTIME_MAX]. Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- include/linux/time_namespace.h | 14 + kernel/time/namespace.c| 36 ++ 2 files changed, 50 insertions(+) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 3d429c7ecca5..9a77d3854830 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -57,6 +57,15 @@ static inline void timens_add_boottime(struct timespec64 *ts) *ts = timespec64_add(*ts, ns_offsets->boottime); } +ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, + struct timens_offsets *offsets); +static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) +{ + struct timens_offsets *offsets = &current->nsproxy->time_ns->offsets; + + return do_timens_ktime_to_host(clockid, tim, offsets); +} + #else static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { @@ -83,6 +92,11 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *ts static inline void timens_add_monotonic(struct timespec64 *ts) {} static inline void timens_add_boottime(struct timespec64 *ts) {} + +static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) +{ + return tim; +} #endif #endif /* _LINUX_TIMENS_H */ diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index c2a58e45fc4b..1a0fbaa5d2d4 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -16,6 +16,42 @@ #include #include +ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, + struct timens_offsets *ns_offsets) +{ + ktime_t offset; + + switch (clockid) { + case CLOCK_MONOTONIC: + offset = timespec64_to_ktime(ns_offsets->monotonic); + break; + case CLOCK_BOOTTIME: + case CLOCK_BOOTTIME_ALARM: + offset = timespec64_to_ktime(ns_offsets->boottime); + break; + default: + return tim; + } + + /* +* Check that @tim value is in [offset, KTIME_MAX + offset] +* and subtract offset. +*/ + if (tim < offset) { + /* +* User can specify @tim *absolute* value - if it's lesser than +* the time namespace's offset - it's already expired. +*/ + tim = 0; + } else { + tim = ktime_sub(tim, offset); + if (unlikely(tim > KTIME_MAX)) + tim = KTIME_MAX; + } + + return tim; +} + static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); -- 2.23.0
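A standalone sketch (not part of the patch) restating the clamping rule from the commit message with sample numbers; ns_to_host() and SAMPLE_KTIME_MAX are illustrative stand-ins, not kernel symbols. With the namespace clock 10 seconds ahead of the host, a namespace-absolute value below the offset maps to 0 (already expired on the host); otherwise the offset is subtracted and the result is capped.

#include <stdint.h>
#include <stdio.h>

#define SAMPLE_KTIME_MAX INT64_MAX	/* stand-in for the kernel's KTIME_MAX */

static int64_t ns_to_host(int64_t tim, int64_t offset)
{
	if (tim < offset)
		return 0;		/* earlier than the offset: already expired on the host */

	tim -= offset;
	if (tim > SAMPLE_KTIME_MAX)	/* mirrors the kernel's upper-bound check */
		tim = SAMPLE_KTIME_MAX;

	return tim;
}

int main(void)
{
	/* The namespace clock runs 10 seconds (in nanoseconds) ahead of the host. */
	int64_t offset = 10LL * 1000000000LL;

	printf("%lld\n", (long long)ns_to_host(5LL * 1000000000LL, offset));	/* 0 */
	printf("%lld\n", (long long)ns_to_host(15LL * 1000000000LL, offset));	/* 5000000000 */
	return 0;
}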
[PATCHv7 13/33] alarmtimer: Make nanosleep time namespace aware
From: Andrei Vagin clock_nanosleep() accepts absolute values of expiration time when TIMER_ABSTIME flag is set. This absolute value is inside the task's time namespace, and has to be converted to the host's time. Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/alarmtimer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c326427bb4cb..353e46f9acc2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -838,6 +838,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, ktime_t now = alarm_bases[type].get_ktime(); exp = ktime_add_safe(now, exp); + } else { + exp = timens_ktime_to_host(which_clock, exp); } ret = alarmtimer_do_nsleep(&alarm, exp, type); -- 2.23.0
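A hedged userspace illustration of the case handled above (this program is an example, not part of the series, and sleeping on an ALARM clock requires CAP_WAKE_ALARM): the absolute expiry is computed from the namespace's view of CLOCK_BOOTTIME_ALARM, so the kernel has to translate it to host time before arming the alarm.

#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec exp;

	/* Namespace view of the clock... */
	if (clock_gettime(CLOCK_BOOTTIME_ALARM, &exp))
		return 1;
	/* ...turned into an absolute expiry two seconds from now. */
	exp.tv_sec += 2;

	/* Without timens_ktime_to_host() this would fire at the wrong time
	 * whenever the namespace carries a boottime offset. */
	if (clock_nanosleep(CLOCK_BOOTTIME_ALARM, TIMER_ABSTIME, &exp, NULL))
		return 1;
	puts("slept until the requested absolute time");
	return 0;
}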
[PATCHv7 04/33] posix-clocks: Rename .clock_get_timespec() callbacks accordingly
From: Andrei Vagin The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() That adds a valid reason to finally implement a separate callback which returns the time in ktime_t format in (struct k_clock). As a preparation ground for introducing clock_get_ktime(), the original callback clock_get() was renamed into clock_get_timespec(). Reflect the renaming into callbacks realizations. Suggested-by: Thomas Gleixner Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/alarmtimer.c | 6 +++--- kernel/time/posix-timers.c | 16 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8523df726fee..62b06cfa710d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -657,13 +657,13 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp } /** - * alarm_clock_get - posix clock_get_timespec interface + * alarm_clock_get_timespec - posix clock_get_timespec interface * @which_clock: clockid * @tp: timespec to fill. * * Provides the underlying alarm base time. */ -static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) +static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; @@ -837,7 +837,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, - .clock_get_timespec = alarm_clock_get, + .clock_get_timespec = alarm_clock_get_timespec, .timer_create = alarm_timer_create, .timer_set = common_timer_set, .timer_del = common_timer_del, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 44d4f9cb782d..68d4690cc225 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -165,7 +165,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); return 0; @@ -187,7 +187,7 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, /* * Get monotonic time for posix timers */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); return 0; @@ -222,13 +222,13 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) +static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; @@ -1261,7 +1261,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_clock_realtime_get, + .clock_get_timespec = posix_get_realtime_timespec, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, 
.nsleep = common_nsleep, @@ -1279,7 +1279,7 @@ static const struct k_clock clock_realtime = { static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_ktime_get_ts, + .clock_get_timespec = posix_get_monotonic_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1310,7 +1310,7 @@ static const struct k_clock clock_monotonic_coarse = { static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_get_tai, + .clock_get_timespec = posix_get_tai_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1326,7 +1326,7 @@ static const struct k_clock clock_tai = { static const struct k_clock clock_boottime = { .clo
[PATCHv7 26/33] fs/proc: Introduce /proc/pid/timens_offsets
From: Andrei Vagin API to set time namespace offsets for children processes, i.e.: echo "clockid off_ses off_nsec" > /proc/self/timens_offsets Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- fs/proc/base.c | 95 +++ include/linux/time_namespace.h | 10 kernel/time/namespace.c| 100 + 3 files changed, 205 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index ebea9501afb8..1d2007365e87 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -1533,6 +1534,97 @@ static const struct file_operations proc_pid_sched_autogroup_operations = { #endif /* CONFIG_SCHED_AUTOGROUP */ +#ifdef CONFIG_TIME_NS +static int timens_offsets_show(struct seq_file *m, void *v) +{ + struct task_struct *p; + + p = get_proc_task(file_inode(m->file)); + if (!p) + return -ESRCH; + proc_timens_show_offsets(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +timens_offsets_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file_inode(file); + struct proc_timens_offset offsets[2]; + char *kbuf = NULL, *pos, *next_line; + struct task_struct *p; + int ret, noffsets; + + /* Only allow < page size writes at the beginning of the file */ + if ((*ppos != 0) || (count >= PAGE_SIZE)) + return -EINVAL; + + /* Slurp in the user data */ + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Parse the user data */ + ret = -EINVAL; + noffsets = 0; + for (pos = kbuf; pos; pos = next_line) { + struct proc_timens_offset *off = &offsets[noffsets]; + int err; + + /* Find the end of line and ensure we don't look past it */ + next_line = strchr(pos, '\n'); + if (next_line) { + *next_line = '\0'; + next_line++; + if (*next_line == '\0') + next_line = NULL; + } + + err = sscanf(pos, "%u %lld %lu", &off->clockid, + &off->val.tv_sec, &off->val.tv_nsec); + if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) + goto out; + noffsets++; + if (noffsets == ARRAY_SIZE(offsets)) { + if (next_line) + count = next_line - kbuf; + break; + } + } + + ret = -ESRCH; + p = get_proc_task(inode); + if (!p) + goto out; + ret = proc_timens_set_offset(file, p, offsets, noffsets); + put_task_struct(p); + if (ret) + goto out; + + ret = count; +out: + kfree(kbuf); + return ret; +} + +static int timens_offsets_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timens_offsets_show, inode); +} + +static const struct file_operations proc_timens_offsets_operations = { + .open = timens_offsets_open, + .read = seq_read, + .write = timens_offsets_write, + .llseek = seq_lseek, + .release= single_release, +}; +#endif /* CONFIG_TIME_NS */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -3015,6 +3107,9 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif +#ifdef CONFIG_TIME_NS + REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index dcf3dbf2836b..7cc80051cd17 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -50,6 +50,16 @@ static inline void put_time_ns(struct time_namespace *ns) kref_put(&ns->kref, 
free_time_ns); } +extern void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m); + +struct proc_timens_offset { + int clockid; + struct timespec64 val; +}; + +extern int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int n); + static inline void timens_add_monoto
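For illustration of the interface above: the parser takes a numeric clockid, seconds and nanoseconds per line, at most two lines per write, and the write has to happen before the first process is born in the namespace. With the usual uapi clockid values (CLOCK_MONOTONIC = 1, CLOCK_BOOTTIME = 7) and purely example offsets, setting both clocks a day and a week ahead for future children could look like:

	printf '1 86400 0\n7 604800 0\n' > /proc/self/timens_offsets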
[PATCHv7 16/33] fs/proc: Respect boottime inside time namespace for /proc/uptime
Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- fs/proc/uptime.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index a4c2791ab70b..5a1b228964fb 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -5,6 +5,7 @@ #include #include #include +#include #include static int uptime_proc_show(struct seq_file *m, void *v) @@ -20,6 +21,8 @@ static int uptime_proc_show(struct seq_file *m, void *v) nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; ktime_get_boottime_ts64(&uptime); + timens_add_boottime(&uptime); + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", -- 2.23.0
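An illustrative comparison of the visible effect (the numbers are invented and assume a +86400s boottime offset in the namespace): only the first field, the boottime-based uptime, changes.

	host$       cat /proc/uptime   ->  1234.56 2469.12
	namespace$  cat /proc/uptime   -> 87634.56 2469.12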
[PATCHv7 21/33] x86/vdso: Add timens page
To support time namespaces in the VDSO with a minimal impact on regular non time namespace affected tasks, the namespace handling needs to be hidden in a slow path. The most obvious place is vdso_seq_begin(). If a task belongs to a time namespace then the VVAR page which contains the system wide VDSO data is replaced with a namespace specific page which has the same layout as the VVAR page. That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. The extra check in the case that vdso_data->seq is odd, e.g. a concurrent update of the VDSO data is in progress, is not really affecting regular tasks which are not part of a time namespace as the task is spin waiting for the update to finish and vdso_data->seq to become even again. If a time namespace task hits that code path, it invokes the corresponding time getter function which retrieves the real VVAR page, reads host time and then adds the offset for the requested clock which is stored in the special VVAR page. Allocate Timens page among VVAR pages and place vdso_data on it. Provide __arch_get_timens_vdso_data() helper for VDSO code to get the code-relative position of VVARs on that special page. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- arch/x86/Kconfig | 1 + arch/x86/entry/vdso/vdso-layout.lds.S| 11 +-- arch/x86/entry/vdso/vdso2c.c | 3 +++ arch/x86/include/asm/vdso.h | 1 + arch/x86/include/asm/vdso/gettimeofday.h | 9 + arch/x86/include/asm/vvar.h | 5 - 6 files changed, 27 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d6e1faa28c58..15f076eb2b1c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -226,6 +226,7 @@ config X86 select VIRT_TO_BUS select X86_FEATURE_NAMESif PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS + select VDSO_TIMENS if TIME_NS config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 2330daad67c3..ea7e0155c604 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -16,8 +16,8 @@ SECTIONS * segment. */ - vvar_start = . - 3 * PAGE_SIZE; - vvar_page = vvar_start; + vvar_start = . - 4 * PAGE_SIZE; + vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ #define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; @@ -26,6 +26,13 @@ SECTIONS pvclock_page = vvar_start + PAGE_SIZE; hvclock_page = vvar_start + 2 * PAGE_SIZE; + timens_page = vvar_start + 3 * PAGE_SIZE; + +#undef _ASM_X86_VVAR_H + /* Place all vvars in timens too at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) timens_ ## name = timens_page + offset; +#include +#undef EMIT_VVAR . 
= SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 3a4d8d4d39f8..3842873b3ae3 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -75,12 +75,14 @@ enum { sym_vvar_page, sym_pvclock_page, sym_hvclock_page, + sym_timens_page, }; const int special_pages[] = { sym_vvar_page, sym_pvclock_page, sym_hvclock_page, + sym_timens_page, }; struct vdso_sym { @@ -93,6 +95,7 @@ struct vdso_sym required_syms[] = { [sym_vvar_page] = {"vvar_page", true}, [sym_pvclock_page] = {"pvclock_page", true}, [sym_hvclock_page] = {"hvclock_page", true}, + [sym_timens_page] = {"timens_page", true}, {"VDSO32_NOTE_MASK", true}, {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 230474e2ddb5..bbcdc7b8f963 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -21,6 +21,7 @@ struct vdso_image { long sym_vvar_page; long sym_pvclock_page; long sym_hvclock_page; + long sym_timens_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index e9ee139cf29e..39a551df4fea 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -21,6 +21,7 @@ #include #define __vdso_data (VVAR(_vdso_data)) +#define __timens_vdso_data (TIMENS(_vdso_data)) #define VDSO_HAS_TIME 1 @@ -56,6 +57,14 @@ extern struct ms_hyperv_tsc_page hvclock_page __attribute__((visibility("hidden&qu
[PATCHv7 09/33] posix-clocks: Wire up clock_gettime() with timens offsets
From: Andrei Vagin Adjust monotonic and boottime clocks with per-timens offsets. As the result a process inside time namespace will see timers and clocks corrected to offsets that were set on creating namespace. Note that applications usually go through vDSO to get time, which is not yet adjusted. Further changes complete time namespace virtualisation with vDSO support. Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/alarmtimer.c | 1 + kernel/time/posix-stubs.c | 3 +++ kernel/time/posix-timers.c | 5 + 3 files changed, 9 insertions(+) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9415c83f8cca..c326427bb4cb 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "posix-timers.h" diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 67df65f887ac..edaf075d1ee4 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER @@ -77,9 +78,11 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) break; case CLOCK_MONOTONIC: ktime_get_ts64(tp); + timens_add_monotonic(tp); break; case CLOCK_BOOTTIME: ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); break; default: return -EINVAL; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 47a8d43fe1c6..d8b5bd4cbae1 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "timekeeping.h" #include "posix-timers.h" @@ -195,6 +196,7 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -209,6 +211,7 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -223,6 +226,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -235,6 +239,7 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); return 0; } -- 2.23.0
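The intermediate state described above can be observed from userspace with a small sketch (an illustration, not part of the series): inside a time namespace, at this point in the series, the raw syscall already applies the namespace offset while the vDSO path does not, so the two values below may disagree until the later vDSO patches; outside a namespace they match.

#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	struct timespec vdso_ts, sys_ts;

	clock_gettime(CLOCK_MONOTONIC, &vdso_ts);              /* usually the vDSO path */
	syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &sys_ts);  /* always enters the kernel */

	printf("vDSO:    %lld s\nsyscall: %lld s\n",
	       (long long)vdso_ts.tv_sec, (long long)sys_ts.tv_sec);
	return 0;
}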
[PATCHv7 27/33] selftests/timens: Add Time Namespace test for supported clocks
A test to check that all supported clocks work on host and inside a new time namespace. Use both ways to get time: through VDSO and by entering the kernel with implicit syscall. Introduce a new timens directory in selftests framework for the next timens tests. Output on success: 1..10 ok 1 Passed for CLOCK_BOOTTIME (syscall) ok 2 Passed for CLOCK_BOOTTIME (vdso) ok 3 Passed for CLOCK_BOOTTIME_ALARM (syscall) ok 4 Passed for CLOCK_BOOTTIME_ALARM (vdso) ok 5 Passed for CLOCK_MONOTONIC (syscall) ok 6 Passed for CLOCK_MONOTONIC (vdso) ok 7 Passed for CLOCK_MONOTONIC_COARSE (syscall) ok 8 Passed for CLOCK_MONOTONIC_COARSE (vdso) ok 9 Passed for CLOCK_MONOTONIC_RAW (syscall) ok 10 Passed for CLOCK_MONOTONIC_RAW (vdso) # Pass 10 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output with lack of permissions: 1..10 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..10 not ok 1 # SKIP Time namespaces are not supported Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 5 + tools/testing/selftests/timens/config | 1 + tools/testing/selftests/timens/log.h | 26 +++ tools/testing/selftests/timens/timens.c | 185 ++ tools/testing/selftests/timens/timens.h | 73 + 7 files changed, 292 insertions(+) create mode 100644 tools/testing/selftests/timens/.gitignore create mode 100644 tools/testing/selftests/timens/Makefile create mode 100644 tools/testing/selftests/timens/config create mode 100644 tools/testing/selftests/timens/log.h create mode 100644 tools/testing/selftests/timens/timens.c create mode 100644 tools/testing/selftests/timens/timens.h diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 4cdbae6f4e61..f6c3329946a6 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -47,6 +47,7 @@ TARGETS += splice TARGETS += static_keys TARGETS += sync TARGETS += sysctl +TARGETS += timens ifneq (1, $(quicktest)) TARGETS += timers endif diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore new file mode 100644 index ..27a693229ce1 --- /dev/null +++ b/tools/testing/selftests/timens/.gitignore @@ -0,0 +1 @@ +timens diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile new file mode 100644 index ..b877efb78974 --- /dev/null +++ b/tools/testing/selftests/timens/Makefile @@ -0,0 +1,5 @@ +TEST_GEN_PROGS := timens + +CFLAGS := -Wall -Werror + +include ../lib.mk diff --git a/tools/testing/selftests/timens/config b/tools/testing/selftests/timens/config new file mode 100644 index ..4480620f6f49 --- /dev/null +++ b/tools/testing/selftests/timens/config @@ -0,0 +1 @@ +CONFIG_TIME_NS=y diff --git a/tools/testing/selftests/timens/log.h b/tools/testing/selftests/timens/log.h new file mode 100644 index ..db64df2a8483 --- /dev/null +++ b/tools/testing/selftests/timens/log.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SELFTEST_TIMENS_LOG_H__ +#define __SELFTEST_TIMENS_LOG_H__ + +#define pr_msg(fmt, lvl, ...) \ + ksft_print_msg("[%s] (%s:%d)\t" fmt "\n", \ + lvl, __FILE__, __LINE__, ##__VA_ARGS__) + +#define pr_p(func, fmt, ...) func(fmt ": %m", ##__VA_ARGS__) + +#define pr_err(fmt, ...) \ + ({ \ + ksft_test_result_error(fmt "\n", ##__VA_ARGS__); \ + -1; \ + }) + +#define pr_fail(fmt, ...) 
\ + ({ \ + ksft_test_result_fail(fmt, ##__VA_ARGS__); \ + -1; \ + }) + +#define pr_perror(fmt, ...)pr_p(pr_err, fmt, ##__VA_ARGS__) + +#endif diff --git a/tools/testing/selftests/timens/timens.c b/tools/testing/selftests/timens/timens.c new file mode 100644 index ..9fc362d5a168 --- /dev/null +++ b/tools/testing/selftests/timens/timens.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +/* + * Test shouldn't be run for a day, so add 10 days to child + * time and check parent's time to be in the same day. + */ +#define DAY_IN_SEC (60*60*24) +#define TEN_DAYS_I
[PATCHv7 23/33] x86/vdso: Handle faults on timens page
If a task belongs to a time namespace then the VVAR page which contains the system wide VDSO data is replaced with a namespace specific page which has the same layout as the VVAR page. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 53 +-- mm/mmap.c | 2 ++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 5dab706aca2e..f6e13ab29d94 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -14,11 +14,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -107,10 +109,36 @@ static int vvar_mremap(const struct vm_special_mapping *sm, return 0; } +#ifdef CONFIG_TIME_NS +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* +* VM_PFNMAP | VM_IO protect .fault() handler from being called +* through interfaces like /proc/$pid/mem or +* process_vm_{readv,writev}() as long as there's no .access() +* in special_mapping_vmops(). +* For more details check_vma_flags() and __access_remote_vm() +*/ + + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} +#else +static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + return NULL; +} +#endif + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { const struct vdso_image *image = vma->vm_mm->context.vdso_image; + unsigned long pfn; long sym_offset; if (!image) @@ -130,8 +158,21 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; if (sym_offset == image->sym_vvar_page) { - return vmf_insert_pfn(vma, vmf->address, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT); + struct page *timens_page = find_timens_vvar_page(vma); + + pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; + + /* +* If a task belongs to a time namespace then a namespace +* specific VVAR is mapped with the sym_vvar_page offset and +* the real VVAR page is mapped with the sym_timens_page +* offset. +* See also the comment near timens_setup_vdso_data(). +*/ + if (timens_page) + pfn = page_to_pfn(timens_page); + + return vmf_insert_pfn(vma, vmf->address, pfn); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_get_pvti_cpu0_va(); @@ -146,6 +187,14 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) return vmf_insert_pfn(vma, vmf->address, virt_to_phys(tsc_pg) >> PAGE_SHIFT); + } else if (sym_offset == image->sym_timens_page) { + struct page *timens_page = find_timens_vvar_page(vma); + + if (!timens_page) + return VM_FAULT_SIGBUS; + + pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; + return vmf_insert_pfn(vma, vmf->address, pfn); } return VM_FAULT_SIGBUS; diff --git a/mm/mmap.c b/mm/mmap.c index a7d8c84d19b7..af722a47db3c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3363,6 +3363,8 @@ static const struct vm_operations_struct special_mapping_vmops = { .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, + /* vDSO code relies that VVAR can't be accessed remotely */ + .access = NULL, }; static const struct vm_operations_struct legacy_special_mapping_vmops = { -- 2.23.0
[PATCHv7 05/33] alarmtimer: Rename gettime() callback to get_ktime()
From: Andrei Vagin The upcoming support for time namespaces requires to have access to: - The time in a tasks time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() struct alarm_base needs to follow the same name convention, so rename .gettime() callback into get_ktime() as a preparation for introducing get_timespec(). Suggested-by: Thomas Gleixner Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/alarmtimer.c | 34 +- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 62b06cfa710d..22b6f9b133b2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -36,13 +36,13 @@ * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base * @timerqueue:Timerqueue head managing the list of events - * @gettime: Function to read the time correlating to the base + * @get_ktime: Function to read the time correlating to the base * @base_clockid: clockid for the base */ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; - ktime_t (*gettime)(void); + ktime_t (*get_ktime)(void); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -207,7 +207,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) spin_unlock_irqrestore(&base->lock, flags); if (alarm->function) - restart = alarm->function(alarm, base->gettime()); + restart = alarm->function(alarm, base->get_ktime()); spin_lock_irqsave(&base->lock, flags); if (restart != ALARMTIMER_NORESTART) { @@ -217,7 +217,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_fired(alarm, base->gettime()); + trace_alarmtimer_fired(alarm, base->get_ktime()); return ret; } @@ -225,7 +225,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) ktime_t alarm_expires_remaining(const struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - return ktime_sub(alarm->node.expires, base->gettime()); + return ktime_sub(alarm->node.expires, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_expires_remaining); @@ -270,7 +270,7 @@ static int alarmtimer_suspend(struct device *dev) spin_unlock_irqrestore(&base->lock, flags); if (!next) continue; - delta = ktime_sub(next->expires, base->gettime()); + delta = ktime_sub(next->expires, base->get_ktime()); if (!min || (delta < min)) { expires = next->expires; min = delta; @@ -364,7 +364,7 @@ void alarm_start(struct alarm *alarm, ktime_t start) hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_start(alarm, base->gettime()); + trace_alarmtimer_start(alarm, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_start); @@ -377,7 +377,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - start = ktime_add_safe(start, base->gettime()); + start = ktime_add_safe(start, base->get_ktime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -414,7 +414,7 @@ int alarm_try_to_cancel(struct alarm *alarm) alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_cancel(alarm, base->gettime()); + trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; } EXPORT_SYMBOL_GPL(alarm_try_to_cancel); @@ -474,7 +474,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { struct 
alarm_base *base = &alarm_bases[alarm->type]; - return alarm_forward(alarm, base->gettime(), interval); + return alarm_forward(alarm, base->get_ktime(), interval); } EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -500,7 +500,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) return; } - delta = ktime_sub(absexp, base->gettime()); + delta = ktime_sub(absexp, base->get_ktime()); spin_lock_irqsave(&freezer_delta_lock, flags); if (!freezer_delta || (delta < freezer_delta)) { @@ -632,7 +632,7 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_
[PATCHv7 03/33] posix-clocks: Rename the clock_get() callback to clock_get_timespec()
From: Andrei Vagin The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() That adds a valid reason to finally implement a separate callback which returns the time in ktime_t format, rather than in (struct timespec). Rename the clock_get() callback to clock_get_timespec() as a preparation for introducing clock_get_ktime(). Suggested-by: Thomas Gleixner Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/alarmtimer.c | 4 ++-- kernel/time/posix-clock.c | 8 kernel/time/posix-cpu-timers.c | 32 kernel/time/posix-timers.c | 22 +++--- kernel/time/posix-timers.h | 4 ++-- 5 files changed, 35 insertions(+), 35 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 451f9d05ccfe..8523df726fee 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -657,7 +657,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp } /** - * alarm_clock_get - posix clock_get interface + * alarm_clock_get - posix clock_get_timespec interface * @which_clock: clockid * @tp: timespec to fill. * @@ -837,7 +837,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, + .clock_get_timespec = alarm_clock_get, .timer_create = alarm_timer_create, .timer_set = common_timer_set, .timer_del = common_timer_del, diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index ec960bb939fd..c8f9c9b1cd82 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -315,8 +315,8 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) } const struct k_clock clock_posix_dynamic = { - .clock_getres = pc_clock_getres, - .clock_set = pc_clock_settime, - .clock_get = pc_clock_gettime, - .clock_adj = pc_clock_adjtime, + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get_timespec = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 92a431981b1c..c84ee50e2eab 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1391,26 +1391,26 @@ static int thread_cpu_timer_create(struct k_itimer *timer) } const struct k_clock clock_posix_cpu = { - .clock_getres = posix_cpu_clock_getres, - .clock_set = posix_cpu_clock_set, - .clock_get = posix_cpu_clock_get, - .timer_create = posix_cpu_timer_create, - .nsleep = posix_cpu_nsleep, - .timer_set = posix_cpu_timer_set, - .timer_del = posix_cpu_timer_del, - .timer_get = posix_cpu_timer_get, - .timer_rearm= posix_cpu_timer_rearm, + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get_timespec = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, + .timer_rearm= posix_cpu_timer_rearm, }; const struct k_clock clock_process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, + .clock_getres = process_cpu_clock_getres, + .clock_get_timespec = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, }; const struct k_clock 
clock_thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, + .clock_getres = thread_cpu_clock_getres, + .clock_get_timespec = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, }; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0ec5b7a1d769..44d4f9cb782d 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -667,7 +667,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) * The timespec64 based conversion is suboptimal, but it's not * worth to implement yet anothe
[PATCHv7 00/33] kernel: Introduce Time Namespace
. Cc: Adrian Reber Cc: Andrei Vagin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: "Eric W. Biederman" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Jeff Dike Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Shuah Khan Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: contain...@lists.linux-foundation.org Cc: c...@openvz.org Cc: linux-...@vger.kernel.org Cc: x...@kernel.org v7 on github (if someone prefers `git pull` to `git am`): https://github.com/0x7f454c46/linux/tree/timens-v7 v6: https://lkml.kernel.org/r/20190815163836.2927-1-d...@arista.com v5: https://lkml.kernel.org/r/20190729215758.28405-1-d...@arista.com v4: https://lkml.kernel.org/r/20190612192628.23797-1-d...@arista.com v3: https://lkml.kernel.org/r/20190425161416.26600-1-d...@arista.com v2: https://lore.kernel.org/lkml/20190206001107.16488-1-d...@arista.com/ RFC: https://lkml.kernel.org/r/20180919205037.9574-1-d...@arista.com/ v4..v5 Changes: * Rebased over generic vdso (already in master) * Addressing review comments by Thomas Gleixner (thanks much for your time and patience): - Dropping `timens` prefix from subjects (it's not a subsystem) - Keeping commit messages in a neutral technical form - Splitting unreasonably large patches - Document code with missing comments - Dropped dead code that's not compiled with !CONFIG_TIME_NS * Updated performance results [here, at the bottom] * Split vdso jump tables patch * Allow unshare() with many threads: it's safe until fork()/clone(), where we check for CLONE_THREADS * Add missed check in setns() for CLONE_VM | CLONE_THREADS * Fixed compilation with !CONFIG_UTS_NS * Add a plan in selftests (prevents new warning "Planned tests != run tests") * Set jump table section address & size to (-1UL) just in case if there is no such section while running vdso2c (and WARN() on boot in such case) v3..v4 Changes: * CLOCKE_NEWTIME is unshare()-only flag now (CLON_PIDFD took previous value) * Addressing Jann Horn's feedback - we don't allow CLONE_THREAD or CLONE_VM together with CLONE_NEWTIME (thanks for spotting!) * Addressing issues found by Thomas - removed unmaintainable CLOCK_TIMENS and introduced another call back into k_clock to get ktime instead of getting timespec and converting it (Patch 03) * Renaming timens_offsets members to omit _offset postfix (thanks Cyrill for the suggestion) * Suggestions, renaming and making code more maintainable from Thomas's feedback (thanks much!) * Fixing out-of-bounds and other issues in procfs file (kudos Jann Horn) * vdso_fault() can be called on a remote task by /proc/$pid/mem or process_vm_readv() - addressed by adding a slow-path with searching for owner's namespace (thanks for spotting this unobvious issue, Jann) * Other nits by Jann Horn v2..v3: Major changes: * Simplify two VDSO images by using static_branch() in vclock_gettime() Removes unwanted conflicts with generic VDSO movement patches and simplifies things by dropping too invasive linker magic. As an alternative to static_branch() we tested an attempt to introduce home-made dynamic patching called retcalls: https://github.com/0x7f454c46/linux/commit/4cc0180f6d65 Considering some theoretical problems with toolchains, we decided to go with long well-tested nop-patching in static_branch(). Though, it was needed to provide backend for relative code. * address Thomas' comments. * add sanity checks for offsets: - the current clock time in a namespace has to be in [0, KTIME_MAX / 2). 
KTIME_MAX is divided by two here to be sure that the KTIME_MAX limit is still unreachable. Link: https://lkml.org/lkml/2018/9/19/950 Link: https://lkml.org/lkml/2019/2/5/867 v1..v2: There are two major changes: * Two versions of the VDSO library to avoid a performance penalty for host tasks outside time namespace (as suggested by Andy and Thomas). As it has been discussed on timens RFC, adding a new conditional branch `if (inside_time_ns)` on VDSO for all processes is undesirable. It will add a penalty for everybody as branch predictor may mispredict the jump. Also there are instruction cache lines wasted on cmp/jmp. Those effects of introducing time namespace are very much unwanted having in mind how much work have been spent on micro-optimisation VDSO code. Addressing those problems, there are two versions of VDSO's .so: for host tasks (without any penalty) and for processes inside of time namespace with clk_to_ns() that subtracts offsets from host's time. * Allow to set clock offsets for a namespace only before any processes appear in it. Now a time namespace looks similar to a pid namespace in a way how it is created: unshare(CLONE_NEWTIME) system call creates a new time namespace, but doesn't set it to the current process. Then all children of
[PATCHv7 01/33] ns: Introduce Time Namespace
From: Andrei Vagin Time Namespace isolates clock values. The kernel provides access to several clocks CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, etc. CLOCK_REALTIME System-wide clock that measures real (i.e., wall-clock) time. CLOCK_MONOTONIC Clock that cannot be set and represents monotonic time since some unspecified starting point. CLOCK_BOOTTIME Identical to CLOCK_MONOTONIC, except it also includes any time that the system is suspended. For many users, the time namespace means the ability to changes date and time in a container (CLOCK_REALTIME). But in a context of the checkpoint/restore functionality, monotonic and bootime clocks become interesting. Both clocks are monotonic with unspecified staring points. These clocks are widely used to measure time slices and set timers. After restoring or migrating processes, we have to guarantee that they never go backward. In an ideal case, the behavior of these clocks should be the same as for a case when a whole system is suspended. All this means that we need to be able to set CLOCK_MONOTONIC and CLOCK_BOOTTIME clocks, what can be done by adding per-namespace offsets for clocks. A time namespace is similar to a pid namespace in a way how it is created: unshare(CLONE_NEWTIME) system call creates a new time namespace, but doesn't set it to the current process. Then all children of the process will be born in the new time namespace, or a process can use the setns() system call to join a namespace. This scheme allows setting clock offsets for a namespace, before any processes appear in it. All available clone flags have been used, so CLONE_NEWTIME uses the highest bit of CSIGNAL. It means that we can use it with the unshare() system call only. Rith now, this works for us, because time namespace offsets can be set only when a new time namespace is not populated. In a future, we will have the clone3() system call [1] which will allow to use the CSIGNAL mask for clone flags. 
[1]: httmps://lkml.kernel.org/r/20190604160944.4058-1-christ...@brauner.io Link: https://criu.org/Time_namespace Link: https://lists.openvz.org/pipermail/criu/2018-June/041504.html Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- MAINTAINERS| 2 + fs/proc/namespaces.c | 4 + include/linux/nsproxy.h| 2 + include/linux/proc_ns.h| 3 + include/linux/time_namespace.h | 66 ++ include/linux/user_namespace.h | 1 + include/uapi/linux/sched.h | 6 + init/Kconfig | 7 ++ kernel/fork.c | 16 ++- kernel/nsproxy.c | 41 +-- kernel/time/Makefile | 1 + kernel/time/namespace.c| 217 + 12 files changed, 356 insertions(+), 10 deletions(-) create mode 100644 include/linux/time_namespace.h create mode 100644 kernel/time/namespace.c diff --git a/MAINTAINERS b/MAINTAINERS index d44d6732510d..cabe7bddbf69 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13009,6 +13009,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core S: Maintained F: fs/timerfd.c F: include/linux/timer* +F: include/linux/time_namespace.h +F: kernel/time_namespace.c F: kernel/time/*timer* POWER MANAGEMENT CORE diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dd2b35f78b09..8b5c720fe5d7 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif +#ifdef CONFIG_TIME_NS + &timens_operations, + &timens_for_children_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 2ae1b1a4d84d..074f395b9ad2 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -35,6 +35,8 @@ struct nsproxy { struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; + struct time_namespace *time_ns; + struct time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index d31cb6215905..d312e6281e69 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -32,6 +32,8 @@ extern const struct proc_ns_operations pidns_for_children_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; /* * We always define these enumerators @@ -43,6 +45,7 @@ enum { P
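A minimal userspace sketch of the unshare() scheme described above (illustrative only; creating a time namespace needs appropriate privileges, e.g. root or an owning user namespace): the caller stays in its old namespace, and only children start in the new one.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/wait.h>

#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME	0x00000080	/* highest CSIGNAL bit, as described above */
#endif

int main(void)
{
	if (unshare(CLONE_NEWTIME)) {		/* creates the namespace, does not enter it */
		perror("unshare(CLONE_NEWTIME)");
		return 1;
	}

	/* Clock offsets would be configured here, before the first child is
	 * born in the namespace (see the timens_offsets patch in this series). */

	if (fork() == 0) {			/* the child starts inside the new namespace */
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		printf("child CLOCK_MONOTONIC: %lld s\n", (long long)ts.tv_sec);
		exit(0);
	}
	wait(NULL);
	return 0;
}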
Re: [PATCH 4/9] select: Micro-optimise __estimate_accuracy()
On 9/19/19 3:05 PM, Cyrill Gorcunov wrote: [..] >> diff --git a/fs/select.c b/fs/select.c >> index 12cdefd3be2d..2477c202631e 100644 >> --- a/fs/select.c >> +++ b/fs/select.c >> @@ -51,15 +51,14 @@ >> >> static long __estimate_accuracy(ktime_t slack) >> { >> -int divfactor = 1000; >> - >> if (slack < 0) >> return 0; > > Btw, don't you better use <= here? > Good point, will do for v2. Thanks, Dmitry
Re: [PATCH 8/9] select/restart_block: Convert poll's timeout to u64
On 9/9/19 2:07 PM, David Laight wrote: > From: Dmitry Safonov >> Sent: 09 September 2019 11:24 >> >> All preparations have been done - now poll() can set u64 timeout in >> restart_block. It allows to do the next step - unifying all timeouts in >> restart_block and provide ptrace() API to read it. >> >> Signed-off-by: Dmitry Safonov >> --- >> fs/select.c | 27 +++ >> include/linux/restart_block.h | 4 +--- >> 2 files changed, 8 insertions(+), 23 deletions(-) >> >> diff --git a/fs/select.c b/fs/select.c >> index 4af88feaa2fe..ff2b9c4865cd 100644 >> --- a/fs/select.c >> +++ b/fs/select.c > ... >> @@ -1037,16 +1030,10 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, >> unsigned int, nfds, >> struct restart_block *restart_block; >> >> restart_block = ¤t->restart_block; >> -restart_block->fn = do_restart_poll; >> -restart_block->poll.ufds = ufds; >> -restart_block->poll.nfds = nfds; >> - >> -if (timeout_msecs >= 0) { >> -restart_block->poll.tv_sec = end_time.tv_sec; >> -restart_block->poll.tv_nsec = end_time.tv_nsec; >> -restart_block->poll.has_timeout = 1; >> -} else >> -restart_block->poll.has_timeout = 0; >> +restart_block->fn = do_restart_poll; >> +restart_block->poll.ufds= ufds; >> +restart_block->poll.nfds= nfds; >> +restart_block->poll.timeout = timeout; > > What is all that whitespace for? Aligned them with tabs just to make it look better. I've no hard feelings about this - I can do it with spaces or drop the align at all. Thanks, Dmitry
Re: [PATCH 4/9] select: Micro-optimise __estimate_accuracy()
Hi Cyrill, On Mon, 9 Sep 2019 at 12:18, Cyrill Gorcunov wrote: > Compiler precompute constants so it doesn't do division here. > But I didn't read the series yet so I might be missing > something obvious. Heh, like a division is in ktime_divns()? Thanks, Dmitry
[PATCH 3/9] select: Convert __estimate_accuracy() to ktime_t
__estimate_accuracy() divides 64-bit integers twice which is suboptimal. Converting to ktime_t not only avoids that, but also simplifies the logic on some extent. The long-term goal is to convert poll() to leave timeout value in ktime_t inside restart_block as it's the only user that leaves it in timespec. That's a preparation ground for introducing a new ptrace() request that will dump timeout for interrupted syscall. Furthermore, do_select() and do_poll() actually both need time in ktime_t for poll_schedule_timeout(), so there is this hack that converts time on the first loop. It's not only a "hack", but also it's done every time poll() syscall is restarted. After conversion it'll be removed. While at it, rename parameters "slack" and "timeout" which describe their purpose better. Signed-off-by: Dmitry Safonov --- fs/select.c | 33 + 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/select.c b/fs/select.c index 53a0c149f528..12cdefd3be2d 100644 --- a/fs/select.c +++ b/fs/select.c @@ -36,7 +36,7 @@ /* - * Estimate expected accuracy in ns from a timeval. + * Estimate expected accuracy in ns. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the @@ -49,22 +49,17 @@ #define MAX_SLACK (100 * NSEC_PER_MSEC) -static long __estimate_accuracy(struct timespec64 *tv) +static long __estimate_accuracy(ktime_t slack) { - long slack; int divfactor = 1000; - if (tv->tv_sec < 0) + if (slack < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; - if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) - return MAX_SLACK; - - slack = tv->tv_nsec / divfactor; - slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); + slack = ktime_divns(slack, divfactor); if (slack > MAX_SLACK) return MAX_SLACK; @@ -72,27 +67,25 @@ static long __estimate_accuracy(struct timespec64 *tv) return slack; } -u64 select_estimate_accuracy(struct timespec64 *tv) +u64 select_estimate_accuracy(struct timespec64 *timeout) { - u64 ret; - struct timespec64 now; + ktime_t now, slack; /* * Realtime tasks get a slack of 0 for obvious reasons. */ - if (rt_task(current)) return 0; - ktime_get_ts64(&now); - now = timespec64_sub(*tv, now); - ret = __estimate_accuracy(&now); - if (ret < current->timer_slack_ns) - return current->timer_slack_ns; - return ret; -} + now = ktime_get(); + slack = now - timespec64_to_ktime(*timeout); + slack = __estimate_accuracy(slack); + if (slack < current->timer_slack_ns) + return current->timer_slack_ns; + return slack; +} struct poll_table_page { struct poll_table_page * next; -- 2.23.0
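For reference, a plain-C mock of the slack heuristic discussed above (not the kernel function itself; the current->timer_slack_ns floor and the rt_task() special case are omitted): roughly 0.1% of the remaining timeout, 0.5% for positively-niced tasks, clamped to 100ms.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC	1000000LL
#define MAX_SLACK	(100 * NSEC_PER_MSEC)

static int64_t estimate_slack(int64_t remaining_ns, int positively_niced)
{
	int64_t divfactor = positively_niced ? 1000 / 5 : 1000;

	if (remaining_ns < 0)
		return 0;
	remaining_ns /= divfactor;
	return remaining_ns > MAX_SLACK ? MAX_SLACK : remaining_ns;
}

int main(void)
{
	/* 2s left -> 2ms slack; 5min left -> clamped to 100ms. */
	printf("%lld\n", (long long)estimate_slack(2000 * NSEC_PER_MSEC, 0));
	printf("%lld\n", (long long)estimate_slack(300000 * NSEC_PER_MSEC, 0));
	return 0;
}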
[PATCH 2/9] restart_block: Prevent userspace from setting part of the block
Parameters for nanosleep() could be chosen the way to make hrtimer_nanosleep() fail. In that case changes to restarter_block bring it into inconsistent state. Luckily, it won't corrupt anything critical for poll() or futex(). But as it's not evident that userspace may do tricks in the union changing restart_block for other @fs(s) - than further changes in the code may create a potential local vulnerability. I.e., if userspace could do tricks with poll() or futex() than corruption to @clockid or @type would trigger BUG() in timer code. Set @fn every time restart_block is changed, preventing surprises. Also, add a comment for any new restart_block user. Signed-off-by: Dmitry Safonov --- include/linux/restart_block.h | 4 kernel/time/hrtimer.c | 8 +--- kernel/time/posix-cpu-timers.c | 6 +++--- kernel/time/posix-stubs.c | 8 +--- kernel/time/posix-timers.c | 8 +--- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index e5078cae5567..e66e982105f4 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -21,6 +21,10 @@ enum timespec_type { /* * System call restart block. + * + * Safety rule: if you change anything inside @restart_block, + * set @fn to keep the structure in consistent state and prevent + * userspace tricks in the union. */ struct restart_block { long (*fn)(struct restart_block *); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5ee77f1a8a92..4ba2b50d068f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1762,8 +1762,9 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, if (!timespec64_valid(&tu)) return -EINVAL; - current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; - current->restart_block.nanosleep.rmtp = rmtp; + current->restart_block.fn = do_no_restart_syscall; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } @@ -1782,7 +1783,8 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, if (!timespec64_valid(&tu)) return -EINVAL; - current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.fn = do_no_restart_syscall; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0a426f4e3125..b4dddf74dd15 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1243,6 +1243,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + static int do_cpu_nanosleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { @@ -1330,6 +1332,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. 
*/ restart = ¤t->restart_block; + restart->fn = posix_cpu_nsleep_restart; restart->nanosleep.expires = expires; if (restart->nanosleep.type != TT_NONE) error = nanosleep_copyout(restart, &it.it_value); @@ -1338,8 +1341,6 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, return error; } -static long posix_cpu_nsleep_restart(struct restart_block *restart_block); - static int posix_cpu_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { @@ -1361,7 +1362,6 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; - restart_block->fn = posix_cpu_nsleep_restart; restart_block->nanosleep.clockid = which_clock; } return error; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 67df65f887ac..d73039a9ca8f 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -142,8 +142,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; - current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE
[PATCH 0/9] restart_block: Prepare the ground for dumping timeout
Hi, I'm trying to address an issue in CRIU (Checkpoint Restore In Userspace) about timed syscalls restart. It's not possible to use restart_syscall() as the majority of applications does, as after restore the kernel doesn't know anything about a syscall that may have been interrupted on checkpoint. That's because the tasks are re-created from scratch and so there isn't task_struct::restart_block set on a new task. As a preparation, unify timeouts for different syscalls in restart_block. On contrary, I'm struggling with patches that introduce the new ptrace() request API. I'll speak about difficulties of designing new ptrace operation on Containers Microconference at Plumbers [with a hope to find the sensible solution]. Cc: Adrian Reber Cc: Alexander Viro Cc: Andrei Vagin Cc: Andy Lutomirski Cc: Cyrill Gorcunov Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Thomas Gleixner Cc: contain...@lists.linux-foundation.org Cc: linux-fsde...@vger.kernel.org Dmitry Safonov (9): futex: Remove unused uaddr2 in restart_block restart_block: Prevent userspace set part of the block select: Convert __esimate_accuracy() to ktime_t select: Micro-optimise __estimate_accuracy() select: Convert select_estimate_accuracy() to take ktime_t select: Extract common code into do_sys_ppoll() select: Use ktime_t in do_sys_poll() and do_poll() select/restart_block: Convert poll's timeout to u64 restart_block: Make common timeout fs/eventpoll.c | 4 +- fs/select.c| 214 - include/linux/poll.h | 2 +- include/linux/restart_block.h | 11 +- kernel/futex.c | 14 +-- kernel/time/alarmtimer.c | 6 +- kernel/time/hrtimer.c | 14 ++- kernel/time/posix-cpu-timers.c | 10 +- kernel/time/posix-stubs.c | 8 +- kernel/time/posix-timers.c | 8 +- 10 files changed, 115 insertions(+), 176 deletions(-) -- 2.23.0
[PATCH 7/9] select: Use ktime_t in do_sys_poll() and do_poll()
The plan is to store what's left of timeout in restart block as ktime_t which will be used for futex() and nanosleep() timeouts too. That will be a value to return with a new ptrace() request API. Convert end_time argument of do_{sys_,}poll() functions to ktime_t as a preparation ground for storing ktime_t inside restart_block. Signed-off-by: Dmitry Safonov --- fs/select.c | 47 +++ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/fs/select.c b/fs/select.c index 262300e58370..4af88feaa2fe 100644 --- a/fs/select.c +++ b/fs/select.c @@ -854,25 +854,22 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, - struct timespec64 *end_time) + ktime_t end_time) { poll_table* pt = &wait->pt; - ktime_t expire, *to = NULL; + ktime_t *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ - if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { + if (ktime_compare(ktime_get(), end_time) >= 0) { pt->_qproc = NULL; timed_out = 1; - } - - if (end_time && !timed_out) { - expire = timespec64_to_ktime(*end_time); - to = &expire; - slack = select_estimate_accuracy(expire); + } else { + to = &end_time; + slack = select_estimate_accuracy(end_time); } for (;;) { @@ -936,7 +933,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, - struct timespec64 *end_time) + ktime_t end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount, len; @@ -1004,16 +1001,15 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - struct timespec64 *to = NULL, end_time; + ktime_t timeout = 0; int ret; if (restart_block->poll.has_timeout) { - end_time.tv_sec = restart_block->poll.tv_sec; - end_time.tv_nsec = restart_block->poll.tv_nsec; - to = &end_time; + timeout = ktime_set(restart_block->poll.tv_sec, + restart_block->poll.tv_nsec); } - ret = do_sys_poll(ufds, nfds, to); + ret = do_sys_poll(ufds, nfds, timeout); if (ret == -ERESTARTNOHAND) { restart_block->fn = do_restart_poll; @@ -1025,16 +1021,17 @@ static long do_restart_poll(struct restart_block *restart_block) SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { - struct timespec64 end_time, *to = NULL; + struct timespec64 end_time; + ktime_t timeout = 0; int ret; if (timeout_msecs >= 0) { - to = &end_time; - poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, + poll_select_set_timeout(&end_time, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); + timeout = timespec64_to_ktime(end_time); } - ret = do_sys_poll(ufds, nfds, to); + ret = do_sys_poll(ufds, nfds, timeout); if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; @@ -1060,7 +1057,8 @@ static int do_sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, void __user *tsp, const void __user *sigmask, size_t sigsetsize, enum poll_time_type pt_type) { - struct timespec64 ts, end_time, *to = NULL; + struct timespec64 ts, *to = NULL; + ktime_t timeout = 0; int ret; if (tsp) { @@ -1078,9 +1076,10 @@ static int do_sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, return -ENOSYS; } - to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + to = &ts; + if 
(poll_select_set_timeout(&ts, ts.tv_sec, ts.tv_nsec)) return -EINVAL; + timeout = timespec64_to_ktime(ts); } if (!in_compat_syscall()) @@ -1091,8 +1090,8 @@ static int do_sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, if (ret) return ret; - ret = do_sys_poll(ufds, nfds, to); - return poll_
[PATCH 5/9] select: Convert select_estimate_accuracy() to take ktime_t
Instead of converting the time on the first loop, the same if (end_time) can be shared. Simplify the loop by taking time conversion out. Also prepare the ground for converting poll() restart_block timeout into ktime_t - that's the only user that leaves it in timespec. The conversion is needed to introduce an API for ptrace() to get a timeout from restart_block. Signed-off-by: Dmitry Safonov --- fs/eventpoll.c | 4 ++-- fs/select.c | 38 -- include/linux/poll.h | 2 +- 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index d7f1f5011fac..d5120fc49a39 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1836,9 +1836,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (timeout > 0) { struct timespec64 end_time = ep_set_mstimeout(timeout); - slack = select_estimate_accuracy(&end_time); + expires = timespec64_to_ktime(end_time); to = &expires; - *to = timespec64_to_ktime(end_time); + slack = select_estimate_accuracy(expires); } else if (timeout == 0) { /* * Avoid the unnecessary trip to the wait queue loop, if the diff --git a/fs/select.c b/fs/select.c index 2477c202631e..458f2a944318 100644 --- a/fs/select.c +++ b/fs/select.c @@ -66,7 +66,7 @@ static long __estimate_accuracy(ktime_t slack) return slack; } -u64 select_estimate_accuracy(struct timespec64 *timeout) +u64 select_estimate_accuracy(ktime_t timeout) { ktime_t now, slack; @@ -77,7 +77,7 @@ u64 select_estimate_accuracy(struct timespec64 *timeout) return 0; now = ktime_get(); - slack = now - timespec64_to_ktime(*timeout); + slack = now - timeout; slack = __estimate_accuracy(slack); if (slack < current->timer_slack_ns) @@ -490,8 +490,11 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) timed_out = 1; } - if (end_time && !timed_out) - slack = select_estimate_accuracy(end_time); + if (end_time && !timed_out) { + expire = timespec64_to_ktime(*end_time); + to = &expire; + slack = select_estimate_accuracy(expire); + } retval = 0; for (;;) { @@ -582,16 +585,6 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) } busy_flag = 0; - /* -* If this is the first loop and we have a timeout -* given, then we convert to ktime_t and set the to -* pointer to the expiry value. -*/ - if (end_time && !to) { - expire = timespec64_to_ktime(*end_time); - to = &expire; - } - if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; @@ -876,8 +869,11 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, timed_out = 1; } - if (end_time && !timed_out) - slack = select_estimate_accuracy(end_time); + if (end_time && !timed_out) { + expire = timespec64_to_ktime(*end_time); + to = &expire; + slack = select_estimate_accuracy(expire); + } for (;;) { struct poll_list *walk; @@ -930,16 +926,6 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, } busy_flag = 0; - /* -* If this is the first loop and we have a timeout -* given, then we convert to ktime_t and set the to -* pointer to the expiry value. 
-*/ - if (end_time && !to) { - expire = timespec64_to_ktime(*end_time); - to = &expire; - } - if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } diff --git a/include/linux/poll.h b/include/linux/poll.h index 1cdc32b1f1b0..d0f21eb19257 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -112,7 +112,7 @@ struct poll_wqueues { extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); -extern u64 select_estimate_accuracy(struct timespec64 *tv); +extern u64 select_estimate_accuracy(ktime_t timeout); #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) -- 2.23.0
[PATCH 4/9] select: Micro-optimise __estimate_accuracy()
A shift on s64 is faster than a division, so use it instead. As a result of the patch there is a barely user-visible effect: poll(), select(), etc. syscalls will be roughly 2.3% more precise than before because 1000 != 1024 :) Signed-off-by: Dmitry Safonov --- fs/select.c | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/select.c b/fs/select.c index 12cdefd3be2d..2477c202631e 100644 --- a/fs/select.c +++ b/fs/select.c @@ -51,15 +51,14 @@ static long __estimate_accuracy(ktime_t slack) { - int divfactor = 1000; - if (slack < 0) return 0; - if (task_nice(current) > 0) - divfactor = divfactor / 5; + /* A bit more precise than 0.1% */ + slack = slack >> 10; - slack = ktime_divns(slack, divfactor); + if (task_nice(current) > 0) + slack = slack * 5; if (slack > MAX_SLACK) return MAX_SLACK; -- 2.23.0
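A quick sanity check of the ~2.3% figure (plain arithmetic on the old and new expressions, not part of the patch):

	old:             slack / 1000
	new:             slack >> 10            ~= slack / 1024
	relative change: 1 - 1000/1024 = 24/1024 ~= 2.34%

	old (nice > 0):  slack / (1000 / 5)      = slack / 200
	new (nice > 0):  (slack >> 10) * 5      ~= slack / 204.8, again ~2.3% smaller

So the estimated slack shrinks by roughly 2.3% in both cases, which is where the "a bit more precise" wording above comes from.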
[PATCH 9/9] restart_block: Make common timeout
In order to provide a unified API to get the leftover of a timeout, the timeouts of the different users of restart_block can be joined. All preparations are done, so move the timeout out of the union and convert the users. Signed-off-by: Dmitry Safonov --- fs/select.c| 10 +- include/linux/restart_block.h | 4 +--- kernel/futex.c | 14 +++--- kernel/time/alarmtimer.c | 6 +++--- kernel/time/hrtimer.c | 6 +++--- kernel/time/posix-cpu-timers.c | 6 +++--- 6 files changed, 22 insertions(+), 24 deletions(-) diff --git a/fs/select.c b/fs/select.c index ff2b9c4865cd..9ab6fc6fb7c5 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1001,7 +1001,7 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - ktime_t timeout = restart_block->poll.timeout; + ktime_t timeout = restart_block->timeout; int ret; ret = do_sys_poll(ufds, nfds, timeout); @@ -1030,10 +1030,10 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, struct restart_block *restart_block; restart_block = &current->restart_block; - restart_block->fn = do_restart_poll; - restart_block->poll.ufds= ufds; - restart_block->poll.nfds= nfds; - restart_block->poll.timeout = timeout; + restart_block->fn= do_restart_poll; + restart_block->poll.ufds = ufds; + restart_block->poll.nfds = nfds; + restart_block->timeout = timeout; ret = -ERESTART_RESTARTBLOCK; } diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 63d647b65395..02f90ab00a2d 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -27,6 +27,7 @@ enum timespec_type { * userspace tricks in the union. */ struct restart_block { + s64 timeout; long (*fn)(struct restart_block *); union { /* For futex_wait and futex_wait_requeue_pi */ @@ -35,7 +36,6 @@ struct restart_block { u32 val; u32 flags; u32 bitset; - u64 time; } futex; /* For nanosleep */ struct { @@ -45,11 +45,9 @@ struct restart_block { struct __kernel_timespec __user *rmtp; struct old_timespec32 __user *compat_rmtp; }; - u64 expires; } nanosleep; /* For poll */ struct { - u64 timeout; struct pollfd __user *ufds; int nfds; } poll; diff --git a/kernel/futex.c b/kernel/futex.c index 6d50728ef2e7..0738167e4911 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2755,12 +2755,12 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, goto out; restart = &current->restart_block; - restart->fn = futex_wait_restart; - restart->futex.uaddr = uaddr; - restart->futex.val = val; - restart->futex.time = *abs_time; - restart->futex.bitset = bitset; - restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; + restart->fn = futex_wait_restart; + restart->futex.uaddr= uaddr; + restart->futex.val = val; + restart->timeout= *abs_time; + restart->futex.bitset = bitset; + restart->futex.flags= flags | FLAGS_HAS_TIMEOUT; ret = -ERESTART_RESTARTBLOCK; @@ -2779,7 +2779,7 @@ static long futex_wait_restart(struct restart_block *restart) ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t = restart->futex.time; + t = restart->timeout; tp = &t; } restart->fn = do_no_restart_syscall; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 57518efc3810..148b187c371e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -763,7 +763,7 @@ alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) { enum alarmtimer_type type = restart->nanosleep.clockid; - ktime_t exp = 
restart->nanosleep.expires; + ktime_t exp = restart->timeout; struct alarm alarm; alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); @@ -816,9 +816,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, if (flags == TIMER_ABSTIME)
[PATCH 6/9] select: Extract common code into do_sys_ppoll()
Reduce the amount of code and shrink a .text section a bit: [linux]$ ./scripts/bloat-o-meter -t /tmp/vmlinux.o.{old,new} add/remove: 1/0 grow/shrink: 0/4 up/down: 284/-691 (-407) Function old new delta do_sys_ppoll - 284+284 __x64_sys_ppoll 214 42-172 __ia32_sys_ppoll 213 40-173 __ia32_compat_sys_ppoll_time64 213 40-173 __ia32_compat_sys_ppoll_time32 213 40-173 Total: Before=13357557, After=13357150, chg -0.00% The downside is that "tsp" and "sigmask" parameters gets (void *), but it seems worth losing static type checking if there is only one line in syscall definition. Other way could be to add compat parameters in do_sys_ppoll(), but that trashes 2 more registers.. Signed-off-by: Dmitry Safonov --- fs/select.c | 94 ++--- 1 file changed, 32 insertions(+), 62 deletions(-) diff --git a/fs/select.c b/fs/select.c index 458f2a944318..262300e58370 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1056,54 +1056,58 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, return ret; } -SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, - struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, - size_t, sigsetsize) +static int do_sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, + void __user *tsp, const void __user *sigmask, + size_t sigsetsize, enum poll_time_type pt_type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (get_timespec64(&ts, tsp)) - return -EFAULT; + switch (pt_type) { + case PT_TIMESPEC: + if (get_timespec64(&ts, tsp)) + return -EFAULT; + break; + case PT_OLD_TIMESPEC: + if (get_old_timespec32(&ts, tsp)) + return -EFAULT; + break; + default: + WARN_ON_ONCE(1); + return -ENOSYS; + } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } - ret = set_user_sigmask(sigmask, sigsetsize); + if (!in_compat_syscall()) + ret = set_user_sigmask(sigmask, sigsetsize); + else + ret = set_compat_user_sigmask(sigmask, sigsetsize); + if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); - return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); + return poll_select_finish(&end_time, tsp, pt_type, ret); } -#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) +SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, + struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, + size_t, sigsetsize) +{ + return do_sys_ppoll(ufds, nfds, tsp, sigmask, sigsetsize, PT_TIMESPEC); +} +#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { - struct timespec64 ts, end_time, *to = NULL; - int ret; - - if (tsp) { - if (get_old_timespec32(&ts, tsp)) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) - return -EINVAL; - } - - ret = set_user_sigmask(sigmask, sigsetsize); - if (ret) - return ret; - - ret = do_sys_poll(ufds, nfds, to); - return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); + return do_sys_ppoll(ufds, nfds, tsp, sigmask, sigsetsize, PT_OLD_TIMESPEC); } #endif @@ -1352,24 +1356,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { - struct timespec64 ts, end_time, *to = NULL; - int ret; - - if (tsp) { - if (get_old_timespec32(&ts, tsp)) - return -EFAULT; - 
- to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) - return -EINVAL; - } - - ret = set_compat_user_sigmask(sigmask, sigsetsize); -
[PATCH 8/9] select/restart_block: Convert poll's timeout to u64
All preparations have been done - now poll() can store a u64 timeout in restart_block. This allows the next step - unifying all timeouts in restart_block and providing a ptrace() API to read them. Signed-off-by: Dmitry Safonov --- fs/select.c | 27 +++ include/linux/restart_block.h | 4 +--- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/fs/select.c b/fs/select.c index 4af88feaa2fe..ff2b9c4865cd 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1001,14 +1001,9 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - ktime_t timeout = 0; + ktime_t timeout = restart_block->poll.timeout; int ret; - if (restart_block->poll.has_timeout) { - timeout = ktime_set(restart_block->poll.tv_sec, - restart_block->poll.tv_nsec); - } - ret = do_sys_poll(ufds, nfds, timeout); if (ret == -ERESTARTNOHAND) { @@ -1021,14 +1016,12 @@ static long do_restart_poll(struct restart_block *restart_block) SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { - struct timespec64 end_time; ktime_t timeout = 0; int ret; if (timeout_msecs >= 0) { - poll_select_set_timeout(&end_time, timeout_msecs / MSEC_PER_SEC, - NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); - timeout = timespec64_to_ktime(end_time); + timeout = ktime_add_ms(0, timeout_msecs); + timeout = ktime_add_safe(ktime_get(), timeout); } ret = do_sys_poll(ufds, nfds, timeout); @@ -1037,16 +1030,10 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, struct restart_block *restart_block; restart_block = &current->restart_block; - restart_block->fn = do_restart_poll; - restart_block->poll.ufds = ufds; - restart_block->poll.nfds = nfds; - - if (timeout_msecs >= 0) { - restart_block->poll.tv_sec = end_time.tv_sec; - restart_block->poll.tv_nsec = end_time.tv_nsec; - restart_block->poll.has_timeout = 1; - } else - restart_block->poll.has_timeout = 0; + restart_block->fn = do_restart_poll; + restart_block->poll.ufds= ufds; + restart_block->poll.nfds= nfds; + restart_block->poll.timeout = timeout; ret = -ERESTART_RESTARTBLOCK; } diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index e66e982105f4..63d647b65395 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -49,11 +49,9 @@ struct restart_block { } nanosleep; /* For poll */ struct { + u64 timeout; struct pollfd __user *ufds; int nfds; - int has_timeout; - unsigned long tv_sec; - unsigned long tv_nsec; } poll; }; }; -- 2.23.0
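For illustration only (not part of the patch), the new conversion in sys_poll() turns the relative millisecond argument into an absolute CLOCK_MONOTONIC deadline up front, e.g. for timeout_msecs = 2500:

	/* sketch of the arithmetic, assuming the helpers behave as in mainline */
	timeout = ktime_add_ms(0, 2500);                /* 2,500,000,000 ns, relative */
	timeout = ktime_add_safe(ktime_get(), timeout); /* absolute deadline; saturates
	                                                 * instead of overflowing for
	                                                 * huge values */

The restart path can then stash that single ktime_t in restart_block->poll.timeout and pass it back to do_sys_poll() as-is, which is what makes the has_timeout/tv_sec/tv_nsec triple removable.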
[PATCH 1/9] futex: Remove unused uaddr2 in restart_block
Not used since its introduction in commit 52400ba94675 ("futex: add requeue_pi functionality"). The resulting union stays the same size, so nothing is saved in task_struct, but it is still one __user pointer less to keep around. Signed-off-by: Dmitry Safonov --- include/linux/restart_block.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index bba2920e9c05..e5078cae5567 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -32,7 +32,6 @@ struct restart_block { u32 flags; u32 bitset; u64 time; - u32 __user *uaddr2; } futex; /* For nanosleep */ struct { -- 2.22.0
Re: get_unmapped_area && in_ia32_syscall (Was: [PATCH] uprobes/x86: fix detection of 32-bit user mode)
-Cc my old @virtuozzo email. Previously it just ignored emails and now sends those ugly html replies. Sorry about that - I've updated .mailmap now. On 8/27/19 6:03 PM, Dmitry Safonov wrote: > Hi Oleg, > > On 8/27/19 3:00 PM, Oleg Nesterov wrote: > [..] >> But to remind, there is another problem with in_ia32_syscall() && uprobes. >> >> get_unmapped_area() paths use in_ia32_syscall() and this is wrong in case >> when the caller is xol_add_vma(), in this case TS_COMPAT won't be set.> >> Usually the addr = TASK_SIZE - PAGE_SIZE passed to get_unmapped_area() should >> work, mm->get_unmapped_area() won't be even called. But if this addr is >> already >> occupied get_area() can return addr > TASK_SIZE. > > Technically, it's not bigger than TASK_SIZE that's supplied > get_unmapped_area() as an argument.. > > [..] >> if (!area->vaddr) { >> +if(!is_64bit_mm(mm)) >> +current_thread_info()->status |= TS_COMPAT; >> /* Try to map as high as possible, this is only a hint. */ >> area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, >> PAGE_SIZE, 0, 0); >> +if(!is_64bit_mm(mm)) >> +current_thread_info()->status &= ~TS_COMPAT;; >> if (area->vaddr & ~PAGE_MASK) { >> ret = area->vaddr; >> goto fail; > > It could have been TASK_SIZE_OF(), but that would be not much better in > my POV. I see that arch_uprobe_analyze_insn() uses is_64bit_mm() which > is correct the majority of time, but not for processes those jump > switching CS.. Except criu afair there are at least wine, dosemu. > I had it in my TODO to fix this :) > > Do I read the code properly and xol is always one page? > Could that page be reserved on the top of mmap_base/mmap_compat_base at > the binfmt loading time? (I would need than to add .mremap() for > restoring sake). Probably, not reserving it if personality doesn't allow > randomization or providing a way to disable it.. If no one has concerns over such approach, I'll cook a fix just after Plumbers week. Thanks, Dmitry
[PATCH] mailmap: Add aliases for Dmitry Safonov
I don't work for Virtuozzo or Samsung anymore, and I've noticed that those old addresses have started sending annoying html email replies. I also prioritize my personal email over my work mailbox, so while at it add an entry for Arista too - so I can reply faster when needed. Signed-off-by: Dmitry Safonov --- .mailmap | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.mailmap b/.mailmap index acba1a6163f1..ce3463a93515 100644 --- a/.mailmap +++ b/.mailmap @@ -64,6 +64,9 @@ Dengcheng Zhu Dengcheng Zhu Dengcheng Zhu Dmitry Eremin-Solenikov +Dmitry Safonov <0x7f454...@gmail.com> +Dmitry Safonov <0x7f454...@gmail.com> +Dmitry Safonov <0x7f454...@gmail.com> Domen Puncer Douglas Gilbert Ed L. Cashin -- 2.22.0
Re: get_unmapped_area && in_ia32_syscall (Was: [PATCH] uprobes/x86: fix detection of 32-bit user mode)
Hi Oleg, On 8/27/19 3:00 PM, Oleg Nesterov wrote: [..] > But to remind, there is another problem with in_ia32_syscall() && uprobes. > > get_unmapped_area() paths use in_ia32_syscall() and this is wrong in case > when the caller is xol_add_vma(), in this case TS_COMPAT won't be set.> > Usually the addr = TASK_SIZE - PAGE_SIZE passed to get_unmapped_area() should > work, mm->get_unmapped_area() won't be even called. But if this addr is > already > occupied get_area() can return addr > TASK_SIZE. Technically, it's not bigger than the TASK_SIZE that is supplied to get_unmapped_area() as an argument.. [..] > if (!area->vaddr) { > + if(!is_64bit_mm(mm)) > + current_thread_info()->status |= TS_COMPAT; > /* Try to map as high as possible, this is only a hint. */ > area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, > PAGE_SIZE, 0, 0); > + if(!is_64bit_mm(mm)) > + current_thread_info()->status &= ~TS_COMPAT;; > if (area->vaddr & ~PAGE_MASK) { > ret = area->vaddr; > goto fail; It could have been TASK_SIZE_OF(), but that would not be much better in my POV. I see that arch_uprobe_analyze_insn() uses is_64bit_mm(), which is correct the majority of the time, but not for processes that switch CS by jumping.. Besides criu, afair there are at least wine and dosemu. I had it in my TODO to fix this :) Do I read the code properly, i.e. is xol always one page? Could that page be reserved on top of mmap_base/mmap_compat_base at binfmt loading time? (I would then need to add .mremap() for restoring's sake). Probably not reserving it if the personality doesn't allow randomization, or providing a way to disable it.. Thanks, Dmitry
Re: [PATCHv6 23/36] x86/vdso: Allocate timens vdso
Hi Thomas, On 8/18/19 5:21 PM, Thomas Gleixner wrote: [..] > I'm happy to review well written stuff which makes progress and takes > review comments into account or the submitter discusses them for > resolution. Thanks again for both your and Andy time! [..] > Coming back to Andy's idea. Create your time namespace page as an exact > copy of the vdso data page. When the page is created do: > >memset(p->vdso_data, 0, sizeof(p->vdso_data)); >p->vdso_data[0].clock_mode = CLOCK_TIMENS; >p->vdso_data[0].seq = 1; > >/* Store the namespace offsets in basetime */ >p->vdso_data[0].basetime[CLOCK_MONOTONIC].sec = myns->mono_sec; >p->vdso_data[0].basetime[CLOCK_MONOTONIC].nsec = myns->mono_nsec; >p->vdso_data[0].basetime[CLOCK_BOOTTIME].sec = myns->boot_sec; >p->vdso_data[0].basetime[CLOCK_BOOTTIME].nsec = myns->boot_nsec; > >p->vdso_data[1].clock_mode = CLOCK_TIMENS; >p->vdso_data[1].seq = 1; > > For a normal task the VVAR pages are installed in the normal ordering: > >VVAR >PVCLOCK >HVCLOCK >TIMENS <- Not really required > > Now for a timens task you install the pages in the following order > >TIMENS >PVCLOCK >HVCLOCK >VVAR > > The check for vdso_data->clock_mode is in the unlikely path of the now open > coded seq begin magic. So for the non-timens case most of the time 'seq' is > even, so the branch is not taken. > > If 'seq' is odd, i.e. a concurrent update is in progress, the extra check > for vdso_data->clock_mode is a non-issue. The task is spin waiting for the > update to finish and for 'seq' to become even anyway. > > Patch below. I tested this with the normal order and by installing a > 'timens' page unconditionally for all processes. I'll reply with the timens > testing hacks so you can see what I did. > > The test results are pretty good. > >Base (upstream) + VDSO patch + timens page > > MONO 30ns 30ns 32ns > REAL 30ns 30ns 32ns > BOOT 30ns 30ns 32ns > MONOCOARSE 7ns8ns 10ns > REALCOARSE 7ns8ns 10ns > TAI30ns 30ns 32ns > MONORAW30ns 30ns 32ns > > So except for the coarse clocks there is no change when the timens page is > not used, i.e. the regular VVAR page is at the proper place. But that's on > one machine, a different one showed an effect in the noise range. I'm not > worried about that as the VDSO behaviour varies depending on micro > architecture anyway. > > With timens enabled the performance hit (cache hot microbenchmark) is > somewhere in the range of 5-7% when looking at the perf counters > numbers. The hit for the coarse accessors is larger obviously because the > overhead is constant time. > > I did a quick comparison of the array vs. switch case (what you used for > your clk_to_ns() helper). The switch case is slower. > > So I rather go for the array based approach. It's simpler code and the > I-cache footprint is smaller and no conditional branches involved. > > That means your timens_to_host() and host_to_timens() conversion functions > should just use that special VDSO page and do the same array based > unconditional add/sub of the clock specific offset. I was a bit scarred that clock_mode change would result in some complex logic, but your patch showed me that it's definitely not so black as I was painting it. Will rework the patches set with Andrei based on your and Andy's suggestions and patches. Thanks, Dmitry
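For readers following the thread, a rough sketch of the reader side Thomas describes (function and constant names here are assumptions, not necessarily what the attached patch uses). The clock_mode test lives inside the open-coded seqcount-begin loop, so a host task, whose seq is almost always even, never executes it, while a timens task always hits it because its first VVAR page is created with seq == 1:

	do {
		while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) {
			if (IS_ENABLED(CONFIG_TIME_NS) &&
			    vd->clock_mode == CLOCK_TIMENS)
				/* Redirect to the slow path: find the real
				 * vdso_data page, read the host time and add
				 * the namespace offset from vd->basetime[clk]. */
				return do_hres_timens(vd, clk, ts);
			cpu_relax();
		}
		smp_rmb();

		/* regular clock read from vd->... */

	} while (unlikely(seq != READ_ONCE(vd->seq)));

For a host task that catches a concurrent update (seq genuinely odd), the extra comparison is noise: the loop is already spinning while waiting for seq to become even again.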
Re: [PATCHv6 23/36] x86/vdso: Allocate timens vdso
Hi Andy, Thomas, thank you very much for your time and the reviews, appreciate that. On 8/16/19 9:10 PM, Thomas Gleixner wrote: > On Fri, 16 Aug 2019, Andy Lutomirski wrote: [..] >> I'm unconvinced that any of this magic is wise. I think you should make a >> special timens vvar page that causes the normal fastpath to fail (using a >> special vclock mode, a special seq value, or a special "last" value) and then >> make the failure path detect that timens is in use and use the timens path. I see. That's so clever, it haven't come on my mind. Hmm, is that better because of the price of 5-byte NOP? I'm a bit afraid to complicate that seq/vclock logic more.. So, what I'm driving at is would you change your mind if timens still had boot-time dynamic patching but without introducing NOP? We've got the point that you want to have no penalty at all for host tasks [on RFC reply] by introducing `if` as trashing cache and branch predictor, but I wasn't sure if NOP is also unacceptable. At that moment we had a "plan B" with something that was half-wittedly called "retcalls". The basic idea is that all that the timens brings into vdso are calls clk_to_ns(), which are all placed in tails of functions. So, if we could just memcpy() function returns in host vdso over introduced time-ns tail calls - it would be a very same code that lied before. There is a draft of those [1], that actually works on x86 on both mine and Andrei's machines. Consulting with Andrei, I've decided that we better stick to static_branchs as they are well known and have already backends for other architectures. We probably mistakenly decided that a price of NOP on scalar machines is negligible and would be acceptable. Would those self-invented "retcalls" be something that could be reviewed and potentially accepted in further iterations? [1] https://github.com/0x7f454c46/linux/commit/ab0eeb646f43#diff-c22e1e73e7367f371e1f12e3877ea12f > My initial suggestion still stands. Do that at compile time. It really does > not matter whether we have another 2 or 3 variants of vdso binaries. > > Use it and be done with it. No special magic, just straight forward > decisions to use a timens capable VDSO or not. I believe that was something we did in version 1 of the patches set. It doesn't sound like a rocket science to do, but it resulted in a couple of ugly patches. The post-attempt notes about downsides of doing it compile-time are: 1. There is additional .so for each vdso: 64-bit, ia32, x32. The same for every architecture to-be supported. It adds rules in Makefiles. [2] 2. If we still intend to keep setns() working without exec(), function entries on both host/namespace vdso should be aligned to each other [3]. That results in a patch to vdso2c to generate offsets [4, 5] and in linker magic to align another vdso [6]. 3. As unexpected consequence, we also need to align local functions on vdso [7]. So, it might be all related to my lack of skills, but it seems to bring some big amount of complexity into build process. And in my point of view, major issue is that it would not scale easily when the day will come and there will be a need to introduce another vdso.so. As I didn't want to be the guy that happens to be remembered as "he wrote this unmaintainable pile of garbage", I've taken dynamic patching approach that is done once a boot time. Regardless, we both with Andrei want to improve the patches set and make it acceptable and easy to maintain in future. I hope, that our effort to do that is visible through evolution of patches. 
And we're very glad that we have constructive critics and such patient maintainers. So, if I'm mistaken in those points about compile-time vdso(s), or you have a plan in mind for how to avoid them, I'd appreciate it and rework in that direction. [2] lkml.kernel.org/r/20190206001107.16488-14-d...@arista.com [3] lkml.kernel.org/r/20190206001107.16488-15-d...@arista.com [4] lkml.kernel.org/r/20190206001107.16488-16-d...@arista.com [5] lkml.kernel.org/r/20190206001107.16488-17-d...@arista.com [6] lkml.kernel.org/r/20190206001107.16488-19-d...@arista.com [7] lkml.kernel.org/r/20190206001107.16488-20-d...@arista.com Thanks, Dmitry
Re: [PATCHv6 22/36] x86/vdso: Add offsets page in vvar
Hi Thomas, On 8/15/19 8:21 PM, Thomas Gleixner wrote: > On Thu, 15 Aug 2019, Dmitry Safonov wrote: >> --- >> arch/Kconfig | 5 +++ >> arch/x86/Kconfig | 1 + >> arch/x86/entry/vdso/vdso-layout.lds.S | 9 - >> arch/x86/entry/vdso/vdso2c.c | 3 ++ >> arch/x86/entry/vdso/vma.c | 12 +++ >> arch/x86/include/asm/vdso.h | 1 + >> init/Kconfig | 1 + >> lib/vdso/gettimeofday.c | 47 +++ > > This needs to be split into the generic lib/vdso part and then x86 making > use of it. Ok >> +#ifdef CONFIG_TIME_NS > > This should be COMPILE_WITH_TIME_NS and not CONFIG_TIME_NS > >> +extern u8 timens_page >> +__attribute__((visibility("hidden"))); >> + >> +notrace static __always_inline void clk_to_ns(clockid_t clk, struct >> __kernel_timespec *ts) > > This needs notrace because? Heh, well it's alive from the time it was in arch/x86. I believe, functions there had it since commit 23adec554a76 ("x86: add notrace annotations to vsyscall"). Probably, lib/vdso is compiled without mcount in the Makefile somewhere. Will drop. [..] >> +/* >> + * The kernel allows to set a negative offset only if the current clock >> + * value in a namespace is positive, so the result tv_sec can't be >> + * negative here. >> + */ >> +ts->tv_nsec += offset64->tv_nsec; >> +ts->tv_sec += offset64->tv_sec; >> +if (ts->tv_nsec >= NSEC_PER_SEC) { >> +ts->tv_nsec -= NSEC_PER_SEC; >> +ts->tv_sec++; >> +} >> +if (ts->tv_nsec < 0) { >> +ts->tv_nsec += NSEC_PER_SEC; >> +ts->tv_sec--; >> +} > > That's broken for 32bit user space on 64bit hosts. On LE due to > misalignment and on BE because 32bit will read always 0. Ugh, will look into that. Thanks, Dmitry
[PATCHv6 04/36] posix-clocks: Rename .clock_get_timespec() callbacks accordingly
From: Andrei Vagin The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() That adds a valid reason to finally implement a separate callback which returns the time in ktime_t format in (struct k_clock). As a preparation ground for introducing clock_get_ktime(), the original callback clock_get() was renamed into clock_get_timespec(). Reflect the renaming into callbacks realizations. Suggested-by: Thomas Gleixner Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/alarmtimer.c | 6 +++--- kernel/time/posix-timers.c | 16 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index b5f3779eae57..995dd5aa68f0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -644,13 +644,13 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp } /** - * alarm_clock_get - posix clock_get_timespec interface + * alarm_clock_get_timespec - posix clock_get_timespec interface * @which_clock: clockid * @tp: timespec to fill. * * Provides the underlying alarm base time. */ -static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) +static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; @@ -824,7 +824,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, - .clock_get_timespec = alarm_clock_get, + .clock_get_timespec = alarm_clock_get_timespec, .timer_create = alarm_timer_create, .timer_set = common_timer_set, .timer_del = common_timer_del, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 36a4f6a7c4d6..4e89e342cfcc 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -165,7 +165,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); return 0; @@ -187,7 +187,7 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, /* * Get monotonic time for posix timers */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); return 0; @@ -222,13 +222,13 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) +static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; @@ -1267,7 +1267,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_clock_realtime_get, + .clock_get_timespec = posix_get_realtime_timespec, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, 
.nsleep = common_nsleep, @@ -1284,7 +1284,7 @@ static const struct k_clock clock_realtime = { static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_ktime_get_ts, + .clock_get_timespec = posix_get_monotonic_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1314,7 +1314,7 @@ static const struct k_clock clock_monotonic_coarse = { static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_get_tai, + .clock_get_timespec = posix_get_tai_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1329,7 +1329,7 @@ static const struct k_clock clock_tai = { static const struct k_clock clock_boottime = { .clo
[PATCHv6 01/36] ns: Introduce Time Namespace
From: Andrei Vagin Time Namespace isolates clock values. The kernel provides access to several clocks: CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, etc. CLOCK_REALTIME System-wide clock that measures real (i.e., wall-clock) time. CLOCK_MONOTONIC Clock that cannot be set and represents monotonic time since some unspecified starting point. CLOCK_BOOTTIME Identical to CLOCK_MONOTONIC, except it also includes any time that the system is suspended. For many users, the time namespace means the ability to change the date and time in a container (CLOCK_REALTIME). But in the context of the checkpoint/restore functionality, the monotonic and boottime clocks become interesting. Both clocks are monotonic with unspecified starting points. These clocks are widely used to measure time slices and set timers. After restoring or migrating processes, we have to guarantee that they never go backward. In an ideal case, the behavior of these clocks should be the same as in the case when a whole system is suspended. All this means that we need to be able to set the CLOCK_MONOTONIC and CLOCK_BOOTTIME clocks, which can be done by adding per-namespace offsets for the clocks. A time namespace is similar to a pid namespace in the way it is created: the unshare(CLONE_NEWTIME) system call creates a new time namespace, but doesn't set it for the current process. Then all children of the process will be born in the new time namespace, or a process can use the setns() system call to join a namespace. This scheme allows setting clock offsets for a namespace before any processes appear in it. All available clone flags have been used, so CLONE_NEWTIME uses the highest bit of CSIGNAL. It means that we can use it with the unshare() system call only. Right now, this works for us, because time namespace offsets can be set only when a new time namespace is not populated. In the future, we will have the clone3 system call [1] which will allow using the CSIGNAL mask for clone flags.
[1]: httmps://lkml.kernel.org/r/20190604160944.4058-1-christ...@brauner.io Link: https://criu.org/Time_namespace Link: https://lists.openvz.org/pipermail/criu/2018-June/041504.html Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- MAINTAINERS| 2 + fs/proc/namespaces.c | 4 + include/linux/nsproxy.h| 2 + include/linux/proc_ns.h| 3 + include/linux/time_namespace.h | 69 +++ include/linux/user_namespace.h | 1 + include/uapi/linux/sched.h | 6 + init/Kconfig | 7 ++ kernel/Makefile| 1 + kernel/fork.c | 16 ++- kernel/nsproxy.c | 41 +-- kernel/time_namespace.c| 217 + 12 files changed, 359 insertions(+), 10 deletions(-) create mode 100644 include/linux/time_namespace.h create mode 100644 kernel/time_namespace.c diff --git a/MAINTAINERS b/MAINTAINERS index 420567d1519a..97b7737f5aba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12898,6 +12898,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core S: Maintained F: fs/timerfd.c F: include/linux/timer* +F: include/linux/time_namespace.h +F: kernel/time_namespace.c F: kernel/time/*timer* POWER MANAGEMENT CORE diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dd2b35f78b09..8b5c720fe5d7 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif +#ifdef CONFIG_TIME_NS + &timens_operations, + &timens_for_children_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 2ae1b1a4d84d..074f395b9ad2 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -35,6 +35,8 @@ struct nsproxy { struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; + struct time_namespace *time_ns; + struct time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index d31cb6215905..d312e6281e69 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -32,6 +32,8 @@ extern const struct proc_ns_operations pidns_for_children_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; /* * We always define these enumerators @@ -43,6 +45,7 @@ enum { P
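To make the unshare()-then-fork flow described above concrete, a minimal userspace sketch (the CLONE_NEWTIME value is the one defined by this patch; configuring the actual clock offsets is left out because that interface comes later in the series):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/wait.h>

	#ifndef CLONE_NEWTIME
	# define CLONE_NEWTIME	0x00000080	/* highest CSIGNAL bit, per this patch */
	#endif

	int main(void)
	{
		pid_t pid;

		/* Creates the namespace but does not move the caller into it:
		 * the caller only gets it as time_ns_for_children. */
		if (unshare(CLONE_NEWTIME)) {
			perror("unshare(CLONE_NEWTIME)");
			return 1;
		}

		/* The namespace is still unpopulated here, so its clock
		 * offsets may still be configured. */

		pid = fork();
		if (pid == 0) {
			/* The first task born inside the new time namespace;
			 * it and its descendants will see the adjusted clocks. */
			execlp("sleep", "sleep", "1", NULL);
			return 127;
		}
		waitpid(pid, NULL, 0);
		return 0;
	}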
[PATCHv6 09/36] posix-clocks: Wire up clock_gettime() with timens offsets
From: Andrei Vagin Adjust monotonic and boottime clocks with per-timens offsets. As the result a process inside time namespace will see timers and clocks corrected to offsets that were set on creating namespace. Note that applications usually go through vDSO to get time, which is not yet adjusted. Further changes complete time namespace virtualisation with vDSO support. Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/alarmtimer.c | 1 + kernel/time/posix-stubs.c | 3 +++ kernel/time/posix-timers.c | 5 + 3 files changed, 9 insertions(+) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c8f8cf3d7d08..fbf18b26faed 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "posix-timers.h" diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 67df65f887ac..edaf075d1ee4 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER @@ -77,9 +78,11 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) break; case CLOCK_MONOTONIC: ktime_get_ts64(tp); + timens_add_monotonic(tp); break; case CLOCK_BOOTTIME: ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); break; default: return -EINVAL; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1d41c6a41d63..365ac40d46b1 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "timekeeping.h" #include "posix-timers.h" @@ -195,6 +196,7 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -209,6 +211,7 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -223,6 +226,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -235,6 +239,7 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); return 0; } -- 2.22.0
[PATCHv6 02/36] timens: Add timens_offsets
From: Andrei Vagin Introduce offsets for time namespace. They will contain an adjustment needed to convert clocks to/from host's. Allocate one page for each time namespace that will be premapped into userspace among vvar pages. A new namespace is created with the same offsets as the time namespace of the current process. Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- MAINTAINERS| 1 + include/linux/time_namespace.h | 18 ++ include/linux/timens_offsets.h | 10 ++ kernel/time_namespace.c| 16 ++-- 4 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 include/linux/timens_offsets.h diff --git a/MAINTAINERS b/MAINTAINERS index 97b7737f5aba..527aee1e616f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12899,6 +12899,7 @@ S: Maintained F: fs/timerfd.c F: include/linux/timer* F: include/linux/time_namespace.h +F: include/linux/timens_offsets.h F: kernel/time_namespace.c F: kernel/time/*timer* diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 9507ed7072fe..334c1a1c6607 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -8,6 +8,7 @@ #include #include #include +#include struct user_namespace; extern struct user_namespace init_user_ns; @@ -39,6 +40,21 @@ static inline void put_time_ns(struct time_namespace *ns) kref_put(&ns->kref, free_time_ns); } +static inline void timens_add_monotonic(struct timespec64 *ts) +{ + struct timens_offsets *ns_offsets = current->nsproxy->time_ns->offsets; + + if (ns_offsets) + *ts = timespec64_add(*ts, ns_offsets->monotonic); +} + +static inline void timens_add_boottime(struct timespec64 *ts) +{ + struct timens_offsets *ns_offsets = current->nsproxy->time_ns->offsets; + + if (ns_offsets) + *ts = timespec64_add(*ts, ns_offsets->boottime); +} #else static inline struct time_namespace *get_time_ns(struct time_namespace *ns) @@ -64,6 +80,8 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *ts return 0; } +static inline void timens_add_monotonic(struct timespec64 *ts) {} +static inline void timens_add_boottime(struct timespec64 *ts) {} #endif #endif /* _LINUX_TIMENS_H */ diff --git a/include/linux/timens_offsets.h b/include/linux/timens_offsets.h new file mode 100644 index ..e93aabaa5e45 --- /dev/null +++ b/include/linux/timens_offsets.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TIME_OFFSETS_H +#define _LINUX_TIME_OFFSETS_H + +struct timens_offsets { + struct timespec64 monotonic; + struct timespec64 boottime; +}; + +#endif diff --git a/kernel/time_namespace.c b/kernel/time_namespace.c index 8fd8384b7261..394a9e168e7c 100644 --- a/kernel/time_namespace.c +++ b/kernel/time_namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { @@ -47,6 +48,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, { struct time_namespace *ns; struct ucounts *ucounts; + struct page *page; int err; err = -ENOSPC; @@ -59,15 +61,24 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto fail_free; + ns->offsets = page_address(page); + if (old_ns->offsets) + memcpy(ns->offsets, old_ns->offsets, sizeof(struct timens_offsets)); + BUILD_BUG_ON(sizeof(*ns->offsets) > PAGE_SIZE); + err = ns_alloc_inum(&ns->ns); if (err) - goto fail_free; + goto fail_page; ns->ucounts = ucounts; ns->ns.ops = 
&timens_operations; ns->user_ns = get_user_ns(user_ns); return ns; - +fail_page: + free_page((unsigned long)ns->offsets); fail_free: kfree(ns); fail_dec: @@ -95,6 +106,7 @@ void free_time_ns(struct kref *kref) struct time_namespace *ns; ns = container_of(kref, struct time_namespace, kref); + free_page((unsigned long)ns->offsets); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); -- 2.22.0
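To illustrate what the offsets mean in practice (numbers invented for the example), timens_add_monotonic() above simply adds the per-namespace delta to the host reading:

	/* host CLOCK_MONOTONIC reading:        { .tv_sec =   100, .tv_nsec = 500000000 }
	 * namespace monotonic offset:          { .tv_sec = 86400, .tv_nsec =         0 }
	 * value observed inside the namespace: { .tv_sec = 86500, .tv_nsec = 500000000 }
	 */

So a container restored on a freshly booted host can keep presenting a monotonic clock that continues from where it was checkpointed, instead of jumping backwards.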
[PATCHv6 08/36] posix-timers: Use clock_get_ktime() in common_timer_get()
From: Andrei Vagin Now, when the clock_get_ktime() callback exists, the suboptimal timespec64-based conversion can be removed from common_timer_get(). Suggested-by: Thomas Gleixner Signed-off-by: Andrei Vagin Co-developed-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- kernel/time/posix-timers.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 7cf1216050d1..1d41c6a41d63 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -665,7 +665,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; - struct timespec64 ts64; bool sig_none; sig_none = timr->it_sigev_notify == SIGEV_NONE; @@ -683,12 +682,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) return; } - /* -* The timespec64 based conversion is suboptimal, but it's not -* worth to implement yet another callback. -*/ - kc->clock_get_timespec(timr->it_clock, &ts64); - now = timespec64_to_ktime(ts64); + now = kc->clock_get_ktime(timr->it_clock); /* * When a requeue is pending or this is a SIGEV_NONE timer move the -- 2.22.0
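One detail worth spelling out (my reading of the series, consistent with the 04/36 changelog): clock_get_timespec() is the namespace-adjusted accessor backing sys_clock_gettime(), while common_timer_get() needs the time in the root namespace because timer expiries are armed against host clocks. With a dedicated ktime_t callback the monotonic case can stay trivial, presumably along the lines of:

	static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
	{
		return ktime_get();
	}

i.e. no timespec64 round trip and no timens offset applied.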