[PATCH v3 4/4] virtio_balloon: introduce memory scan/reclaim info
Expose memory scan/reclaim information to the host side via virtio balloon device. Now we have a metric to analyze the memory performance: y: counter increases n: counter does not change h: the rate of counter change is high l: the rate of counter change is low OOM: VIRTIO_BALLOON_S_OOM_KILL STALL: VIRTIO_BALLOON_S_ALLOC_STALL ASCAN: VIRTIO_BALLOON_S_SCAN_ASYNC DSCAN: VIRTIO_BALLOON_S_SCAN_DIRECT ARCLM: VIRTIO_BALLOON_S_RECLAIM_ASYNC DRCLM: VIRTIO_BALLOON_S_RECLAIM_DIRECT - OOM[y], STALL[*], ASCAN[*], DSCAN[*], ARCLM[*], DRCLM[*]: the guest runs under really critical memory pressure - OOM[n], STALL[h], ASCAN[*], DSCAN[l], ARCLM[*], DRCLM[l]: the memory allocation stalls due to cgroup, not the global memory pressure. - OOM[n], STALL[h], ASCAN[*], DSCAN[h], ARCLM[*], DRCLM[h]: the memory allocation stalls due to global memory pressure. The performance gets hurt a lot. A high ratio between DRCLM/DSCAN shows quite effective memory reclaiming. - OOM[n], STALL[h], ASCAN[*], DSCAN[h], ARCLM[*], DRCLM[l]: the memory allocation stalls due to global memory pressure. The ratio between DRCLM/DSCAN gets low, the guest OS is thrashing heavily; the serious case leads to poor performance and difficult troubleshooting. For example, sshd may block on memory allocation when accepting new connections, so a user can't log in to a VM via the ssh command. - OOM[n], STALL[n], ASCAN[h], DSCAN[n], ARCLM[l], DRCLM[n]: the low ratio between ARCLM/ASCAN shows that the guest tries to reclaim more memory, but it can't. Once more memory is required in future, it will struggle to reclaim memory. 
Acked-by: David Hildenbrand Signed-off-by: zhenwei pi --- drivers/virtio/virtio_balloon.c | 9 + include/uapi/linux/virtio_balloon.h | 12 ++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index e6229e548832..225662358221 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -340,6 +340,15 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall); + update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_SCAN, + pages_to_bytes(events[PGSCAN_KSWAPD])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_SCAN, + pages_to_bytes(events[PGSCAN_DIRECT])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_RECLAIM, + pages_to_bytes(events[PGSTEAL_KSWAPD])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_RECLAIM, + pages_to_bytes(events[PGSTEAL_DIRECT])); + #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, events[HTLB_BUDDY_PGALLOC]); diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 487b893a160e..ee35a372805d 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -73,7 +73,11 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ #define VIRTIO_BALLOON_S_OOM_KILL 10 /* OOM killer invocations */ #define VIRTIO_BALLOON_S_ALLOC_STALL 11 /* Stall count of memory allocatoin */ -#define VIRTIO_BALLOON_S_NR 12 +#define VIRTIO_BALLOON_S_ASYNC_SCAN 12 /* Amount of memory scanned asynchronously */ +#define VIRTIO_BALLOON_S_DIRECT_SCAN 13 /* Amount of memory scanned directly */ +#define VIRTIO_BALLOON_S_ASYNC_RECLAIM 14 /* Amount of memory reclaimed asynchronously */ +#define VIRTIO_BALLOON_S_DIRECT_RECLAIM 15 /* Amount of memory reclaimed directly */ +#define VIRTIO_BALLOON_S_NR 16 #define 
VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \ VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \ @@ -87,7 +91,11 @@ struct virtio_balloon_config { VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \ VIRTIO_BALLOON_S_NAMES_prefix "oom-kills", \ - VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls" \ + VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls", \ + VIRTIO_BALLOON_S_NAMES_prefix "async-scans", \ + VIRTIO_BALLOON_S_NAMES_prefix "direct-scans", \ + VIRTIO_BALLOON_S_NAMES_prefix "async-reclaims", \ + VIRTIO_BALLOON_S_NAMES_prefix "direct-reclaims" \ } #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("") -- 2.34.1
[PATCH v3 3/4] virtio_balloon: introduce memory allocation stall counter
Memory allocation stall counter represents the performance/latency of memory allocation, expose this counter to the host side by virtio balloon device via out-of-bound way. Acked-by: David Hildenbrand Signed-off-by: zhenwei pi --- drivers/virtio/virtio_balloon.c | 8 include/uapi/linux/virtio_balloon.h | 6 -- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index f7a47eaa0936..e6229e548832 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -322,6 +322,8 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) { unsigned long events[NR_VM_EVENT_ITEMS]; unsigned int idx = 0; + unsigned int zid; + unsigned long stall = 0; all_vm_events(events); update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, @@ -332,6 +334,12 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_OOM_KILL, events[OOM_KILL]); + /* sum all the stall events */ + for (zid = 0; zid < MAX_NR_ZONES; zid++) + stall += events[ALLOCSTALL_NORMAL - ZONE_NORMAL + zid]; + + update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall); + #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, events[HTLB_BUDDY_PGALLOC]); diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index b17bbe033697..487b893a160e 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -72,7 +72,8 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ #define VIRTIO_BALLOON_S_OOM_KILL 10 /* OOM killer invocations */ -#define VIRTIO_BALLOON_S_NR 11 +#define VIRTIO_BALLOON_S_ALLOC_STALL 11 /* Stall count of memory allocatoin */ +#define VIRTIO_BALLOON_S_NR 
12 #define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \ VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \ @@ -85,7 +86,8 @@ struct virtio_balloon_config { VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \ - VIRTIO_BALLOON_S_NAMES_prefix "oom-kills" \ + VIRTIO_BALLOON_S_NAMES_prefix "oom-kills", \ + VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls" \ } #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("") -- 2.34.1
[PATCH v3 2/4] virtio_balloon: introduce oom-kill invocations
When the guest OS runs under critical memory pressure, the guest starts to kill processes. A guest monitor agent may scan 'oom_kill' from /proc/vmstat, and reports the OOM KILL event. However, the agent may be killed and we will lose this critical event (and the later events). For now we can also grep for magic words in guest kernel log from host side. Rather than this unstable way, virtio balloon reports OOM-KILL invocations instead. Acked-by: David Hildenbrand Signed-off-by: zhenwei pi --- drivers/virtio/virtio_balloon.c | 1 + include/uapi/linux/virtio_balloon.h | 6 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1710e3098ecd..f7a47eaa0936 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -330,6 +330,7 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_OOM_KILL, events[OOM_KILL]); #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index ddaa45e723c4..b17bbe033697 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -71,7 +71,8 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */ #define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ -#define VIRTIO_BALLOON_S_NR 10 +#define VIRTIO_BALLOON_S_OOM_KILL 10 /* OOM killer invocations */ +#define VIRTIO_BALLOON_S_NR 11 #define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \ VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \ @@ -83,7 +84,8 @@ struct virtio_balloon_config 
{ VIRTIO_BALLOON_S_NAMES_prefix "available-memory", \ VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \ - VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures" \ + VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \ + VIRTIO_BALLOON_S_NAMES_prefix "oom-kills" \ } #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("") -- 2.34.1
[PATCH v3 1/4] virtio_balloon: separate vm events into a function
All the VM events related statistics have dependence on 'CONFIG_VM_EVENT_COUNTERS', separate these events into a function to make code clean. Then we can remove 'CONFIG_VM_EVENT_COUNTERS' from 'update_balloon_stats'. Signed-off-by: zhenwei pi --- drivers/virtio/virtio_balloon.c | 43 ++--- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1f5b3dd31fcf..1710e3098ecd 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -316,34 +316,49 @@ static inline void update_stat(struct virtio_balloon *vb, int idx, #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) -static unsigned int update_balloon_stats(struct virtio_balloon *vb) +#ifdef CONFIG_VM_EVENT_COUNTERS +/* Return the number of entries filled by vm events */ +static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) { unsigned long events[NR_VM_EVENT_ITEMS]; - struct sysinfo i; unsigned int idx = 0; - long available; - unsigned long caches; all_vm_events(events); - si_meminfo(); - - available = si_mem_available(); - caches = global_node_page_state(NR_FILE_PAGES); - -#ifdef CONFIG_VM_EVENT_COUNTERS update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, - pages_to_bytes(events[PSWPIN])); + pages_to_bytes(events[PSWPIN])); update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, - pages_to_bytes(events[PSWPOUT])); + pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); + #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, events[HTLB_BUDDY_PGALLOC]); update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, events[HTLB_BUDDY_PGALLOC_FAIL]); -#endif -#endif +#endif /* CONFIG_HUGETLB_PAGE */ + + return idx; +} +#else /* CONFIG_VM_EVENT_COUNTERS */ +static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) +{ + return 0; +} +#endif /* 
CONFIG_VM_EVENT_COUNTERS */ + +static unsigned int update_balloon_stats(struct virtio_balloon *vb) +{ + struct sysinfo i; + unsigned int idx; + long available; + unsigned long caches; + + idx = update_balloon_vm_stats(vb); + + si_meminfo(&i); + available = si_mem_available(); + caches = global_node_page_state(NR_FILE_PAGES); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, pages_to_bytes(i.freeram)); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, -- 2.34.1
[PATCH v3 0/4] Improve memory statistics for virtio balloon
Hi, v2 -> v3: - A few coding style changes in '[PATCH v3 1/4] virtio_balloon: separate vm events into a function' v1 -> v2: - Add a new patch 'virtio_balloon: separate vm events into a function' to avoid any compiler warnings (unused stack variable on CONFIG_VM_EVENT_COUNTERS=n) - Suggested by David, use a loop 'for (zid = 0; zid < MAX_NR_ZONES; zid++)' to obtain all the stall events. RFC -> v1: - several text changes: oom-kill -> oom-kills, SCAN_ASYNC -> ASYNC_SCAN. - move vm events codes into '#ifdef CONFIG_VM_EVENT_COUNTERS' RFC version: Link: https://lore.kernel.org/lkml/20240415084113.1203428-1-pizhen...@bytedance.com/T/#m1898963b3c27a989b1123db475135c3ca687ca84 zhenwei pi (4): virtio_balloon: separate vm events into a function virtio_balloon: introduce oom-kill invocations virtio_balloon: introduce memory allocation stall counter virtio_balloon: introduce memory scan/reclaim info drivers/virtio/virtio_balloon.c | 61 ++--- include/uapi/linux/virtio_balloon.h | 16 +++- 2 files changed, 61 insertions(+), 16 deletions(-) -- 2.34.1
Re: [PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()
On 4/23/24 06:46, Michael S. Tsirkin wrote: On Mon, Apr 08, 2024 at 02:15:24PM +1000, Gavin Shan wrote: On 3/30/24 19:02, Gavin Shan wrote: On 3/28/24 19:31, Michael S. Tsirkin wrote: On Thu, Mar 28, 2024 at 10:21:49AM +1000, Gavin Shan wrote: All the callers of vhost_get_avail_idx() are concerned to the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. With it, the callers needn't to worry about the memory barrier. Suggested-by: Michael S. Tsirkin Signed-off-by: Gavin Shan Previous patches are ok. This one I feel needs more work - first more code such as sanity checking should go into this function, second there's actually a difference between comparing to last_avail_idx and just comparing to the previous value of avail_idx. I will pick patches 1-2 and post a cleanup on top so you can take a look, ok? Thanks, Michael. It's fine to me. A kindly ping. If it's ok to you, could you please merge PATCH[1-2]? Our downstream 9.4 need the fixes, especially for NVidia's grace-hopper and grace-grace platforms. For PATCH[3], I also can help with the improvement if you don't have time for it. Please let me know. 1-2 are upstream go ahead and post the cleanup. Michael, a cleanup series has been sent for review. https://lore.kernel.org/virtualization/20240423032407.262329-1-gs...@redhat.com/T/#t Thanks, Gavin
[PATCH 4/4] vhost: Reformat vhost_{get, put}_user()
Reformat the macros to use tab as the terminator for each line so that it looks clean. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 60 +-- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index a3de9325175f..3be19877f9df 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1207,21 +1207,22 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, return __vhost_get_user_slow(vq, addr, size, type); } -#define vhost_put_user(vq, x, ptr) \ -({ \ - int ret; \ - if (!vq->iotlb) { \ - ret = __put_user(x, ptr); \ - } else { \ - __typeof__(ptr) to = \ +#define vhost_put_user(vq, x, ptr) \ +({ \ + int ret;\ + if (!vq->iotlb) { \ + ret = __put_user(x, ptr); \ + } else {\ + __typeof__(ptr) to =\ (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ - sizeof(*ptr), VHOST_ADDR_USED); \ - if (to != NULL) \ - ret = __put_user(x, to); \ - else \ - ret = -EFAULT; \ - } \ - ret; \ + sizeof(*ptr), \ + VHOST_ADDR_USED); \ + if (to != NULL) \ + ret = __put_user(x, to);\ + else\ + ret = -EFAULT; \ + } \ + ret;\ }) static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) @@ -1252,22 +1253,21 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) >used->idx); } -#define vhost_get_user(vq, x, ptr, type) \ -({ \ - int ret; \ - if (!vq->iotlb) { \ - ret = __get_user(x, ptr); \ - } else { \ - __typeof__(ptr) from = \ - (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ - sizeof(*ptr), \ - type); \ - if (from != NULL) \ - ret = __get_user(x, from); \ - else \ - ret = -EFAULT; \ - } \ - ret; \ +#define vhost_get_user(vq, x, ptr, type) \ +({ \ + int ret;\ + if (!vq->iotlb) { \ + ret = __get_user(x, ptr); \ + } else {\ + __typeof__(ptr) from = \ + (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ + sizeof(*ptr), type);\ + if (from != NULL) \ + ret = __get_user(x, from); \ + else\ + ret = -EFAULT; \ + } \ + ret;\ }) #define vhost_get_avail(vq, x, ptr) \ -- 2.44.0
[PATCH 3/4] vhost: Improve vhost_get_avail_head()
Improve vhost_get_avail_head() so that the head or errno is returned. With it, the relevant sanity checks are squeezed to vhost_get_avail_head() and vhost_get_vq_desc() is further simplified. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 43 +++ 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index b3adc0bc9e72..a3de9325175f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1320,11 +1320,27 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) return 0; } -static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, - __virtio16 *head, int idx) +static inline int vhost_get_avail_head(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *head, - >avail->ring[idx & (vq->num - 1)]); + __virtio16 head; + int r; + + r = vhost_get_avail(vq, head, + >avail->ring[vq->last_avail_idx & (vq->num - 1)]); + if (unlikely(r)) { + vq_err(vq, "Failed to read head: idx %u address %p\n", + vq->last_avail_idx, + >avail->ring[vq->last_avail_idx % vq->num]); + return r; + } + + r = vhost16_to_cpu(vq, head); + if (unlikely(r >= vq->num)) { + vq_err(vq, "Invalid head %d (%u)\n", r, vq->num); + return -EINVAL; + } + + return r; } static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, @@ -2522,7 +2538,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - __virtio16 ring_head; int ret, access; if (vq->avail_idx == vq->last_avail_idx) { @@ -2539,21 +2554,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. 
*/ - if (unlikely(vhost_get_avail_head(vq, _head, vq->last_avail_idx))) { - vq_err(vq, "Failed to read head: idx %d address %p\n", - vq->last_avail_idx, - >avail->ring[vq->last_avail_idx % vq->num]); - return -EFAULT; - } - - head = vhost16_to_cpu(vq, ring_head); - - /* If their number is silly, that's an error. */ - if (unlikely(head >= vq->num)) { - vq_err(vq, "Guest says index %u > %u is available", - head, vq->num); - return -EINVAL; - } + head = vhost_get_avail_head(vq); + if (unlikely(head < 0)) + return head; /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; -- 2.44.0
[PATCH 2/4] vhost: Improve vhost_get_avail_idx() with smp_rmb()
All the callers of vhost_get_avail_idx() are concerned to the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. With it, the callers needn't to worry about the memory barrier. No functional change intended. Suggested-by: Michael S. Tsirkin Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 91 --- 1 file changed, 34 insertions(+), 57 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index ef7942103232..b3adc0bc9e72 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,34 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(>vqs[i]->mutex); } -static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, - __virtio16 *idx) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *idx, >avail->idx); + __virtio16 avail_idx; + int r; + + r = vhost_get_avail(vq, avail_idx, >avail->idx); + if (unlikely(r)) { + vq_err(vq, "Failed to access avail idx at %p\n", + >avail->idx); + return r; + } + + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Ensure the available ring entry read happens +* before the avail_idx read when the avail_idx +* is advanced. +*/ + smp_rmb(); + } + + if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { + vq_err(vq, "Invalid avail index change from %u to %u", + vq->last_avail_idx, vq->avail_idx); + return -EINVAL; + } + + return 0; } static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, @@ -2498,35 +2522,19 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - __virtio16 avail_idx; __virtio16 ring_head; int ret, access; - /* Check it isn't doing very strange things with descriptor numbers. 
*/ if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail_idx(vq, _idx))) { - vq_err(vq, "Failed to access avail idx at %p\n", - >avail->idx); - return -EFAULT; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - - if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved avail index from %u to %u", - vq->last_avail_idx, vq->avail_idx); - return -EFAULT; - } + ret = vhost_get_avail_idx(vq); + if (unlikely(ret)) + return ret; /* If there's nothing new since last we looked, return * invalid. */ if (vq->avail_idx == vq->last_avail_idx) return vq->num; - - /* Only get avail ring entries after they have been -* exposed by guest. -*/ - smp_rmb(); } /* Grab the next descriptor number they're advertising, and increment @@ -2787,35 +2795,19 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); /* return true if we're sure that avaiable ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; - int r; - if (vq->avail_idx != vq->last_avail_idx) return false; - r = vhost_get_avail_idx(vq, _idx); - if (unlikely(r)) - return false; - - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - if (vq->avail_idx != vq->last_avail_idx) { - /* Since we have updated avail_idx, the following -* call to vhost_get_vq_desc() will read available -* ring entries. Make sure that read happens after -* the avail_idx read. -*/ - smp_rmb(); + if (unlikely(vhost_get_avail_idx(vq))) return false; - } - return true; + return vq->avail_idx == vq->last_avail_idx; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); /* OK, now we need to know about added descriptors. */ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r; if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) @@ -2839,25 +2831,10 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) /* They could have slipped one in as we were
[PATCH 1/4] vhost: Drop variable last_avail_idx in vhost_get_vq_desc()
The local variable @last_avail_idx is equivalent to vq->last_avail_idx. So the code can be simplified a bit by dropping the local variable @last_avail_idx. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 8995730ce0bf..ef7942103232 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2498,14 +2498,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx; __virtio16 avail_idx; __virtio16 ring_head; int ret, access; /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail_idx = vq->last_avail_idx; - if (vq->avail_idx == vq->last_avail_idx) { if (unlikely(vhost_get_avail_idx(vq, _idx))) { vq_err(vq, "Failed to access avail idx at %p\n", @@ -2514,16 +2511,16 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, } vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { + if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { vq_err(vq, "Guest moved avail index from %u to %u", - last_avail_idx, vq->avail_idx); + vq->last_avail_idx, vq->avail_idx); return -EFAULT; } /* If there's nothing new since last we looked, return * invalid. */ - if (vq->avail_idx == last_avail_idx) + if (vq->avail_idx == vq->last_avail_idx) return vq->num; /* Only get avail ring entries after they have been @@ -2534,10 +2531,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. 
*/ - if (unlikely(vhost_get_avail_head(vq, _head, last_avail_idx))) { + if (unlikely(vhost_get_avail_head(vq, _head, vq->last_avail_idx))) { vq_err(vq, "Failed to read head: idx %d address %p\n", - last_avail_idx, - >avail->ring[last_avail_idx % vq->num]); + vq->last_avail_idx, + >avail->ring[vq->last_avail_idx % vq->num]); return -EFAULT; } -- 2.44.0
[PATCH 0/4] vhost: Cleanup
This is suggested by Michael S. Tsirkin according to [1] and the goal is to apply smp_rmb() inside vhost_get_avail_idx() if needed. With it, the caller of the function needn't to worry about memory barriers. Since we're here, other cleanups are also applied. [1] https://lore.kernel.org/virtualization/20240327075940-mutt-send-email-...@kernel.org/ PATCH[1] drops the local variable @last_avail_idx since it's equivalent to vq->last_avail_idx PATCH[2] improves vhost_get_avail_idx() so that smp_rmb() is applied if needed. Besides, the sanity checks on the retrieved available queue index are also squeezed to vhost_get_avail_idx() PATCH[3] improves vhost_get_avail_head(), similar to what we're doing for vhost_get_avail_idx(), so that the relevant sanity checks on the head are squeezed to vhost_get_avail_head() PATCH[4] Reformat vhost_{get, put}_user() by using tab instead of space as the terminator for each line Gavin Shan (4): vhost: Drop variable last_avail_idx in vhost_get_vq_desc() vhost: Improve vhost_get_avail_idx() with smp_rmb() vhost: Improve vhost_get_avail_head() vhost: Reformat vhost_{get, put}_user() drivers/vhost/vhost.c | 199 +++--- 1 file changed, 88 insertions(+), 111 deletions(-) -- 2.44.0
Re: [PATCH v5 3/5] vduse: Add function to get/free the pages for reconnection
On Tue, Apr 23, 2024 at 4:05 AM Michael S. Tsirkin wrote: > > On Thu, Apr 18, 2024 at 08:57:51AM +0800, Jason Wang wrote: > > On Wed, Apr 17, 2024 at 5:29 PM Michael S. Tsirkin wrote: > > > > > > On Fri, Apr 12, 2024 at 09:28:23PM +0800, Cindy Lu wrote: > > > > Add the function vduse_alloc_reconnnect_info_mem > > > > and vduse_alloc_reconnnect_info_mem > > > > These functions allow vduse to allocate and free memory for reconnection > > > > information. The amount of memory allocated is vq_num pages. > > > > Each VQS will map its own page where the reconnection information will > > > > be saved > > > > > > > > Signed-off-by: Cindy Lu > > > > --- > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 40 ++ > > > > 1 file changed, 40 insertions(+) > > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c > > > > b/drivers/vdpa/vdpa_user/vduse_dev.c > > > > index ef3c9681941e..2da659d5f4a8 100644 > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > > > @@ -65,6 +65,7 @@ struct vduse_virtqueue { > > > > int irq_effective_cpu; > > > > struct cpumask irq_affinity; > > > > struct kobject kobj; > > > > + unsigned long vdpa_reconnect_vaddr; > > > > }; > > > > > > > > struct vduse_dev; > > > > @@ -1105,6 +1106,38 @@ static void vduse_vq_update_effective_cpu(struct > > > > vduse_virtqueue *vq) > > > > > > > > vq->irq_effective_cpu = curr_cpu; > > > > } > > > > +static int vduse_alloc_reconnnect_info_mem(struct vduse_dev *dev) > > > > +{ > > > > + unsigned long vaddr = 0; > > > > + struct vduse_virtqueue *vq; > > > > + > > > > + for (int i = 0; i < dev->vq_num; i++) { > > > > + /*page 0~ vq_num save the reconnect info for vq*/ > > > > + vq = dev->vqs[i]; > > > > + vaddr = get_zeroed_page(GFP_KERNEL); > > > > > > > > > I don't get why you insist on stealing kernel memory for something > > > that is just used by userspace to store data for its own use. 
> > > Userspace does not lack ways to persist data, for example, > > > create a regular file anywhere in the filesystem. > > > > Good point. So the motivation here is to: > > > > 1) be self contained, no dependency for high speed persist data > > storage like tmpfs > > No idea what this means. I mean a regular file may slow down the datapath performance, so usually the application will try to use tmpfs and other which is a dependency for implementing the reconnection. > > > 2) standardize the format in uAPI which allows reconnection from > > arbitrary userspace, unfortunately, such effort was removed in new > > versions > > And I don't see why that has to live in the kernel tree either. I can't find a better place, any idea? Thanks > > > If the above doesn't make sense, we don't need to offer those pages by > > VDUSE. > > > > Thanks > > > > > > > > > > > > > > > > > + if (vaddr == 0) > > > > + return -ENOMEM; > > > > + > > > > + vq->vdpa_reconnect_vaddr = vaddr; > > > > + } > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +static int vduse_free_reconnnect_info_mem(struct vduse_dev *dev) > > > > +{ > > > > + struct vduse_virtqueue *vq; > > > > + > > > > + for (int i = 0; i < dev->vq_num; i++) { > > > > + vq = dev->vqs[i]; > > > > + > > > > + if (vq->vdpa_reconnect_vaddr) > > > > + free_page(vq->vdpa_reconnect_vaddr); > > > > + vq->vdpa_reconnect_vaddr = 0; > > > > + } > > > > + > > > > + return 0; > > > > +} > > > > > > > > static long vduse_dev_ioctl(struct file *file, unsigned int cmd, > > > > unsigned long arg) > > > > @@ -1672,6 +1705,8 @@ static int vduse_destroy_dev(char *name) > > > > mutex_unlock(>lock); > > > > return -EBUSY; > > > > } > > > > + vduse_free_reconnnect_info_mem(dev); > > > > + > > > > dev->connected = true; > > > > mutex_unlock(>lock); > > > > > > > > @@ -1855,12 +1890,17 @@ static int vduse_create_dev(struct > > > > vduse_dev_config *config, > > > > ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num); > > > > if (ret) > 
> > > goto err_vqs; > > > > + ret = vduse_alloc_reconnnect_info_mem(dev); > > > > + if (ret < 0) > > > > + goto err_mem; > > > > > > > > __module_get(THIS_MODULE); > > > > > > > > return 0; > > > > err_vqs: > > > > device_destroy(_class, MKDEV(MAJOR(vduse_major), > > > > dev->minor)); > > > > +err_mem: > > > > + vduse_free_reconnnect_info_mem(dev); > > > > err_dev: > > > > idr_remove(_idr, dev->minor); > > > > err_idr: > > > > -- > > > > 2.43.0 > > > >
Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent
On Tue, Apr 23, 2024 at 10:14 AM Jason Xing wrote: > > Hi Simon, > > On Tue, Apr 23, 2024 at 2:28 AM Simon Horman wrote: > > > > On Mon, Apr 22, 2024 at 11:01:03AM +0800, Jason Xing wrote: > > > > ... > > > > > diff --git a/include/net/rstreason.h b/include/net/rstreason.h > > > > ... > > > > > +/** > > > + * There are three parts in order: > > > + * 1) reset reason in MPTCP: only for MPTCP use > > > + * 2) skb drop reason: relying on drop reasons for such as passive reset > > > + * 3) independent reset reason: such as active reset reasons > > > + */ > > > > Hi Jason, > > > > A minor nit from my side. > > > > '/**' denotes the beginning of a Kernel doc, > > but other than that, this comment is not a Kernel doc. > > > > FWIIW, I would suggest providing a proper Kernel doc for enum sk_rst_reason. > > But another option would be to simply make this a normal comment, > > starting with "/* There are" > > Thanks Simon. I'm trying to use the kdoc way to make it right :) > > How about this one: > /** > * enum sk_rst_reason - the reasons of socket reset > * > * The reason of skb drop, which is used in DCCP/TCP/MPTCP protocols. s/skb drop/sk reset/ Sorry, I cannot withdraw my previous email in time. > * > * There are three parts in order: > * 1) skb drop reasons: relying on drop reasons for such as passive > reset > * 2) independent reset reasons: such as active reset reasons > * 3) reset reasons in MPTCP: only for MPTCP use > */ > ? > > I chose to mimic what enum skb_drop_reason does in the > include/net/dropreason-core.h file. > > > +enum sk_rst_reason { > > + /** > > +* Copy from include/uapi/linux/mptcp.h. > > +* These reset fields will not be changed since they adhere to > > +* RFC 8684. So do not touch them. I'm going to list each definition > > +* of them respectively. > > +*/ > > Thanks to you, I found another similar point where I smell something > wrong as in the above code. I'm going to replace '/**' with '/*' since > it's only a comment, not a kdoc. 
> > Thanks, > Jason
Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent
Hi Simon, On Tue, Apr 23, 2024 at 2:28 AM Simon Horman wrote: > > On Mon, Apr 22, 2024 at 11:01:03AM +0800, Jason Xing wrote: > > ... > > > diff --git a/include/net/rstreason.h b/include/net/rstreason.h > > ... > > > +/** > > + * There are three parts in order: > > + * 1) reset reason in MPTCP: only for MPTCP use > > + * 2) skb drop reason: relying on drop reasons for such as passive reset > > + * 3) independent reset reason: such as active reset reasons > > + */ > > Hi Jason, > > A minor nit from my side. > > '/**' denotes the beginning of a Kernel doc, > but other than that, this comment is not a Kernel doc. > > FWIIW, I would suggest providing a proper Kernel doc for enum sk_rst_reason. > But another option would be to simply make this a normal comment, > starting with "/* There are" Thanks Simon. I'm trying to use the kdoc way to make it right :) How about this one: /** * enum sk_rst_reason - the reasons of socket reset * * The reason of skb drop, which is used in DCCP/TCP/MPTCP protocols. * * There are three parts in order: * 1) skb drop reasons: relying on drop reasons for such as passive reset * 2) independent reset reasons: such as active reset reasons * 3) reset reasons in MPTCP: only for MPTCP use */ ? I chose to mimic what enum skb_drop_reason does in the include/net/dropreason-core.h file. > +enum sk_rst_reason { > + /** > +* Copy from include/uapi/linux/mptcp.h. > +* These reset fields will not be changed since they adhere to > +* RFC 8684. So do not touch them. I'm going to list each definition > +* of them respectively. > +*/ Thanks to you, I found another similar point where I smell something wrong as in the above code. I'm going to replace '/**' with '/*' since it's only a comment, not a kdoc. Thanks, Jason
回复: [PATCH v5] vp_vdpa: don't allocate unused msix vectors
On Wed, Apr 10, 2024 at 11:30:20AM +0800, lyx634449800 wrote: > From: Yuxue Liu > > When there is a ctlq and it doesn't require interrupt callbacks,the > original method of calculating vectors wastes hardware msi or msix > resources as well as system IRQ resources. > > When conducting performance testing using testpmd in the guest os, it > was found that the performance was lower compared to directly using > vfio-pci to passthrough the device > > In scenarios where the virtio device in the guest os does not utilize > interrupts, the vdpa driver still configures the hardware's msix > vector. Therefore, the hardware still sends interrupts to the host os. >I just have a question on this part. How come hardware sends interrupts does >not guest driver disable them? 1:Assuming the guest OS's Virtio device is using PMD mode, QEMU sets the call fd to -1 2:On the host side, the vhost_vdpa program will set vp_vdpa->vring[i].cb.callback to invalid 3:Before the modification, the vp_vdpa_request_irq function does not check whether vp_vdpa->vring[i].cb.callback is valid. Instead, it enables the hardware's MSIX interrupts based on the number of queues of the device - Original Message - From: Michael S. Tsirkin m...@redhat.com Sent: April 22, 2024 20:09 To: Gavin Liu gavin@jaguarmicro.com Cc: jasow...@redhat.com; Angus Chen angus.c...@jaguarmicro.com; virtualizat...@lists.linux.dev; xuanz...@linux.alibaba.com; linux-kernel@vger.kernel.org; Heng Qi hen...@linux.alibaba.com Subject: Re: [PATCH v5] vp_vdpa: don't allocate unused msix vectors External Mail: This email originated from OUTSIDE of the organization! Do not click links, open attachments or provide ANY information unless you recognize the sender and know the content is safe. 
On Wed, Apr 10, 2024 at 11:30:20AM +0800, lyx634449800 wrote: > From: Yuxue Liu > > When there is a ctlq and it doesn't require interrupt callbacks,the > original method of calculating vectors wastes hardware msi or msix > resources as well as system IRQ resources. > > When conducting performance testing using testpmd in the guest os, it > was found that the performance was lower compared to directly using > vfio-pci to passthrough the device > > In scenarios where the virtio device in the guest os does not utilize > interrupts, the vdpa driver still configures the hardware's msix > vector. Therefore, the hardware still sends interrupts to the host os. I just have a question on this part. How come hardware sends interrupts does not guest driver disable them? > Because of this unnecessary > action by the hardware, hardware performance decreases, and it also > affects the performance of the host os. > > Before modification:(interrupt mode) > 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 > 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 > 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config > > After modification:(interrupt mode) > 32: 0 0 1 7 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-0 > 33: 36 0 3 0 PCI-MSI 32769-edge vp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edge vp-vdpa[:00:02.0]-config > > Before modification:(virtio pmd mode for guest os) > 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 > 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 > 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config > > After modification:(virtio pmd mode for guest os) > 32: 0 0 0 0 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-config > > To verify the use of the virtio PMD mode in the guest operating > system, the following patch needs to be applied to QEMU: > https://lore.kernel.org/all/20240408073311.2049-1-yuxue.liu@jaguarmicr > o.com > > Signed-off-by: Yuxue Liu > Acked-by: 
Jason Wang > Reviewed-by: Heng Qi > --- > V5: modify the description of the printout when an exception occurs > V4: update the title and assign values to uninitialized variables > V3: delete unused variables and add validation records > V2: fix when allocating IRQs, scan all queues > > drivers/vdpa/virtio_pci/vp_vdpa.c | 22 -- > 1 file changed, 16 insertions(+), 6 deletions(-) > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c > b/drivers/vdpa/virtio_pci/vp_vdpa.c > index df5f4a3bccb5..8de0224e9ec2 100644 > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c > @@ -160,7 +160,13 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) > struct pci_dev *pdev = mdev->pci_dev; > int i, ret, irq; > int queues = vp_vdpa->queues; > - int vectors = queues + 1; > + int vectors = 1; > + int msix_vec = 0; > + > + for (i = 0; i < queues; i++) { > + if (vp_vdpa->vring[i].cb.callback) > +
Re: [syzbot] [virt?] [net?] KMSAN: uninit-value in vsock_assign_transport (2)
Hello, syzbot has tested the proposed patch and the reproducer did not trigger any issue: Reported-and-tested-by: syzbot+6c21aeb59d0e82eb2...@syzkaller.appspotmail.com Tested on: commit: bcc17a06 vhost/vsock: always initialize seqpacket_allow git tree: https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git console output: https://syzkaller.appspot.com/x/log.txt?x=12b58abb18 kernel config: https://syzkaller.appspot.com/x/.config?x=87a805e655619c64 dashboard link: https://syzkaller.appspot.com/bug?extid=6c21aeb59d0e82eb2782 compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 2.40 Note: no patches were applied. Note: testing is done by a robot and is best-effort only.
Re: [PATCH 1/1] genirq/cpuhotplug: retry with online CPUs on irq_do_set_affinity failure
On Mon, Apr 22 2024 at 16:09, Dongli Zhang wrote: > On 4/22/24 13:58, Thomas Gleixner wrote: >> On Thu, Apr 18 2024 at 18:33, Dongli Zhang wrote: > Would you mind suggesting if the below commit message is fine to you? > > > genirq/cpuhotplug: retry with cpu_online_mask when irq_do_set_affinity return > -ENOSPC > > When a CPU goes offline, the interrupts pinned to that CPU are > re-configured. > > Its managed interrupts undergo either migration to other CPUs or shutdown > if all CPUs listed in the affinity are offline. This patch doesn't affect > managed interrupts. > > For regular interrupts, they are migrated to other selected online CPUs. > The target CPUs are chosen from either desc->pending_mask (suppose > CONFIG_GENERIC_PENDING_IRQ) or d->common->affinity (suppose CONFIG_SMP). > The cpu_online_mask is used as target CPUs only when CPUs in both > desc->pending_mask and d->common->affinity are offline. > > However, there is a bad corner case, when desc->pending_mask or > d->common->affinity is selected as the target cpumask, but none of their > CPUs has any available vectors. Up to here it's fine. > As a result, an -ENOSPC error happens: > > "IRQ151: set affinity failed(-28)." > > This is from the debugfs. The allocation fails although other online CPUs > (except CPU=2) have many free vectors. The debugfs output is not really providing more information than the last sentence. It just occupies space :) > The steps to reproduce the issue are in [1]. The core idea is: > > 1. Create a KVM guest with many virtio-net PCI devices, each configured > with a very large number of queues/vectors. > > 2. Set the affinity of all virtio-net interrupts to "2,3". That makes absolutely no sense at all. :) But yes, I can see the non-real world problem with that. > For regular interrupts, if irq_do_set_affinity() returns -ENOSPC, retry it > with all online CPUs. 
The issue does not happen for managed interrupts > because the vectors are always reserved (in cm->managed_map) before the CPU > offline operation. > > [1] > https://lore.kernel.org/all/20240419013322.58500-1-dongli.zh...@oracle.com/ The reproduction instructions are just additional information and not necessarily change log material. So I'd just say after the above: > However, there is a bad corner case, when desc->pending_mask or > d->common->affinity is selected as the target cpumask, but none of their > CPUs has any available vectors. In this case the migration fails and the device interrupt becomes stale. This is not any different from the case where the affinity mask does not contain any online CPU, but there is no fallback operation for this. Instead of giving up retry the migration attempt with the online CPU mask if the interrupt is not managed as managed interrupts cannot be affected by this problem. Hmm? > I will change it to a single line. > > Would you mind suggesting which is preferred? !cpumask_equal(affinity, > cpu_online_mask) or (affinity != cpu_online_mask)? If at all you want !cpumask_subset(cpu_online_mask, affinity), but as this is a corner case 'affinity != cpu_online_mask' should be good enough. Thanks, tglx
Re: [PATCH v9 5/9] clk: mmp: Add Marvell PXA1908 clock driver
Quoting Duje Mihanović (2024-04-20 06:32:56) > On 4/20/24 00:24, Stephen Boyd wrote: > > Quoting Duje Mihanović (2024-04-19 07:31:14) > >> On Friday, April 12, 2024 4:57:09 AM GMT+2 Stephen Boyd wrote: > >>> Quoting Duje Mihanović (2024-04-11 03:15:34) > >>> > On 4/11/2024 10:00 AM, Stephen Boyd wrote: > > Is there a reason this file can't be a platform driver? > > Not that I know of, I did it like this only because the other in-tree > MMP clk drivers do so. I guess the initialization should look like any > of the qcom GCC drivers then? > >>> > >>> Yes. > >> > >> With the entire clock driver code in one file this is quite messy as I also > >> needed to add module_init and module_exit functions to (un)register each > >> platform driver, presumably because the module_platform_driver macro > >> doesn't > >> work with multiple platform drivers in one module. If I split up the driver > >> code for each clock controller block into its own file (such as clk-of- > >> pxa1908-apbc.c) as I believe is the best option, should the commits be > >> split > >> up accordingly as well? > > > > Sure. Why is 'of' in the name? Maybe that is unnecessary? > > That seems to be a historical leftover from when Marvell was just adding > DT support to the ARM32 MMP SoCs which Rob followed along with in the > PXA1928 clk driver and so have I. Should I drop it then as Marvell has > in the PXA1908 vendor kernel? > Sounds good to me.
Re: [PATCH 1/2] tracing/user_events: Fix non-spaced field matching
On Mon, 22 Apr 2024 14:55:25 -0700 Beau Belgrave wrote: > On Sat, Apr 20, 2024 at 09:50:52PM +0900, Masami Hiramatsu wrote: > > On Fri, 19 Apr 2024 14:13:34 -0700 > > Beau Belgrave wrote: > > > > > On Fri, Apr 19, 2024 at 11:33:05AM +0900, Masami Hiramatsu wrote: > > > > On Tue, 16 Apr 2024 22:41:01 + > > > > Beau Belgrave wrote: > > *SNIP* > > > > > nit: This loop can be simpler, because we are sure fixed has enough > > > > length; > > > > > > > > /* insert a space after ';' if there is no space. */ > > > > while(*args) { > > > > *pos = *args++; > > > > if (*pos++ == ';' && !isspace(*args)) > > > > *pos++ = ' '; > > > > } > > > > > > > > > > I was worried that if count_semis_no_space() ever had different logic > > > (maybe after this commit) that it could cause an overflow if the count > > > was wrong, etc. > > > > > > I don't have an issue making it shorter, but I was trying to be more on > > > the safe side, since this isn't a fast path (event register). > > > > OK, anyway current code looks correct. But note that I don't think > > "pos++; len--;" is safer, since it is not atomic. This pattern > > easily loose "len--;" in my experience. So please carefully use it ;) > > > > I'll stick with your loop. Perhaps others will chime in on the v2 and > state a stronger opinion. > > You scared me with the atomic comment, I went back and looked at all the > paths for this. In the user_events IOCTL the buffer is copied from user > to kernel, so it cannot change (and no other threads access it). I also > checked trace_parse_run_command() which is the same. So at least in this > context the non-atomic part is OK. Oh, sorry if I scared you. I've seen bugs get introduced into loops like this many times (while updating the code), so I try to keep it simple. I'm sure that your code has no bugs. Thank you, -- Masami Hiramatsu (Google)
Re: [PATCH 1/1] genirq/cpuhotplug: retry with online CPUs on irq_do_set_affinity failure
Hi Thomas, On 4/22/24 13:58, Thomas Gleixner wrote: > On Thu, Apr 18 2024 at 18:33, Dongli Zhang wrote: > >> When a CPU is offline, its IRQs may migrate to other CPUs. For managed >> IRQs, they are migrated, or shutdown (if all CPUs of the managed IRQ >> affinity are offline). For regular IRQs, there will only be a >> migration. > > Please write out interrupts. There is enough space for it and IRQ is > just not a regular word. I will use "interrupts". > >> The migrate_one_irq() first uses pending_mask or affinity_mask of the IRQ. >> >> 104 if (irq_fixup_move_pending(desc, true)) >> 105 affinity = irq_desc_get_pending_mask(desc); >> 106 else >> 107 affinity = irq_data_get_affinity_mask(d); >> >> The migrate_one_irq() may use all online CPUs, if all CPUs in >> pending_mask/affinity_mask are already offline. >> >> 113 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { >> 114 /* >> 115 * If the interrupt is managed, then shut it down and >> leave >> 116 * the affinity untouched. >> 117 */ >> 118 if (irqd_affinity_is_managed(d)) { >> 119 irqd_set_managed_shutdown(d); >> 120 irq_shutdown_and_deactivate(desc); >> 121 return false; >> 122 } >> 123 affinity = cpu_online_mask; >> 124 brokeaff = true; >> 125 } > > Please don't copy code into the change log. Describe the problem in > text. Would you mind suggesting if the below commit message is fine to you? genirq/cpuhotplug: retry with cpu_online_mask when irq_do_set_affinity return -ENOSPC When a CPU goes offline, the interrupts pinned to that CPU are re-configured. Its managed interrupts undergo either migration to other CPUs or shutdown if all CPUs listed in the affinity are offline. This patch doesn't affect managed interrupts. For regular interrupts, they are migrated to other selected online CPUs. The target CPUs are chosen from either desc->pending_mask (suppose CONFIG_GENERIC_PENDING_IRQ) or d->common->affinity (suppose CONFIG_SMP). 
The cpu_online_mask is used as target CPUs only when CPUs in both desc->pending_mask and d->common->affinity are offline. However, there is a bad corner case, when desc->pending_mask or d->common->affinity is selected as the target cpumask, but none of their CPUs has any available vectors. As a result, an -ENOSPC error happens: "IRQ151: set affinity failed(-28)." This is from the debugfs. The allocation fails although other online CPUs (except CPU=2) have many free vectors. name: VECTOR size: 0 mapped: 529 flags: 0x0103 Online bitmaps:7 Global available:884 Global reserved: 6 Total allocated: 539 System: 36: 0-19,21,50,128,236,243-244,246-255 | CPU | avl | man | mac | act | vectors 0 147 0 0 55 32-49,51-87 1 147 0 0 55 32-49,51-87 2 0 0 0 202 32-49,51-127,129-235 4 147 0 0 55 32-49,51-87 5 147 0 0 55 32-49,51-87 6 148 0 0 54 32-49,51-86 7 148 0 0 54 32-49,51-86 The steps to reproduce the issue are in [1]. The core idea is: 1. Create a KVM guest with many virtio-net PCI devices, each configured with a very large number of queues/vectors. 2. Set the affinity of all virtio-net interrupts to "2,3". 3. Offline many CPUs, excluding "2,3". 4. Offline CPU=2, and irq_do_set_affinity() returns -ENOSPC. For regular interrupts, if irq_do_set_affinity() returns -ENOSPC, retry it with all online CPUs. The issue does not happen for managed interrupts because the vectors are always reserved (in cm->managed_map) before the CPU offline operation. [1] https://lore.kernel.org/all/20240419013322.58500-1-dongli.zh...@oracle.com/ Cc: Joe Jin Signed-off-by: Dongli Zhang > >> However, there is a corner case. Although some CPUs in >> pending_mask/affinity_mask are still online, they are lack of available >> vectors. If the kernel continues calling irq_do_set_affinity() with those >> CPUs, >> there will be -ENOSPC error. >> >> This is not reasonable as other online CPUs still have many available >> vectors. > > Reasonable is not the question here. It's either correct or not. 
This has been re-written in the new commit message. > >> name: VECTOR >> size: 0 >> mapped: 529 >> flags: 0x0103 >> Online bitmaps:7 >> Global available:884 >> Global reserved: 6 >> Total allocated: 539 >> System: 36: 0-19,21,50,128,236,243-244,246-255 >> | CPU | avl | man | mac | act | vectors >> 0 147 0 0 55 32-49,51-87 >> 1 147 0 0 55 32-49,51-87 >> 2 0 0 0 202 32-49,51-127,129-235 > > Just out of curiosity. How did this end up with CPU2 completely occupied? The details are in the link:
Re: [PATCH 1/2] arm64: dts: qcom: pmi632: Add vibrator
On 4/18/24 12:03, Luca Weiss wrote: On Thu Apr 18, 2024 at 12:01 PM CEST, Konrad Dybcio wrote: On 18.04.2024 8:36 AM, Luca Weiss wrote: Add a node for the vibrator module found inside the PMI632. Signed-off-by: Luca Weiss --- Reviewed-by: Konrad Dybcio On a side note, this is a totally configuration-free peripheral that doesn't do anything crazy until manually configured. In the slow quest to be (hopefully) more sane about the defaults, should we keep them enabled by default? Bjorn? But many (most?) devices don't have a vibration motor connected to PMI632, some (like devboards) don't have anything, and other phones have a separate chip that controls the vibration motor. Enabling this by default would mean all devices with PMI632 would get an input device for the vibrator that probably doesn't work? Fair Konrad
Re: [PATCH v12 09/14] x86/sgx: Implement async reclamation for cgroup
On Mon, 2024-04-22 at 11:17 -0500, Haitao Huang wrote: > On Sun, 21 Apr 2024 19:22:27 -0500, Huang, Kai wrote: > > > On Fri, 2024-04-19 at 20:14 -0500, Haitao Huang wrote: > > > > > I think we can add support for "sgx_cgroup=disabled" in future if > > > indeed > > > > > needed. But just for init failure, no? > > > > > > > > > > > > > It's not about the commandline, which we can add in the future when > > > > needed. It's about we need to have a way to handle SGX cgroup being > > > > disabled at boot time nicely, because we already have a case where we > > > > need > > > > to do so. > > > > > > > > Your approach looks half-way to me, and is not future extendible. If > > > we > > > > choose to do it, do it right -- that is, we need a way to disable it > > > > completely in both kernel and userspace so that userspace won't be > > > able> to > > > > see it. > > > > > > That would need more changes in misc cgroup implementation to support > > > sgx-disable. Right now misc does not have separate files for different > > > resource types. So we can only block echo "sgx_epc..." to those > > > interfacefiles, can't really make files not visible. > > > > "won't be able to see" I mean "only for SGX EPC resource", but not the > > control files for the entire MISC cgroup. > > > > I replied at the beginning of the previous reply: > > > > " > > Given SGX EPC is just one type of MISC cgroup resources, we cannot just > > disable MISC cgroup as a whole. > > " > > > Sorry I missed this point. below. > > > You just need to set the SGX EPC "capacity" to 0 to disable SGX EPC. See > > the comment of @misc_res_capacity: > > > > * Miscellaneous resources capacity for the entire machine. 0 capacity > > * means resource is not initialized or not present in the host. > > > > IIUC I don't think the situation we have is either of those cases. For our > case, resource is inited and present on the host but we have allocation > error for sgx cgroup infra. 
You have calculated the "capacity", but later you failed something and then reset the "capacity" to 0, i.e., cleanup. What's wrong with that? > > > And "blocking echo sgx_epc ... to those control files" is already > > sufficient for the purpose of not exposing SGX EPC to userspace, correct? > > > > E.g., if SGX cgroup is enabled, you can see below when you read "max": > > > > # cat /sys/fs/cgroup/my_group/misc.max > > # > >sgx_epc ... > >... > > > > Otherwise you won't be able to see "sgx_epc": > > > > # cat /sys/fs/cgroup/my_group/misc.max > > # > >... > > > > And when you try to write the "max" for "sgx_epc", you will hit error: > > > > # echo "sgx_epc 100" > /sys/fs/cgroup/my_group/misc.max > > # ... echo: write error: Invalid argument > > > > The above applies to all the control files. To me this is pretty much > > means "SGX EPC is disabled" or "not supported" for userspace. > > > You are right, capacity == 0 does block echoing max and users see an error > if they do that. But 1) doubt you literately wanted "SGX EPC is disabled" > and make it unsupported in this case, > I don't understand. Something failed during SGX cgroup initialization, you _literally_ cannot continue to support it. > 2) even if we accept this is "sgx > cgroup disabled" I don't see how it is much better user experience than > current solution or really helps user better. In your way, the userspace is still able to see "sgx_epc" in control files and is able to update them. So from userspace's perspective SGX cgroup is enabled, but obviously updating to "max" doesn't have any impact. This will confuse userspace. > > Also to implement this approach, as you mentioned, we need workaround the > fact that misc_try_charge() fails when capacity set to zero, and adding > code to return root always? > Why this is a problem? > So it seems like more workaround code to just > make it work for a failing case no one really care much and end result is > not really much better IMHO. 
It's not workaround, it's the right thing to do. The result is userspace will see it being disabled when kernel disables it.
Re: Please create the email alias do-not-apply-to-sta...@kernel.org -> /dev/null
Em Tue, 23 Apr 2024 00:04:01 +0200 Greg KH escreveu: > On Mon, Apr 22, 2024 at 10:46:37PM +0100, Mauro Carvalho Chehab wrote: > > Em Mon, 22 Apr 2024 15:25:18 -0400 > > Konstantin Ryabitsev escreveu: > > > > > On Mon, Apr 22, 2024 at 05:49:29PM +0200, Thorsten Leemhuis wrote: > > > > @Greg, BTW: should this be stable+noauto...@kernel.org or have a > > > > 'vger.' > > > > > > No vger, just stable+whate...@kernel.org. > > > > > > > in it, e.g. stable+noauto...@vger.kernel.org? I assume without 'vger.' > > > > is fine, just wanted to be sure, as > > > > Documentation/process/stable-kernel-rules.rst in all other cases > > > > specifies sta...@vger.kernel.org, so people are likely to get confused. > > > > :-/ #sigh > > > > > > These serve two different purposes: > > > > > > sta...@kernel.org (goes into devnull) > > > sta...@vger.kernel.org (actual mailing list) > > > > > > Confusion happens all the time, unfortunately. > > > > Yeah, I did already used sta...@kernel.org a few times in the > > past. > > > > IMO, the best would be either for stable to also accept it or for > > kernel.org mail server to return an error message (only to the > > submitter) warning about the invalid address, eventually with a > > hint message pointing to the correct value. > > sta...@kernel.org is there to route to /dev/null on purpose so that > developers/maintainers who only want their patches to get picked up when > they hit Linus's tree, will have happen and not notify anyone else. > This is especially good when dealing with security-related things as we > have had MANY people accidentally leak patches way too early by having > cc: sta...@vger.kernel.org in their signed-off-by areas, and forgetting > to tell git send-email to suppress cc: when sending them out for > internal review. Nice! didn't know about that. On a quick check, the only place at documentation mentioning it without vger is at checkpatch.rst. Perhaps it would make sense to document that as well. 
> Having that bounce would just be noisy for the developers involved. > > thanks, > > greg k-h
Re: Please create the email alias do-not-apply-to-sta...@kernel.org -> /dev/null
On Mon, Apr 22, 2024 at 10:46:37PM +0100, Mauro Carvalho Chehab wrote: > Em Mon, 22 Apr 2024 15:25:18 -0400 > Konstantin Ryabitsev escreveu: > > > On Mon, Apr 22, 2024 at 05:49:29PM +0200, Thorsten Leemhuis wrote: > > > @Greg, BTW: should this be stable+noauto...@kernel.org or have a > > > 'vger.' > > > > No vger, just stable+whate...@kernel.org. > > > > > in it, e.g. stable+noauto...@vger.kernel.org? I assume without 'vger.' > > > is fine, just wanted to be sure, as > > > Documentation/process/stable-kernel-rules.rst in all other cases > > > specifies sta...@vger.kernel.org, so people are likely to get confused. > > > :-/ #sigh > > > > These serve two different purposes: > > > > sta...@kernel.org (goes into devnull) > > sta...@vger.kernel.org (actual mailing list) > > > > Confusion happens all the time, unfortunately. > > Yeah, I did already used sta...@kernel.org a few times in the > past. > > IMO, the best would be either for stable to also accept it or for > kernel.org mail server to return an error message (only to the > submitter) warning about the invalid address, eventually with a > hint message pointing to the correct value. sta...@kernel.org is there to route to /dev/null on purpose so that developers/maintainers who only want their patches to get picked up when they hit Linus's tree, will have happen and not notify anyone else. This is especially good when dealing with security-related things as we have had MANY people accidentally leak patches way too early by having cc: sta...@vger.kernel.org in their signed-off-by areas, and forgetting to tell git send-email to suppress cc: when sending them out for internal review. Having that bounce would just be noisy for the developers involved. thanks, greg k-h
Re: [PATCH 1/2] tracing/user_events: Fix non-spaced field matching
On Sat, Apr 20, 2024 at 09:50:52PM +0900, Masami Hiramatsu wrote: > On Fri, 19 Apr 2024 14:13:34 -0700 > Beau Belgrave wrote: > > > On Fri, Apr 19, 2024 at 11:33:05AM +0900, Masami Hiramatsu wrote: > > > On Tue, 16 Apr 2024 22:41:01 + > > > Beau Belgrave wrote: *SNIP* > > > nit: This loop can be simpler, because we are sure fixed has enough > > > length; > > > > > > /* insert a space after ';' if there is no space. */ > > > while(*args) { > > > *pos = *args++; > > > if (*pos++ == ';' && !isspace(*args)) > > > *pos++ = ' '; > > > } > > > > > > > I was worried that if count_semis_no_space() ever had different logic > > (maybe after this commit) that it could cause an overflow if the count > > was wrong, etc. > > > > I don't have an issue making it shorter, but I was trying to be more on > > the safe side, since this isn't a fast path (event register). > > OK, anyway current code looks correct. But note that I don't think > "pos++; len--;" is safer, since it is not atomic. This pattern > easily loose "len--;" in my experience. So please carefully use it ;) > I'll stick with your loop. Perhaps others will chime in on the v2 and state a stronger opinion. You scared me with the atomic comment, I went back and looked at all the paths for this. In the user_events IOCTL the buffer is copied from user to kernel, so it cannot change (and no other threads access it). I also checked trace_parse_run_command() which is the same. So at least in this context the non-atomic part is OK. > > > > > > + > > > > + /* > > > > +* len is the length of the copy excluding the null. > > > > +* This ensures we always have room for a null. 
> > > > +*/ > > > > + *pos = '\0'; > > > > + > > > > + return fixed; > > > > +} > > > > + > > > > +static char **user_event_argv_split(char *args, int *argc) > > > > +{ > > > > + /* Count how many ';' without a trailing space */ > > > > + int count = count_semis_no_space(args); > > > > + > > > > + if (count) { > > > > > > nit: it is better to exit fast, so > > > > > > if (!count) > > > return argv_split(GFP_KERNEL, args, argc); > > > > > > ... > > > > Sure, will fix in a v2. > > > > > > > > Thank you, > > > > > > OT: BTW, can this also simplify synthetic events? > > > > > > > I'm not sure, I'll check when I have some time. I want to get this fix > > in sooner rather than later. > > Ah, nevermind. Synthetic event parses the field by strsep(';') first > and argv_split(). So it does not have this issue. > Ok, seems unrelated. Thanks for checking. Thanks, -Beau > Thank you, > > > > > Thanks, > > -Beau > > *SNIP* > > > Masami Hiramatsu (Google) > > > -- > Masami Hiramatsu (Google)
Re: Please create the email alias do-not-apply-to-sta...@kernel.org -> /dev/null
Em Mon, 22 Apr 2024 15:25:18 -0400 Konstantin Ryabitsev escreveu: > On Mon, Apr 22, 2024 at 05:49:29PM +0200, Thorsten Leemhuis wrote: > > @Greg, BTW: should this be stable+noauto...@kernel.org or have a > > 'vger.' > > No vger, just stable+whate...@kernel.org. > > > in it, e.g. stable+noauto...@vger.kernel.org? I assume without 'vger.' > > is fine, just wanted to be sure, as > > Documentation/process/stable-kernel-rules.rst in all other cases > > specifies sta...@vger.kernel.org, so people are likely to get confused. > > :-/ #sigh > > These serve two different purposes: > > sta...@kernel.org (goes into devnull) > > sta...@vger.kernel.org (actual mailing list) > > Confusion happens all the time, unfortunately. Yeah, I have already used sta...@kernel.org a few times in the past. IMO, the best would be either for stable to also accept it or for kernel.org mail server to return an error message (only to the submitter) warning about the invalid address, eventually with a hint message pointing to the correct value. > > Notably, even if someone uses stable+noauto...@vger.kernel.org, it won't > do anything terrible (it won't bounce, it'll just quietly go into > nowhere because that's not a valid expansion command). > > -K >
Re: [PATCH v2 0/6] virtiofs: fix the warning for ITER_KVEC dio
On 4/22/24 22:06, Michael S. Tsirkin wrote: > On Tue, Apr 09, 2024 at 09:48:08AM +0800, Hou Tao wrote: >> Hi, >> >> On 4/8/2024 3:45 PM, Michael S. Tsirkin wrote: >>> On Wed, Feb 28, 2024 at 10:41:20PM +0800, Hou Tao wrote: From: Hou Tao Hi, The patch set aims to fix the warning related to an abnormal size parameter of kmalloc() in virtiofs. The warning occurred when attempting to insert a 10MB sized kernel module kept in a virtiofs with cache disabled. As analyzed in patch #1, the root cause is that the length of the read buffer is no limited, and the read buffer is passed directly to virtiofs through out_args[0].value. Therefore patch #1 limits the length of the read buffer passed to virtiofs by using max_pages. However it is not enough, because now the maximal value of max_pages is 256. Consequently, when reading a 10MB-sized kernel module, the length of the bounce buffer in virtiofs will be 40 + (256 * 4096), and kmalloc will try to allocate 2MB from memory subsystem. The request for 2MB of physically contiguous memory significantly stress the memory subsystem and may fail indefinitely on hosts with fragmented memory. To address this, patch #2~#5 use scattered pages in a bio_vec to replace the kmalloc-allocated bounce buffer when the length of the bounce buffer for KVEC_ITER dio is larger than PAGE_SIZE. The final issue with the allocation of the bounce buffer and sg array in virtiofs is that GFP_ATOMIC is used even when the allocation occurs in a kworker context. Therefore the last patch uses GFP_NOFS for the allocation of both sg array and bounce buffer when initiated by the kworker. For more details, please check the individual patches. As usual, comments are always welcome. Change Log: >>> Bernd should I just merge the patchset as is? >>> It seems to fix a real problem and no one has the >>> time to work on a better fix WDYT? >> >> Sorry for the long delay. I am just start to prepare for v3. 
In v3, I >> plan to avoid the unnecessary memory copy between fuse args and bio_vec. >> Will post it before next week. > > Didn't happen before this week apparently. Hi Michael, sorry for my late reply, I had been totally busy for the last weeks as well. Also I can't decide to merge it - I'm not the official fuse maintainer... From my point of view, patch 1 is just missing to set the actual limit and then would be a clear and easily back-portable bug fix. Not promised, I will try it out if I find a bit time tomorrow. Bernd
Re: [PATCH 1/1] genirq/cpuhotplug: retry with online CPUs on irq_do_set_affinity failure
On Thu, Apr 18 2024 at 18:33, Dongli Zhang wrote: > When a CPU is offline, its IRQs may migrate to other CPUs. For managed > IRQs, they are migrated, or shutdown (if all CPUs of the managed IRQ > affinity are offline). For regular IRQs, there will only be a > migration. Please write out interrupts. There is enough space for it and IRQ is just not a regular word. > The migrate_one_irq() first uses pending_mask or affinity_mask of the IRQ. > > 104 if (irq_fixup_move_pending(desc, true)) > 105 affinity = irq_desc_get_pending_mask(desc); > 106 else > 107 affinity = irq_data_get_affinity_mask(d); > > The migrate_one_irq() may use all online CPUs, if all CPUs in > pending_mask/affinity_mask are already offline. > > 113 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { > 114 /* > 115 * If the interrupt is managed, then shut it down and > leave > 116 * the affinity untouched. > 117 */ > 118 if (irqd_affinity_is_managed(d)) { > 119 irqd_set_managed_shutdown(d); > 120 irq_shutdown_and_deactivate(desc); > 121 return false; > 122 } > 123 affinity = cpu_online_mask; > 124 brokeaff = true; > 125 } Please don't copy code into the change log. Describe the problem in text. > However, there is a corner case. Although some CPUs in > pending_mask/affinity_mask are still online, they are lack of available > vectors. If the kernel continues calling irq_do_set_affinity() with those > CPUs, > there will be -ENOSPC error. > > This is not reasonable as other online CPUs still have many available > vectors. Reasonable is not the question here. It's either correct or not. > name: VECTOR > size: 0 > mapped: 529 > flags: 0x0103 > Online bitmaps:7 > Global available:884 > Global reserved: 6 > Total allocated: 539 > System: 36: 0-19,21,50,128,236,243-244,246-255 > | CPU | avl | man | mac | act | vectors > 0 147 0 0 55 32-49,51-87 > 1 147 0 0 55 32-49,51-87 > 2 0 0 0 202 32-49,51-127,129-235 Just ouf of curiousity. How did this end up with CPU2 completely occupied? 
> 4 147 0 0 55 32-49,51-87 > 5 147 0 0 55 32-49,51-87 > 6 148 0 0 54 32-49,51-86 > 7 148 0 0 54 32-49,51-86 > > This issue should not happen for managed IRQs because the vectors are already > reserved before CPU hotplug. Should not? It either does or it does not. > For regular IRQs, do a re-try with all online > CPUs if the prior irq_do_set_affinity() is failed with -ENOSPC. > > Cc: Joe Jin > Signed-off-by: Dongli Zhang > --- > kernel/irq/cpuhotplug.c | 13 + > 1 file changed, 13 insertions(+) > > diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c > index 1ed2b1739363..d1666a6b73f4 100644 > --- a/kernel/irq/cpuhotplug.c > +++ b/kernel/irq/cpuhotplug.c > @@ -130,6 +130,19 @@ static bool migrate_one_irq(struct irq_desc *desc) >* CPU. >*/ > err = irq_do_set_affinity(d, affinity, false); > + > + if (err == -ENOSPC && > + !irqd_affinity_is_managed(d) && > + affinity != cpu_online_mask) { This really wants to be a single line conditional. > + affinity = cpu_online_mask; > + brokeaff = true; > + > + pr_debug("IRQ%u: set affinity failed for %*pbl, re-try with all > online CPUs\n", > + d->irq, cpumask_pr_args(affinity)); How is it useful to print cpu_online_mask here? Thanks, tglx
Re: [PATCH 0/3] Improve memory statistics for virtio balloon
On Thu, Apr 18, 2024 at 02:25:59PM +0800, zhenwei pi wrote: > RFC -> v1: > - several text changes: oom-kill -> oom-kills, SCAN_ASYNC -> ASYN_SCAN. > - move vm events codes into '#ifdef CONFIG_VM_EVENT_COUNTERS' > > RFC version: > Link: > https://lore.kernel.org/lkml/20240415084113.1203428-1-pizhen...@bytedance.com/T/#m1898963b3c27a989b1123db475135c3ca687ca84 Make sure this builds without introducing new warnings please. > zhenwei pi (3): > virtio_balloon: introduce oom-kill invocations > virtio_balloon: introduce memory allocation stall counter > virtio_balloon: introduce memory scan/reclaim info > > drivers/virtio/virtio_balloon.c | 30 - > include/uapi/linux/virtio_balloon.h | 16 +-- > 2 files changed, 43 insertions(+), 3 deletions(-) > > -- > 2.34.1
Re: [RFC][PATCH] uprobe: support for private hugetlb mappings
On 22 Apr 20:59, David Hildenbrand wrote: > > The benefit - to me - is very clear. People do use hugetlb mappings to > > run code in production environments. The perf benefits are there for some > > workloads. Intel has published a whitepaper about it etc. > > Uprobes are a very good tool to do live tracing. If you can restart the > > process and reproduce, you should be able to disable hugetlb remapping > > but if you need to look at a live process, there are not many options. > > Not being able to use uprobes is crippling. > > Please add all that as motivation to the patch description or cover letter. > > > > Yes, libhugetlbfs exists. But why do we have to support uprobes with it? > > > Nobody cared until now, why care now? > > > > I think you could ask the same question for every new feature patch :) > > I have to, because it usually indicates a lack of motivation in the > cover-letter/patch description :P My cover letter was indeed lacking. I will make sure to add this kind of details next time. > > Since the removal a few releases ago of the __morecore() hook in glibc, > > the main feature of libhugetlbfs is ELF segments remapping. I think > > there are definitely a lot of users that simply deal with this > > unnecessary limitation. > > > > I am certainly not shoving this patch through anyone's throat if there > > is no interest. But we definitely find it a very useful feature ... > > Let me try to see if we can get this done cleaner. > > One ugly part (in general here) is the custom page replacement in the > registration part. > > We are guaranteed to have a MAP_PRIVATE mapping. Instead of replacing pages > ourselves (which we likely shouldn't do ...) ... maybe we could use > FAULT_FLAG_UNSHARE faults such that we will get an anonymous folio > populated. (like KSM does nowadays) > > Punching FOLL_PIN|FOLL_LONGTERM into GUP would achieve the same thing, but > using FOLL_WRITE would not work on many file systems. 
So maybe we have to > trigger an unsharing fault ourselves. > > That would do the page replacement for us and we "should" be able to lookup > an anonymous folio that we can then just modify, like ptrace would. > > But then, there is also unregistration part, with weird conditional page > replacement. Zapping the anon page if the content matches the content of the > original page is one thing. But why are we placing an existing anonymous > page by a new anonymous page when the content from the original page differs > (but matches the one from the just copied page?)? > > I'll have to further think about that one. It's all a bit nasty. Sounds good to me. I am willing to help with the code when you have a plan or testing as you see fit. Let me know. > One thing to note is that hugetlb folios don't grow on trees. Likely, Many > setups *don't* reserve extra hugetlb folios and you might just easily be > running out of free hugetlb folios that you can use to break COW here > (replace a file hugetlb by a fresh anon hugetlb page). Likely it's easy to > make register or unregister fail. Agreed. -- Guillaume Morin
Re: [PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()
On Mon, Apr 08, 2024 at 02:15:24PM +1000, Gavin Shan wrote: > Hi Michael, > > On 3/30/24 19:02, Gavin Shan wrote: > > On 3/28/24 19:31, Michael S. Tsirkin wrote: > > > On Thu, Mar 28, 2024 at 10:21:49AM +1000, Gavin Shan wrote: > > > > All the callers of vhost_get_avail_idx() are concerned to the memory > > > > barrier, imposed by smp_rmb() to ensure the order of the available > > > > ring entry read and avail_idx read. > > > > > > > > Improve vhost_get_avail_idx() so that smp_rmb() is executed when > > > > the avail_idx is advanced. With it, the callers needn't to worry > > > > about the memory barrier. > > > > > > > > Suggested-by: Michael S. Tsirkin > > > > Signed-off-by: Gavin Shan > > > > > > Previous patches are ok. This one I feel needs more work - > > > first more code such as sanity checking should go into > > > this function, second there's actually a difference > > > between comparing to last_avail_idx and just comparing > > > to the previous value of avail_idx. > > > I will pick patches 1-2 and post a cleanup on top so you can > > > take a look, ok? > > > > > > > Thanks, Michael. It's fine to me. > > > > A kindly ping. > > If it's ok to you, could you please merge PATCH[1-2]? Our downstream > 9.4 need the fixes, especially for NVidia's grace-hopper and grace-grace > platforms. > > For PATCH[3], I also can help with the improvement if you don't have time > for it. Please let me know. > > Thanks, > Gavin 1-2 are upstream go ahead and post the cleanup. -- MST
Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions
On Mon, Apr 22, 2024 at 08:27:17PM +0200, David Hildenbrand wrote: > On 22.04.24 20:20, Vincent Donnefort wrote: > > Hi David, > > > > Thanks for having a look, very much appreciated! > > > > On Mon, Apr 22, 2024 at 11:27:11AM +0200, David Hildenbrand wrote: > > > On 19.04.24 20:25, David Hildenbrand wrote: > > > > On 06.04.24 19:36, Vincent Donnefort wrote: > > > > > In preparation for allowing the user-space to map a ring-buffer, add > > > > > a set of mapping functions: > > > > > > > > > > ring_buffer_{map,unmap}() > > > > > > > > > > And controls on the ring-buffer: > > > > > > > > > > ring_buffer_map_get_reader() /* swap reader and head */ > > > > > > > > > > Mapping the ring-buffer also involves: > > > > > > > > > > A unique ID for each subbuf of the ring-buffer, currently they > > > > > are > > > > > only identified through their in-kernel VA. > > > > > > > > > > A meta-page, where are stored ring-buffer statistics and a > > > > > description for the current reader > > > > > > > > > > The linear mapping exposes the meta-page, and each subbuf of the > > > > > ring-buffer, ordered following their unique ID, assigned during the > > > > > first mapping. > > > > > > > > > > Once mapped, no subbuf can get in or out of the ring-buffer: the > > > > > buffer > > > > > size will remain unmodified and the splice enabling functions will in > > > > > reality simply memcpy the data instead of swapping subbufs. 
> > > > > > > > > > CC: > > > > > Signed-off-by: Vincent Donnefort > > > > > > > > > > diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h > > > > > index dc5ae4e96aee..96d2140b471e 100644 > > > > > --- a/include/linux/ring_buffer.h > > > > > +++ b/include/linux/ring_buffer.h > > > > > @@ -6,6 +6,8 @@ > > > > > #include > > > > > #include > > > > > +#include > > > > > + > > > > > struct trace_buffer; > > > > > struct ring_buffer_iter; > > > > > @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct > > > > > hlist_node *node); > > > > > #define trace_rb_cpu_prepare NULL > > > > > #endif > > > > > +int ring_buffer_map(struct trace_buffer *buffer, int cpu, > > > > > + struct vm_area_struct *vma); > > > > > +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); > > > > > +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); > > > > > #endif /* _LINUX_RING_BUFFER_H */ > > > > > diff --git a/include/uapi/linux/trace_mmap.h > > > > > b/include/uapi/linux/trace_mmap.h > > > > > new file mode 100644 > > > > > index ..ffcd8dfcaa4f > > > > > --- /dev/null > > > > > +++ b/include/uapi/linux/trace_mmap.h > > > > > @@ -0,0 +1,46 @@ > > > > > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > > > > > +#ifndef _TRACE_MMAP_H_ > > > > > +#define _TRACE_MMAP_H_ > > > > > + > > > > > +#include > > > > > + > > > > > +/** > > > > > + * struct trace_buffer_meta - Ring-buffer Meta-page description > > > > > + * @meta_page_size: Size of this meta-page. > > > > > + * @meta_struct_len: Size of this structure. > > > > > + * @subbuf_size: Size of each sub-buffer. > > > > > + * @nr_subbufs: Number of subbfs in the ring-buffer, > > > > > including the reader. > > > > > + * @reader.lost_events: Number of events lost at the time of > > > > > the reader swap. > > > > > + * @reader.id: subbuf ID of the current reader. 
ID > > > > > range [0 : @nr_subbufs - 1] > > > > > + * @reader.read: Number of bytes read on the reader subbuf. > > > > > + * @flags: Placeholder for now, 0 until new features are > > > > > supported. > > > > > + * @entries: Number of entries in the ring-buffer. > > > > > + * @overrun: Number of entries lost in the ring-buffer. > > > > > + * @read:Number of entries that have been read. > > > > > + * @Reserved1: Reserved for future use. > > > > > + * @Reserved2: Reserved for future use. > > > > > + */ > > > > > +struct trace_buffer_meta { > > > > > + __u32 meta_page_size; > > > > > + __u32 meta_struct_len; > > > > > + > > > > > + __u32 subbuf_size; > > > > > + __u32 nr_subbufs; > > > > > + > > > > > + struct { > > > > > + __u64 lost_events; > > > > > + __u32 id; > > > > > + __u32 read; > > > > > + } reader; > > > > > + > > > > > + __u64 flags; > > > > > + > > > > > + __u64 entries; > > > > > + __u64 overrun; > > > > > + __u64 read; > > > > > + > > > > > + __u64 Reserved1; > > > > > + __u64 Reserved2; > > > > > +}; > > > > > + > > > > > +#endif /* _TRACE_MMAP_H_ */ > > > > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > > > > > index cc9ebe593571..793ecc454039 100644 > > > > > --- a/kernel/trace/ring_buffer.c > > > > > +++ b/kernel/trace/ring_buffer.c > > > > > @@ -9,6
Re: [PATCHv3 bpf-next 0/7] uprobe: uretprobe speed up
On Tue, Apr 23, 2024 at 12:09:43AM +0900, Masami Hiramatsu wrote: > Hi Jiri, > > On Sun, 21 Apr 2024 21:41:59 +0200 > Jiri Olsa wrote: > > > hi, > > as part of the effort on speeding up the uprobes [0] coming with > > return uprobe optimization by using syscall instead of the trap > > on the uretprobe trampoline. > > > > The speed up depends on instruction type that uprobe is installed > > and depends on specific HW type, please check patch 1 for details. > > > > Patches 1-6 are based on bpf-next/master, but path 1 and 2 are > > apply-able on linux-trace.git tree probes/for-next branch. > > Patch 7 is based on man-pages master. > > Thanks for updated! I reviewed the series and just except for the > manpage, it looks good to me. > > Reviewed-by: Masami Hiramatsu (Google) > > for the series. > If Linux API maintainers are OK, I can pick this in probes/for-next. great, thanks > (BTW, who will pick the manpage patch?) ugh, I cc-ed linux-api but not linux-...@vger.kernel.org I'll add that for new version jirka > > Thank you, > > > > > v3 changes: > > - added source ip check if the uretprobe syscall is called from > > trampoline and sending SIGILL to process if it's not > > - keep x86 compat process to use standard breakpoint > > - split syscall wiring into separate change > > - ran ltp and syzkaller locally, no issues found [Masami] > > - building uprobe_compat binary in selftests which breaks > > CI atm because of missing 32-bit delve packages, I will > > need to fix that in separate changes once this is acked > > - added man page change > > - there were several changes so I removed acks [Oleg Andrii] > > > > Also available at: > > https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git > > uretprobe_syscall > > > > thanks, > > jirka > > > > > > Notes to check list items in Documentation/process/adding-syscalls.rst: > > > > - System Call Alternatives > > New syscall seems like the best way in here, becase we need > > just to quickly enter kernel with no extra 
arguments processing, > > which we'd need to do if we decided to use another syscall. > > > > - Designing the API: Planning for Extension > > The uretprobe syscall is very specific and most likely won't be > > extended in the future. > > > > At the moment it does not take any arguments and even if it does > > in future, it's allowed to be called only from trampoline prepared > > by kernel, so there'll be no broken user. > > > > - Designing the API: Other Considerations > > N/A because uretprobe syscall does not return reference to kernel > > object. > > > > - Proposing the API > > Wiring up of the uretprobe system call si in separate change, > > selftests and man page changes are part of the patchset. > > > > - Generic System Call Implementation > > There's no CONFIG option for the new functionality because it > > keeps the same behaviour from the user POV. > > > > - x86 System Call Implementation > > It's 64-bit syscall only. > > > > - Compatibility System Calls (Generic) > > N/A uretprobe syscall has no arguments and is not supported > > for compat processes. > > > > - Compatibility System Calls (x86) > > N/A uretprobe syscall is not supported for compat processes. > > > > - System Calls Returning Elsewhere > > N/A. > > > > - Other Details > > N/A. > > > > - Testing > > Adding new bpf selftests and ran ltp on top of this change. > > > > - Man Page > > Attached. > > > > - Do not call System Calls in the Kernel > > N/A. 
> > > > > > [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/ > > --- > > Jiri Olsa (6): > > uprobe: Wire up uretprobe system call > > uprobe: Add uretprobe syscall to speed up return probe > > selftests/bpf: Add uretprobe syscall test for regs integrity > > selftests/bpf: Add uretprobe syscall test for regs changes > > selftests/bpf: Add uretprobe syscall call from user space test > > selftests/bpf: Add uretprobe compat test > > > > arch/x86/entry/syscalls/syscall_64.tbl| 1 + > > arch/x86/kernel/uprobes.c | 115 > > ++ > > include/linux/syscalls.h | 2 + > > include/linux/uprobes.h | 3 + > > include/uapi/asm-generic/unistd.h | 5 +- > > kernel/events/uprobes.c | 24 +-- > > kernel/sys_ni.c | 2 + > > tools/include/linux/compiler.h| 4 ++ > > tools/testing/selftests/bpf/.gitignore| 1 + > > tools/testing/selftests/bpf/Makefile | 6 +- > > tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 123 > > +++- > > tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 362 > >
Re: [PATCH 7/7] man2: Add uretprobe syscall page
On Tue, Apr 23, 2024 at 12:07:29AM +0900, Masami Hiramatsu wrote: > On Sun, 21 Apr 2024 21:42:06 +0200 > Jiri Olsa wrote: > > > Adding man page for new uretprobe syscall. > > > > Signed-off-by: Jiri Olsa > > --- > > man2/uretprobe.2 | 40 > > 1 file changed, 40 insertions(+) > > create mode 100644 man2/uretprobe.2 > > > > diff --git a/man2/uretprobe.2 b/man2/uretprobe.2 > > new file mode 100644 > > index ..c0343a88bb57 > > --- /dev/null > > +++ b/man2/uretprobe.2 > > @@ -0,0 +1,40 @@ > > +.\" Copyright (C) 2024, Jiri Olsa > > +.\" > > +.\" SPDX-License-Identifier: Linux-man-pages-copyleft > > +.\" > > +.TH uretprobe 2 (date) "Linux man-pages (unreleased)" > > +.SH NAME > > +uretprobe \- execute pending return uprobes > > +.SH SYNOPSIS > > +.nf > > +.B int uretprobe(void) > > +.fi > > +.SH DESCRIPTION > > +On x86_64 architecture the kernel is using uretprobe syscall to trigger > > +uprobe return probe consumers instead of using standard breakpoint > > instruction. > > +The reason is that it's much faster to do syscall than breakpoint trap > > +on x86_64 architecture. > > Do we specify the supported architecture as this? Currently it is supported > only on x86-64, but it could be extended later, right? yes, that's the idea, but I can't really speak other than x86 ;-) so not sure abour other archs details > > This should be just noted as NOTES. Something like "This syscall is initially > introduced on x86-64 because a syscall is faster than a breakpoint trap on it. > But this will be extended to the architectures whose syscall is faster than > breakpoint trap." 's/will be extended/might be will be extended/' seems better to me, other than that it looks ok thanks, jirka > > Thank you, > > > + > > +The uretprobe syscall is not supposed to be called directly by user, it's > > allowed > > +to be invoked only through user space trampoline provided by kernel. > > +When called from outside of this trampoline, the calling process will > > receive > > +.BR SIGILL . 
> > + > > +.SH RETURN VALUE > > +.BR uretprobe() > > +return value is specific for given architecture. > > + > > +.SH VERSIONS > > +This syscall is not specified in POSIX, > > +and details of its behavior vary across systems. > > +.SH STANDARDS > > +None. > > +.SH NOTES > > +.BR uretprobe() > > +exists only to allow the invocation of return uprobe consumers. > > +It should > > +.B never > > +be called directly. > > +Details of the arguments (if any) passed to > > +.BR uretprobe () > > +and the return value are specific for given architecture. > > -- > > 2.44.0 > > > > > -- > Masami Hiramatsu (Google)
Re: [PATCH v2 0/6] virtiofs: fix the warning for ITER_KVEC dio
On Tue, Apr 09, 2024 at 09:48:08AM +0800, Hou Tao wrote: > Hi, > > On 4/8/2024 3:45 PM, Michael S. Tsirkin wrote: > > On Wed, Feb 28, 2024 at 10:41:20PM +0800, Hou Tao wrote: > >> From: Hou Tao > >> > >> Hi, > >> > >> The patch set aims to fix the warning related to an abnormal size > >> parameter of kmalloc() in virtiofs. The warning occurred when attempting > >> to insert a 10MB sized kernel module kept in a virtiofs with cache > >> disabled. As analyzed in patch #1, the root cause is that the length of > >> the read buffer is no limited, and the read buffer is passed directly to > >> virtiofs through out_args[0].value. Therefore patch #1 limits the > >> length of the read buffer passed to virtiofs by using max_pages. However > >> it is not enough, because now the maximal value of max_pages is 256. > >> Consequently, when reading a 10MB-sized kernel module, the length of the > >> bounce buffer in virtiofs will be 40 + (256 * 4096), and kmalloc will > >> try to allocate 2MB from memory subsystem. The request for 2MB of > >> physically contiguous memory significantly stress the memory subsystem > >> and may fail indefinitely on hosts with fragmented memory. To address > >> this, patch #2~#5 use scattered pages in a bio_vec to replace the > >> kmalloc-allocated bounce buffer when the length of the bounce buffer for > >> KVEC_ITER dio is larger than PAGE_SIZE. The final issue with the > >> allocation of the bounce buffer and sg array in virtiofs is that > >> GFP_ATOMIC is used even when the allocation occurs in a kworker context. > >> Therefore the last patch uses GFP_NOFS for the allocation of both sg > >> array and bounce buffer when initiated by the kworker. For more details, > >> please check the individual patches. > >> > >> As usual, comments are always welcome. > >> > >> Change Log: > > Bernd should I just merge the patchset as is? > > It seems to fix a real problem and no one has the > > time to work on a better fix WDYT? > > Sorry for the long delay. 
I am just start to prepare for v3. In v3, I > plan to avoid the unnecessary memory copy between fuse args and bio_vec. > Will post it before next week. Didn't happen before this week apparently. > > > > > >> v2: > >> * limit the length of ITER_KVEC dio by max_pages instead of the > >> newly-introduced max_nopage_rw. Using max_pages make the ITER_KVEC > >> dio being consistent with other rw operations. > >> * replace kmalloc-allocated bounce buffer by using a bounce buffer > >> backed by scattered pages when the length of the bounce buffer for > >> KVEC_ITER dio is larger than PAG_SIZE, so even on hosts with > >> fragmented memory, the KVEC_ITER dio can be handled normally by > >> virtiofs. (Bernd Schubert) > >> * merge the GFP_NOFS patch [1] into this patch-set and use > >> memalloc_nofs_{save|restore}+GFP_KERNEL instead of GFP_NOFS > >> (Benjamin Coddington) > >> > >> v1: > >> https://lore.kernel.org/linux-fsdevel/20240103105929.1902658-1-hou...@huaweicloud.com/ > >> > >> [1]: > >> https://lore.kernel.org/linux-fsdevel/20240105105305.4052672-1-hou...@huaweicloud.com/ > >> > >> Hou Tao (6): > >> fuse: limit the length of ITER_KVEC dio by max_pages > >> virtiofs: move alloc/free of argbuf into separated helpers > >> virtiofs: factor out more common methods for argbuf > >> virtiofs: support bounce buffer backed by scattered pages > >> virtiofs: use scattered bounce buffer for ITER_KVEC dio > >> virtiofs: use GFP_NOFS when enqueuing request through kworker > >> > >> fs/fuse/file.c | 12 +- > >> fs/fuse/virtio_fs.c | 336 +--- > >> 2 files changed, 296 insertions(+), 52 deletions(-) > >> > >> -- > >> 2.29.2
Re: [PATCH v5 3/5] vduse: Add function to get/free the pages for reconnection
On Thu, Apr 18, 2024 at 08:57:51AM +0800, Jason Wang wrote: > On Wed, Apr 17, 2024 at 5:29 PM Michael S. Tsirkin wrote: > > > > On Fri, Apr 12, 2024 at 09:28:23PM +0800, Cindy Lu wrote: > > > Add the function vduse_alloc_reconnnect_info_mem > > > and vduse_alloc_reconnnect_info_mem > > > These functions allow vduse to allocate and free memory for reconnection > > > information. The amount of memory allocated is vq_num pages. > > > Each VQS will map its own page where the reconnection information will be > > > saved > > > > > > Signed-off-by: Cindy Lu > > > --- > > > drivers/vdpa/vdpa_user/vduse_dev.c | 40 ++ > > > 1 file changed, 40 insertions(+) > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c > > > b/drivers/vdpa/vdpa_user/vduse_dev.c > > > index ef3c9681941e..2da659d5f4a8 100644 > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > > @@ -65,6 +65,7 @@ struct vduse_virtqueue { > > > int irq_effective_cpu; > > > struct cpumask irq_affinity; > > > struct kobject kobj; > > > + unsigned long vdpa_reconnect_vaddr; > > > }; > > > > > > struct vduse_dev; > > > @@ -1105,6 +1106,38 @@ static void vduse_vq_update_effective_cpu(struct > > > vduse_virtqueue *vq) > > > > > > vq->irq_effective_cpu = curr_cpu; > > > } > > > +static int vduse_alloc_reconnnect_info_mem(struct vduse_dev *dev) > > > +{ > > > + unsigned long vaddr = 0; > > > + struct vduse_virtqueue *vq; > > > + > > > + for (int i = 0; i < dev->vq_num; i++) { > > > + /*page 0~ vq_num save the reconnect info for vq*/ > > > + vq = dev->vqs[i]; > > > + vaddr = get_zeroed_page(GFP_KERNEL); > > > > > > I don't get why you insist on stealing kernel memory for something > > that is just used by userspace to store data for its own use. > > Userspace does not lack ways to persist data, for example, > > create a regular file anywhere in the filesystem. > > Good point. 
So the motivation here is to: > > 1) be self contained, no dependency for high speed persist data > storage like tmpfs No idea what this means. > 2) standardize the format in uAPI which allows reconnection from > arbitrary userspace, unfortunately, such effort was removed in new > versions And I don't see why that has to live in the kernel tree either. > If the above doesn't make sense, we don't need to offer those pages by VDUSE. > > Thanks > > > > > > > > > > > + if (vaddr == 0) > > > + return -ENOMEM; > > > + > > > + vq->vdpa_reconnect_vaddr = vaddr; > > > + } > > > + > > > + return 0; > > > +} > > > + > > > +static int vduse_free_reconnnect_info_mem(struct vduse_dev *dev) > > > +{ > > > + struct vduse_virtqueue *vq; > > > + > > > + for (int i = 0; i < dev->vq_num; i++) { > > > + vq = dev->vqs[i]; > > > + > > > + if (vq->vdpa_reconnect_vaddr) > > > + free_page(vq->vdpa_reconnect_vaddr); > > > + vq->vdpa_reconnect_vaddr = 0; > > > + } > > > + > > > + return 0; > > > +} > > > > > > static long vduse_dev_ioctl(struct file *file, unsigned int cmd, > > > unsigned long arg) > > > @@ -1672,6 +1705,8 @@ static int vduse_destroy_dev(char *name) > > > mutex_unlock(>lock); > > > return -EBUSY; > > > } > > > + vduse_free_reconnnect_info_mem(dev); > > > + > > > dev->connected = true; > > > mutex_unlock(>lock); > > > > > > @@ -1855,12 +1890,17 @@ static int vduse_create_dev(struct > > > vduse_dev_config *config, > > > ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num); > > > if (ret) > > > goto err_vqs; > > > + ret = vduse_alloc_reconnnect_info_mem(dev); > > > + if (ret < 0) > > > + goto err_mem; > > > > > > __module_get(THIS_MODULE); > > > > > > return 0; > > > err_vqs: > > > device_destroy(_class, MKDEV(MAJOR(vduse_major), dev->minor)); > > > +err_mem: > > > + vduse_free_reconnnect_info_mem(dev); > > > err_dev: > > > idr_remove(_idr, dev->minor); > > > err_idr: > > > -- > > > 2.43.0 > >
Re: [PATCH v11 14/14] selftests/sgx: Add scripts for EPC cgroup testing
Hi Jarkko, On Mon, 15 Apr 2024 14:08:44 -0500, Jarkko Sakkinen wrote: I did run the basic test by manually creating the cgroup so you could add tested-by from my side to the other kernel patches except this one I've reviewed it enough rounds and given various code suggestions etc. For me it is "good enough" or has been for a while. I just want this test to work so that people doing kernel QA will automatically get it to their testing cycle. That is why proper integration to kselftest framework is a must. May I have your "Reviewed-by" tag also for the patches #8-13? Not sure if I missed any other comments/issue you raised. I think all are addressed in v12. I will refine the test scripts (patch #14) in v13. Thanks Haitao
Re: Please create the email alias do-not-apply-to-sta...@kernel.org -> /dev/null
On Mon, Apr 22, 2024 at 05:49:29PM +0200, Thorsten Leemhuis wrote: > @Greg, BTW: should this be stable+noauto...@kernel.org or have a > 'vger.' No vger, just stable+whate...@kernel.org. > in it, e.g. stable+noauto...@vger.kernel.org? I assume without 'vger.' > is fine, just wanted to be sure, as > Documentation/process/stable-kernel-rules.rst in all other cases > specifies sta...@vger.kernel.org, so people are likely to get confused. > :-/ #sigh These serve two different purposes: sta...@kernel.org (goes into devnull) sta...@vger.kernel.org (actual mailing list) Confusion happens all the time, unfortunately. Notably, even if someone uses stable+noauto...@vger.kernel.org, it won't do anything terrible (it won't bounce, it'll just quietly go into nowhere because that's not a valid expansion command). -K
Re: [RFC][PATCH] uprobe: support for private hugetlb mappings
On 22.04.24 20:11, Guillaume Morin wrote: (Dropping Mike Kravetz as CC since he has retired and his email is no longer valid, adding Muchun since he's the current hugetlb maintainer, as well as linux-trace-kernel) On 22 Apr 11:39, David Hildenbrand wrote: On 19.04.24 20:37, Guillaume Morin wrote: libhugetlbfs, the Intel iodlr code both allow to remap .text onto a hugetlb private mapping. It's also pretty easy to do it manually. One drawback of using this functionality is the lack of support for uprobes (NOTE uprobe ignores shareable vmas) This patch adds support for private hugetlb mappings. It does require exposing some hugetlbfs innards and relies on copy_user_large_folio which is only available when CONFIG_HUGETLBFS is used so I had to use an ugly #ifdef If there is some interest in applying this patch in some form or another, I am open to any refactoring suggestions (esp getting rid the #ifdef in uprobes.c) . I tried to limit the amount of branching. All that hugetlb special casing oh my. What's the benefit why we should be interested in making that code less clean -- to phrase it in a nice way ;) ? I do appreciate the nice phrasing. Believe me, I did try to limit the special casing to a minimum :-). Outside of __replace_page, I added only 3-ish branches so I do not think it's *too* bad. The uprobe code is using PAGE_{SHIFT,MASK} quite liberally so I had to add calls to retrieve these for the hugetlb vmas. __replace_page has a lot of special casing. I certainly agree (and unfortunately for me it's at the beginning of the patch :)). It's doing something pretty uncommon outside of the mm code so it has to make a bunch of specific hugetlb calls. I am not quite sure how to improve it but if you have suggestions, I'd be happy to refactor. See below. The benefit - to me - is very clear. People do use hugetlb mappings to run code in production environments. The perf benefits are there for some workloads. Intel has published a whitepaper about it etc. 
Uprobes are a very good tool to do live tracing. If you can restart the process and reproduce, you should be able to disable hugetlb remapping but if you need to look at a live process, there are not many options. Not being able to use uprobes is crippling. Please add all that as motivation to the patch description or cover letter. Yes, libhugetlbfs exists. But why do we have to support uprobes with it? Nobody cared until now, why care now? I think you could ask the same question for every new feature patch :) I have to, because it usually indicates a lack of motivation in the cover-letter/patch description :P People will have to maintain that code, and maintaining hugetlb code in odd places is no fun ... Since the removal a few releases ago of the __morecore() hook in glibc, the main feature of libhugetlbfs is ELF segments remapping. I think there are definitely a lot of users that simply deal with this unnecessary limitation. I am certainly not shoving this patch through anyone's throat if there is no interest. But we definitely find it a very useful feature ... Let me try to see if we can get this done cleaner. One ugly part (in general here) is the custom page replacement in the registration part. We are guaranteed to have a MAP_PRIVATE mapping. Instead of replacing pages ourselves (which we likely shouldn't do ...) ... maybe we could use FAULT_FLAG_UNSHARE faults such that we will get an anonymous folio populated. (like KSM does nowadays) Punching FOLL_PIN|FOLL_LONGTERM into GUP would achieve the same thing, but using FOLL_WRITE would not work on many file systems. So maybe we have to trigger an unsharing fault ourselves. That would do the page replacement for us and we "should" be able to lookup an anonymous folio that we can then just modify, like ptrace would. But then, there is also unregistration part, with weird conditional page replacement. Zapping the anon page if the content matches the content of the original page is one thing. 
But why are we replacing an existing anonymous page by a new anonymous page when the content from the original page differs (but matches the one from the just copied page?)? I'll have to further think about that one. It's all a bit nasty. One thing to note is that hugetlb folios don't grow on trees. Likely, many setups *don't* reserve extra hugetlb folios and you might just easily be running out of free hugetlb folios that you can use to break COW here (replace a file hugetlb by a fresh anon hugetlb page). Likely it's easy to make register or unregister fail. -- Cheers, David / dhildenb
Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()
Hi Masami and Mike, On Sat, Apr 20, 2024 at 2:11 AM Masami Hiramatsu wrote: [...] > > > > > > IIUC, we need to update __execmem_cache_alloc() to take a range pointer as > > > input. module text will use "range" for EXECMEM_MODULE_TEXT, while kprobe > > > will use "range" for EXECMEM_KPROBE. Without "map to" concept or sharing > > > the "range" object, we will have to compare different range parameters to > > > check > > > we can share cached pages between module text and kprobe, which is not > > > efficient. Did I miss something? > > Song, thanks for trying to explain. I think I need to explain why I used > module_alloc() originally. > > This depends on how kprobe features are implemented on the architecture, and > how much features are supported on kprobes. > > Because kprobe jump optimization and kprobe jump-back optimization need to > use a jump instruction to jump into the trampoline and jump back from the > trampoline directly, if the architecture jmp instruction supports +-2GB range > like x86, it needs to allocate the trampoline buffer inside such address > space. > This requirement is similar to the modules (because module function needs to > call other functions in the kernel etc.), at least kprobes on x86 used > module_alloc(). > > However, if an architecture only supports breakpoint/trap based kprobe, > it does not need to consider whether the execmem is allocated. > > > > > We can always share large ROX pages as long as they are within the correct > > address space. The permissions for them are ROX and the alignment > > differences are due to KASAN and this is handled during allocation of the > > large page to refill the cache. __execmem_cache_alloc() only needs to limit > > the search for the address space of the range. > > So I don't think EXECMEM_KPROBE always same as EXECMEM_MODULE_TEXT, it > should be configured for each arch. Especially, if it is only used for > searching parameter, it looks OK to me. Thanks for the explanation! 
I was thinking "we can have EXECMEM_KPROBE share the same parameters as EXECMEM_MODULE_TEXT for all architectures". But this thought is built on top of assumptions on future changes/improvements within multiple sub systems. At this moment, I have no objections moving forward with current execmem APIs. Thanks, Song
Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent
On Mon, Apr 22, 2024 at 11:01:03AM +0800, Jason Xing wrote: ... > diff --git a/include/net/rstreason.h b/include/net/rstreason.h ... > +/** > + * There are three parts in order: > + * 1) reset reason in MPTCP: only for MPTCP use > + * 2) skb drop reason: relying on drop reasons for such as passive reset > + * 3) independent reset reason: such as active reset reasons > + */ Hi Jason, A minor nit from my side. '/**' denotes the beginning of a Kernel doc, but other than that, this comment is not a Kernel doc. FWIW, I would suggest providing a proper Kernel doc for enum sk_rst_reason. But another option would be to simply make this a normal comment, starting with "/* There are" > +enum sk_rst_reason { ...
Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions
On 22.04.24 20:20, Vincent Donnefort wrote: Hi David, Thanks for having a look, very much appreciated! On Mon, Apr 22, 2024 at 11:27:11AM +0200, David Hildenbrand wrote: On 19.04.24 20:25, David Hildenbrand wrote: On 06.04.24 19:36, Vincent Donnefort wrote: In preparation for allowing the user-space to map a ring-buffer, add a set of mapping functions: ring_buffer_{map,unmap}() And controls on the ring-buffer: ring_buffer_map_get_reader() /* swap reader and head */ Mapping the ring-buffer also involves: A unique ID for each subbuf of the ring-buffer, currently they are only identified through their in-kernel VA. A meta-page, where are stored ring-buffer statistics and a description for the current reader The linear mapping exposes the meta-page, and each subbuf of the ring-buffer, ordered following their unique ID, assigned during the first mapping. Once mapped, no subbuf can get in or out of the ring-buffer: the buffer size will remain unmodified and the splice enabling functions will in reality simply memcpy the data instead of swapping subbufs. 
CC: Signed-off-by: Vincent Donnefort diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index dc5ae4e96aee..96d2140b471e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -6,6 +6,8 @@ #include #include +#include + struct trace_buffer; struct ring_buffer_iter; @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); #define trace_rb_cpu_prepareNULL #endif +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma); +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h new file mode 100644 index ..ffcd8dfcaa4f --- /dev/null +++ b/include/uapi/linux/trace_mmap.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _TRACE_MMAP_H_ +#define _TRACE_MMAP_H_ + +#include + +/** + * struct trace_buffer_meta - Ring-buffer Meta-page description + * @meta_page_size:Size of this meta-page. + * @meta_struct_len: Size of this structure. + * @subbuf_size: Size of each sub-buffer. + * @nr_subbufs:Number of subbfs in the ring-buffer, including the reader. + * @reader.lost_events:Number of events lost at the time of the reader swap. + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] + * @reader.read: Number of bytes read on the reader subbuf. + * @flags: Placeholder for now, 0 until new features are supported. + * @entries: Number of entries in the ring-buffer. + * @overrun: Number of entries lost in the ring-buffer. + * @read: Number of entries that have been read. + * @Reserved1: Reserved for future use. + * @Reserved2: Reserved for future use. 
+ */ +struct trace_buffer_meta { + __u32 meta_page_size; + __u32 meta_struct_len; + + __u32 subbuf_size; + __u32 nr_subbufs; + + struct { + __u64 lost_events; + __u32 id; + __u32 read; + } reader; + + __u64 flags; + + __u64 entries; + __u64 overrun; + __u64 read; + + __u64 Reserved1; + __u64 Reserved2; +}; + +#endif /* _TRACE_MMAP_H_ */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc9ebe593571..793ecc454039 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include #include @@ -338,6 +340,7 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned longreal_end; /* real end of data */ unsigned order; /* order of the page */ + u32 id;/* ID for external mapping */ struct buffer_data_page *page; /* Actual data page */ }; @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; + + unsigned intmapped; + struct mutexmapping_lock; + unsigned long *subbuf_ids;/* ID to subbuf VA */ + struct trace_buffer_meta*meta_page; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ longnr_pages_to_update; struct list_head
Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions
Hi David, Thanks for having a look, very much appreciated! On Mon, Apr 22, 2024 at 11:27:11AM +0200, David Hildenbrand wrote: > On 19.04.24 20:25, David Hildenbrand wrote: > > On 06.04.24 19:36, Vincent Donnefort wrote: > > > In preparation for allowing the user-space to map a ring-buffer, add > > > a set of mapping functions: > > > > > > ring_buffer_{map,unmap}() > > > > > > And controls on the ring-buffer: > > > > > > ring_buffer_map_get_reader() /* swap reader and head */ > > > > > > Mapping the ring-buffer also involves: > > > > > > A unique ID for each subbuf of the ring-buffer, currently they are > > > only identified through their in-kernel VA. > > > > > > A meta-page, where are stored ring-buffer statistics and a > > > description for the current reader > > > > > > The linear mapping exposes the meta-page, and each subbuf of the > > > ring-buffer, ordered following their unique ID, assigned during the > > > first mapping. > > > > > > Once mapped, no subbuf can get in or out of the ring-buffer: the buffer > > > size will remain unmodified and the splice enabling functions will in > > > reality simply memcpy the data instead of swapping subbufs. 
> > > > > > CC: > > > Signed-off-by: Vincent Donnefort > > > > > > diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h > > > index dc5ae4e96aee..96d2140b471e 100644 > > > --- a/include/linux/ring_buffer.h > > > +++ b/include/linux/ring_buffer.h > > > @@ -6,6 +6,8 @@ > > >#include > > >#include > > > +#include > > > + > > >struct trace_buffer; > > >struct ring_buffer_iter; > > > @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct > > > hlist_node *node); > > >#define trace_rb_cpu_prepare NULL > > >#endif > > > +int ring_buffer_map(struct trace_buffer *buffer, int cpu, > > > + struct vm_area_struct *vma); > > > +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); > > > +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); > > >#endif /* _LINUX_RING_BUFFER_H */ > > > diff --git a/include/uapi/linux/trace_mmap.h > > > b/include/uapi/linux/trace_mmap.h > > > new file mode 100644 > > > index ..ffcd8dfcaa4f > > > --- /dev/null > > > +++ b/include/uapi/linux/trace_mmap.h > > > @@ -0,0 +1,46 @@ > > > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > > > +#ifndef _TRACE_MMAP_H_ > > > +#define _TRACE_MMAP_H_ > > > + > > > +#include > > > + > > > +/** > > > + * struct trace_buffer_meta - Ring-buffer Meta-page description > > > + * @meta_page_size: Size of this meta-page. > > > + * @meta_struct_len: Size of this structure. > > > + * @subbuf_size: Size of each sub-buffer. > > > + * @nr_subbufs: Number of subbfs in the ring-buffer, including > > > the reader. > > > + * @reader.lost_events: Number of events lost at the time of the reader > > > swap. > > > + * @reader.id: subbuf ID of the current reader. ID range [0 : > > > @nr_subbufs - 1] > > > + * @reader.read: Number of bytes read on the reader subbuf. > > > + * @flags: Placeholder for now, 0 until new features are > > > supported. > > > + * @entries: Number of entries in the ring-buffer. > > > + * @overrun: Number of entries lost in the ring-buffer. 
> > > + * @read:Number of entries that have been read. > > > + * @Reserved1: Reserved for future use. > > > + * @Reserved2: Reserved for future use. > > > + */ > > > +struct trace_buffer_meta { > > > + __u32 meta_page_size; > > > + __u32 meta_struct_len; > > > + > > > + __u32 subbuf_size; > > > + __u32 nr_subbufs; > > > + > > > + struct { > > > + __u64 lost_events; > > > + __u32 id; > > > + __u32 read; > > > + } reader; > > > + > > > + __u64 flags; > > > + > > > + __u64 entries; > > > + __u64 overrun; > > > + __u64 read; > > > + > > > + __u64 Reserved1; > > > + __u64 Reserved2; > > > +}; > > > + > > > +#endif /* _TRACE_MMAP_H_ */ > > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > > > index cc9ebe593571..793ecc454039 100644 > > > --- a/kernel/trace/ring_buffer.c > > > +++ b/kernel/trace/ring_buffer.c > > > @@ -9,6 +9,7 @@ > > >#include > > >#include > > >#include > > > +#include > > >#include > > >#include > > >#include > > > @@ -26,6 +27,7 @@ > > >#include > > >#include > > >#include > > > +#include > > >#include > > >#include > > > @@ -338,6 +340,7 @@ struct buffer_page { > > > local_t entries; /* entries on this page */ > > > unsigned longreal_end; /* real end of data */ > > > unsigned order; /* order of the page */ > > > + u32 id;/* ID for external mapping */ > > > struct buffer_data_page *page; /* Actual data page */ > > >
Re: [RFC][PATCH] uprobe: support for private hugetlb mappings
(Dropping Mike Kravetz as CC since he has retired and his email is no longer valid, adding Muchun since he's the current hugetlb maintainer, as well as linux-trace-kernel) On 22 Apr 11:39, David Hildenbrand wrote: > > On 19.04.24 20:37, Guillaume Morin wrote: > > libhugetlbfs, the Intel iodlr code both allow to remap .text onto a > > hugetlb private mapping. It's also pretty easy to do it manually. > > One drawback of using this functionality is the lack of support for > > uprobes (NOTE uprobe ignores shareable vmas) > > > > This patch adds support for private hugetlb mappings. It does require > > exposing > > some hugetlbfs innards and relies on copy_user_large_folio which is only > > available when CONFIG_HUGETLBFS is used so I had to use an ugly #ifdef > > > > If there is some interest in applying this patch in some form or > > another, I am open to any refactoring suggestions (esp getting rid the > > #ifdef in uprobes.c) . I tried to limit the > > amount of branching. > > All that hugetlb special casing oh my. What's the benefit why we should > be interested in making that code less clean -- to phrase it in a nice way > ;) ? I do appreciate the nice phrasing. Believe me, I did try to limit the special casing to a minimum :-). Outside of __replace_page, I added only 3-ish branches so I do not think it's *too* bad. The uprobe code is using PAGE_{SHIFT,MASK} quite liberally so I had to add calls to retrieve these for the hugetlb vmas. __replace_page has a lot of special casing. I certainly agree (and unfortunately for me it's at the beginning of the patch :)). It's doing something pretty uncommon outside of the mm code so it has to make a bunch of specific hugetlb calls. I am not quite sure how to improve it but if you have suggestions, I'd be happy to refactor. The benefit - to me - is very clear. People do use hugetlb mappings to run code in production environments. The perf benefits are there for some workloads. Intel has published a whitepaper about it etc. 
Uprobes are a very good tool to do live tracing. If you can restart the process and reproduce, you should be able to disable hugetlb remapping but if you need to look at a live process, there are not many options. Not being able to use uprobes is crippling. > Yes, libhugetlbfs exists. But why do we have to support uprobes with it? > Nobody cared until now, why care now? I think you could ask the same question for every new feature patch :) Since the removal a few releases ago of the __morecore() hook in glibc, the main feature of libhugetlbfs is ELF segments remapping. I think there are definitely a lot of users that simply deal with this unnecessary limitation. I am certainly not shoving this patch through anyone's throat if there is no interest. But we definitely find it a very useful feature ... Guillaume. -- Guillaume Morin
Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions
On Thu, Apr 18, 2024 at 11:43:46PM -0400, Steven Rostedt wrote: > On Thu, 18 Apr 2024 09:55:55 +0300 > Mike Rapoport wrote: > > Hi Mike, > > Thanks for doing this review! > > > > +/** > > > + * struct trace_buffer_meta - Ring-buffer Meta-page description > > > + * @meta_page_size: Size of this meta-page. > > > + * @meta_struct_len: Size of this structure. > > > + * @subbuf_size: Size of each sub-buffer. > > > + * @nr_subbufs: Number of subbfs in the ring-buffer, including > > > the reader. > > > + * @reader.lost_events: Number of events lost at the time of the reader > > > swap. > > > + * @reader.id: subbuf ID of the current reader. ID range [0 : > > > @nr_subbufs - 1] > > > + * @reader.read: Number of bytes read on the reader subbuf. > > > + * @flags: Placeholder for now, 0 until new features are > > > supported. > > > + * @entries: Number of entries in the ring-buffer. > > > + * @overrun: Number of entries lost in the ring-buffer. > > > + * @read:Number of entries that have been read. > > > + * @Reserved1: Reserved for future use. > > > + * @Reserved2: Reserved for future use. > > > + */ > > > +struct trace_buffer_meta { > > > + __u32 meta_page_size; > > > + __u32 meta_struct_len; > > > + > > > + __u32 subbuf_size; > > > + __u32 nr_subbufs; > > > + > > > + struct { > > > + __u64 lost_events; > > > + __u32 id; > > > + __u32 read; > > > + } reader; > > > + > > > + __u64 flags; > > > + > > > + __u64 entries; > > > + __u64 overrun; > > > + __u64 read; > > > + > > > + __u64 Reserved1; > > > + __u64 Reserved2; > > > > Why do you need reserved fields? This structure always resides in the > > beginning of a page and the rest of the page is essentially "reserved". > > So this code is also going to be used in arm's pkvm hypervisor code, > where it will be using these fields, but since we are looking at > keeping the same interface between the two, we don't want these used by > this interface. > > We probably should add a comment about that. 
> > > > > > +}; > > > + > > > +#endif /* _TRACE_MMAP_H_ */ > > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > > > index cc9ebe593571..793ecc454039 100644 > > > --- a/kernel/trace/ring_buffer.c > > > +++ b/kernel/trace/ring_buffer.c > > > > ... > > > > > +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu > > > *cpu_buffer, > > > +unsigned long *subbuf_ids) > > > +{ > > > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > > > + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; > > > + struct buffer_page *first_subbuf, *subbuf; > > > + int id = 0; > > > + > > > + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; > > > + cpu_buffer->reader_page->id = id++; > > > + > > > + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); > > > + do { > > > + if (WARN_ON(id >= nr_subbufs)) > > > + break; > > > + > > > + subbuf_ids[id] = (unsigned long)subbuf->page; > > > + subbuf->id = id; > > > + > > > + rb_inc_page(); > > > + id++; > > > + } while (subbuf != first_subbuf); > > > + > > > + /* install subbuf ID to kern VA translation */ > > > + cpu_buffer->subbuf_ids = subbuf_ids; > > > + > > > + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ > > > + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; > > > > Isn't this a single page? > > One thing we are doing is to make sure that the subbuffers are aligned > by their size. If a subbuffer is 3 pages, it should be aligned on 3 > page boundaries. This was something that Linus suggested. > > > > > > + meta->meta_struct_len = sizeof(*meta); > > > + meta->nr_subbufs = nr_subbufs; > > > + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; > > > + > > > + rb_update_meta_page(cpu_buffer); > > > +} > > > > ... 
> > > > > +#define subbuf_page(off, start) \ > > > + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) > > > + > > > +#define foreach_subbuf_page(sub_order, start, page) \ > > > > Nit: usually iterators in kernel use for_each > > Ah, good catch. Yeah, that should be changed. But then ... > > > > > > + page = subbuf_page(0, (start)); \ > > > + for (int __off = 0; __off < (1 << (sub_order)); \ > > > + __off++, page = subbuf_page(__off, (start))) > > > > The pages are allocated with alloc_pages_node(.. subbuf_order) are > > physically contiguous and struct pages for them are also contiguous, so > > inside a subbuf_order allocation you can just do page++. > > > > I'm wondering if we should just nuke the macro. It was there because of > the previous implementation did things twice. But now it's just done > once here: > > + while (s < nr_subbufs && p < nr_pages)
Re: [PATCH v12 09/14] x86/sgx: Implement async reclamation for cgroup
On Sun, 21 Apr 2024 19:22:27 -0500, Huang, Kai wrote: On Fri, 2024-04-19 at 20:14 -0500, Haitao Huang wrote: > > I think we can add support for "sgx_cgroup=disabled" in future if indeed > > needed. But just for init failure, no? > > > > It's not about the commandline, which we can add in the future when > needed. It's about we need to have a way to handle SGX cgroup being > disabled at boot time nicely, because we already have a case where we > need > to do so. > > Your approach looks half-way to me, and is not future extendible. If we > choose to do it, do it right -- that is, we need a way to disable it > completely in both kernel and userspace so that userspace won't be able > to > see it. That would need more changes in misc cgroup implementation to support sgx-disable. Right now misc does not have separate files for different resource types. So we can only block echo "sgx_epc..." to those interface files, can't really make files not visible. "won't be able to see" I mean "only for SGX EPC resource", but not the control files for the entire MISC cgroup. I replied at the beginning of the previous reply: " Given SGX EPC is just one type of MISC cgroup resources, we cannot just disable MISC cgroup as a whole. " Sorry I missed this point. below. You just need to set the SGX EPC "capacity" to 0 to disable SGX EPC. See the comment of @misc_res_capacity: * Miscellaneous resources capacity for the entire machine. 0 capacity * means resource is not initialized or not present in the host. IIUC I don't think the situation we have is either of those cases. For our case, resource is inited and present on the host but we have allocation error for sgx cgroup infra. And "blocking echo sgx_epc ... to those control files" is already sufficient for the purpose of not exposing SGX EPC to userspace, correct? E.g., if SGX cgroup is enabled, you can see below when you read "max": # cat /sys/fs/cgroup/my_group/misc.max # sgx_epc ... ... 
Otherwise you won't be able to see "sgx_epc": # cat /sys/fs/cgroup/my_group/misc.max # ... And when you try to write the "max" for "sgx_epc", you will hit error: # echo "sgx_epc 100" > /sys/fs/cgroup/my_group/misc.max # ... echo: write error: Invalid argument The above applies to all the control files. To me this pretty much means "SGX EPC is disabled" or "not supported" for userspace. You are right, capacity == 0 does block echoing max and users see an error if they do that. But 1) doubt you literally wanted "SGX EPC is disabled" and make it unsupported in this case, 2) even if we accept this is "sgx cgroup disabled" I don't see how it is much better user experience than current solution or really helps user better. Also to implement this approach, as you mentioned, we need to work around the fact that misc_try_charge() fails when capacity set to zero, and adding code to return root always? So it seems like more workaround code to just make it work for a failing case no one really cares much about and end result is not really much better IMHO. Thanks Haitao
Re: [PATCHv3 bpf-next 1/7] uprobe: Wire up uretprobe system call
On 04/21, Jiri Olsa wrote: > > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > include/linux/syscalls.h | 2 ++ > include/uapi/asm-generic/unistd.h | 5 - > kernel/sys_ni.c| 2 ++ > 4 files changed, 9 insertions(+), 1 deletion(-) Reviewed-by: Oleg Nesterov
Re: [PATCHv3 bpf-next 2/7] uprobe: Add uretprobe syscall to speed up return probe
On 04/21, Jiri Olsa wrote: > > arch/x86/kernel/uprobes.c | 115 ++ > include/linux/uprobes.h | 3 + > kernel/events/uprobes.c | 24 +--- > 3 files changed, 135 insertions(+), 7 deletions(-) Reviewed-by: Oleg Nesterov
Re: Please create the email alias do-not-apply-to-sta...@kernel.org -> /dev/null
[CCing Sasha] On 18.04.24 15:20, Greg KH wrote: > On Thu, Apr 18, 2024 at 09:04:53AM +0200, Thorsten Leemhuis wrote: >> On 17.04.24 15:38, Greg KH wrote: >>> On Wed, Apr 17, 2024 at 03:21:12PM +0200, Thorsten Leemhuis wrote: On 17.04.24 14:52, Konstantin Ryabitsev wrote: > On Wed, Apr 17, 2024 at 09:48:18AM +0200, Thorsten Leemhuis wrote: >> Could you please create the email alias >> >>> How about: >>> cc: # Reason goes here, and >>> must be present >>> >>> and we can make that address be routed to /dev/null just like >>> is? FWIW, we could go back to what I initially proposed: use the existing stable tag with a pre-defined comment to mark patches that AUTOSEL et. al. should not pick up: https://lore.kernel.org/all/c0a08b160b286e8c98549eedb37404c6e784cf8a.1712812895.git.li...@leemhuis.info/ >>> >>> If you can pick a better string, possibly, yes. >> >> What did you think of Konstantin's >> >> Cc: stable+noauto...@kernel.org # Reason @Greg, BTW: should this be stable+noauto...@kernel.org or have a 'vger.' in it, e.g. stable+noauto...@vger.kernel.org? I assume without 'vger.' is fine, just wanted to be sure, as Documentation/process/stable-kernel-rules.rst in all other cases specifies sta...@vger.kernel.org, so people are likely to get confused. :-/ #sigh >> That looked like a good solution -- and I wondered why I did not come up >> with that idea myself. Sure, "autosel" would also imply/mean "the >> scripts/tools that look out for Fixes: tags", but does that matter? > > We can live with this, sure. In that case I guess I now also have to fix the scripts to honor that tag. @Greg: something like the attached for scripts/fixes_search perhaps? Was that the right one and are there any other scripts that might need something similar? @Sasha: are the scripts around autosel online somewhere? They need a similar change. 
Ciao, ThorstenFrom 1e973a069b07f8c045401a7d3d20ea760a27422f Mon Sep 17 00:00:00 2001 From: Thorsten Leemhuis Date: Mon, 22 Apr 2024 17:31:01 +0200 Subject: [PATCH] scripts/fixes_search: honor noautosel tag Ignore commits that contain a soon to be documented tag that is meant to exclude commits from processing by scripts like scripts/fixes_search. Link: https://lore.kernel.org/all/2024041830-karaoke-aspirate-df00@gregkh/ [1] Signed-off-by: Thorsten Leemhuis --- scripts/fixes_search | 7 +++ 1 file changed, 7 insertions(+) diff --git a/scripts/fixes_search b/scripts/fixes_search index aaa12ec..950509f 100755 --- a/scripts/fixes_search +++ b/scripts/fixes_search @@ -131,6 +131,13 @@ for commit in $(git rev-list --reverse --no-merges "${git_range}"); do # logn "commit = ${txtgrn}${commit}${txtrst} " logn "${txtgrn}${commit}${txtrst} " + # Check if we are supposed to ignore the commit + no_autosel=$(git log -1 --format='%B' "HEAD" | grep -i '^[[:space:]]*[Cc][Cc]:[[:space:]]*
Re: [PATCH] drivers: remoteproc: xlnx: Add Versal and Versal-NET support
On Thu, Apr 18, 2024 at 03:01:25PM -0700, Tanmay Shah wrote: > AMD-Xilinx Versal platform is successor of ZynqMP platform. > Real-time Processing Unit R5 cluster IP on Versal is same as > of ZynqMP Platform. Power-domains ids for Versal platform is > different than ZynqMP. > > AMD-Xilinx Versal-NET platform is successor of Versal platform. > Versal-NET Real-Time Processing Unit has two clusters and each > cluster contains dual core ARM Cortex-R52 processors. Each R52 > core is assigned 128KB of TCM memory. > > Signed-off-by: Tanmay Shah > --- > drivers/remoteproc/xlnx_r5_remoteproc.c | 53 - > 1 file changed, 17 insertions(+), 36 deletions(-) > Applied. Thanks, Mathieu > diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c > b/drivers/remoteproc/xlnx_r5_remoteproc.c > index 7b1c12108bff..a6d8ac7394e7 100644 > --- a/drivers/remoteproc/xlnx_r5_remoteproc.c > +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c > @@ -300,36 +300,6 @@ static void zynqmp_r5_rproc_kick(struct rproc *rproc, > int vqid) > dev_warn(dev, "failed to send message\n"); > } > > -/* > - * zynqmp_r5_set_mode() > - * > - * set RPU cluster and TCM operation mode > - * > - * @r5_core: pointer to zynqmp_r5_core type object > - * @fw_reg_val: value expected by firmware to configure RPU cluster mode > - * @tcm_mode: value expected by fw to configure TCM mode (lockstep or split) > - * > - * Return: 0 for success and < 0 for failure > - */ > -static int zynqmp_r5_set_mode(struct zynqmp_r5_core *r5_core, > - enum rpu_oper_mode fw_reg_val, > - enum rpu_tcm_comb tcm_mode) > -{ > - int ret; > - > - ret = zynqmp_pm_set_rpu_mode(r5_core->pm_domain_id, fw_reg_val); > - if (ret < 0) { > - dev_err(r5_core->dev, "failed to set RPU mode\n"); > - return ret; > - } > - > - ret = zynqmp_pm_set_tcm_config(r5_core->pm_domain_id, tcm_mode); > - if (ret < 0) > - dev_err(r5_core->dev, "failed to configure TCM\n"); > - > - return ret; > -} > - > /* > * zynqmp_r5_rproc_start() > * @rproc: single R5 core's corresponding rproc instance 
> @@ -941,7 +911,7 @@ static int zynqmp_r5_core_init(struct zynqmp_r5_cluster > *cluster, > /* Maintain backward compatibility for zynqmp by using hardcode TCM > address. */ > if (of_find_property(r5_core->np, "reg", NULL)) > ret = zynqmp_r5_get_tcm_node_from_dt(cluster); > - else > + else if (device_is_compatible(dev, "xlnx,zynqmp-r5fss")) > ret = zynqmp_r5_get_tcm_node(cluster); > > if (ret) { > @@ -960,12 +930,21 @@ static int zynqmp_r5_core_init(struct zynqmp_r5_cluster > *cluster, > return ret; > } > > - ret = zynqmp_r5_set_mode(r5_core, fw_reg_val, tcm_mode); > - if (ret) { > - dev_err(dev, "failed to set r5 cluster mode %d, err > %d\n", > - cluster->mode, ret); > + ret = zynqmp_pm_set_rpu_mode(r5_core->pm_domain_id, fw_reg_val); > + if (ret < 0) { > + dev_err(r5_core->dev, "failed to set RPU mode\n"); > return ret; > } > + > + if (of_find_property(dev_of_node(dev), "xlnx,tcm-mode", NULL) || > + device_is_compatible(dev, "xlnx,zynqmp-r5fss")) { > + ret = zynqmp_pm_set_tcm_config(r5_core->pm_domain_id, > +tcm_mode); > + if (ret < 0) { > + dev_err(r5_core->dev, "failed to configure > TCM\n"); > + return ret; > + } > + } > } > > return 0; > @@ -1022,7 +1001,7 @@ static int zynqmp_r5_cluster_init(struct > zynqmp_r5_cluster *cluster) > ret = of_property_read_u32(dev_node, "xlnx,tcm-mode", (u32 > *)_mode); > if (ret) > return ret; > - } else { > + } else if (device_is_compatible(dev, "xlnx,zynqmp-r5fss")) { > if (cluster_mode == LOCKSTEP_MODE) > tcm_mode = PM_RPU_TCM_COMB; > else > @@ -1212,6 +1191,8 @@ static int zynqmp_r5_remoteproc_probe(struct > platform_device *pdev) > > /* Match table for OF platform binding */ > static const struct of_device_id zynqmp_r5_remoteproc_match[] = { > + { .compatible = "xlnx,versal-net-r52fss", }, > + { .compatible = "xlnx,versal-r5fss", }, > { .compatible = "xlnx,zynqmp-r5fss", }, > { /* end of list */ }, > }; > > base-commit: 912ebe48bec5927e2049e91b0e8a9cc682a709d2 > -- > 2.25.1 >
Re: [PATCH v2 2/2] remoteproc: mediatek: Support MT8188 SCP core 1
Hi Olivia, On Fri, Apr 19, 2024 at 04:42:11PM +0800, Olivia Wen wrote: > From: "olivia.wen" > > There are three primary modifications. > > 1. The struct mtk_scp_of_data usage on MT8188 > MT8192 functions are unsuitable for the dual-core MT8188 SCP, > which has two RISC-V cores similar to MT8195 but without L1TCM. > We've added MT8188-specific functions to configure L1TCM > in multicore setups. > > 2. SCP_IPI_IMGSYS_CMD feature > This version also adds SCP_IPI_IMGSYS_CMD to facilitate > communication between the imgsys kernel and the backend driver. > > 3. Different code sizes and IPI share buffer sizes > Each SCP necessitates different code and IPI share buffer sizes. > Introducing a structure mtk_scp_sizes_data to handle them. > Please split in 3 different patches and in the changelog, concentrate on "why" you are making the changes rather than "what" changes are done. Thanks, Mathieu > Signed-off-by: olivia.wen > --- > drivers/remoteproc/mtk_common.h| 11 +- > drivers/remoteproc/mtk_scp.c | 230 > + > drivers/remoteproc/mtk_scp_ipi.c | 7 +- > include/linux/remoteproc/mtk_scp.h | 1 + > 4 files changed, 223 insertions(+), 26 deletions(-) > > diff --git a/drivers/remoteproc/mtk_common.h b/drivers/remoteproc/mtk_common.h > index 6d7736a..fd5c539 100644 > --- a/drivers/remoteproc/mtk_common.h > +++ b/drivers/remoteproc/mtk_common.h > @@ -78,7 +78,6 @@ > #define MT8195_L2TCM_OFFSET 0x850d0 > > #define SCP_FW_VER_LEN 32 > -#define SCP_SHARE_BUFFER_SIZE288 > > struct scp_run { > u32 signaled; > @@ -97,6 +96,11 @@ struct scp_ipi_desc { > > struct mtk_scp; > > +struct mtk_scp_sizes_data { > + size_t max_dram_size; > + size_t ipi_share_buffer_size; > +}; > + > struct mtk_scp_of_data { > int (*scp_clk_get)(struct mtk_scp *scp); > int (*scp_before_load)(struct mtk_scp *scp); > @@ -110,6 +114,7 @@ struct mtk_scp_of_data { > u32 host_to_scp_int_bit; > > size_t ipi_buf_offset; > + const struct mtk_scp_sizes_data *scp_sizes; > }; > > struct mtk_scp_of_cluster { > @@ -141,10 +146,10 
@@ struct mtk_scp { > struct scp_ipi_desc ipi_desc[SCP_IPI_MAX]; > bool ipi_id_ack[SCP_IPI_MAX]; > wait_queue_head_t ack_wq; > + u8 *share_buf; > > void *cpu_addr; > dma_addr_t dma_addr; > - size_t dram_size; > > struct rproc_subdev *rpmsg_subdev; > > @@ -162,7 +167,7 @@ struct mtk_scp { > struct mtk_share_obj { > u32 id; > u32 len; > - u8 share_buf[SCP_SHARE_BUFFER_SIZE]; > + u8 *share_buf; > }; > > void scp_memcpy_aligned(void __iomem *dst, const void *src, unsigned int > len); > diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c > index 6751829..e281d28 100644 > --- a/drivers/remoteproc/mtk_scp.c > +++ b/drivers/remoteproc/mtk_scp.c > @@ -20,7 +20,6 @@ > #include "mtk_common.h" > #include "remoteproc_internal.h" > > -#define MAX_CODE_SIZE 0x50 > #define SECTION_NAME_IPI_BUFFER ".ipi_buffer" > > /** > @@ -94,14 +93,15 @@ static void scp_ipi_handler(struct mtk_scp *scp) > { > struct mtk_share_obj __iomem *rcv_obj = scp->recv_buf; > struct scp_ipi_desc *ipi_desc = scp->ipi_desc; > - u8 tmp_data[SCP_SHARE_BUFFER_SIZE]; > scp_ipi_handler_t handler; > u32 id = readl(_obj->id); > u32 len = readl(_obj->len); > + const struct mtk_scp_sizes_data *scp_sizes; > > - if (len > SCP_SHARE_BUFFER_SIZE) { > - dev_err(scp->dev, "ipi message too long (len %d, max %d)", len, > - SCP_SHARE_BUFFER_SIZE); > + scp_sizes = scp->data->scp_sizes; > + if (len > scp_sizes->ipi_share_buffer_size) { > + dev_err(scp->dev, "ipi message too long (len %d, max %zd)", len, > + scp_sizes->ipi_share_buffer_size); > return; > } > if (id >= SCP_IPI_MAX) { > @@ -117,8 +117,9 @@ static void scp_ipi_handler(struct mtk_scp *scp) > return; > } > > - memcpy_fromio(tmp_data, _obj->share_buf, len); > - handler(tmp_data, len, ipi_desc[id].priv); > + memset(scp->share_buf, 0, scp_sizes->ipi_share_buffer_size); > + memcpy_fromio(scp->share_buf, _obj->share_buf, len); > + handler(scp->share_buf, len, ipi_desc[id].priv); > scp_ipi_unlock(scp, id); > > scp->ipi_id_ack[id] = true; > @@ -133,6 
+134,8 @@ static int scp_ipi_init(struct mtk_scp *scp, const struct > firmware *fw) > { > int ret; > size_t buf_sz, offset; > + size_t share_buf_offset; > + const struct mtk_scp_sizes_data *scp_sizes; > > /* read the ipi buf addr from FW itself first */ > ret = scp_elf_read_ipi_buf_addr(scp, fw, &offset); > @@ -152,12 +155,15 @@ static int scp_ipi_init(struct mtk_scp *scp, const > struct firmware *fw) > return -EOVERFLOW; > } > > + scp_sizes = scp->data->scp_sizes; >
Re: [PATCH v2] uprobes: reduce contention on uprobes_tree access
On Mon, 22 Apr 2024 03:23:05 -0700 Jonathan Haslam wrote: > Active uprobes are stored in an RB tree and accesses to this tree are > dominated by read operations. Currently these accesses are serialized by > a spinlock but this leads to enormous contention when large numbers of > threads are executing active probes. > > This patch converts the spinlock used to serialize access to the > uprobes_tree RB tree into a reader-writer spinlock. This lock type > aligns naturally with the overwhelmingly read-only nature of the tree > usage here. Although the addition of reader-writer spinlocks are > discouraged [0], this fix is proposed as an interim solution while an > RCU based approach is implemented (that work is in a nascent form). This > fix also has the benefit of being trivial, self contained and therefore > simple to backport. > > We have used a uprobe benchmark from the BPF selftests [1] to estimate > the improvements. Each block of results below show 1 line per execution > of the benchmark ("the "Summary" line) and each line is a run with one > more thread added - a thread is a "producer". The lines are edited to > remove extraneous output. 
> > The tests were executed with this driver script: > > for num_threads in {1..20} > do > sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary > done > > SPINLOCK (BEFORE) > == > Summary: hits1.396 ± 0.007M/s ( 1.396M/prod) > Summary: hits1.656 ± 0.016M/s ( 0.828M/prod) > Summary: hits2.246 ± 0.008M/s ( 0.749M/prod) > Summary: hits2.114 ± 0.010M/s ( 0.529M/prod) > Summary: hits2.013 ± 0.009M/s ( 0.403M/prod) > Summary: hits1.753 ± 0.008M/s ( 0.292M/prod) > Summary: hits1.847 ± 0.001M/s ( 0.264M/prod) > Summary: hits1.889 ± 0.001M/s ( 0.236M/prod) > Summary: hits1.833 ± 0.006M/s ( 0.204M/prod) > Summary: hits1.900 ± 0.003M/s ( 0.190M/prod) > Summary: hits1.918 ± 0.006M/s ( 0.174M/prod) > Summary: hits1.925 ± 0.002M/s ( 0.160M/prod) > Summary: hits1.837 ± 0.001M/s ( 0.141M/prod) > Summary: hits1.898 ± 0.001M/s ( 0.136M/prod) > Summary: hits1.799 ± 0.016M/s ( 0.120M/prod) > Summary: hits1.850 ± 0.005M/s ( 0.109M/prod) > Summary: hits1.816 ± 0.002M/s ( 0.101M/prod) > Summary: hits1.787 ± 0.001M/s ( 0.094M/prod) > Summary: hits1.764 ± 0.002M/s ( 0.088M/prod) > > RW SPINLOCK (AFTER) > === > Summary: hits1.444 ± 0.020M/s ( 1.444M/prod) > Summary: hits2.279 ± 0.011M/s ( 1.139M/prod) > Summary: hits3.422 ± 0.014M/s ( 1.141M/prod) > Summary: hits3.565 ± 0.017M/s ( 0.891M/prod) > Summary: hits2.671 ± 0.013M/s ( 0.534M/prod) > Summary: hits2.409 ± 0.005M/s ( 0.401M/prod) > Summary: hits2.485 ± 0.008M/s ( 0.355M/prod) > Summary: hits2.496 ± 0.003M/s ( 0.312M/prod) > Summary: hits2.585 ± 0.002M/s ( 0.287M/prod) > Summary: hits2.908 ± 0.011M/s ( 0.291M/prod) > Summary: hits2.346 ± 0.016M/s ( 0.213M/prod) > Summary: hits2.804 ± 0.004M/s ( 0.234M/prod) > Summary: hits2.556 ± 0.001M/s ( 0.197M/prod) > Summary: hits2.754 ± 0.004M/s ( 0.197M/prod) > Summary: hits2.482 ± 0.002M/s ( 0.165M/prod) > Summary: hits2.412 ± 0.005M/s ( 0.151M/prod) > Summary: hits2.710 ± 0.003M/s ( 0.159M/prod) > Summary: hits2.826 ± 0.005M/s ( 0.157M/prod) > Summary: hits2.718 ± 0.001M/s ( 
0.143M/prod) > Summary: hits2.844 ± 0.006M/s ( 0.142M/prod) > > The numbers in parenthesis give averaged throughput per thread which is > of greatest interest here as a measure of scalability. Improvements are > in the order of 22 - 68% with this particular benchmark (mean = 43%). > > V2: > - Updated commit message to include benchmark results. > > [0] https://docs.kernel.org/locking/spinlocks.html > [1] > https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c Thanks for update! This looks good to me. Let me pick this for probes/for-next. Thank you, > > Signed-off-by: Jonathan Haslam > --- > kernel/events/uprobes.c | 22 +++--- > 1 file changed, 11 insertions(+), 11 deletions(-) > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c > index e4834d23e1d1..8ae0eefc3a34 100644 > --- a/kernel/events/uprobes.c > +++ b/kernel/events/uprobes.c > @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; > */ > #define no_uprobe_events() RB_EMPTY_ROOT(_tree) > > -static DEFINE_SPINLOCK(uprobes_treelock);/* serialize rbtree access */ > +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ > > #define UPROBES_HASH_SZ 13 > /* serialize uprobe->pending_list */ > @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, > loff_t offset) > { > struct uprobe *uprobe; > > - spin_lock(_treelock); > + read_lock(_treelock); > uprobe = __find_uprobe(inode, offset); > - spin_unlock(_treelock); > + read_unlock(_treelock); > >
Re: [PATCH v2] uprobes: reduce contention on uprobes_tree access
On Mon, 22 Apr 2024 13:39:32 +0200 Jiri Olsa wrote: > On Mon, Apr 22, 2024 at 03:23:05AM -0700, Jonathan Haslam wrote: > > Active uprobes are stored in an RB tree and accesses to this tree are > > dominated by read operations. Currently these accesses are serialized by > > a spinlock but this leads to enormous contention when large numbers of > > threads are executing active probes. > > > > This patch converts the spinlock used to serialize access to the > > uprobes_tree RB tree into a reader-writer spinlock. This lock type > > aligns naturally with the overwhelmingly read-only nature of the tree > > usage here. Although the addition of reader-writer spinlocks are > > discouraged [0], this fix is proposed as an interim solution while an > > RCU based approach is implemented (that work is in a nascent form). This > > fix also has the benefit of being trivial, self contained and therefore > > simple to backport. > > > > We have used a uprobe benchmark from the BPF selftests [1] to estimate > > the improvements. Each block of results below show 1 line per execution > > of the benchmark ("the "Summary" line) and each line is a run with one > > more thread added - a thread is a "producer". The lines are edited to > > remove extraneous output. 
> > > > The tests were executed with this driver script: > > > > for num_threads in {1..20} > > do > > sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary > > done > > > > SPINLOCK (BEFORE) > > == > > Summary: hits1.396 ± 0.007M/s ( 1.396M/prod) > > Summary: hits1.656 ± 0.016M/s ( 0.828M/prod) > > Summary: hits2.246 ± 0.008M/s ( 0.749M/prod) > > Summary: hits2.114 ± 0.010M/s ( 0.529M/prod) > > Summary: hits2.013 ± 0.009M/s ( 0.403M/prod) > > Summary: hits1.753 ± 0.008M/s ( 0.292M/prod) > > Summary: hits1.847 ± 0.001M/s ( 0.264M/prod) > > Summary: hits1.889 ± 0.001M/s ( 0.236M/prod) > > Summary: hits1.833 ± 0.006M/s ( 0.204M/prod) > > Summary: hits1.900 ± 0.003M/s ( 0.190M/prod) > > Summary: hits1.918 ± 0.006M/s ( 0.174M/prod) > > Summary: hits1.925 ± 0.002M/s ( 0.160M/prod) > > Summary: hits1.837 ± 0.001M/s ( 0.141M/prod) > > Summary: hits1.898 ± 0.001M/s ( 0.136M/prod) > > Summary: hits1.799 ± 0.016M/s ( 0.120M/prod) > > Summary: hits1.850 ± 0.005M/s ( 0.109M/prod) > > Summary: hits1.816 ± 0.002M/s ( 0.101M/prod) > > Summary: hits1.787 ± 0.001M/s ( 0.094M/prod) > > Summary: hits1.764 ± 0.002M/s ( 0.088M/prod) > > > > RW SPINLOCK (AFTER) > > === > > Summary: hits1.444 ± 0.020M/s ( 1.444M/prod) > > Summary: hits2.279 ± 0.011M/s ( 1.139M/prod) > > Summary: hits3.422 ± 0.014M/s ( 1.141M/prod) > > Summary: hits3.565 ± 0.017M/s ( 0.891M/prod) > > Summary: hits2.671 ± 0.013M/s ( 0.534M/prod) > > Summary: hits2.409 ± 0.005M/s ( 0.401M/prod) > > Summary: hits2.485 ± 0.008M/s ( 0.355M/prod) > > Summary: hits2.496 ± 0.003M/s ( 0.312M/prod) > > Summary: hits2.585 ± 0.002M/s ( 0.287M/prod) > > Summary: hits2.908 ± 0.011M/s ( 0.291M/prod) > > Summary: hits2.346 ± 0.016M/s ( 0.213M/prod) > > Summary: hits2.804 ± 0.004M/s ( 0.234M/prod) > > Summary: hits2.556 ± 0.001M/s ( 0.197M/prod) > > Summary: hits2.754 ± 0.004M/s ( 0.197M/prod) > > Summary: hits2.482 ± 0.002M/s ( 0.165M/prod) > > Summary: hits2.412 ± 0.005M/s ( 0.151M/prod) > > Summary: hits2.710 ± 0.003M/s ( 
0.159M/prod) > > Summary: hits2.826 ± 0.005M/s ( 0.157M/prod) > > Summary: hits2.718 ± 0.001M/s ( 0.143M/prod) > > Summary: hits2.844 ± 0.006M/s ( 0.142M/prod) > > nice, I'm assuming Masami will take this one.. in any case: > > Acked-by: Jiri Olsa Thanks Jiri! This looks good to me too. Let me pick this for probes/for-next. Thank you, > > thanks, > jirka > > > > > The numbers in parenthesis give averaged throughput per thread which is > > of greatest interest here as a measure of scalability. Improvements are > > in the order of 22 - 68% with this particular benchmark (mean = 43%). > > > > V2: > > - Updated commit message to include benchmark results. > > > > [0] https://docs.kernel.org/locking/spinlocks.html > > [1] > > https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c > > > > Signed-off-by: Jonathan Haslam > > --- > > kernel/events/uprobes.c | 22 +++--- > > 1 file changed, 11 insertions(+), 11 deletions(-) > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c > > index e4834d23e1d1..8ae0eefc3a34 100644 > > --- a/kernel/events/uprobes.c > > +++ b/kernel/events/uprobes.c > > @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; > > */ > > #define no_uprobe_events() RB_EMPTY_ROOT(_tree) > > > > -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ > > +static DEFINE_RWLOCK(uprobes_treelock);/* serialize rbtree access */ > > > > #define
Re: [PATCHv3 bpf-next 0/7] uprobe: uretprobe speed up
Hi Jiri, On Sun, 21 Apr 2024 21:41:59 +0200 Jiri Olsa wrote: > hi, > as part of the effort on speeding up the uprobes [0] coming with > return uprobe optimization by using syscall instead of the trap > on the uretprobe trampoline. > > The speed up depends on instruction type that uprobe is installed > and depends on specific HW type, please check patch 1 for details. > > Patches 1-6 are based on bpf-next/master, but patch 1 and 2 are > apply-able on linux-trace.git tree probes/for-next branch. > Patch 7 is based on man-pages master. Thanks for the update! I reviewed the series and just except for the manpage, it looks good to me. Reviewed-by: Masami Hiramatsu (Google) for the series. If Linux API maintainers are OK, I can pick this in probes/for-next. (BTW, who will pick the manpage patch?) Thank you, > > v3 changes: > - added source ip check if the uretprobe syscall is called from > trampoline and sending SIGILL to process if it's not > - keep x86 compat process to use standard breakpoint > - split syscall wiring into separate change > - ran ltp and syzkaller locally, no issues found [Masami] > - building uprobe_compat binary in selftests which breaks > CI atm because of missing 32-bit delve packages, I will > need to fix that in separate changes once this is acked > - added man page change > - there were several changes so I removed acks [Oleg Andrii] > > Also available at: > https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git > uretprobe_syscall > > thanks, > jirka > > > Notes to check list items in Documentation/process/adding-syscalls.rst: > > - System Call Alternatives > New syscall seems like the best way in here, because we need > just to quickly enter kernel with no extra arguments processing, > which we'd need to do if we decided to use another syscall. > > - Designing the API: Planning for Extension > The uretprobe syscall is very specific and most likely won't be > extended in the future. 
> > At the moment it does not take any arguments and even if it does > in future, it's allowed to be called only from trampoline prepared > by kernel, so there'll be no broken user. > > - Designing the API: Other Considerations > N/A because uretprobe syscall does not return reference to kernel > object. > > - Proposing the API > Wiring up of the uretprobe system call is in a separate change, > selftests and man page changes are part of the patchset. > > - Generic System Call Implementation > There's no CONFIG option for the new functionality because it > keeps the same behaviour from the user POV. > > - x86 System Call Implementation > It's 64-bit syscall only. > > - Compatibility System Calls (Generic) > N/A uretprobe syscall has no arguments and is not supported > for compat processes. > > - Compatibility System Calls (x86) > N/A uretprobe syscall is not supported for compat processes. > > - System Calls Returning Elsewhere > N/A. > > - Other Details > N/A. > > - Testing > Adding new bpf selftests and ran ltp on top of this change. > > - Man Page > Attached. > > - Do not call System Calls in the Kernel > N/A. 
> > > [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/ > --- > Jiri Olsa (6): > uprobe: Wire up uretprobe system call > uprobe: Add uretprobe syscall to speed up return probe > selftests/bpf: Add uretprobe syscall test for regs integrity > selftests/bpf: Add uretprobe syscall test for regs changes > selftests/bpf: Add uretprobe syscall call from user space test > selftests/bpf: Add uretprobe compat test > > arch/x86/entry/syscalls/syscall_64.tbl| 1 + > arch/x86/kernel/uprobes.c | 115 > ++ > include/linux/syscalls.h | 2 + > include/linux/uprobes.h | 3 + > include/uapi/asm-generic/unistd.h | 5 +- > kernel/events/uprobes.c | 24 +-- > kernel/sys_ni.c | 2 + > tools/include/linux/compiler.h| 4 ++ > tools/testing/selftests/bpf/.gitignore| 1 + > tools/testing/selftests/bpf/Makefile | 6 +- > tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 123 > +++- > tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 362 > + > tools/testing/selftests/bpf/progs/uprobe_syscall.c| 15 > tools/testing/selftests/bpf/progs/uprobe_syscall_call.c | 15 > tools/testing/selftests/bpf/progs/uprobe_syscall_compat.c | 13 > 15 files changed, 681 insertions(+), 10 deletions(-) > create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c > create mode 100644
Re: [PATCH 7/7] man2: Add uretprobe syscall page
On Sun, 21 Apr 2024 21:42:06 +0200 Jiri Olsa wrote: > Adding man page for new uretprobe syscall. > > Signed-off-by: Jiri Olsa > --- > man2/uretprobe.2 | 40 > 1 file changed, 40 insertions(+) > create mode 100644 man2/uretprobe.2 > > diff --git a/man2/uretprobe.2 b/man2/uretprobe.2 > new file mode 100644 > index ..c0343a88bb57 > --- /dev/null > +++ b/man2/uretprobe.2 > @@ -0,0 +1,40 @@ > +.\" Copyright (C) 2024, Jiri Olsa > +.\" > +.\" SPDX-License-Identifier: Linux-man-pages-copyleft > +.\" > +.TH uretprobe 2 (date) "Linux man-pages (unreleased)" > +.SH NAME > +uretprobe \- execute pending return uprobes > +.SH SYNOPSIS > +.nf > +.B int uretprobe(void) > +.fi > +.SH DESCRIPTION > +On x86_64 architecture the kernel is using uretprobe syscall to trigger > +uprobe return probe consumers instead of using standard breakpoint > instruction. > +The reason is that it's much faster to do syscall than breakpoint trap > +on x86_64 architecture. Do we specify the supported architecture as this? Currently it is supported only on x86-64, but it could be extended later, right? This should be just noted as NOTES. Something like "This syscall is initially introduced on x86-64 because a syscall is faster than a breakpoint trap on it. But this will be extended to the architectures whose syscall is faster than breakpoint trap." Thank you, > + > +The uretprobe syscall is not supposed to be called directly by user, it's > allowed > +to be invoked only through user space trampoline provided by kernel. > +When called from outside of this trampoline, the calling process will receive > +.BR SIGILL . > + > +.SH RETURN VALUE > +.BR uretprobe() > +return value is specific for given architecture. > + > +.SH VERSIONS > +This syscall is not specified in POSIX, > +and details of its behavior vary across systems. > +.SH STANDARDS > +None. > +.SH NOTES > +.BR uretprobe() > +exists only to allow the invocation of return uprobe consumers. > +It should > +.B never > +be called directly. 
> +Details of the arguments (if any) passed to > +.BR uretprobe () > +and the return value are specific for given architecture. > -- > 2.44.0 > -- Masami Hiramatsu (Google)
Re: [PATCH virt] virt: fix uninit-value in vhost_vsock_dev_open
On Mon, Apr 22, 2024 at 09:00:31AM -0400, Stefan Hajnoczi wrote: > On Sun, Apr 21, 2024 at 12:06:06PM +0900, Jeongjun Park wrote: > > static bool vhost_transport_seqpacket_allow(u32 remote_cid) > > { > > > > vsock = vhost_vsock_get(remote_cid); > > > > if (vsock) > > seqpacket_allow = vsock->seqpacket_allow; > > > > } > > > > I think this is due to reading a previously created uninitialized > > vsock->seqpacket_allow inside vhost_transport_seqpacket_allow(), > > which is executed by the function pointer present in the if statement. > > CCing Arseny, author of commit ced7b713711f ("vhost/vsock: support > SEQPACKET for transport"). > > Looks like a genuine bug in the commit. vhost_vsock_set_features() sets > seqpacket_allow to true when the feature is negotiated. The assumption > is that the field defaults to false. > > The rest of the vhost_vsock.ko code is written to initialize the > vhost_vsock fields, so you could argue seqpacket_allow should just be > explicitly initialized to false. > > However, eliminating this class of errors by zeroing seems reasonable in > this code path. vhost_vsock_dev_open() is not performance-critical. > > Acked-by: Stefan Hajnoczi But now that it's explained, the bugfix as proposed is incomplete: userspace can set features twice and the second time will leak old VIRTIO_VSOCK_F_SEQPACKET bit value. And I am pretty sure the Fixes tag is wrong. So I wrote this, but I actually don't have a set for seqpacket to test this. Arseny could you help test maybe? Thanks! commit bcc17a060d93b198d8a17a9b87b593f41337ee28 Author: Michael S. Tsirkin Date: Mon Apr 22 10:03:13 2024 -0400 vhost/vsock: always initialize seqpacket_allow There are two issues around seqpacket_allow: 1. seqpacket_allow is not initialized when socket is created. Thus if features are never set, it will be read uninitialized. 2. 
if VIRTIO_VSOCK_F_SEQPACKET is set and then cleared, then seqpacket_allow will not be cleared appropriately (existing apps I know about don't usually do this but it's legal and there's no way to be sure no one relies on this). To fix: - initialize seqpacket_allow after allocation - set it unconditionally in set_features Reported-by: syzbot+6c21aeb59d0e82eb2...@syzkaller.appspotmail.com Reported-by: Jeongjun Park Fixes: ced7b713711f ("vhost/vsock: support SEQPACKET for transport"). Cc: Arseny Krasnov Cc: David S. Miller Cc: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index ec20ecff85c7..bf664ec9341b 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -667,6 +667,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) } vsock->guest_cid = 0; /* no CID assigned yet */ + vsock->seqpacket_allow = false; atomic_set(>queued_replies, 0); @@ -810,8 +811,7 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) goto err; } - if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET)) - vsock->seqpacket_allow = true; + vsock->seqpacket_allow = features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET); for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { vq = >vqs[i];
Re: [syzbot] [virt?] [net?] KMSAN: uninit-value in vsock_assign_transport (2)
On Fri, Apr 19, 2024 at 02:39:20AM -0700, syzbot wrote: > Hello, > > syzbot found the following issue on: > > HEAD commit:8cd26fd90c1a Merge tag 'for-6.9-rc4-tag' of git://git.kern.. > git tree: upstream > console+strace: https://syzkaller.appspot.com/x/log.txt?x=102d27cd18 > kernel config: https://syzkaller.appspot.com/x/.config?x=87a805e655619c64 > dashboard link: https://syzkaller.appspot.com/bug?extid=6c21aeb59d0e82eb2782 > compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) > 2.40 > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=16e38c3b18 > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=10e62fed18 > > Downloadable assets: > disk image: > https://storage.googleapis.com/syzbot-assets/488822aee24a/disk-8cd26fd9.raw.xz > vmlinux: > https://storage.googleapis.com/syzbot-assets/ba40e322ba00/vmlinux-8cd26fd9.xz > kernel image: > https://storage.googleapis.com/syzbot-assets/f30af1dfbc30/bzImage-8cd26fd9.xz > > IMPORTANT: if you fix the issue, please add the following tag to the commit: > Reported-by: syzbot+6c21aeb59d0e82eb2...@syzkaller.appspotmail.com > > = > BUG: KMSAN: uninit-value in vsock_assign_transport+0xb2a/0xb90 > net/vmw_vsock/af_vsock.c:500 > vsock_assign_transport+0xb2a/0xb90 net/vmw_vsock/af_vsock.c:500 > vsock_connect+0x544/0x1560 net/vmw_vsock/af_vsock.c:1393 > __sys_connect_file net/socket.c:2048 [inline] > __sys_connect+0x606/0x690 net/socket.c:2065 > __do_sys_connect net/socket.c:2075 [inline] > __se_sys_connect net/socket.c:2072 [inline] > __x64_sys_connect+0x91/0xe0 net/socket.c:2072 > x64_sys_call+0x3356/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:43 > do_syscall_x64 arch/x86/entry/common.c:52 [inline] > do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 > entry_SYSCALL_64_after_hwframe+0x77/0x7f > > Uninit was created at: > __kmalloc_large_node+0x231/0x370 mm/slub.c:3921 > __do_kmalloc_node mm/slub.c:3954 [inline] > __kmalloc_node+0xb07/0x1060 mm/slub.c:3973 > kmalloc_node include/linux/slab.h:648 
[inline] > kvmalloc_node+0xc0/0x2d0 mm/util.c:634 > kvmalloc include/linux/slab.h:766 [inline] > vhost_vsock_dev_open+0x44/0x510 drivers/vhost/vsock.c:659 > misc_open+0x66b/0x760 drivers/char/misc.c:165 > chrdev_open+0xa5f/0xb80 fs/char_dev.c:414 > do_dentry_open+0x11f1/0x2120 fs/open.c:955 > vfs_open+0x7e/0xa0 fs/open.c:1089 > do_open fs/namei.c:3642 [inline] > path_openat+0x4a3c/0x5b00 fs/namei.c:3799 > do_filp_open+0x20e/0x590 fs/namei.c:3826 > do_sys_openat2+0x1bf/0x2f0 fs/open.c:1406 > do_sys_open fs/open.c:1421 [inline] > __do_sys_openat fs/open.c:1437 [inline] > __se_sys_openat fs/open.c:1432 [inline] > __x64_sys_openat+0x2a1/0x310 fs/open.c:1432 > x64_sys_call+0x3a64/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:258 > do_syscall_x64 arch/x86/entry/common.c:52 [inline] > do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 > entry_SYSCALL_64_after_hwframe+0x77/0x7f > > CPU: 1 PID: 5021 Comm: syz-executor390 Not tainted > 6.9.0-rc4-syzkaller-00038-g8cd26fd90c1a #0 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS > Google 03/27/2024 > = > > > --- > This report is generated by a bot. It may contain errors. > See https://goo.gl/tpsmEJ for more information about syzbot. > syzbot engineers can be reached at syzkal...@googlegroups.com. > > syzbot will keep track of this issue. See: > https://goo.gl/tpsmEJ#status for how to communicate with syzbot. > > If the report is already addressed, let syzbot know by replying with: > #syz fix: exact-commit-title > > If you want syzbot to run the reproducer, reply with: > #syz test: git://repo/address.git branch-or-commit-hash > If you attach or paste a git patch, syzbot will apply it before testing. 
> > If you want to overwrite report's subsystems, reply with: > #syz set subsystems: new-subsystem > (See the list of subsystem names on the web dashboard) > > If the report is a duplicate of another one, reply with: > #syz dup: exact-subject-of-another-report > > If you want to undo deduplication, reply with: > #syz undup #syz test: https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git bcc17a060d93b198d8a17a9b87b593f41337ee28
Re: [PATCH v5 14/15] kprobes: remove dependency on CONFIG_MODULES
On Mon, 22 Apr 2024 12:44:35 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (IBM)" > > kprobes depended on CONFIG_MODULES because it has to allocate memory for > code. > > Since code allocations are now implemented with execmem, kprobes can be > enabled in non-modular kernels. > > Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes inside > modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the > dependency of CONFIG_KPROBES on CONFIG_MODULES. Looks good to me. Acked-by: Masami Hiramatsu (Google) Thank you! > > Signed-off-by: Mike Rapoport (IBM) > --- > arch/Kconfig| 2 +- > include/linux/module.h | 9 ++ > kernel/kprobes.c| 55 +++-- > kernel/trace/trace_kprobe.c | 20 +- > 4 files changed, 63 insertions(+), 23 deletions(-) > > diff --git a/arch/Kconfig b/arch/Kconfig > index 7006f71f0110..a48ce6a488b3 100644 > --- a/arch/Kconfig > +++ b/arch/Kconfig > @@ -52,9 +52,9 @@ config GENERIC_ENTRY > > config KPROBES > bool "Kprobes" > - depends on MODULES > depends on HAVE_KPROBES > select KALLSYMS > + select EXECMEM > select TASKS_RCU if PREEMPTION > help > Kprobes allows you to trap at almost any kernel address and > diff --git a/include/linux/module.h b/include/linux/module.h > index 1153b0d99a80..ffa1c603163c 100644 > --- a/include/linux/module.h > +++ b/include/linux/module.h > @@ -605,6 +605,11 @@ static inline bool module_is_live(struct module *mod) > return mod->state != MODULE_STATE_GOING; > } > > +static inline bool module_is_coming(struct module *mod) > +{ > +return mod->state == MODULE_STATE_COMING; > +} > + > struct module *__module_text_address(unsigned long addr); > struct module *__module_address(unsigned long addr); > bool is_module_address(unsigned long addr); > @@ -857,6 +862,10 @@ void *dereference_module_function_descriptor(struct > module *mod, void *ptr) > return ptr; > } > > +static inline bool module_is_coming(struct module *mod) > +{ > + return false; > +} > #endif /* CONFIG_MODULES */ > > #ifdef CONFIG_SYSFS > diff --git 
a/kernel/kprobes.c b/kernel/kprobes.c > index ddd7cdc16edf..ca2c6cbd42d2 100644 > --- a/kernel/kprobes.c > +++ b/kernel/kprobes.c > @@ -1588,7 +1588,7 @@ static int check_kprobe_address_safe(struct kprobe *p, > } > > /* Get module refcount and reject __init functions for loaded modules. > */ > - if (*probed_mod) { > + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { > /* >* We must hold a refcount of the probed module while updating >* its code to prohibit unexpected unloading. > @@ -1603,12 +1603,13 @@ static int check_kprobe_address_safe(struct kprobe *p, >* kprobes in there. >*/ > if (within_module_init((unsigned long)p->addr, *probed_mod) && > - (*probed_mod)->state != MODULE_STATE_COMING) { > + !module_is_coming(*probed_mod)) { > module_put(*probed_mod); > *probed_mod = NULL; > ret = -ENOENT; > } > } > + > out: > preempt_enable(); > jump_label_unlock(); > @@ -2488,24 +2489,6 @@ int kprobe_add_area_blacklist(unsigned long start, > unsigned long end) > return 0; > } > > -/* Remove all symbols in given area from kprobe blacklist */ > -static void kprobe_remove_area_blacklist(unsigned long start, unsigned long > end) > -{ > - struct kprobe_blacklist_entry *ent, *n; > - > - list_for_each_entry_safe(ent, n, _blacklist, list) { > - if (ent->start_addr < start || ent->start_addr >= end) > - continue; > - list_del(>list); > - kfree(ent); > - } > -} > - > -static void kprobe_remove_ksym_blacklist(unsigned long entry) > -{ > - kprobe_remove_area_blacklist(entry, entry + 1); > -} > - > int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long > *value, > char *type, char *sym) > { > @@ -2570,6 +2553,25 @@ static int __init populate_kprobe_blacklist(unsigned > long *start, > return ret ? 
: arch_populate_kprobe_blacklist(); > } > > +#ifdef CONFIG_MODULES > +/* Remove all symbols in given area from kprobe blacklist */ > +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long > end) > +{ > + struct kprobe_blacklist_entry *ent, *n; > + > + list_for_each_entry_safe(ent, n, _blacklist, list) { > + if (ent->start_addr < start || ent->start_addr >= end) > + continue; > + list_del(>list); > + kfree(ent); > + } > +} > + > +static void kprobe_remove_ksym_blacklist(unsigned long entry) > +{ > + kprobe_remove_area_blacklist(entry, entry + 1); > +} > + > static void add_module_kprobe_blacklist(struct module
Re: [PATCH virt] virt: fix uninit-value in vhost_vsock_dev_open
On Sun, Apr 21, 2024 at 12:06:06PM +0900, Jeongjun Park wrote: > static bool vhost_transport_seqpacket_allow(u32 remote_cid) > { > > vsock = vhost_vsock_get(remote_cid); > > if (vsock) > seqpacket_allow = vsock->seqpacket_allow; > > } > > I think this is due to reading a previously created uninitialized > vsock->seqpacket_allow inside vhost_transport_seqpacket_allow(), > which is executed by the function pointer present in the if statement. CCing Arseny, author of commit ced7b713711f ("vhost/vsock: support SEQPACKET for transport"). Looks like a genuine bug in the commit. vhost_vsock_set_features() sets seqpacket_allow to true when the feature is negotiated. The assumption is that the field defaults to false. The rest of the vhost_vsock.ko code is written to initialize the vhost_vsock fields, so you could argue seqpacket_allow should just be explicitly initialized to false. However, eliminating this class of errors by zeroing seems reasonable in this code path. vhost_vsock_dev_open() is not performance-critical. Acked-by: Stefan Hajnoczi signature.asc Description: PGP signature
Re: [PATCH v5 11/15] arch: make execmem setup available regardless of CONFIG_MODULES
On 22/4/24 11:44, Mike Rapoport wrote: From: "Mike Rapoport (IBM)" execmem does not depend on modules, on the contrary modules use execmem. To make execmem available when CONFIG_MODULES=n, for instance for kprobes, split execmem_params initialization out from arch/*/kernel/module.c and compile it when CONFIG_EXECMEM=y Signed-off-by: Mike Rapoport (IBM) --- arch/arm/kernel/module.c | 43 -- arch/arm/mm/init.c | 45 +++ arch/arm64/kernel/module.c | 140 - arch/arm64/mm/init.c | 140 + arch/loongarch/kernel/module.c | 19 - arch/loongarch/mm/init.c | 21 + arch/mips/kernel/module.c | 22 -- arch/mips/mm/init.c| 23 ++ arch/nios2/kernel/module.c | 20 - arch/nios2/mm/init.c | 21 + arch/parisc/kernel/module.c| 20 - arch/parisc/mm/init.c | 23 +- arch/powerpc/kernel/module.c | 63 --- arch/powerpc/mm/mem.c | 64 +++ arch/riscv/kernel/module.c | 44 --- arch/riscv/mm/init.c | 45 +++ arch/s390/kernel/module.c | 27 --- arch/s390/mm/init.c| 30 +++ arch/sparc/kernel/module.c | 19 - arch/sparc/mm/Makefile | 2 + arch/sparc/mm/execmem.c| 21 + arch/x86/kernel/module.c | 27 --- arch/x86/mm/init.c | 29 +++ 23 files changed, 463 insertions(+), 445 deletions(-) create mode 100644 arch/sparc/mm/execmem.c Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH v5] vp_vdpa: don't allocate unused msix vectors
On Wed, Apr 10, 2024 at 11:30:20AM +0800, lyx634449800 wrote: > From: Yuxue Liu > > When there is a ctlq and it doesn't require interrupt > callbacks,the original method of calculating vectors > wastes hardware msi or msix resources as well as system > IRQ resources. > > When conducting performance testing using testpmd in the > guest os, it was found that the performance was lower compared > to directly using vfio-pci to passthrough the device > > In scenarios where the virtio device in the guest os does > not utilize interrupts, the vdpa driver still configures > the hardware's msix vector. Therefore, the hardware still > sends interrupts to the host os. I just have a question on this part. How come hardware sends interrupts does not guest driver disable them? > Because of this unnecessary > action by the hardware, hardware performance decreases, and > it also affects the performance of the host os. > > Before modification:(interrupt mode) > 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 > 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 > 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config > > After modification:(interrupt mode) > 32: 0 0 1 7 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-0 > 33: 36 0 3 0 PCI-MSI 32769-edge vp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edge vp-vdpa[:00:02.0]-config > > Before modification:(virtio pmd mode for guest os) > 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 > 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 > 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config > > After modification:(virtio pmd mode for guest os) > 32: 0 0 0 0 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-config > > To verify the use of the virtio PMD mode in the guest operating > system, the following patch needs to be applied to QEMU: > https://lore.kernel.org/all/20240408073311.2049-1-yuxue@jaguarmicro.com > > Signed-off-by: Yuxue Liu > Acked-by: 
Jason Wang > Reviewed-by: Heng Qi > --- > V5: modify the description of the printout when an exception occurs > V4: update the title and assign values to uninitialized variables > V3: delete unused variables and add validation records > V2: fix when allocating IRQs, scan all queues > > drivers/vdpa/virtio_pci/vp_vdpa.c | 22 -- > 1 file changed, 16 insertions(+), 6 deletions(-) > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c > b/drivers/vdpa/virtio_pci/vp_vdpa.c > index df5f4a3bccb5..8de0224e9ec2 100644 > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c > @@ -160,7 +160,13 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) > struct pci_dev *pdev = mdev->pci_dev; > int i, ret, irq; > int queues = vp_vdpa->queues; > - int vectors = queues + 1; > + int vectors = 1; > + int msix_vec = 0; > + > + for (i = 0; i < queues; i++) { > + if (vp_vdpa->vring[i].cb.callback) > + vectors++; > + } > > ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX); > if (ret != vectors) { > @@ -173,9 +179,12 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) > vp_vdpa->vectors = vectors; > > for (i = 0; i < queues; i++) { > + if (!vp_vdpa->vring[i].cb.callback) > + continue; > + > snprintf(vp_vdpa->vring[i].msix_name, VP_VDPA_NAME_SIZE, > "vp-vdpa[%s]-%d\n", pci_name(pdev), i); > - irq = pci_irq_vector(pdev, i); > + irq = pci_irq_vector(pdev, msix_vec); > ret = devm_request_irq(>dev, irq, > vp_vdpa_vq_handler, > 0, vp_vdpa->vring[i].msix_name, > @@ -185,21 +194,22 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) > "vp_vdpa: fail to request irq for vq %d\n", i); > goto err; > } > - vp_modern_queue_vector(mdev, i, i); > + vp_modern_queue_vector(mdev, i, msix_vec); > vp_vdpa->vring[i].irq = irq; > + msix_vec++; > } > > snprintf(vp_vdpa->msix_name, VP_VDPA_NAME_SIZE, "vp-vdpa[%s]-config\n", >pci_name(pdev)); > - irq = pci_irq_vector(pdev, queues); > + irq = pci_irq_vector(pdev, msix_vec); > ret = devm_request_irq(>dev, irq, 
vp_vdpa_config_handler, 0, > vp_vdpa->msix_name, vp_vdpa); > if (ret) { > dev_err(&pdev->dev, > - "vp_vdpa: fail to request irq for vq %d\n", i); > + "vp_vdpa: fail to request irq for config: %d\n", ret); > goto err; > } > -
Re: [PATCH v2] uprobes: reduce contention on uprobes_tree access
On Mon, Apr 22, 2024 at 03:23:05AM -0700, Jonathan Haslam wrote: > Active uprobes are stored in an RB tree and accesses to this tree are > dominated by read operations. Currently these accesses are serialized by > a spinlock but this leads to enormous contention when large numbers of > threads are executing active probes. > > This patch converts the spinlock used to serialize access to the > uprobes_tree RB tree into a reader-writer spinlock. This lock type > aligns naturally with the overwhelmingly read-only nature of the tree > usage here. Although the addition of reader-writer spinlocks are > discouraged [0], this fix is proposed as an interim solution while an > RCU based approach is implemented (that work is in a nascent form). This > fix also has the benefit of being trivial, self contained and therefore > simple to backport. > > We have used a uprobe benchmark from the BPF selftests [1] to estimate > the improvements. Each block of results below show 1 line per execution > of the benchmark ("the "Summary" line) and each line is a run with one > more thread added - a thread is a "producer". The lines are edited to > remove extraneous output. 
> > The tests were executed with this driver script: > > for num_threads in {1..20} > do > sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary > done > > SPINLOCK (BEFORE) > == > Summary: hits1.396 ± 0.007M/s ( 1.396M/prod) > Summary: hits1.656 ± 0.016M/s ( 0.828M/prod) > Summary: hits2.246 ± 0.008M/s ( 0.749M/prod) > Summary: hits2.114 ± 0.010M/s ( 0.529M/prod) > Summary: hits2.013 ± 0.009M/s ( 0.403M/prod) > Summary: hits1.753 ± 0.008M/s ( 0.292M/prod) > Summary: hits1.847 ± 0.001M/s ( 0.264M/prod) > Summary: hits1.889 ± 0.001M/s ( 0.236M/prod) > Summary: hits1.833 ± 0.006M/s ( 0.204M/prod) > Summary: hits1.900 ± 0.003M/s ( 0.190M/prod) > Summary: hits1.918 ± 0.006M/s ( 0.174M/prod) > Summary: hits1.925 ± 0.002M/s ( 0.160M/prod) > Summary: hits1.837 ± 0.001M/s ( 0.141M/prod) > Summary: hits1.898 ± 0.001M/s ( 0.136M/prod) > Summary: hits1.799 ± 0.016M/s ( 0.120M/prod) > Summary: hits1.850 ± 0.005M/s ( 0.109M/prod) > Summary: hits1.816 ± 0.002M/s ( 0.101M/prod) > Summary: hits1.787 ± 0.001M/s ( 0.094M/prod) > Summary: hits1.764 ± 0.002M/s ( 0.088M/prod) > > RW SPINLOCK (AFTER) > === > Summary: hits1.444 ± 0.020M/s ( 1.444M/prod) > Summary: hits2.279 ± 0.011M/s ( 1.139M/prod) > Summary: hits3.422 ± 0.014M/s ( 1.141M/prod) > Summary: hits3.565 ± 0.017M/s ( 0.891M/prod) > Summary: hits2.671 ± 0.013M/s ( 0.534M/prod) > Summary: hits2.409 ± 0.005M/s ( 0.401M/prod) > Summary: hits2.485 ± 0.008M/s ( 0.355M/prod) > Summary: hits2.496 ± 0.003M/s ( 0.312M/prod) > Summary: hits2.585 ± 0.002M/s ( 0.287M/prod) > Summary: hits2.908 ± 0.011M/s ( 0.291M/prod) > Summary: hits2.346 ± 0.016M/s ( 0.213M/prod) > Summary: hits2.804 ± 0.004M/s ( 0.234M/prod) > Summary: hits2.556 ± 0.001M/s ( 0.197M/prod) > Summary: hits2.754 ± 0.004M/s ( 0.197M/prod) > Summary: hits2.482 ± 0.002M/s ( 0.165M/prod) > Summary: hits2.412 ± 0.005M/s ( 0.151M/prod) > Summary: hits2.710 ± 0.003M/s ( 0.159M/prod) > Summary: hits2.826 ± 0.005M/s ( 0.157M/prod) > Summary: hits2.718 ± 0.001M/s ( 
0.143M/prod) > Summary: hits2.844 ± 0.006M/s ( 0.142M/prod) nice, I'm assuming Masami will take this one.. in any case: Acked-by: Jiri Olsa thanks, jirka > > The numbers in parenthesis give averaged throughput per thread which is > of greatest interest here as a measure of scalability. Improvements are > in the order of 22 - 68% with this particular benchmark (mean = 43%). > > V2: > - Updated commit message to include benchmark results. > > [0] https://docs.kernel.org/locking/spinlocks.html > [1] > https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c > > Signed-off-by: Jonathan Haslam > --- > kernel/events/uprobes.c | 22 +++--- > 1 file changed, 11 insertions(+), 11 deletions(-) > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c > index e4834d23e1d1..8ae0eefc3a34 100644 > --- a/kernel/events/uprobes.c > +++ b/kernel/events/uprobes.c > @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; > */ > #define no_uprobe_events() RB_EMPTY_ROOT(_tree) > > -static DEFINE_SPINLOCK(uprobes_treelock);/* serialize rbtree access */ > +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ > > #define UPROBES_HASH_SZ 13 > /* serialize uprobe->pending_list */ > @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, > loff_t offset) > { > struct uprobe *uprobe; > > - spin_lock(_treelock); > + read_lock(_treelock); > uprobe = __find_uprobe(inode, offset); > - spin_unlock(_treelock); > + read_unlock(_treelock);
Re: [PATCH v5] vp_vdpa: don't allocate unused msix vectors
Dear Michael, I hope this email finds you well. I am reaching out to request your assistance in reviewing a patch. The patch in question is titled "[PATCH v5] vp_vdpa: don't allocate unused msix vectors". I believe your expertise and insights would be invaluable in ensuring the quality and effectiveness of this patch. Your feedback and review are highly appreciated. Please let me know if you have any questions or require further information. Thank you for your time and consideration. Best regards, Yuxue Liu -Original Message- From: Gavin Liu Sent: April 10, 2024 11:31 To: m...@redhat.com; jasow...@redhat.com Cc: Angus Chen angus.c...@jaguarmicro.com; virtualizat...@lists.linux.dev; xuanz...@linux.alibaba.com; Gavin Liu gavin@jaguarmicro.com; linux-kernel@vger.kernel.org; Heng Qi hen...@linux.alibaba.com Subject: [PATCH v5] vp_vdpa: don't allocate unused msix vectors From: Yuxue Liu When there is a ctlq and it doesn't require interrupt callbacks,the original method of calculating vectors wastes hardware msi or msix resources as well as system IRQ resources. When conducting performance testing using testpmd in the guest os, it was found that the performance was lower compared to directly using vfio-pci to passthrough the device In scenarios where the virtio device in the guest os does not utilize interrupts, the vdpa driver still configures the hardware's msix vector. Therefore, the hardware still sends interrupts to the host os. Because of this unnecessary action by the hardware, hardware performance decreases, and it also affects the performance of the host os. 
Before modification:(interrupt mode) 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config After modification:(interrupt mode) 32: 0 0 1 7 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-0 33: 36 0 3 0 PCI-MSI 32769-edge vp-vdpa[:00:02.0]-1 34: 0 0 0 0 PCI-MSI 32770-edge vp-vdpa[:00:02.0]-config Before modification:(virtio pmd mode for guest os) 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config After modification:(virtio pmd mode for guest os) 32: 0 0 0 0 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-config To verify the use of the virtio PMD mode in the guest operating system, the following patch needs to be applied to QEMU: https://lore.kernel.org/all/20240408073311.2049-1-yuxue@jaguarmicro.com Signed-off-by: Yuxue Liu Acked-by: Jason Wang Reviewed-by: Heng Qi --- V5: modify the description of the printout when an exception occurs V4: update the title and assign values to uninitialized variables V3: delete unused variables and add validation records V2: fix when allocating IRQs, scan all queues drivers/vdpa/virtio_pci/vp_vdpa.c | 22 -- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index df5f4a3bccb5..8de0224e9ec2 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -160,7 +160,13 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) struct pci_dev *pdev = mdev->pci_dev; int i, ret, irq; int queues = vp_vdpa->queues; - int vectors = queues + 1; + int vectors = 1; + int msix_vec = 0; + + for (i = 0; i < queues; i++) { + if (vp_vdpa->vring[i].cb.callback) + vectors++; + } ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX); if (ret != vectors) { 
@@ -173,9 +179,12 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) vp_vdpa->vectors = vectors; for (i = 0; i < queues; i++) { + if (!vp_vdpa->vring[i].cb.callback) + continue; + snprintf(vp_vdpa->vring[i].msix_name, VP_VDPA_NAME_SIZE, "vp-vdpa[%s]-%d\n", pci_name(pdev), i); - irq = pci_irq_vector(pdev, i); + irq = pci_irq_vector(pdev, msix_vec); ret = devm_request_irq(&pdev->dev, irq, vp_vdpa_vq_handler, 0, vp_vdpa->vring[i].msix_name, @@ -185,21 +194,22 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) "vp_vdpa: fail to request irq for vq %d\n", i); goto err; } - vp_modern_queue_vector(mdev, i, i); + vp_modern_queue_vector(mdev, i, msix_vec); vp_vdpa->vring[i].irq = irq; +
[PATCH v2] uprobes: reduce contention on uprobes_tree access
Active uprobes are stored in an RB tree and accesses to this tree are dominated by read operations. Currently these accesses are serialized by a spinlock but this leads to enormous contention when large numbers of threads are executing active probes. This patch converts the spinlock used to serialize access to the uprobes_tree RB tree into a reader-writer spinlock. This lock type aligns naturally with the overwhelmingly read-only nature of the tree usage here. Although the addition of reader-writer spinlocks are discouraged [0], this fix is proposed as an interim solution while an RCU based approach is implemented (that work is in a nascent form). This fix also has the benefit of being trivial, self contained and therefore simple to backport. We have used a uprobe benchmark from the BPF selftests [1] to estimate the improvements. Each block of results below shows 1 line per execution of the benchmark (the "Summary" line) and each line is a run with one more thread added - a thread is a "producer". The lines are edited to remove extraneous output. 
The tests were executed with this driver script: for num_threads in {1..20} do sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary done SPINLOCK (BEFORE) == Summary: hits1.396 ± 0.007M/s ( 1.396M/prod) Summary: hits1.656 ± 0.016M/s ( 0.828M/prod) Summary: hits2.246 ± 0.008M/s ( 0.749M/prod) Summary: hits2.114 ± 0.010M/s ( 0.529M/prod) Summary: hits2.013 ± 0.009M/s ( 0.403M/prod) Summary: hits1.753 ± 0.008M/s ( 0.292M/prod) Summary: hits1.847 ± 0.001M/s ( 0.264M/prod) Summary: hits1.889 ± 0.001M/s ( 0.236M/prod) Summary: hits1.833 ± 0.006M/s ( 0.204M/prod) Summary: hits1.900 ± 0.003M/s ( 0.190M/prod) Summary: hits1.918 ± 0.006M/s ( 0.174M/prod) Summary: hits1.925 ± 0.002M/s ( 0.160M/prod) Summary: hits1.837 ± 0.001M/s ( 0.141M/prod) Summary: hits1.898 ± 0.001M/s ( 0.136M/prod) Summary: hits1.799 ± 0.016M/s ( 0.120M/prod) Summary: hits1.850 ± 0.005M/s ( 0.109M/prod) Summary: hits1.816 ± 0.002M/s ( 0.101M/prod) Summary: hits1.787 ± 0.001M/s ( 0.094M/prod) Summary: hits1.764 ± 0.002M/s ( 0.088M/prod) RW SPINLOCK (AFTER) === Summary: hits1.444 ± 0.020M/s ( 1.444M/prod) Summary: hits2.279 ± 0.011M/s ( 1.139M/prod) Summary: hits3.422 ± 0.014M/s ( 1.141M/prod) Summary: hits3.565 ± 0.017M/s ( 0.891M/prod) Summary: hits2.671 ± 0.013M/s ( 0.534M/prod) Summary: hits2.409 ± 0.005M/s ( 0.401M/prod) Summary: hits2.485 ± 0.008M/s ( 0.355M/prod) Summary: hits2.496 ± 0.003M/s ( 0.312M/prod) Summary: hits2.585 ± 0.002M/s ( 0.287M/prod) Summary: hits2.908 ± 0.011M/s ( 0.291M/prod) Summary: hits2.346 ± 0.016M/s ( 0.213M/prod) Summary: hits2.804 ± 0.004M/s ( 0.234M/prod) Summary: hits2.556 ± 0.001M/s ( 0.197M/prod) Summary: hits2.754 ± 0.004M/s ( 0.197M/prod) Summary: hits2.482 ± 0.002M/s ( 0.165M/prod) Summary: hits2.412 ± 0.005M/s ( 0.151M/prod) Summary: hits2.710 ± 0.003M/s ( 0.159M/prod) Summary: hits2.826 ± 0.005M/s ( 0.157M/prod) Summary: hits2.718 ± 0.001M/s ( 0.143M/prod) Summary: hits2.844 ± 0.006M/s ( 0.142M/prod) The numbers in parenthesis give averaged throughput 
per thread which is of greatest interest here as a measure of scalability. Improvements are in the order of 22 - 68% with this particular benchmark (mean = 43%). V2: - Updated commit message to include benchmark results. [0] https://docs.kernel.org/locking/spinlocks.html [1] https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c Signed-off-by: Jonathan Haslam --- kernel/events/uprobes.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e4834d23e1d1..8ae0eefc3a34 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; */ #define no_uprobe_events() RB_EMPTY_ROOT(_tree) -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +static DEFINE_RWLOCK(uprobes_treelock);/* serialize rbtree access */ #define UPROBES_HASH_SZ13 /* serialize uprobe->pending_list */ @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - spin_lock(_treelock); + read_lock(_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock(_treelock); + read_unlock(_treelock); return uprobe; } @@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; - spin_lock(_treelock); + write_lock(_treelock); u = __insert_uprobe(uprobe); - spin_unlock(_treelock); + write_unlock(_treelock); return u; } @@ -935,9 +935,9 @@ static void
Re: [PATCH v2 2/2] remoteproc: mediatek: Support MT8188 SCP core 1
Il 19/04/24 10:42, Olivia Wen ha scritto: From: "olivia.wen" There are three primary modifications. 1. The struct mtk_scp_of_data usage on MT8188 MT8192 functions are unsuitable for the dual-core MT8188 SCP, which has two RISC-V cores similar to MT8195 but without L1TCM. We've added MT8188-specific functions to configure L1TCM in multicore setups. 2. SCP_IPI_IMGSYS_CMD feature This version also adds SCP_IPI_IMGSYS_CMD to facilitate communication between the imgsys kernel and the backend driver. 3. Different code sizes and IPI share buffer sizes Each SCP necessitates different code and IPI share buffer sizes. Introducing a structure mtk_scp_sizes_data to handle them. Signed-off-by: olivia.wen Reviewed-by: AngeloGioacchino Del Regno
Re: [PATCH v2 1/2] dt-bindings: remoteproc: mediatek: Support MT8188 dual-core SCP
Il 19/04/24 10:42, Olivia Wen ha scritto: From: "olivia.wen" Under different applications, the MT8188 SCP can be used as single-core or dual-core. Signed-off-by: olivia.wen Reviewed-by: AngeloGioacchino Del Regno
Re: [PATCH v2 1/2] dt-bindings: remoteproc: mediatek: Support MT8188 dual-core SCP
Il 19/04/24 10:42, Olivia Wen ha scritto: From: "olivia.wen" Under different applications, the MT8188 SCP can be used as single-core or dual-core. Signed-off-by: olivia.wen Reviewed-by: AngeloGioacchino Del Regno
Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent
On 22/04/2024 11:17, Jason Xing wrote:> On Mon, Apr 22, 2024 at 4:47 PM Matthieu Baerts wrote: >> On 22/04/2024 05:01, Jason Xing wrote: >>> From: Jason Xing (...) >>> diff --git a/include/net/rstreason.h b/include/net/rstreason.h >>> new file mode 100644 >>> index ..c57bc5413c17 >>> --- /dev/null >>> +++ b/include/net/rstreason.h >>> @@ -0,0 +1,144 @@ >>> +/* SPDX-License-Identifier: GPL-2.0-or-later */ >>> + >>> +#ifndef _LINUX_RSTREASON_H >>> +#define _LINUX_RSTREASON_H >>> +#include >>> +#include >>> + >>> +#define DEFINE_RST_REASON(FN, FNe) \ >>> + FN(MPTCP_RST_EUNSPEC) \ >>> + FN(MPTCP_RST_EMPTCP)\ >>> + FN(MPTCP_RST_ERESOURCE) \ >>> + FN(MPTCP_RST_EPROHIBIT) \ >>> + FN(MPTCP_RST_EWQ2BIG) \ >>> + FN(MPTCP_RST_EBADPERF) \ >>> + FN(MPTCP_RST_EMIDDLEBOX)\ >> >> Small detail: should it not make more sense to put the ones linked to >> MPTCP at the end? I mean I guess MPTCP should be treated in second >> priority: CONFIG_MPTCP could not be set, and the ones linked to TCP >> should be more frequent, etc. > > Do you mean that I need to adjust the order: 1) tcp reasons first, 2) > independent reasons, 3) mptcp reasons ? Correct, it looks like it is a more "natural" order. > Reasonable. I will do it :) Thanks! Cheers, Matt -- Sponsored by the NGI0 Core fund.
[PATCH v5 15/15] bpf: remove CONFIG_BPF_JIT dependency on CONFIG_MODULES of
From: "Mike Rapoport (IBM)" BPF just-in-time compiler depended on CONFIG_MODULES because it used module_alloc() to allocate memory for the generated code. Since code allocations are now implemented with execmem, drop dependency of CONFIG_BPF_JIT on CONFIG_MODULES and make it select CONFIG_EXECMEM. Suggested-by: Björn Töpel Signed-off-by: Mike Rapoport (IBM) --- kernel/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index bc25f5098a25..f999e4e0b344 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -43,7 +43,7 @@ config BPF_JIT bool "Enable BPF Just In Time compiler" depends on BPF depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT - depends on MODULES + select EXECMEM help BPF programs are normally handled by a BPF interpreter. This option allows the kernel to generate native code when a program is loaded -- 2.43.0
[PATCH v5 14/15] kprobes: remove dependency on CONFIG_MODULES
From: "Mike Rapoport (IBM)" kprobes depended on CONFIG_MODULES because it has to allocate memory for code. Since code allocations are now implemented with execmem, kprobes can be enabled in non-modular kernels. Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes inside modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the dependency of CONFIG_KPROBES on CONFIG_MODULES. Signed-off-by: Mike Rapoport (IBM) --- arch/Kconfig| 2 +- include/linux/module.h | 9 ++ kernel/kprobes.c| 55 +++-- kernel/trace/trace_kprobe.c | 20 +- 4 files changed, 63 insertions(+), 23 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 7006f71f0110..a48ce6a488b3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -52,9 +52,9 @@ config GENERIC_ENTRY config KPROBES bool "Kprobes" - depends on MODULES depends on HAVE_KPROBES select KALLSYMS + select EXECMEM select TASKS_RCU if PREEMPTION help Kprobes allows you to trap at almost any kernel address and diff --git a/include/linux/module.h b/include/linux/module.h index 1153b0d99a80..ffa1c603163c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -605,6 +605,11 @@ static inline bool module_is_live(struct module *mod) return mod->state != MODULE_STATE_GOING; } +static inline bool module_is_coming(struct module *mod) +{ +return mod->state == MODULE_STATE_COMING; +} + struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); @@ -857,6 +862,10 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr) return ptr; } +static inline bool module_is_coming(struct module *mod) +{ + return false; +} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ddd7cdc16edf..ca2c6cbd42d2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1588,7 +1588,7 @@ static int check_kprobe_address_safe(struct kprobe *p, } /* Get module refcount and reject __init 
functions for loaded modules. */ - if (*probed_mod) { + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. @@ -1603,12 +1603,13 @@ static int check_kprobe_address_safe(struct kprobe *p, * kprobes in there. */ if (within_module_init((unsigned long)p->addr, *probed_mod) && - (*probed_mod)->state != MODULE_STATE_COMING) { + !module_is_coming(*probed_mod)) { module_put(*probed_mod); *probed_mod = NULL; ret = -ENOENT; } } + out: preempt_enable(); jump_label_unlock(); @@ -2488,24 +2489,6 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } -/* Remove all symbols in given area from kprobe blacklist */ -static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) -{ - struct kprobe_blacklist_entry *ent, *n; - - list_for_each_entry_safe(ent, n, _blacklist, list) { - if (ent->start_addr < start || ent->start_addr >= end) - continue; - list_del(>list); - kfree(ent); - } -} - -static void kprobe_remove_ksym_blacklist(unsigned long entry) -{ - kprobe_remove_area_blacklist(entry, entry + 1); -} - int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym) { @@ -2570,6 +2553,25 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? 
: arch_populate_kprobe_blacklist(); } +#ifdef CONFIG_MODULES +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, _blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(>list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; @@ -2672,6 +2674,17 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +static int kprobe_register_module_notifier(void) +{ + return register_module_notifier(_module_nb); +} +#else +static int kprobe_register_module_notifier(void) +{ +
[PATCH v5 13/15] powerpc: use CONFIG_EXECMEM instead of CONFIG_MODULES where appropriate
From: "Mike Rapoport (IBM)" There are places where CONFIG_MODULES guards the code that depends on memory allocation being done with module_alloc(). Replace CONFIG_MODULES with CONFIG_EXECMEM in such places. Signed-off-by: Mike Rapoport (IBM) --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/kasan.h | 2 +- arch/powerpc/kernel/head_8xx.S | 4 ++-- arch/powerpc/kernel/head_book3s_32.S | 6 +++--- arch/powerpc/lib/code-patching.c | 2 +- arch/powerpc/mm/book3s32/mmu.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1c4be3373686..2e586733a464 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -285,7 +285,7 @@ config PPC select IOMMU_HELPER if PPC64 select IRQ_DOMAIN select IRQ_FORCED_THREADING - select KASAN_VMALLOCif KASAN && MODULES + select KASAN_VMALLOCif KASAN && EXECMEM select LOCK_MM_AND_FIND_VMA select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index 365d2720097c..b5bbb94c51f6 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -19,7 +19,7 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 -#if defined(CONFIG_MODULES) && defined(CONFIG_PPC32) +#if defined(CONFIG_EXECMEM) && defined(CONFIG_PPC32) #define KASAN_KERN_START ALIGN_DOWN(PAGE_OFFSET - SZ_256M, SZ_256M) #else #define KASAN_KERN_START PAGE_OFFSET diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 647b0b445e89..edc479a7c2bc 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -199,12 +199,12 @@ instruction_counter: mfspr r10, SPRN_SRR0 /* Get effective address of fault */ INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11) mtspr SPRN_MD_EPN, r10 -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM mfcrr11 compare_to_kernel_boundary r10, r10 #endif mfspr r10, SPRN_M_TWB /* Get level 1 table */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM blt+3f rlwinm r10, r10, 
0, 20, 31 orisr10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index c1d89764dd22..57196883a00e 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -419,14 +419,14 @@ InstructionTLBMiss: */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 #endif mfspr r2, SPRN_SDR1 li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC rlwinm r2, r2, 28, 0xf000 -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM li r0, 3 bgt-112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ @@ -442,7 +442,7 @@ InstructionTLBMiss: andc. r1,r1,r2/* check access & ~permission */ bne-InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM rlwimi r2, r0, 0, 31, 31 /* userspace ? 
-> PP lsb */ #endif ori r1, r1, 0xe06 /* clear out reserved bits */ diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index c6ab46156cda..7af791446ddf 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -225,7 +225,7 @@ void __init poking_init(void) static unsigned long get_patch_pfn(void *addr) { - if (IS_ENABLED(CONFIG_MODULES) && is_vmalloc_or_module_addr(addr)) + if (IS_ENABLED(CONFIG_EXECMEM) && is_vmalloc_or_module_addr(addr)) return vmalloc_to_pfn(addr); else return __pa_symbol(addr) >> PAGE_SHIFT; diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 100f999871bc..625fe7d08e06 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -184,7 +184,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) static bool is_module_segment(unsigned long addr) { - if (!IS_ENABLED(CONFIG_MODULES)) + if (!IS_ENABLED(CONFIG_EXECMEM)) return false; if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M)) return false; -- 2.43.0
[PATCH v5 12/15] x86/ftrace: enable dynamic ftrace without CONFIG_MODULES
From: "Mike Rapoport (IBM)" Dynamic ftrace must allocate memory for code and this was impossible without CONFIG_MODULES. With execmem separated from the modules code, execmem_text_alloc() is available regardless of CONFIG_MODULES. Remove dependency of dynamic ftrace on CONFIG_MODULES and make CONFIG_DYNAMIC_FTRACE select CONFIG_EXECMEM in Kconfig. Signed-off-by: Mike Rapoport (IBM) --- arch/x86/Kconfig | 1 + arch/x86/kernel/ftrace.c | 10 -- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3f5ba72c9480..cd8addb96a0b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86_64 select SWIOTLB select ARCH_HAS_ELFCORE_COMPAT select ZONE_DMA32 + select EXECMEM if DYNAMIC_FTRACE config FORCE_DYNAMIC_FTRACE def_bool y diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c8ddb7abda7c..8da0e66ca22d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -261,8 +261,6 @@ void arch_ftrace_update_code(int command) /* Currently only x86_64 supports dynamic trampolines */ #ifdef CONFIG_X86_64 -#ifdef CONFIG_MODULES -/* Module allocation simplifies allocating memory for code */ static inline void *alloc_tramp(unsigned long size) { return execmem_alloc(EXECMEM_FTRACE, size); @@ -271,14 +269,6 @@ static inline void tramp_free(void *tramp) { execmem_free(tramp); } -#else -/* Trampolines can only be created if modules are supported */ -static inline void *alloc_tramp(unsigned long size) -{ - return NULL; -} -static inline void tramp_free(void *tramp) { } -#endif /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); -- 2.43.0
[PATCH v5 11/15] arch: make execmem setup available regardless of CONFIG_MODULES
From: "Mike Rapoport (IBM)" execmem does not depend on modules, on the contrary modules use execmem. To make execmem available when CONFIG_MODULES=n, for instance for kprobes, split execmem_params initialization out from arch/*/kernel/module.c and compile it when CONFIG_EXECMEM=y Signed-off-by: Mike Rapoport (IBM) --- arch/arm/kernel/module.c | 43 -- arch/arm/mm/init.c | 45 +++ arch/arm64/kernel/module.c | 140 - arch/arm64/mm/init.c | 140 + arch/loongarch/kernel/module.c | 19 - arch/loongarch/mm/init.c | 21 + arch/mips/kernel/module.c | 22 -- arch/mips/mm/init.c| 23 ++ arch/nios2/kernel/module.c | 20 - arch/nios2/mm/init.c | 21 + arch/parisc/kernel/module.c| 20 - arch/parisc/mm/init.c | 23 +- arch/powerpc/kernel/module.c | 63 --- arch/powerpc/mm/mem.c | 64 +++ arch/riscv/kernel/module.c | 44 --- arch/riscv/mm/init.c | 45 +++ arch/s390/kernel/module.c | 27 --- arch/s390/mm/init.c| 30 +++ arch/sparc/kernel/module.c | 19 - arch/sparc/mm/Makefile | 2 + arch/sparc/mm/execmem.c| 21 + arch/x86/kernel/module.c | 27 --- arch/x86/mm/init.c | 29 +++ 23 files changed, 463 insertions(+), 445 deletions(-) create mode 100644 arch/sparc/mm/execmem.c diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index a98fdf6ff26c..677f218f7e84 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -12,57 +12,14 @@ #include #include #include -#include #include #include -#include -#include #include #include #include #include -#ifdef CONFIG_XIP_KERNEL -/* - * The XIP kernel text is mapped in the module area for modules and - * some other stuff to work without any indirect relocations. - * MODULES_VADDR is redefined here and not in asm/memory.h to avoid - * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
- */ -#undef MODULES_VADDR -#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) -#endif - -#ifdef CONFIG_MMU -static struct execmem_info execmem_info __ro_after_init; - -struct execmem_info __init *execmem_arch_setup(void) -{ - unsigned long fallback_start = 0, fallback_end = 0; - - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { - fallback_start = VMALLOC_START; - fallback_end = VMALLOC_END; - } - - execmem_info = (struct execmem_info){ - .ranges = { - [EXECMEM_DEFAULT] = { - .start = MODULES_VADDR, - .end= MODULES_END, - .pgprot = PAGE_KERNEL_EXEC, - .alignment = 1, - .fallback_start = fallback_start, - .fallback_end = fallback_end, - }, - }, - }; - - return _info; -} -#endif - bool module_init_section(const char *name) { return strstarts(name, ".init") || diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index e8c6f4be0ce1..5345d218899a 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -486,3 +487,47 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif + +#ifdef CONFIG_EXECMEM + +#ifdef CONFIG_XIP_KERNEL +/* + * The XIP kernel text is mapped in the module area for modules and + * some other stuff to work without any indirect relocations. + * MODULES_VADDR is redefined here and not in asm/memory.h to avoid + * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
+ */ +#undef MODULES_VADDR +#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) +#endif + +#ifdef CONFIG_MMU +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) +{ + unsigned long fallback_start = 0, fallback_end = 0; + + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { + fallback_start = VMALLOC_START; + fallback_end = VMALLOC_END; + } + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + }, +
[PATCH v5 10/15] powerpc: extend execmem_params for kprobes allocations
From: "Mike Rapoport (IBM)" powerpc overrides kprobes::alloc_insn_page() to remove writable permissions when STRICT_MODULE_RWX is on. Add definition of EXECMEM_KPROBES to execmem_params to allow using the generic kprobes::alloc_insn_page() with the desired permissions. As powerpc uses breakpoint instructions to inject kprobes, it does not need to constrain kprobe allocations to the modules area and can use the entire vmalloc address space. Signed-off-by: Mike Rapoport (IBM) --- arch/powerpc/kernel/kprobes.c | 20 arch/powerpc/kernel/module.c | 7 +++ 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 9fcd01bb2ce6..14c5ddec3056 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -126,26 +126,6 @@ kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offse return (kprobe_opcode_t *)(addr + offset); } -void *alloc_insn_page(void) -{ - void *page; - - page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); - if (!page) - return NULL; - - if (strict_module_rwx_enabled()) { - int err = set_memory_rox((unsigned long)page, 1); - - if (err) - goto error; - } - return page; -error: - execmem_free(page); - return NULL; -} - int arch_prepare_kprobe(struct kprobe *p) { int ret = 0; diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index ac80559015a3..2a23cf7e141b 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -94,6 +94,7 @@ static struct execmem_info execmem_info __ro_after_init; struct execmem_info __init *execmem_arch_setup(void) { + pgprot_t kprobes_prot = strict_module_rwx_enabled() ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; pgprot_t prot = strict_module_rwx_enabled() ? 
PAGE_KERNEL : PAGE_KERNEL_EXEC; unsigned long fallback_start = 0, fallback_end = 0; unsigned long start, end; @@ -132,6 +133,12 @@ struct execmem_info __init *execmem_arch_setup(void) .fallback_start = fallback_start, .fallback_end = fallback_end, }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end= VMALLOC_END, + .pgprot = kprobes_prot, + .alignment = 1, + }, [EXECMEM_MODULE_DATA] = { .start = VMALLOC_START, .end= VMALLOC_END, -- 2.43.0
[PATCH v5 09/15] riscv: extend execmem_params for generated code allocations
From: "Mike Rapoport (IBM)" The memory allocations for kprobes and BPF on RISC-V are not placed in the modules area and these custom allocations are implemented with overrides of alloc_insn_page() and bpf_jit_alloc_exec(). Slightly reorder execmem_params initialization to support both 32 and 64 bit variants, define EXECMEM_KPROBES and EXECMEM_BPF ranges in riscv::execmem_params and drop overrides of alloc_insn_page() and bpf_jit_alloc_exec(). Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Alexandre Ghiti --- arch/riscv/kernel/module.c | 28 +--- arch/riscv/kernel/probes/kprobes.c | 10 -- arch/riscv/net/bpf_jit_core.c | 13 - 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 182904127ba0..2ecbacbc9993 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -906,19 +906,41 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, return 0; } -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +#ifdef CONFIG_MMU static struct execmem_info execmem_info __ro_after_init; struct execmem_info __init *execmem_arch_setup(void) { + unsigned long start, end; + + if (IS_ENABLED(CONFIG_64BIT)) { + start = MODULES_VADDR; + end = MODULES_END; + } else { + start = VMALLOC_START; + end = VMALLOC_END; + } + execmem_info = (struct execmem_info){ .ranges = { [EXECMEM_DEFAULT] = { - .start = MODULES_VADDR, - .end= MODULES_END, + .start = start, + .end= end, .pgprot = PAGE_KERNEL, .alignment = 1, }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end= VMALLOC_END, + .pgprot = PAGE_KERNEL_READ_EXEC, + .alignment = 1, + }, + [EXECMEM_BPF] = { + .start = BPF_JIT_REGION_START, + .end= BPF_JIT_REGION_END, + .pgprot = PAGE_KERNEL, + .alignment = PAGE_SIZE, + }, }, }; diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 2f08c14a933d..e64f2f3064eb 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -104,16 +104,6 @@ 
int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } -#ifdef CONFIG_MMU -void *alloc_insn_page(void) -{ - return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, -GFP_KERNEL, PAGE_KERNEL_READ_EXEC, -VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, -__builtin_return_address(0)); -} -#endif - /* install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 6b3acac30c06..e238fdbd5dbc 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -219,19 +219,6 @@ u64 bpf_jit_alloc_exec_limit(void) return BPF_JIT_REGION_SIZE; } -void *bpf_jit_alloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, - BPF_JIT_REGION_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} - -void bpf_jit_free_exec(void *addr) -{ - return vfree(addr); -} - void *bpf_arch_text_copy(void *dst, void *src, size_t len) { int ret; -- 2.43.0
[PATCH v5 08/15] mm/execmem, arch: convert remaining overrides of module_alloc to execmem
From: "Mike Rapoport (IBM)" Extend execmem parameters to accommodate more complex overrides of module_alloc() by architectures. This includes specification of a fallback range required by arm, arm64 and powerpc, EXECMEM_MODULE_DATA type required by powerpc, support for allocation of KASAN shadow required by s390 and x86 and support for early initialization of execmem required by x86. The core implementation of execmem_alloc() takes care of suppressing warnings when the initial allocation fails but there is a fallback range defined. Signed-off-by: Mike Rapoport (IBM) Acked-by: Will Deacon --- arch/Kconfig | 6 +++ arch/arm/kernel/module.c | 41 ++--- arch/arm64/kernel/module.c | 67 ++-- arch/arm64/kernel/probes/kprobes.c | 7 --- arch/arm64/net/bpf_jit_comp.c | 11 - arch/powerpc/kernel/module.c | 60 - arch/s390/kernel/module.c | 54 ++- arch/x86/Kconfig | 1 + arch/x86/kernel/module.c | 70 ++ include/linux/execmem.h| 34 +++ include/linux/moduleloader.h | 12 - kernel/module/main.c | 26 +++ mm/execmem.c | 70 +- mm/mm_init.c | 2 + 14 files changed, 259 insertions(+), 202 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 65afb1de48b3..7006f71f0110 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -960,6 +960,12 @@ config ARCH_WANTS_MODULES_DATA_IN_VMALLOC For architectures like powerpc/32 which have constraints on module allocation and need to allocate module data outside of module area. +config ARCH_WANTS_EXECMEM_EARLY + bool + help + For architectures that might allocate executable memory early on + boot, for instance ftrace on x86. 
+ config HAVE_IRQ_EXIT_ON_IRQ_STACK bool help diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index e74d84f58b77..a98fdf6ff26c 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -34,23 +35,31 @@ #endif #ifdef CONFIG_MMU -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - /* Silence the initial allocation */ - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) - gfp_mask |= __GFP_NOWARN; - - p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) - return p; - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + unsigned long fallback_start = 0, fallback_end = 0; + + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { + fallback_start = VMALLOC_START; + fallback_end = VMALLOC_END; + } + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + }, + }; + + return _info; } #endif diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index e92da4da1b2a..a52240ea084b 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -108,41 +109,59 @@ static int __init module_init_limits(void) return 0; } -subsys_initcall(module_init_limits); -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - void *p = NULL; + unsigned long 
fallback_start = 0, fallback_end = 0; + unsigned long start = 0, end = 0; + + module_init_limits(); /* * Where possible, prefer to allocate within direct branch range of the * kernel such that no PLTs are necessary. */ if (module_direct_base) { - p = __vmalloc_node_range(size, MODULE_ALIGN, -module_direct_base, -
[PATCH v5 07/15] mm/execmem, arch: convert simple overrides of module_alloc to execmem
From: "Mike Rapoport (IBM)" Several architectures override module_alloc() only to define address range for code allocations different than VMALLOC address space. Provide a generic implementation in execmem that uses the parameters for address space ranges, required alignment and page protections provided by architectures. The architectures must fill execmem_info structure and implement execmem_arch_setup() that returns a pointer to that structure. This way the execmem initialization won't be called from every architecture, but rather from a central place, namely a core_initcall() in execmem. The execmem provides execmem_alloc() API that wraps __vmalloc_node_range() with the parameters defined by the architectures. If an architecture does not implement execmem_arch_setup(), execmem_alloc() will fall back to module_alloc(). Signed-off-by: Mike Rapoport (IBM) --- arch/loongarch/kernel/module.c | 19 +++-- arch/mips/kernel/module.c | 20 -- arch/nios2/kernel/module.c | 21 +++--- arch/parisc/kernel/module.c| 24 +++ arch/riscv/kernel/module.c | 24 +++ arch/sparc/kernel/module.c | 20 -- include/linux/execmem.h| 41 +++ mm/execmem.c | 73 -- 8 files changed, 208 insertions(+), 34 deletions(-) diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c index c7d0338d12c1..ca6dd7ea1610 100644 --- a/arch/loongarch/kernel/module.c +++ b/arch/loongarch/kernel/module.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -490,10 +491,22 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, return 0; } -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + 
.pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return _info; } static void module_init_ftrace_plt(const Elf_Ehdr *hdr, diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 9a6c96014904..59225a3cf918 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -20,6 +20,7 @@ #include #include #include +#include #include struct mips_hi16 { @@ -32,11 +33,22 @@ static LIST_HEAD(dbe_list); static DEFINE_SPINLOCK(dbe_lock); #ifdef MODULES_VADDR -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return _info; } #endif diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c index 9c97b7513853..0d1ee86631fc 100644 --- a/arch/nios2/kernel/module.c +++ b/arch/nios2/kernel/module.c @@ -18,15 +18,26 @@ #include #include #include +#include #include -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, - VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, - __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + }, + }, + }; + + return _info; } int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index
[PATCH v5 06/15] mm: introduce execmem_alloc() and execmem_free()
From: "Mike Rapoport (IBM)" module_alloc() is used everywhere as a means to allocate memory for code. Besides being semantically wrong, this unnecessarily ties all subsystems that need to allocate code, such as ftrace, kprobes and BPF to modules and puts the burden of code allocation to the modules code. Several architectures override module_alloc() because of various constraints where the executable memory can be located and this causes additional obstacles for improvements of code allocation. Start splitting code allocation from modules by introducing execmem_alloc() and execmem_free() APIs. Initially, execmem_alloc() is a wrapper for module_alloc() and execmem_free() is a replacement of module_memfree() to allow updating all call sites to use the new APIs. Since architectures define different restrictions on placement, permissions, alignment and other parameters for memory that can be used by different subsystems that allocate executable memory, execmem_alloc() takes a type argument, that will be used to identify the calling subsystem and to allow architectures to define parameters for ranges suitable for that subsystem. No functional changes. 
Signed-off-by: Mike Rapoport (IBM) Acked-by: Masami Hiramatsu (Google) --- arch/powerpc/kernel/kprobes.c| 6 ++-- arch/s390/kernel/ftrace.c| 4 +-- arch/s390/kernel/kprobes.c | 4 +-- arch/s390/kernel/module.c| 5 +-- arch/sparc/net/bpf_jit_comp_32.c | 8 ++--- arch/x86/kernel/ftrace.c | 6 ++-- arch/x86/kernel/kprobes/core.c | 4 +-- include/linux/execmem.h | 57 include/linux/moduleloader.h | 3 -- kernel/bpf/core.c| 6 ++-- kernel/kprobes.c | 8 ++--- kernel/module/Kconfig| 1 + kernel/module/main.c | 25 +- mm/Kconfig | 3 ++ mm/Makefile | 1 + mm/execmem.c | 32 ++ 16 files changed, 128 insertions(+), 45 deletions(-) create mode 100644 include/linux/execmem.h create mode 100644 mm/execmem.c diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bbca90a5e2ec..9fcd01bb2ce6 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -130,7 +130,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; @@ -142,7 +142,7 @@ void *alloc_insn_page(void) } return page; error: - module_memfree(page); + execmem_free(page); return NULL; } diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index c46381ea04ec..798249ef5646 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -7,13 +7,13 @@ * Author(s): Martin Schwidefsky */ -#include #include #include #include #include #include #include +#include #include #include #include @@ -220,7 +220,7 @@ static int __init ftrace_plt_init(void) { const char *start, *end; - ftrace_plt = module_alloc(PAGE_SIZE); + ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE); if (!ftrace_plt) panic("cannot allocate ftrace plt\n"); diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index f0cf20d4b3c5..3c1b1be744de 100644 --- a/arch/s390/kernel/kprobes.c +++ 
b/arch/s390/kernel/kprobes.c @@ -9,7 +9,6 @@ #define pr_fmt(fmt) "kprobes: " fmt -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +38,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; set_memory_rox((unsigned long)page, 1); diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 42215f9404af..ac97a905e8cd 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,7 @@ void *module_alloc(unsigned long size) #ifdef CONFIG_FUNCTION_TRACER void module_arch_cleanup(struct module *mod) { - module_memfree(mod->arch.trampolines_start); + execmem_free(mod->arch.trampolines_start); } #endif @@ -510,7 +511,7 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); numpages = DIV_ROUND_UP(size, PAGE_SIZE); - start = module_alloc(numpages * PAGE_SIZE); + start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE); if (!start) return -ENOMEM;
[PATCH v5 05/15] module: make module_memory_{alloc,free} more self-contained
From: "Mike Rapoport (IBM)" Move the logic related to the memory allocation and freeing into module_memory_alloc() and module_memory_free(). Signed-off-by: Mike Rapoport (IBM) --- kernel/module/main.c | 64 +++- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/kernel/module/main.c b/kernel/module/main.c index e1e8a7a9d6c1..5b82b069e0d3 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1203,15 +1203,44 @@ static bool mod_mem_use_vmalloc(enum mod_mem_type type) mod_mem_type_is_core_data(type); } -static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) +static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { + unsigned int size = PAGE_ALIGN(mod->mem[type].size); + void *ptr; + + mod->mem[type].size = size; + if (mod_mem_use_vmalloc(type)) - return vzalloc(size); - return module_alloc(size); + ptr = vmalloc(size); + else + ptr = module_alloc(size); + + if (!ptr) + return -ENOMEM; + + /* +* The pointer to these blocks of memory are stored on the module +* structure and we keep that around so long as the module is +* around. We only free that memory when we unload the module. +* Just mark them as not being a leak then. The .init* ELF +* sections *do* get freed after boot so we *could* treat them +* slightly differently with kmemleak_ignore() and only grey +* them out as they work as typical memory allocations which +* *do* eventually get freed, but let's just keep things simple +* and avoid *any* false positives. +*/ + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; + + return 0; } -static void module_memory_free(void *ptr, enum mod_mem_type type) +static void module_memory_free(struct module *mod, enum mod_mem_type type) { + void *ptr = mod->mem[type].base; + if (mod_mem_use_vmalloc(type)) vfree(ptr); else @@ -1229,12 +1258,12 @@ static void free_mod_mem(struct module *mod) /* Free lock-classes; relies on the preceding sync_rcu(). 
*/ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod_mem->base, type); + module_memory_free(mod, type); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); + module_memory_free(mod, MOD_DATA); } /* Free a module, remove from lists, etc. */ @@ -2225,7 +2254,6 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { int i; - void *ptr; enum mod_mem_type t = 0; int ret = -ENOMEM; @@ -2234,26 +2262,12 @@ static int move_module(struct module *mod, struct load_info *info) mod->mem[type].base = NULL; continue; } - mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size); - ptr = module_memory_alloc(mod->mem[type].size, type); - /* - * The pointer to these blocks of memory are stored on the module - * structure and we keep that around so long as the module is - * around. We only free that memory when we unload the module. - * Just mark them as not being a leak then. The .init* ELF - * sections *do* get freed after boot so we *could* treat them - * slightly differently with kmemleak_ignore() and only grey - * them out as they work as typical memory allocations which - * *do* eventually get freed, but let's just keep things simple - * and avoid *any* false positives. -*/ - kmemleak_not_leak(ptr); - if (!ptr) { + + ret = module_memory_alloc(mod, type); + if (ret) { t = type; goto out_enomem; } - memset(ptr, 0, mod->mem[type].size); - mod->mem[type].base = ptr; } /* Transfer each section which specifies SHF_ALLOC */ @@ -2296,7 +2310,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_enomem: for (t--; t >= 0; t--) - module_memory_free(mod->mem[t].base, t); + module_memory_free(mod, t); return ret; } -- 2.43.0
[PATCH v5 04/15] sparc: simplify module_alloc()
From: "Mike Rapoport (IBM)" Define MODULES_VADDR and MODULES_END as VMALLOC_START and VMALLOC_END for 32-bit and reduce module_alloc() to __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, ...) as with the new defines the allocations become identical for both 32 and 64 bits. While at it, drop unused include of Suggested-by: Sam Ravnborg Signed-off-by: Mike Rapoport (IBM) --- arch/sparc/include/asm/pgtable_32.h | 2 ++ arch/sparc/kernel/module.c | 25 + 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 9e85d57ac3f2..62bcafe38b1f 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -432,6 +432,8 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, #define VMALLOC_START _AC(0xfe60,UL) #define VMALLOC_END _AC(0xffc0,UL) +#define MODULES_VADDR VMALLOC_START +#define MODULES_END VMALLOC_END /* We provide our own get_unmapped_area to cope with VA holes for userland */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index 66c45a2764bc..d37adb2a0b54 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -21,35 +21,12 @@ #include "entry.h" -#ifdef CONFIG_SPARC64 - -#include - -static void *module_map(unsigned long size) +void *module_alloc(unsigned long size) { - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } -#else -static void *module_map(unsigned long size) -{ - return vmalloc(size); -} -#endif /* CONFIG_SPARC64 */ - -void *module_alloc(unsigned long size) -{ - void *ret; - - ret = module_map(size); - if (ret) - memset(ret, 0, size); - - return ret; -} /* Make generic code ignore STT_REGISTER dummy undefined symbols. */ int module_frob_arch_sections(Elf_Ehdr *hdr, -- 2.43.0
[PATCH v5 03/15] nios2: define virtual address space for modules
From: "Mike Rapoport (IBM)" nios2 uses kmalloc() to implement module_alloc() because CALL26/PCREL26 cannot reach all of vmalloc address space. Define module space as 32MiB below the kernel base and switch nios2 to use vmalloc for module allocations. Suggested-by: Thomas Gleixner Acked-by: Dinh Nguyen Acked-by: Song Liu Signed-off-by: Mike Rapoport (IBM) --- arch/nios2/include/asm/pgtable.h | 5 - arch/nios2/kernel/module.c | 19 --- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index d052dfcbe8d3..eab87c6beacb 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -25,7 +25,10 @@ #include #define VMALLOC_START CONFIG_NIOS2_KERNEL_MMU_REGION_BASE -#define VMALLOC_END(CONFIG_NIOS2_KERNEL_REGION_BASE - 1) +#define VMALLOC_END(CONFIG_NIOS2_KERNEL_REGION_BASE - SZ_32M - 1) + +#define MODULES_VADDR (CONFIG_NIOS2_KERNEL_REGION_BASE - SZ_32M) +#define MODULES_END(CONFIG_NIOS2_KERNEL_REGION_BASE - 1) struct mm_struct; diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c index 76e0a42d6e36..9c97b7513853 100644 --- a/arch/nios2/kernel/module.c +++ b/arch/nios2/kernel/module.c @@ -21,23 +21,12 @@ #include -/* - * Modules should NOT be allocated with kmalloc for (obvious) reasons. - * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach - * from 0x8000 (vmalloc area) to 0xc (kernel) (kmalloc returns - * addresses in 0xc000) - */ void *module_alloc(unsigned long size) { - if (size == 0) - return NULL; - return kmalloc(size, GFP_KERNEL); -} - -/* Free memory returned from module_alloc */ -void module_memfree(void *module_region) -{ - kfree(module_region); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, + VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, + __builtin_return_address(0)); } int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, -- 2.43.0
[PATCH v5 02/15] mips: module: rename MODULE_START to MODULES_VADDR
From: "Mike Rapoport (IBM)" and MODULE_END to MODULES_END to match other architectures that define custom address space for modules. Signed-off-by: Mike Rapoport (IBM) --- arch/mips/include/asm/pgtable-64.h | 4 ++-- arch/mips/kernel/module.c | 4 ++-- arch/mips/mm/fault.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 20ca48c1b606..c0109aff223b 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -147,8 +147,8 @@ #if defined(CONFIG_MODULES) && defined(KBUILD_64BIT_SYM32) && \ VMALLOC_START != CKSSEG /* Load modules into 32bit-compatible segment. */ -#define MODULE_START CKSSEG -#define MODULE_END (FIXADDR_START-2*PAGE_SIZE) +#define MODULES_VADDR CKSSEG +#define MODULES_END(FIXADDR_START-2*PAGE_SIZE) #endif #define pte_ERROR(e) \ diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 7b2fbaa9cac5..9a6c96014904 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -31,10 +31,10 @@ struct mips_hi16 { static LIST_HEAD(dbe_list); static DEFINE_SPINLOCK(dbe_lock); -#ifdef MODULE_START +#ifdef MODULES_VADDR void *module_alloc(unsigned long size) { - return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index aaa9a242ebba..37fedeaca2e9 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -83,8 +83,8 @@ static void __do_page_fault(struct pt_regs *regs, unsigned long write, if (unlikely(address >= VMALLOC_START && address <= VMALLOC_END)) goto VMALLOC_FAULT_TARGET; -#ifdef MODULE_START - if (unlikely(address >= MODULE_START && address < MODULE_END)) +#ifdef MODULES_VADDR + if (unlikely(address >= MODULES_VADDR && address < MODULES_END)) goto VMALLOC_FAULT_TARGET; #endif -- 2.43.0
[PATCH v5 01/15] arm64: module: remove unneeded call to kasan_alloc_module_shadow()
From: "Mike Rapoport (IBM)" Since commit f6f37d9320a1 ("arm64: select KASAN_VMALLOC for SW/HW_TAGS modes") KASAN_VMALLOC is always enabled when KASAN is on. This means that allocations in module_alloc() will be tracked by KASAN protection for vmalloc() and that kasan_alloc_module_shadow() will be always an empty inline and there is no point in calling it. Drop meaningless call to kasan_alloc_module_shadow() from module_alloc(). Signed-off-by: Mike Rapoport (IBM) --- arch/arm64/kernel/module.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 47e0be610bb6..e92da4da1b2a 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -141,11 +141,6 @@ void *module_alloc(unsigned long size) __func__); } - if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { - vfree(p); - return NULL; - } - /* Memory is intended to be executable, reset the pointer tag. */ return kasan_reset_tag(p); } -- 2.43.0
[PATCH v5 00/15] mm: jit/text allocator
From: "Mike Rapoport (IBM)" (something went wrong with the prevois posting, sorry for the noise) Hi, Since v3 I looked into making execmem more of an utility toolbox, as we discussed at LPC with Mark Rutland, but it was getting more hairier than having a struct describing architecture constraints and a type identifying the consumer of execmem. And I do think that having the description of architecture constraints for allocations of executable memory in a single place is better than having it spread all over the place. The patches available via git: https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=execmem/v5 v5 changes: * rebase on v6.9-rc4 to avoid a conflict in kprobes * add copyrights to mm/execmem.c (Luis) * fix spelling (Ingo) * define MODULES_VADDDR for sparc (Sam) * consistently initialize struct execmem_info (Peter) * reduce #ifdefs in function bodies in kprobes (Masami) v4: https://lore.kernel.org/all/20240411160051.2093261-1-r...@kernel.org * rebase on v6.9-rc2 * rename execmem_params to execmem_info and execmem_arch_params() to execmem_arch_setup() * use single execmem_alloc() API instead of execmem_{text,data}_alloc() (Song) * avoid extra copy of execmem parameters (Rick) * run execmem_init() as core_initcall() except for the architectures that may allocated text really early (currently only x86) (Will) * add acks for some of arm64 and riscv changes, thanks Will and Alexandre * new commits: - drop call to kasan_alloc_module_shadow() on arm64 because it's not needed anymore - rename MODULE_START to MODULES_VADDR on MIPS - use CONFIG_EXECMEM instead of CONFIG_MODULES on powerpc as per Christophe: https://lore.kernel.org/all/79062fa3-3402-47b3-8920-9231ad05e...@csgroup.eu/ v3: https://lore.kernel.org/all/20230918072955.2507221-1-r...@kernel.org * add type parameter to execmem allocation APIs * remove BPF dependency on modules v2: https://lore.kernel.org/all/20230616085038.4121892-1-r...@kernel.org * Separate "module" and "others" 
allocations with execmem_text_alloc() and jit_text_alloc() * Drop ROX entailment on x86 * Add ack for nios2 changes, thanks Dinh Nguyen v1: https://lore.kernel.org/all/20230601101257.530867-1-r...@kernel.org = Cover letter from v1 (slightly updated) = module_alloc() is used everywhere as a means to allocate memory for code. Beside being semantically wrong, this unnecessarily ties all subsystems that need to allocate code, such as ftrace, kprobes and BPF to modules and puts the burden of code allocation to the modules code. Several architectures override module_alloc() because of various constraints where the executable memory can be located and this causes additional obstacles for improvements of code allocation. A centralized infrastructure for code allocation allows allocations of executable memory as ROX, and future optimizations such as caching large pages for better iTLB performance and providing sub-page allocations for users that only need small jit code snippets. Rick Edgecombe proposed perm_alloc extension to vmalloc [1] and Song Liu proposed execmem_alloc [2], but both these approaches were targeting BPF allocations and lacked the ground work to abstract executable allocations and split them from the modules core. Thomas Gleixner suggested to express module allocation restrictions and requirements as struct mod_alloc_type_params [3] that would define ranges, protections and other parameters for different types of allocations used by modules and following that suggestion Song separated allocations of different types in modules (commit ac3b43283923 ("module: replace module_layout with module_memory")) and posted "Type aware module allocator" set [4]. I liked the idea of parametrising code allocation requirements as a structure, but I believe the original proposal and Song's module allocator was too module centric, so I came up with these patches. 
This set splits code allocation from modules by introducing execmem_alloc() and execmem_free() APIs, replaces call sites of module_alloc() and module_memfree() with the new APIs and implements core text and related allocations in a central place. Instead of architecture specific overrides for module_alloc(), the architectures that require non-default behaviour for text allocation must fill execmem_info structure and implement execmem_arch_setup() that returns a pointer to that structure. If an architecture does not implement execmem_arch_setup(), the defaults compatible with the current modules::module_alloc() are used. Since architectures define different restrictions on placement, permissions, alignment and other parameters for memory that can be used by different subsystems that allocate executable memory, execmem APIs take a type argument, that will be used to identify the calling subsystem and to allow architectures to define parameters for ranges suitable for that subsystem. The new infrastructure allows decoupling of BPF, kprobes and ftrace from
Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions
On 19.04.24 20:25, David Hildenbrand wrote: On 06.04.24 19:36, Vincent Donnefort wrote: In preparation for allowing the user-space to map a ring-buffer, add a set of mapping functions: ring_buffer_{map,unmap}() And controls on the ring-buffer: ring_buffer_map_get_reader() /* swap reader and head */ Mapping the ring-buffer also involves: A unique ID for each subbuf of the ring-buffer, currently they are only identified through their in-kernel VA. A meta-page, where are stored ring-buffer statistics and a description for the current reader The linear mapping exposes the meta-page, and each subbuf of the ring-buffer, ordered following their unique ID, assigned during the first mapping. Once mapped, no subbuf can get in or out of the ring-buffer: the buffer size will remain unmodified and the splice enabling functions will in reality simply memcpy the data instead of swapping subbufs. CC: Signed-off-by: Vincent Donnefort diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index dc5ae4e96aee..96d2140b471e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -6,6 +6,8 @@ #include #include +#include + struct trace_buffer; struct ring_buffer_iter; @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); #define trace_rb_cpu_prepare NULL #endif +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma); +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h new file mode 100644 index ..ffcd8dfcaa4f --- /dev/null +++ b/include/uapi/linux/trace_mmap.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _TRACE_MMAP_H_ +#define _TRACE_MMAP_H_ + +#include + +/** + * struct trace_buffer_meta - Ring-buffer Meta-page description + * @meta_page_size:Size 
of this meta-page. + * @meta_struct_len: Size of this structure. + * @subbuf_size: Size of each sub-buffer. + * @nr_subbufs:Number of subbfs in the ring-buffer, including the reader. + * @reader.lost_events:Number of events lost at the time of the reader swap. + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] + * @reader.read: Number of bytes read on the reader subbuf. + * @flags: Placeholder for now, 0 until new features are supported. + * @entries: Number of entries in the ring-buffer. + * @overrun: Number of entries lost in the ring-buffer. + * @read: Number of entries that have been read. + * @Reserved1: Reserved for future use. + * @Reserved2: Reserved for future use. + */ +struct trace_buffer_meta { + __u32 meta_page_size; + __u32 meta_struct_len; + + __u32 subbuf_size; + __u32 nr_subbufs; + + struct { + __u64 lost_events; + __u32 id; + __u32 read; + } reader; + + __u64 flags; + + __u64 entries; + __u64 overrun; + __u64 read; + + __u64 Reserved1; + __u64 Reserved2; +}; + +#endif /* _TRACE_MMAP_H_ */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc9ebe593571..793ecc454039 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include #include @@ -338,6 +340,7 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned longreal_end; /* real end of data */ unsigned order; /* order of the page */ + u32 id;/* ID for external mapping */ struct buffer_data_page *page; /* Actual data page */ }; @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; + + unsigned intmapped; + struct mutexmapping_lock; + unsigned long *subbuf_ids;/* ID to subbuf VA */ + struct trace_buffer_meta*meta_page; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ 
longnr_pages_to_update; struct list_headnew_pages; /* new pages to add */ @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_irq_work(_buffer->irq_work.work,
Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent
Hello Matthieu, On Mon, Apr 22, 2024 at 4:47 PM Matthieu Baerts wrote: > > Hi Jason, > > On 22/04/2024 05:01, Jason Xing wrote: > > From: Jason Xing > > > > Add a new standalone file for the easy future extension to support > > both active reset and passive reset in the TCP/DCCP/MPTCP protocols. > > Thank you for looking at that! Thanks for the review! > > (...) > > > diff --git a/include/net/rstreason.h b/include/net/rstreason.h > > new file mode 100644 > > index ..c57bc5413c17 > > --- /dev/null > > +++ b/include/net/rstreason.h > > @@ -0,0 +1,144 @@ > > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > > + > > +#ifndef _LINUX_RSTREASON_H > > +#define _LINUX_RSTREASON_H > > +#include > > +#include > > + > > +#define DEFINE_RST_REASON(FN, FNe) \ > > + FN(MPTCP_RST_EUNSPEC) \ > > + FN(MPTCP_RST_EMPTCP)\ > > + FN(MPTCP_RST_ERESOURCE) \ > > + FN(MPTCP_RST_EPROHIBIT) \ > > + FN(MPTCP_RST_EWQ2BIG) \ > > + FN(MPTCP_RST_EBADPERF) \ > > + FN(MPTCP_RST_EMIDDLEBOX)\ > > Small detail: should it not make more sense to put the ones linked to > MPTCP at the end? I mean I guess MPTCP should be treated in second > priority: CONFIG_MPTCP could not be set, and the ones linked to TCP > should be more frequent, etc. Do you mean that I need to adjust the order: 1) tcp reasons first, 2) independent reasons, 3) mptcp reasons ? Reasonable. I will do it :) > > > + FN(NOT_SPECIFIED) \ > > + FN(NO_SOCKET) \ > > + FNe(MAX) > > (...) 
> > > +/* Convert reset reasons in MPTCP to our own enum type */ > > +static inline enum sk_rst_reason convert_mptcpreason(u32 reason) > > +{ > > + switch (reason) { > > + case MPTCP_RST_EUNSPEC: > > + return SK_RST_REASON_MPTCP_RST_EUNSPEC; > > + case MPTCP_RST_EMPTCP: > > + return SK_RST_REASON_MPTCP_RST_EMPTCP; > > + case MPTCP_RST_ERESOURCE: > > + return SK_RST_REASON_MPTCP_RST_ERESOURCE; > > + case MPTCP_RST_EPROHIBIT: > > + return SK_RST_REASON_MPTCP_RST_EPROHIBIT; > > + case MPTCP_RST_EWQ2BIG: > > + return SK_RST_REASON_MPTCP_RST_EWQ2BIG; > > + case MPTCP_RST_EBADPERF: > > + return SK_RST_REASON_MPTCP_RST_EBADPERF; > > + case MPTCP_RST_EMIDDLEBOX: > > + return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX; > > + default: > > + /** > > + * It should not happen, or else errors may occur > > + * in MPTCP layer > > + */ > > + return SK_RST_REASON_ERROR; > > + } > > +} > > If this helper is only used on MPTCP, maybe better to move it to > net/mptcp/protocol.h (and to patch 5/7?)? We tried to isolate MPTCP code. Roger that. I will move the helper into protocol.h as well as the patch itself. > > Also, maybe it is just me, but I'm not a big fan of the helper name: > convert_mptcpreason() (same for the "drop" one). I think it should at > least mention its "origin" (rst reason): e.g. something like > (sk_)rst_reason_convert_mptcp or (sk_)rst_convert_mptcp_reason() (or > mptcp_to_rst_reason())? > > And (sk_)rst_reason_convert_(skb_)drop() (or skb_drop_to_rst_reason())? I agree with you. Actually I had a local patch where I used sk_rst_reason_skbdrop() and sk_rst_reason_mptcpreason(). Interestingly, I changed them in this patch series due to the function name being too long (which is my initial thought). I will use sk_rst_convert_xxx_reason() as you suggested. > > > +/* Convert reset reasons in MPTCP to our own enum type */ > > I don't think this part is linked to MPTCP, right? Ah, copy-paste syndrome... Sorry, I will correct it. 
> > > +static inline enum sk_rst_reason convert_dropreason(enum skb_drop_reason > > reason) > > +{ > > + switch (reason) { > > + case SKB_DROP_REASON_NOT_SPECIFIED: > > + return SK_RST_REASON_NOT_SPECIFIED; > > + case SKB_DROP_REASON_NO_SOCKET: > > + return SK_RST_REASON_NO_SOCKET; > > + default: > > + /* If we don't have our own corresponding reason */ > > + return SK_RST_REASON_NOT_SPECIFIED; > > + } > > +} > > (This helper could be introduced in patch 4/7 because it is not used > before, but I'm fine either ways.) Good. It makes more sense. Thanks, Jason
Re: [PATCH v2 1/4] virtio_balloon: separate vm events into a function
On 22.04.24 09:42, zhenwei pi wrote: All the VM events related statistics have dependence on 'CONFIG_VM_EVENT_COUNTERS', once any stack variable is required by any VM events in future, we would have codes like: #ifdef CONFIG_VM_EVENT_COUNTERS unsigned long foo; #endif ... #ifdef CONFIG_VM_EVENT_COUNTERS foo = events[XXX] + events[YYY]; update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo); #endif Separate vm events into a single function, also remove 'CONFIG_VM_EVENT_COUNTERS' from 'update_balloon_stats'. Signed-off-by: zhenwei pi --- drivers/virtio/virtio_balloon.c | 44 ++--- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1f5b3dd31fcf..59fe157e5722 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -316,34 +316,48 @@ static inline void update_stat(struct virtio_balloon *vb, int idx, #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) -static unsigned int update_balloon_stats(struct virtio_balloon *vb) +/* Return the number of entries filled by vm events */ +static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb, + unsigned int start) { +#ifdef CONFIG_VM_EVENT_COUNTERS unsigned long events[NR_VM_EVENT_ITEMS]; - struct sysinfo i; - unsigned int idx = 0; - long available; - unsigned long caches; + unsigned int idx = start; all_vm_events(events); - si_meminfo(); - - available = si_mem_available(); - caches = global_node_page_state(NR_FILE_PAGES); - -#ifdef CONFIG_VM_EVENT_COUNTERS update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, - pages_to_bytes(events[PSWPIN])); + pages_to_bytes(events[PSWPIN])); update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, - pages_to_bytes(events[PSWPOUT])); + pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); + #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, 
events[HTLB_BUDDY_PGALLOC]); update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, events[HTLB_BUDDY_PGALLOC_FAIL]); -#endif -#endif +#endif /* CONFIG_HUGETLB_PAGE */ + + return idx - start; +#else /* CONFIG_VM_EVENT_COUNTERS */ + + return 0; +#endif /* CONFIG_VM_EVENT_COUNTERS */ +} + +static unsigned int update_balloon_stats(struct virtio_balloon *vb) +{ + struct sysinfo i; + unsigned int idx = 0; + long available; + unsigned long caches; + + idx += update_balloon_vm_stats(vb, idx); No need to handle idx that complicated now. Just do unsigned int idx; idx = update_balloon_vm_stats(vb); We can go down that path if we ever want to rearrange the code and not have the vm_stats first. + + si_meminfo(); + available = si_mem_available(); + caches = global_node_page_state(NR_FILE_PAGES); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, pages_to_bytes(i.freeram)); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, -- Cheers, David / dhildenb
Re: [PATCH v2 1/4] virtio_balloon: separate vm events into a function
On 22.04.24 10:04, zhenwei pi wrote: On 4/22/24 15:47, David Hildenbrand wrote: On 22.04.24 09:42, zhenwei pi wrote: All the VM events related statistics have dependence on 'CONFIG_VM_EVENT_COUNTERS', once any stack variable is required by any VM events in future, we would have codes like: #ifdef CONFIG_VM_EVENT_COUNTERS unsigned long foo; #endif ... #ifdef CONFIG_VM_EVENT_COUNTERS foo = events[XXX] + events[YYY]; update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo); #endif Separate vm events into a single function, also remove Why not simply use __maybe_unused for that variable? 1> static unsigned int update_balloon_stats() { unsigned __maybe_unused long foo; ... #ifdef CONFIG_VM_EVENT_COUNTERS foo = events[XXX] + events[YYY]; update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo); #endif } 2> static inline unsigned int update_balloon_vm_stats() { #ifdef CONFIG_VM_EVENT_COUNTERS unsigned long foo; foo = events[XXX] + events[YYY]; update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo); #endif } From the point of my view, I don't need to compile code in my brain when reading codes for case 2. :) But for #1? :) I mean, you didn't compile the code in your brain when you sent out v1 :P But I agree that moving that to a separate function is cleaner, staring at resulting update_balloon_stats(). Let me comment on some nits as a fresh reply. -- Cheers, David / dhildenb
Re: [PATCH v2 1/4] virtio_balloon: separate vm events into a function
On Mon, Apr 22, 2024 at 03:42:51PM +0800, zhenwei pi wrote: > All the VM events related statistics have dependence on > 'CONFIG_VM_EVENT_COUNTERS', once any stack variable is required by any > VM events in future, we would have codes like: > #ifdef CONFIG_VM_EVENT_COUNTERS > unsigned long foo; > #endif > ... > #ifdef CONFIG_VM_EVENT_COUNTERS > foo = events[XXX] + events[YYY]; > update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo); > #endif > > Separate vm events into a single function, also remove > 'CONFIG_VM_EVENT_COUNTERS' from 'update_balloon_stats'. > > Signed-off-by: zhenwei pi > --- > drivers/virtio/virtio_balloon.c | 44 ++--- > 1 file changed, 29 insertions(+), 15 deletions(-) > > diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c > index 1f5b3dd31fcf..59fe157e5722 100644 > --- a/drivers/virtio/virtio_balloon.c > +++ b/drivers/virtio/virtio_balloon.c > @@ -316,34 +316,48 @@ static inline void update_stat(struct virtio_balloon > *vb, int idx, > > #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) > > -static unsigned int update_balloon_stats(struct virtio_balloon *vb) > +/* Return the number of entries filled by vm events */ > +static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb, > +unsigned int start) > { > +#ifdef CONFIG_VM_EVENT_COUNTERS > unsigned long events[NR_VM_EVENT_ITEMS]; > - struct sysinfo i; > - unsigned int idx = 0; > - long available; > - unsigned long caches; > + unsigned int idx = start; > > all_vm_events(events); > - si_meminfo(); > - > - available = si_mem_available(); > - caches = global_node_page_state(NR_FILE_PAGES); > - > -#ifdef CONFIG_VM_EVENT_COUNTERS > update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, > - pages_to_bytes(events[PSWPIN])); > + pages_to_bytes(events[PSWPIN])); > update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, > - pages_to_bytes(events[PSWPOUT])); > + pages_to_bytes(events[PSWPOUT])); > update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); > 
update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); > + > #ifdef CONFIG_HUGETLB_PAGE > update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, > events[HTLB_BUDDY_PGALLOC]); > update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, > events[HTLB_BUDDY_PGALLOC_FAIL]); > -#endif > -#endif > +#endif /* CONFIG_HUGETLB_PAGE */ > + > + return idx - start; > +#else /* CONFIG_VM_EVENT_COUNTERS */ > + > + return 0; > +#endif /* CONFIG_VM_EVENT_COUNTERS */ > +} > + Generally the preferred style is this: #ifdef . static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb, unsigned int start) { } #else /* CONFIG_VM_EVENT_COUNTERS */ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb, unsigned int start) { return 0; } #endif however given it was a spaghetti of ifdefs even before that, the patch's ok I think. > +static unsigned int update_balloon_stats(struct virtio_balloon *vb) > +{ > + struct sysinfo i; > + unsigned int idx = 0; > + long available; > + unsigned long caches; > + > + idx += update_balloon_vm_stats(vb, idx); > + > + si_meminfo(); > + available = si_mem_available(); > + caches = global_node_page_state(NR_FILE_PAGES); > update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, > pages_to_bytes(i.freeram)); > update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, > -- > 2.34.1
[PATCH v5 15/15] bpf: remove CONFIG_BPF_JIT dependency on CONFIG_MODULES of
From: "Mike Rapoport (IBM)" BPF just-in-time compiler depended on CONFIG_MODULES because it used module_alloc() to allocate memory for the generated code. Since code allocations are now implemented with execmem, drop dependency of CONFIG_BPF_JIT on CONFIG_MODULES and make it select CONFIG_EXECMEM. Suggested-by: Björn Töpel Signed-off-by: Mike Rapoport (IBM) --- kernel/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index bc25f5098a25..f999e4e0b344 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -43,7 +43,7 @@ config BPF_JIT bool "Enable BPF Just In Time compiler" depends on BPF depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT - depends on MODULES + select EXECMEM help BPF programs are normally handled by a BPF interpreter. This option allows the kernel to generate native code when a program is loaded -- 2.43.0
[PATCH v5 14/15] kprobes: remove dependency on CONFIG_MODULES
From: "Mike Rapoport (IBM)" kprobes depended on CONFIG_MODULES because it has to allocate memory for code. Since code allocations are now implemented with execmem, kprobes can be enabled in non-modular kernels. Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes inside modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the dependency of CONFIG_KPROBES on CONFIG_MODULES. Signed-off-by: Mike Rapoport (IBM) --- arch/Kconfig| 2 +- include/linux/module.h | 9 ++ kernel/kprobes.c| 55 +++-- kernel/trace/trace_kprobe.c | 20 +- 4 files changed, 63 insertions(+), 23 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 7006f71f0110..a48ce6a488b3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -52,9 +52,9 @@ config GENERIC_ENTRY config KPROBES bool "Kprobes" - depends on MODULES depends on HAVE_KPROBES select KALLSYMS + select EXECMEM select TASKS_RCU if PREEMPTION help Kprobes allows you to trap at almost any kernel address and diff --git a/include/linux/module.h b/include/linux/module.h index 1153b0d99a80..ffa1c603163c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -605,6 +605,11 @@ static inline bool module_is_live(struct module *mod) return mod->state != MODULE_STATE_GOING; } +static inline bool module_is_coming(struct module *mod) +{ +return mod->state == MODULE_STATE_COMING; +} + struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); @@ -857,6 +862,10 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr) return ptr; } +static inline bool module_is_coming(struct module *mod) +{ + return false; +} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ddd7cdc16edf..ca2c6cbd42d2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1588,7 +1588,7 @@ static int check_kprobe_address_safe(struct kprobe *p, } /* Get module refcount and reject __init 
functions for loaded modules. */ - if (*probed_mod) { + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. @@ -1603,12 +1603,13 @@ static int check_kprobe_address_safe(struct kprobe *p, * kprobes in there. */ if (within_module_init((unsigned long)p->addr, *probed_mod) && - (*probed_mod)->state != MODULE_STATE_COMING) { + !module_is_coming(*probed_mod)) { module_put(*probed_mod); *probed_mod = NULL; ret = -ENOENT; } } + out: preempt_enable(); jump_label_unlock(); @@ -2488,24 +2489,6 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } -/* Remove all symbols in given area from kprobe blacklist */ -static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) -{ - struct kprobe_blacklist_entry *ent, *n; - - list_for_each_entry_safe(ent, n, _blacklist, list) { - if (ent->start_addr < start || ent->start_addr >= end) - continue; - list_del(>list); - kfree(ent); - } -} - -static void kprobe_remove_ksym_blacklist(unsigned long entry) -{ - kprobe_remove_area_blacklist(entry, entry + 1); -} - int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym) { @@ -2570,6 +2553,25 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? 
: arch_populate_kprobe_blacklist(); } +#ifdef CONFIG_MODULES +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, _blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(>list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; @@ -2672,6 +2674,17 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +static int kprobe_register_module_notifier(void) +{ + return register_module_notifier(_module_nb); +} +#else +static int kprobe_register_module_notifier(void) +{ +
[PATCH v5 13/15] powerpc: use CONFIG_EXECMEM instead of CONFIG_MODULES where appropriate
From: "Mike Rapoport (IBM)" There are places where CONFIG_MODULES guards the code that depends on memory allocation being done with module_alloc(). Replace CONFIG_MODULES with CONFIG_EXECMEM in such places. Signed-off-by: Mike Rapoport (IBM) --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/kasan.h | 2 +- arch/powerpc/kernel/head_8xx.S | 4 ++-- arch/powerpc/kernel/head_book3s_32.S | 6 +++--- arch/powerpc/lib/code-patching.c | 2 +- arch/powerpc/mm/book3s32/mmu.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1c4be3373686..2e586733a464 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -285,7 +285,7 @@ config PPC select IOMMU_HELPER if PPC64 select IRQ_DOMAIN select IRQ_FORCED_THREADING - select KASAN_VMALLOCif KASAN && MODULES + select KASAN_VMALLOCif KASAN && EXECMEM select LOCK_MM_AND_FIND_VMA select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index 365d2720097c..b5bbb94c51f6 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -19,7 +19,7 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 -#if defined(CONFIG_MODULES) && defined(CONFIG_PPC32) +#if defined(CONFIG_EXECMEM) && defined(CONFIG_PPC32) #define KASAN_KERN_START ALIGN_DOWN(PAGE_OFFSET - SZ_256M, SZ_256M) #else #define KASAN_KERN_START PAGE_OFFSET diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 647b0b445e89..edc479a7c2bc 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -199,12 +199,12 @@ instruction_counter: mfspr r10, SPRN_SRR0 /* Get effective address of fault */ INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11) mtspr SPRN_MD_EPN, r10 -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM mfcrr11 compare_to_kernel_boundary r10, r10 #endif mfspr r10, SPRN_M_TWB /* Get level 1 table */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM blt+3f rlwinm r10, r10, 
0, 20, 31 orisr10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index c1d89764dd22..57196883a00e 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -419,14 +419,14 @@ InstructionTLBMiss: */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 #endif mfspr r2, SPRN_SDR1 li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC rlwinm r2, r2, 28, 0xf000 -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM li r0, 3 bgt-112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ @@ -442,7 +442,7 @@ InstructionTLBMiss: andc. r1,r1,r2/* check access & ~permission */ bne-InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM rlwimi r2, r0, 0, 31, 31 /* userspace ? 
-> PP lsb */ #endif ori r1, r1, 0xe06 /* clear out reserved bits */ diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index c6ab46156cda..7af791446ddf 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -225,7 +225,7 @@ void __init poking_init(void) static unsigned long get_patch_pfn(void *addr) { - if (IS_ENABLED(CONFIG_MODULES) && is_vmalloc_or_module_addr(addr)) + if (IS_ENABLED(CONFIG_EXECMEM) && is_vmalloc_or_module_addr(addr)) return vmalloc_to_pfn(addr); else return __pa_symbol(addr) >> PAGE_SHIFT; diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 100f999871bc..625fe7d08e06 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -184,7 +184,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) static bool is_module_segment(unsigned long addr) { - if (!IS_ENABLED(CONFIG_MODULES)) + if (!IS_ENABLED(CONFIG_EXECMEM)) return false; if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M)) return false; -- 2.43.0
[PATCH v5 12/15] x86/ftrace: enable dynamic ftrace without CONFIG_MODULES
From: "Mike Rapoport (IBM)" Dynamic ftrace must allocate memory for code and this was impossible without CONFIG_MODULES. With execmem separated from the modules code, execmem_text_alloc() is available regardless of CONFIG_MODULES. Remove dependency of dynamic ftrace on CONFIG_MODULES and make CONFIG_DYNAMIC_FTRACE select CONFIG_EXECMEM in Kconfig. Signed-off-by: Mike Rapoport (IBM) --- arch/x86/Kconfig | 1 + arch/x86/kernel/ftrace.c | 10 -- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3f5ba72c9480..cd8addb96a0b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86_64 select SWIOTLB select ARCH_HAS_ELFCORE_COMPAT select ZONE_DMA32 + select EXECMEM if DYNAMIC_FTRACE config FORCE_DYNAMIC_FTRACE def_bool y diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c8ddb7abda7c..8da0e66ca22d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -261,8 +261,6 @@ void arch_ftrace_update_code(int command) /* Currently only x86_64 supports dynamic trampolines */ #ifdef CONFIG_X86_64 -#ifdef CONFIG_MODULES -/* Module allocation simplifies allocating memory for code */ static inline void *alloc_tramp(unsigned long size) { return execmem_alloc(EXECMEM_FTRACE, size); @@ -271,14 +269,6 @@ static inline void tramp_free(void *tramp) { execmem_free(tramp); } -#else -/* Trampolines can only be created if modules are supported */ -static inline void *alloc_tramp(unsigned long size) -{ - return NULL; -} -static inline void tramp_free(void *tramp) { } -#endif /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); -- 2.43.0
[PATCH v5 11/15] arch: make execmem setup available regardless of CONFIG_MODULES
From: "Mike Rapoport (IBM)" execmem does not depend on modules, on the contrary modules use execmem. To make execmem available when CONFIG_MODULES=n, for instance for kprobes, split execmem_params initialization out from arch/*/kernel/module.c and compile it when CONFIG_EXECMEM=y Signed-off-by: Mike Rapoport (IBM) --- arch/arm/kernel/module.c | 43 -- arch/arm/mm/init.c | 45 +++ arch/arm64/kernel/module.c | 140 - arch/arm64/mm/init.c | 140 + arch/loongarch/kernel/module.c | 19 - arch/loongarch/mm/init.c | 21 + arch/mips/kernel/module.c | 22 -- arch/mips/mm/init.c| 23 ++ arch/nios2/kernel/module.c | 20 - arch/nios2/mm/init.c | 21 + arch/parisc/kernel/module.c| 20 - arch/parisc/mm/init.c | 23 +- arch/powerpc/kernel/module.c | 63 --- arch/powerpc/mm/mem.c | 64 +++ arch/riscv/kernel/module.c | 44 --- arch/riscv/mm/init.c | 45 +++ arch/s390/kernel/module.c | 27 --- arch/s390/mm/init.c| 30 +++ arch/sparc/kernel/module.c | 19 - arch/sparc/mm/Makefile | 2 + arch/sparc/mm/execmem.c| 21 + arch/x86/kernel/module.c | 27 --- arch/x86/mm/init.c | 29 +++ 23 files changed, 463 insertions(+), 445 deletions(-) create mode 100644 arch/sparc/mm/execmem.c diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index a98fdf6ff26c..677f218f7e84 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -12,57 +12,14 @@ #include #include #include -#include #include #include -#include -#include #include #include #include #include -#ifdef CONFIG_XIP_KERNEL -/* - * The XIP kernel text is mapped in the module area for modules and - * some other stuff to work without any indirect relocations. - * MODULES_VADDR is redefined here and not in asm/memory.h to avoid - * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
- */ -#undef MODULES_VADDR -#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) -#endif - -#ifdef CONFIG_MMU -static struct execmem_info execmem_info __ro_after_init; - -struct execmem_info __init *execmem_arch_setup(void) -{ - unsigned long fallback_start = 0, fallback_end = 0; - - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { - fallback_start = VMALLOC_START; - fallback_end = VMALLOC_END; - } - - execmem_info = (struct execmem_info){ - .ranges = { - [EXECMEM_DEFAULT] = { - .start = MODULES_VADDR, - .end= MODULES_END, - .pgprot = PAGE_KERNEL_EXEC, - .alignment = 1, - .fallback_start = fallback_start, - .fallback_end = fallback_end, - }, - }, - }; - - return _info; -} -#endif - bool module_init_section(const char *name) { return strstarts(name, ".init") || diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index e8c6f4be0ce1..5345d218899a 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -486,3 +487,47 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif + +#ifdef CONFIG_EXECMEM + +#ifdef CONFIG_XIP_KERNEL +/* + * The XIP kernel text is mapped in the module area for modules and + * some other stuff to work without any indirect relocations. + * MODULES_VADDR is redefined here and not in asm/memory.h to avoid + * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
+ */ +#undef MODULES_VADDR +#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) +#endif + +#ifdef CONFIG_MMU +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) +{ + unsigned long fallback_start = 0, fallback_end = 0; + + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { + fallback_start = VMALLOC_START; + fallback_end = VMALLOC_END; + } + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + }, +
[PATCH v5 10/15] powerpc: extend execmem_params for kprobes allocations
From: "Mike Rapoport (IBM)" powerpc overrides kprobes::alloc_insn_page() to remove writable permissions when STRICT_MODULE_RWX is on. Add definition of EXECMEM_KPROBES to execmem_params to allow using the generic kprobes::alloc_insn_page() with the desired permissions. As powerpc uses breakpoint instructions to inject kprobes, it does not need to constrain kprobe allocations to the modules area and can use the entire vmalloc address space. Signed-off-by: Mike Rapoport (IBM) --- arch/powerpc/kernel/kprobes.c | 20 arch/powerpc/kernel/module.c | 7 +++ 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 9fcd01bb2ce6..14c5ddec3056 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -126,26 +126,6 @@ kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offse return (kprobe_opcode_t *)(addr + offset); } -void *alloc_insn_page(void) -{ - void *page; - - page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); - if (!page) - return NULL; - - if (strict_module_rwx_enabled()) { - int err = set_memory_rox((unsigned long)page, 1); - - if (err) - goto error; - } - return page; -error: - execmem_free(page); - return NULL; -} - int arch_prepare_kprobe(struct kprobe *p) { int ret = 0; diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index ac80559015a3..2a23cf7e141b 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -94,6 +94,7 @@ static struct execmem_info execmem_info __ro_after_init; struct execmem_info __init *execmem_arch_setup(void) { + pgprot_t kprobes_prot = strict_module_rwx_enabled() ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; pgprot_t prot = strict_module_rwx_enabled() ? 
PAGE_KERNEL : PAGE_KERNEL_EXEC; unsigned long fallback_start = 0, fallback_end = 0; unsigned long start, end; @@ -132,6 +133,12 @@ struct execmem_info __init *execmem_arch_setup(void) .fallback_start = fallback_start, .fallback_end = fallback_end, }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end= VMALLOC_END, + .pgprot = kprobes_prot, + .alignment = 1, + }, [EXECMEM_MODULE_DATA] = { .start = VMALLOC_START, .end= VMALLOC_END, -- 2.43.0
[PATCH v5 09/15] riscv: extend execmem_params for generated code allocations
From: "Mike Rapoport (IBM)" The memory allocations for kprobes and BPF on RISC-V are not placed in the modules area and these custom allocations are implemented with overrides of alloc_insn_page() and bpf_jit_alloc_exec(). Slightly reorder execmem_params initialization to support both 32 and 64 bit variants, define EXECMEM_KPROBES and EXECMEM_BPF ranges in riscv::execmem_params and drop overrides of alloc_insn_page() and bpf_jit_alloc_exec(). Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Alexandre Ghiti --- arch/riscv/kernel/module.c | 28 +--- arch/riscv/kernel/probes/kprobes.c | 10 -- arch/riscv/net/bpf_jit_core.c | 13 - 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 182904127ba0..2ecbacbc9993 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -906,19 +906,41 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, return 0; } -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +#ifdef CONFIG_MMU static struct execmem_info execmem_info __ro_after_init; struct execmem_info __init *execmem_arch_setup(void) { + unsigned long start, end; + + if (IS_ENABLED(CONFIG_64BIT)) { + start = MODULES_VADDR; + end = MODULES_END; + } else { + start = VMALLOC_START; + end = VMALLOC_END; + } + execmem_info = (struct execmem_info){ .ranges = { [EXECMEM_DEFAULT] = { - .start = MODULES_VADDR, - .end= MODULES_END, + .start = start, + .end= end, .pgprot = PAGE_KERNEL, .alignment = 1, }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end= VMALLOC_END, + .pgprot = PAGE_KERNEL_READ_EXEC, + .alignment = 1, + }, + [EXECMEM_BPF] = { + .start = BPF_JIT_REGION_START, + .end= BPF_JIT_REGION_END, + .pgprot = PAGE_KERNEL, + .alignment = PAGE_SIZE, + }, }, }; diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 2f08c14a933d..e64f2f3064eb 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -104,16 +104,6 @@ 
int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } -#ifdef CONFIG_MMU -void *alloc_insn_page(void) -{ - return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, -GFP_KERNEL, PAGE_KERNEL_READ_EXEC, -VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, -__builtin_return_address(0)); -} -#endif - /* install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 6b3acac30c06..e238fdbd5dbc 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -219,19 +219,6 @@ u64 bpf_jit_alloc_exec_limit(void) return BPF_JIT_REGION_SIZE; } -void *bpf_jit_alloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, - BPF_JIT_REGION_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} - -void bpf_jit_free_exec(void *addr) -{ - return vfree(addr); -} - void *bpf_arch_text_copy(void *dst, void *src, size_t len) { int ret; -- 2.43.0
[PATCH v5 08/15] mm/execmem, arch: convert remaining overrides of module_alloc to execmem
From: "Mike Rapoport (IBM)" Extend execmem parameters to accommodate more complex overrides of module_alloc() by architectures. This includes specification of a fallback range required by arm, arm64 and powerpc, EXECMEM_MODULE_DATA type required by powerpc, support for allocation of KASAN shadow required by s390 and x86 and support for early initialization of execmem required by x86. The core implementation of execmem_alloc() takes care of suppressing warnings when the initial allocation fails but there is a fallback range defined. Signed-off-by: Mike Rapoport (IBM) Acked-by: Will Deacon --- arch/Kconfig | 6 +++ arch/arm/kernel/module.c | 41 ++--- arch/arm64/kernel/module.c | 67 ++-- arch/arm64/kernel/probes/kprobes.c | 7 --- arch/arm64/net/bpf_jit_comp.c | 11 - arch/powerpc/kernel/module.c | 60 - arch/s390/kernel/module.c | 54 ++- arch/x86/Kconfig | 1 + arch/x86/kernel/module.c | 70 ++ include/linux/execmem.h| 34 +++ include/linux/moduleloader.h | 12 - kernel/module/main.c | 26 +++ mm/execmem.c | 70 +- mm/mm_init.c | 2 + 14 files changed, 259 insertions(+), 202 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 65afb1de48b3..7006f71f0110 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -960,6 +960,12 @@ config ARCH_WANTS_MODULES_DATA_IN_VMALLOC For architectures like powerpc/32 which have constraints on module allocation and need to allocate module data outside of module area. +config ARCH_WANTS_EXECMEM_EARLY + bool + help + For architectures that might allocate executable memory early on + boot, for instance ftrace on x86. 
+ config HAVE_IRQ_EXIT_ON_IRQ_STACK bool help diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index e74d84f58b77..a98fdf6ff26c 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -34,23 +35,31 @@ #endif #ifdef CONFIG_MMU -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - /* Silence the initial allocation */ - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) - gfp_mask |= __GFP_NOWARN; - - p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) - return p; - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + unsigned long fallback_start = 0, fallback_end = 0; + + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { + fallback_start = VMALLOC_START; + fallback_end = VMALLOC_END; + } + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + }, + }; + + return _info; } #endif diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index e92da4da1b2a..a52240ea084b 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -108,41 +109,59 @@ static int __init module_init_limits(void) return 0; } -subsys_initcall(module_init_limits); -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - void *p = NULL; + unsigned long 
fallback_start = 0, fallback_end = 0; + unsigned long start = 0, end = 0; + + module_init_limits(); /* * Where possible, prefer to allocate within direct branch range of the * kernel such that no PLTs are necessary. */ if (module_direct_base) { - p = __vmalloc_node_range(size, MODULE_ALIGN, -module_direct_base, -
[PATCH v5 07/15] mm/execmem, arch: convert simple overrides of module_alloc to execmem
From: "Mike Rapoport (IBM)" Several architectures override module_alloc() only to define address range for code allocations different than VMALLOC address space. Provide a generic implementation in execmem that uses the parameters for address space ranges, required alignment and page protections provided by architectures. The architectures must fill execmem_info structure and implement execmem_arch_setup() that returns a pointer to that structure. This way the execmem initialization won't be called from every architecture, but rather from a central place, namely a core_initcall() in execmem. The execmem provides execmem_alloc() API that wraps __vmalloc_node_range() with the parameters defined by the architectures. If an architecture does not implement execmem_arch_setup(), execmem_alloc() will fall back to module_alloc(). Signed-off-by: Mike Rapoport (IBM) --- arch/loongarch/kernel/module.c | 19 +++-- arch/mips/kernel/module.c | 20 -- arch/nios2/kernel/module.c | 21 +++--- arch/parisc/kernel/module.c| 24 +++ arch/riscv/kernel/module.c | 24 +++ arch/sparc/kernel/module.c | 20 -- include/linux/execmem.h| 41 +++ mm/execmem.c | 73 -- 8 files changed, 208 insertions(+), 34 deletions(-) diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c index c7d0338d12c1..ca6dd7ea1610 100644 --- a/arch/loongarch/kernel/module.c +++ b/arch/loongarch/kernel/module.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -490,10 +491,22 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, return 0; } -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + 
.pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return _info; } static void module_init_ftrace_plt(const Elf_Ehdr *hdr, diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 9a6c96014904..59225a3cf918 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -20,6 +20,7 @@ #include #include #include +#include #include struct mips_hi16 { @@ -32,11 +33,22 @@ static LIST_HEAD(dbe_list); static DEFINE_SPINLOCK(dbe_lock); #ifdef MODULES_VADDR -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return _info; } #endif diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c index 9c97b7513853..0d1ee86631fc 100644 --- a/arch/nios2/kernel/module.c +++ b/arch/nios2/kernel/module.c @@ -18,15 +18,26 @@ #include #include #include +#include #include -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, - VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, - __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + }, + }, + }; + + return _info; } int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index
[PATCH v5 06/15] mm: introduce execmem_alloc() and execmem_free()
From: "Mike Rapoport (IBM)" module_alloc() is used everywhere as a means to allocate memory for code. Besides being semantically wrong, this unnecessarily ties all subsystems that need to allocate code, such as ftrace, kprobes and BPF to modules and puts the burden of code allocation to the modules code. Several architectures override module_alloc() because of various constraints where the executable memory can be located and this causes additional obstacles for improvements of code allocation. Start splitting code allocation from modules by introducing execmem_alloc() and execmem_free() APIs. Initially, execmem_alloc() is a wrapper for module_alloc() and execmem_free() is a replacement of module_memfree() to allow updating all call sites to use the new APIs. Since architectures define different restrictions on placement, permissions, alignment and other parameters for memory that can be used by different subsystems that allocate executable memory, execmem_alloc() takes a type argument, that will be used to identify the calling subsystem and to allow architectures to define parameters for ranges suitable for that subsystem. No functional changes. 
Signed-off-by: Mike Rapoport (IBM) Acked-by: Masami Hiramatsu (Google) --- arch/powerpc/kernel/kprobes.c| 6 ++-- arch/s390/kernel/ftrace.c| 4 +-- arch/s390/kernel/kprobes.c | 4 +-- arch/s390/kernel/module.c| 5 +-- arch/sparc/net/bpf_jit_comp_32.c | 8 ++--- arch/x86/kernel/ftrace.c | 6 ++-- arch/x86/kernel/kprobes/core.c | 4 +-- include/linux/execmem.h | 57 include/linux/moduleloader.h | 3 -- kernel/bpf/core.c| 6 ++-- kernel/kprobes.c | 8 ++--- kernel/module/Kconfig| 1 + kernel/module/main.c | 25 +- mm/Kconfig | 3 ++ mm/Makefile | 1 + mm/execmem.c | 32 ++ 16 files changed, 128 insertions(+), 45 deletions(-) create mode 100644 include/linux/execmem.h create mode 100644 mm/execmem.c diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bbca90a5e2ec..9fcd01bb2ce6 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -130,7 +130,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; @@ -142,7 +142,7 @@ void *alloc_insn_page(void) } return page; error: - module_memfree(page); + execmem_free(page); return NULL; } diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index c46381ea04ec..798249ef5646 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -7,13 +7,13 @@ * Author(s): Martin Schwidefsky */ -#include #include #include #include #include #include #include +#include #include #include #include @@ -220,7 +220,7 @@ static int __init ftrace_plt_init(void) { const char *start, *end; - ftrace_plt = module_alloc(PAGE_SIZE); + ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE); if (!ftrace_plt) panic("cannot allocate ftrace plt\n"); diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index f0cf20d4b3c5..3c1b1be744de 100644 --- a/arch/s390/kernel/kprobes.c +++ 
b/arch/s390/kernel/kprobes.c @@ -9,7 +9,6 @@ #define pr_fmt(fmt) "kprobes: " fmt -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +38,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; set_memory_rox((unsigned long)page, 1); diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 42215f9404af..ac97a905e8cd 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,7 @@ void *module_alloc(unsigned long size) #ifdef CONFIG_FUNCTION_TRACER void module_arch_cleanup(struct module *mod) { - module_memfree(mod->arch.trampolines_start); + execmem_free(mod->arch.trampolines_start); } #endif @@ -510,7 +511,7 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); numpages = DIV_ROUND_UP(size, PAGE_SIZE); - start = module_alloc(numpages * PAGE_SIZE); + start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE); if (!start) return -ENOMEM;
[PATCH v5 05/15] module: make module_memory_{alloc,free} more self-contained
From: "Mike Rapoport (IBM)" Move the logic related to the memory allocation and freeing into module_memory_alloc() and module_memory_free(). Signed-off-by: Mike Rapoport (IBM) --- kernel/module/main.c | 64 +++- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/kernel/module/main.c b/kernel/module/main.c index e1e8a7a9d6c1..5b82b069e0d3 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1203,15 +1203,44 @@ static bool mod_mem_use_vmalloc(enum mod_mem_type type) mod_mem_type_is_core_data(type); } -static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) +static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { + unsigned int size = PAGE_ALIGN(mod->mem[type].size); + void *ptr; + + mod->mem[type].size = size; + if (mod_mem_use_vmalloc(type)) - return vzalloc(size); - return module_alloc(size); + ptr = vmalloc(size); + else + ptr = module_alloc(size); + + if (!ptr) + return -ENOMEM; + + /* +* The pointer to these blocks of memory are stored on the module +* structure and we keep that around so long as the module is +* around. We only free that memory when we unload the module. +* Just mark them as not being a leak then. The .init* ELF +* sections *do* get freed after boot so we *could* treat them +* slightly differently with kmemleak_ignore() and only grey +* them out as they work as typical memory allocations which +* *do* eventually get freed, but let's just keep things simple +* and avoid *any* false positives. +*/ + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; + + return 0; } -static void module_memory_free(void *ptr, enum mod_mem_type type) +static void module_memory_free(struct module *mod, enum mod_mem_type type) { + void *ptr = mod->mem[type].base; + if (mod_mem_use_vmalloc(type)) vfree(ptr); else @@ -1229,12 +1258,12 @@ static void free_mod_mem(struct module *mod) /* Free lock-classes; relies on the preceding sync_rcu(). 
*/ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod_mem->base, type); + module_memory_free(mod, type); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); + module_memory_free(mod, MOD_DATA); } /* Free a module, remove from lists, etc. */ @@ -2225,7 +2254,6 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { int i; - void *ptr; enum mod_mem_type t = 0; int ret = -ENOMEM; @@ -2234,26 +2262,12 @@ static int move_module(struct module *mod, struct load_info *info) mod->mem[type].base = NULL; continue; } - mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size); - ptr = module_memory_alloc(mod->mem[type].size, type); - /* - * The pointer to these blocks of memory are stored on the module - * structure and we keep that around so long as the module is - * around. We only free that memory when we unload the module. - * Just mark them as not being a leak then. The .init* ELF - * sections *do* get freed after boot so we *could* treat them - * slightly differently with kmemleak_ignore() and only grey - * them out as they work as typical memory allocations which - * *do* eventually get freed, but let's just keep things simple - * and avoid *any* false positives. -*/ - kmemleak_not_leak(ptr); - if (!ptr) { + + ret = module_memory_alloc(mod, type); + if (ret) { t = type; goto out_enomem; } - memset(ptr, 0, mod->mem[type].size); - mod->mem[type].base = ptr; } /* Transfer each section which specifies SHF_ALLOC */ @@ -2296,7 +2310,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_enomem: for (t--; t >= 0; t--) - module_memory_free(mod->mem[t].base, t); + module_memory_free(mod, t); return ret; } -- 2.43.0