[PATCH v3 4/4] virtio_balloon: introduce memory scan/reclaim info
Expose memory scan/reclaim information to the host side via virtio balloon device. Now we have a metric to analyze the memory performance: y: counter increases n: counter does not change h: the rate of counter change is high l: the rate of counter change is low OOM: VIRTIO_BALLOON_S_OOM_KILL STALL: VIRTIO_BALLOON_S_ALLOC_STALL ASCAN: VIRTIO_BALLOON_S_SCAN_ASYNC DSCAN: VIRTIO_BALLOON_S_SCAN_DIRECT ARCLM: VIRTIO_BALLOON_S_RECLAIM_ASYNC DRCLM: VIRTIO_BALLOON_S_RECLAIM_DIRECT - OOM[y], STALL[*], ASCAN[*], DSCAN[*], ARCLM[*], DRCLM[*]: the guest runs under really critical memory pressure - OOM[n], STALL[h], ASCAN[*], DSCAN[l], ARCLM[*], DRCLM[l]: the memory allocation stalls due to cgroup, not the global memory pressure. - OOM[n], STALL[h], ASCAN[*], DSCAN[h], ARCLM[*], DRCLM[h]: the memory allocation stalls due to global memory pressure. The performance gets hurt a lot. A high ratio between DRCLM/DSCAN shows quite effective memory reclaiming. - OOM[n], STALL[h], ASCAN[*], DSCAN[h], ARCLM[*], DRCLM[l]: the memory allocation stalls due to global memory pressure. The ratio between DRCLM/DSCAN gets low, the guest OS is thrashing heavily; the serious case leads to poor performance and difficult troubleshooting. For example, sshd may block on memory allocation when accepting new connections, so a user can't log in to a VM via the ssh command. - OOM[n], STALL[n], ASCAN[h], DSCAN[n], ARCLM[l], DRCLM[n]: the low ratio between ARCLM/ASCAN shows that the guest tries to reclaim more memory, but it can't. Once more memory is required in future, it will struggle to reclaim memory. 
Acked-by: David Hildenbrand Signed-off-by: zhenwei pi --- drivers/virtio/virtio_balloon.c | 9 + include/uapi/linux/virtio_balloon.h | 12 ++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index e6229e548832..225662358221 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -340,6 +340,15 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall); + update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_SCAN, + pages_to_bytes(events[PGSCAN_KSWAPD])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_SCAN, + pages_to_bytes(events[PGSCAN_DIRECT])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_RECLAIM, + pages_to_bytes(events[PGSTEAL_KSWAPD])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_RECLAIM, + pages_to_bytes(events[PGSTEAL_DIRECT])); + #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, events[HTLB_BUDDY_PGALLOC]); diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 487b893a160e..ee35a372805d 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -73,7 +73,11 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ #define VIRTIO_BALLOON_S_OOM_KILL 10 /* OOM killer invocations */ #define VIRTIO_BALLOON_S_ALLOC_STALL 11 /* Stall count of memory allocatoin */ -#define VIRTIO_BALLOON_S_NR 12 +#define VIRTIO_BALLOON_S_ASYNC_SCAN 12 /* Amount of memory scanned asynchronously */ +#define VIRTIO_BALLOON_S_DIRECT_SCAN 13 /* Amount of memory scanned directly */ +#define VIRTIO_BALLOON_S_ASYNC_RECLAIM 14 /* Amount of memory reclaimed asynchronously */ +#define VIRTIO_BALLOON_S_DIRECT_RECLAIM 15 /* Amount of memory reclaimed directly */ +#define VIRTIO_BALLOON_S_NR 16 #define 
VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \ VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \ @@ -87,7 +91,11 @@ struct virtio_balloon_config { VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \ VIRTIO_BALLOON_S_NAMES_prefix "oom-kills", \ - VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls" \ + VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls", \ + VIRTIO_BALLOON_S_NAMES_prefix "async-scans", \ + VIRTIO_BALLOON_S_NAMES_prefix "direct-scans", \ + VIRTIO_BALLOON_S_NAMES_prefix "async-reclaims", \ + VIRTIO_BALLOON_S_NAMES_prefix "direct-reclaims" \ } #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("") -- 2.34.1
[PATCH v3 3/4] virtio_balloon: introduce memory allocation stall counter
Memory allocation stall counter represents the performance/latency of memory allocation, expose this counter to the host side by virtio balloon device via out-of-bound way. Acked-by: David Hildenbrand Signed-off-by: zhenwei pi --- drivers/virtio/virtio_balloon.c | 8 include/uapi/linux/virtio_balloon.h | 6 -- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index f7a47eaa0936..e6229e548832 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -322,6 +322,8 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) { unsigned long events[NR_VM_EVENT_ITEMS]; unsigned int idx = 0; + unsigned int zid; + unsigned long stall = 0; all_vm_events(events); update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, @@ -332,6 +334,12 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_OOM_KILL, events[OOM_KILL]); + /* sum all the stall events */ + for (zid = 0; zid < MAX_NR_ZONES; zid++) + stall += events[ALLOCSTALL_NORMAL - ZONE_NORMAL + zid]; + + update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall); + #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, events[HTLB_BUDDY_PGALLOC]); diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index b17bbe033697..487b893a160e 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -72,7 +72,8 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ #define VIRTIO_BALLOON_S_OOM_KILL 10 /* OOM killer invocations */ -#define VIRTIO_BALLOON_S_NR 11 +#define VIRTIO_BALLOON_S_ALLOC_STALL 11 /* Stall count of memory allocatoin */ +#define VIRTIO_BALLOON_S_NR 
12 #define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \ VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \ @@ -85,7 +86,8 @@ struct virtio_balloon_config { VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \ - VIRTIO_BALLOON_S_NAMES_prefix "oom-kills" \ + VIRTIO_BALLOON_S_NAMES_prefix "oom-kills", \ + VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls" \ } #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("") -- 2.34.1
[PATCH v3 2/4] virtio_balloon: introduce oom-kill invocations
When the guest OS runs under critical memory pressure, the guest starts to kill processes. A guest monitor agent may scan 'oom_kill' from /proc/vmstat, and reports the OOM KILL event. However, the agent may be killed and we will lose this critical event (and the later events). For now we can also grep for magic words in guest kernel log from host side. Rather than this unstable way, virtio balloon reports OOM-KILL invocations instead. Acked-by: David Hildenbrand Signed-off-by: zhenwei pi --- drivers/virtio/virtio_balloon.c | 1 + include/uapi/linux/virtio_balloon.h | 6 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1710e3098ecd..f7a47eaa0936 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -330,6 +330,7 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_OOM_KILL, events[OOM_KILL]); #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index ddaa45e723c4..b17bbe033697 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -71,7 +71,8 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */ #define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ -#define VIRTIO_BALLOON_S_NR 10 +#define VIRTIO_BALLOON_S_OOM_KILL 10 /* OOM killer invocations */ +#define VIRTIO_BALLOON_S_NR 11 #define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \ VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \ @@ -83,7 +84,8 @@ struct virtio_balloon_config 
{ VIRTIO_BALLOON_S_NAMES_prefix "available-memory", \ VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \ - VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures" \ + VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \ + VIRTIO_BALLOON_S_NAMES_prefix "oom-kills" \ } #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("") -- 2.34.1
[PATCH v3 1/4] virtio_balloon: separate vm events into a function
All the VM events related statistics have dependence on 'CONFIG_VM_EVENT_COUNTERS', separate these events into a function to make code clean. Then we can remove 'CONFIG_VM_EVENT_COUNTERS' from 'update_balloon_stats'. Signed-off-by: zhenwei pi --- drivers/virtio/virtio_balloon.c | 43 ++--- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1f5b3dd31fcf..1710e3098ecd 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -316,34 +316,49 @@ static inline void update_stat(struct virtio_balloon *vb, int idx, #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) -static unsigned int update_balloon_stats(struct virtio_balloon *vb) +#ifdef CONFIG_VM_EVENT_COUNTERS +/* Return the number of entries filled by vm events */ +static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) { unsigned long events[NR_VM_EVENT_ITEMS]; - struct sysinfo i; unsigned int idx = 0; - long available; - unsigned long caches; all_vm_events(events); - si_meminfo(); - - available = si_mem_available(); - caches = global_node_page_state(NR_FILE_PAGES); - -#ifdef CONFIG_VM_EVENT_COUNTERS update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, - pages_to_bytes(events[PSWPIN])); + pages_to_bytes(events[PSWPIN])); update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, - pages_to_bytes(events[PSWPOUT])); + pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); + #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, events[HTLB_BUDDY_PGALLOC]); update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, events[HTLB_BUDDY_PGALLOC_FAIL]); -#endif -#endif +#endif /* CONFIG_HUGETLB_PAGE */ + + return idx; +} +#else /* CONFIG_VM_EVENT_COUNTERS */ +static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) +{ + return 0; +} +#endif /* 
CONFIG_VM_EVENT_COUNTERS */ + +static unsigned int update_balloon_stats(struct virtio_balloon *vb) +{ + struct sysinfo i; + unsigned int idx; + long available; + unsigned long caches; + + idx = update_balloon_vm_stats(vb); + + si_meminfo(&i); + available = si_mem_available(); + caches = global_node_page_state(NR_FILE_PAGES); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, pages_to_bytes(i.freeram)); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, -- 2.34.1
[PATCH v3 0/4] Improve memory statistics for virtio balloon
Hi, v2 -> v3: - A few coding style changes in '[PATCH v3 1/4] virtio_balloon: separate vm events into a function' v1 -> v2: - Add a new patch 'virtio_balloon: separate vm events into a function' to avoid any compiler warnings (unused stack variable on CONFIG_VM_EVENT_COUNTERS=n) - Suggested by David, use a loop 'for (zid = 0; zid < MAX_NR_ZONES; zid++)' to obtain all the stall events. RFC -> v1: - several text changes: oom-kill -> oom-kills, SCAN_ASYNC -> ASYNC_SCAN. - move vm events codes into '#ifdef CONFIG_VM_EVENT_COUNTERS' RFC version: Link: https://lore.kernel.org/lkml/20240415084113.1203428-1-pizhen...@bytedance.com/T/#m1898963b3c27a989b1123db475135c3ca687ca84 zhenwei pi (4): virtio_balloon: separate vm events into a function virtio_balloon: introduce oom-kill invocations virtio_balloon: introduce memory allocation stall counter virtio_balloon: introduce memory scan/reclaim info drivers/virtio/virtio_balloon.c | 61 ++--- include/uapi/linux/virtio_balloon.h | 16 +++- 2 files changed, 61 insertions(+), 16 deletions(-) -- 2.34.1
Re: [PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()
On 4/23/24 06:46, Michael S. Tsirkin wrote: On Mon, Apr 08, 2024 at 02:15:24PM +1000, Gavin Shan wrote: On 3/30/24 19:02, Gavin Shan wrote: On 3/28/24 19:31, Michael S. Tsirkin wrote: On Thu, Mar 28, 2024 at 10:21:49AM +1000, Gavin Shan wrote: All the callers of vhost_get_avail_idx() are concerned to the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. With it, the callers needn't to worry about the memory barrier. Suggested-by: Michael S. Tsirkin Signed-off-by: Gavin Shan Previous patches are ok. This one I feel needs more work - first more code such as sanity checking should go into this function, second there's actually a difference between comparing to last_avail_idx and just comparing to the previous value of avail_idx. I will pick patches 1-2 and post a cleanup on top so you can take a look, ok? Thanks, Michael. It's fine to me. A kindly ping. If it's ok to you, could you please merge PATCH[1-2]? Our downstream 9.4 need the fixes, especially for NVidia's grace-hopper and grace-grace platforms. For PATCH[3], I also can help with the improvement if you don't have time for it. Please let me know. 1-2 are upstream go ahead and post the cleanup. Michael, a cleanup series has been sent for review. https://lore.kernel.org/virtualization/20240423032407.262329-1-gs...@redhat.com/T/#t Thanks, Gavin
[PATCH 4/4] vhost: Reformat vhost_{get, put}_user()
Reformat the macros to use tab as the terminator for each line so that it looks clean. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 60 +-- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index a3de9325175f..3be19877f9df 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1207,21 +1207,22 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, return __vhost_get_user_slow(vq, addr, size, type); } -#define vhost_put_user(vq, x, ptr) \ -({ \ - int ret; \ - if (!vq->iotlb) { \ - ret = __put_user(x, ptr); \ - } else { \ - __typeof__(ptr) to = \ +#define vhost_put_user(vq, x, ptr) \ +({ \ + int ret;\ + if (!vq->iotlb) { \ + ret = __put_user(x, ptr); \ + } else {\ + __typeof__(ptr) to =\ (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ - sizeof(*ptr), VHOST_ADDR_USED); \ - if (to != NULL) \ - ret = __put_user(x, to); \ - else \ - ret = -EFAULT; \ - } \ - ret; \ + sizeof(*ptr), \ + VHOST_ADDR_USED); \ + if (to != NULL) \ + ret = __put_user(x, to);\ + else\ + ret = -EFAULT; \ + } \ + ret;\ }) static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) @@ -1252,22 +1253,21 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) >used->idx); } -#define vhost_get_user(vq, x, ptr, type) \ -({ \ - int ret; \ - if (!vq->iotlb) { \ - ret = __get_user(x, ptr); \ - } else { \ - __typeof__(ptr) from = \ - (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ - sizeof(*ptr), \ - type); \ - if (from != NULL) \ - ret = __get_user(x, from); \ - else \ - ret = -EFAULT; \ - } \ - ret; \ +#define vhost_get_user(vq, x, ptr, type) \ +({ \ + int ret;\ + if (!vq->iotlb) { \ + ret = __get_user(x, ptr); \ + } else {\ + __typeof__(ptr) from = \ + (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ + sizeof(*ptr), type);\ + if (from != NULL) \ + ret = __get_user(x, from); \ + else\ + ret = -EFAULT; \ + } \ + ret;\ }) #define vhost_get_avail(vq, x, ptr) \ -- 2.44.0
[PATCH 3/4] vhost: Improve vhost_get_avail_head()
Improve vhost_get_avail_head() so that the head or errno is returned. With it, the relevant sanity checks are squeezed to vhost_get_avail_head() and vhost_get_vq_desc() is further simplified. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 43 +++ 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index b3adc0bc9e72..a3de9325175f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1320,11 +1320,27 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) return 0; } -static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, - __virtio16 *head, int idx) +static inline int vhost_get_avail_head(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *head, - >avail->ring[idx & (vq->num - 1)]); + __virtio16 head; + int r; + + r = vhost_get_avail(vq, head, + >avail->ring[vq->last_avail_idx & (vq->num - 1)]); + if (unlikely(r)) { + vq_err(vq, "Failed to read head: idx %u address %p\n", + vq->last_avail_idx, + >avail->ring[vq->last_avail_idx % vq->num]); + return r; + } + + r = vhost16_to_cpu(vq, head); + if (unlikely(r >= vq->num)) { + vq_err(vq, "Invalid head %d (%u)\n", r, vq->num); + return -EINVAL; + } + + return r; } static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, @@ -2522,7 +2538,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - __virtio16 ring_head; int ret, access; if (vq->avail_idx == vq->last_avail_idx) { @@ -2539,21 +2554,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. 
*/ - if (unlikely(vhost_get_avail_head(vq, _head, vq->last_avail_idx))) { - vq_err(vq, "Failed to read head: idx %d address %p\n", - vq->last_avail_idx, - >avail->ring[vq->last_avail_idx % vq->num]); - return -EFAULT; - } - - head = vhost16_to_cpu(vq, ring_head); - - /* If their number is silly, that's an error. */ - if (unlikely(head >= vq->num)) { - vq_err(vq, "Guest says index %u > %u is available", - head, vq->num); - return -EINVAL; - } + head = vhost_get_avail_head(vq); + if (unlikely(head < 0)) + return head; /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; -- 2.44.0
[PATCH 2/4] vhost: Improve vhost_get_avail_idx() with smp_rmb()
All the callers of vhost_get_avail_idx() are concerned to the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. With it, the callers needn't to worry about the memory barrier. No functional change intended. Suggested-by: Michael S. Tsirkin Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 91 --- 1 file changed, 34 insertions(+), 57 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index ef7942103232..b3adc0bc9e72 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,34 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(>vqs[i]->mutex); } -static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, - __virtio16 *idx) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *idx, >avail->idx); + __virtio16 avail_idx; + int r; + + r = vhost_get_avail(vq, avail_idx, >avail->idx); + if (unlikely(r)) { + vq_err(vq, "Failed to access avail idx at %p\n", + >avail->idx); + return r; + } + + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Ensure the available ring entry read happens +* before the avail_idx read when the avail_idx +* is advanced. +*/ + smp_rmb(); + } + + if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { + vq_err(vq, "Invalid avail index change from %u to %u", + vq->last_avail_idx, vq->avail_idx); + return -EINVAL; + } + + return 0; } static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, @@ -2498,35 +2522,19 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - __virtio16 avail_idx; __virtio16 ring_head; int ret, access; - /* Check it isn't doing very strange things with descriptor numbers. 
*/ if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail_idx(vq, _idx))) { - vq_err(vq, "Failed to access avail idx at %p\n", - >avail->idx); - return -EFAULT; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - - if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved avail index from %u to %u", - vq->last_avail_idx, vq->avail_idx); - return -EFAULT; - } + ret = vhost_get_avail_idx(vq); + if (unlikely(ret)) + return ret; /* If there's nothing new since last we looked, return * invalid. */ if (vq->avail_idx == vq->last_avail_idx) return vq->num; - - /* Only get avail ring entries after they have been -* exposed by guest. -*/ - smp_rmb(); } /* Grab the next descriptor number they're advertising, and increment @@ -2787,35 +2795,19 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); /* return true if we're sure that avaiable ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; - int r; - if (vq->avail_idx != vq->last_avail_idx) return false; - r = vhost_get_avail_idx(vq, _idx); - if (unlikely(r)) - return false; - - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - if (vq->avail_idx != vq->last_avail_idx) { - /* Since we have updated avail_idx, the following -* call to vhost_get_vq_desc() will read available -* ring entries. Make sure that read happens after -* the avail_idx read. -*/ - smp_rmb(); + if (unlikely(vhost_get_avail_idx(vq))) return false; - } - return true; + return vq->avail_idx == vq->last_avail_idx; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); /* OK, now we need to know about added descriptors. */ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r; if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) @@ -2839,25 +2831,10 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) /* They could have slipped one in as we were
[PATCH 1/4] vhost: Drop variable last_avail_idx in vhost_get_vq_desc()
The local variable @last_avail_idx is equivalent to vq->last_avail_idx. So the code can be simplified a bit by dropping the local variable @last_avail_idx. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 8995730ce0bf..ef7942103232 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2498,14 +2498,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx; __virtio16 avail_idx; __virtio16 ring_head; int ret, access; /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail_idx = vq->last_avail_idx; - if (vq->avail_idx == vq->last_avail_idx) { if (unlikely(vhost_get_avail_idx(vq, _idx))) { vq_err(vq, "Failed to access avail idx at %p\n", @@ -2514,16 +2511,16 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, } vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { + if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { vq_err(vq, "Guest moved avail index from %u to %u", - last_avail_idx, vq->avail_idx); + vq->last_avail_idx, vq->avail_idx); return -EFAULT; } /* If there's nothing new since last we looked, return * invalid. */ - if (vq->avail_idx == last_avail_idx) + if (vq->avail_idx == vq->last_avail_idx) return vq->num; /* Only get avail ring entries after they have been @@ -2534,10 +2531,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. 
*/ - if (unlikely(vhost_get_avail_head(vq, _head, last_avail_idx))) { + if (unlikely(vhost_get_avail_head(vq, _head, vq->last_avail_idx))) { vq_err(vq, "Failed to read head: idx %d address %p\n", - last_avail_idx, - >avail->ring[last_avail_idx % vq->num]); + vq->last_avail_idx, + >avail->ring[vq->last_avail_idx % vq->num]); return -EFAULT; } -- 2.44.0
[PATCH 0/4] vhost: Cleanup
This is suggested by Michael S. Tsirkin according to [1] and the goal is to apply smp_rmb() inside vhost_get_avail_idx() if needed. With it, the caller of the function needn't to worry about memory barriers. Since we're here, other cleanups are also applied. [1] https://lore.kernel.org/virtualization/20240327075940-mutt-send-email-...@kernel.org/ PATCH[1] drops the local variable @last_avail_idx since it's equivalent to vq->last_avail_idx PATCH[2] improves vhost_get_avail_idx() so that smp_rmb() is applied if needed. Besides, the sanity checks on the retrieved available queue index are also squeezed to vhost_get_avail_idx() PATCH[3] improves vhost_get_avail_head(), similar to what we're doing for vhost_get_avail_idx(), so that the relevant sanity checks on the head are squeezed to vhost_get_avail_head() PATCH[4] Reformat vhost_{get, put}_user() by using tab instead of space as the terminator for each line Gavin Shan (4): vhost: Drop variable last_avail_idx in vhost_get_vq_desc() vhost: Improve vhost_get_avail_idx() with smp_rmb() vhost: Improve vhost_get_avail_head() vhost: Reformat vhost_{get, put}_user() drivers/vhost/vhost.c | 199 +++--- 1 file changed, 88 insertions(+), 111 deletions(-) -- 2.44.0
Re: [PATCH v5 3/5] vduse: Add function to get/free the pages for reconnection
On Tue, Apr 23, 2024 at 4:05 AM Michael S. Tsirkin wrote: > > On Thu, Apr 18, 2024 at 08:57:51AM +0800, Jason Wang wrote: > > On Wed, Apr 17, 2024 at 5:29 PM Michael S. Tsirkin wrote: > > > > > > On Fri, Apr 12, 2024 at 09:28:23PM +0800, Cindy Lu wrote: > > > > Add the function vduse_alloc_reconnnect_info_mem > > > > and vduse_alloc_reconnnect_info_mem > > > > These functions allow vduse to allocate and free memory for reconnection > > > > information. The amount of memory allocated is vq_num pages. > > > > Each VQS will map its own page where the reconnection information will > > > > be saved > > > > > > > > Signed-off-by: Cindy Lu > > > > --- > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 40 ++ > > > > 1 file changed, 40 insertions(+) > > > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c > > > > b/drivers/vdpa/vdpa_user/vduse_dev.c > > > > index ef3c9681941e..2da659d5f4a8 100644 > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > > > @@ -65,6 +65,7 @@ struct vduse_virtqueue { > > > > int irq_effective_cpu; > > > > struct cpumask irq_affinity; > > > > struct kobject kobj; > > > > + unsigned long vdpa_reconnect_vaddr; > > > > }; > > > > > > > > struct vduse_dev; > > > > @@ -1105,6 +1106,38 @@ static void vduse_vq_update_effective_cpu(struct > > > > vduse_virtqueue *vq) > > > > > > > > vq->irq_effective_cpu = curr_cpu; > > > > } > > > > +static int vduse_alloc_reconnnect_info_mem(struct vduse_dev *dev) > > > > +{ > > > > + unsigned long vaddr = 0; > > > > + struct vduse_virtqueue *vq; > > > > + > > > > + for (int i = 0; i < dev->vq_num; i++) { > > > > + /*page 0~ vq_num save the reconnect info for vq*/ > > > > + vq = dev->vqs[i]; > > > > + vaddr = get_zeroed_page(GFP_KERNEL); > > > > > > > > > I don't get why you insist on stealing kernel memory for something > > > that is just used by userspace to store data for its own use. 
> > > Userspace does not lack ways to persist data, for example, > > > create a regular file anywhere in the filesystem. > > > > Good point. So the motivation here is to: > > > > 1) be self contained, no dependency for high speed persist data > > storage like tmpfs > > No idea what this means. I mean a regular file may slow down the datapath performance, so usually the application will try to use tmpfs and other which is a dependency for implementing the reconnection. > > > 2) standardize the format in uAPI which allows reconnection from > > arbitrary userspace, unfortunately, such effort was removed in new > > versions > > And I don't see why that has to live in the kernel tree either. I can't find a better place, any idea? Thanks > > > If the above doesn't make sense, we don't need to offer those pages by > > VDUSE. > > > > Thanks > > > > > > > > > > > > > > > > > + if (vaddr == 0) > > > > + return -ENOMEM; > > > > + > > > > + vq->vdpa_reconnect_vaddr = vaddr; > > > > + } > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +static int vduse_free_reconnnect_info_mem(struct vduse_dev *dev) > > > > +{ > > > > + struct vduse_virtqueue *vq; > > > > + > > > > + for (int i = 0; i < dev->vq_num; i++) { > > > > + vq = dev->vqs[i]; > > > > + > > > > + if (vq->vdpa_reconnect_vaddr) > > > > + free_page(vq->vdpa_reconnect_vaddr); > > > > + vq->vdpa_reconnect_vaddr = 0; > > > > + } > > > > + > > > > + return 0; > > > > +} > > > > > > > > static long vduse_dev_ioctl(struct file *file, unsigned int cmd, > > > > unsigned long arg) > > > > @@ -1672,6 +1705,8 @@ static int vduse_destroy_dev(char *name) > > > > mutex_unlock(>lock); > > > > return -EBUSY; > > > > } > > > > + vduse_free_reconnnect_info_mem(dev); > > > > + > > > > dev->connected = true; > > > > mutex_unlock(>lock); > > > > > > > > @@ -1855,12 +1890,17 @@ static int vduse_create_dev(struct > > > > vduse_dev_config *config, > > > > ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num); > > > > if (ret) > 
> > > goto err_vqs; > > > > + ret = vduse_alloc_reconnnect_info_mem(dev); > > > > + if (ret < 0) > > > > + goto err_mem; > > > > > > > > __module_get(THIS_MODULE); > > > > > > > > return 0; > > > > err_vqs: > > > > device_destroy(_class, MKDEV(MAJOR(vduse_major), > > > > dev->minor)); > > > > +err_mem: > > > > + vduse_free_reconnnect_info_mem(dev); > > > > err_dev: > > > > idr_remove(_idr, dev->minor); > > > > err_idr: > > > > -- > > > > 2.43.0 > > > >
Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent
On Tue, Apr 23, 2024 at 10:14 AM Jason Xing wrote: > > Hi Simon, > > On Tue, Apr 23, 2024 at 2:28 AM Simon Horman wrote: > > > > On Mon, Apr 22, 2024 at 11:01:03AM +0800, Jason Xing wrote: > > > > ... > > > > > diff --git a/include/net/rstreason.h b/include/net/rstreason.h > > > > ... > > > > > +/** > > > + * There are three parts in order: > > > + * 1) reset reason in MPTCP: only for MPTCP use > > > + * 2) skb drop reason: relying on drop reasons for such as passive reset > > > + * 3) independent reset reason: such as active reset reasons > > > + */ > > > > Hi Jason, > > > > A minor nit from my side. > > > > '/**' denotes the beginning of a Kernel doc, > > but other than that, this comment is not a Kernel doc. > > > > FWIIW, I would suggest providing a proper Kernel doc for enum sk_rst_reason. > > But another option would be to simply make this a normal comment, > > starting with "/* There are" > > Thanks Simon. I'm trying to use the kdoc way to make it right :) > > How about this one: > /** > * enum sk_rst_reason - the reasons of socket reset > * > * The reason of skb drop, which is used in DCCP/TCP/MPTCP protocols. s/skb drop/sk reset/ Sorry, I cannot withdraw my previous email in time. > * > * There are three parts in order: > * 1) skb drop reasons: relying on drop reasons for such as passive > reset > * 2) independent reset reasons: such as active reset reasons > * 3) reset reasons in MPTCP: only for MPTCP use > */ > ? > > I chose to mimic what enum skb_drop_reason does in the > include/net/dropreason-core.h file. > > > +enum sk_rst_reason { > > + /** > > +* Copy from include/uapi/linux/mptcp.h. > > +* These reset fields will not be changed since they adhere to > > +* RFC 8684. So do not touch them. I'm going to list each definition > > +* of them respectively. > > +*/ > > Thanks to you, I found another similar point where I smell something > wrong as in the above code. I'm going to replace '/**' with '/*' since > it's only a comment, not a kdoc. 
> > Thanks, > Jason
Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent
Hi Simon, On Tue, Apr 23, 2024 at 2:28 AM Simon Horman wrote: > > On Mon, Apr 22, 2024 at 11:01:03AM +0800, Jason Xing wrote: > > ... > > > diff --git a/include/net/rstreason.h b/include/net/rstreason.h > > ... > > > +/** > > + * There are three parts in order: > > + * 1) reset reason in MPTCP: only for MPTCP use > > + * 2) skb drop reason: relying on drop reasons for such as passive reset > > + * 3) independent reset reason: such as active reset reasons > > + */ > > Hi Jason, > > A minor nit from my side. > > '/**' denotes the beginning of a Kernel doc, > but other than that, this comment is not a Kernel doc. > > FWIIW, I would suggest providing a proper Kernel doc for enum sk_rst_reason. > But another option would be to simply make this a normal comment, > starting with "/* There are" Thanks Simon. I'm trying to use the kdoc way to make it right :) How about this one: /** * enum sk_rst_reason - the reasons of socket reset * * The reason of skb drop, which is used in DCCP/TCP/MPTCP protocols. * * There are three parts in order: * 1) skb drop reasons: relying on drop reasons for such as passive reset * 2) independent reset reasons: such as active reset reasons * 3) reset reasons in MPTCP: only for MPTCP use */ ? I chose to mimic what enum skb_drop_reason does in the include/net/dropreason-core.h file. > +enum sk_rst_reason { > + /** > +* Copy from include/uapi/linux/mptcp.h. > +* These reset fields will not be changed since they adhere to > +* RFC 8684. So do not touch them. I'm going to list each definition > +* of them respectively. > +*/ Thanks to you, I found another similar point where I smell something wrong as in the above code. I'm going to replace '/**' with '/*' since it's only a comment, not a kdoc. Thanks, Jason
回复: [PATCH v5] vp_vdpa: don't allocate unused msix vectors
On Wed, Apr 10, 2024 at 11:30:20AM +0800, lyx634449800 wrote: > From: Yuxue Liu > > When there is a ctlq and it doesn't require interrupt callbacks,the > original method of calculating vectors wastes hardware msi or msix > resources as well as system IRQ resources. > > When conducting performance testing using testpmd in the guest os, it > was found that the performance was lower compared to directly using > vfio-pci to passthrough the device > > In scenarios where the virtio device in the guest os does not utilize > interrupts, the vdpa driver still configures the hardware's msix > vector. Therefore, the hardware still sends interrupts to the host os. >I just have a question on this part. How come hardware sends interrupts does >not guest driver disable them? 1:Assuming the guest OS's Virtio device is using PMD mode, QEMU sets the call fd to -1 2:On the host side, the vhost_vdpa program will set vp_vdpa->vring[i].cb.callback to invalid 3:Before the modification, the vp_vdpa_request_irq function does not check whether vp_vdpa->vring[i].cb.callback is valid. Instead, it enables the hardware's MSIX interrupts based on the number of queues of the device - Original Message - From: Michael S. Tsirkin m...@redhat.com Sent: April 22, 2024 20:09 To: Gavin Liu gavin@jaguarmicro.com Cc: jasow...@redhat.com; Angus Chen angus.c...@jaguarmicro.com; virtualizat...@lists.linux.dev; xuanz...@linux.alibaba.com; linux-kernel@vger.kernel.org; Heng Qi hen...@linux.alibaba.com Subject: Re: [PATCH v5] vp_vdpa: don't allocate unused msix vectors External Mail: This email originated from OUTSIDE of the organization! Do not click links, open attachments or provide ANY information unless you recognize the sender and know the content is safe. 
On Wed, Apr 10, 2024 at 11:30:20AM +0800, lyx634449800 wrote: > From: Yuxue Liu > > When there is a ctlq and it doesn't require interrupt callbacks,the > original method of calculating vectors wastes hardware msi or msix > resources as well as system IRQ resources. > > When conducting performance testing using testpmd in the guest os, it > was found that the performance was lower compared to directly using > vfio-pci to passthrough the device > > In scenarios where the virtio device in the guest os does not utilize > interrupts, the vdpa driver still configures the hardware's msix > vector. Therefore, the hardware still sends interrupts to the host os. I just have a question on this part. How come hardware sends interrupts does not guest driver disable them? > Because of this unnecessary > action by the hardware, hardware performance decreases, and it also > affects the performance of the host os. > > Before modification:(interrupt mode) > 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 > 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 > 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config > > After modification:(interrupt mode) > 32: 0 0 1 7 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-0 > 33: 36 0 3 0 PCI-MSI 32769-edge vp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edge vp-vdpa[:00:02.0]-config > > Before modification:(virtio pmd mode for guest os) > 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 > 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 > 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config > > After modification:(virtio pmd mode for guest os) > 32: 0 0 0 0 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-config > > To verify the use of the virtio PMD mode in the guest operating > system, the following patch needs to be applied to QEMU: > https://lore.kernel.org/all/20240408073311.2049-1-yuxue.liu@jaguarmicr > o.com > > Signed-off-by: Yuxue Liu > Acked-by: 
Jason Wang > Reviewed-by: Heng Qi > --- > V5: modify the description of the printout when an exception occurs > V4: update the title and assign values to uninitialized variables > V3: delete unused variables and add validation records > V2: fix when allocating IRQs, scan all queues > > drivers/vdpa/virtio_pci/vp_vdpa.c | 22 -- > 1 file changed, 16 insertions(+), 6 deletions(-) > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c > b/drivers/vdpa/virtio_pci/vp_vdpa.c > index df5f4a3bccb5..8de0224e9ec2 100644 > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c > @@ -160,7 +160,13 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) > struct pci_dev *pdev = mdev->pci_dev; > int i, ret, irq; > int queues = vp_vdpa->queues; > - int vectors = queues + 1; > + int vectors = 1; > + int msix_vec = 0; > + > + for (i = 0; i < queues; i++) { > + if (vp_vdpa->vring[i].cb.callback) > +
Re: [syzbot] [virt?] [net?] KMSAN: uninit-value in vsock_assign_transport (2)
Hello, syzbot has tested the proposed patch and the reproducer did not trigger any issue: Reported-and-tested-by: syzbot+6c21aeb59d0e82eb2...@syzkaller.appspotmail.com Tested on: commit: bcc17a06 vhost/vsock: always initialize seqpacket_allow git tree: https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git console output: https://syzkaller.appspot.com/x/log.txt?x=12b58abb18 kernel config: https://syzkaller.appspot.com/x/.config?x=87a805e655619c64 dashboard link: https://syzkaller.appspot.com/bug?extid=6c21aeb59d0e82eb2782 compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 2.40 Note: no patches were applied. Note: testing is done by a robot and is best-effort only.
Re: [PATCH 1/1] genirq/cpuhotplug: retry with online CPUs on irq_do_set_affinity failure
On Mon, Apr 22 2024 at 16:09, Dongli Zhang wrote: > On 4/22/24 13:58, Thomas Gleixner wrote: >> On Thu, Apr 18 2024 at 18:33, Dongli Zhang wrote: > Would you mind suggesting if the below commit message is fine to you? > > > genirq/cpuhotplug: retry with cpu_online_mask when irq_do_set_affinity return > -ENOSPC > > When a CPU goes offline, the interrupts pinned to that CPU are > re-configured. > > Its managed interrupts undergo either migration to other CPUs or shutdown > if all CPUs listed in the affinity are offline. This patch doesn't affect > managed interrupts. > > For regular interrupts, they are migrated to other selected online CPUs. > The target CPUs are chosen from either desc->pending_mask (suppose > CONFIG_GENERIC_PENDING_IRQ) or d->common->affinity (suppose CONFIG_SMP). > The cpu_online_mask is used as target CPUs only when CPUs in both > desc->pending_mask and d->common->affinity are offline. > > However, there is a bad corner case, when desc->pending_mask or > d->common->affinity is selected as the target cpumask, but none of their > CPUs has any available vectors. Up to here it's fine. > As a result, an -ENOSPC error happens: > > "IRQ151: set affinity failed(-28)." > > This is from the debugfs. The allocation fails although other online CPUs > (except CPU=2) have many free vectors. The debugfs output is not really providing more information than the last sentence. It just occupies space :) > The steps to reproduce the issue are in [1]. The core idea is: > > 1. Create a KVM guest with many virtio-net PCI devices, each configured > with a very large number of queues/vectors. > > 2. Set the affinity of all virtio-net interrupts to "2,3". That makes absolutely no sense at all. :) But yes, I can see the non-real world problem with that. > For regular interrupts, if irq_do_set_affinity() returns -ENOSPC, retry it > with all online CPUs. 
The issue does not happen for managed interrupts > because the vectors are always reserved (in cm->managed_map) before the CPU > offline operation. > > [1] > https://lore.kernel.org/all/20240419013322.58500-1-dongli.zh...@oracle.com/ The reproduction instructions are just additional information and not necessarily change log material. So I'd just say after the above: > However, there is a bad corner case, when desc->pending_mask or > d->common->affinity is selected as the target cpumask, but none of their > CPUs has any available vectors. In this case the migration fails and the device interrupt becomes stale. This is not any different from the case where the affinity mask does not contain any online CPU, but there is no fallback operation for this. Instead of giving up retry the migration attempt with the online CPU mask if the interrupt is not managed as managed interrupts cannot be affected by this problem. Hmm? > I will change it to a single line. > > Would you mind suggesting which is preferred? !cpumask_equal(affinity, > cpu_online_mask) or (affinity != cpu_online_mask)? If at all you want !cpumask_subset(cpu_online_mask, affinity), but as this is a corner case 'affinity != cpu_online_mask' should be good enough. Thanks, tglx
Re: [PATCH v9 5/9] clk: mmp: Add Marvell PXA1908 clock driver
Quoting Duje Mihanović (2024-04-20 06:32:56) > On 4/20/24 00:24, Stephen Boyd wrote: > > Quoting Duje Mihanović (2024-04-19 07:31:14) > >> On Friday, April 12, 2024 4:57:09 AM GMT+2 Stephen Boyd wrote: > >>> Quoting Duje Mihanović (2024-04-11 03:15:34) > >>> > On 4/11/2024 10:00 AM, Stephen Boyd wrote: > > Is there a reason this file can't be a platform driver? > > Not that I know of, I did it like this only because the other in-tree > MMP clk drivers do so. I guess the initialization should look like any > of the qcom GCC drivers then? > >>> > >>> Yes. > >> > >> With the entire clock driver code in one file this is quite messy as I also > >> needed to add module_init and module_exit functions to (un)register each > >> platform driver, presumably because the module_platform_driver macro > >> doesn't > >> work with multiple platform drivers in one module. If I split up the driver > >> code for each clock controller block into its own file (such as clk-of- > >> pxa1908-apbc.c) as I believe is the best option, should the commits be > >> split > >> up accordingly as well? > > > > Sure. Why is 'of' in the name? Maybe that is unnecessary? > > That seems to be a historical leftover from when Marvell was just adding > DT support to the ARM32 MMP SoCs which Rob followed along with in the > PXA1928 clk driver and so have I. Should I drop it then as Marvell has > in the PXA1908 vendor kernel? > Sounds good to me.
Re: [PATCH 1/2] tracing/user_events: Fix non-spaced field matching
On Mon, 22 Apr 2024 14:55:25 -0700 Beau Belgrave wrote: > On Sat, Apr 20, 2024 at 09:50:52PM +0900, Masami Hiramatsu wrote: > > On Fri, 19 Apr 2024 14:13:34 -0700 > > Beau Belgrave wrote: > > > > > On Fri, Apr 19, 2024 at 11:33:05AM +0900, Masami Hiramatsu wrote: > > > > On Tue, 16 Apr 2024 22:41:01 + > > > > Beau Belgrave wrote: > > *SNIP* > > > > > nit: This loop can be simpler, because we are sure fixed has enough > > > > length; > > > > > > > > /* insert a space after ';' if there is no space. */ > > > > while(*args) { > > > > *pos = *args++; > > > > if (*pos++ == ';' && !isspace(*args)) > > > > *pos++ = ' '; > > > > } > > > > > > > > > > I was worried that if count_semis_no_space() ever had different logic > > > (maybe after this commit) that it could cause an overflow if the count > > > was wrong, etc. > > > > > > I don't have an issue making it shorter, but I was trying to be more on > > > the safe side, since this isn't a fast path (event register). > > > > OK, anyway current code looks correct. But note that I don't think > > "pos++; len--;" is safer, since it is not atomic. This pattern > > easily loose "len--;" in my experience. So please carefully use it ;) > > > > I'll stick with your loop. Perhaps others will chime in on the v2 and > state a stronger opinion. > > You scared me with the atomic comment, I went back and looked at all the > paths for this. In the user_events IOCTL the buffer is copied from user > to kernel, so it cannot change (and no other threads access it). I also > checked trace_parse_run_command() which is the same. So at least in this > context the non-atomic part is OK. Oh, sorry if I scared you. I've seen bugs get introduced into loops like this many times (while updating the code), so I try to keep it simple. I'm sure that your code has no bugs. Thank you, -- Masami Hiramatsu (Google)
Re: [PATCH 1/1] genirq/cpuhotplug: retry with online CPUs on irq_do_set_affinity failure
Hi Thomas, On 4/22/24 13:58, Thomas Gleixner wrote: > On Thu, Apr 18 2024 at 18:33, Dongli Zhang wrote: > >> When a CPU is offline, its IRQs may migrate to other CPUs. For managed >> IRQs, they are migrated, or shutdown (if all CPUs of the managed IRQ >> affinity are offline). For regular IRQs, there will only be a >> migration. > > Please write out interrupts. There is enough space for it and IRQ is > just not a regular word. I will use "interrupts". > >> The migrate_one_irq() first uses pending_mask or affinity_mask of the IRQ. >> >> 104 if (irq_fixup_move_pending(desc, true)) >> 105 affinity = irq_desc_get_pending_mask(desc); >> 106 else >> 107 affinity = irq_data_get_affinity_mask(d); >> >> The migrate_one_irq() may use all online CPUs, if all CPUs in >> pending_mask/affinity_mask are already offline. >> >> 113 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { >> 114 /* >> 115 * If the interrupt is managed, then shut it down and >> leave >> 116 * the affinity untouched. >> 117 */ >> 118 if (irqd_affinity_is_managed(d)) { >> 119 irqd_set_managed_shutdown(d); >> 120 irq_shutdown_and_deactivate(desc); >> 121 return false; >> 122 } >> 123 affinity = cpu_online_mask; >> 124 brokeaff = true; >> 125 } > > Please don't copy code into the change log. Describe the problem in > text. Would you mind suggesting if the below commit message is fine to you? genirq/cpuhotplug: retry with cpu_online_mask when irq_do_set_affinity return -ENOSPC When a CPU goes offline, the interrupts pinned to that CPU are re-configured. Its managed interrupts undergo either migration to other CPUs or shutdown if all CPUs listed in the affinity are offline. This patch doesn't affect managed interrupts. For regular interrupts, they are migrated to other selected online CPUs. The target CPUs are chosen from either desc->pending_mask (suppose CONFIG_GENERIC_PENDING_IRQ) or d->common->affinity (suppose CONFIG_SMP). 
The cpu_online_mask is used as target CPUs only when CPUs in both desc->pending_mask and d->common->affinity are offline. However, there is a bad corner case, when desc->pending_mask or d->common->affinity is selected as the target cpumask, but none of their CPUs has any available vectors. As a result, an -ENOSPC error happens: "IRQ151: set affinity failed(-28)." This is from the debugfs. The allocation fails although other online CPUs (except CPU=2) have many free vectors. name: VECTOR size: 0 mapped: 529 flags: 0x0103 Online bitmaps:7 Global available:884 Global reserved: 6 Total allocated: 539 System: 36: 0-19,21,50,128,236,243-244,246-255 | CPU | avl | man | mac | act | vectors 0 147 0 0 55 32-49,51-87 1 147 0 0 55 32-49,51-87 2 0 0 0 202 32-49,51-127,129-235 4 147 0 0 55 32-49,51-87 5 147 0 0 55 32-49,51-87 6 148 0 0 54 32-49,51-86 7 148 0 0 54 32-49,51-86 The steps to reproduce the issue are in [1]. The core idea is: 1. Create a KVM guest with many virtio-net PCI devices, each configured with a very large number of queues/vectors. 2. Set the affinity of all virtio-net interrupts to "2,3". 3. Offline many CPUs, excluding "2,3". 4. Offline CPU=2, and irq_do_set_affinity() returns -ENOSPC. For regular interrupts, if irq_do_set_affinity() returns -ENOSPC, retry it with all online CPUs. The issue does not happen for managed interrupts because the vectors are always reserved (in cm->managed_map) before the CPU offline operation. [1] https://lore.kernel.org/all/20240419013322.58500-1-dongli.zh...@oracle.com/ Cc: Joe Jin Signed-off-by: Dongli Zhang > >> However, there is a corner case. Although some CPUs in >> pending_mask/affinity_mask are still online, they are lack of available >> vectors. If the kernel continues calling irq_do_set_affinity() with those >> CPUs, >> there will be -ENOSPC error. >> >> This is not reasonable as other online CPUs still have many available >> vectors. > > Reasonable is not the question here. It's either correct or not. 
This has been re-written in the new commit message. > >> name: VECTOR >> size: 0 >> mapped: 529 >> flags: 0x0103 >> Online bitmaps:7 >> Global available:884 >> Global reserved: 6 >> Total allocated: 539 >> System: 36: 0-19,21,50,128,236,243-244,246-255 >> | CPU | avl | man | mac | act | vectors >> 0 147 0 0 55 32-49,51-87 >> 1 147 0 0 55 32-49,51-87 >> 2 0 0 0 202 32-49,51-127,129-235 > > Just out of curiosity. How did this end up with CPU2 completely occupied? The details are in the link:
Re: [PATCH 1/2] arm64: dts: qcom: pmi632: Add vibrator
On 4/18/24 12:03, Luca Weiss wrote: On Thu Apr 18, 2024 at 12:01 PM CEST, Konrad Dybcio wrote: On 18.04.2024 8:36 AM, Luca Weiss wrote: Add a node for the vibrator module found inside the PMI632. Signed-off-by: Luca Weiss --- Reviewed-by: Konrad Dybcio On a side note, this is a totally configuration-free peripheral that doesn't do anything crazy until manually configured. In the slow quest to be (hopefully) more sane about the defaults, should we keep them enabled by default? Bjorn? But many (most?) devices don't have a vibration motor connected to PMI632, some (like devboards) don't have anything, and other phones have a separate chip that controls the vibration motor. Enabling this by default would mean all devices with PMI632 would get an input device for the vibrator that probably doesn't work? Fair Konrad
Re: [PATCH v12 09/14] x86/sgx: Implement async reclamation for cgroup
On Mon, 2024-04-22 at 11:17 -0500, Haitao Huang wrote: > On Sun, 21 Apr 2024 19:22:27 -0500, Huang, Kai wrote: > > > On Fri, 2024-04-19 at 20:14 -0500, Haitao Huang wrote: > > > > > I think we can add support for "sgx_cgroup=disabled" in future if > > > indeed > > > > > needed. But just for init failure, no? > > > > > > > > > > > > > It's not about the commandline, which we can add in the future when > > > > needed. It's about we need to have a way to handle SGX cgroup being > > > > disabled at boot time nicely, because we already have a case where we > > > > need > > > > to do so. > > > > > > > > Your approach looks half-way to me, and is not future extendible. If > > > we > > > > choose to do it, do it right -- that is, we need a way to disable it > > > > completely in both kernel and userspace so that userspace won't be > > > able> to > > > > see it. > > > > > > That would need more changes in misc cgroup implementation to support > > > sgx-disable. Right now misc does not have separate files for different > > > resource types. So we can only block echo "sgx_epc..." to those > > > interfacefiles, can't really make files not visible. > > > > "won't be able to see" I mean "only for SGX EPC resource", but not the > > control files for the entire MISC cgroup. > > > > I replied at the beginning of the previous reply: > > > > " > > Given SGX EPC is just one type of MISC cgroup resources, we cannot just > > disable MISC cgroup as a whole. > > " > > > Sorry I missed this point. below. > > > You just need to set the SGX EPC "capacity" to 0 to disable SGX EPC. See > > the comment of @misc_res_capacity: > > > > * Miscellaneous resources capacity for the entire machine. 0 capacity > > * means resource is not initialized or not present in the host. > > > > IIUC I don't think the situation we have is either of those cases. For our > case, resource is inited and present on the host but we have allocation > error for sgx cgroup infra. 
You have calculated the "capacity", but later you failed something and then reset the "capacity" to 0, i.e., cleanup. What's wrong with that? > > > And "blocking echo sgx_epc ... to those control files" is already > > sufficient for the purpose of not exposing SGX EPC to userspace, correct? > > > > E.g., if SGX cgroup is enabled, you can see below when you read "max": > > > > # cat /sys/fs/cgroup/my_group/misc.max > > # > >sgx_epc ... > >... > > > > Otherwise you won't be able to see "sgx_epc": > > > > # cat /sys/fs/cgroup/my_group/misc.max > > # > >... > > > > And when you try to write the "max" for "sgx_epc", you will hit error: > > > > # echo "sgx_epc 100" > /sys/fs/cgroup/my_group/misc.max > > # ... echo: write error: Invalid argument > > > > The above applies to all the control files. To me this is pretty much > > means "SGX EPC is disabled" or "not supported" for userspace. > > > You are right, capacity == 0 does block echoing max and users see an error > if they do that. But 1) doubt you literately wanted "SGX EPC is disabled" > and make it unsupported in this case, > I don't understand. Something failed during SGX cgroup initialization, you _literally_ cannot continue to support it. > 2) even if we accept this is "sgx > cgroup disabled" I don't see how it is much better user experience than > current solution or really helps user better. In your way, the userspace is still able to see "sgx_epc" in control files and is able to update them. So from userspace's perspective SGX cgroup is enabled, but obviously updating to "max" doesn't have any impact. This will confuse userspace. > > Also to implement this approach, as you mentioned, we need workaround the > fact that misc_try_charge() fails when capacity set to zero, and adding > code to return root always? > Why this is a problem? > So it seems like more workaround code to just > make it work for a failing case no one really care much and end result is > not really much better IMHO. 
It's not workaround, it's the right thing to do. The result is userspace will see it being disabled when kernel disables it.
Re: Please create the email alias do-not-apply-to-sta...@kernel.org -> /dev/null
Em Tue, 23 Apr 2024 00:04:01 +0200 Greg KH escreveu: > On Mon, Apr 22, 2024 at 10:46:37PM +0100, Mauro Carvalho Chehab wrote: > > Em Mon, 22 Apr 2024 15:25:18 -0400 > > Konstantin Ryabitsev escreveu: > > > > > On Mon, Apr 22, 2024 at 05:49:29PM +0200, Thorsten Leemhuis wrote: > > > > @Greg, BTW: should this be stable+noauto...@kernel.org or have a > > > > 'vger.' > > > > > > No vger, just stable+whate...@kernel.org. > > > > > > > in it, e.g. stable+noauto...@vger.kernel.org? I assume without 'vger.' > > > > is fine, just wanted to be sure, as > > > > Documentation/process/stable-kernel-rules.rst in all other cases > > > > specifies sta...@vger.kernel.org, so people are likely to get confused. > > > > :-/ #sigh > > > > > > These serve two different purposes: > > > > > > sta...@kernel.org (goes into devnull) > > > sta...@vger.kernel.org (actual mailing list) > > > > > > Confusion happens all the time, unfortunately. > > > > Yeah, I did already used sta...@kernel.org a few times in the > > past. > > > > IMO, the best would be either for stable to also accept it or for > > kernel.org mail server to return an error message (only to the > > submitter) warning about the invalid address, eventually with a > > hint message pointing to the correct value. > > sta...@kernel.org is there to route to /dev/null on purpose so that > developers/maintainers who only want their patches to get picked up when > they hit Linus's tree, will have happen and not notify anyone else. > This is especially good when dealing with security-related things as we > have had MANY people accidentally leak patches way too early by having > cc: sta...@vger.kernel.org in their signed-off-by areas, and forgetting > to tell git send-email to suppress cc: when sending them out for > internal review. Nice! didn't know about that. On a quick check, the only place at documentation mentioning it without vger is at checkpatch.rst. Perhaps it would make sense to document that as well. 
> Having that bounce would just be noisy for the developers involved. > > thanks, > > greg k-h
Re: Please create the email alias do-not-apply-to-sta...@kernel.org -> /dev/null
On Mon, Apr 22, 2024 at 10:46:37PM +0100, Mauro Carvalho Chehab wrote: > Em Mon, 22 Apr 2024 15:25:18 -0400 > Konstantin Ryabitsev escreveu: > > > On Mon, Apr 22, 2024 at 05:49:29PM +0200, Thorsten Leemhuis wrote: > > > @Greg, BTW: should this be stable+noauto...@kernel.org or have a > > > 'vger.' > > > > No vger, just stable+whate...@kernel.org. > > > > > in it, e.g. stable+noauto...@vger.kernel.org? I assume without 'vger.' > > > is fine, just wanted to be sure, as > > > Documentation/process/stable-kernel-rules.rst in all other cases > > > specifies sta...@vger.kernel.org, so people are likely to get confused. > > > :-/ #sigh > > > > These serve two different purposes: > > > > sta...@kernel.org (goes into devnull) > > sta...@vger.kernel.org (actual mailing list) > > > > Confusion happens all the time, unfortunately. > > Yeah, I did already used sta...@kernel.org a few times in the > past. > > IMO, the best would be either for stable to also accept it or for > kernel.org mail server to return an error message (only to the > submitter) warning about the invalid address, eventually with a > hint message pointing to the correct value. sta...@kernel.org is there to route to /dev/null on purpose so that developers/maintainers who only want their patches to get picked up when they hit Linus's tree, will have happen and not notify anyone else. This is especially good when dealing with security-related things as we have had MANY people accidentally leak patches way too early by having cc: sta...@vger.kernel.org in their signed-off-by areas, and forgetting to tell git send-email to suppress cc: when sending them out for internal review. Having that bounce would just be noisy for the developers involved. thanks, greg k-h
Re: [PATCH 1/2] tracing/user_events: Fix non-spaced field matching
On Sat, Apr 20, 2024 at 09:50:52PM +0900, Masami Hiramatsu wrote: > On Fri, 19 Apr 2024 14:13:34 -0700 > Beau Belgrave wrote: > > > On Fri, Apr 19, 2024 at 11:33:05AM +0900, Masami Hiramatsu wrote: > > > On Tue, 16 Apr 2024 22:41:01 + > > > Beau Belgrave wrote: *SNIP* > > > nit: This loop can be simpler, because we are sure fixed has enough > > > length; > > > > > > /* insert a space after ';' if there is no space. */ > > > while(*args) { > > > *pos = *args++; > > > if (*pos++ == ';' && !isspace(*args)) > > > *pos++ = ' '; > > > } > > > > > > > I was worried that if count_semis_no_space() ever had different logic > > (maybe after this commit) that it could cause an overflow if the count > > was wrong, etc. > > > > I don't have an issue making it shorter, but I was trying to be more on > > the safe side, since this isn't a fast path (event register). > > OK, anyway current code looks correct. But note that I don't think > "pos++; len--;" is safer, since it is not atomic. This pattern > easily loose "len--;" in my experience. So please carefully use it ;) > I'll stick with your loop. Perhaps others will chime in on the v2 and state a stronger opinion. You scared me with the atomic comment, I went back and looked at all the paths for this. In the user_events IOCTL the buffer is copied from user to kernel, so it cannot change (and no other threads access it). I also checked trace_parse_run_command() which is the same. So at least in this context the non-atomic part is OK. > > > > > > + > > > > + /* > > > > +* len is the length of the copy excluding the null. > > > > +* This ensures we always have room for a null. 
> > > > +*/ > > > > + *pos = '\0'; > > > > + > > > > + return fixed; > > > > +} > > > > + > > > > +static char **user_event_argv_split(char *args, int *argc) > > > > +{ > > > > + /* Count how many ';' without a trailing space */ > > > > + int count = count_semis_no_space(args); > > > > + > > > > + if (count) { > > > > > > nit: it is better to exit fast, so > > > > > > if (!count) > > > return argv_split(GFP_KERNEL, args, argc); > > > > > > ... > > > > Sure, will fix in a v2. > > > > > > > > Thank you, > > > > > > OT: BTW, can this also simplify synthetic events? > > > > > > > I'm not sure, I'll check when I have some time. I want to get this fix > > in sooner rather than later. > > Ah, nevermind. Synthetic event parses the field by strsep(';') first > and argv_split(). So it does not have this issue. > Ok, seems unrelated. Thanks for checking. Thanks, -Beau > Thank you, > > > > > Thanks, > > -Beau > > *SNIP* > > > Masami Hiramatsu (Google) > > > -- > Masami Hiramatsu (Google)
Re: Please create the email alias do-not-apply-to-sta...@kernel.org -> /dev/null
Em Mon, 22 Apr 2024 15:25:18 -0400 Konstantin Ryabitsev escreveu: > On Mon, Apr 22, 2024 at 05:49:29PM +0200, Thorsten Leemhuis wrote: > > @Greg, BTW: should this be stable+noauto...@kernel.org or have a > > 'vger.' > > No vger, just stable+whate...@kernel.org. > > > in it, e.g. stable+noauto...@vger.kernel.org? I assume without 'vger.' > > is fine, just wanted to be sure, as > > Documentation/process/stable-kernel-rules.rst in all other cases > > specifies sta...@vger.kernel.org, so people are likely to get confused. > > :-/ #sigh > > These serve two different purposes: > > sta...@kernel.org (goes into devnull) > > sta...@vger.kernel.org (actual mailing list) > > Confusion happens all the time, unfortunately. Yeah, I have already used sta...@kernel.org a few times in the past. IMO, the best would be either for stable to also accept it or for kernel.org mail server to return an error message (only to the submitter) warning about the invalid address, eventually with a hint message pointing to the correct value. > > Notably, even if someone uses stable+noauto...@vger.kernel.org, it won't > do anything terrible (it won't bounce, it'll just quietly go into > nowhere because that's not a valid expansion command). > > -K >
Re: [PATCH v2 0/6] virtiofs: fix the warning for ITER_KVEC dio
On 4/22/24 22:06, Michael S. Tsirkin wrote: > On Tue, Apr 09, 2024 at 09:48:08AM +0800, Hou Tao wrote: >> Hi, >> >> On 4/8/2024 3:45 PM, Michael S. Tsirkin wrote: >>> On Wed, Feb 28, 2024 at 10:41:20PM +0800, Hou Tao wrote: From: Hou Tao Hi, The patch set aims to fix the warning related to an abnormal size parameter of kmalloc() in virtiofs. The warning occurred when attempting to insert a 10MB sized kernel module kept in a virtiofs with cache disabled. As analyzed in patch #1, the root cause is that the length of the read buffer is no limited, and the read buffer is passed directly to virtiofs through out_args[0].value. Therefore patch #1 limits the length of the read buffer passed to virtiofs by using max_pages. However it is not enough, because now the maximal value of max_pages is 256. Consequently, when reading a 10MB-sized kernel module, the length of the bounce buffer in virtiofs will be 40 + (256 * 4096), and kmalloc will try to allocate 2MB from memory subsystem. The request for 2MB of physically contiguous memory significantly stress the memory subsystem and may fail indefinitely on hosts with fragmented memory. To address this, patch #2~#5 use scattered pages in a bio_vec to replace the kmalloc-allocated bounce buffer when the length of the bounce buffer for KVEC_ITER dio is larger than PAGE_SIZE. The final issue with the allocation of the bounce buffer and sg array in virtiofs is that GFP_ATOMIC is used even when the allocation occurs in a kworker context. Therefore the last patch uses GFP_NOFS for the allocation of both sg array and bounce buffer when initiated by the kworker. For more details, please check the individual patches. As usual, comments are always welcome. Change Log: >>> Bernd should I just merge the patchset as is? >>> It seems to fix a real problem and no one has the >>> time to work on a better fix WDYT? >> >> Sorry for the long delay. I am just start to prepare for v3. 
In v3, I >> plan to avoid the unnecessary memory copy between fuse args and bio_vec. >> Will post it before next week. > > Didn't happen before this week apparently. Hi Michael, sorry for my late reply, I had been totally busy for the last weeks as well. Also I can't decide to merge it - I'm not the official fuse maintainer... From my point of view, patch 1 is just missing to set the actual limit and then would be a clear and easily back-portable bug fix. Not promised, I will try it out if I find a bit time tomorrow. Bernd
Re: [PATCH 1/1] genirq/cpuhotplug: retry with online CPUs on irq_do_set_affinity failure
On Thu, Apr 18 2024 at 18:33, Dongli Zhang wrote: > When a CPU is offline, its IRQs may migrate to other CPUs. For managed > IRQs, they are migrated, or shutdown (if all CPUs of the managed IRQ > affinity are offline). For regular IRQs, there will only be a > migration. Please write out interrupts. There is enough space for it and IRQ is just not a regular word. > The migrate_one_irq() first uses pending_mask or affinity_mask of the IRQ. > > 104 if (irq_fixup_move_pending(desc, true)) > 105 affinity = irq_desc_get_pending_mask(desc); > 106 else > 107 affinity = irq_data_get_affinity_mask(d); > > The migrate_one_irq() may use all online CPUs, if all CPUs in > pending_mask/affinity_mask are already offline. > > 113 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { > 114 /* > 115 * If the interrupt is managed, then shut it down and > leave > 116 * the affinity untouched. > 117 */ > 118 if (irqd_affinity_is_managed(d)) { > 119 irqd_set_managed_shutdown(d); > 120 irq_shutdown_and_deactivate(desc); > 121 return false; > 122 } > 123 affinity = cpu_online_mask; > 124 brokeaff = true; > 125 } Please don't copy code into the change log. Describe the problem in text. > However, there is a corner case. Although some CPUs in > pending_mask/affinity_mask are still online, they are lack of available > vectors. If the kernel continues calling irq_do_set_affinity() with those > CPUs, > there will be -ENOSPC error. > > This is not reasonable as other online CPUs still have many available > vectors. Reasonable is not the question here. It's either correct or not. > name: VECTOR > size: 0 > mapped: 529 > flags: 0x0103 > Online bitmaps:7 > Global available:884 > Global reserved: 6 > Total allocated: 539 > System: 36: 0-19,21,50,128,236,243-244,246-255 > | CPU | avl | man | mac | act | vectors > 0 147 0 0 55 32-49,51-87 > 1 147 0 0 55 32-49,51-87 > 2 0 0 0 202 32-49,51-127,129-235 Just ouf of curiousity. How did this end up with CPU2 completely occupied? 
> 4 147 0 0 55 32-49,51-87 > 5 147 0 0 55 32-49,51-87 > 6 148 0 0 54 32-49,51-86 > 7 148 0 0 54 32-49,51-86 > > This issue should not happen for managed IRQs because the vectors are already > reserved before CPU hotplug. Should not? It either does or it does not. > For regular IRQs, do a re-try with all online > CPUs if the prior irq_do_set_affinity() is failed with -ENOSPC. > > Cc: Joe Jin > Signed-off-by: Dongli Zhang > --- > kernel/irq/cpuhotplug.c | 13 + > 1 file changed, 13 insertions(+) > > diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c > index 1ed2b1739363..d1666a6b73f4 100644 > --- a/kernel/irq/cpuhotplug.c > +++ b/kernel/irq/cpuhotplug.c > @@ -130,6 +130,19 @@ static bool migrate_one_irq(struct irq_desc *desc) >* CPU. >*/ > err = irq_do_set_affinity(d, affinity, false); > + > + if (err == -ENOSPC && > + !irqd_affinity_is_managed(d) && > + affinity != cpu_online_mask) { This really wants to be a single line conditional. > + affinity = cpu_online_mask; > + brokeaff = true; > + > + pr_debug("IRQ%u: set affinity failed for %*pbl, re-try with all > online CPUs\n", > + d->irq, cpumask_pr_args(affinity)); How is it useful to print cpu_online_mask here? Thanks, tglx
Re: [PATCH 0/3] Improve memory statistics for virtio balloon
On Thu, Apr 18, 2024 at 02:25:59PM +0800, zhenwei pi wrote: > RFC -> v1: > - several text changes: oom-kill -> oom-kills, SCAN_ASYNC -> ASYN_SCAN. > - move vm events codes into '#ifdef CONFIG_VM_EVENT_COUNTERS' > > RFC version: > Link: > https://lore.kernel.org/lkml/20240415084113.1203428-1-pizhen...@bytedance.com/T/#m1898963b3c27a989b1123db475135c3ca687ca84 Make sure this builds without introducing new warnings please. > zhenwei pi (3): > virtio_balloon: introduce oom-kill invocations > virtio_balloon: introduce memory allocation stall counter > virtio_balloon: introduce memory scan/reclaim info > > drivers/virtio/virtio_balloon.c | 30 - > include/uapi/linux/virtio_balloon.h | 16 +-- > 2 files changed, 43 insertions(+), 3 deletions(-) > > -- > 2.34.1
Re: [RFC][PATCH] uprobe: support for private hugetlb mappings
On 22 Apr 20:59, David Hildenbrand wrote: > > The benefit - to me - is very clear. People do use hugetlb mappings to > > run code in production environments. The perf benefits are there for some > > workloads. Intel has published a whitepaper about it etc. > > Uprobes are a very good tool to do live tracing. If you can restart the > > process and reproduce, you should be able to disable hugetlb remapping > > but if you need to look at a live process, there are not many options. > > Not being able to use uprobes is crippling. > > Please add all that as motivation to the patch description or cover letter. > > > > Yes, libhugetlbfs exists. But why do we have to support uprobes with it? > > > Nobody cared until now, why care now? > > > > I think you could ask the same question for every new feature patch :) > > I have to, because it usually indicates a lack of motivation in the > cover-letter/patch description :P My cover letter was indeed lacking. I will make sure to add this kind of details next time. > > Since the removal a few releases ago of the __morecore() hook in glibc, > > the main feature of libhugetlbfs is ELF segments remapping. I think > > there are definitely a lot of users that simply deal with this > > unnecessary limitation. > > > > I am certainly not shoving this patch through anyone's throat if there > > is no interest. But we definitely find it a very useful feature ... > > Let me try to see if we can get this done cleaner. > > One ugly part (in general here) is the custom page replacement in the > registration part. > > We are guaranteed to have a MAP_PRIVATE mapping. Instead of replacing pages > ourselves (which we likely shouldn't do ...) ... maybe we could use > FAULT_FLAG_UNSHARE faults such that we will get an anonymous folio > populated. (like KSM does nowadays) > > Punching FOLL_PIN|FOLL_LONGTERM into GUP would achieve the same thing, but > using FOLL_WRITE would not work on many file systems. 
So maybe we have to > trigger an unsharing fault ourselves. > > That would do the page replacement for us and we "should" be able to lookup > an anonymous folio that we can then just modify, like ptrace would. > > But then, there is also unregistration part, with weird conditional page > replacement. Zapping the anon page if the content matches the content of the > original page is one thing. But why are we placing an existing anonymous > page by a new anonymous page when the content from the original page differs > (but matches the one from the just copied page?)? > > I'll have to further think about that one. It's all a bit nasty. Sounds good to me. I am willing to help with the code when you have a plan or testing as you see fit. Let me know. > One thing to note is that hugetlb folios don't grow on trees. Likely, Many > setups *don't* reserve extra hugetlb folios and you might just easily be > running out of free hugetlb folios that you can use to break COW here > (replace a file hugetlb by a fresh anon hugetlb page). Likely it's easy to > make register or unregister fail. Agreed. -- Guillaume Morin
Re: [PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()
On Mon, Apr 08, 2024 at 02:15:24PM +1000, Gavin Shan wrote: > Hi Michael, > > On 3/30/24 19:02, Gavin Shan wrote: > > On 3/28/24 19:31, Michael S. Tsirkin wrote: > > > On Thu, Mar 28, 2024 at 10:21:49AM +1000, Gavin Shan wrote: > > > > All the callers of vhost_get_avail_idx() are concerned to the memory > > > > barrier, imposed by smp_rmb() to ensure the order of the available > > > > ring entry read and avail_idx read. > > > > > > > > Improve vhost_get_avail_idx() so that smp_rmb() is executed when > > > > the avail_idx is advanced. With it, the callers needn't to worry > > > > about the memory barrier. > > > > > > > > Suggested-by: Michael S. Tsirkin > > > > Signed-off-by: Gavin Shan > > > > > > Previous patches are ok. This one I feel needs more work - > > > first more code such as sanity checking should go into > > > this function, second there's actually a difference > > > between comparing to last_avail_idx and just comparing > > > to the previous value of avail_idx. > > > I will pick patches 1-2 and post a cleanup on top so you can > > > take a look, ok? > > > > > > > Thanks, Michael. It's fine to me. > > > > A kindly ping. > > If it's ok to you, could you please merge PATCH[1-2]? Our downstream > 9.4 need the fixes, especially for NVidia's grace-hopper and grace-grace > platforms. > > For PATCH[3], I also can help with the improvement if you don't have time > for it. Please let me know. > > Thanks, > Gavin 1-2 are upstream go ahead and post the cleanup. -- MST
Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions
On Mon, Apr 22, 2024 at 08:27:17PM +0200, David Hildenbrand wrote: > On 22.04.24 20:20, Vincent Donnefort wrote: > > Hi David, > > > > Thanks for having a look, very much appreciated! > > > > On Mon, Apr 22, 2024 at 11:27:11AM +0200, David Hildenbrand wrote: > > > On 19.04.24 20:25, David Hildenbrand wrote: > > > > On 06.04.24 19:36, Vincent Donnefort wrote: > > > > > In preparation for allowing the user-space to map a ring-buffer, add > > > > > a set of mapping functions: > > > > > > > > > > ring_buffer_{map,unmap}() > > > > > > > > > > And controls on the ring-buffer: > > > > > > > > > > ring_buffer_map_get_reader() /* swap reader and head */ > > > > > > > > > > Mapping the ring-buffer also involves: > > > > > > > > > > A unique ID for each subbuf of the ring-buffer, currently they > > > > > are > > > > > only identified through their in-kernel VA. > > > > > > > > > > A meta-page, where are stored ring-buffer statistics and a > > > > > description for the current reader > > > > > > > > > > The linear mapping exposes the meta-page, and each subbuf of the > > > > > ring-buffer, ordered following their unique ID, assigned during the > > > > > first mapping. > > > > > > > > > > Once mapped, no subbuf can get in or out of the ring-buffer: the > > > > > buffer > > > > > size will remain unmodified and the splice enabling functions will in > > > > > reality simply memcpy the data instead of swapping subbufs. 
> > > > > > > > > > CC: > > > > > Signed-off-by: Vincent Donnefort > > > > > > > > > > diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h > > > > > index dc5ae4e96aee..96d2140b471e 100644 > > > > > --- a/include/linux/ring_buffer.h > > > > > +++ b/include/linux/ring_buffer.h > > > > > @@ -6,6 +6,8 @@ > > > > > #include > > > > > #include > > > > > +#include > > > > > + > > > > > struct trace_buffer; > > > > > struct ring_buffer_iter; > > > > > @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct > > > > > hlist_node *node); > > > > > #define trace_rb_cpu_prepare NULL > > > > > #endif > > > > > +int ring_buffer_map(struct trace_buffer *buffer, int cpu, > > > > > + struct vm_area_struct *vma); > > > > > +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); > > > > > +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); > > > > > #endif /* _LINUX_RING_BUFFER_H */ > > > > > diff --git a/include/uapi/linux/trace_mmap.h > > > > > b/include/uapi/linux/trace_mmap.h > > > > > new file mode 100644 > > > > > index ..ffcd8dfcaa4f > > > > > --- /dev/null > > > > > +++ b/include/uapi/linux/trace_mmap.h > > > > > @@ -0,0 +1,46 @@ > > > > > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > > > > > +#ifndef _TRACE_MMAP_H_ > > > > > +#define _TRACE_MMAP_H_ > > > > > + > > > > > +#include > > > > > + > > > > > +/** > > > > > + * struct trace_buffer_meta - Ring-buffer Meta-page description > > > > > + * @meta_page_size: Size of this meta-page. > > > > > + * @meta_struct_len: Size of this structure. > > > > > + * @subbuf_size: Size of each sub-buffer. > > > > > + * @nr_subbufs: Number of subbfs in the ring-buffer, > > > > > including the reader. > > > > > + * @reader.lost_events: Number of events lost at the time of > > > > > the reader swap. > > > > > + * @reader.id: subbuf ID of the current reader. 
ID > > > > > range [0 : @nr_subbufs - 1] > > > > > + * @reader.read: Number of bytes read on the reader subbuf. > > > > > + * @flags: Placeholder for now, 0 until new features are > > > > > supported. > > > > > + * @entries: Number of entries in the ring-buffer. > > > > > + * @overrun: Number of entries lost in the ring-buffer. > > > > > + * @read:Number of entries that have been read. > > > > > + * @Reserved1: Reserved for future use. > > > > > + * @Reserved2: Reserved for future use. > > > > > + */ > > > > > +struct trace_buffer_meta { > > > > > + __u32 meta_page_size; > > > > > + __u32 meta_struct_len; > > > > > + > > > > > + __u32 subbuf_size; > > > > > + __u32 nr_subbufs; > > > > > + > > > > > + struct { > > > > > + __u64 lost_events; > > > > > + __u32 id; > > > > > + __u32 read; > > > > > + } reader; > > > > > + > > > > > + __u64 flags; > > > > > + > > > > > + __u64 entries; > > > > > + __u64 overrun; > > > > > + __u64 read; > > > > > + > > > > > + __u64 Reserved1; > > > > > + __u64 Reserved2; > > > > > +}; > > > > > + > > > > > +#endif /* _TRACE_MMAP_H_ */ > > > > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > > > > > index cc9ebe593571..793ecc454039 100644 > > > > > --- a/kernel/trace/ring_buffer.c > > > > > +++ b/kernel/trace/ring_buffer.c > > > > > @@ -9,6
Re: [PATCHv3 bpf-next 0/7] uprobe: uretprobe speed up
On Tue, Apr 23, 2024 at 12:09:43AM +0900, Masami Hiramatsu wrote: > Hi Jiri, > > On Sun, 21 Apr 2024 21:41:59 +0200 > Jiri Olsa wrote: > > > hi, > > as part of the effort on speeding up the uprobes [0] coming with > > return uprobe optimization by using syscall instead of the trap > > on the uretprobe trampoline. > > > > The speed up depends on instruction type that uprobe is installed > > and depends on specific HW type, please check patch 1 for details. > > > > Patches 1-6 are based on bpf-next/master, but path 1 and 2 are > > apply-able on linux-trace.git tree probes/for-next branch. > > Patch 7 is based on man-pages master. > > Thanks for updated! I reviewed the series and just except for the > manpage, it looks good to me. > > Reviewed-by: Masami Hiramatsu (Google) > > for the series. > If Linux API maintainers are OK, I can pick this in probes/for-next. great, thanks > (BTW, who will pick the manpage patch?) ugh, I cc-ed linux-api but not linux-...@vger.kernel.org I'll add that for new version jirka > > Thank you, > > > > > v3 changes: > > - added source ip check if the uretprobe syscall is called from > > trampoline and sending SIGILL to process if it's not > > - keep x86 compat process to use standard breakpoint > > - split syscall wiring into separate change > > - ran ltp and syzkaller locally, no issues found [Masami] > > - building uprobe_compat binary in selftests which breaks > > CI atm because of missing 32-bit delve packages, I will > > need to fix that in separate changes once this is acked > > - added man page change > > - there were several changes so I removed acks [Oleg Andrii] > > > > Also available at: > > https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git > > uretprobe_syscall > > > > thanks, > > jirka > > > > > > Notes to check list items in Documentation/process/adding-syscalls.rst: > > > > - System Call Alternatives > > New syscall seems like the best way in here, becase we need > > just to quickly enter kernel with no extra 
arguments processing, > > which we'd need to do if we decided to use another syscall. > > > > - Designing the API: Planning for Extension > > The uretprobe syscall is very specific and most likely won't be > > extended in the future. > > > > At the moment it does not take any arguments and even if it does > > in future, it's allowed to be called only from trampoline prepared > > by kernel, so there'll be no broken user. > > > > - Designing the API: Other Considerations > > N/A because uretprobe syscall does not return reference to kernel > > object. > > > > - Proposing the API > > Wiring up of the uretprobe system call si in separate change, > > selftests and man page changes are part of the patchset. > > > > - Generic System Call Implementation > > There's no CONFIG option for the new functionality because it > > keeps the same behaviour from the user POV. > > > > - x86 System Call Implementation > > It's 64-bit syscall only. > > > > - Compatibility System Calls (Generic) > > N/A uretprobe syscall has no arguments and is not supported > > for compat processes. > > > > - Compatibility System Calls (x86) > > N/A uretprobe syscall is not supported for compat processes. > > > > - System Calls Returning Elsewhere > > N/A. > > > > - Other Details > > N/A. > > > > - Testing > > Adding new bpf selftests and ran ltp on top of this change. > > > > - Man Page > > Attached. > > > > - Do not call System Calls in the Kernel > > N/A. 
> > > > > > [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/ > > --- > > Jiri Olsa (6): > > uprobe: Wire up uretprobe system call > > uprobe: Add uretprobe syscall to speed up return probe > > selftests/bpf: Add uretprobe syscall test for regs integrity > > selftests/bpf: Add uretprobe syscall test for regs changes > > selftests/bpf: Add uretprobe syscall call from user space test > > selftests/bpf: Add uretprobe compat test > > > > arch/x86/entry/syscalls/syscall_64.tbl| 1 + > > arch/x86/kernel/uprobes.c | 115 > > ++ > > include/linux/syscalls.h | 2 + > > include/linux/uprobes.h | 3 + > > include/uapi/asm-generic/unistd.h | 5 +- > > kernel/events/uprobes.c | 24 +-- > > kernel/sys_ni.c | 2 + > > tools/include/linux/compiler.h| 4 ++ > > tools/testing/selftests/bpf/.gitignore| 1 + > > tools/testing/selftests/bpf/Makefile | 6 +- > > tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 123 > > +++- > > tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 362 > >
Re: [PATCH 7/7] man2: Add uretprobe syscall page
On Tue, Apr 23, 2024 at 12:07:29AM +0900, Masami Hiramatsu wrote: > On Sun, 21 Apr 2024 21:42:06 +0200 > Jiri Olsa wrote: > > > Adding man page for new uretprobe syscall. > > > > Signed-off-by: Jiri Olsa > > --- > > man2/uretprobe.2 | 40 > > 1 file changed, 40 insertions(+) > > create mode 100644 man2/uretprobe.2 > > > > diff --git a/man2/uretprobe.2 b/man2/uretprobe.2 > > new file mode 100644 > > index ..c0343a88bb57 > > --- /dev/null > > +++ b/man2/uretprobe.2 > > @@ -0,0 +1,40 @@ > > +.\" Copyright (C) 2024, Jiri Olsa > > +.\" > > +.\" SPDX-License-Identifier: Linux-man-pages-copyleft > > +.\" > > +.TH uretprobe 2 (date) "Linux man-pages (unreleased)" > > +.SH NAME > > +uretprobe \- execute pending return uprobes > > +.SH SYNOPSIS > > +.nf > > +.B int uretprobe(void) > > +.fi > > +.SH DESCRIPTION > > +On x86_64 architecture the kernel is using uretprobe syscall to trigger > > +uprobe return probe consumers instead of using standard breakpoint > > instruction. > > +The reason is that it's much faster to do syscall than breakpoint trap > > +on x86_64 architecture. > > Do we specify the supported architecture as this? Currently it is supported > only on x86-64, but it could be extended later, right? yes, that's the idea, but I can't really speak other than x86 ;-) so not sure abour other archs details > > This should be just noted as NOTES. Something like "This syscall is initially > introduced on x86-64 because a syscall is faster than a breakpoint trap on it. > But this will be extended to the architectures whose syscall is faster than > breakpoint trap." 's/will be extended/might be will be extended/' seems better to me, other than that it looks ok thanks, jirka > > Thank you, > > > + > > +The uretprobe syscall is not supposed to be called directly by user, it's > > allowed > > +to be invoked only through user space trampoline provided by kernel. > > +When called from outside of this trampoline, the calling process will > > receive > > +.BR SIGILL . 
> > + > > +.SH RETURN VALUE > > +.BR uretprobe() > > +return value is specific for given architecture. > > + > > +.SH VERSIONS > > +This syscall is not specified in POSIX, > > +and details of its behavior vary across systems. > > +.SH STANDARDS > > +None. > > +.SH NOTES > > +.BR uretprobe() > > +exists only to allow the invocation of return uprobe consumers. > > +It should > > +.B never > > +be called directly. > > +Details of the arguments (if any) passed to > > +.BR uretprobe () > > +and the return value are specific for given architecture. > > -- > > 2.44.0 > > > > > -- > Masami Hiramatsu (Google)
Re: [PATCH v2 0/6] virtiofs: fix the warning for ITER_KVEC dio
On Tue, Apr 09, 2024 at 09:48:08AM +0800, Hou Tao wrote: > Hi, > > On 4/8/2024 3:45 PM, Michael S. Tsirkin wrote: > > On Wed, Feb 28, 2024 at 10:41:20PM +0800, Hou Tao wrote: > >> From: Hou Tao > >> > >> Hi, > >> > >> The patch set aims to fix the warning related to an abnormal size > >> parameter of kmalloc() in virtiofs. The warning occurred when attempting > >> to insert a 10MB sized kernel module kept in a virtiofs with cache > >> disabled. As analyzed in patch #1, the root cause is that the length of > >> the read buffer is no limited, and the read buffer is passed directly to > >> virtiofs through out_args[0].value. Therefore patch #1 limits the > >> length of the read buffer passed to virtiofs by using max_pages. However > >> it is not enough, because now the maximal value of max_pages is 256. > >> Consequently, when reading a 10MB-sized kernel module, the length of the > >> bounce buffer in virtiofs will be 40 + (256 * 4096), and kmalloc will > >> try to allocate 2MB from memory subsystem. The request for 2MB of > >> physically contiguous memory significantly stress the memory subsystem > >> and may fail indefinitely on hosts with fragmented memory. To address > >> this, patch #2~#5 use scattered pages in a bio_vec to replace the > >> kmalloc-allocated bounce buffer when the length of the bounce buffer for > >> KVEC_ITER dio is larger than PAGE_SIZE. The final issue with the > >> allocation of the bounce buffer and sg array in virtiofs is that > >> GFP_ATOMIC is used even when the allocation occurs in a kworker context. > >> Therefore the last patch uses GFP_NOFS for the allocation of both sg > >> array and bounce buffer when initiated by the kworker. For more details, > >> please check the individual patches. > >> > >> As usual, comments are always welcome. > >> > >> Change Log: > > Bernd should I just merge the patchset as is? > > It seems to fix a real problem and no one has the > > time to work on a better fix WDYT? > > Sorry for the long delay. 
I am just start to prepare for v3. In v3, I > plan to avoid the unnecessary memory copy between fuse args and bio_vec. > Will post it before next week. Didn't happen before this week apparently. > > > > > >> v2: > >> * limit the length of ITER_KVEC dio by max_pages instead of the > >> newly-introduced max_nopage_rw. Using max_pages make the ITER_KVEC > >> dio being consistent with other rw operations. > >> * replace kmalloc-allocated bounce buffer by using a bounce buffer > >> backed by scattered pages when the length of the bounce buffer for > >> KVEC_ITER dio is larger than PAG_SIZE, so even on hosts with > >> fragmented memory, the KVEC_ITER dio can be handled normally by > >> virtiofs. (Bernd Schubert) > >> * merge the GFP_NOFS patch [1] into this patch-set and use > >> memalloc_nofs_{save|restore}+GFP_KERNEL instead of GFP_NOFS > >> (Benjamin Coddington) > >> > >> v1: > >> https://lore.kernel.org/linux-fsdevel/20240103105929.1902658-1-hou...@huaweicloud.com/ > >> > >> [1]: > >> https://lore.kernel.org/linux-fsdevel/20240105105305.4052672-1-hou...@huaweicloud.com/ > >> > >> Hou Tao (6): > >> fuse: limit the length of ITER_KVEC dio by max_pages > >> virtiofs: move alloc/free of argbuf into separated helpers > >> virtiofs: factor out more common methods for argbuf > >> virtiofs: support bounce buffer backed by scattered pages > >> virtiofs: use scattered bounce buffer for ITER_KVEC dio > >> virtiofs: use GFP_NOFS when enqueuing request through kworker > >> > >> fs/fuse/file.c | 12 +- > >> fs/fuse/virtio_fs.c | 336 +--- > >> 2 files changed, 296 insertions(+), 52 deletions(-) > >> > >> -- > >> 2.29.2
Re: [PATCH v5 3/5] vduse: Add function to get/free the pages for reconnection
On Thu, Apr 18, 2024 at 08:57:51AM +0800, Jason Wang wrote: > On Wed, Apr 17, 2024 at 5:29 PM Michael S. Tsirkin wrote: > > > > On Fri, Apr 12, 2024 at 09:28:23PM +0800, Cindy Lu wrote: > > > Add the function vduse_alloc_reconnnect_info_mem > > > and vduse_alloc_reconnnect_info_mem > > > These functions allow vduse to allocate and free memory for reconnection > > > information. The amount of memory allocated is vq_num pages. > > > Each VQS will map its own page where the reconnection information will be > > > saved > > > > > > Signed-off-by: Cindy Lu > > > --- > > > drivers/vdpa/vdpa_user/vduse_dev.c | 40 ++ > > > 1 file changed, 40 insertions(+) > > > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c > > > b/drivers/vdpa/vdpa_user/vduse_dev.c > > > index ef3c9681941e..2da659d5f4a8 100644 > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > > > @@ -65,6 +65,7 @@ struct vduse_virtqueue { > > > int irq_effective_cpu; > > > struct cpumask irq_affinity; > > > struct kobject kobj; > > > + unsigned long vdpa_reconnect_vaddr; > > > }; > > > > > > struct vduse_dev; > > > @@ -1105,6 +1106,38 @@ static void vduse_vq_update_effective_cpu(struct > > > vduse_virtqueue *vq) > > > > > > vq->irq_effective_cpu = curr_cpu; > > > } > > > +static int vduse_alloc_reconnnect_info_mem(struct vduse_dev *dev) > > > +{ > > > + unsigned long vaddr = 0; > > > + struct vduse_virtqueue *vq; > > > + > > > + for (int i = 0; i < dev->vq_num; i++) { > > > + /*page 0~ vq_num save the reconnect info for vq*/ > > > + vq = dev->vqs[i]; > > > + vaddr = get_zeroed_page(GFP_KERNEL); > > > > > > I don't get why you insist on stealing kernel memory for something > > that is just used by userspace to store data for its own use. > > Userspace does not lack ways to persist data, for example, > > create a regular file anywhere in the filesystem. > > Good point. 
So the motivation here is to: > > 1) be self contained, no dependency for high speed persist data > storage like tmpfs No idea what this means. > 2) standardize the format in uAPI which allows reconnection from > arbitrary userspace, unfortunately, such effort was removed in new > versions And I don't see why that has to live in the kernel tree either. > If the above doesn't make sense, we don't need to offer those pages by VDUSE. > > Thanks > > > > > > > > > > > + if (vaddr == 0) > > > + return -ENOMEM; > > > + > > > + vq->vdpa_reconnect_vaddr = vaddr; > > > + } > > > + > > > + return 0; > > > +} > > > + > > > +static int vduse_free_reconnnect_info_mem(struct vduse_dev *dev) > > > +{ > > > + struct vduse_virtqueue *vq; > > > + > > > + for (int i = 0; i < dev->vq_num; i++) { > > > + vq = dev->vqs[i]; > > > + > > > + if (vq->vdpa_reconnect_vaddr) > > > + free_page(vq->vdpa_reconnect_vaddr); > > > + vq->vdpa_reconnect_vaddr = 0; > > > + } > > > + > > > + return 0; > > > +} > > > > > > static long vduse_dev_ioctl(struct file *file, unsigned int cmd, > > > unsigned long arg) > > > @@ -1672,6 +1705,8 @@ static int vduse_destroy_dev(char *name) > > > mutex_unlock(>lock); > > > return -EBUSY; > > > } > > > + vduse_free_reconnnect_info_mem(dev); > > > + > > > dev->connected = true; > > > mutex_unlock(>lock); > > > > > > @@ -1855,12 +1890,17 @@ static int vduse_create_dev(struct > > > vduse_dev_config *config, > > > ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num); > > > if (ret) > > > goto err_vqs; > > > + ret = vduse_alloc_reconnnect_info_mem(dev); > > > + if (ret < 0) > > > + goto err_mem; > > > > > > __module_get(THIS_MODULE); > > > > > > return 0; > > > err_vqs: > > > device_destroy(_class, MKDEV(MAJOR(vduse_major), dev->minor)); > > > +err_mem: > > > + vduse_free_reconnnect_info_mem(dev); > > > err_dev: > > > idr_remove(_idr, dev->minor); > > > err_idr: > > > -- > > > 2.43.0 > >
Re: [PATCH v11 14/14] selftests/sgx: Add scripts for EPC cgroup testing
Hi Jarkko, On Mon, 15 Apr 2024 14:08:44 -0500, Jarkko Sakkinen wrote: I did run the basic test by manually creating the cgroup so you could add tested-by from my side to the other kernel patches except this one I've reviewed it enough rounds and given various code suggestions etc. For me it is "good enough" or has been for a while. I just want this test to work so that people doing kernel QA will automatically get it to their testing cycle. That is why proper integration to kselftest framework is a must. May I have your "Reviewed-by" tag also for the patches #8-13? Not sure if I missed any other comments/issue you raised. I think all are addressed in v12. I will refine the test scripts (patch #14) in v13. Thanks Haitao
Re: Please create the email alias do-not-apply-to-sta...@kernel.org -> /dev/null
On Mon, Apr 22, 2024 at 05:49:29PM +0200, Thorsten Leemhuis wrote: > @Greg, BTW: should this be stable+noauto...@kernel.org or have a > 'vger.' No vger, just stable+whate...@kernel.org. > in it, e.g. stable+noauto...@vger.kernel.org? I assume without 'vger.' > is fine, just wanted to be sure, as > Documentation/process/stable-kernel-rules.rst in all other cases > specifies sta...@vger.kernel.org, so people are likely to get confused. > :-/ #sigh These serve two different purposes: sta...@kernel.org (goes into devnull) sta...@vger.kernel.org (actual mailing list) Confusion happens all the time, unfortunately. Notably, even if someone uses stable+noauto...@vger.kernel.org, it won't do anything terrible (it won't bounce, it'll just quietly go into nowhere because that's not a valid expansion command). -K
Re: [RFC][PATCH] uprobe: support for private hugetlb mappings
On 22.04.24 20:11, Guillaume Morin wrote: (Dropping Mike Kravetz as CC since he has retired and his email is no longer valid, adding Muchun since he's the current hugetlb maintainer, as well as linux-trace-kernel) On 22 Apr 11:39, David Hildenbrand wrote: On 19.04.24 20:37, Guillaume Morin wrote: libhugetlbfs, the Intel iodlr code both allow to remap .text onto a hugetlb private mapping. It's also pretty easy to do it manually. One drawback of using this functionality is the lack of support for uprobes (NOTE uprobe ignores shareable vmas) This patch adds support for private hugetlb mappings. It does require exposing some hugetlbfs innards and relies on copy_user_large_folio which is only available when CONFIG_HUGETLBFS is used so I had to use an ugly #ifdef If there is some interest in applying this patch in some form or another, I am open to any refactoring suggestions (esp getting rid the #ifdef in uprobes.c) . I tried to limit the amount of branching. All that hugetlb special casing oh my. What's the benefit why we should be interested in making that code less clean -- to phrase it in a nice way ;) ? I do appreciate the nice phrasing. Believe me, I did try to limit the special casing to a minimum :-). Outside of __replace_page, I added only 3-ish branches so I do not think it's *too* bad. The uprobe code is using PAGE_{SHIFT,MASK} quite liberally so I had to add calls to retrieve these for the hugetlb vmas. __replace_page has a lot of special casing. I certainly agree (and unfortunately for me it's at the beginning of the patch :)). It's doing something pretty uncommon outside of the mm code so it has to make a bunch of specific hugetlb calls. I am not quite sure how to improve it but if you have suggestions, I'd be happy to refactor. See below. The benefit - to me - is very clear. People do use hugetlb mappings to run code in production environments. The perf benefits are there for some workloads. Intel has published a whitepaper about it etc. 
Uprobes are a very good tool to do live tracing. If you can restart the process and reproduce, you should be able to disable hugetlb remapping but if you need to look at a live process, there are not many options. Not being able to use uprobes is crippling. Please add all that as motivation to the patch description or cover letter. Yes, libhugetlbfs exists. But why do we have to support uprobes with it? Nobody cared until now, why care now? I think you could ask the same question for every new feature patch :) I have to, because it usually indicates a lack of motivation in the cover-letter/patch description :P People will have to maintain that code, and maintaining hugetlb code in odd places is no fun ... Since the removal a few releases ago of the __morecore() hook in glibc, the main feature of libhugetlbfs is ELF segments remapping. I think there are definitely a lot of users that simply deal with this unnecessary limitation. I am certainly not shoving this patch through anyone's throat if there is no interest. But we definitely find it a very useful feature ... Let me try to see if we can get this done cleaner. One ugly part (in general here) is the custom page replacement in the registration part. We are guaranteed to have a MAP_PRIVATE mapping. Instead of replacing pages ourselves (which we likely shouldn't do ...) ... maybe we could use FAULT_FLAG_UNSHARE faults such that we will get an anonymous folio populated. (like KSM does nowadays) Punching FOLL_PIN|FOLL_LONGTERM into GUP would achieve the same thing, but using FOLL_WRITE would not work on many file systems. So maybe we have to trigger an unsharing fault ourselves. That would do the page replacement for us and we "should" be able to lookup an anonymous folio that we can then just modify, like ptrace would. But then, there is also unregistration part, with weird conditional page replacement. Zapping the anon page if the content matches the content of the original page is one thing. 
But why are we replacing an existing anonymous page by a new anonymous page when the content from the original page differs (but matches the one from the just copied page?)? I'll have to further think about that one. It's all a bit nasty. One thing to note is that hugetlb folios don't grow on trees. Likely, many setups *don't* reserve extra hugetlb folios and you might just easily be running out of free hugetlb folios that you can use to break COW here (replace a file hugetlb by a fresh anon hugetlb page). Likely it's easy to make register or unregister fail. -- Cheers, David / dhildenb
Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()
Hi Masami and Mike, On Sat, Apr 20, 2024 at 2:11 AM Masami Hiramatsu wrote: [...] > > > > > > IIUC, we need to update __execmem_cache_alloc() to take a range pointer as > > > input. module text will use "range" for EXECMEM_MODULE_TEXT, while kprobe > > > will use "range" for EXECMEM_KPROBE. Without "map to" concept or sharing > > > the "range" object, we will have to compare different range parameters to > > > check > > > we can share cached pages between module text and kprobe, which is not > > > efficient. Did I miss something? > > Song, thanks for trying to explain. I think I need to explain why I used > module_alloc() originally. > > This depends on how kprobe features are implemented on the architecture, and > how much features are supported on kprobes. > > Because kprobe jump optimization and kprobe jump-back optimization need to > use a jump instruction to jump into the trampoline and jump back from the > trampoline directly, if the architecture jmp instruction supports +-2GB range > like x86, it needs to allocate the trampoline buffer inside such address > space. > This requirement is similar to the modules (because module function needs to > call other functions in the kernel etc.), at least kprobes on x86 used > module_alloc(). > > However, if an architecture only supports breakpoint/trap based kprobe, > it does not need to consider whether the execmem is allocated. > > > > > We can always share large ROX pages as long as they are within the correct > > address space. The permissions for them are ROX and the alignment > > differences are due to KASAN and this is handled during allocation of the > > large page to refill the cache. __execmem_cache_alloc() only needs to limit > > the search for the address space of the range. > > So I don't think EXECMEM_KPROBE always same as EXECMEM_MODULE_TEXT, it > should be configured for each arch. Especially, if it is only used for > searching parameter, it looks OK to me. Thanks for the explanation! 
I was thinking "we can have EXECMEM_KPROBE share the same parameters as EXECMEM_MODULE_TEXT for all architectures". But this thought is built on top of assumptions on future changes/improvements within multiple sub systems. At this moment, I have no objections moving forward with current execmem APIs. Thanks, Song
Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent
On Mon, Apr 22, 2024 at 11:01:03AM +0800, Jason Xing wrote: ... > diff --git a/include/net/rstreason.h b/include/net/rstreason.h ... > +/** > + * There are three parts in order: > + * 1) reset reason in MPTCP: only for MPTCP use > + * 2) skb drop reason: relying on drop reasons for such as passive reset > + * 3) independent reset reason: such as active reset reasons > + */ Hi Jason, A minor nit from my side. '/**' denotes the beginning of a Kernel doc, but other than that, this comment is not a Kernel doc. FWIW, I would suggest providing a proper Kernel doc for enum sk_rst_reason. But another option would be to simply make this a normal comment, starting with "/* There are" > +enum sk_rst_reason { ...
Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions
On 22.04.24 20:20, Vincent Donnefort wrote: Hi David, Thanks for having a look, very much appreciated! On Mon, Apr 22, 2024 at 11:27:11AM +0200, David Hildenbrand wrote: On 19.04.24 20:25, David Hildenbrand wrote: On 06.04.24 19:36, Vincent Donnefort wrote: In preparation for allowing the user-space to map a ring-buffer, add a set of mapping functions: ring_buffer_{map,unmap}() And controls on the ring-buffer: ring_buffer_map_get_reader() /* swap reader and head */ Mapping the ring-buffer also involves: A unique ID for each subbuf of the ring-buffer, currently they are only identified through their in-kernel VA. A meta-page, where are stored ring-buffer statistics and a description for the current reader The linear mapping exposes the meta-page, and each subbuf of the ring-buffer, ordered following their unique ID, assigned during the first mapping. Once mapped, no subbuf can get in or out of the ring-buffer: the buffer size will remain unmodified and the splice enabling functions will in reality simply memcpy the data instead of swapping subbufs. 
CC: Signed-off-by: Vincent Donnefort diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index dc5ae4e96aee..96d2140b471e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -6,6 +6,8 @@ #include #include +#include + struct trace_buffer; struct ring_buffer_iter; @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); #define trace_rb_cpu_prepareNULL #endif +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma); +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h new file mode 100644 index ..ffcd8dfcaa4f --- /dev/null +++ b/include/uapi/linux/trace_mmap.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _TRACE_MMAP_H_ +#define _TRACE_MMAP_H_ + +#include + +/** + * struct trace_buffer_meta - Ring-buffer Meta-page description + * @meta_page_size:Size of this meta-page. + * @meta_struct_len: Size of this structure. + * @subbuf_size: Size of each sub-buffer. + * @nr_subbufs:Number of subbfs in the ring-buffer, including the reader. + * @reader.lost_events:Number of events lost at the time of the reader swap. + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] + * @reader.read: Number of bytes read on the reader subbuf. + * @flags: Placeholder for now, 0 until new features are supported. + * @entries: Number of entries in the ring-buffer. + * @overrun: Number of entries lost in the ring-buffer. + * @read: Number of entries that have been read. + * @Reserved1: Reserved for future use. + * @Reserved2: Reserved for future use. 
+ */ +struct trace_buffer_meta { + __u32 meta_page_size; + __u32 meta_struct_len; + + __u32 subbuf_size; + __u32 nr_subbufs; + + struct { + __u64 lost_events; + __u32 id; + __u32 read; + } reader; + + __u64 flags; + + __u64 entries; + __u64 overrun; + __u64 read; + + __u64 Reserved1; + __u64 Reserved2; +}; + +#endif /* _TRACE_MMAP_H_ */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc9ebe593571..793ecc454039 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include #include @@ -338,6 +340,7 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned longreal_end; /* real end of data */ unsigned order; /* order of the page */ + u32 id;/* ID for external mapping */ struct buffer_data_page *page; /* Actual data page */ }; @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; + + unsigned intmapped; + struct mutexmapping_lock; + unsigned long *subbuf_ids;/* ID to subbuf VA */ + struct trace_buffer_meta*meta_page; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ longnr_pages_to_update; struct list_head
Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions
Hi David, Thanks for having a look, very much appreciated! On Mon, Apr 22, 2024 at 11:27:11AM +0200, David Hildenbrand wrote: > On 19.04.24 20:25, David Hildenbrand wrote: > > On 06.04.24 19:36, Vincent Donnefort wrote: > > > In preparation for allowing the user-space to map a ring-buffer, add > > > a set of mapping functions: > > > > > > ring_buffer_{map,unmap}() > > > > > > And controls on the ring-buffer: > > > > > > ring_buffer_map_get_reader() /* swap reader and head */ > > > > > > Mapping the ring-buffer also involves: > > > > > > A unique ID for each subbuf of the ring-buffer, currently they are > > > only identified through their in-kernel VA. > > > > > > A meta-page, where are stored ring-buffer statistics and a > > > description for the current reader > > > > > > The linear mapping exposes the meta-page, and each subbuf of the > > > ring-buffer, ordered following their unique ID, assigned during the > > > first mapping. > > > > > > Once mapped, no subbuf can get in or out of the ring-buffer: the buffer > > > size will remain unmodified and the splice enabling functions will in > > > reality simply memcpy the data instead of swapping subbufs. 
> > > > > > CC: > > > Signed-off-by: Vincent Donnefort > > > > > > diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h > > > index dc5ae4e96aee..96d2140b471e 100644 > > > --- a/include/linux/ring_buffer.h > > > +++ b/include/linux/ring_buffer.h > > > @@ -6,6 +6,8 @@ > > >#include > > >#include > > > +#include > > > + > > >struct trace_buffer; > > >struct ring_buffer_iter; > > > @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct > > > hlist_node *node); > > >#define trace_rb_cpu_prepare NULL > > >#endif > > > +int ring_buffer_map(struct trace_buffer *buffer, int cpu, > > > + struct vm_area_struct *vma); > > > +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); > > > +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); > > >#endif /* _LINUX_RING_BUFFER_H */ > > > diff --git a/include/uapi/linux/trace_mmap.h > > > b/include/uapi/linux/trace_mmap.h > > > new file mode 100644 > > > index ..ffcd8dfcaa4f > > > --- /dev/null > > > +++ b/include/uapi/linux/trace_mmap.h > > > @@ -0,0 +1,46 @@ > > > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > > > +#ifndef _TRACE_MMAP_H_ > > > +#define _TRACE_MMAP_H_ > > > + > > > +#include > > > + > > > +/** > > > + * struct trace_buffer_meta - Ring-buffer Meta-page description > > > + * @meta_page_size: Size of this meta-page. > > > + * @meta_struct_len: Size of this structure. > > > + * @subbuf_size: Size of each sub-buffer. > > > + * @nr_subbufs: Number of subbfs in the ring-buffer, including > > > the reader. > > > + * @reader.lost_events: Number of events lost at the time of the reader > > > swap. > > > + * @reader.id: subbuf ID of the current reader. ID range [0 : > > > @nr_subbufs - 1] > > > + * @reader.read: Number of bytes read on the reader subbuf. > > > + * @flags: Placeholder for now, 0 until new features are > > > supported. > > > + * @entries: Number of entries in the ring-buffer. > > > + * @overrun: Number of entries lost in the ring-buffer. 
> > > + * @read:Number of entries that have been read. > > > + * @Reserved1: Reserved for future use. > > > + * @Reserved2: Reserved for future use. > > > + */ > > > +struct trace_buffer_meta { > > > + __u32 meta_page_size; > > > + __u32 meta_struct_len; > > > + > > > + __u32 subbuf_size; > > > + __u32 nr_subbufs; > > > + > > > + struct { > > > + __u64 lost_events; > > > + __u32 id; > > > + __u32 read; > > > + } reader; > > > + > > > + __u64 flags; > > > + > > > + __u64 entries; > > > + __u64 overrun; > > > + __u64 read; > > > + > > > + __u64 Reserved1; > > > + __u64 Reserved2; > > > +}; > > > + > > > +#endif /* _TRACE_MMAP_H_ */ > > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > > > index cc9ebe593571..793ecc454039 100644 > > > --- a/kernel/trace/ring_buffer.c > > > +++ b/kernel/trace/ring_buffer.c > > > @@ -9,6 +9,7 @@ > > >#include > > >#include > > >#include > > > +#include > > >#include > > >#include > > >#include > > > @@ -26,6 +27,7 @@ > > >#include > > >#include > > >#include > > > +#include > > >#include > > >#include > > > @@ -338,6 +340,7 @@ struct buffer_page { > > > local_t entries; /* entries on this page */ > > > unsigned longreal_end; /* real end of data */ > > > unsigned order; /* order of the page */ > > > + u32 id;/* ID for external mapping */ > > > struct buffer_data_page *page; /* Actual data page */ > > >
Re: [RFC][PATCH] uprobe: support for private hugetlb mappings
(Dropping Mike Kravetz as CC since he has retired and his email is no longer valid, adding Muchun since he's the current hugetlb maintainer, as well as linux-trace-kernel) On 22 Apr 11:39, David Hildenbrand wrote: > > On 19.04.24 20:37, Guillaume Morin wrote: > > libhugetlbfs, the Intel iodlr code both allow to remap .text onto a > > hugetlb private mapping. It's also pretty easy to do it manually. > > One drawback of using this functionality is the lack of support for > > uprobes (NOTE uprobe ignores shareable vmas) > > > > This patch adds support for private hugetlb mappings. It does require > > exposing > > some hugetlbfs innards and relies on copy_user_large_folio which is only > > available when CONFIG_HUGETLBFS is used so I had to use an ugly #ifdef > > > > If there is some interest in applying this patch in some form or > > another, I am open to any refactoring suggestions (esp getting rid the > > #ifdef in uprobes.c) . I tried to limit the > > amount of branching. > > All that hugetlb special casing oh my. What's the benefit why we should > be interested in making that code less clean -- to phrase it in a nice way > ;) ? I do appreciate the nice phrasing. Believe me, I did try to limit the special casing to a minimum :-). Outside of __replace_page, I added only 3-ish branches so I do not think it's *too* bad. The uprobe code is using PAGE_{SHIFT,MASK} quite liberally so I had to add calls to retrieve these for the hugetlb vmas. __replace_page has a lot of special casing. I certainly agree (and unfortunately for me it's at the beginning of the patch :)). It's doing something pretty uncommon outside of the mm code so it has to make a bunch of specific hugetlb calls. I am not quite sure how to improve it but if you have suggestions, I'd be happy to refactor. The benefit - to me - is very clear. People do use hugetlb mappings to run code in production environments. The perf benefits are there for some workloads. Intel has published a whitepaper about it etc. 
Uprobes are a very good tool to do live tracing. If you can restart the process and reproduce, you should be able to disable hugetlb remapping but if you need to look at a live process, there are not many options. Not being able to use uprobes is crippling. > Yes, libhugetlbfs exists. But why do we have to support uprobes with it? > Nobody cared until now, why care now? I think you could ask the same question for every new feature patch :) Since the removal a few releases ago of the __morecore() hook in glibc, the main feature of libhugetlbfs is ELF segments remapping. I think there are definitely a lot of users that simply deal with this unnecessary limitation. I am certainly not shoving this patch through anyone's throat if there is no interest. But we definitely find it a very useful feature ... Guillaume. -- Guillaume Morin
Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions
On Thu, Apr 18, 2024 at 11:43:46PM -0400, Steven Rostedt wrote: > On Thu, 18 Apr 2024 09:55:55 +0300 > Mike Rapoport wrote: > > Hi Mike, > > Thanks for doing this review! > > > > +/** > > > + * struct trace_buffer_meta - Ring-buffer Meta-page description > > > + * @meta_page_size: Size of this meta-page. > > > + * @meta_struct_len: Size of this structure. > > > + * @subbuf_size: Size of each sub-buffer. > > > + * @nr_subbufs: Number of subbfs in the ring-buffer, including > > > the reader. > > > + * @reader.lost_events: Number of events lost at the time of the reader > > > swap. > > > + * @reader.id: subbuf ID of the current reader. ID range [0 : > > > @nr_subbufs - 1] > > > + * @reader.read: Number of bytes read on the reader subbuf. > > > + * @flags: Placeholder for now, 0 until new features are > > > supported. > > > + * @entries: Number of entries in the ring-buffer. > > > + * @overrun: Number of entries lost in the ring-buffer. > > > + * @read:Number of entries that have been read. > > > + * @Reserved1: Reserved for future use. > > > + * @Reserved2: Reserved for future use. > > > + */ > > > +struct trace_buffer_meta { > > > + __u32 meta_page_size; > > > + __u32 meta_struct_len; > > > + > > > + __u32 subbuf_size; > > > + __u32 nr_subbufs; > > > + > > > + struct { > > > + __u64 lost_events; > > > + __u32 id; > > > + __u32 read; > > > + } reader; > > > + > > > + __u64 flags; > > > + > > > + __u64 entries; > > > + __u64 overrun; > > > + __u64 read; > > > + > > > + __u64 Reserved1; > > > + __u64 Reserved2; > > > > Why do you need reserved fields? This structure always resides in the > > beginning of a page and the rest of the page is essentially "reserved". > > So this code is also going to be used in arm's pkvm hypervisor code, > where it will be using these fields, but since we are looking at > keeping the same interface between the two, we don't want these used by > this interface. > > We probably should add a comment about that. 
> > > > > > +}; > > > + > > > +#endif /* _TRACE_MMAP_H_ */ > > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > > > index cc9ebe593571..793ecc454039 100644 > > > --- a/kernel/trace/ring_buffer.c > > > +++ b/kernel/trace/ring_buffer.c > > > > ... > > > > > +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu > > > *cpu_buffer, > > > +unsigned long *subbuf_ids) > > > +{ > > > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > > > + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; > > > + struct buffer_page *first_subbuf, *subbuf; > > > + int id = 0; > > > + > > > + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; > > > + cpu_buffer->reader_page->id = id++; > > > + > > > + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); > > > + do { > > > + if (WARN_ON(id >= nr_subbufs)) > > > + break; > > > + > > > + subbuf_ids[id] = (unsigned long)subbuf->page; > > > + subbuf->id = id; > > > + > > > + rb_inc_page(); > > > + id++; > > > + } while (subbuf != first_subbuf); > > > + > > > + /* install subbuf ID to kern VA translation */ > > > + cpu_buffer->subbuf_ids = subbuf_ids; > > > + > > > + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ > > > + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; > > > > Isn't this a single page? > > One thing we are doing is to make sure that the subbuffers are aligned > by their size. If a subbuffer is 3 pages, it should be aligned on 3 > page boundaries. This was something that Linus suggested. > > > > > > + meta->meta_struct_len = sizeof(*meta); > > > + meta->nr_subbufs = nr_subbufs; > > > + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; > > > + > > > + rb_update_meta_page(cpu_buffer); > > > +} > > > > ... 
> > > > > +#define subbuf_page(off, start) \ > > > + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) > > > + > > > +#define foreach_subbuf_page(sub_order, start, page) \ > > > > Nit: usually iterators in kernel use for_each > > Ah, good catch. Yeah, that should be changed. But then ... > > > > > > + page = subbuf_page(0, (start)); \ > > > + for (int __off = 0; __off < (1 << (sub_order)); \ > > > + __off++, page = subbuf_page(__off, (start))) > > > > The pages are allocated with alloc_pages_node(.. subbuf_order) are > > physically contiguous and struct pages for them are also contiguous, so > > inside a subbuf_order allocation you can just do page++. > > > > I'm wondering if we should just nuke the macro. It was there because of > the previous implementation did things twice. But now it's just done > once here: > > + while (s < nr_subbufs && p < nr_pages)
Re: [PATCH v12 09/14] x86/sgx: Implement async reclamation for cgroup
On Sun, 21 Apr 2024 19:22:27 -0500, Huang, Kai wrote: On Fri, 2024-04-19 at 20:14 -0500, Haitao Huang wrote: > > I think we can add support for "sgx_cgroup=disabled" in future if indeed > > needed. But just for init failure, no? > > > > It's not about the commandline, which we can add in the future when > needed. It's about we need to have a way to handle SGX cgroup being > disabled at boot time nicely, because we already have a case where we > need > to do so. > > Your approach looks half-way to me, and is not future extendible. If we > choose to do it, do it right -- that is, we need a way to disable it > completely in both kernel and userspace so that userspace won't be able > to > see it. That would need more changes in misc cgroup implementation to support sgx-disable. Right now misc does not have separate files for different resource types. So we can only block echo "sgx_epc..." to those interface files, can't really make files not visible. "won't be able to see" I mean "only for SGX EPC resource", but not the control files for the entire MISC cgroup. I replied at the beginning of the previous reply: " Given SGX EPC is just one type of MISC cgroup resources, we cannot just disable MISC cgroup as a whole. " Sorry I missed this point. below. You just need to set the SGX EPC "capacity" to 0 to disable SGX EPC. See the comment of @misc_res_capacity: * Miscellaneous resources capacity for the entire machine. 0 capacity * means resource is not initialized or not present in the host. IIUC I don't think the situation we have is either of those cases. For our case, resource is inited and present on the host but we have allocation error for sgx cgroup infra. And "blocking echo sgx_epc ... to those control files" is already sufficient for the purpose of not exposing SGX EPC to userspace, correct? E.g., if SGX cgroup is enabled, you can see below when you read "max": # cat /sys/fs/cgroup/my_group/misc.max # sgx_epc ... ... 
Otherwise you won't be able to see "sgx_epc": # cat /sys/fs/cgroup/my_group/misc.max # ... And when you try to write the "max" for "sgx_epc", you will hit error: # echo "sgx_epc 100" > /sys/fs/cgroup/my_group/misc.max # ... echo: write error: Invalid argument The above applies to all the control files. To me this pretty much means "SGX EPC is disabled" or "not supported" for userspace. You are right, capacity == 0 does block echoing max and users see an error if they do that. But 1) doubt you literally wanted "SGX EPC is disabled" and make it unsupported in this case, 2) even if we accept this is "sgx cgroup disabled" I don't see how it is much better user experience than current solution or really helps user better. Also to implement this approach, as you mentioned, we need to work around the fact that misc_try_charge() fails when capacity set to zero, and adding code to return root always? So it seems like more workaround code to just make it work for a failing case no one really cares much about and end result is not really much better IMHO. Thanks Haitao
Re: [PATCHv3 bpf-next 1/7] uprobe: Wire up uretprobe system call
On 04/21, Jiri Olsa wrote: > > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > include/linux/syscalls.h | 2 ++ > include/uapi/asm-generic/unistd.h | 5 - > kernel/sys_ni.c| 2 ++ > 4 files changed, 9 insertions(+), 1 deletion(-) Reviewed-by: Oleg Nesterov
Re: [PATCHv3 bpf-next 2/7] uprobe: Add uretprobe syscall to speed up return probe
On 04/21, Jiri Olsa wrote: > > arch/x86/kernel/uprobes.c | 115 ++ > include/linux/uprobes.h | 3 + > kernel/events/uprobes.c | 24 +--- > 3 files changed, 135 insertions(+), 7 deletions(-) Reviewed-by: Oleg Nesterov
Re: Please create the email alias do-not-apply-to-sta...@kernel.org -> /dev/null
[CCing Sasha] On 18.04.24 15:20, Greg KH wrote: > On Thu, Apr 18, 2024 at 09:04:53AM +0200, Thorsten Leemhuis wrote: >> On 17.04.24 15:38, Greg KH wrote: >>> On Wed, Apr 17, 2024 at 03:21:12PM +0200, Thorsten Leemhuis wrote: On 17.04.24 14:52, Konstantin Ryabitsev wrote: > On Wed, Apr 17, 2024 at 09:48:18AM +0200, Thorsten Leemhuis wrote: >> Could you please create the email alias >> >>> How about: >>> cc: # Reason goes here, and >>> must be present >>> >>> and we can make that address be routed to /dev/null just like >>> is? FWIW, we could go back to what I initially proposed: use the existing stable tag with a pre-defined comment to mark patches that AUTOSEL et. al. should not pick up: https://lore.kernel.org/all/c0a08b160b286e8c98549eedb37404c6e784cf8a.1712812895.git.li...@leemhuis.info/ >>> >>> If you can pick a better string, possibly, yes. >> >> What did you think of Konstantin's >> >> Cc: stable+noauto...@kernel.org # Reason @Greg, BTW: should this be stable+noauto...@kernel.org or have a 'vger.' in it, e.g. stable+noauto...@vger.kernel.org? I assume without 'vger.' is fine, just wanted to be sure, as Documentation/process/stable-kernel-rules.rst in all other cases specifies sta...@vger.kernel.org, so people are likely to get confused. :-/ #sigh >> That looked like a good solution -- and I wondered why I did not come up >> with that idea myself. Sure, "autosel" would also imply/mean "the >> scripts/tools that look out for Fixes: tags", but does that matter? > > We can live with this, sure. In that case I guess I now also have to fix the scripts to honor that tag. @Greg: something like the attached for scripts/fixes_search perhaps? Was that the right one and are there any other scripts that might need something similar? @Sasha: are the scripts around autosel online somewhere? They need a similar change. 
Ciao, ThorstenFrom 1e973a069b07f8c045401a7d3d20ea760a27422f Mon Sep 17 00:00:00 2001 From: Thorsten Leemhuis Date: Mon, 22 Apr 2024 17:31:01 +0200 Subject: [PATCH] scripts/fixes_search: honor noautosel tag Ignore commits that contain a soon to be documented tag that is meant to exclude commits from processing by scripts like scripts/fixes_search. Link: https://lore.kernel.org/all/2024041830-karaoke-aspirate-df00@gregkh/ [1] Signed-off-by: Thorsten Leemhuis --- scripts/fixes_search | 7 +++ 1 file changed, 7 insertions(+) diff --git a/scripts/fixes_search b/scripts/fixes_search index aaa12ec..950509f 100755 --- a/scripts/fixes_search +++ b/scripts/fixes_search @@ -131,6 +131,13 @@ for commit in $(git rev-list --reverse --no-merges "${git_range}"); do # logn "commit = ${txtgrn}${commit}${txtrst} " logn "${txtgrn}${commit}${txtrst} " + # Check if we are supposed to ignore the commit + no_autosel=$(git log -1 --format='%B' "HEAD" | grep -i '^[[:space:]]*[Cc][Cc]:[[:space:]]*
Re: [PATCH] drivers: remoteproc: xlnx: Add Versal and Versal-NET support
On Thu, Apr 18, 2024 at 03:01:25PM -0700, Tanmay Shah wrote: > AMD-Xilinx Versal platform is successor of ZynqMP platform. > Real-time Processing Unit R5 cluster IP on Versal is same as > of ZynqMP Platform. Power-domains ids for Versal platform is > different than ZynqMP. > > AMD-Xilinx Versal-NET platform is successor of Versal platform. > Versal-NET Real-Time Processing Unit has two clusters and each > cluster contains dual core ARM Cortex-R52 processors. Each R52 > core is assigned 128KB of TCM memory. > > Signed-off-by: Tanmay Shah > --- > drivers/remoteproc/xlnx_r5_remoteproc.c | 53 - > 1 file changed, 17 insertions(+), 36 deletions(-) > Applied. Thanks, Mathieu > diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c > b/drivers/remoteproc/xlnx_r5_remoteproc.c > index 7b1c12108bff..a6d8ac7394e7 100644 > --- a/drivers/remoteproc/xlnx_r5_remoteproc.c > +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c > @@ -300,36 +300,6 @@ static void zynqmp_r5_rproc_kick(struct rproc *rproc, > int vqid) > dev_warn(dev, "failed to send message\n"); > } > > -/* > - * zynqmp_r5_set_mode() > - * > - * set RPU cluster and TCM operation mode > - * > - * @r5_core: pointer to zynqmp_r5_core type object > - * @fw_reg_val: value expected by firmware to configure RPU cluster mode > - * @tcm_mode: value expected by fw to configure TCM mode (lockstep or split) > - * > - * Return: 0 for success and < 0 for failure > - */ > -static int zynqmp_r5_set_mode(struct zynqmp_r5_core *r5_core, > - enum rpu_oper_mode fw_reg_val, > - enum rpu_tcm_comb tcm_mode) > -{ > - int ret; > - > - ret = zynqmp_pm_set_rpu_mode(r5_core->pm_domain_id, fw_reg_val); > - if (ret < 0) { > - dev_err(r5_core->dev, "failed to set RPU mode\n"); > - return ret; > - } > - > - ret = zynqmp_pm_set_tcm_config(r5_core->pm_domain_id, tcm_mode); > - if (ret < 0) > - dev_err(r5_core->dev, "failed to configure TCM\n"); > - > - return ret; > -} > - > /* > * zynqmp_r5_rproc_start() > * @rproc: single R5 core's corresponding rproc instance 
> @@ -941,7 +911,7 @@ static int zynqmp_r5_core_init(struct zynqmp_r5_cluster > *cluster, > /* Maintain backward compatibility for zynqmp by using hardcode TCM > address. */ > if (of_find_property(r5_core->np, "reg", NULL)) > ret = zynqmp_r5_get_tcm_node_from_dt(cluster); > - else > + else if (device_is_compatible(dev, "xlnx,zynqmp-r5fss")) > ret = zynqmp_r5_get_tcm_node(cluster); > > if (ret) { > @@ -960,12 +930,21 @@ static int zynqmp_r5_core_init(struct zynqmp_r5_cluster > *cluster, > return ret; > } > > - ret = zynqmp_r5_set_mode(r5_core, fw_reg_val, tcm_mode); > - if (ret) { > - dev_err(dev, "failed to set r5 cluster mode %d, err > %d\n", > - cluster->mode, ret); > + ret = zynqmp_pm_set_rpu_mode(r5_core->pm_domain_id, fw_reg_val); > + if (ret < 0) { > + dev_err(r5_core->dev, "failed to set RPU mode\n"); > return ret; > } > + > + if (of_find_property(dev_of_node(dev), "xlnx,tcm-mode", NULL) || > + device_is_compatible(dev, "xlnx,zynqmp-r5fss")) { > + ret = zynqmp_pm_set_tcm_config(r5_core->pm_domain_id, > +tcm_mode); > + if (ret < 0) { > + dev_err(r5_core->dev, "failed to configure > TCM\n"); > + return ret; > + } > + } > } > > return 0; > @@ -1022,7 +1001,7 @@ static int zynqmp_r5_cluster_init(struct > zynqmp_r5_cluster *cluster) > ret = of_property_read_u32(dev_node, "xlnx,tcm-mode", (u32 > *)_mode); > if (ret) > return ret; > - } else { > + } else if (device_is_compatible(dev, "xlnx,zynqmp-r5fss")) { > if (cluster_mode == LOCKSTEP_MODE) > tcm_mode = PM_RPU_TCM_COMB; > else > @@ -1212,6 +1191,8 @@ static int zynqmp_r5_remoteproc_probe(struct > platform_device *pdev) > > /* Match table for OF platform binding */ > static const struct of_device_id zynqmp_r5_remoteproc_match[] = { > + { .compatible = "xlnx,versal-net-r52fss", }, > + { .compatible = "xlnx,versal-r5fss", }, > { .compatible = "xlnx,zynqmp-r5fss", }, > { /* end of list */ }, > }; > > base-commit: 912ebe48bec5927e2049e91b0e8a9cc682a709d2 > -- > 2.25.1 >
Re: [PATCH v2 2/2] remoteproc: mediatek: Support MT8188 SCP core 1
Hi Olivia, On Fri, Apr 19, 2024 at 04:42:11PM +0800, Olivia Wen wrote: > From: "olivia.wen" > > There are three primary modifications. > > 1. The struct mtk_scp_of_data usage on MT8188 > MT8192 functions are unsuitable for the dual-core MT8188 SCP, > which has two RISC-V cores similar to MT8195 but without L1TCM. > We've added MT8188-specific functions to configure L1TCM > in multicore setups. > > 2. SCP_IPI_IMGSYS_CMD feature > This version also adds SCP_IPI_IMGSYS_CMD to facilitate > communication between the imgsys kernel and the backend driver. > > 3. Different code sizes and IPI share buffer sizes > Each SCP necessitates different code and IPI share buffer sizes. > Introducing a structure mtk_scp_sizes_data to handle them. > Please split in 3 different patches and in the changelog, concentrate on "why" you are making the changes rather than "what" changes are done. Thanks, Mathieu > Signed-off-by: olivia.wen > --- > drivers/remoteproc/mtk_common.h| 11 +- > drivers/remoteproc/mtk_scp.c | 230 > + > drivers/remoteproc/mtk_scp_ipi.c | 7 +- > include/linux/remoteproc/mtk_scp.h | 1 + > 4 files changed, 223 insertions(+), 26 deletions(-) > > diff --git a/drivers/remoteproc/mtk_common.h b/drivers/remoteproc/mtk_common.h > index 6d7736a..fd5c539 100644 > --- a/drivers/remoteproc/mtk_common.h > +++ b/drivers/remoteproc/mtk_common.h > @@ -78,7 +78,6 @@ > #define MT8195_L2TCM_OFFSET 0x850d0 > > #define SCP_FW_VER_LEN 32 > -#define SCP_SHARE_BUFFER_SIZE288 > > struct scp_run { > u32 signaled; > @@ -97,6 +96,11 @@ struct scp_ipi_desc { > > struct mtk_scp; > > +struct mtk_scp_sizes_data { > + size_t max_dram_size; > + size_t ipi_share_buffer_size; > +}; > + > struct mtk_scp_of_data { > int (*scp_clk_get)(struct mtk_scp *scp); > int (*scp_before_load)(struct mtk_scp *scp); > @@ -110,6 +114,7 @@ struct mtk_scp_of_data { > u32 host_to_scp_int_bit; > > size_t ipi_buf_offset; > + const struct mtk_scp_sizes_data *scp_sizes; > }; > > struct mtk_scp_of_cluster { > @@ -141,10 +146,10 
@@ struct mtk_scp { > struct scp_ipi_desc ipi_desc[SCP_IPI_MAX]; > bool ipi_id_ack[SCP_IPI_MAX]; > wait_queue_head_t ack_wq; > + u8 *share_buf; > > void *cpu_addr; > dma_addr_t dma_addr; > - size_t dram_size; > > struct rproc_subdev *rpmsg_subdev; > > @@ -162,7 +167,7 @@ struct mtk_scp { > struct mtk_share_obj { > u32 id; > u32 len; > - u8 share_buf[SCP_SHARE_BUFFER_SIZE]; > + u8 *share_buf; > }; > > void scp_memcpy_aligned(void __iomem *dst, const void *src, unsigned int > len); > diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c > index 6751829..e281d28 100644 > --- a/drivers/remoteproc/mtk_scp.c > +++ b/drivers/remoteproc/mtk_scp.c > @@ -20,7 +20,6 @@ > #include "mtk_common.h" > #include "remoteproc_internal.h" > > -#define MAX_CODE_SIZE 0x50 > #define SECTION_NAME_IPI_BUFFER ".ipi_buffer" > > /** > @@ -94,14 +93,15 @@ static void scp_ipi_handler(struct mtk_scp *scp) > { > struct mtk_share_obj __iomem *rcv_obj = scp->recv_buf; > struct scp_ipi_desc *ipi_desc = scp->ipi_desc; > - u8 tmp_data[SCP_SHARE_BUFFER_SIZE]; > scp_ipi_handler_t handler; > u32 id = readl(_obj->id); > u32 len = readl(_obj->len); > + const struct mtk_scp_sizes_data *scp_sizes; > > - if (len > SCP_SHARE_BUFFER_SIZE) { > - dev_err(scp->dev, "ipi message too long (len %d, max %d)", len, > - SCP_SHARE_BUFFER_SIZE); > + scp_sizes = scp->data->scp_sizes; > + if (len > scp_sizes->ipi_share_buffer_size) { > + dev_err(scp->dev, "ipi message too long (len %d, max %zd)", len, > + scp_sizes->ipi_share_buffer_size); > return; > } > if (id >= SCP_IPI_MAX) { > @@ -117,8 +117,9 @@ static void scp_ipi_handler(struct mtk_scp *scp) > return; > } > > - memcpy_fromio(tmp_data, _obj->share_buf, len); > - handler(tmp_data, len, ipi_desc[id].priv); > + memset(scp->share_buf, 0, scp_sizes->ipi_share_buffer_size); > + memcpy_fromio(scp->share_buf, _obj->share_buf, len); > + handler(scp->share_buf, len, ipi_desc[id].priv); > scp_ipi_unlock(scp, id); > > scp->ipi_id_ack[id] = true; > @@ -133,6 
+134,8 @@ static int scp_ipi_init(struct mtk_scp *scp, const struct > firmware *fw) > { > int ret; > size_t buf_sz, offset; > + size_t share_buf_offset; > + const struct mtk_scp_sizes_data *scp_sizes; > > /* read the ipi buf addr from FW itself first */ > ret = scp_elf_read_ipi_buf_addr(scp, fw, &offset); > @@ -152,12 +155,15 @@ static int scp_ipi_init(struct mtk_scp *scp, const > struct firmware *fw) > return -EOVERFLOW; > } > > + scp_sizes = scp->data->scp_sizes; >
Re: [PATCH v2] uprobes: reduce contention on uprobes_tree access
On Mon, 22 Apr 2024 03:23:05 -0700 Jonathan Haslam wrote: > Active uprobes are stored in an RB tree and accesses to this tree are > dominated by read operations. Currently these accesses are serialized by > a spinlock but this leads to enormous contention when large numbers of > threads are executing active probes. > > This patch converts the spinlock used to serialize access to the > uprobes_tree RB tree into a reader-writer spinlock. This lock type > aligns naturally with the overwhelmingly read-only nature of the tree > usage here. Although the addition of reader-writer spinlocks are > discouraged [0], this fix is proposed as an interim solution while an > RCU based approach is implemented (that work is in a nascent form). This > fix also has the benefit of being trivial, self contained and therefore > simple to backport. > > We have used a uprobe benchmark from the BPF selftests [1] to estimate > the improvements. Each block of results below show 1 line per execution > of the benchmark ("the "Summary" line) and each line is a run with one > more thread added - a thread is a "producer". The lines are edited to > remove extraneous output. 
> > The tests were executed with this driver script: > > for num_threads in {1..20} > do > sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary > done > > SPINLOCK (BEFORE) > == > Summary: hits1.396 ± 0.007M/s ( 1.396M/prod) > Summary: hits1.656 ± 0.016M/s ( 0.828M/prod) > Summary: hits2.246 ± 0.008M/s ( 0.749M/prod) > Summary: hits2.114 ± 0.010M/s ( 0.529M/prod) > Summary: hits2.013 ± 0.009M/s ( 0.403M/prod) > Summary: hits1.753 ± 0.008M/s ( 0.292M/prod) > Summary: hits1.847 ± 0.001M/s ( 0.264M/prod) > Summary: hits1.889 ± 0.001M/s ( 0.236M/prod) > Summary: hits1.833 ± 0.006M/s ( 0.204M/prod) > Summary: hits1.900 ± 0.003M/s ( 0.190M/prod) > Summary: hits1.918 ± 0.006M/s ( 0.174M/prod) > Summary: hits1.925 ± 0.002M/s ( 0.160M/prod) > Summary: hits1.837 ± 0.001M/s ( 0.141M/prod) > Summary: hits1.898 ± 0.001M/s ( 0.136M/prod) > Summary: hits1.799 ± 0.016M/s ( 0.120M/prod) > Summary: hits1.850 ± 0.005M/s ( 0.109M/prod) > Summary: hits1.816 ± 0.002M/s ( 0.101M/prod) > Summary: hits1.787 ± 0.001M/s ( 0.094M/prod) > Summary: hits1.764 ± 0.002M/s ( 0.088M/prod) > > RW SPINLOCK (AFTER) > === > Summary: hits1.444 ± 0.020M/s ( 1.444M/prod) > Summary: hits2.279 ± 0.011M/s ( 1.139M/prod) > Summary: hits3.422 ± 0.014M/s ( 1.141M/prod) > Summary: hits3.565 ± 0.017M/s ( 0.891M/prod) > Summary: hits2.671 ± 0.013M/s ( 0.534M/prod) > Summary: hits2.409 ± 0.005M/s ( 0.401M/prod) > Summary: hits2.485 ± 0.008M/s ( 0.355M/prod) > Summary: hits2.496 ± 0.003M/s ( 0.312M/prod) > Summary: hits2.585 ± 0.002M/s ( 0.287M/prod) > Summary: hits2.908 ± 0.011M/s ( 0.291M/prod) > Summary: hits2.346 ± 0.016M/s ( 0.213M/prod) > Summary: hits2.804 ± 0.004M/s ( 0.234M/prod) > Summary: hits2.556 ± 0.001M/s ( 0.197M/prod) > Summary: hits2.754 ± 0.004M/s ( 0.197M/prod) > Summary: hits2.482 ± 0.002M/s ( 0.165M/prod) > Summary: hits2.412 ± 0.005M/s ( 0.151M/prod) > Summary: hits2.710 ± 0.003M/s ( 0.159M/prod) > Summary: hits2.826 ± 0.005M/s ( 0.157M/prod) > Summary: hits2.718 ± 0.001M/s ( 
0.143M/prod) > Summary: hits2.844 ± 0.006M/s ( 0.142M/prod) > > The numbers in parenthesis give averaged throughput per thread which is > of greatest interest here as a measure of scalability. Improvements are > in the order of 22 - 68% with this particular benchmark (mean = 43%). > > V2: > - Updated commit message to include benchmark results. > > [0] https://docs.kernel.org/locking/spinlocks.html > [1] > https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c Thanks for update! This looks good to me. Let me pick this for probes/for-next. Thank you, > > Signed-off-by: Jonathan Haslam > --- > kernel/events/uprobes.c | 22 +++--- > 1 file changed, 11 insertions(+), 11 deletions(-) > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c > index e4834d23e1d1..8ae0eefc3a34 100644 > --- a/kernel/events/uprobes.c > +++ b/kernel/events/uprobes.c > @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; > */ > #define no_uprobe_events() RB_EMPTY_ROOT(_tree) > > -static DEFINE_SPINLOCK(uprobes_treelock);/* serialize rbtree access */ > +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ > > #define UPROBES_HASH_SZ 13 > /* serialize uprobe->pending_list */ > @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, > loff_t offset) > { > struct uprobe *uprobe; > > - spin_lock(_treelock); > + read_lock(_treelock); > uprobe = __find_uprobe(inode, offset); > - spin_unlock(_treelock); > + read_unlock(_treelock); > >
Re: [PATCH v2] uprobes: reduce contention on uprobes_tree access
On Mon, 22 Apr 2024 13:39:32 +0200 Jiri Olsa wrote: > On Mon, Apr 22, 2024 at 03:23:05AM -0700, Jonathan Haslam wrote: > > Active uprobes are stored in an RB tree and accesses to this tree are > > dominated by read operations. Currently these accesses are serialized by > > a spinlock but this leads to enormous contention when large numbers of > > threads are executing active probes. > > > > This patch converts the spinlock used to serialize access to the > > uprobes_tree RB tree into a reader-writer spinlock. This lock type > > aligns naturally with the overwhelmingly read-only nature of the tree > > usage here. Although the addition of reader-writer spinlocks are > > discouraged [0], this fix is proposed as an interim solution while an > > RCU based approach is implemented (that work is in a nascent form). This > > fix also has the benefit of being trivial, self contained and therefore > > simple to backport. > > > > We have used a uprobe benchmark from the BPF selftests [1] to estimate > > the improvements. Each block of results below show 1 line per execution > > of the benchmark ("the "Summary" line) and each line is a run with one > > more thread added - a thread is a "producer". The lines are edited to > > remove extraneous output. 
> > > > The tests were executed with this driver script: > > > > for num_threads in {1..20} > > do > > sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary > > done > > > > SPINLOCK (BEFORE) > > == > > Summary: hits1.396 ± 0.007M/s ( 1.396M/prod) > > Summary: hits1.656 ± 0.016M/s ( 0.828M/prod) > > Summary: hits2.246 ± 0.008M/s ( 0.749M/prod) > > Summary: hits2.114 ± 0.010M/s ( 0.529M/prod) > > Summary: hits2.013 ± 0.009M/s ( 0.403M/prod) > > Summary: hits1.753 ± 0.008M/s ( 0.292M/prod) > > Summary: hits1.847 ± 0.001M/s ( 0.264M/prod) > > Summary: hits1.889 ± 0.001M/s ( 0.236M/prod) > > Summary: hits1.833 ± 0.006M/s ( 0.204M/prod) > > Summary: hits1.900 ± 0.003M/s ( 0.190M/prod) > > Summary: hits1.918 ± 0.006M/s ( 0.174M/prod) > > Summary: hits1.925 ± 0.002M/s ( 0.160M/prod) > > Summary: hits1.837 ± 0.001M/s ( 0.141M/prod) > > Summary: hits1.898 ± 0.001M/s ( 0.136M/prod) > > Summary: hits1.799 ± 0.016M/s ( 0.120M/prod) > > Summary: hits1.850 ± 0.005M/s ( 0.109M/prod) > > Summary: hits1.816 ± 0.002M/s ( 0.101M/prod) > > Summary: hits1.787 ± 0.001M/s ( 0.094M/prod) > > Summary: hits1.764 ± 0.002M/s ( 0.088M/prod) > > > > RW SPINLOCK (AFTER) > > === > > Summary: hits1.444 ± 0.020M/s ( 1.444M/prod) > > Summary: hits2.279 ± 0.011M/s ( 1.139M/prod) > > Summary: hits3.422 ± 0.014M/s ( 1.141M/prod) > > Summary: hits3.565 ± 0.017M/s ( 0.891M/prod) > > Summary: hits2.671 ± 0.013M/s ( 0.534M/prod) > > Summary: hits2.409 ± 0.005M/s ( 0.401M/prod) > > Summary: hits2.485 ± 0.008M/s ( 0.355M/prod) > > Summary: hits2.496 ± 0.003M/s ( 0.312M/prod) > > Summary: hits2.585 ± 0.002M/s ( 0.287M/prod) > > Summary: hits2.908 ± 0.011M/s ( 0.291M/prod) > > Summary: hits2.346 ± 0.016M/s ( 0.213M/prod) > > Summary: hits2.804 ± 0.004M/s ( 0.234M/prod) > > Summary: hits2.556 ± 0.001M/s ( 0.197M/prod) > > Summary: hits2.754 ± 0.004M/s ( 0.197M/prod) > > Summary: hits2.482 ± 0.002M/s ( 0.165M/prod) > > Summary: hits2.412 ± 0.005M/s ( 0.151M/prod) > > Summary: hits2.710 ± 0.003M/s ( 
0.159M/prod) > > Summary: hits2.826 ± 0.005M/s ( 0.157M/prod) > > Summary: hits2.718 ± 0.001M/s ( 0.143M/prod) > > Summary: hits2.844 ± 0.006M/s ( 0.142M/prod) > > nice, I'm assuming Masami will take this one.. in any case: > > Acked-by: Jiri Olsa Thanks Jiri! This looks good to me too. Let me pick this for probes/for-next. Thank you, > > thanks, > jirka > > > > > The numbers in parenthesis give averaged throughput per thread which is > > of greatest interest here as a measure of scalability. Improvements are > > in the order of 22 - 68% with this particular benchmark (mean = 43%). > > > > V2: > > - Updated commit message to include benchmark results. > > > > [0] https://docs.kernel.org/locking/spinlocks.html > > [1] > > https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c > > > > Signed-off-by: Jonathan Haslam > > --- > > kernel/events/uprobes.c | 22 +++--- > > 1 file changed, 11 insertions(+), 11 deletions(-) > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c > > index e4834d23e1d1..8ae0eefc3a34 100644 > > --- a/kernel/events/uprobes.c > > +++ b/kernel/events/uprobes.c > > @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; > > */ > > #define no_uprobe_events() RB_EMPTY_ROOT(_tree) > > > > -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ > > +static DEFINE_RWLOCK(uprobes_treelock);/* serialize rbtree access */ > > > > #define
Re: [PATCHv3 bpf-next 0/7] uprobe: uretprobe speed up
Hi Jiri, On Sun, 21 Apr 2024 21:41:59 +0200 Jiri Olsa wrote: > hi, > as part of the effort on speeding up the uprobes [0] coming with > return uprobe optimization by using syscall instead of the trap > on the uretprobe trampoline. > > The speed up depends on instruction type that uprobe is installed > and depends on specific HW type, please check patch 1 for details. > > Patches 1-6 are based on bpf-next/master, but patch 1 and 2 are > apply-able on linux-trace.git tree probes/for-next branch. > Patch 7 is based on man-pages master. Thanks for the update! I reviewed the series and just except for the manpage, it looks good to me. Reviewed-by: Masami Hiramatsu (Google) for the series. If Linux API maintainers are OK, I can pick this in probes/for-next. (BTW, who will pick the manpage patch?) Thank you, > > v3 changes: > - added source ip check if the uretprobe syscall is called from > trampoline and sending SIGILL to process if it's not > - keep x86 compat process to use standard breakpoint > - split syscall wiring into separate change > - ran ltp and syzkaller locally, no issues found [Masami] > - building uprobe_compat binary in selftests which breaks > CI atm because of missing 32-bit delve packages, I will > need to fix that in separate changes once this is acked > - added man page change > - there were several changes so I removed acks [Oleg Andrii] > > Also available at: > https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git > uretprobe_syscall > > thanks, > jirka > > > Notes to check list items in Documentation/process/adding-syscalls.rst: > > - System Call Alternatives > New syscall seems like the best way in here, because we need > just to quickly enter kernel with no extra arguments processing, > which we'd need to do if we decided to use another syscall. > > - Designing the API: Planning for Extension > The uretprobe syscall is very specific and most likely won't be > extended in the future. 
> > At the moment it does not take any arguments and even if it does > in future, it's allowed to be called only from trampoline prepared > by kernel, so there'll be no broken user. > > - Designing the API: Other Considerations > N/A because uretprobe syscall does not return reference to kernel > object. > > - Proposing the API > Wiring up of the uretprobe system call is in a separate change, > selftests and man page changes are part of the patchset. > > - Generic System Call Implementation > There's no CONFIG option for the new functionality because it > keeps the same behaviour from the user POV. > > - x86 System Call Implementation > It's 64-bit syscall only. > > - Compatibility System Calls (Generic) > N/A uretprobe syscall has no arguments and is not supported > for compat processes. > > - Compatibility System Calls (x86) > N/A uretprobe syscall is not supported for compat processes. > > - System Calls Returning Elsewhere > N/A. > > - Other Details > N/A. > > - Testing > Adding new bpf selftests and ran ltp on top of this change. > > - Man Page > Attached. > > - Do not call System Calls in the Kernel > N/A. 
> > > [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/ > --- > Jiri Olsa (6): > uprobe: Wire up uretprobe system call > uprobe: Add uretprobe syscall to speed up return probe > selftests/bpf: Add uretprobe syscall test for regs integrity > selftests/bpf: Add uretprobe syscall test for regs changes > selftests/bpf: Add uretprobe syscall call from user space test > selftests/bpf: Add uretprobe compat test > > arch/x86/entry/syscalls/syscall_64.tbl| 1 + > arch/x86/kernel/uprobes.c | 115 > ++ > include/linux/syscalls.h | 2 + > include/linux/uprobes.h | 3 + > include/uapi/asm-generic/unistd.h | 5 +- > kernel/events/uprobes.c | 24 +-- > kernel/sys_ni.c | 2 + > tools/include/linux/compiler.h| 4 ++ > tools/testing/selftests/bpf/.gitignore| 1 + > tools/testing/selftests/bpf/Makefile | 6 +- > tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 123 > +++- > tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 362 > + > tools/testing/selftests/bpf/progs/uprobe_syscall.c| 15 > tools/testing/selftests/bpf/progs/uprobe_syscall_call.c | 15 > tools/testing/selftests/bpf/progs/uprobe_syscall_compat.c | 13 > 15 files changed, 681 insertions(+), 10 deletions(-) > create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c > create mode 100644
Re: [PATCH 7/7] man2: Add uretprobe syscall page
On Sun, 21 Apr 2024 21:42:06 +0200 Jiri Olsa wrote: > Adding man page for new uretprobe syscall. > > Signed-off-by: Jiri Olsa > --- > man2/uretprobe.2 | 40 > 1 file changed, 40 insertions(+) > create mode 100644 man2/uretprobe.2 > > diff --git a/man2/uretprobe.2 b/man2/uretprobe.2 > new file mode 100644 > index ..c0343a88bb57 > --- /dev/null > +++ b/man2/uretprobe.2 > @@ -0,0 +1,40 @@ > +.\" Copyright (C) 2024, Jiri Olsa > +.\" > +.\" SPDX-License-Identifier: Linux-man-pages-copyleft > +.\" > +.TH uretprobe 2 (date) "Linux man-pages (unreleased)" > +.SH NAME > +uretprobe \- execute pending return uprobes > +.SH SYNOPSIS > +.nf > +.B int uretprobe(void) > +.fi > +.SH DESCRIPTION > +On x86_64 architecture the kernel is using uretprobe syscall to trigger > +uprobe return probe consumers instead of using standard breakpoint > instruction. > +The reason is that it's much faster to do syscall than breakpoint trap > +on x86_64 architecture. Do we specify the supported architecture as this? Currently it is supported only on x86-64, but it could be extended later, right? This should be just noted as NOTES. Something like "This syscall is initially introduced on x86-64 because a syscall is faster than a breakpoint trap on it. But this will be extended to the architectures whose syscall is faster than breakpoint trap." Thank you, > + > +The uretprobe syscall is not supposed to be called directly by user, it's > allowed > +to be invoked only through user space trampoline provided by kernel. > +When called from outside of this trampoline, the calling process will receive > +.BR SIGILL . > + > +.SH RETURN VALUE > +.BR uretprobe() > +return value is specific for given architecture. > + > +.SH VERSIONS > +This syscall is not specified in POSIX, > +and details of its behavior vary across systems. > +.SH STANDARDS > +None. > +.SH NOTES > +.BR uretprobe() > +exists only to allow the invocation of return uprobe consumers. > +It should > +.B never > +be called directly. 
> +Details of the arguments (if any) passed to > +.BR uretprobe () > +and the return value are specific for given architecture. > -- > 2.44.0 > -- Masami Hiramatsu (Google)
Re: [PATCH virt] virt: fix uninit-value in vhost_vsock_dev_open
On Mon, Apr 22, 2024 at 09:00:31AM -0400, Stefan Hajnoczi wrote: > On Sun, Apr 21, 2024 at 12:06:06PM +0900, Jeongjun Park wrote: > > static bool vhost_transport_seqpacket_allow(u32 remote_cid) > > { > > > > vsock = vhost_vsock_get(remote_cid); > > > > if (vsock) > > seqpacket_allow = vsock->seqpacket_allow; > > > > } > > > > I think this is due to reading a previously created uninitialized > > vsock->seqpacket_allow inside vhost_transport_seqpacket_allow(), > > which is executed by the function pointer present in the if statement. > > CCing Arseny, author of commit ced7b713711f ("vhost/vsock: support > SEQPACKET for transport"). > > Looks like a genuine bug in the commit. vhost_vsock_set_features() sets > seqpacket_allow to true when the feature is negotiated. The assumption > is that the field defaults to false. > > The rest of the vhost_vsock.ko code is written to initialize the > vhost_vsock fields, so you could argue seqpacket_allow should just be > explicitly initialized to false. > > However, eliminating this class of errors by zeroing seems reasonable in > this code path. vhost_vsock_dev_open() is not performance-critical. > > Acked-by: Stefan Hajnoczi But now that it's explained, the bugfix as proposed is incomplete: userspace can set features twice and the second time will leak old VIRTIO_VSOCK_F_SEQPACKET bit value. And I am pretty sure the Fixes tag is wrong. So I wrote this, but I actually don't have a set for seqpacket to test this. Arseny could you help test maybe? Thanks! commit bcc17a060d93b198d8a17a9b87b593f41337ee28 Author: Michael S. Tsirkin Date: Mon Apr 22 10:03:13 2024 -0400 vhost/vsock: always initialize seqpacket_allow There are two issues around seqpacket_allow: 1. seqpacket_allow is not initialized when socket is created. Thus if features are never set, it will be read uninitialized. 2. 
if VIRTIO_VSOCK_F_SEQPACKET is set and then cleared, then seqpacket_allow will not be cleared appropriately (existing apps I know about don't usually do this but it's legal and there's no way to be sure no one relies on this). To fix: - initialize seqpacket_allow after allocation - set it unconditionally in set_features Reported-by: syzbot+6c21aeb59d0e82eb2...@syzkaller.appspotmail.com Reported-by: Jeongjun Park Fixes: ced7b713711f ("vhost/vsock: support SEQPACKET for transport"). Cc: Arseny Krasnov Cc: David S. Miller Cc: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index ec20ecff85c7..bf664ec9341b 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -667,6 +667,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) } vsock->guest_cid = 0; /* no CID assigned yet */ + vsock->seqpacket_allow = false; atomic_set(>queued_replies, 0); @@ -810,8 +811,7 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) goto err; } - if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET)) - vsock->seqpacket_allow = true; + vsock->seqpacket_allow = features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET); for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { vq = >vqs[i];
Re: [syzbot] [virt?] [net?] KMSAN: uninit-value in vsock_assign_transport (2)
On Fri, Apr 19, 2024 at 02:39:20AM -0700, syzbot wrote: > Hello, > > syzbot found the following issue on: > > HEAD commit:8cd26fd90c1a Merge tag 'for-6.9-rc4-tag' of git://git.kern.. > git tree: upstream > console+strace: https://syzkaller.appspot.com/x/log.txt?x=102d27cd18 > kernel config: https://syzkaller.appspot.com/x/.config?x=87a805e655619c64 > dashboard link: https://syzkaller.appspot.com/bug?extid=6c21aeb59d0e82eb2782 > compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) > 2.40 > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=16e38c3b18 > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=10e62fed18 > > Downloadable assets: > disk image: > https://storage.googleapis.com/syzbot-assets/488822aee24a/disk-8cd26fd9.raw.xz > vmlinux: > https://storage.googleapis.com/syzbot-assets/ba40e322ba00/vmlinux-8cd26fd9.xz > kernel image: > https://storage.googleapis.com/syzbot-assets/f30af1dfbc30/bzImage-8cd26fd9.xz > > IMPORTANT: if you fix the issue, please add the following tag to the commit: > Reported-by: syzbot+6c21aeb59d0e82eb2...@syzkaller.appspotmail.com > > = > BUG: KMSAN: uninit-value in vsock_assign_transport+0xb2a/0xb90 > net/vmw_vsock/af_vsock.c:500 > vsock_assign_transport+0xb2a/0xb90 net/vmw_vsock/af_vsock.c:500 > vsock_connect+0x544/0x1560 net/vmw_vsock/af_vsock.c:1393 > __sys_connect_file net/socket.c:2048 [inline] > __sys_connect+0x606/0x690 net/socket.c:2065 > __do_sys_connect net/socket.c:2075 [inline] > __se_sys_connect net/socket.c:2072 [inline] > __x64_sys_connect+0x91/0xe0 net/socket.c:2072 > x64_sys_call+0x3356/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:43 > do_syscall_x64 arch/x86/entry/common.c:52 [inline] > do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 > entry_SYSCALL_64_after_hwframe+0x77/0x7f > > Uninit was created at: > __kmalloc_large_node+0x231/0x370 mm/slub.c:3921 > __do_kmalloc_node mm/slub.c:3954 [inline] > __kmalloc_node+0xb07/0x1060 mm/slub.c:3973 > kmalloc_node include/linux/slab.h:648 
[inline] > kvmalloc_node+0xc0/0x2d0 mm/util.c:634 > kvmalloc include/linux/slab.h:766 [inline] > vhost_vsock_dev_open+0x44/0x510 drivers/vhost/vsock.c:659 > misc_open+0x66b/0x760 drivers/char/misc.c:165 > chrdev_open+0xa5f/0xb80 fs/char_dev.c:414 > do_dentry_open+0x11f1/0x2120 fs/open.c:955 > vfs_open+0x7e/0xa0 fs/open.c:1089 > do_open fs/namei.c:3642 [inline] > path_openat+0x4a3c/0x5b00 fs/namei.c:3799 > do_filp_open+0x20e/0x590 fs/namei.c:3826 > do_sys_openat2+0x1bf/0x2f0 fs/open.c:1406 > do_sys_open fs/open.c:1421 [inline] > __do_sys_openat fs/open.c:1437 [inline] > __se_sys_openat fs/open.c:1432 [inline] > __x64_sys_openat+0x2a1/0x310 fs/open.c:1432 > x64_sys_call+0x3a64/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:258 > do_syscall_x64 arch/x86/entry/common.c:52 [inline] > do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 > entry_SYSCALL_64_after_hwframe+0x77/0x7f > > CPU: 1 PID: 5021 Comm: syz-executor390 Not tainted > 6.9.0-rc4-syzkaller-00038-g8cd26fd90c1a #0 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS > Google 03/27/2024 > = > > > --- > This report is generated by a bot. It may contain errors. > See https://goo.gl/tpsmEJ for more information about syzbot. > syzbot engineers can be reached at syzkal...@googlegroups.com. > > syzbot will keep track of this issue. See: > https://goo.gl/tpsmEJ#status for how to communicate with syzbot. > > If the report is already addressed, let syzbot know by replying with: > #syz fix: exact-commit-title > > If you want syzbot to run the reproducer, reply with: > #syz test: git://repo/address.git branch-or-commit-hash > If you attach or paste a git patch, syzbot will apply it before testing. 
> > If you want to overwrite report's subsystems, reply with: > #syz set subsystems: new-subsystem > (See the list of subsystem names on the web dashboard) > > If the report is a duplicate of another one, reply with: > #syz dup: exact-subject-of-another-report > > If you want to undo deduplication, reply with: > #syz undup #syz test: https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git bcc17a060d93b198d8a17a9b87b593f41337ee28
Re: [PATCH v5 14/15] kprobes: remove dependency on CONFIG_MODULES
On Mon, 22 Apr 2024 12:44:35 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (IBM)" > > kprobes depended on CONFIG_MODULES because it has to allocate memory for > code. > > Since code allocations are now implemented with execmem, kprobes can be > enabled in non-modular kernels. > > Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes inside > modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the > dependency of CONFIG_KPROBES on CONFIG_MODULES. Looks good to me. Acked-by: Masami Hiramatsu (Google) Thank you! > > Signed-off-by: Mike Rapoport (IBM) > --- > arch/Kconfig| 2 +- > include/linux/module.h | 9 ++ > kernel/kprobes.c| 55 +++-- > kernel/trace/trace_kprobe.c | 20 +- > 4 files changed, 63 insertions(+), 23 deletions(-) > > diff --git a/arch/Kconfig b/arch/Kconfig > index 7006f71f0110..a48ce6a488b3 100644 > --- a/arch/Kconfig > +++ b/arch/Kconfig > @@ -52,9 +52,9 @@ config GENERIC_ENTRY > > config KPROBES > bool "Kprobes" > - depends on MODULES > depends on HAVE_KPROBES > select KALLSYMS > + select EXECMEM > select TASKS_RCU if PREEMPTION > help > Kprobes allows you to trap at almost any kernel address and > diff --git a/include/linux/module.h b/include/linux/module.h > index 1153b0d99a80..ffa1c603163c 100644 > --- a/include/linux/module.h > +++ b/include/linux/module.h > @@ -605,6 +605,11 @@ static inline bool module_is_live(struct module *mod) > return mod->state != MODULE_STATE_GOING; > } > > +static inline bool module_is_coming(struct module *mod) > +{ > +return mod->state == MODULE_STATE_COMING; > +} > + > struct module *__module_text_address(unsigned long addr); > struct module *__module_address(unsigned long addr); > bool is_module_address(unsigned long addr); > @@ -857,6 +862,10 @@ void *dereference_module_function_descriptor(struct > module *mod, void *ptr) > return ptr; > } > > +static inline bool module_is_coming(struct module *mod) > +{ > + return false; > +} > #endif /* CONFIG_MODULES */ > > #ifdef CONFIG_SYSFS > diff --git 
a/kernel/kprobes.c b/kernel/kprobes.c > index ddd7cdc16edf..ca2c6cbd42d2 100644 > --- a/kernel/kprobes.c > +++ b/kernel/kprobes.c > @@ -1588,7 +1588,7 @@ static int check_kprobe_address_safe(struct kprobe *p, > } > > /* Get module refcount and reject __init functions for loaded modules. > */ > - if (*probed_mod) { > + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { > /* >* We must hold a refcount of the probed module while updating >* its code to prohibit unexpected unloading. > @@ -1603,12 +1603,13 @@ static int check_kprobe_address_safe(struct kprobe *p, >* kprobes in there. >*/ > if (within_module_init((unsigned long)p->addr, *probed_mod) && > - (*probed_mod)->state != MODULE_STATE_COMING) { > + !module_is_coming(*probed_mod)) { > module_put(*probed_mod); > *probed_mod = NULL; > ret = -ENOENT; > } > } > + > out: > preempt_enable(); > jump_label_unlock(); > @@ -2488,24 +2489,6 @@ int kprobe_add_area_blacklist(unsigned long start, > unsigned long end) > return 0; > } > > -/* Remove all symbols in given area from kprobe blacklist */ > -static void kprobe_remove_area_blacklist(unsigned long start, unsigned long > end) > -{ > - struct kprobe_blacklist_entry *ent, *n; > - > - list_for_each_entry_safe(ent, n, _blacklist, list) { > - if (ent->start_addr < start || ent->start_addr >= end) > - continue; > - list_del(>list); > - kfree(ent); > - } > -} > - > -static void kprobe_remove_ksym_blacklist(unsigned long entry) > -{ > - kprobe_remove_area_blacklist(entry, entry + 1); > -} > - > int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long > *value, > char *type, char *sym) > { > @@ -2570,6 +2553,25 @@ static int __init populate_kprobe_blacklist(unsigned > long *start, > return ret ? 
: arch_populate_kprobe_blacklist(); > } > > +#ifdef CONFIG_MODULES > +/* Remove all symbols in given area from kprobe blacklist */ > +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long > end) > +{ > + struct kprobe_blacklist_entry *ent, *n; > + > + list_for_each_entry_safe(ent, n, _blacklist, list) { > + if (ent->start_addr < start || ent->start_addr >= end) > + continue; > + list_del(>list); > + kfree(ent); > + } > +} > + > +static void kprobe_remove_ksym_blacklist(unsigned long entry) > +{ > + kprobe_remove_area_blacklist(entry, entry + 1); > +} > + > static void add_module_kprobe_blacklist(struct module
Re: [PATCH virt] virt: fix uninit-value in vhost_vsock_dev_open
On Sun, Apr 21, 2024 at 12:06:06PM +0900, Jeongjun Park wrote: > static bool vhost_transport_seqpacket_allow(u32 remote_cid) > { > > vsock = vhost_vsock_get(remote_cid); > > if (vsock) > seqpacket_allow = vsock->seqpacket_allow; > > } > > I think this is due to reading a previously created uninitialized > vsock->seqpacket_allow inside vhost_transport_seqpacket_allow(), > which is executed by the function pointer present in the if statement. CCing Arseny, author of commit ced7b713711f ("vhost/vsock: support SEQPACKET for transport"). Looks like a genuine bug in the commit. vhost_vsock_set_features() sets seqpacket_allow to true when the feature is negotiated. The assumption is that the field defaults to false. The rest of the vhost_vsock.ko code is written to initialize the vhost_vsock fields, so you could argue seqpacket_allow should just be explicitly initialized to false. However, eliminating this class of errors by zeroing seems reasonable in this code path. vhost_vsock_dev_open() is not performance-critical. Acked-by: Stefan Hajnoczi signature.asc Description: PGP signature
Re: [PATCH v5 11/15] arch: make execmem setup available regardless of CONFIG_MODULES
On 22/4/24 11:44, Mike Rapoport wrote: From: "Mike Rapoport (IBM)" execmem does not depend on modules, on the contrary modules use execmem. To make execmem available when CONFIG_MODULES=n, for instance for kprobes, split execmem_params initialization out from arch/*/kernel/module.c and compile it when CONFIG_EXECMEM=y Signed-off-by: Mike Rapoport (IBM) --- arch/arm/kernel/module.c | 43 -- arch/arm/mm/init.c | 45 +++ arch/arm64/kernel/module.c | 140 - arch/arm64/mm/init.c | 140 + arch/loongarch/kernel/module.c | 19 - arch/loongarch/mm/init.c | 21 + arch/mips/kernel/module.c | 22 -- arch/mips/mm/init.c| 23 ++ arch/nios2/kernel/module.c | 20 - arch/nios2/mm/init.c | 21 + arch/parisc/kernel/module.c| 20 - arch/parisc/mm/init.c | 23 +- arch/powerpc/kernel/module.c | 63 --- arch/powerpc/mm/mem.c | 64 +++ arch/riscv/kernel/module.c | 44 --- arch/riscv/mm/init.c | 45 +++ arch/s390/kernel/module.c | 27 --- arch/s390/mm/init.c| 30 +++ arch/sparc/kernel/module.c | 19 - arch/sparc/mm/Makefile | 2 + arch/sparc/mm/execmem.c| 21 + arch/x86/kernel/module.c | 27 --- arch/x86/mm/init.c | 29 +++ 23 files changed, 463 insertions(+), 445 deletions(-) create mode 100644 arch/sparc/mm/execmem.c Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH v5] vp_vdpa: don't allocate unused msix vectors
On Wed, Apr 10, 2024 at 11:30:20AM +0800, lyx634449800 wrote: > From: Yuxue Liu > > When there is a ctlq and it doesn't require interrupt > callbacks,the original method of calculating vectors > wastes hardware msi or msix resources as well as system > IRQ resources. > > When conducting performance testing using testpmd in the > guest os, it was found that the performance was lower compared > to directly using vfio-pci to passthrough the device > > In scenarios where the virtio device in the guest os does > not utilize interrupts, the vdpa driver still configures > the hardware's msix vector. Therefore, the hardware still > sends interrupts to the host os. I just have a question on this part. How come hardware sends interrupts does not guest driver disable them? > Because of this unnecessary > action by the hardware, hardware performance decreases, and > it also affects the performance of the host os. > > Before modification:(interrupt mode) > 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 > 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 > 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config > > After modification:(interrupt mode) > 32: 0 0 1 7 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-0 > 33: 36 0 3 0 PCI-MSI 32769-edge vp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edge vp-vdpa[:00:02.0]-config > > Before modification:(virtio pmd mode for guest os) > 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 > 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 > 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 > 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config > > After modification:(virtio pmd mode for guest os) > 32: 0 0 0 0 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-config > > To verify the use of the virtio PMD mode in the guest operating > system, the following patch needs to be applied to QEMU: > https://lore.kernel.org/all/20240408073311.2049-1-yuxue@jaguarmicro.com > > Signed-off-by: Yuxue Liu > Acked-by: 
Jason Wang > Reviewed-by: Heng Qi > --- > V5: modify the description of the printout when an exception occurs > V4: update the title and assign values to uninitialized variables > V3: delete unused variables and add validation records > V2: fix when allocating IRQs, scan all queues > > drivers/vdpa/virtio_pci/vp_vdpa.c | 22 -- > 1 file changed, 16 insertions(+), 6 deletions(-) > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c > b/drivers/vdpa/virtio_pci/vp_vdpa.c > index df5f4a3bccb5..8de0224e9ec2 100644 > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c > @@ -160,7 +160,13 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) > struct pci_dev *pdev = mdev->pci_dev; > int i, ret, irq; > int queues = vp_vdpa->queues; > - int vectors = queues + 1; > + int vectors = 1; > + int msix_vec = 0; > + > + for (i = 0; i < queues; i++) { > + if (vp_vdpa->vring[i].cb.callback) > + vectors++; > + } > > ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX); > if (ret != vectors) { > @@ -173,9 +179,12 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) > vp_vdpa->vectors = vectors; > > for (i = 0; i < queues; i++) { > + if (!vp_vdpa->vring[i].cb.callback) > + continue; > + > snprintf(vp_vdpa->vring[i].msix_name, VP_VDPA_NAME_SIZE, > "vp-vdpa[%s]-%d\n", pci_name(pdev), i); > - irq = pci_irq_vector(pdev, i); > + irq = pci_irq_vector(pdev, msix_vec); > ret = devm_request_irq(>dev, irq, > vp_vdpa_vq_handler, > 0, vp_vdpa->vring[i].msix_name, > @@ -185,21 +194,22 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) > "vp_vdpa: fail to request irq for vq %d\n", i); > goto err; > } > - vp_modern_queue_vector(mdev, i, i); > + vp_modern_queue_vector(mdev, i, msix_vec); > vp_vdpa->vring[i].irq = irq; > + msix_vec++; > } > > snprintf(vp_vdpa->msix_name, VP_VDPA_NAME_SIZE, "vp-vdpa[%s]-config\n", >pci_name(pdev)); > - irq = pci_irq_vector(pdev, queues); > + irq = pci_irq_vector(pdev, msix_vec); > ret = devm_request_irq(>dev, irq, 
vp_vdpa_config_handler, 0, > vp_vdpa->msix_name, vp_vdpa); > if (ret) { > dev_err(&pdev->dev, > - "vp_vdpa: fail to request irq for vq %d\n", i); > + "vp_vdpa: fail to request irq for config: %d\n", ret); > goto err; > } > -
Re: [PATCH v2] uprobes: reduce contention on uprobes_tree access
On Mon, Apr 22, 2024 at 03:23:05AM -0700, Jonathan Haslam wrote: > Active uprobes are stored in an RB tree and accesses to this tree are > dominated by read operations. Currently these accesses are serialized by > a spinlock but this leads to enormous contention when large numbers of > threads are executing active probes. > > This patch converts the spinlock used to serialize access to the > uprobes_tree RB tree into a reader-writer spinlock. This lock type > aligns naturally with the overwhelmingly read-only nature of the tree > usage here. Although the addition of reader-writer spinlocks are > discouraged [0], this fix is proposed as an interim solution while an > RCU based approach is implemented (that work is in a nascent form). This > fix also has the benefit of being trivial, self contained and therefore > simple to backport. > > We have used a uprobe benchmark from the BPF selftests [1] to estimate > the improvements. Each block of results below show 1 line per execution > of the benchmark ("the "Summary" line) and each line is a run with one > more thread added - a thread is a "producer". The lines are edited to > remove extraneous output. 
> > The tests were executed with this driver script: > > for num_threads in {1..20} > do > sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary > done > > SPINLOCK (BEFORE) > == > Summary: hits1.396 ± 0.007M/s ( 1.396M/prod) > Summary: hits1.656 ± 0.016M/s ( 0.828M/prod) > Summary: hits2.246 ± 0.008M/s ( 0.749M/prod) > Summary: hits2.114 ± 0.010M/s ( 0.529M/prod) > Summary: hits2.013 ± 0.009M/s ( 0.403M/prod) > Summary: hits1.753 ± 0.008M/s ( 0.292M/prod) > Summary: hits1.847 ± 0.001M/s ( 0.264M/prod) > Summary: hits1.889 ± 0.001M/s ( 0.236M/prod) > Summary: hits1.833 ± 0.006M/s ( 0.204M/prod) > Summary: hits1.900 ± 0.003M/s ( 0.190M/prod) > Summary: hits1.918 ± 0.006M/s ( 0.174M/prod) > Summary: hits1.925 ± 0.002M/s ( 0.160M/prod) > Summary: hits1.837 ± 0.001M/s ( 0.141M/prod) > Summary: hits1.898 ± 0.001M/s ( 0.136M/prod) > Summary: hits1.799 ± 0.016M/s ( 0.120M/prod) > Summary: hits1.850 ± 0.005M/s ( 0.109M/prod) > Summary: hits1.816 ± 0.002M/s ( 0.101M/prod) > Summary: hits1.787 ± 0.001M/s ( 0.094M/prod) > Summary: hits1.764 ± 0.002M/s ( 0.088M/prod) > > RW SPINLOCK (AFTER) > === > Summary: hits1.444 ± 0.020M/s ( 1.444M/prod) > Summary: hits2.279 ± 0.011M/s ( 1.139M/prod) > Summary: hits3.422 ± 0.014M/s ( 1.141M/prod) > Summary: hits3.565 ± 0.017M/s ( 0.891M/prod) > Summary: hits2.671 ± 0.013M/s ( 0.534M/prod) > Summary: hits2.409 ± 0.005M/s ( 0.401M/prod) > Summary: hits2.485 ± 0.008M/s ( 0.355M/prod) > Summary: hits2.496 ± 0.003M/s ( 0.312M/prod) > Summary: hits2.585 ± 0.002M/s ( 0.287M/prod) > Summary: hits2.908 ± 0.011M/s ( 0.291M/prod) > Summary: hits2.346 ± 0.016M/s ( 0.213M/prod) > Summary: hits2.804 ± 0.004M/s ( 0.234M/prod) > Summary: hits2.556 ± 0.001M/s ( 0.197M/prod) > Summary: hits2.754 ± 0.004M/s ( 0.197M/prod) > Summary: hits2.482 ± 0.002M/s ( 0.165M/prod) > Summary: hits2.412 ± 0.005M/s ( 0.151M/prod) > Summary: hits2.710 ± 0.003M/s ( 0.159M/prod) > Summary: hits2.826 ± 0.005M/s ( 0.157M/prod) > Summary: hits2.718 ± 0.001M/s ( 
0.143M/prod) > Summary: hits2.844 ± 0.006M/s ( 0.142M/prod) nice, I'm assuming Masami will take this one.. in any case: Acked-by: Jiri Olsa thanks, jirka > > The numbers in parenthesis give averaged throughput per thread which is > of greatest interest here as a measure of scalability. Improvements are > in the order of 22 - 68% with this particular benchmark (mean = 43%). > > V2: > - Updated commit message to include benchmark results. > > [0] https://docs.kernel.org/locking/spinlocks.html > [1] > https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c > > Signed-off-by: Jonathan Haslam > --- > kernel/events/uprobes.c | 22 +++--- > 1 file changed, 11 insertions(+), 11 deletions(-) > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c > index e4834d23e1d1..8ae0eefc3a34 100644 > --- a/kernel/events/uprobes.c > +++ b/kernel/events/uprobes.c > @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; > */ > #define no_uprobe_events() RB_EMPTY_ROOT(_tree) > > -static DEFINE_SPINLOCK(uprobes_treelock);/* serialize rbtree access */ > +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ > > #define UPROBES_HASH_SZ 13 > /* serialize uprobe->pending_list */ > @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, > loff_t offset) > { > struct uprobe *uprobe; > > - spin_lock(_treelock); > + read_lock(_treelock); > uprobe = __find_uprobe(inode, offset); > - spin_unlock(_treelock); > + read_unlock(_treelock);
Re: [PATCH v5] vp_vdpa: don't allocate unused msix vectors
Dear Michael, I hope this email finds you well. I am reaching out to request your assistance in reviewing a patch. The patch in question is titled "[PATCH v5] vp_vdpa: don't allocate unused msix vectors". I believe your expertise and insights would be invaluable in ensuring the quality and effectiveness of this patch. Your feedback and review are highly appreciated. Please let me know if you have any questions or require further information. Thank you for your time and consideration. Best regards, Yuxue Liu -Original Message- From: Gavin Liu Sent: April 10, 2024 11:31 To: m...@redhat.com; jasow...@redhat.com Cc: Angus Chen angus.c...@jaguarmicro.com; virtualizat...@lists.linux.dev; xuanz...@linux.alibaba.com; Gavin Liu gavin@jaguarmicro.com; linux-kernel@vger.kernel.org; Heng Qi hen...@linux.alibaba.com Subject: [PATCH v5] vp_vdpa: don't allocate unused msix vectors From: Yuxue Liu When there is a ctlq and it doesn't require interrupt callbacks,the original method of calculating vectors wastes hardware msi or msix resources as well as system IRQ resources. When conducting performance testing using testpmd in the guest os, it was found that the performance was lower compared to directly using vfio-pci to passthrough the device In scenarios where the virtio device in the guest os does not utilize interrupts, the vdpa driver still configures the hardware's msix vector. Therefore, the hardware still sends interrupts to the host os. Because of this unnecessary action by the hardware, hardware performance decreases, and it also affects the performance of the host os. 
Before modification:(interrupt mode) 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config After modification:(interrupt mode) 32: 0 0 1 7 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-0 33: 36 0 3 0 PCI-MSI 32769-edge vp-vdpa[:00:02.0]-1 34: 0 0 0 0 PCI-MSI 32770-edge vp-vdpa[:00:02.0]-config Before modification:(virtio pmd mode for guest os) 32: 0 0 0 0 PCI-MSI 32768-edgevp-vdpa[:00:02.0]-0 33: 0 0 0 0 PCI-MSI 32769-edgevp-vdpa[:00:02.0]-1 34: 0 0 0 0 PCI-MSI 32770-edgevp-vdpa[:00:02.0]-2 35: 0 0 0 0 PCI-MSI 32771-edgevp-vdpa[:00:02.0]-config After modification:(virtio pmd mode for guest os) 32: 0 0 0 0 PCI-MSI 32768-edge vp-vdpa[:00:02.0]-config To verify the use of the virtio PMD mode in the guest operating system, the following patch needs to be applied to QEMU: https://lore.kernel.org/all/20240408073311.2049-1-yuxue@jaguarmicro.com Signed-off-by: Yuxue Liu Acked-by: Jason Wang Reviewed-by: Heng Qi --- V5: modify the description of the printout when an exception occurs V4: update the title and assign values to uninitialized variables V3: delete unused variables and add validation records V2: fix when allocating IRQs, scan all queues drivers/vdpa/virtio_pci/vp_vdpa.c | 22 -- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index df5f4a3bccb5..8de0224e9ec2 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -160,7 +160,13 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) struct pci_dev *pdev = mdev->pci_dev; int i, ret, irq; int queues = vp_vdpa->queues; - int vectors = queues + 1; + int vectors = 1; + int msix_vec = 0; + + for (i = 0; i < queues; i++) { + if (vp_vdpa->vring[i].cb.callback) + vectors++; + } ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX); if (ret != vectors) { 
@@ -173,9 +179,12 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) vp_vdpa->vectors = vectors; for (i = 0; i < queues; i++) { + if (!vp_vdpa->vring[i].cb.callback) + continue; + snprintf(vp_vdpa->vring[i].msix_name, VP_VDPA_NAME_SIZE, "vp-vdpa[%s]-%d\n", pci_name(pdev), i); - irq = pci_irq_vector(pdev, i); + irq = pci_irq_vector(pdev, msix_vec); ret = devm_request_irq(&pdev->dev, irq, vp_vdpa_vq_handler, 0, vp_vdpa->vring[i].msix_name, @@ -185,21 +194,22 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) "vp_vdpa: fail to request irq for vq %d\n", i); goto err; } - vp_modern_queue_vector(mdev, i, i); + vp_modern_queue_vector(mdev, i, msix_vec); vp_vdpa->vring[i].irq = irq; +
[PATCH v2] uprobes: reduce contention on uprobes_tree access
Active uprobes are stored in an RB tree and accesses to this tree are dominated by read operations. Currently these accesses are serialized by a spinlock but this leads to enormous contention when large numbers of threads are executing active probes. This patch converts the spinlock used to serialize access to the uprobes_tree RB tree into a reader-writer spinlock. This lock type aligns naturally with the overwhelmingly read-only nature of the tree usage here. Although the addition of reader-writer spinlocks are discouraged [0], this fix is proposed as an interim solution while an RCU based approach is implemented (that work is in a nascent form). This fix also has the benefit of being trivial, self contained and therefore simple to backport. We have used a uprobe benchmark from the BPF selftests [1] to estimate the improvements. Each block of results below shows 1 line per execution of the benchmark (the "Summary" line) and each line is a run with one more thread added - a thread is a "producer". The lines are edited to remove extraneous output. 
The tests were executed with this driver script: for num_threads in {1..20} do sudo ./bench -a -p $num_threads trig-uprobe-nop | grep Summary done SPINLOCK (BEFORE) == Summary: hits1.396 ± 0.007M/s ( 1.396M/prod) Summary: hits1.656 ± 0.016M/s ( 0.828M/prod) Summary: hits2.246 ± 0.008M/s ( 0.749M/prod) Summary: hits2.114 ± 0.010M/s ( 0.529M/prod) Summary: hits2.013 ± 0.009M/s ( 0.403M/prod) Summary: hits1.753 ± 0.008M/s ( 0.292M/prod) Summary: hits1.847 ± 0.001M/s ( 0.264M/prod) Summary: hits1.889 ± 0.001M/s ( 0.236M/prod) Summary: hits1.833 ± 0.006M/s ( 0.204M/prod) Summary: hits1.900 ± 0.003M/s ( 0.190M/prod) Summary: hits1.918 ± 0.006M/s ( 0.174M/prod) Summary: hits1.925 ± 0.002M/s ( 0.160M/prod) Summary: hits1.837 ± 0.001M/s ( 0.141M/prod) Summary: hits1.898 ± 0.001M/s ( 0.136M/prod) Summary: hits1.799 ± 0.016M/s ( 0.120M/prod) Summary: hits1.850 ± 0.005M/s ( 0.109M/prod) Summary: hits1.816 ± 0.002M/s ( 0.101M/prod) Summary: hits1.787 ± 0.001M/s ( 0.094M/prod) Summary: hits1.764 ± 0.002M/s ( 0.088M/prod) RW SPINLOCK (AFTER) === Summary: hits1.444 ± 0.020M/s ( 1.444M/prod) Summary: hits2.279 ± 0.011M/s ( 1.139M/prod) Summary: hits3.422 ± 0.014M/s ( 1.141M/prod) Summary: hits3.565 ± 0.017M/s ( 0.891M/prod) Summary: hits2.671 ± 0.013M/s ( 0.534M/prod) Summary: hits2.409 ± 0.005M/s ( 0.401M/prod) Summary: hits2.485 ± 0.008M/s ( 0.355M/prod) Summary: hits2.496 ± 0.003M/s ( 0.312M/prod) Summary: hits2.585 ± 0.002M/s ( 0.287M/prod) Summary: hits2.908 ± 0.011M/s ( 0.291M/prod) Summary: hits2.346 ± 0.016M/s ( 0.213M/prod) Summary: hits2.804 ± 0.004M/s ( 0.234M/prod) Summary: hits2.556 ± 0.001M/s ( 0.197M/prod) Summary: hits2.754 ± 0.004M/s ( 0.197M/prod) Summary: hits2.482 ± 0.002M/s ( 0.165M/prod) Summary: hits2.412 ± 0.005M/s ( 0.151M/prod) Summary: hits2.710 ± 0.003M/s ( 0.159M/prod) Summary: hits2.826 ± 0.005M/s ( 0.157M/prod) Summary: hits2.718 ± 0.001M/s ( 0.143M/prod) Summary: hits2.844 ± 0.006M/s ( 0.142M/prod) The numbers in parenthesis give averaged throughput 
per thread which is of greatest interest here as a measure of scalability. Improvements are in the order of 22 - 68% with this particular benchmark (mean = 43%). V2: - Updated commit message to include benchmark results. [0] https://docs.kernel.org/locking/spinlocks.html [1] https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/benchs/bench_trigger.c Signed-off-by: Jonathan Haslam --- kernel/events/uprobes.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e4834d23e1d1..8ae0eefc3a34 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; */ #define no_uprobe_events() RB_EMPTY_ROOT(_tree) -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +static DEFINE_RWLOCK(uprobes_treelock);/* serialize rbtree access */ #define UPROBES_HASH_SZ13 /* serialize uprobe->pending_list */ @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - spin_lock(_treelock); + read_lock(_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock(_treelock); + read_unlock(_treelock); return uprobe; } @@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; - spin_lock(_treelock); + write_lock(_treelock); u = __insert_uprobe(uprobe); - spin_unlock(_treelock); + write_unlock(_treelock); return u; } @@ -935,9 +935,9 @@ static void
Re: [PATCH v2 2/2] remoteproc: mediatek: Support MT8188 SCP core 1
Il 19/04/24 10:42, Olivia Wen ha scritto: From: "olivia.wen" There are three primary modifications. 1. The struct mtk_scp_of_data usage on MT8188 MT8192 functions are unsuitable for the dual-core MT8188 SCP, which has two RISC-V cores similar to MT8195 but without L1TCM. We've added MT8188-specific functions to configure L1TCM in multicore setups. 2. SCP_IPI_IMGSYS_CMD feature This version also adds SCP_IPI_IMGSYS_CMD to facilitate communication between the imgsys kernel and the backend driver. 3. Different code sizes and IPI share buffer sizes Each SCP necessitates different code and IPI share buffer sizes. Introducing a structure mtk_scp_sizes_data to handle them. Signed-off-by: olivia.wen Reviewed-by: AngeloGioacchino Del Regno
Re: [PATCH v2 1/2] dt-bindings: remoteproc: mediatek: Support MT8188 dual-core SCP
Il 19/04/24 10:42, Olivia Wen ha scritto: From: "olivia.wen" Under different applications, the MT8188 SCP can be used as single-core or dual-core. Signed-off-by: olivia.wen Reviewed-by: AngeloGioacchino Del Regno
Re: [PATCH v2 1/2] dt-bindings: remoteproc: mediatek: Support MT8188 dual-core SCP
Il 19/04/24 10:42, Olivia Wen ha scritto: From: "olivia.wen" Under different applications, the MT8188 SCP can be used as single-core or dual-core. Signed-off-by: olivia.wen Reviewed-by: AngeloGioacchino Del Regno
Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent
On 22/04/2024 11:17, Jason Xing wrote:> On Mon, Apr 22, 2024 at 4:47 PM Matthieu Baerts wrote: >> On 22/04/2024 05:01, Jason Xing wrote: >>> From: Jason Xing (...) >>> diff --git a/include/net/rstreason.h b/include/net/rstreason.h >>> new file mode 100644 >>> index ..c57bc5413c17 >>> --- /dev/null >>> +++ b/include/net/rstreason.h >>> @@ -0,0 +1,144 @@ >>> +/* SPDX-License-Identifier: GPL-2.0-or-later */ >>> + >>> +#ifndef _LINUX_RSTREASON_H >>> +#define _LINUX_RSTREASON_H >>> +#include >>> +#include >>> + >>> +#define DEFINE_RST_REASON(FN, FNe) \ >>> + FN(MPTCP_RST_EUNSPEC) \ >>> + FN(MPTCP_RST_EMPTCP)\ >>> + FN(MPTCP_RST_ERESOURCE) \ >>> + FN(MPTCP_RST_EPROHIBIT) \ >>> + FN(MPTCP_RST_EWQ2BIG) \ >>> + FN(MPTCP_RST_EBADPERF) \ >>> + FN(MPTCP_RST_EMIDDLEBOX)\ >> >> Small detail: should it not make more sense to put the ones linked to >> MPTCP at the end? I mean I guess MPTCP should be treated in second >> priority: CONFIG_MPTCP could not be set, and the ones linked to TCP >> should be more frequent, etc. > > Do you mean that I need to adjust the order: 1) tcp reasons first, 2) > independent reasons, 3) mptcp reasons ? Correct, it looks like it is a more "natural" order. > Reasonable. I will do it :) Thanks! Cheers, Matt -- Sponsored by the NGI0 Core fund.
[PATCH v5 15/15] bpf: remove CONFIG_BPF_JIT dependency on CONFIG_MODULES of
From: "Mike Rapoport (IBM)" BPF just-in-time compiler depended on CONFIG_MODULES because it used module_alloc() to allocate memory for the generated code. Since code allocations are now implemented with execmem, drop dependency of CONFIG_BPF_JIT on CONFIG_MODULES and make it select CONFIG_EXECMEM. Suggested-by: Björn Töpel Signed-off-by: Mike Rapoport (IBM) --- kernel/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index bc25f5098a25..f999e4e0b344 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -43,7 +43,7 @@ config BPF_JIT bool "Enable BPF Just In Time compiler" depends on BPF depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT - depends on MODULES + select EXECMEM help BPF programs are normally handled by a BPF interpreter. This option allows the kernel to generate native code when a program is loaded -- 2.43.0
[PATCH v5 14/15] kprobes: remove dependency on CONFIG_MODULES
From: "Mike Rapoport (IBM)" kprobes depended on CONFIG_MODULES because it has to allocate memory for code. Since code allocations are now implemented with execmem, kprobes can be enabled in non-modular kernels. Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes inside modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the dependency of CONFIG_KPROBES on CONFIG_MODULES. Signed-off-by: Mike Rapoport (IBM) --- arch/Kconfig| 2 +- include/linux/module.h | 9 ++ kernel/kprobes.c| 55 +++-- kernel/trace/trace_kprobe.c | 20 +- 4 files changed, 63 insertions(+), 23 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 7006f71f0110..a48ce6a488b3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -52,9 +52,9 @@ config GENERIC_ENTRY config KPROBES bool "Kprobes" - depends on MODULES depends on HAVE_KPROBES select KALLSYMS + select EXECMEM select TASKS_RCU if PREEMPTION help Kprobes allows you to trap at almost any kernel address and diff --git a/include/linux/module.h b/include/linux/module.h index 1153b0d99a80..ffa1c603163c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -605,6 +605,11 @@ static inline bool module_is_live(struct module *mod) return mod->state != MODULE_STATE_GOING; } +static inline bool module_is_coming(struct module *mod) +{ +return mod->state == MODULE_STATE_COMING; +} + struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); @@ -857,6 +862,10 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr) return ptr; } +static inline bool module_is_coming(struct module *mod) +{ + return false; +} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ddd7cdc16edf..ca2c6cbd42d2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1588,7 +1588,7 @@ static int check_kprobe_address_safe(struct kprobe *p, } /* Get module refcount and reject __init 
functions for loaded modules. */ - if (*probed_mod) { + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. @@ -1603,12 +1603,13 @@ static int check_kprobe_address_safe(struct kprobe *p, * kprobes in there. */ if (within_module_init((unsigned long)p->addr, *probed_mod) && - (*probed_mod)->state != MODULE_STATE_COMING) { + !module_is_coming(*probed_mod)) { module_put(*probed_mod); *probed_mod = NULL; ret = -ENOENT; } } + out: preempt_enable(); jump_label_unlock(); @@ -2488,24 +2489,6 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } -/* Remove all symbols in given area from kprobe blacklist */ -static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) -{ - struct kprobe_blacklist_entry *ent, *n; - - list_for_each_entry_safe(ent, n, _blacklist, list) { - if (ent->start_addr < start || ent->start_addr >= end) - continue; - list_del(>list); - kfree(ent); - } -} - -static void kprobe_remove_ksym_blacklist(unsigned long entry) -{ - kprobe_remove_area_blacklist(entry, entry + 1); -} - int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym) { @@ -2570,6 +2553,25 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? 
: arch_populate_kprobe_blacklist(); } +#ifdef CONFIG_MODULES +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, _blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(>list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; @@ -2672,6 +2674,17 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +static int kprobe_register_module_notifier(void) +{ + return register_module_notifier(_module_nb); +} +#else +static int kprobe_register_module_notifier(void) +{ +
[PATCH v5 13/15] powerpc: use CONFIG_EXECMEM instead of CONFIG_MODULES where appropriate
From: "Mike Rapoport (IBM)" There are places where CONFIG_MODULES guards the code that depends on memory allocation being done with module_alloc(). Replace CONFIG_MODULES with CONFIG_EXECMEM in such places. Signed-off-by: Mike Rapoport (IBM) --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/kasan.h | 2 +- arch/powerpc/kernel/head_8xx.S | 4 ++-- arch/powerpc/kernel/head_book3s_32.S | 6 +++--- arch/powerpc/lib/code-patching.c | 2 +- arch/powerpc/mm/book3s32/mmu.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1c4be3373686..2e586733a464 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -285,7 +285,7 @@ config PPC select IOMMU_HELPER if PPC64 select IRQ_DOMAIN select IRQ_FORCED_THREADING - select KASAN_VMALLOCif KASAN && MODULES + select KASAN_VMALLOCif KASAN && EXECMEM select LOCK_MM_AND_FIND_VMA select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index 365d2720097c..b5bbb94c51f6 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -19,7 +19,7 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 -#if defined(CONFIG_MODULES) && defined(CONFIG_PPC32) +#if defined(CONFIG_EXECMEM) && defined(CONFIG_PPC32) #define KASAN_KERN_START ALIGN_DOWN(PAGE_OFFSET - SZ_256M, SZ_256M) #else #define KASAN_KERN_START PAGE_OFFSET diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 647b0b445e89..edc479a7c2bc 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -199,12 +199,12 @@ instruction_counter: mfspr r10, SPRN_SRR0 /* Get effective address of fault */ INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11) mtspr SPRN_MD_EPN, r10 -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM mfcrr11 compare_to_kernel_boundary r10, r10 #endif mfspr r10, SPRN_M_TWB /* Get level 1 table */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM blt+3f rlwinm r10, r10, 
0, 20, 31 orisr10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index c1d89764dd22..57196883a00e 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -419,14 +419,14 @@ InstructionTLBMiss: */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 #endif mfspr r2, SPRN_SDR1 li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC rlwinm r2, r2, 28, 0xf000 -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM li r0, 3 bgt-112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ @@ -442,7 +442,7 @@ InstructionTLBMiss: andc. r1,r1,r2/* check access & ~permission */ bne-InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM rlwimi r2, r0, 0, 31, 31 /* userspace ? 
-> PP lsb */ #endif ori r1, r1, 0xe06 /* clear out reserved bits */ diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index c6ab46156cda..7af791446ddf 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -225,7 +225,7 @@ void __init poking_init(void) static unsigned long get_patch_pfn(void *addr) { - if (IS_ENABLED(CONFIG_MODULES) && is_vmalloc_or_module_addr(addr)) + if (IS_ENABLED(CONFIG_EXECMEM) && is_vmalloc_or_module_addr(addr)) return vmalloc_to_pfn(addr); else return __pa_symbol(addr) >> PAGE_SHIFT; diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 100f999871bc..625fe7d08e06 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -184,7 +184,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) static bool is_module_segment(unsigned long addr) { - if (!IS_ENABLED(CONFIG_MODULES)) + if (!IS_ENABLED(CONFIG_EXECMEM)) return false; if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M)) return false; -- 2.43.0
[PATCH v5 12/15] x86/ftrace: enable dynamic ftrace without CONFIG_MODULES
From: "Mike Rapoport (IBM)" Dynamic ftrace must allocate memory for code and this was impossible without CONFIG_MODULES. With execmem separated from the modules code, execmem_text_alloc() is available regardless of CONFIG_MODULES. Remove dependency of dynamic ftrace on CONFIG_MODULES and make CONFIG_DYNAMIC_FTRACE select CONFIG_EXECMEM in Kconfig. Signed-off-by: Mike Rapoport (IBM) --- arch/x86/Kconfig | 1 + arch/x86/kernel/ftrace.c | 10 -- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3f5ba72c9480..cd8addb96a0b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86_64 select SWIOTLB select ARCH_HAS_ELFCORE_COMPAT select ZONE_DMA32 + select EXECMEM if DYNAMIC_FTRACE config FORCE_DYNAMIC_FTRACE def_bool y diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c8ddb7abda7c..8da0e66ca22d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -261,8 +261,6 @@ void arch_ftrace_update_code(int command) /* Currently only x86_64 supports dynamic trampolines */ #ifdef CONFIG_X86_64 -#ifdef CONFIG_MODULES -/* Module allocation simplifies allocating memory for code */ static inline void *alloc_tramp(unsigned long size) { return execmem_alloc(EXECMEM_FTRACE, size); @@ -271,14 +269,6 @@ static inline void tramp_free(void *tramp) { execmem_free(tramp); } -#else -/* Trampolines can only be created if modules are supported */ -static inline void *alloc_tramp(unsigned long size) -{ - return NULL; -} -static inline void tramp_free(void *tramp) { } -#endif /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); -- 2.43.0
[PATCH v5 11/15] arch: make execmem setup available regardless of CONFIG_MODULES
From: "Mike Rapoport (IBM)" execmem does not depend on modules, on the contrary modules use execmem. To make execmem available when CONFIG_MODULES=n, for instance for kprobes, split execmem_params initialization out from arch/*/kernel/module.c and compile it when CONFIG_EXECMEM=y Signed-off-by: Mike Rapoport (IBM) --- arch/arm/kernel/module.c | 43 -- arch/arm/mm/init.c | 45 +++ arch/arm64/kernel/module.c | 140 - arch/arm64/mm/init.c | 140 + arch/loongarch/kernel/module.c | 19 - arch/loongarch/mm/init.c | 21 + arch/mips/kernel/module.c | 22 -- arch/mips/mm/init.c| 23 ++ arch/nios2/kernel/module.c | 20 - arch/nios2/mm/init.c | 21 + arch/parisc/kernel/module.c| 20 - arch/parisc/mm/init.c | 23 +- arch/powerpc/kernel/module.c | 63 --- arch/powerpc/mm/mem.c | 64 +++ arch/riscv/kernel/module.c | 44 --- arch/riscv/mm/init.c | 45 +++ arch/s390/kernel/module.c | 27 --- arch/s390/mm/init.c| 30 +++ arch/sparc/kernel/module.c | 19 - arch/sparc/mm/Makefile | 2 + arch/sparc/mm/execmem.c| 21 + arch/x86/kernel/module.c | 27 --- arch/x86/mm/init.c | 29 +++ 23 files changed, 463 insertions(+), 445 deletions(-) create mode 100644 arch/sparc/mm/execmem.c diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index a98fdf6ff26c..677f218f7e84 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -12,57 +12,14 @@ #include #include #include -#include #include #include -#include -#include #include #include #include #include -#ifdef CONFIG_XIP_KERNEL -/* - * The XIP kernel text is mapped in the module area for modules and - * some other stuff to work without any indirect relocations. - * MODULES_VADDR is redefined here and not in asm/memory.h to avoid - * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
- */ -#undef MODULES_VADDR -#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) -#endif - -#ifdef CONFIG_MMU -static struct execmem_info execmem_info __ro_after_init; - -struct execmem_info __init *execmem_arch_setup(void) -{ - unsigned long fallback_start = 0, fallback_end = 0; - - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { - fallback_start = VMALLOC_START; - fallback_end = VMALLOC_END; - } - - execmem_info = (struct execmem_info){ - .ranges = { - [EXECMEM_DEFAULT] = { - .start = MODULES_VADDR, - .end= MODULES_END, - .pgprot = PAGE_KERNEL_EXEC, - .alignment = 1, - .fallback_start = fallback_start, - .fallback_end = fallback_end, - }, - }, - }; - - return _info; -} -#endif - bool module_init_section(const char *name) { return strstarts(name, ".init") || diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index e8c6f4be0ce1..5345d218899a 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -486,3 +487,47 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif + +#ifdef CONFIG_EXECMEM + +#ifdef CONFIG_XIP_KERNEL +/* + * The XIP kernel text is mapped in the module area for modules and + * some other stuff to work without any indirect relocations. + * MODULES_VADDR is redefined here and not in asm/memory.h to avoid + * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
+ */ +#undef MODULES_VADDR +#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) +#endif + +#ifdef CONFIG_MMU +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) +{ + unsigned long fallback_start = 0, fallback_end = 0; + + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { + fallback_start = VMALLOC_START; + fallback_end = VMALLOC_END; + } + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + }, +
[PATCH v5 10/15] powerpc: extend execmem_params for kprobes allocations
From: "Mike Rapoport (IBM)" powerpc overrides kprobes::alloc_insn_page() to remove writable permissions when STRICT_MODULE_RWX is on. Add definition of EXECMEM_KPROBES to execmem_params to allow using the generic kprobes::alloc_insn_page() with the desired permissions. As powerpc uses breakpoint instructions to inject kprobes, it does not need to constrain kprobe allocations to the modules area and can use the entire vmalloc address space. Signed-off-by: Mike Rapoport (IBM) --- arch/powerpc/kernel/kprobes.c | 20 arch/powerpc/kernel/module.c | 7 +++ 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 9fcd01bb2ce6..14c5ddec3056 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -126,26 +126,6 @@ kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offse return (kprobe_opcode_t *)(addr + offset); } -void *alloc_insn_page(void) -{ - void *page; - - page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); - if (!page) - return NULL; - - if (strict_module_rwx_enabled()) { - int err = set_memory_rox((unsigned long)page, 1); - - if (err) - goto error; - } - return page; -error: - execmem_free(page); - return NULL; -} - int arch_prepare_kprobe(struct kprobe *p) { int ret = 0; diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index ac80559015a3..2a23cf7e141b 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -94,6 +94,7 @@ static struct execmem_info execmem_info __ro_after_init; struct execmem_info __init *execmem_arch_setup(void) { + pgprot_t kprobes_prot = strict_module_rwx_enabled() ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; pgprot_t prot = strict_module_rwx_enabled() ? 
PAGE_KERNEL : PAGE_KERNEL_EXEC; unsigned long fallback_start = 0, fallback_end = 0; unsigned long start, end; @@ -132,6 +133,12 @@ struct execmem_info __init *execmem_arch_setup(void) .fallback_start = fallback_start, .fallback_end = fallback_end, }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end= VMALLOC_END, + .pgprot = kprobes_prot, + .alignment = 1, + }, [EXECMEM_MODULE_DATA] = { .start = VMALLOC_START, .end= VMALLOC_END, -- 2.43.0
[PATCH v5 09/15] riscv: extend execmem_params for generated code allocations
From: "Mike Rapoport (IBM)" The memory allocations for kprobes and BPF on RISC-V are not placed in the modules area and these custom allocations are implemented with overrides of alloc_insn_page() and bpf_jit_alloc_exec(). Slightly reorder execmem_params initialization to support both 32 and 64 bit variants, define EXECMEM_KPROBES and EXECMEM_BPF ranges in riscv::execmem_params and drop overrides of alloc_insn_page() and bpf_jit_alloc_exec(). Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Alexandre Ghiti --- arch/riscv/kernel/module.c | 28 +--- arch/riscv/kernel/probes/kprobes.c | 10 -- arch/riscv/net/bpf_jit_core.c | 13 - 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 182904127ba0..2ecbacbc9993 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -906,19 +906,41 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, return 0; } -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +#ifdef CONFIG_MMU static struct execmem_info execmem_info __ro_after_init; struct execmem_info __init *execmem_arch_setup(void) { + unsigned long start, end; + + if (IS_ENABLED(CONFIG_64BIT)) { + start = MODULES_VADDR; + end = MODULES_END; + } else { + start = VMALLOC_START; + end = VMALLOC_END; + } + execmem_info = (struct execmem_info){ .ranges = { [EXECMEM_DEFAULT] = { - .start = MODULES_VADDR, - .end= MODULES_END, + .start = start, + .end= end, .pgprot = PAGE_KERNEL, .alignment = 1, }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end= VMALLOC_END, + .pgprot = PAGE_KERNEL_READ_EXEC, + .alignment = 1, + }, + [EXECMEM_BPF] = { + .start = BPF_JIT_REGION_START, + .end= BPF_JIT_REGION_END, + .pgprot = PAGE_KERNEL, + .alignment = PAGE_SIZE, + }, }, }; diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 2f08c14a933d..e64f2f3064eb 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -104,16 +104,6 @@ 
int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } -#ifdef CONFIG_MMU -void *alloc_insn_page(void) -{ - return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, -GFP_KERNEL, PAGE_KERNEL_READ_EXEC, -VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, -__builtin_return_address(0)); -} -#endif - /* install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 6b3acac30c06..e238fdbd5dbc 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -219,19 +219,6 @@ u64 bpf_jit_alloc_exec_limit(void) return BPF_JIT_REGION_SIZE; } -void *bpf_jit_alloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, - BPF_JIT_REGION_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} - -void bpf_jit_free_exec(void *addr) -{ - return vfree(addr); -} - void *bpf_arch_text_copy(void *dst, void *src, size_t len) { int ret; -- 2.43.0
[PATCH v5 08/15] mm/execmem, arch: convert remaining overrides of module_alloc to execmem
From: "Mike Rapoport (IBM)" Extend execmem parameters to accommodate more complex overrides of module_alloc() by architectures. This includes specification of a fallback range required by arm, arm64 and powerpc, EXECMEM_MODULE_DATA type required by powerpc, support for allocation of KASAN shadow required by s390 and x86 and support for early initialization of execmem required by x86. The core implementation of execmem_alloc() takes care of suppressing warnings when the initial allocation fails but there is a fallback range defined. Signed-off-by: Mike Rapoport (IBM) Acked-by: Will Deacon --- arch/Kconfig | 6 +++ arch/arm/kernel/module.c | 41 ++--- arch/arm64/kernel/module.c | 67 ++-- arch/arm64/kernel/probes/kprobes.c | 7 --- arch/arm64/net/bpf_jit_comp.c | 11 - arch/powerpc/kernel/module.c | 60 - arch/s390/kernel/module.c | 54 ++- arch/x86/Kconfig | 1 + arch/x86/kernel/module.c | 70 ++ include/linux/execmem.h| 34 +++ include/linux/moduleloader.h | 12 - kernel/module/main.c | 26 +++ mm/execmem.c | 70 +- mm/mm_init.c | 2 + 14 files changed, 259 insertions(+), 202 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 65afb1de48b3..7006f71f0110 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -960,6 +960,12 @@ config ARCH_WANTS_MODULES_DATA_IN_VMALLOC For architectures like powerpc/32 which have constraints on module allocation and need to allocate module data outside of module area. +config ARCH_WANTS_EXECMEM_EARLY + bool + help + For architectures that might allocate executable memory early on + boot, for instance ftrace on x86. 
+ config HAVE_IRQ_EXIT_ON_IRQ_STACK bool help diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index e74d84f58b77..a98fdf6ff26c 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -34,23 +35,31 @@ #endif #ifdef CONFIG_MMU -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - /* Silence the initial allocation */ - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) - gfp_mask |= __GFP_NOWARN; - - p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) - return p; - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + unsigned long fallback_start = 0, fallback_end = 0; + + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { + fallback_start = VMALLOC_START; + fallback_end = VMALLOC_END; + } + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + }, + }; + + return _info; } #endif diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index e92da4da1b2a..a52240ea084b 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -108,41 +109,59 @@ static int __init module_init_limits(void) return 0; } -subsys_initcall(module_init_limits); -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - void *p = NULL; + unsigned long 
fallback_start = 0, fallback_end = 0; + unsigned long start = 0, end = 0; + + module_init_limits(); /* * Where possible, prefer to allocate within direct branch range of the * kernel such that no PLTs are necessary. */ if (module_direct_base) { - p = __vmalloc_node_range(size, MODULE_ALIGN, -module_direct_base, -
[PATCH v5 07/15] mm/execmem, arch: convert simple overrides of module_alloc to execmem
From: "Mike Rapoport (IBM)" Several architectures override module_alloc() only to define address range for code allocations different than VMALLOC address space. Provide a generic implementation in execmem that uses the parameters for address space ranges, required alignment and page protections provided by architectures. The architectures must fill execmem_info structure and implement execmem_arch_setup() that returns a pointer to that structure. This way the execmem initialization won't be called from every architecture, but rather from a central place, namely a core_initcall() in execmem. The execmem provides execmem_alloc() API that wraps __vmalloc_node_range() with the parameters defined by the architectures. If an architecture does not implement execmem_arch_setup(), execmem_alloc() will fall back to module_alloc(). Signed-off-by: Mike Rapoport (IBM) --- arch/loongarch/kernel/module.c | 19 +++-- arch/mips/kernel/module.c | 20 -- arch/nios2/kernel/module.c | 21 +++--- arch/parisc/kernel/module.c| 24 +++ arch/riscv/kernel/module.c | 24 +++ arch/sparc/kernel/module.c | 20 -- include/linux/execmem.h| 41 +++ mm/execmem.c | 73 -- 8 files changed, 208 insertions(+), 34 deletions(-) diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c index c7d0338d12c1..ca6dd7ea1610 100644 --- a/arch/loongarch/kernel/module.c +++ b/arch/loongarch/kernel/module.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -490,10 +491,22 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, return 0; } -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + 
.pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return _info; } static void module_init_ftrace_plt(const Elf_Ehdr *hdr, diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 9a6c96014904..59225a3cf918 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -20,6 +20,7 @@ #include #include #include +#include #include struct mips_hi16 { @@ -32,11 +33,22 @@ static LIST_HEAD(dbe_list); static DEFINE_SPINLOCK(dbe_lock); #ifdef MODULES_VADDR -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return _info; } #endif diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c index 9c97b7513853..0d1ee86631fc 100644 --- a/arch/nios2/kernel/module.c +++ b/arch/nios2/kernel/module.c @@ -18,15 +18,26 @@ #include #include #include +#include #include -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, - VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, - __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + }, + }, + }; + + return _info; } int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index
[PATCH v5 06/15] mm: introduce execmem_alloc() and execmem_free()
From: "Mike Rapoport (IBM)" module_alloc() is used everywhere as a means to allocate memory for code. Besides being semantically wrong, this unnecessarily ties all subsystems that need to allocate code, such as ftrace, kprobes and BPF to modules and puts the burden of code allocation to the modules code. Several architectures override module_alloc() because of various constraints where the executable memory can be located and this causes additional obstacles for improvements of code allocation. Start splitting code allocation from modules by introducing execmem_alloc() and execmem_free() APIs. Initially, execmem_alloc() is a wrapper for module_alloc() and execmem_free() is a replacement of module_memfree() to allow updating all call sites to use the new APIs. Since architectures define different restrictions on placement, permissions, alignment and other parameters for memory that can be used by different subsystems that allocate executable memory, execmem_alloc() takes a type argument, that will be used to identify the calling subsystem and to allow architectures to define parameters for ranges suitable for that subsystem. No functional changes. 
Signed-off-by: Mike Rapoport (IBM) Acked-by: Masami Hiramatsu (Google) --- arch/powerpc/kernel/kprobes.c| 6 ++-- arch/s390/kernel/ftrace.c| 4 +-- arch/s390/kernel/kprobes.c | 4 +-- arch/s390/kernel/module.c| 5 +-- arch/sparc/net/bpf_jit_comp_32.c | 8 ++--- arch/x86/kernel/ftrace.c | 6 ++-- arch/x86/kernel/kprobes/core.c | 4 +-- include/linux/execmem.h | 57 include/linux/moduleloader.h | 3 -- kernel/bpf/core.c| 6 ++-- kernel/kprobes.c | 8 ++--- kernel/module/Kconfig| 1 + kernel/module/main.c | 25 +- mm/Kconfig | 3 ++ mm/Makefile | 1 + mm/execmem.c | 32 ++ 16 files changed, 128 insertions(+), 45 deletions(-) create mode 100644 include/linux/execmem.h create mode 100644 mm/execmem.c diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bbca90a5e2ec..9fcd01bb2ce6 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -130,7 +130,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; @@ -142,7 +142,7 @@ void *alloc_insn_page(void) } return page; error: - module_memfree(page); + execmem_free(page); return NULL; } diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index c46381ea04ec..798249ef5646 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -7,13 +7,13 @@ * Author(s): Martin Schwidefsky */ -#include #include #include #include #include #include #include +#include #include #include #include @@ -220,7 +220,7 @@ static int __init ftrace_plt_init(void) { const char *start, *end; - ftrace_plt = module_alloc(PAGE_SIZE); + ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE); if (!ftrace_plt) panic("cannot allocate ftrace plt\n"); diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index f0cf20d4b3c5..3c1b1be744de 100644 --- a/arch/s390/kernel/kprobes.c +++ 
b/arch/s390/kernel/kprobes.c @@ -9,7 +9,6 @@ #define pr_fmt(fmt) "kprobes: " fmt -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +38,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; set_memory_rox((unsigned long)page, 1); diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 42215f9404af..ac97a905e8cd 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,7 @@ void *module_alloc(unsigned long size) #ifdef CONFIG_FUNCTION_TRACER void module_arch_cleanup(struct module *mod) { - module_memfree(mod->arch.trampolines_start); + execmem_free(mod->arch.trampolines_start); } #endif @@ -510,7 +511,7 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); numpages = DIV_ROUND_UP(size, PAGE_SIZE); - start = module_alloc(numpages * PAGE_SIZE); + start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE); if (!start) return -ENOMEM;
[PATCH v5 05/15] module: make module_memory_{alloc,free} more self-contained
From: "Mike Rapoport (IBM)" Move the logic related to the memory allocation and freeing into module_memory_alloc() and module_memory_free(). Signed-off-by: Mike Rapoport (IBM) --- kernel/module/main.c | 64 +++- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/kernel/module/main.c b/kernel/module/main.c index e1e8a7a9d6c1..5b82b069e0d3 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1203,15 +1203,44 @@ static bool mod_mem_use_vmalloc(enum mod_mem_type type) mod_mem_type_is_core_data(type); } -static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) +static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { + unsigned int size = PAGE_ALIGN(mod->mem[type].size); + void *ptr; + + mod->mem[type].size = size; + if (mod_mem_use_vmalloc(type)) - return vzalloc(size); - return module_alloc(size); + ptr = vmalloc(size); + else + ptr = module_alloc(size); + + if (!ptr) + return -ENOMEM; + + /* +* The pointer to these blocks of memory are stored on the module +* structure and we keep that around so long as the module is +* around. We only free that memory when we unload the module. +* Just mark them as not being a leak then. The .init* ELF +* sections *do* get freed after boot so we *could* treat them +* slightly differently with kmemleak_ignore() and only grey +* them out as they work as typical memory allocations which +* *do* eventually get freed, but let's just keep things simple +* and avoid *any* false positives. +*/ + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; + + return 0; } -static void module_memory_free(void *ptr, enum mod_mem_type type) +static void module_memory_free(struct module *mod, enum mod_mem_type type) { + void *ptr = mod->mem[type].base; + if (mod_mem_use_vmalloc(type)) vfree(ptr); else @@ -1229,12 +1258,12 @@ static void free_mod_mem(struct module *mod) /* Free lock-classes; relies on the preceding sync_rcu(). 
*/ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod_mem->base, type); + module_memory_free(mod, type); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); + module_memory_free(mod, MOD_DATA); } /* Free a module, remove from lists, etc. */ @@ -2225,7 +2254,6 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { int i; - void *ptr; enum mod_mem_type t = 0; int ret = -ENOMEM; @@ -2234,26 +2262,12 @@ static int move_module(struct module *mod, struct load_info *info) mod->mem[type].base = NULL; continue; } - mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size); - ptr = module_memory_alloc(mod->mem[type].size, type); - /* - * The pointer to these blocks of memory are stored on the module - * structure and we keep that around so long as the module is - * around. We only free that memory when we unload the module. - * Just mark them as not being a leak then. The .init* ELF - * sections *do* get freed after boot so we *could* treat them - * slightly differently with kmemleak_ignore() and only grey - * them out as they work as typical memory allocations which - * *do* eventually get freed, but let's just keep things simple - * and avoid *any* false positives. -*/ - kmemleak_not_leak(ptr); - if (!ptr) { + + ret = module_memory_alloc(mod, type); + if (ret) { t = type; goto out_enomem; } - memset(ptr, 0, mod->mem[type].size); - mod->mem[type].base = ptr; } /* Transfer each section which specifies SHF_ALLOC */ @@ -2296,7 +2310,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_enomem: for (t--; t >= 0; t--) - module_memory_free(mod->mem[t].base, t); + module_memory_free(mod, t); return ret; } -- 2.43.0
[PATCH v5 04/15] sparc: simplify module_alloc()
From: "Mike Rapoport (IBM)" Define MODULES_VADDR and MODULES_END as VMALLOC_START and VMALLOC_END for 32-bit and reduce module_alloc() to __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, ...) as with the new defines the allocations become identical for both 32 and 64 bits. While at it, drop unused include of Suggested-by: Sam Ravnborg Signed-off-by: Mike Rapoport (IBM) --- arch/sparc/include/asm/pgtable_32.h | 2 ++ arch/sparc/kernel/module.c | 25 + 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 9e85d57ac3f2..62bcafe38b1f 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -432,6 +432,8 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, #define VMALLOC_START _AC(0xfe60,UL) #define VMALLOC_END _AC(0xffc0,UL) +#define MODULES_VADDR VMALLOC_START +#define MODULES_END VMALLOC_END /* We provide our own get_unmapped_area to cope with VA holes for userland */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index 66c45a2764bc..d37adb2a0b54 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -21,35 +21,12 @@ #include "entry.h" -#ifdef CONFIG_SPARC64 - -#include - -static void *module_map(unsigned long size) +void *module_alloc(unsigned long size) { - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } -#else -static void *module_map(unsigned long size) -{ - return vmalloc(size); -} -#endif /* CONFIG_SPARC64 */ - -void *module_alloc(unsigned long size) -{ - void *ret; - - ret = module_map(size); - if (ret) - memset(ret, 0, size); - - return ret; -} /* Make generic code ignore STT_REGISTER dummy undefined symbols. */ int module_frob_arch_sections(Elf_Ehdr *hdr, -- 2.43.0
[PATCH v5 03/15] nios2: define virtual address space for modules
From: "Mike Rapoport (IBM)" nios2 uses kmalloc() to implement module_alloc() because CALL26/PCREL26 cannot reach all of vmalloc address space. Define module space as 32MiB below the kernel base and switch nios2 to use vmalloc for module allocations. Suggested-by: Thomas Gleixner Acked-by: Dinh Nguyen Acked-by: Song Liu Signed-off-by: Mike Rapoport (IBM) --- arch/nios2/include/asm/pgtable.h | 5 - arch/nios2/kernel/module.c | 19 --- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index d052dfcbe8d3..eab87c6beacb 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -25,7 +25,10 @@ #include #define VMALLOC_START CONFIG_NIOS2_KERNEL_MMU_REGION_BASE -#define VMALLOC_END(CONFIG_NIOS2_KERNEL_REGION_BASE - 1) +#define VMALLOC_END(CONFIG_NIOS2_KERNEL_REGION_BASE - SZ_32M - 1) + +#define MODULES_VADDR (CONFIG_NIOS2_KERNEL_REGION_BASE - SZ_32M) +#define MODULES_END(CONFIG_NIOS2_KERNEL_REGION_BASE - 1) struct mm_struct; diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c index 76e0a42d6e36..9c97b7513853 100644 --- a/arch/nios2/kernel/module.c +++ b/arch/nios2/kernel/module.c @@ -21,23 +21,12 @@ #include -/* - * Modules should NOT be allocated with kmalloc for (obvious) reasons. - * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach - * from 0x8000 (vmalloc area) to 0xc (kernel) (kmalloc returns - * addresses in 0xc000) - */ void *module_alloc(unsigned long size) { - if (size == 0) - return NULL; - return kmalloc(size, GFP_KERNEL); -} - -/* Free memory returned from module_alloc */ -void module_memfree(void *module_region) -{ - kfree(module_region); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, + VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, + __builtin_return_address(0)); } int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, -- 2.43.0
[PATCH v5 02/15] mips: module: rename MODULE_START to MODULES_VADDR
From: "Mike Rapoport (IBM)" and MODULE_END to MODULES_END to match other architectures that define custom address space for modules. Signed-off-by: Mike Rapoport (IBM) --- arch/mips/include/asm/pgtable-64.h | 4 ++-- arch/mips/kernel/module.c | 4 ++-- arch/mips/mm/fault.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 20ca48c1b606..c0109aff223b 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -147,8 +147,8 @@ #if defined(CONFIG_MODULES) && defined(KBUILD_64BIT_SYM32) && \ VMALLOC_START != CKSSEG /* Load modules into 32bit-compatible segment. */ -#define MODULE_START CKSSEG -#define MODULE_END (FIXADDR_START-2*PAGE_SIZE) +#define MODULES_VADDR CKSSEG +#define MODULES_END(FIXADDR_START-2*PAGE_SIZE) #endif #define pte_ERROR(e) \ diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 7b2fbaa9cac5..9a6c96014904 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -31,10 +31,10 @@ struct mips_hi16 { static LIST_HEAD(dbe_list); static DEFINE_SPINLOCK(dbe_lock); -#ifdef MODULE_START +#ifdef MODULES_VADDR void *module_alloc(unsigned long size) { - return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index aaa9a242ebba..37fedeaca2e9 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -83,8 +83,8 @@ static void __do_page_fault(struct pt_regs *regs, unsigned long write, if (unlikely(address >= VMALLOC_START && address <= VMALLOC_END)) goto VMALLOC_FAULT_TARGET; -#ifdef MODULE_START - if (unlikely(address >= MODULE_START && address < MODULE_END)) +#ifdef MODULES_VADDR + if (unlikely(address >= MODULES_VADDR && address < MODULES_END)) goto VMALLOC_FAULT_TARGET; #endif -- 2.43.0
[PATCH v5 01/15] arm64: module: remove unneeded call to kasan_alloc_module_shadow()
From: "Mike Rapoport (IBM)" Since commit f6f37d9320a1 ("arm64: select KASAN_VMALLOC for SW/HW_TAGS modes") KASAN_VMALLOC is always enabled when KASAN is on. This means that allocations in module_alloc() will be tracked by KASAN protection for vmalloc() and that kasan_alloc_module_shadow() will be always an empty inline and there is no point in calling it. Drop meaningless call to kasan_alloc_module_shadow() from module_alloc(). Signed-off-by: Mike Rapoport (IBM) --- arch/arm64/kernel/module.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 47e0be610bb6..e92da4da1b2a 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -141,11 +141,6 @@ void *module_alloc(unsigned long size) __func__); } - if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { - vfree(p); - return NULL; - } - /* Memory is intended to be executable, reset the pointer tag. */ return kasan_reset_tag(p); } -- 2.43.0
[PATCH v5 00/15] mm: jit/text allocator
From: "Mike Rapoport (IBM)" (something went wrong with the prevois posting, sorry for the noise) Hi, Since v3 I looked into making execmem more of an utility toolbox, as we discussed at LPC with Mark Rutland, but it was getting more hairier than having a struct describing architecture constraints and a type identifying the consumer of execmem. And I do think that having the description of architecture constraints for allocations of executable memory in a single place is better than having it spread all over the place. The patches available via git: https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=execmem/v5 v5 changes: * rebase on v6.9-rc4 to avoid a conflict in kprobes * add copyrights to mm/execmem.c (Luis) * fix spelling (Ingo) * define MODULES_VADDDR for sparc (Sam) * consistently initialize struct execmem_info (Peter) * reduce #ifdefs in function bodies in kprobes (Masami) v4: https://lore.kernel.org/all/20240411160051.2093261-1-r...@kernel.org * rebase on v6.9-rc2 * rename execmem_params to execmem_info and execmem_arch_params() to execmem_arch_setup() * use single execmem_alloc() API instead of execmem_{text,data}_alloc() (Song) * avoid extra copy of execmem parameters (Rick) * run execmem_init() as core_initcall() except for the architectures that may allocated text really early (currently only x86) (Will) * add acks for some of arm64 and riscv changes, thanks Will and Alexandre * new commits: - drop call to kasan_alloc_module_shadow() on arm64 because it's not needed anymore - rename MODULE_START to MODULES_VADDR on MIPS - use CONFIG_EXECMEM instead of CONFIG_MODULES on powerpc as per Christophe: https://lore.kernel.org/all/79062fa3-3402-47b3-8920-9231ad05e...@csgroup.eu/ v3: https://lore.kernel.org/all/20230918072955.2507221-1-r...@kernel.org * add type parameter to execmem allocation APIs * remove BPF dependency on modules v2: https://lore.kernel.org/all/20230616085038.4121892-1-r...@kernel.org * Separate "module" and "others" 
allocations with execmem_text_alloc() and jit_text_alloc() * Drop ROX entailment on x86 * Add ack for nios2 changes, thanks Dinh Nguyen v1: https://lore.kernel.org/all/20230601101257.530867-1-r...@kernel.org = Cover letter from v1 (slightly updated) = module_alloc() is used everywhere as a means to allocate memory for code. Beside being semantically wrong, this unnecessarily ties all subsystems that need to allocate code, such as ftrace, kprobes and BPF to modules and puts the burden of code allocation to the modules code. Several architectures override module_alloc() because of various constraints where the executable memory can be located and this causes additional obstacles for improvements of code allocation. A centralized infrastructure for code allocation allows allocations of executable memory as ROX, and future optimizations such as caching large pages for better iTLB performance and providing sub-page allocations for users that only need small jit code snippets. Rick Edgecombe proposed perm_alloc extension to vmalloc [1] and Song Liu proposed execmem_alloc [2], but both these approaches were targeting BPF allocations and lacked the ground work to abstract executable allocations and split them from the modules core. Thomas Gleixner suggested to express module allocation restrictions and requirements as struct mod_alloc_type_params [3] that would define ranges, protections and other parameters for different types of allocations used by modules and following that suggestion Song separated allocations of different types in modules (commit ac3b43283923 ("module: replace module_layout with module_memory")) and posted "Type aware module allocator" set [4]. I liked the idea of parametrising code allocation requirements as a structure, but I believe the original proposal and Song's module allocator was too module centric, so I came up with these patches. 
This set splits code allocation from modules by introducing execmem_alloc() and execmem_free() APIs, replaces call sites of module_alloc() and module_memfree() with the new APIs and implements core text and related allocations in a central place. Instead of architecture specific overrides for module_alloc(), the architectures that require non-default behaviour for text allocation must fill execmem_info structure and implement execmem_arch_setup() that returns a pointer to that structure. If an architecture does not implement execmem_arch_setup(), the defaults compatible with the current modules::module_alloc() are used. Since architectures define different restrictions on placement, permissions, alignment and other parameters for memory that can be used by different subsystems that allocate executable memory, execmem APIs take a type argument, that will be used to identify the calling subsystem and to allow architectures to define parameters for ranges suitable for that subsystem. The new infrastructure allows decoupling of BPF, kprobes and ftrace from
Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions
On 19.04.24 20:25, David Hildenbrand wrote: On 06.04.24 19:36, Vincent Donnefort wrote: In preparation for allowing the user-space to map a ring-buffer, add a set of mapping functions: ring_buffer_{map,unmap}() And controls on the ring-buffer: ring_buffer_map_get_reader() /* swap reader and head */ Mapping the ring-buffer also involves: A unique ID for each subbuf of the ring-buffer, currently they are only identified through their in-kernel VA. A meta-page, where are stored ring-buffer statistics and a description for the current reader The linear mapping exposes the meta-page, and each subbuf of the ring-buffer, ordered following their unique ID, assigned during the first mapping. Once mapped, no subbuf can get in or out of the ring-buffer: the buffer size will remain unmodified and the splice enabling functions will in reality simply memcpy the data instead of swapping subbufs. CC: Signed-off-by: Vincent Donnefort diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index dc5ae4e96aee..96d2140b471e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -6,6 +6,8 @@ #include #include +#include + struct trace_buffer; struct ring_buffer_iter; @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); #define trace_rb_cpu_prepare NULL #endif +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma); +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h new file mode 100644 index ..ffcd8dfcaa4f --- /dev/null +++ b/include/uapi/linux/trace_mmap.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _TRACE_MMAP_H_ +#define _TRACE_MMAP_H_ + +#include + +/** + * struct trace_buffer_meta - Ring-buffer Meta-page description + * @meta_page_size:Size 
of this meta-page. + * @meta_struct_len: Size of this structure. + * @subbuf_size: Size of each sub-buffer. + * @nr_subbufs:Number of subbfs in the ring-buffer, including the reader. + * @reader.lost_events:Number of events lost at the time of the reader swap. + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] + * @reader.read: Number of bytes read on the reader subbuf. + * @flags: Placeholder for now, 0 until new features are supported. + * @entries: Number of entries in the ring-buffer. + * @overrun: Number of entries lost in the ring-buffer. + * @read: Number of entries that have been read. + * @Reserved1: Reserved for future use. + * @Reserved2: Reserved for future use. + */ +struct trace_buffer_meta { + __u32 meta_page_size; + __u32 meta_struct_len; + + __u32 subbuf_size; + __u32 nr_subbufs; + + struct { + __u64 lost_events; + __u32 id; + __u32 read; + } reader; + + __u64 flags; + + __u64 entries; + __u64 overrun; + __u64 read; + + __u64 Reserved1; + __u64 Reserved2; +}; + +#endif /* _TRACE_MMAP_H_ */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc9ebe593571..793ecc454039 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include #include @@ -338,6 +340,7 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned longreal_end; /* real end of data */ unsigned order; /* order of the page */ + u32 id;/* ID for external mapping */ struct buffer_data_page *page; /* Actual data page */ }; @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; + + unsigned intmapped; + struct mutexmapping_lock; + unsigned long *subbuf_ids;/* ID to subbuf VA */ + struct trace_buffer_meta*meta_page; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ 
longnr_pages_to_update; struct list_headnew_pages; /* new pages to add */ @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_irq_work(_buffer->irq_work.work,
Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent
Hello Matthieu, On Mon, Apr 22, 2024 at 4:47 PM Matthieu Baerts wrote: > > Hi Jason, > > On 22/04/2024 05:01, Jason Xing wrote: > > From: Jason Xing > > > > Add a new standalone file for the easy future extension to support > > both active reset and passive reset in the TCP/DCCP/MPTCP protocols. > > Thank you for looking at that! Thanks for the review! > > (...) > > > diff --git a/include/net/rstreason.h b/include/net/rstreason.h > > new file mode 100644 > > index ..c57bc5413c17 > > --- /dev/null > > +++ b/include/net/rstreason.h > > @@ -0,0 +1,144 @@ > > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > > + > > +#ifndef _LINUX_RSTREASON_H > > +#define _LINUX_RSTREASON_H > > +#include > > +#include > > + > > +#define DEFINE_RST_REASON(FN, FNe) \ > > + FN(MPTCP_RST_EUNSPEC) \ > > + FN(MPTCP_RST_EMPTCP)\ > > + FN(MPTCP_RST_ERESOURCE) \ > > + FN(MPTCP_RST_EPROHIBIT) \ > > + FN(MPTCP_RST_EWQ2BIG) \ > > + FN(MPTCP_RST_EBADPERF) \ > > + FN(MPTCP_RST_EMIDDLEBOX)\ > > Small detail: should it not make more sense to put the ones linked to > MPTCP at the end? I mean I guess MPTCP should be treated in second > priority: CONFIG_MPTCP could not be set, and the ones linked to TCP > should be more frequent, etc. Do you mean that I need to adjust the order: 1) tcp reasons first, 2) independent reasons, 3) mptcp reasons ? Reasonable. I will do it :) > > > + FN(NOT_SPECIFIED) \ > > + FN(NO_SOCKET) \ > > + FNe(MAX) > > (...) 
> > > +/* Convert reset reasons in MPTCP to our own enum type */ > > +static inline enum sk_rst_reason convert_mptcpreason(u32 reason) > > +{ > > + switch (reason) { > > + case MPTCP_RST_EUNSPEC: > > + return SK_RST_REASON_MPTCP_RST_EUNSPEC; > > + case MPTCP_RST_EMPTCP: > > + return SK_RST_REASON_MPTCP_RST_EMPTCP; > > + case MPTCP_RST_ERESOURCE: > > + return SK_RST_REASON_MPTCP_RST_ERESOURCE; > > + case MPTCP_RST_EPROHIBIT: > > + return SK_RST_REASON_MPTCP_RST_EPROHIBIT; > > + case MPTCP_RST_EWQ2BIG: > > + return SK_RST_REASON_MPTCP_RST_EWQ2BIG; > > + case MPTCP_RST_EBADPERF: > > + return SK_RST_REASON_MPTCP_RST_EBADPERF; > > + case MPTCP_RST_EMIDDLEBOX: > > + return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX; > > + default: > > + /** > > + * It should not happen, or else errors may occur > > + * in MPTCP layer > > + */ > > + return SK_RST_REASON_ERROR; > > + } > > +} > > If this helper is only used on MPTCP, maybe better to move it to > net/mptcp/protocol.h (and to patch 5/7?)? We tried to isolate MPTCP code. Roger that. I will move the helper into protocol.h as well as the patch itself. > > Also, maybe it is just me, but I'm not a big fan of the helper name: > convert_mptcpreason() (same for the "drop" one). I think it should at > least mention its "origin" (rst reason): e.g. something like > (sk_)rst_reason_convert_mptcp or (sk_)rst_convert_mptcp_reason() (or > mptcp_to_rst_reason())? > > And (sk_)rst_reason_convert_(skb_)drop() (or skb_drop_to_rst_reason())? I agree with you. Actually I had a local patch where I used sk_rst_reason_skbdrop() and sk_rst_reason_mptcpreason(). Interestingly, I changed them in this patch series due to the function name being too long (which is my initial thought). I will use sk_rst_convert_xxx_reason() as you suggested. > > > +/* Convert reset reasons in MPTCP to our own enum type */ > > I don't think this part is linked to MPTCP, right? Ah, copy-paste syndrome... Sorry, I will correct it. 
> > > +static inline enum sk_rst_reason convert_dropreason(enum skb_drop_reason > > reason) > > +{ > > + switch (reason) { > > + case SKB_DROP_REASON_NOT_SPECIFIED: > > + return SK_RST_REASON_NOT_SPECIFIED; > > + case SKB_DROP_REASON_NO_SOCKET: > > + return SK_RST_REASON_NO_SOCKET; > > + default: > > + /* If we don't have our own corresponding reason */ > > + return SK_RST_REASON_NOT_SPECIFIED; > > + } > > +} > > (This helper could be introduced in patch 4/7 because it is not used > before, but I'm fine either ways.) Good. It makes more sense. Thanks, Jason
Re: [PATCH v2 1/4] virtio_balloon: separate vm events into a function
On 22.04.24 09:42, zhenwei pi wrote: All the VM events related statistics have dependence on 'CONFIG_VM_EVENT_COUNTERS', once any stack variable is required by any VM events in future, we would have codes like: #ifdef CONFIG_VM_EVENT_COUNTERS unsigned long foo; #endif ... #ifdef CONFIG_VM_EVENT_COUNTERS foo = events[XXX] + events[YYY]; update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo); #endif Separate vm events into a single function, also remove 'CONFIG_VM_EVENT_COUNTERS' from 'update_balloon_stats'. Signed-off-by: zhenwei pi --- drivers/virtio/virtio_balloon.c | 44 ++--- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1f5b3dd31fcf..59fe157e5722 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -316,34 +316,48 @@ static inline void update_stat(struct virtio_balloon *vb, int idx, #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) -static unsigned int update_balloon_stats(struct virtio_balloon *vb) +/* Return the number of entries filled by vm events */ +static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb, + unsigned int start) { +#ifdef CONFIG_VM_EVENT_COUNTERS unsigned long events[NR_VM_EVENT_ITEMS]; - struct sysinfo i; - unsigned int idx = 0; - long available; - unsigned long caches; + unsigned int idx = start; all_vm_events(events); - si_meminfo(); - - available = si_mem_available(); - caches = global_node_page_state(NR_FILE_PAGES); - -#ifdef CONFIG_VM_EVENT_COUNTERS update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, - pages_to_bytes(events[PSWPIN])); + pages_to_bytes(events[PSWPIN])); update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, - pages_to_bytes(events[PSWPOUT])); + pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); + #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, 
events[HTLB_BUDDY_PGALLOC]); update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, events[HTLB_BUDDY_PGALLOC_FAIL]); -#endif -#endif +#endif /* CONFIG_HUGETLB_PAGE */ + + return idx - start; +#else /* CONFIG_VM_EVENT_COUNTERS */ + + return 0; +#endif /* CONFIG_VM_EVENT_COUNTERS */ +} + +static unsigned int update_balloon_stats(struct virtio_balloon *vb) +{ + struct sysinfo i; + unsigned int idx = 0; + long available; + unsigned long caches; + + idx += update_balloon_vm_stats(vb, idx); No need to handle idx that complicated now. Just do unsigned int idx; idx = update_balloon_vm_stats(vb); We can go down that path if we ever want to rearrange the code and not have the vm_stats first. + + si_meminfo(); + available = si_mem_available(); + caches = global_node_page_state(NR_FILE_PAGES); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, pages_to_bytes(i.freeram)); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, -- Cheers, David / dhildenb
Re: [PATCH v2 1/4] virtio_balloon: separate vm events into a function
On 22.04.24 10:04, zhenwei pi wrote: On 4/22/24 15:47, David Hildenbrand wrote: On 22.04.24 09:42, zhenwei pi wrote: All the VM events related statistics have dependence on 'CONFIG_VM_EVENT_COUNTERS', once any stack variable is required by any VM events in future, we would have codes like: #ifdef CONFIG_VM_EVENT_COUNTERS unsigned long foo; #endif ... #ifdef CONFIG_VM_EVENT_COUNTERS foo = events[XXX] + events[YYY]; update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo); #endif Separate vm events into a single function, also remove Why not simply use __maybe_unused for that variable? 1> static unsigned int update_balloon_stats() { unsigned __maybe_unused long foo; ... #ifdef CONFIG_VM_EVENT_COUNTERS foo = events[XXX] + events[YYY]; update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo); #endif } 2> static inline unsigned int update_balloon_vm_stats() { #ifdef CONFIG_VM_EVENT_COUNTERS unsigned long foo; foo = events[XXX] + events[YYY]; update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo); #endif } From the point of my view, I don't need to compile code in my brain when reading codes for case 2. :) But for #1? :) I mean, you didn't compile the code in your brain when you sent out v1 :P But I agree that moving that to a separate function is cleaner, staring at resulting update_balloon_stats(). Let me comment on some nits as a fresh reply. -- Cheers, David / dhildenb
Re: [PATCH v2 1/4] virtio_balloon: separate vm events into a function
On Mon, Apr 22, 2024 at 03:42:51PM +0800, zhenwei pi wrote: > All the VM events related statistics have dependence on > 'CONFIG_VM_EVENT_COUNTERS', once any stack variable is required by any > VM events in future, we would have codes like: > #ifdef CONFIG_VM_EVENT_COUNTERS > unsigned long foo; > #endif > ... > #ifdef CONFIG_VM_EVENT_COUNTERS > foo = events[XXX] + events[YYY]; > update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo); > #endif > > Separate vm events into a single function, also remove > 'CONFIG_VM_EVENT_COUNTERS' from 'update_balloon_stats'. > > Signed-off-by: zhenwei pi > --- > drivers/virtio/virtio_balloon.c | 44 ++--- > 1 file changed, 29 insertions(+), 15 deletions(-) > > diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c > index 1f5b3dd31fcf..59fe157e5722 100644 > --- a/drivers/virtio/virtio_balloon.c > +++ b/drivers/virtio/virtio_balloon.c > @@ -316,34 +316,48 @@ static inline void update_stat(struct virtio_balloon > *vb, int idx, > > #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) > > -static unsigned int update_balloon_stats(struct virtio_balloon *vb) > +/* Return the number of entries filled by vm events */ > +static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb, > +unsigned int start) > { > +#ifdef CONFIG_VM_EVENT_COUNTERS > unsigned long events[NR_VM_EVENT_ITEMS]; > - struct sysinfo i; > - unsigned int idx = 0; > - long available; > - unsigned long caches; > + unsigned int idx = start; > > all_vm_events(events); > - si_meminfo(); > - > - available = si_mem_available(); > - caches = global_node_page_state(NR_FILE_PAGES); > - > -#ifdef CONFIG_VM_EVENT_COUNTERS > update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, > - pages_to_bytes(events[PSWPIN])); > + pages_to_bytes(events[PSWPIN])); > update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, > - pages_to_bytes(events[PSWPOUT])); > + pages_to_bytes(events[PSWPOUT])); > update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); > 
update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); > + > #ifdef CONFIG_HUGETLB_PAGE > update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, > events[HTLB_BUDDY_PGALLOC]); > update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, > events[HTLB_BUDDY_PGALLOC_FAIL]); > -#endif > -#endif > +#endif /* CONFIG_HUGETLB_PAGE */ > + > + return idx - start; > +#else /* CONFIG_VM_EVENT_COUNTERS */ > + > + return 0; > +#endif /* CONFIG_VM_EVENT_COUNTERS */ > +} > + Generally the preferred style is this: #ifdef . static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb, unsigned int start) { } #else /* CONFIG_VM_EVENT_COUNTERS */ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb, unsigned int start) { return 0; } #endif however given it was a spaghetti of ifdefs even before that, the patch's ok I think. > +static unsigned int update_balloon_stats(struct virtio_balloon *vb) > +{ > + struct sysinfo i; > + unsigned int idx = 0; > + long available; > + unsigned long caches; > + > + idx += update_balloon_vm_stats(vb, idx); > + > + si_meminfo(); > + available = si_mem_available(); > + caches = global_node_page_state(NR_FILE_PAGES); > update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, > pages_to_bytes(i.freeram)); > update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, > -- > 2.34.1
[PATCH v5 15/15] bpf: remove CONFIG_BPF_JIT dependency on CONFIG_MODULES of
From: "Mike Rapoport (IBM)" BPF just-in-time compiler depended on CONFIG_MODULES because it used module_alloc() to allocate memory for the generated code. Since code allocations are now implemented with execmem, drop dependency of CONFIG_BPF_JIT on CONFIG_MODULES and make it select CONFIG_EXECMEM. Suggested-by: Björn Töpel Signed-off-by: Mike Rapoport (IBM) --- kernel/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index bc25f5098a25..f999e4e0b344 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -43,7 +43,7 @@ config BPF_JIT bool "Enable BPF Just In Time compiler" depends on BPF depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT - depends on MODULES + select EXECMEM help BPF programs are normally handled by a BPF interpreter. This option allows the kernel to generate native code when a program is loaded -- 2.43.0
[PATCH v5 14/15] kprobes: remove dependency on CONFIG_MODULES
From: "Mike Rapoport (IBM)" kprobes depended on CONFIG_MODULES because it has to allocate memory for code. Since code allocations are now implemented with execmem, kprobes can be enabled in non-modular kernels. Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes inside modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the dependency of CONFIG_KPROBES on CONFIG_MODULES. Signed-off-by: Mike Rapoport (IBM) --- arch/Kconfig| 2 +- include/linux/module.h | 9 ++ kernel/kprobes.c| 55 +++-- kernel/trace/trace_kprobe.c | 20 +- 4 files changed, 63 insertions(+), 23 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 7006f71f0110..a48ce6a488b3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -52,9 +52,9 @@ config GENERIC_ENTRY config KPROBES bool "Kprobes" - depends on MODULES depends on HAVE_KPROBES select KALLSYMS + select EXECMEM select TASKS_RCU if PREEMPTION help Kprobes allows you to trap at almost any kernel address and diff --git a/include/linux/module.h b/include/linux/module.h index 1153b0d99a80..ffa1c603163c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -605,6 +605,11 @@ static inline bool module_is_live(struct module *mod) return mod->state != MODULE_STATE_GOING; } +static inline bool module_is_coming(struct module *mod) +{ +return mod->state == MODULE_STATE_COMING; +} + struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); @@ -857,6 +862,10 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr) return ptr; } +static inline bool module_is_coming(struct module *mod) +{ + return false; +} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ddd7cdc16edf..ca2c6cbd42d2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1588,7 +1588,7 @@ static int check_kprobe_address_safe(struct kprobe *p, } /* Get module refcount and reject __init 
functions for loaded modules. */ - if (*probed_mod) { + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. @@ -1603,12 +1603,13 @@ static int check_kprobe_address_safe(struct kprobe *p, * kprobes in there. */ if (within_module_init((unsigned long)p->addr, *probed_mod) && - (*probed_mod)->state != MODULE_STATE_COMING) { + !module_is_coming(*probed_mod)) { module_put(*probed_mod); *probed_mod = NULL; ret = -ENOENT; } } + out: preempt_enable(); jump_label_unlock(); @@ -2488,24 +2489,6 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } -/* Remove all symbols in given area from kprobe blacklist */ -static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) -{ - struct kprobe_blacklist_entry *ent, *n; - - list_for_each_entry_safe(ent, n, _blacklist, list) { - if (ent->start_addr < start || ent->start_addr >= end) - continue; - list_del(>list); - kfree(ent); - } -} - -static void kprobe_remove_ksym_blacklist(unsigned long entry) -{ - kprobe_remove_area_blacklist(entry, entry + 1); -} - int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym) { @@ -2570,6 +2553,25 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? 
: arch_populate_kprobe_blacklist(); } +#ifdef CONFIG_MODULES +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, _blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(>list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; @@ -2672,6 +2674,17 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +static int kprobe_register_module_notifier(void) +{ + return register_module_notifier(_module_nb); +} +#else +static int kprobe_register_module_notifier(void) +{ +
[PATCH v5 13/15] powerpc: use CONFIG_EXECMEM instead of CONFIG_MODULES where appropriate
From: "Mike Rapoport (IBM)" There are places where CONFIG_MODULES guards the code that depends on memory allocation being done with module_alloc(). Replace CONFIG_MODULES with CONFIG_EXECMEM in such places. Signed-off-by: Mike Rapoport (IBM) --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/kasan.h | 2 +- arch/powerpc/kernel/head_8xx.S | 4 ++-- arch/powerpc/kernel/head_book3s_32.S | 6 +++--- arch/powerpc/lib/code-patching.c | 2 +- arch/powerpc/mm/book3s32/mmu.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1c4be3373686..2e586733a464 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -285,7 +285,7 @@ config PPC select IOMMU_HELPER if PPC64 select IRQ_DOMAIN select IRQ_FORCED_THREADING - select KASAN_VMALLOCif KASAN && MODULES + select KASAN_VMALLOCif KASAN && EXECMEM select LOCK_MM_AND_FIND_VMA select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index 365d2720097c..b5bbb94c51f6 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -19,7 +19,7 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 -#if defined(CONFIG_MODULES) && defined(CONFIG_PPC32) +#if defined(CONFIG_EXECMEM) && defined(CONFIG_PPC32) #define KASAN_KERN_START ALIGN_DOWN(PAGE_OFFSET - SZ_256M, SZ_256M) #else #define KASAN_KERN_START PAGE_OFFSET diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 647b0b445e89..edc479a7c2bc 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -199,12 +199,12 @@ instruction_counter: mfspr r10, SPRN_SRR0 /* Get effective address of fault */ INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11) mtspr SPRN_MD_EPN, r10 -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM mfcrr11 compare_to_kernel_boundary r10, r10 #endif mfspr r10, SPRN_M_TWB /* Get level 1 table */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM blt+3f rlwinm r10, r10, 
0, 20, 31 orisr10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index c1d89764dd22..57196883a00e 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -419,14 +419,14 @@ InstructionTLBMiss: */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 #endif mfspr r2, SPRN_SDR1 li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC rlwinm r2, r2, 28, 0xf000 -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM li r0, 3 bgt-112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ @@ -442,7 +442,7 @@ InstructionTLBMiss: andc. r1,r1,r2/* check access & ~permission */ bne-InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_EXECMEM rlwimi r2, r0, 0, 31, 31 /* userspace ? 
-> PP lsb */ #endif ori r1, r1, 0xe06 /* clear out reserved bits */ diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index c6ab46156cda..7af791446ddf 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -225,7 +225,7 @@ void __init poking_init(void) static unsigned long get_patch_pfn(void *addr) { - if (IS_ENABLED(CONFIG_MODULES) && is_vmalloc_or_module_addr(addr)) + if (IS_ENABLED(CONFIG_EXECMEM) && is_vmalloc_or_module_addr(addr)) return vmalloc_to_pfn(addr); else return __pa_symbol(addr) >> PAGE_SHIFT; diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 100f999871bc..625fe7d08e06 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -184,7 +184,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) static bool is_module_segment(unsigned long addr) { - if (!IS_ENABLED(CONFIG_MODULES)) + if (!IS_ENABLED(CONFIG_EXECMEM)) return false; if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M)) return false; -- 2.43.0
[PATCH v5 12/15] x86/ftrace: enable dynamic ftrace without CONFIG_MODULES
From: "Mike Rapoport (IBM)" Dynamic ftrace must allocate memory for code and this was impossible without CONFIG_MODULES. With execmem separated from the modules code, execmem_text_alloc() is available regardless of CONFIG_MODULES. Remove dependency of dynamic ftrace on CONFIG_MODULES and make CONFIG_DYNAMIC_FTRACE select CONFIG_EXECMEM in Kconfig. Signed-off-by: Mike Rapoport (IBM) --- arch/x86/Kconfig | 1 + arch/x86/kernel/ftrace.c | 10 -- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3f5ba72c9480..cd8addb96a0b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86_64 select SWIOTLB select ARCH_HAS_ELFCORE_COMPAT select ZONE_DMA32 + select EXECMEM if DYNAMIC_FTRACE config FORCE_DYNAMIC_FTRACE def_bool y diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c8ddb7abda7c..8da0e66ca22d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -261,8 +261,6 @@ void arch_ftrace_update_code(int command) /* Currently only x86_64 supports dynamic trampolines */ #ifdef CONFIG_X86_64 -#ifdef CONFIG_MODULES -/* Module allocation simplifies allocating memory for code */ static inline void *alloc_tramp(unsigned long size) { return execmem_alloc(EXECMEM_FTRACE, size); @@ -271,14 +269,6 @@ static inline void tramp_free(void *tramp) { execmem_free(tramp); } -#else -/* Trampolines can only be created if modules are supported */ -static inline void *alloc_tramp(unsigned long size) -{ - return NULL; -} -static inline void tramp_free(void *tramp) { } -#endif /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); -- 2.43.0
[PATCH v5 11/15] arch: make execmem setup available regardless of CONFIG_MODULES
From: "Mike Rapoport (IBM)" execmem does not depend on modules, on the contrary modules use execmem. To make execmem available when CONFIG_MODULES=n, for instance for kprobes, split execmem_params initialization out from arch/*/kernel/module.c and compile it when CONFIG_EXECMEM=y Signed-off-by: Mike Rapoport (IBM) --- arch/arm/kernel/module.c | 43 -- arch/arm/mm/init.c | 45 +++ arch/arm64/kernel/module.c | 140 - arch/arm64/mm/init.c | 140 + arch/loongarch/kernel/module.c | 19 - arch/loongarch/mm/init.c | 21 + arch/mips/kernel/module.c | 22 -- arch/mips/mm/init.c| 23 ++ arch/nios2/kernel/module.c | 20 - arch/nios2/mm/init.c | 21 + arch/parisc/kernel/module.c| 20 - arch/parisc/mm/init.c | 23 +- arch/powerpc/kernel/module.c | 63 --- arch/powerpc/mm/mem.c | 64 +++ arch/riscv/kernel/module.c | 44 --- arch/riscv/mm/init.c | 45 +++ arch/s390/kernel/module.c | 27 --- arch/s390/mm/init.c| 30 +++ arch/sparc/kernel/module.c | 19 - arch/sparc/mm/Makefile | 2 + arch/sparc/mm/execmem.c| 21 + arch/x86/kernel/module.c | 27 --- arch/x86/mm/init.c | 29 +++ 23 files changed, 463 insertions(+), 445 deletions(-) create mode 100644 arch/sparc/mm/execmem.c diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index a98fdf6ff26c..677f218f7e84 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -12,57 +12,14 @@ #include #include #include -#include #include #include -#include -#include #include #include #include #include -#ifdef CONFIG_XIP_KERNEL -/* - * The XIP kernel text is mapped in the module area for modules and - * some other stuff to work without any indirect relocations. - * MODULES_VADDR is redefined here and not in asm/memory.h to avoid - * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
- */ -#undef MODULES_VADDR -#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) -#endif - -#ifdef CONFIG_MMU -static struct execmem_info execmem_info __ro_after_init; - -struct execmem_info __init *execmem_arch_setup(void) -{ - unsigned long fallback_start = 0, fallback_end = 0; - - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { - fallback_start = VMALLOC_START; - fallback_end = VMALLOC_END; - } - - execmem_info = (struct execmem_info){ - .ranges = { - [EXECMEM_DEFAULT] = { - .start = MODULES_VADDR, - .end= MODULES_END, - .pgprot = PAGE_KERNEL_EXEC, - .alignment = 1, - .fallback_start = fallback_start, - .fallback_end = fallback_end, - }, - }, - }; - - return _info; -} -#endif - bool module_init_section(const char *name) { return strstarts(name, ".init") || diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index e8c6f4be0ce1..5345d218899a 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -486,3 +487,47 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif + +#ifdef CONFIG_EXECMEM + +#ifdef CONFIG_XIP_KERNEL +/* + * The XIP kernel text is mapped in the module area for modules and + * some other stuff to work without any indirect relocations. + * MODULES_VADDR is redefined here and not in asm/memory.h to avoid + * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
+ */ +#undef MODULES_VADDR +#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) +#endif + +#ifdef CONFIG_MMU +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) +{ + unsigned long fallback_start = 0, fallback_end = 0; + + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { + fallback_start = VMALLOC_START; + fallback_end = VMALLOC_END; + } + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + }, +
[PATCH v5 10/15] powerpc: extend execmem_params for kprobes allocations
From: "Mike Rapoport (IBM)" powerpc overrides kprobes::alloc_insn_page() to remove writable permissions when STRICT_MODULE_RWX is on. Add definition of EXECMEM_KPROBES to execmem_params to allow using the generic kprobes::alloc_insn_page() with the desired permissions. As powerpc uses breakpoint instructions to inject kprobes, it does not need to constrain kprobe allocations to the modules area and can use the entire vmalloc address space. Signed-off-by: Mike Rapoport (IBM) --- arch/powerpc/kernel/kprobes.c | 20 arch/powerpc/kernel/module.c | 7 +++ 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 9fcd01bb2ce6..14c5ddec3056 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -126,26 +126,6 @@ kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offse return (kprobe_opcode_t *)(addr + offset); } -void *alloc_insn_page(void) -{ - void *page; - - page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); - if (!page) - return NULL; - - if (strict_module_rwx_enabled()) { - int err = set_memory_rox((unsigned long)page, 1); - - if (err) - goto error; - } - return page; -error: - execmem_free(page); - return NULL; -} - int arch_prepare_kprobe(struct kprobe *p) { int ret = 0; diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index ac80559015a3..2a23cf7e141b 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -94,6 +94,7 @@ static struct execmem_info execmem_info __ro_after_init; struct execmem_info __init *execmem_arch_setup(void) { + pgprot_t kprobes_prot = strict_module_rwx_enabled() ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; pgprot_t prot = strict_module_rwx_enabled() ? 
PAGE_KERNEL : PAGE_KERNEL_EXEC; unsigned long fallback_start = 0, fallback_end = 0; unsigned long start, end; @@ -132,6 +133,12 @@ struct execmem_info __init *execmem_arch_setup(void) .fallback_start = fallback_start, .fallback_end = fallback_end, }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end= VMALLOC_END, + .pgprot = kprobes_prot, + .alignment = 1, + }, [EXECMEM_MODULE_DATA] = { .start = VMALLOC_START, .end= VMALLOC_END, -- 2.43.0
[PATCH v5 09/15] riscv: extend execmem_params for generated code allocations
From: "Mike Rapoport (IBM)" The memory allocations for kprobes and BPF on RISC-V are not placed in the modules area and these custom allocations are implemented with overrides of alloc_insn_page() and bpf_jit_alloc_exec(). Slightly reorder execmem_params initialization to support both 32 and 64 bit variants, define EXECMEM_KPROBES and EXECMEM_BPF ranges in riscv::execmem_params and drop overrides of alloc_insn_page() and bpf_jit_alloc_exec(). Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Alexandre Ghiti --- arch/riscv/kernel/module.c | 28 +--- arch/riscv/kernel/probes/kprobes.c | 10 -- arch/riscv/net/bpf_jit_core.c | 13 - 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 182904127ba0..2ecbacbc9993 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -906,19 +906,41 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, return 0; } -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +#ifdef CONFIG_MMU static struct execmem_info execmem_info __ro_after_init; struct execmem_info __init *execmem_arch_setup(void) { + unsigned long start, end; + + if (IS_ENABLED(CONFIG_64BIT)) { + start = MODULES_VADDR; + end = MODULES_END; + } else { + start = VMALLOC_START; + end = VMALLOC_END; + } + execmem_info = (struct execmem_info){ .ranges = { [EXECMEM_DEFAULT] = { - .start = MODULES_VADDR, - .end= MODULES_END, + .start = start, + .end= end, .pgprot = PAGE_KERNEL, .alignment = 1, }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end= VMALLOC_END, + .pgprot = PAGE_KERNEL_READ_EXEC, + .alignment = 1, + }, + [EXECMEM_BPF] = { + .start = BPF_JIT_REGION_START, + .end= BPF_JIT_REGION_END, + .pgprot = PAGE_KERNEL, + .alignment = PAGE_SIZE, + }, }, }; diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 2f08c14a933d..e64f2f3064eb 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -104,16 +104,6 @@ 
int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } -#ifdef CONFIG_MMU -void *alloc_insn_page(void) -{ - return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, -GFP_KERNEL, PAGE_KERNEL_READ_EXEC, -VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, -__builtin_return_address(0)); -} -#endif - /* install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 6b3acac30c06..e238fdbd5dbc 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -219,19 +219,6 @@ u64 bpf_jit_alloc_exec_limit(void) return BPF_JIT_REGION_SIZE; } -void *bpf_jit_alloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, - BPF_JIT_REGION_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} - -void bpf_jit_free_exec(void *addr) -{ - return vfree(addr); -} - void *bpf_arch_text_copy(void *dst, void *src, size_t len) { int ret; -- 2.43.0
[PATCH v5 08/15] mm/execmem, arch: convert remaining overrides of module_alloc to execmem
From: "Mike Rapoport (IBM)" Extend execmem parameters to accommodate more complex overrides of module_alloc() by architectures. This includes specification of a fallback range required by arm, arm64 and powerpc, EXECMEM_MODULE_DATA type required by powerpc, support for allocation of KASAN shadow required by s390 and x86 and support for early initialization of execmem required by x86. The core implementation of execmem_alloc() takes care of suppressing warnings when the initial allocation fails but there is a fallback range defined. Signed-off-by: Mike Rapoport (IBM) Acked-by: Will Deacon --- arch/Kconfig | 6 +++ arch/arm/kernel/module.c | 41 ++--- arch/arm64/kernel/module.c | 67 ++-- arch/arm64/kernel/probes/kprobes.c | 7 --- arch/arm64/net/bpf_jit_comp.c | 11 - arch/powerpc/kernel/module.c | 60 - arch/s390/kernel/module.c | 54 ++- arch/x86/Kconfig | 1 + arch/x86/kernel/module.c | 70 ++ include/linux/execmem.h| 34 +++ include/linux/moduleloader.h | 12 - kernel/module/main.c | 26 +++ mm/execmem.c | 70 +- mm/mm_init.c | 2 + 14 files changed, 259 insertions(+), 202 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 65afb1de48b3..7006f71f0110 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -960,6 +960,12 @@ config ARCH_WANTS_MODULES_DATA_IN_VMALLOC For architectures like powerpc/32 which have constraints on module allocation and need to allocate module data outside of module area. +config ARCH_WANTS_EXECMEM_EARLY + bool + help + For architectures that might allocate executable memory early on + boot, for instance ftrace on x86. 
+ config HAVE_IRQ_EXIT_ON_IRQ_STACK bool help diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index e74d84f58b77..a98fdf6ff26c 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -34,23 +35,31 @@ #endif #ifdef CONFIG_MMU -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - /* Silence the initial allocation */ - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) - gfp_mask |= __GFP_NOWARN; - - p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) - return p; - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + unsigned long fallback_start = 0, fallback_end = 0; + + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) { + fallback_start = VMALLOC_START; + fallback_end = VMALLOC_END; + } + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + }, + }; + + return _info; } #endif diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index e92da4da1b2a..a52240ea084b 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -108,41 +109,59 @@ static int __init module_init_limits(void) return 0; } -subsys_initcall(module_init_limits); -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - void *p = NULL; + unsigned long 
fallback_start = 0, fallback_end = 0; + unsigned long start = 0, end = 0; + + module_init_limits(); /* * Where possible, prefer to allocate within direct branch range of the * kernel such that no PLTs are necessary. */ if (module_direct_base) { - p = __vmalloc_node_range(size, MODULE_ALIGN, -module_direct_base, -
[PATCH v5 07/15] mm/execmem, arch: convert simple overrides of module_alloc to execmem
From: "Mike Rapoport (IBM)" Several architectures override module_alloc() only to define address range for code allocations different than VMALLOC address space. Provide a generic implementation in execmem that uses the parameters for address space ranges, required alignment and page protections provided by architectures. The architectures must fill execmem_info structure and implement execmem_arch_setup() that returns a pointer to that structure. This way the execmem initialization won't be called from every architecture, but rather from a central place, namely a core_initcall() in execmem. The execmem provides execmem_alloc() API that wraps __vmalloc_node_range() with the parameters defined by the architectures. If an architecture does not implement execmem_arch_setup(), execmem_alloc() will fall back to module_alloc(). Signed-off-by: Mike Rapoport (IBM) --- arch/loongarch/kernel/module.c | 19 +++-- arch/mips/kernel/module.c | 20 -- arch/nios2/kernel/module.c | 21 +++--- arch/parisc/kernel/module.c| 24 +++ arch/riscv/kernel/module.c | 24 +++ arch/sparc/kernel/module.c | 20 -- include/linux/execmem.h| 41 +++ mm/execmem.c | 73 -- 8 files changed, 208 insertions(+), 34 deletions(-) diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c index c7d0338d12c1..ca6dd7ea1610 100644 --- a/arch/loongarch/kernel/module.c +++ b/arch/loongarch/kernel/module.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -490,10 +491,22 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, return 0; } -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + 
.pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return _info; } static void module_init_ftrace_plt(const Elf_Ehdr *hdr, diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 9a6c96014904..59225a3cf918 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -20,6 +20,7 @@ #include #include #include +#include #include struct mips_hi16 { @@ -32,11 +33,22 @@ static LIST_HEAD(dbe_list); static DEFINE_SPINLOCK(dbe_lock); #ifdef MODULES_VADDR -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return _info; } #endif diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c index 9c97b7513853..0d1ee86631fc 100644 --- a/arch/nios2/kernel/module.c +++ b/arch/nios2/kernel/module.c @@ -18,15 +18,26 @@ #include #include #include +#include #include -void *module_alloc(unsigned long size) +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) { - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, - VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, - __builtin_return_address(0)); + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end= MODULES_END, + .pgprot = PAGE_KERNEL_EXEC, + .alignment = 1, + }, + }, + }; + + return _info; } int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index
[PATCH v5 06/15] mm: introduce execmem_alloc() and execmem_free()
From: "Mike Rapoport (IBM)" module_alloc() is used everywhere as a means to allocate memory for code. Besides being semantically wrong, this unnecessarily ties all subsystems that need to allocate code, such as ftrace, kprobes and BPF to modules and puts the burden of code allocation to the modules code. Several architectures override module_alloc() because of various constraints where the executable memory can be located and this causes additional obstacles for improvements of code allocation. Start splitting code allocation from modules by introducing execmem_alloc() and execmem_free() APIs. Initially, execmem_alloc() is a wrapper for module_alloc() and execmem_free() is a replacement of module_memfree() to allow updating all call sites to use the new APIs. Since architectures define different restrictions on placement, permissions, alignment and other parameters for memory that can be used by different subsystems that allocate executable memory, execmem_alloc() takes a type argument, that will be used to identify the calling subsystem and to allow architectures to define parameters for ranges suitable for that subsystem. No functional changes. 
Signed-off-by: Mike Rapoport (IBM) Acked-by: Masami Hiramatsu (Google) --- arch/powerpc/kernel/kprobes.c| 6 ++-- arch/s390/kernel/ftrace.c| 4 +-- arch/s390/kernel/kprobes.c | 4 +-- arch/s390/kernel/module.c| 5 +-- arch/sparc/net/bpf_jit_comp_32.c | 8 ++--- arch/x86/kernel/ftrace.c | 6 ++-- arch/x86/kernel/kprobes/core.c | 4 +-- include/linux/execmem.h | 57 include/linux/moduleloader.h | 3 -- kernel/bpf/core.c| 6 ++-- kernel/kprobes.c | 8 ++--- kernel/module/Kconfig| 1 + kernel/module/main.c | 25 +- mm/Kconfig | 3 ++ mm/Makefile | 1 + mm/execmem.c | 32 ++ 16 files changed, 128 insertions(+), 45 deletions(-) create mode 100644 include/linux/execmem.h create mode 100644 mm/execmem.c diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bbca90a5e2ec..9fcd01bb2ce6 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -130,7 +130,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; @@ -142,7 +142,7 @@ void *alloc_insn_page(void) } return page; error: - module_memfree(page); + execmem_free(page); return NULL; } diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index c46381ea04ec..798249ef5646 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -7,13 +7,13 @@ * Author(s): Martin Schwidefsky */ -#include #include #include #include #include #include #include +#include #include #include #include @@ -220,7 +220,7 @@ static int __init ftrace_plt_init(void) { const char *start, *end; - ftrace_plt = module_alloc(PAGE_SIZE); + ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE); if (!ftrace_plt) panic("cannot allocate ftrace plt\n"); diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index f0cf20d4b3c5..3c1b1be744de 100644 --- a/arch/s390/kernel/kprobes.c +++ 
b/arch/s390/kernel/kprobes.c @@ -9,7 +9,6 @@ #define pr_fmt(fmt) "kprobes: " fmt -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +38,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; set_memory_rox((unsigned long)page, 1); diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 42215f9404af..ac97a905e8cd 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,7 @@ void *module_alloc(unsigned long size) #ifdef CONFIG_FUNCTION_TRACER void module_arch_cleanup(struct module *mod) { - module_memfree(mod->arch.trampolines_start); + execmem_free(mod->arch.trampolines_start); } #endif @@ -510,7 +511,7 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); numpages = DIV_ROUND_UP(size, PAGE_SIZE); - start = module_alloc(numpages * PAGE_SIZE); + start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE); if (!start) return -ENOMEM;
[PATCH v5 05/15] module: make module_memory_{alloc,free} more self-contained
From: "Mike Rapoport (IBM)" Move the logic related to the memory allocation and freeing into module_memory_alloc() and module_memory_free(). Signed-off-by: Mike Rapoport (IBM) --- kernel/module/main.c | 64 +++- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/kernel/module/main.c b/kernel/module/main.c index e1e8a7a9d6c1..5b82b069e0d3 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1203,15 +1203,44 @@ static bool mod_mem_use_vmalloc(enum mod_mem_type type) mod_mem_type_is_core_data(type); } -static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) +static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { + unsigned int size = PAGE_ALIGN(mod->mem[type].size); + void *ptr; + + mod->mem[type].size = size; + if (mod_mem_use_vmalloc(type)) - return vzalloc(size); - return module_alloc(size); + ptr = vmalloc(size); + else + ptr = module_alloc(size); + + if (!ptr) + return -ENOMEM; + + /* +* The pointer to these blocks of memory are stored on the module +* structure and we keep that around so long as the module is +* around. We only free that memory when we unload the module. +* Just mark them as not being a leak then. The .init* ELF +* sections *do* get freed after boot so we *could* treat them +* slightly differently with kmemleak_ignore() and only grey +* them out as they work as typical memory allocations which +* *do* eventually get freed, but let's just keep things simple +* and avoid *any* false positives. +*/ + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; + + return 0; } -static void module_memory_free(void *ptr, enum mod_mem_type type) +static void module_memory_free(struct module *mod, enum mod_mem_type type) { + void *ptr = mod->mem[type].base; + if (mod_mem_use_vmalloc(type)) vfree(ptr); else @@ -1229,12 +1258,12 @@ static void free_mod_mem(struct module *mod) /* Free lock-classes; relies on the preceding sync_rcu(). 
*/ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod_mem->base, type); + module_memory_free(mod, type); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); + module_memory_free(mod, MOD_DATA); } /* Free a module, remove from lists, etc. */ @@ -2225,7 +2254,6 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { int i; - void *ptr; enum mod_mem_type t = 0; int ret = -ENOMEM; @@ -2234,26 +2262,12 @@ static int move_module(struct module *mod, struct load_info *info) mod->mem[type].base = NULL; continue; } - mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size); - ptr = module_memory_alloc(mod->mem[type].size, type); - /* - * The pointer to these blocks of memory are stored on the module - * structure and we keep that around so long as the module is - * around. We only free that memory when we unload the module. - * Just mark them as not being a leak then. The .init* ELF - * sections *do* get freed after boot so we *could* treat them - * slightly differently with kmemleak_ignore() and only grey - * them out as they work as typical memory allocations which - * *do* eventually get freed, but let's just keep things simple - * and avoid *any* false positives. -*/ - kmemleak_not_leak(ptr); - if (!ptr) { + + ret = module_memory_alloc(mod, type); + if (ret) { t = type; goto out_enomem; } - memset(ptr, 0, mod->mem[type].size); - mod->mem[type].base = ptr; } /* Transfer each section which specifies SHF_ALLOC */ @@ -2296,7 +2310,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_enomem: for (t--; t >= 0; t--) - module_memory_free(mod->mem[t].base, t); + module_memory_free(mod, t); return ret; } -- 2.43.0