Re: [RFC PATCH 02/24] xsk: add user memory registration sockopt

2018-02-07 Thread Björn Töpel
2018-02-07 17:00 GMT+01:00 Willem de Bruijn:
> On Wed, Jan 31, 2018 at 8:53 AM, Björn Töpel wrote:
>> From: Björn Töpel 
>>
>> The XDP_MEM_REG socket option allows a process to register a window of
>> user space memory to the kernel. This memory will later be used as
>> frame data buffer.
>>
>> Signed-off-by: Björn Töpel 
>> ---
>
>> +static struct xsk_umem *xsk_mem_reg(u64 addr, u64 size, u32 frame_size,
>> +   u32 data_headroom)
>> +{
>> +   unsigned long lock_limit, locked, npages;
>> +   int ret = 0;
>> +   struct xsk_umem *umem;
>> +
>> +   if (!can_do_mlock())
>> +   return ERR_PTR(-EPERM);
>> +
>> +   umem = xsk_umem_create(addr, size, frame_size, data_headroom);
>> +   if (IS_ERR(umem))
>> +   return umem;
>> +
>> +   npages = PAGE_ALIGN(umem->nframes * umem->frame_size) >> PAGE_SHIFT;
>> +
>> +   down_write(&current->mm->mmap_sem);
>> +
>> +   locked = npages + current->mm->pinned_vm;
>> +   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> +
>> +   if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
>> +   ret = -ENOMEM;
>> +   goto out;
>> +   }
>> +
>> +   if (npages == 0 || npages > UINT_MAX) {
>> +   ret = -EINVAL;
>> +   goto out;
>> +   }
>> +   umem->npgs = npages;
>> +
>> +   ret = xsk_umem_pin_pages(umem);
>> +
>> +out:
>> +   if (ret < 0) {
>> +   put_pid(umem->pid);
>> +   kfree(umem);
>> +   } else {
>> +   current->mm->pinned_vm = locked;
>> +   }
>> +
>> +   up_write(&current->mm->mmap_sem);
>
> This limits per process. You may want to limit per user. See also
> mm_account_pinned_pages.

Ah, noted! Thanks for pointing that out!


Re: [RFC PATCH 02/24] xsk: add user memory registration sockopt

2018-02-07 Thread Willem de Bruijn
On Wed, Jan 31, 2018 at 8:53 AM, Björn Töpel wrote:
> From: Björn Töpel 
>
> The XDP_MEM_REG socket option allows a process to register a window of
> user space memory to the kernel. This memory will later be used as
> frame data buffer.
>
> Signed-off-by: Björn Töpel 
> ---

> +static struct xsk_umem *xsk_mem_reg(u64 addr, u64 size, u32 frame_size,
> +   u32 data_headroom)
> +{
> +   unsigned long lock_limit, locked, npages;
> +   int ret = 0;
> +   struct xsk_umem *umem;
> +
> +   if (!can_do_mlock())
> +   return ERR_PTR(-EPERM);
> +
> +   umem = xsk_umem_create(addr, size, frame_size, data_headroom);
> +   if (IS_ERR(umem))
> +   return umem;
> +
> +   npages = PAGE_ALIGN(umem->nframes * umem->frame_size) >> PAGE_SHIFT;
> +
> +   down_write(&current->mm->mmap_sem);
> +
> +   locked = npages + current->mm->pinned_vm;
> +   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +   if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> +   ret = -ENOMEM;
> +   goto out;
> +   }
> +
> +   if (npages == 0 || npages > UINT_MAX) {
> +   ret = -EINVAL;
> +   goto out;
> +   }
> +   umem->npgs = npages;
> +
> +   ret = xsk_umem_pin_pages(umem);
> +
> +out:
> +   if (ret < 0) {
> +   put_pid(umem->pid);
> +   kfree(umem);
> +   } else {
> +   current->mm->pinned_vm = locked;
> +   }
> +
> +   up_write(&current->mm->mmap_sem);

This limits per process. You may want to limit per user. See also
mm_account_pinned_pages.


[RFC PATCH 02/24] xsk: add user memory registration sockopt

2018-01-31 Thread Björn Töpel
From: Björn Töpel 

The XDP_MEM_REG socket option allows a process to register a window of
user space memory to the kernel. This memory will later be used as
frame data buffer.

Signed-off-by: Björn Töpel 
---
 include/uapi/linux/if_xdp.h |   7 ++
 net/xdp/xsk.c   | 294 +++-
 net/xdp/xsk.h   |  19 ++-
 3 files changed, 316 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index cd09232e16c1..3f8c90c708b4 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -29,4 +29,11 @@ struct sockaddr_xdp {
 #define XDP_RX_RING 2
 #define XDP_TX_RING 3
 
+struct xdp_mr_req {
+   __u64   addr;   /* Start of packet data area */
+   __u64   len;   /* Length of packet data area */
+   __u32   frame_size; /* Frame size */
+   __u32   data_headroom;  /* Frame head room */
+};
+
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2d7c08a50c60..333ce1450cc7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -19,18 +19,235 @@
 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 
 #include "xsk.h"
 
+#define XSK_UMEM_MIN_FRAME_SIZE 2048
+
 struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
+   struct xsk_umem *umem;
 };
 
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+   return (struct xdp_sock *)sk;
+}
+
+static void xsk_umem_unpin_pages(struct xsk_umem *umem)
+{
+   unsigned int i;
+
+   if (umem->pgs) {
+   for (i = 0; i < umem->npgs; i++) {
+   struct page *page = umem->pgs[i];
+
+   set_page_dirty_lock(page);
+   put_page(page);
+   }
+
+   kfree(umem->pgs);
+   umem->pgs = NULL;
+   }
+}
+
+static void xsk_umem_destroy(struct xsk_umem *umem)
+{
+   struct mm_struct *mm;
+   struct task_struct *task;
+   unsigned long diff;
+
+   if (!umem)
+   return;
+
+   xsk_umem_unpin_pages(umem);
+
+   task = get_pid_task(umem->pid, PIDTYPE_PID);
+   put_pid(umem->pid);
+   if (!task)
+   goto out;
+   mm = get_task_mm(task);
+   put_task_struct(task);
+   if (!mm)
+   goto out;
+
+   diff = umem->size >> PAGE_SHIFT;
+
+   down_write(&mm->mmap_sem);
+   mm->pinned_vm -= diff;
+   up_write(&mm->mmap_sem);
+   mmput(mm);
+out:
+   kfree(umem);
+}
+
+static struct xsk_umem *xsk_umem_create(u64 addr, u64 size, u32 frame_size,
+   u32 data_headroom)
+{
+   struct xsk_umem *umem;
+   unsigned int nframes;
+   int size_chk;
+
+   if (frame_size < XSK_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+   /* Strictly speaking we could support this, if:
+* - huge pages, or*
+* - using an IOMMU, or
+* - making sure the memory area is consecutive
+* but for now, we simply say "computer says no".
+*/
+   return ERR_PTR(-EINVAL);
+   }
+
+   if (!is_power_of_2(frame_size))
+   return ERR_PTR(-EINVAL);
+
+   if (!PAGE_ALIGNED(addr)) {
+   /* Memory area has to be page size aligned. For
+* simplicity, this might change.
+*/
+   return ERR_PTR(-EINVAL);
+   }
+
+   if ((addr + size) < addr)
+   return ERR_PTR(-EINVAL);
+
+   nframes = size / frame_size;
+   if (nframes == 0)
+   return ERR_PTR(-EINVAL);
+
+   data_headroom = ALIGN(data_headroom, 64);
+
+   size_chk = frame_size - data_headroom - XSK_KERNEL_HEADROOM;
+   if (size_chk < 0)
+   return ERR_PTR(-EINVAL);
+
+   umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+   if (!umem)
+   return ERR_PTR(-ENOMEM);
+
+   umem->pid = get_task_pid(current, PIDTYPE_PID);
+   umem->size = (size_t)size;
+   umem->address = (unsigned long)addr;
+   umem->frame_size = frame_size;
+   umem->nframes = nframes;
+   umem->data_headroom = data_headroom;
+   umem->pgs = NULL;
+
+   return umem;
+}
+
+static int xsk_umem_pin_pages(struct xsk_umem *umem)
+{
+   unsigned int gup_flags = FOLL_WRITE;
+   long npgs;
+   int err;
+
+   /* XXX Fix so that we don't always pin.
+* "copy to user" from interrupt context, but how?
+*/
+   umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_ATOMIC);
+   if (!umem->pgs)
+   return -ENOMEM;
+
+   npgs = get_user_pages(umem->address, umem->npgs,
+ gup_flags, &umem->pgs[0], NULL);
+   if (npgs != umem->npgs) {
+   if (npgs >= 0) {
+   umem->npgs = npgs;
+