[Xen-devel] [PATCHv2 2/2] xen/privcmd: add ioctls for locking/unlocking hypercall buffers

2016-08-04 Thread David Vrabel
Using mlock() for hypercall buffers is not sufficient since mlocked
pages are still subject to compaction and page migration.  Page
migration can be prevented by taking additional references to the
pages.

Introduce two new ioctls: IOCTL_PRIVCMD_HCALL_BUF_LOCK and
IOCTL_PRIVCMD_HCALL_BUF_UNLOCK which get and put the necessary page
references.  The buffers do not need to be page aligned and they may
overlap with other buffers.  However, it is not possible to partially
unlock a buffer (i.e., the LOCK/UNLOCK must always be paired).  Any
locked buffers are automatically unlocked when the file descriptor is
closed.

An alternative approach would be to extend the driver with an ioctl to
populate a VMA with "special", non-migratable pages.  But the
LOCK/UNLOCK ioctls are more flexible as they allow any page to be used
for hypercalls (e.g., stack, mmap'd files, etc.).  This could be used
to minimize bouncing for performance critical hypercalls.

Signed-off-by: David Vrabel 
---
v2:
- Only take one reference per buffer.
- Interate over the tree to unlock/free all buffers.
- Max 1GB size per buffer.
- Only use one memory allocation per buffer.
- Fix overflow issues in privcmd_hbuf_compare().
- Return -ENXIO when unlocking an unlocked buffer.
---
 drivers/xen/privcmd.c  | 216 +
 include/uapi/xen/privcmd.h |  38 
 2 files changed, 254 insertions(+)

diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index ac76bc4..5419b31 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -43,6 +44,19 @@ MODULE_LICENSE("GPL");
 
 #define PRIV_VMA_LOCKED ((void *)1)
 
+struct privcmd_data {
+   struct rb_root hbuf_root;
+   struct mutex hbuf_mutex;
+};
+
+struct privcmd_hbuf {
+   struct rb_node node;
+   struct privcmd_hcall_buf buf;
+   unsigned int count;
+   unsigned int nr_pages;
+   struct page *pages[0];
+};
+
 static int privcmd_vma_range_is_mapped(
struct vm_area_struct *vma,
unsigned long addr,
@@ -548,9 +562,175 @@ out_unlock:
goto out;
 }
 
+static void privcmd_hbuf_free(struct privcmd_hbuf *hbuf)
+{
+   unsigned int i;
+
+   for (i = 0; i < hbuf->nr_pages; i++)
+   put_page(hbuf->pages[i]);
+
+   kfree(hbuf);
+}
+
+static struct privcmd_hbuf *privcmd_hbuf_alloc(struct privcmd_hcall_buf *buf)
+{
+   struct privcmd_hbuf *hbuf;
+   unsigned long start, end, nr_pages;
+   int ret;
+
+   start = (unsigned long)buf->start & PAGE_MASK;
+   end = ALIGN((unsigned long)buf->start + buf->len, PAGE_SIZE);
+   nr_pages = (end - start) / PAGE_SIZE;
+
+   hbuf = kzalloc(sizeof(*hbuf) + nr_pages * sizeof(hbuf->pages[0]),
+  GFP_KERNEL);
+   if (!hbuf)
+   return ERR_PTR(-ENOMEM);
+
+   /*
+* Take a reference to each page, this will prevent swapping
+* and page migration.
+*/
+   ret = get_user_pages_fast(start, nr_pages, 1, hbuf->pages);
+   if (ret < 0)
+   goto error;
+
+   hbuf->buf = *buf;
+   hbuf->count = 1;
+   hbuf->nr_pages = ret;
+
+   if (hbuf->nr_pages != nr_pages) {
+   ret = -ENOMEM;
+   goto error;
+   }
+
+   return hbuf;
+
+  error:
+   privcmd_hbuf_free(hbuf);
+   return ERR_PTR(ret);
+}
+
+static int privcmd_hbuf_compare(struct privcmd_hcall_buf *a,
+   struct privcmd_hcall_buf *b)
+{
+   if (b->start > a->start)
+   return 1;
+   if (b->start < a->start)
+   return -1;
+   if (b->len > a->len)
+   return 1;
+   if (b->len < a->len)
+   return -1;
+   return 0;
+}
+
+static int privcmd_hbuf_insert(struct privcmd_data *priv,
+   struct privcmd_hcall_buf *buf)
+{
+   struct rb_node **new = &priv->hbuf_root.rb_node, *parent = NULL;
+   struct privcmd_hbuf *hbuf;
+
+   /* Figure out where to put new node */
+   while (*new) {
+   struct privcmd_hbuf *this = container_of(*new, struct 
privcmd_hbuf, node);
+   int result = privcmd_hbuf_compare(buf, &this->buf);
+
+   parent = *new;
+   if (result < 0)
+   new = &(*new)->rb_left;
+   else if (result > 0)
+   new = &(*new)->rb_right;
+   else {
+   this->count++;
+   return 0;
+   }
+   }
+
+   /* Allocate and insert a new node. */
+   hbuf = privcmd_hbuf_alloc(buf);
+   if (IS_ERR(hbuf))
+   return PTR_ERR(hbuf);
+
+   rb_link_node(&hbuf->node, parent, new);
+   rb_insert_color(&hbuf->node, &priv->hbuf_root);
+
+   return 0;
+}
+
+static int privcmd_hbuf_remove(struct privcmd_data *priv,
+  stru

Re: [Xen-devel] [PATCHv2 2/2] xen/privcmd: add ioctls for locking/unlocking hypercall buffers

2016-08-04 Thread Jan Beulich
>>> On 04.08.16 at 17:16,  wrote:
> Using mlock() for hypercall buffers is not sufficient since mlocked
> pages are still subject to compaction and page migration.  Page
> migration can be prevented by taking additional references to the
> pages.
> 
> Introduce two new ioctls: IOCTL_PRIVCMD_HCALL_BUF_LOCK and
> IOCTL_PRIVCMD_HCALL_BUF_UNLOCK which get and put the necessary page
> references.  The buffers do not need to be page aligned and they may
> overlap with other buffers.  However, it is not possible to partially
> unlock a buffer (i.e., the LOCK/UNLOCK must always be paired).  Any
> locked buffers are automatically unlocked when the file descriptor is
> closed.
> 
> An alternative approach would be to extend the driver with an ioctl to
> populate a VMA with "special", non-migratable pages.  But the
> LOCK/UNLOCK ioctls are more flexible as they allow any page to be used
> for hypercalls (e.g., stack, mmap'd files, etc.).  This could be used
> to minimize bouncing for performance critical hypercalls.
> 
> Signed-off-by: David Vrabel 

Reviewed-by: Jan Beulich 

with two remarks: Do you not care about compat mode callers? In
case you indeed mean to not support them, does the default
handling ensure they will see an error instead of you mis-interpreting
(and overrunning) the presented structure?

> +static struct privcmd_hbuf *privcmd_hbuf_alloc(struct privcmd_hcall_buf *buf)
> +{
> + struct privcmd_hbuf *hbuf;
> + unsigned long start, end, nr_pages;
> + int ret;
> +
> + start = (unsigned long)buf->start & PAGE_MASK;
> + end = ALIGN((unsigned long)buf->start + buf->len, PAGE_SIZE);
> + nr_pages = (end - start) / PAGE_SIZE;

So entry re-use is based on the actual length the caller passed,
while page tracking of course is page granular. Wouldn't it make
sense to re-use entries based on the pages they encompass,
which in particular would mean that two distinct buffers on the
same page would get just a single entry?

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCHv2 2/2] xen/privcmd: add ioctls for locking/unlocking hypercall buffers

2016-08-22 Thread David Vrabel
On 04/08/16 17:02, Jan Beulich wrote:
 On 04.08.16 at 17:16,  wrote:
>> Using mlock() for hypercall buffers is not sufficient since mlocked
>> pages are still subject to compaction and page migration.  Page
>> migration can be prevented by taking additional references to the
>> pages.
>>
>> Introduce two new ioctls: IOCTL_PRIVCMD_HCALL_BUF_LOCK and
>> IOCTL_PRIVCMD_HCALL_BUF_UNLOCK which get and put the necessary page
>> references.  The buffers do not need to be page aligned and they may
>> overlap with other buffers.  However, it is not possible to partially
>> unlock a buffer (i.e., the LOCK/UNLOCK must always be paired).  Any
>> locked buffers are automatically unlocked when the file descriptor is
>> closed.
>>
>> An alternative approach would be to extend the driver with an ioctl to
>> populate a VMA with "special", non-migratable pages.  But the
>> LOCK/UNLOCK ioctls are more flexible as they allow any page to be used
>> for hypercalls (e.g., stack, mmap'd files, etc.).  This could be used
>> to minimize bouncing for performance critical hypercalls.
>>
>> Signed-off-by: David Vrabel 
> 
> Reviewed-by: Jan Beulich 
> 
> with two remarks: Do you not care about compat mode callers?

No.  Compat mode callers can't make hypercalls since the hypervisor ABI
structures aren't converted anywhere.

> case you indeed mean to not support them, does the default
> handling ensure they will see an error instead of you mis-interpreting
> (and overrunning) the presented structure?

I will look at this.

>> +static struct privcmd_hbuf *privcmd_hbuf_alloc(struct privcmd_hcall_buf 
>> *buf)
>> +{
>> +struct privcmd_hbuf *hbuf;
>> +unsigned long start, end, nr_pages;
>> +int ret;
>> +
>> +start = (unsigned long)buf->start & PAGE_MASK;
>> +end = ALIGN((unsigned long)buf->start + buf->len, PAGE_SIZE);
>> +nr_pages = (end - start) / PAGE_SIZE;
> 
> So entry re-use is based on the actual length the caller passed,
> while page tracking of course is page granular. Wouldn't it make
> sense to re-use entries based on the pages they encompass,
> which in particular would mean that two distinct buffers on the
> same page would get just a single entry?

If you make the entries page aligned, you can't give always give an
error when the user unlocks something of the wrong size.

David

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCHv2 2/2] xen/privcmd: add ioctls for locking/unlocking hypercall buffers

2016-08-23 Thread Jan Beulich
>>> On 22.08.16 at 19:24,  wrote:
> On 04/08/16 17:02, Jan Beulich wrote:
> On 04.08.16 at 17:16,  wrote:
>>> Using mlock() for hypercall buffers is not sufficient since mlocked
>>> pages are still subject to compaction and page migration.  Page
>>> migration can be prevented by taking additional references to the
>>> pages.
>>>
>>> Introduce two new ioctls: IOCTL_PRIVCMD_HCALL_BUF_LOCK and
>>> IOCTL_PRIVCMD_HCALL_BUF_UNLOCK which get and put the necessary page
>>> references.  The buffers do not need to be page aligned and they may
>>> overlap with other buffers.  However, it is not possible to partially
>>> unlock a buffer (i.e., the LOCK/UNLOCK must always be paired).  Any
>>> locked buffers are automatically unlocked when the file descriptor is
>>> closed.
>>>
>>> An alternative approach would be to extend the driver with an ioctl to
>>> populate a VMA with "special", non-migratable pages.  But the
>>> LOCK/UNLOCK ioctls are more flexible as they allow any page to be used
>>> for hypercalls (e.g., stack, mmap'd files, etc.).  This could be used
>>> to minimize bouncing for performance critical hypercalls.
>>>
>>> Signed-off-by: David Vrabel 
>> 
>> Reviewed-by: Jan Beulich 
>> 
>> with two remarks: Do you not care about compat mode callers?
> 
> No.  Compat mode callers can't make hypercalls since the hypervisor ABI
> structures aren't converted anywhere.

Hmm - is putting a 64-bit kernel underneath a 32-bit installation
something that works for all other purposes, but is meant to
explicitly not work with Xen? The domctl/sysctl model (which the
hvmctl series follows, since the earlier hvmop already has this
requirement) already requires no translation.

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCHv2 2/2] xen/privcmd: add ioctls for locking/unlocking hypercall buffers

2016-08-23 Thread Andrew Cooper
On 23/08/16 09:46, Jan Beulich wrote:
 On 22.08.16 at 19:24,  wrote:
>> On 04/08/16 17:02, Jan Beulich wrote:
>> On 04.08.16 at 17:16,  wrote:
 Using mlock() for hypercall buffers is not sufficient since mlocked
 pages are still subject to compaction and page migration.  Page
 migration can be prevented by taking additional references to the
 pages.

 Introduce two new ioctls: IOCTL_PRIVCMD_HCALL_BUF_LOCK and
 IOCTL_PRIVCMD_HCALL_BUF_UNLOCK which get and put the necessary page
 references.  The buffers do not need to be page aligned and they may
 overlap with other buffers.  However, it is not possible to partially
 unlock a buffer (i.e., the LOCK/UNLOCK must always be paired).  Any
 locked buffers are automatically unlocked when the file descriptor is
 closed.

 An alternative approach would be to extend the driver with an ioctl to
 populate a VMA with "special", non-migratable pages.  But the
 LOCK/UNLOCK ioctls are more flexible as they allow any page to be used
 for hypercalls (e.g., stack, mmap'd files, etc.).  This could be used
 to minimize bouncing for performance critical hypercalls.

 Signed-off-by: David Vrabel 
>>> Reviewed-by: Jan Beulich 
>>>
>>> with two remarks: Do you not care about compat mode callers?
>> No.  Compat mode callers can't make hypercalls since the hypervisor ABI
>> structures aren't converted anywhere.
> Hmm - is putting a 64-bit kernel underneath a 32-bit installation
> something that works for all other purposes, but is meant to
> explicitly not work with Xen? The domctl/sysctl model (which the
> hvmctl series follows, since the earlier hvmop already has this
> requirement) already requires no translation.

There were some very hacky patches around 5 years ago which allowed 64
bit PV guests to execute the compat hypercall entry in Xen, for the
purpose of making 32bit userspace work in a 64bit guest under 64bit Xen.

The modifications required bypassing the hypercall page in the guest
kernel to execute `int $0x82` rather than `syscall`, and modifications
in Xen to follow the hypercall path rather than bounce `int $0x82` back
into the guest.

As for now, this is not something which functions in the slightest.  The
bitness of hypercall-aware userspace must match the bitness of the guest
kernel it is running under.

~Andrew

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel