[PATCH 3.18 47/60] udf: Fix deadlock between writeback and udf_setsize()

2017-07-25 Thread Greg Kroah-Hartman
3.18-stable review patch.  If anyone has any objections, please let me know.

--

From: Jan Kara 

commit f2e95355891153f66d4156bf3a142c6489cd78c6 upstream.

udf_setsize() called truncate_setsize() with i_data_sem held. Thus
truncate_pagecache() called from truncate_setsize() could lock a page
under i_data_sem which can deadlock as page lock ranks below
i_data_sem - e.g. writeback can hold page lock and try to acquire
i_data_sem to map a block.

Fix the problem by moving truncate_setsize() calls from under
i_data_sem. It is safe for us to change i_size without holding
i_data_sem as all the places that depend on i_size being stable already
hold inode_lock.
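
To make the lock inversion concrete, the two orders involved look roughly
like this (illustration only, not part of the diff below; iinfo is the
usual struct udf_inode_info of the inode):

    /* writeback: holds the page lock, then wants i_data_sem to map a block */
    lock_page(page);
    down_write(&iinfo->i_data_sem);    /* blocks behind udf_setsize() */

    /* udf_setsize() before this patch: holds i_data_sem, then wants page locks */
    down_write(&iinfo->i_data_sem);
    truncate_setsize(inode, newsize);  /* -> truncate_pagecache() -> lock_page() */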

Fixes: 7e49b6f2480cb9a9e7322a91592e56a5c85361f5
Signed-off-by: Jan Kara 
Signed-off-by: Greg Kroah-Hartman 

---
 fs/udf/inode.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1237,8 +1237,8 @@ int udf_setsize(struct inode *inode, lof
return err;
}
 set_size:
-   truncate_setsize(inode, newsize);
up_write(&iinfo->i_data_sem);
+   truncate_setsize(inode, newsize);
} else {
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
down_write(&iinfo->i_data_sem);
@@ -1255,9 +1255,9 @@ set_size:
  udf_get_block);
if (err)
return err;
+   truncate_setsize(inode, newsize);
down_write(&iinfo->i_data_sem);
udf_clear_extent_cache(inode);
-   truncate_setsize(inode, newsize);
udf_truncate_extents(inode);
up_write(&iinfo->i_data_sem);
}




[PATCH 3.18 50/60] Revert "perf/core: Drop kernel samples even though :u is specified"

2017-07-25 Thread Greg Kroah-Hartman
3.18-stable review patch.  If anyone has any objections, please let me know.

--

From: Ingo Molnar 

commit 6a8a75f3235724c5941a33e287b2f98966ad14c5 upstream.

This reverts commit cc1582c231ea041fbc68861dfaf957eaf902b829.

This commit introduced a regression that broke rr-project, which uses sampling
events to receive a signal on overflow (but does not care about the contents
of the sample). These signals are critical to the correct operation of rr.

There's been some back and forth about how to fix it - but to avoid keeping
applications in limbo, queue up a revert.

Reported-by: Kyle Huey 
Acked-by: Kyle Huey 
Acked-by: Peter Zijlstra 
Cc: Jin Yao 
Cc: Vince Weaver 
Cc: Linus Torvalds 
Cc: Will Deacon 
Cc: Arnaldo Carvalho de Melo 
Cc: Alexander Shishkin 
Cc: Stephane Eranian 
Cc: Namhyung Kim 
Cc: Jiri Olsa 
Link: http://lkml.kernel.org/r/20170628105600.GC5981@leverpostej
Signed-off-by: Ingo Molnar 
Signed-off-by: Greg Kroah-Hartman 

---
 kernel/events/core.c |   21 -
 1 file changed, 21 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5761,21 +5761,6 @@ static void perf_log_throttle(struct per
perf_output_end(&handle);
 }
 
-static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
-{
-   /*
-* Due to interrupt latency (AKA "skid"), we may enter the
-* kernel before taking an overflow, even if the PMU is only
-* counting user events.
-* To avoid leaking information to userspace, we must always
-* reject kernel samples when exclude_kernel is set.
-*/
-   if (event->attr.exclude_kernel && !user_mode(regs))
-   return false;
-
-   return true;
-}
-
 /*
  * Generic event overflow handling, sampling.
  */
@@ -5823,12 +5808,6 @@ static int __perf_event_overflow(struct
}
 
/*
-* For security, drop the skid kernel samples if necessary.
-*/
-   if (!sample_is_allowed(event, regs))
-   return ret;
-
-   /*
 * XXX event_limit might not quite work as expected on inherited
 * events
 */




[PATCH 3.18 51/60] staging: rtl8188eu: add TL-WN722N v2 support

2017-07-25 Thread Greg Kroah-Hartman
3.18-stable review patch.  If anyone has any objections, please let me know.

--

From: Michael Gugino 

commit 5a1d4c5dd4eb2f1f8a9b30e61762f3b3b564df70 upstream.

Add support for USB Device TP-Link TL-WN722N v2.
VendorID: 0x2357, ProductID: 0x010c

Signed-off-by: Michael Gugino 
Signed-off-by: Greg Kroah-Hartman 

---
 drivers/staging/rtl8188eu/os_dep/usb_intf.c |1 +
 1 file changed, 1 insertion(+)

--- a/drivers/staging/rtl8188eu/os_dep/usb_intf.c
+++ b/drivers/staging/rtl8188eu/os_dep/usb_intf.c
@@ -48,6 +48,7 @@ static struct usb_device_id rtw_usb_id_t
{USB_DEVICE(0x2001, 0x330F)}, /* DLink DWA-125 REV D1 */
{USB_DEVICE(0x2001, 0x3310)}, /* Dlink DWA-123 REV D1 */
{USB_DEVICE(0x2001, 0x3311)}, /* DLink GO-USB-N150 REV B1 */
+   {USB_DEVICE(0x2357, 0x010c)}, /* TP-Link TL-WN722N v2 */
{USB_DEVICE(0x0df6, 0x0076)}, /* Sitecom N150 v2 */
{}  /* Terminating entry */
 };




[PATCH v2 02/13] xen/pvcalls: connect to the backend

2017-07-25 Thread Stefano Stabellini
Implement the probe function for the pvcalls frontend. Read the
supported versions, max-page-order and function-calls nodes from
xenstore.

Introduce a data structure named pvcalls_bedata. It contains pointers to
the command ring, the event channel, a list of active sockets and a list
of passive sockets. Accesses to the lists are protected by a spin_lock.

Introduce a waitqueue to allow waiting for a response on commands sent
to the backend.

Introduce an array of struct xen_pvcalls_response to store command
responses.

Only one frontend<->backend connection is supported at any given time
for a guest. Store the active frontend device in a static pointer.

Introduce a stub function for the event handler.
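
For reference, the compatibility checks the probe makes can be condensed to
the following sketch (error handling and cleanup omitted; see the full
pvcalls_front_probe() in the diff below):

    versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len);
    if (!len || strcmp(versions, "1"))
        return -EINVAL;        /* only protocol version 1 is supported */
    if (xenbus_scanf(XBT_NIL, dev->otherend, "max-page-order", "%u",
                     &max_page_order) <= 0 || max_page_order < RING_ORDER)
        return -ENODEV;        /* backend cannot grant a large enough ring */
    if (xenbus_scanf(XBT_NIL, dev->otherend, "function-calls", "%u",
                     &function_calls) <= 0 || function_calls != 1)
        return -ENODEV;        /* backend does not speak the function-call ABI */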

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 153 
 1 file changed, 153 insertions(+)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index a8d38c2..5e0b265 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -20,6 +20,29 @@
 #include 
 #include 
 
+#define PVCALLS_INVALID_ID (UINT_MAX)
+#define RING_ORDER XENBUS_MAX_RING_GRANT_ORDER
+#define PVCALLS_NR_REQ_PER_RING __CONST_RING_SIZE(xen_pvcalls, XEN_PAGE_SIZE)
+
+struct pvcalls_bedata {
+   struct xen_pvcalls_front_ring ring;
+   grant_ref_t ref;
+   int irq;
+
+   struct list_head socket_mappings;
+   struct list_head socketpass_mappings;
+   spinlock_t pvcallss_lock;
+
+   wait_queue_head_t inflight_req;
+   struct xen_pvcalls_response rsp[PVCALLS_NR_REQ_PER_RING];
+};
+struct xenbus_device *pvcalls_front_dev;
+
+static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id)
+{
+   return IRQ_HANDLED;
+}
+
 static const struct xenbus_device_id pvcalls_front_ids[] = {
{ "pvcalls" },
{ "" }
@@ -33,12 +56,142 @@ static int pvcalls_front_remove(struct xenbus_device *dev)
 static int pvcalls_front_probe(struct xenbus_device *dev,
  const struct xenbus_device_id *id)
 {
+   int ret = -EFAULT, evtchn, ref = -1, i;
+   unsigned int max_page_order, function_calls, len;
+   char *versions;
+   grant_ref_t gref_head = 0;
+   struct xenbus_transaction xbt;
+   struct pvcalls_bedata *bedata = NULL;
+   struct xen_pvcalls_sring *sring;
+
+   if (pvcalls_front_dev != NULL) {
+   dev_err(&dev->dev, "only one PV Calls connection supported\n");
+   return -EINVAL;
+   }
+
+   versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len);
+   if (!len)
+   return -EINVAL;
+   if (strcmp(versions, "1")) {
+   kfree(versions);
+   return -EINVAL;
+   }
+   kfree(versions);
+   ret = xenbus_scanf(XBT_NIL, dev->otherend,
+  "max-page-order", "%u", &max_page_order);
+   if (ret <= 0)
+   return -ENODEV;
+   if (max_page_order < RING_ORDER)
+   return -ENODEV;
+   ret = xenbus_scanf(XBT_NIL, dev->otherend,
+  "function-calls", "%u", &function_calls);
+   if (ret <= 0 || function_calls != 1)
+   return -ENODEV;
+   pr_info("%s max-page-order is %u\n", __func__, max_page_order);
+
+   bedata = kzalloc(sizeof(struct pvcalls_bedata), GFP_KERNEL);
+   if (!bedata)
+   return -ENOMEM;
+
+   init_waitqueue_head(&bedata->inflight_req);
+   for (i = 0; i < PVCALLS_NR_REQ_PER_RING; i++)
+   bedata->rsp[i].req_id = PVCALLS_INVALID_ID;
+
+   sring = (struct xen_pvcalls_sring *) __get_free_page(GFP_KERNEL |
+__GFP_ZERO);
+   if (!sring)
+   goto error;
+   SHARED_RING_INIT(sring);
+   FRONT_RING_INIT(&bedata->ring, sring, XEN_PAGE_SIZE);
+
+   ret = xenbus_alloc_evtchn(dev, &evtchn);
+   if (ret)
+   goto error;
+
+   bedata->irq = bind_evtchn_to_irqhandler(evtchn,
+   pvcalls_front_event_handler,
+   0, "pvcalls-frontend", dev);
+   if (bedata->irq < 0) {
+   ret = bedata->irq;
+   goto error;
+   }
+
+   ret = gnttab_alloc_grant_references(1, &gref_head);
+   if (ret < 0)
+   goto error;
+   bedata->ref = ref = gnttab_claim_grant_reference(&gref_head);
+   if (ref < 0)
+   goto error;
+   gnttab_grant_foreign_access_ref(ref, dev->otherend_id,
+   virt_to_gfn((void *)sring), 0);
+
+ again:
+   ret = xenbus_transaction_start(&xbt);
+   if (ret) {
+   xenbus_dev_fatal(dev, ret, "starting transaction");
+   goto error;
+   }
+   ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1);
+   if (ret)
+   goto error_xenbus;
+   ret = xenbus_printf(xbt, dev->nodename, "ring-ref", 

[PATCH v2 03/13] xen/pvcalls: implement socket command and handle events

2017-07-25 Thread Stefano Stabellini
Send a PVCALLS_SOCKET command to the backend, use the masked
req_prod_pvt as req_id. This way, req_id is guaranteed to be between 0
and PVCALLS_NR_REQ_PER_RING. We already have a slot in the rsp array
ready for the response, and there cannot be two outstanding responses
with the same req_id.

Wait for the response by waiting on the inflight_req waitqueue and
checking the req_id field in rsp[req_id]. Use atomic accesses to
read the field. Once a response is received, clear the corresponding rsp
slot by setting req_id to PVCALLS_INVALID_ID. Note that
PVCALLS_INVALID_ID is invalid only from the frontend point of view. It
is not part of the PVCalls protocol.

pvcalls_front_event_handler is in charge of copying responses from the
ring to the appropriate rsp slot. It is done by copying the body of the
response first, then by copying req_id atomically. After the copies,
wake up anybody waiting on the waitqueue.

pvcallss_lock protects accesses to the ring.
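
Putting these pieces together, the life of one command slot looks roughly
like this (condensed from pvcalls_front_socket() below; ring locking,
request setup and notification omitted):

    /* the mask keeps req_id in [0, PVCALLS_NR_REQ_PER_RING) because the
     * command ring size is a power of two */
    req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1);

    /* the slot is free only if no response is still parked in it */
    if (READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID)
        return -EAGAIN;

    /* ... fill and push the request, kick the backend, then wait ... */
    wait_event_interruptible(bedata->inflight_req,
            READ_ONCE(bedata->rsp[req_id].req_id) == req_id);

    ret = bedata->rsp[req_id].ret;
    smp_mb();   /* read ret before releasing the slot */
    WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID);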

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 94 +
 drivers/xen/pvcalls-front.h |  8 
 2 files changed, 102 insertions(+)
 create mode 100644 drivers/xen/pvcalls-front.h

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index 5e0b265..d1dbcf1 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -20,6 +20,8 @@
 #include 
 #include 
 
+#include "pvcalls-front.h"
+
 #define PVCALLS_INVALID_ID (UINT_MAX)
 #define RING_ORDER XENBUS_MAX_RING_GRANT_ORDER
 #define PVCALLS_NR_REQ_PER_RING __CONST_RING_SIZE(xen_pvcalls, XEN_PAGE_SIZE)
@@ -40,9 +42,101 @@ struct pvcalls_bedata {
 
 static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id)
 {
+   struct xenbus_device *dev = dev_id;
+   struct pvcalls_bedata *bedata;
+   struct xen_pvcalls_response *rsp;
+   uint8_t *src, *dst;
+   int req_id = 0, more = 0, done = 0;
+
+   if (dev == NULL)
+   return IRQ_HANDLED;
+
+   bedata = dev_get_drvdata(&dev->dev);
+   if (bedata == NULL)
+   return IRQ_HANDLED;
+
+again:
+   while (RING_HAS_UNCONSUMED_RESPONSES(&bedata->ring)) {
+   rsp = RING_GET_RESPONSE(&bedata->ring, bedata->ring.rsp_cons);
+
+   req_id = rsp->req_id;
+   src = (uint8_t *)&bedata->rsp[req_id];
+   src += sizeof(rsp->req_id);
+   dst = (uint8_t *)rsp;
+   dst += sizeof(rsp->req_id);
+   memcpy(dst, src, sizeof(*rsp) - sizeof(rsp->req_id));
+   /*
+* First copy the rest of the data, then req_id. It is
+* paired with the barrier when accessing bedata->rsp.
+*/
+   smp_wmb();
+   WRITE_ONCE(bedata->rsp[req_id].req_id, rsp->req_id);
+
+   done = 1;
+   bedata->ring.rsp_cons++;
+   }
+
+   RING_FINAL_CHECK_FOR_RESPONSES(&bedata->ring, more);
+   if (more)
+   goto again;
+   if (done)
+   wake_up(&bedata->inflight_req);
return IRQ_HANDLED;
 }
 
+int pvcalls_front_socket(struct socket *sock)
+{
+   struct pvcalls_bedata *bedata;
+   struct xen_pvcalls_request *req;
+   int notify, req_id, ret;
+
+   if (!pvcalls_front_dev)
+   return -EACCES;
+   /*
+* PVCalls only supports domain AF_INET,
+* type SOCK_STREAM and protocol 0 sockets for now.
+*
+* Check socket type here, AF_INET and protocol checks are done
+* by the caller.
+*/
+   if (sock->type != SOCK_STREAM)
+   return -ENOTSUPP;
+
+   bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+   spin_lock(&bedata->pvcallss_lock);
+   req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1);
+   if (RING_FULL(&bedata->ring) ||
+   READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) {
+   spin_unlock(&bedata->pvcallss_lock);
+   return -EAGAIN;
+   }
+   req = RING_GET_REQUEST(&bedata->ring, req_id);
+   req->req_id = req_id;
+   req->cmd = PVCALLS_SOCKET;
+   req->u.socket.id = (uint64_t) sock;
+   req->u.socket.domain = AF_INET;
+   req->u.socket.type = SOCK_STREAM;
+   req->u.socket.protocol = 0;
+
+   bedata->ring.req_prod_pvt++;
+   RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+   spin_unlock(&bedata->pvcallss_lock);
+   if (notify)
+   notify_remote_via_irq(bedata->irq);
+
+   if (wait_event_interruptible(bedata->inflight_req,
+   READ_ONCE(bedata->rsp[req_id].req_id) == req_id) != 0)
+   return -EINTR;
+
+   ret = bedata->rsp[req_id].ret;
+   /* read ret, then set this rsp slot to be reused */
+   smp_mb();
+   WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID);
+
+   return ret;
+}
+
 static const struct xenbus_device_id pvcalls_front_ids[] = {
{ "pvcalls" },
{ "" }
diff --git 

[PATCH v2 04/13] xen/pvcalls: implement connect command

2017-07-25 Thread Stefano Stabellini
Send PVCALLS_CONNECT to the backend. Allocate a new ring and evtchn for
the active socket.

Introduce a data structure to keep track of sockets. Introduce a
waitqueue to allow the frontend to wait on data coming from the backend
on the active socket (recvmsg command).

Two mutexes (one for reads and one for writes) will be used to protect
the active socket in and out rings from concurrent accesses.

sock->sk->sk_send_head is not used for ip sockets: reuse the field to
store a pointer to the struct sock_mapping corresponding to the socket.
This way, we can easily get the struct sock_mapping from the struct
socket.

Convert the struct socket pointer into an uint64_t and use it as id for
the new socket to pass to the backend.
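
Concretely, the mapping is stashed in, and later recovered from, the unused
field like this (sketch; the id field name for the connect request is
assumed by analogy with the bind request in the next patch):

    /* at connect/bind time: remember the mapping on the socket */
    WRITE_ONCE(sock->sk->sk_send_head, (void *)map);

    /* any later call that only receives a struct socket can get it back */
    map = (struct sock_mapping *)READ_ONCE(sock->sk->sk_send_head);

    /* the id passed to the backend is simply the pointer value */
    req->u.connect.id = (uint64_t)sock;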

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 177 +---
 drivers/xen/pvcalls-front.h |   2 +
 2 files changed, 168 insertions(+), 11 deletions(-)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index d1dbcf1..d0f5f42 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -13,6 +13,10 @@
  */
 
 #include 
+#include 
+#include 
+
+#include 
 
 #include 
 #include 
@@ -40,6 +44,24 @@ struct pvcalls_bedata {
 };
 struct xenbus_device *pvcalls_front_dev;
 
+struct sock_mapping {
+   bool active_socket;
+   struct list_head list;
+   struct socket *sock;
+   union {
+   struct {
+   int irq;
+   grant_ref_t ref;
+   struct pvcalls_data_intf *ring;
+   struct pvcalls_data data;
+   struct mutex in_mutex;
+   struct mutex out_mutex;
+
+   wait_queue_head_t inflight_conn_req;
+   } active;
+   };
+};
+
 static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id)
 {
struct xenbus_device *dev = dev_id;
@@ -84,6 +106,18 @@ static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id)
return IRQ_HANDLED;
 }
 
+static irqreturn_t pvcalls_front_conn_handler(int irq, void *sock_map)
+{
+   struct sock_mapping *map = sock_map;
+
+   if (map == NULL)
+   return IRQ_HANDLED;
+
+   wake_up_interruptible(&map->active.inflight_conn_req);
+
+   return IRQ_HANDLED;
+}
+
 int pvcalls_front_socket(struct socket *sock)
 {
struct pvcalls_bedata *bedata;
@@ -137,6 +171,127 @@ int pvcalls_front_socket(struct socket *sock)
return ret;
 }
 
+static struct sock_mapping *create_active(int *evtchn)
+{
+   struct sock_mapping *map = NULL;
+   void *bytes;
+   int ret, irq = -1, i;
+
+   map = kzalloc(sizeof(*map), GFP_KERNEL);
+   if (map == NULL)
+   return NULL;
+
+   init_waitqueue_head(&map->active.inflight_conn_req);
+
+   map->active.ring = (struct pvcalls_data_intf *)
+   __get_free_page(GFP_KERNEL | __GFP_ZERO);
+   if (map->active.ring == NULL)
+   goto out_error;
+   memset(map->active.ring, 0, XEN_PAGE_SIZE);
+   map->active.ring->ring_order = RING_ORDER;
+   bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+   map->active.ring->ring_order);
+   if (bytes == NULL)
+   goto out_error;
+   for (i = 0; i < (1 << map->active.ring->ring_order); i++)
+   map->active.ring->ref[i] = gnttab_grant_foreign_access(
+   pvcalls_front_dev->otherend_id,
+   pfn_to_gfn(virt_to_pfn(bytes) + i), 0);
+
+   map->active.ref = gnttab_grant_foreign_access(
+   pvcalls_front_dev->otherend_id,
+   pfn_to_gfn(virt_to_pfn((void *)map->active.ring)), 0);
+
+   map->active.data.in = bytes;
+   map->active.data.out = bytes +
+   XEN_FLEX_RING_SIZE(map->active.ring->ring_order);
+
+   ret = xenbus_alloc_evtchn(pvcalls_front_dev, evtchn);
+   if (ret)
+   goto out_error;
+   irq = bind_evtchn_to_irqhandler(*evtchn, pvcalls_front_conn_handler,
+   0, "pvcalls-frontend", map);
+   if (irq < 0)
+   goto out_error;
+
+   map->active.irq = irq;
+   map->active_socket = true;
+   mutex_init(&map->active.in_mutex);
+   mutex_init(&map->active.out_mutex);
+
+   return map;
+
+out_error:
+   if (irq >= 0)
+   unbind_from_irqhandler(irq, map);
+   else if (*evtchn >= 0)
+   xenbus_free_evtchn(pvcalls_front_dev, *evtchn);
+   kfree(map->active.data.in);
+   kfree(map->active.ring);
+   kfree(map);
+   return NULL;
+}
+
+int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr,
+   int addr_len, int flags)
+{
+   struct pvcalls_bedata *bedata;
+   struct sock_mapping *map = NULL;
+   struct xen_pvcalls_request *req;

[PATCH v2 05/13] xen/pvcalls: implement bind command

2017-07-25 Thread Stefano Stabellini
Send PVCALLS_BIND to the backend. Introduce a new structure, part of
struct sock_mapping, to store information specific to passive sockets.

Introduce a status field to keep track of the status of the passive
socket.

Introduce a waitqueue for the "accept" command (see the accept command
implementation): it is used to allow only one outstanding accept
command at any given time and to implement polling on the passive
socket. Introduce a flags field to keep track of in-flight accept and
poll commands.

sock->sk->sk_send_head is not used for ip sockets: reuse the field to
store a pointer to the struct sock_mapping corresponding to the socket.

Convert the struct socket pointer into an uint64_t and use it as id for
the socket to pass to the backend.

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 73 +
 drivers/xen/pvcalls-front.h |  3 ++
 2 files changed, 76 insertions(+)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index d0f5f42..af2ce20 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -59,6 +59,23 @@ struct sock_mapping {
 
wait_queue_head_t inflight_conn_req;
} active;
+   struct {
+   /* Socket status */
+#define PVCALLS_STATUS_UNINITALIZED  0
+#define PVCALLS_STATUS_BIND  1
+#define PVCALLS_STATUS_LISTEN2
+   uint8_t status;
+   /*
+* Internal state-machine flags.
+* Only one accept operation can be inflight for a socket.
+* Only one poll operation can be inflight for a given socket.
+*/
+#define PVCALLS_FLAG_ACCEPT_INFLIGHT 0
+#define PVCALLS_FLAG_POLL_INFLIGHT   1
+#define PVCALLS_FLAG_POLL_RET2
+   uint8_t flags;
+   wait_queue_head_t inflight_accept_req;
+   } passive;
};
 };
 
@@ -292,6 +309,62 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr,
return ret;
 }
 
+int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+   struct pvcalls_bedata *bedata;
+   struct sock_mapping *map = NULL;
+   struct xen_pvcalls_request *req;
+   int notify, req_id, ret;
+
+   if (!pvcalls_front_dev)
+   return -ENOTCONN;
+   if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM)
+   return -ENOTSUPP;
+   bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+   map = kzalloc(sizeof(*map), GFP_KERNEL);
+   if (map == NULL)
+   return -ENOMEM;
+
+   spin_lock(&bedata->pvcallss_lock);
+   req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1);
+   if (RING_FULL(&bedata->ring) ||
+   READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) {
+   kfree(map);
+   spin_unlock(&bedata->pvcallss_lock);
+   return -EAGAIN;
+   }
+   req = RING_GET_REQUEST(&bedata->ring, req_id);
+   req->req_id = req_id;
+   map->sock = sock;
+   req->cmd = PVCALLS_BIND;
+   req->u.bind.id = (uint64_t) sock;
+   memcpy(req->u.bind.addr, addr, sizeof(*addr));
+   req->u.bind.len = addr_len;
+
+   init_waitqueue_head(&map->passive.inflight_accept_req);
+
+   list_add_tail(&map->list, &bedata->socketpass_mappings);
+   WRITE_ONCE(sock->sk->sk_send_head, (void *)map);
+   map->active_socket = false;
+
+   bedata->ring.req_prod_pvt++;
+   RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+   spin_unlock(&bedata->pvcallss_lock);
+   if (notify)
+   notify_remote_via_irq(bedata->irq);
+
+   wait_event(bedata->inflight_req,
+  READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
+
+   map->passive.status = PVCALLS_STATUS_BIND;
+   ret = bedata->rsp[req_id].ret;
+   /* read ret, then set this rsp slot to be reused */
+   smp_mb();
+   WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID);
+   return 0;
+}
+
 static const struct xenbus_device_id pvcalls_front_ids[] = {
{ "pvcalls" },
{ "" }
diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h
index 63b0417..8b0a274 100644
--- a/drivers/xen/pvcalls-front.h
+++ b/drivers/xen/pvcalls-front.h
@@ -6,5 +6,8 @@
 int pvcalls_front_socket(struct socket *sock);
 int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr,
  int addr_len, int flags);
+int pvcalls_front_bind(struct socket *sock,
+  struct sockaddr *addr,
+  int addr_len);
 
 #endif
-- 
1.9.1



[PATCH v2 10/13] xen/pvcalls: implement poll command

2017-07-25 Thread Stefano Stabellini
For active sockets, check the indexes and use the inflight_conn_req
waitqueue to wait.

For passive sockets, send PVCALLS_POLL to the backend. Use the
inflight_accept_req waitqueue if an accept is outstanding. Otherwise use
the inflight_req waitqueue: inflight_req is woken when a new response
is received; on wakeup we check whether the POLL response has arrived by
looking at the PVCALLS_FLAG_POLL_RET flag. We set the flag from
pvcalls_front_event_handler, if the response was for a POLL command.

In pvcalls_front_event_handler, get the struct socket pointer from the
poll id (we previously converted struct socket* to uint64_t and used it
as id).
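
Only the passive-socket helper is visible in the hunks below; the top-level
poll entry point is assumed to dispatch on the mapping roughly as follows
(sketch only; pvcalls_front_poll_active is the active-socket counterpart
described in the first paragraph):

    unsigned int pvcalls_front_poll(struct file *file, struct socket *sock,
                                    poll_table *wait)
    {
        struct pvcalls_bedata *bedata;
        struct sock_mapping *map;

        if (!pvcalls_front_dev)
            return POLLNVAL;
        bedata = dev_get_drvdata(&pvcalls_front_dev->dev);

        map = (struct sock_mapping *)READ_ONCE(sock->sk->sk_send_head);
        if (!map)
            return POLLNVAL;

        if (map->active_socket)
            return pvcalls_front_poll_active(file, bedata, map, wait);
        else
            return pvcalls_front_poll_passive(file, bedata, map, wait);
    }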

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 134 
 drivers/xen/pvcalls-front.h |   3 +
 2 files changed, 126 insertions(+), 11 deletions(-)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index b4ca569..833b717 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -130,17 +130,35 @@ static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id)
rsp = RING_GET_RESPONSE(&bedata->ring, bedata->ring.rsp_cons);
 
req_id = rsp->req_id;
-   src = (uint8_t *)&bedata->rsp[req_id];
-   src += sizeof(rsp->req_id);
-   dst = (uint8_t *)rsp;
-   dst += sizeof(rsp->req_id);
-   memcpy(dst, src, sizeof(*rsp) - sizeof(rsp->req_id));
-   /*
-* First copy the rest of the data, then req_id. It is
-* paired with the barrier when accessing bedata->rsp.
-*/
-   smp_wmb();
-   WRITE_ONCE(bedata->rsp[req_id].req_id, rsp->req_id);
+   if (rsp->cmd == PVCALLS_POLL) {
+   struct socket *sock = (struct socket *) rsp->u.poll.id;
+   struct sock_mapping *map =
+   (struct sock_mapping *)
+   READ_ONCE(sock->sk->sk_send_head);
+
+   set_bit(PVCALLS_FLAG_POLL_RET,
+   (void *)&map->passive.flags);
+   /*
+* Set RET, then clear INFLIGHT. It pairs with
+* the checks at the beginning of
+* pvcalls_front_poll_passive.
+*/
+   smp_wmb();
+   clear_bit(PVCALLS_FLAG_POLL_INFLIGHT,
+ (void *)&map->passive.flags);
+   } else {
+   src = (uint8_t *)&bedata->rsp[req_id];
+   src += sizeof(rsp->req_id);
+   dst = (uint8_t *)rsp;
+   dst += sizeof(rsp->req_id);
+   memcpy(dst, src, sizeof(*rsp) - sizeof(rsp->req_id));
+   /*
+* First copy the rest of the data, then req_id. It is
+* paired with the barrier when accessing bedata->rsp.
+*/
+   smp_wmb();
+   WRITE_ONCE(bedata->rsp[req_id].req_id, rsp->req_id);
+   }
 
done = 1;
bedata->ring.rsp_cons++;
@@ -707,6 +725,100 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
return ret;
 }
 
+static unsigned int pvcalls_front_poll_passive(struct file *file,
+  struct pvcalls_bedata *bedata,
+  struct sock_mapping *map,
+  poll_table *wait)
+{
+   int notify, req_id;
+   struct xen_pvcalls_request *req;
+
+   if (test_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+(void *)&map->passive.flags)) {
+   poll_wait(file, &map->passive.inflight_accept_req, wait);
+   return 0;
+   }
+
+   if (test_and_clear_bit(PVCALLS_FLAG_POLL_RET,
+  (void *)&map->passive.flags))
+   return POLLIN;
+
+   /*
+* First check RET, then INFLIGHT. No barriers necessary to
+* ensure execution ordering because of the conditional
+* instructions creating control dependencies.
+*/
+
+   if (test_and_set_bit(PVCALLS_FLAG_POLL_INFLIGHT,
+(void *)&map->passive.flags)) {
+   poll_wait(file, &bedata->inflight_req, wait);
+   return 0;
+   }
+
+   spin_lock(&bedata->pvcallss_lock);
+   req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1);
+   if (RING_FULL(&bedata->ring) ||
+   READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) {
+   spin_unlock(&bedata->pvcallss_lock);
+   return -EAGAIN;
+   }
+   req = RING_GET_REQUEST(&bedata->ring, req_id);
+   req->req_id = req_id;
+   req->cmd = PVCALLS_POLL;

[PATCH v2 09/13] xen/pvcalls: implement recvmsg

2017-07-25 Thread Stefano Stabellini
Implement recvmsg by copying data from the "in" ring. If not enough data
is available and the recvmsg call is blocking, then wait on the
inflight_conn_req waitqueue. Take the active socket in_mutex so that
only one function can access the ring at any given time.

If not enough data is available on the ring, rather than returning
immediately or sleep-waiting, spin for up to 5000 cycles. This small
optimization turns out to improve performance and latency significantly.
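
The spin threshold itself is not part of the visible hunks; a definition
along these lines is assumed, matching the 5000 cycles mentioned above:

    /* Assumed definition; used by the spin-then-sleep loop in
     * pvcalls_front_recvmsg() below. */
    #define PVCALLS_FRONT_MAX_SPIN 5000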

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 106 
 drivers/xen/pvcalls-front.h |   4 ++
 2 files changed, 110 insertions(+)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index d8ed280..b4ca569 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -96,6 +96,20 @@ static int pvcalls_front_write_todo(struct sock_mapping *map)
return size - pvcalls_queued(prod, cons, size);
 }
 
+static bool pvcalls_front_read_todo(struct sock_mapping *map)
+{
+   struct pvcalls_data_intf *intf = map->active.ring;
+   RING_IDX cons, prod;
+   int32_t error;
+
+   cons = intf->in_cons;
+   prod = intf->in_prod;
+   error = intf->in_error;
+   return (error != 0 ||
+   pvcalls_queued(prod, cons,
+  XEN_FLEX_RING_SIZE(intf->ring_order))) != 0;
+}
+
 static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id)
 {
struct xenbus_device *dev = dev_id;
@@ -418,6 +432,98 @@ int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg,
return tot_sent;
 }
 
+static int __read_ring(struct pvcalls_data_intf *intf,
+  struct pvcalls_data *data,
+  struct iov_iter *msg_iter,
+  size_t len, int flags)
+{
+   RING_IDX cons, prod, size, masked_prod, masked_cons;
+   RING_IDX array_size = XEN_FLEX_RING_SIZE(intf->ring_order);
+   int32_t error;
+
+   cons = intf->in_cons;
+   prod = intf->in_prod;
+   error = intf->in_error;
+   /* get pointers before reading from the ring */
+   virt_rmb();
+   if (error < 0)
+   return error;
+
+   size = pvcalls_queued(prod, cons, array_size);
+   masked_prod = pvcalls_mask(prod, array_size);
+   masked_cons = pvcalls_mask(cons, array_size);
+
+   if (size == 0)
+   return 0;
+
+   if (len > size)
+   len = size;
+
+   if (masked_prod > masked_cons) {
+   copy_to_iter(data->in + masked_cons, len, msg_iter);
+   } else {
+   if (len > (array_size - masked_cons)) {
+   copy_to_iter(data->in + masked_cons,
+array_size - masked_cons, msg_iter);
+   copy_to_iter(data->in,
+len - (array_size - masked_cons),
+msg_iter);
+   } else {
+   copy_to_iter(data->in + masked_cons, len, msg_iter);
+   }
+   }
+   /* read data from the ring before increasing the index */
+   virt_mb();
+   if (!(flags & MSG_PEEK))
+   intf->in_cons += len;
+
+   return len;
+}
+
+int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+int flags)
+{
+   struct pvcalls_bedata *bedata;
+   int ret = -EAGAIN;
+   struct sock_mapping *map;
+   int count = 0;
+
+   if (!pvcalls_front_dev)
+   return -ENOTCONN;
+   bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+   map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head);
+   if (!map)
+   return -ENOTSOCK;
+
+   if (flags & (MSG_CMSG_CLOEXEC|MSG_ERRQUEUE|MSG_OOB|MSG_TRUNC))
+   return -EOPNOTSUPP;
+
+   mutex_lock(&map->active.in_mutex);
+   if (len > XEN_FLEX_RING_SIZE(map->active.ring->ring_order))
+   len = XEN_FLEX_RING_SIZE(map->active.ring->ring_order);
+
+   while (!(flags & MSG_DONTWAIT) && !pvcalls_front_read_todo(map)) {
+   if (count < PVCALLS_FRONT_MAX_SPIN)
+   count++;
+   else
+   wait_event_interruptible(map->active.inflight_conn_req,
+pvcalls_front_read_todo(map));
+   }
+   ret = __read_ring(map->active.ring, &map->active.data,
+ &msg->msg_iter, len, flags);
+
+   if (ret > 0)
+   notify_remote_via_irq(map->active.irq);
+   if (ret == 0)
+   ret = -EAGAIN;
+   if (ret == -ENOTCONN)
+   ret = 0;
+
+   mutex_unlock(&map->active.in_mutex);
+   return ret;
+}
+
int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 {
struct pvcalls_bedata *bedata;
diff --git a/drivers/xen/pvcalls-front.h 

[PATCH v2 06/13] xen/pvcalls: implement listen command

2017-07-25 Thread Stefano Stabellini
Send PVCALLS_LISTEN to the backend.
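
The command patches in this series share the same request/response flow on
the xenbus ring; in outline (a simplified sketch of the code in the diff
below, with locking and error handling omitted):

    /* pick the request slot and its matching response slot */
    req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1);

    /* fill in the request and publish it to the backend */
    req = RING_GET_REQUEST(&bedata->ring, req_id);
    req->req_id = req_id;
    req->cmd = PVCALLS_LISTEN;   /* or PVCALLS_BIND, PVCALLS_CONNECT, ... */
    bedata->ring.req_prod_pvt++;
    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
    if (notify)
        notify_remote_via_irq(bedata->irq);

    /* wait for the backend to write a response into the same slot */
    wait_event(bedata->inflight_req,
               READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
    ret = bedata->rsp[req_id].ret;
    /* read ret, then mark the slot reusable */
    smp_mb();
    WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID);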

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 48 +
 drivers/xen/pvcalls-front.h |  1 +
 2 files changed, 49 insertions(+)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index af2ce20..3b5d50e 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -365,6 +365,54 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
return 0;
 }
 
+int pvcalls_front_listen(struct socket *sock, int backlog)
+{
+   struct pvcalls_bedata *bedata;
+   struct sock_mapping *map;
+   struct xen_pvcalls_request *req;
+   int notify, req_id, ret;
+
+   if (!pvcalls_front_dev)
+   return -ENOTCONN;
+   bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+   map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head);
+   if (!map)
+   return -ENOTSOCK;
+
+   if (map->passive.status != PVCALLS_STATUS_BIND)
+   return -EOPNOTSUPP;
+
+   spin_lock(&bedata->pvcallss_lock);
+   req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1);
+   if (RING_FULL(&bedata->ring) ||
+   bedata->rsp[req_id].req_id != PVCALLS_INVALID_ID) {
+   spin_unlock(&bedata->pvcallss_lock);
+   return -EAGAIN;
+   }
+   req = RING_GET_REQUEST(&bedata->ring, req_id);
+   req->req_id = req_id;
+   req->cmd = PVCALLS_LISTEN;
+   req->u.listen.id = (uint64_t) sock;
+   req->u.listen.backlog = backlog;
+
+   bedata->ring.req_prod_pvt++;
+   RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+   spin_unlock(&bedata->pvcallss_lock);
+   if (notify)
+   notify_remote_via_irq(bedata->irq);
+
+   wait_event(bedata->inflight_req,
+  READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
+
+   map->passive.status = PVCALLS_STATUS_LISTEN;
+   ret = bedata->rsp[req_id].ret;
+   /* read ret, then set this rsp slot to be reused */
+   smp_mb();
+   WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID);
+   return ret;
+}
+
 static const struct xenbus_device_id pvcalls_front_ids[] = {
{ "pvcalls" },
{ "" }
diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h
index 8b0a274..aa8fe10 100644
--- a/drivers/xen/pvcalls-front.h
+++ b/drivers/xen/pvcalls-front.h
@@ -9,5 +9,6 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr,
 int pvcalls_front_bind(struct socket *sock,
   struct sockaddr *addr,
   int addr_len);
+int pvcalls_front_listen(struct socket *sock, int backlog);
 
 #endif
-- 
1.9.1



[PATCH v2 01/13] xen/pvcalls: introduce the pvcalls xenbus frontend

2017-07-25 Thread Stefano Stabellini
Introduce a xenbus frontend for the pvcalls protocol, as defined by
https://xenbits.xen.org/docs/unstable/misc/pvcalls.html.

This patch only adds the stubs, the code will be added by the following
patches.

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 61 +
 1 file changed, 61 insertions(+)
 create mode 100644 drivers/xen/pvcalls-front.c

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
new file mode 100644
index 000..a8d38c2
--- /dev/null
+++ b/drivers/xen/pvcalls-front.c
@@ -0,0 +1,61 @@
+/*
+ * (c) 2017 Stefano Stabellini 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static const struct xenbus_device_id pvcalls_front_ids[] = {
+   { "pvcalls" },
+   { "" }
+};
+
+static int pvcalls_front_remove(struct xenbus_device *dev)
+{
+   return 0;
+}
+
+static int pvcalls_front_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+   return 0;
+}
+
+static void pvcalls_front_changed(struct xenbus_device *dev,
+   enum xenbus_state backend_state)
+{
+}
+
+static struct xenbus_driver pvcalls_front_driver = {
+   .ids = pvcalls_front_ids,
+   .probe = pvcalls_front_probe,
+   .remove = pvcalls_front_remove,
+   .otherend_changed = pvcalls_front_changed,
+};
+
+static int __init pvcalls_frontend_init(void)
+{
+   if (!xen_domain())
+   return -ENODEV;
+
+   pr_info("Initialising Xen pvcalls frontend driver\n");
+
+   return xenbus_register_frontend(&pvcalls_front_driver);
+}
+
+module_init(pvcalls_frontend_init);
-- 
1.9.1



[PATCH v2 11/13] xen/pvcalls: implement release command

2017-07-25 Thread Stefano Stabellini
Send PVCALLS_RELEASE to the backend and wait for a reply. Take both
in_mutex and out_mutex to avoid concurrent accesses. Then, free the
socket.

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 85 +
 drivers/xen/pvcalls-front.h |  1 +
 2 files changed, 86 insertions(+)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index 833b717..5a4040e 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -184,6 +184,23 @@ static irqreturn_t pvcalls_front_conn_handler(int irq, void *sock_map)
return IRQ_HANDLED;
 }
 
+static void pvcalls_front_free_map(struct pvcalls_bedata *bedata,
+  struct sock_mapping *map)
+{
+   int i;
+
+   spin_lock(&bedata->pvcallss_lock);
+   if (!list_empty(&map->list))
+   list_del_init(&map->list);
+   spin_unlock(&bedata->pvcallss_lock);
+
+   for (i = 0; i < (1 << map->active.ring->ring_order); i++)
+   gnttab_end_foreign_access(map->active.ring->ref[i], 0, 0);
+   gnttab_end_foreign_access(map->active.ref, 0, 0);
+   free_page((unsigned long)map->active.ring);
+   unbind_from_irqhandler(map->active.irq, map);
+}
+
 int pvcalls_front_socket(struct socket *sock)
 {
struct pvcalls_bedata *bedata;
@@ -819,6 +836,74 @@ unsigned int pvcalls_front_poll(struct file *file, struct socket *sock,
return pvcalls_front_poll_passive(file, bedata, map, wait);
 }
 
+int pvcalls_front_release(struct socket *sock)
+{
+   struct pvcalls_bedata *bedata;
+   struct sock_mapping *map;
+   int req_id, notify;
+   struct xen_pvcalls_request *req;
+
+   if (!pvcalls_front_dev)
+   return -EIO;
+   bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+   if (!bedata)
+   return -EIO;
+
+   if (sock->sk == NULL)
+   return 0;
+
+   map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head);
+   if (map == NULL)
+   return 0;
+
+   spin_lock(&bedata->pvcallss_lock);
+   req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1);
+   if (RING_FULL(&bedata->ring) ||
+   READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) {
+   spin_unlock(&bedata->pvcallss_lock);
+   return -EAGAIN;
+   }
+   WRITE_ONCE(sock->sk->sk_send_head, NULL);
+
+   req = RING_GET_REQUEST(&bedata->ring, req_id);
+   req->req_id = req_id;
+   req->cmd = PVCALLS_RELEASE;
+   req->u.release.id = (uint64_t)sock;
+
+   bedata->ring.req_prod_pvt++;
+   RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+   spin_unlock(&bedata->pvcallss_lock);
+   if (notify)
+   notify_remote_via_irq(bedata->irq);
+
+   wait_event(bedata->inflight_req,
+   READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
+
+   if (map->active_socket) {
+   /* 
+* Set in_error and wake up inflight_conn_req to force
+* recvmsg waiters to exit.
+*/
+   map->active.ring->in_error = -EBADF;
+   wake_up_interruptible(&map->active.inflight_conn_req);
+
+   mutex_lock(&map->active.in_mutex);
+   mutex_lock(&map->active.out_mutex);
+   pvcalls_front_free_map(bedata, map);
+   mutex_unlock(&map->active.out_mutex);
+   mutex_unlock(&map->active.in_mutex);
+   kfree(map);
+   } else {
+   spin_lock(&bedata->pvcallss_lock);
+   list_del_init(&map->list);
+   kfree(map);
+   spin_unlock(&bedata->pvcallss_lock);
+   }
+   WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID);
+
+   return 0;
+}
+
 static const struct xenbus_device_id pvcalls_front_ids[] = {
{ "pvcalls" },
{ "" }
diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h
index 25e05b8..3332978 100644
--- a/drivers/xen/pvcalls-front.h
+++ b/drivers/xen/pvcalls-front.h
@@ -23,5 +23,6 @@ int pvcalls_front_recvmsg(struct socket *sock,
 unsigned int pvcalls_front_poll(struct file *file,
struct socket *sock,
poll_table *wait);
+int pvcalls_front_release(struct socket *sock);
 
 #endif
-- 
1.9.1



[PATCH v2 08/13] xen/pvcalls: implement sendmsg

2017-07-25 Thread Stefano Stabellini
Send data to an active socket by copying data to the "out" ring. Take
the active socket out_mutex so that only one function can access the
ring at any given time.

If not enough room is available on the ring, rather than returning
immediately or sleep-waiting, spin for up to 5000 cycles. This small
optimization turns out to improve performance significantly.
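
The copy into the "out" ring has to cope with wrap-around: the free area
may be split between the tail and the head of the data array. A minimal
sketch of that masking logic, mirroring __write_ring() in the diff below
(barriers and error checks omitted):

    size = pvcalls_queued(prod, cons, array_size);  /* bytes already queued */
    if (len > array_size - size)
        len = array_size - size;                    /* never overwrite unread data */
    masked_prod = pvcalls_mask(prod, array_size);
    masked_cons = pvcalls_mask(cons, array_size);
    if (masked_prod < masked_cons || len <= array_size - masked_prod) {
        /* free space is contiguous */
        copy_from_iter(data->out + masked_prod, len, msg_iter);
    } else {
        /* free space wraps: fill the tail of the array, then the head */
        copy_from_iter(data->out + masked_prod,
                       array_size - masked_prod, msg_iter);
        copy_from_iter(data->out,
                       len - (array_size - masked_prod), msg_iter);
    }
    intf->out_prod += len;      /* publish the new producer index */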

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 109 
 drivers/xen/pvcalls-front.h |   3 ++
 2 files changed, 112 insertions(+)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index b8c4538..d8ed280 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -29,6 +29,7 @@
 #define PVCALLS_INVALID_ID (UINT_MAX)
 #define RING_ORDER XENBUS_MAX_RING_GRANT_ORDER
 #define PVCALLS_NR_REQ_PER_RING __CONST_RING_SIZE(xen_pvcalls, XEN_PAGE_SIZE)
+#define PVCALLS_FRONT_MAX_SPIN 5000
 
 struct pvcalls_bedata {
struct xen_pvcalls_front_ring ring;
@@ -79,6 +80,22 @@ struct sock_mapping {
};
 };
 
+static int pvcalls_front_write_todo(struct sock_mapping *map)
+{
+   struct pvcalls_data_intf *intf = map->active.ring;
+   RING_IDX cons, prod, size = XEN_FLEX_RING_SIZE(intf->ring_order);
+   int32_t error;
+
+   cons = intf->out_cons;
+   prod = intf->out_prod;
+   error = intf->out_error;
+   if (error == -ENOTCONN)
+   return 0;
+   if (error != 0)
+   return error;
+   return size - pvcalls_queued(prod, cons, size);
+}
+
 static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id)
 {
struct xenbus_device *dev = dev_id;
@@ -309,6 +326,98 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr,
return ret;
 }
 
+static int __write_ring(struct pvcalls_data_intf *intf,
+   struct pvcalls_data *data,
+   struct iov_iter *msg_iter,
+   size_t len)
+{
+   RING_IDX cons, prod, size, masked_prod, masked_cons;
+   RING_IDX array_size = XEN_FLEX_RING_SIZE(intf->ring_order);
+   int32_t error;
+
+   cons = intf->out_cons;
+   prod = intf->out_prod;
+   error = intf->out_error;
+   /* read indexes before continuing */
+   virt_mb();
+
+   if (error < 0)
+   return error;
+
+   size = pvcalls_queued(prod, cons, array_size);
+   if (size >= array_size)
+   return 0;
+   if (len > array_size - size)
+   len = array_size - size;
+
+   masked_prod = pvcalls_mask(prod, array_size);
+   masked_cons = pvcalls_mask(cons, array_size);
+
+   if (masked_prod < masked_cons) {
+   copy_from_iter(data->out + masked_prod, len, msg_iter);
+   } else {
+   if (len > array_size - masked_prod) {
+   copy_from_iter(data->out + masked_prod,
+  array_size - masked_prod, msg_iter);
+   copy_from_iter(data->out,
+  len - (array_size - masked_prod),
+  msg_iter);
+   } else {
+   copy_from_iter(data->out + masked_prod, len, msg_iter);
+   }
+   }
+   /* write to ring before updating pointer */
+   virt_wmb();
+   intf->out_prod += len;
+
+   return len;
+}
+
+int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+   struct pvcalls_bedata *bedata;
+   struct sock_mapping *map;
+   int sent = 0, tot_sent = 0;
+   int count = 0, flags;
+
+   if (!pvcalls_front_dev)
+   return -ENOTCONN;
+   bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+   map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head);
+   if (!map)
+   return -ENOTSOCK;
+
+   flags = msg->msg_flags;
+   if (flags & (MSG_CONFIRM|MSG_DONTROUTE|MSG_EOR|MSG_OOB))
+   return -EOPNOTSUPP;
+
+   mutex_lock(&map->active.out_mutex);
+   if ((flags & MSG_DONTWAIT) && !pvcalls_front_write_todo(map)) {
+   mutex_unlock(&map->active.out_mutex);
+   return -EAGAIN;
+   }
+
+again:
+   count++;
+   sent = __write_ring(map->active.ring,
+   &map->active.data, &msg->msg_iter,
+   len);
+   if (sent > 0) {
+   len -= sent;
+   tot_sent += sent;
+   notify_remote_via_irq(map->active.irq);
+   }
+   if (sent >= 0 && len > 0 && count < PVCALLS_FRONT_MAX_SPIN)
+   goto again;
+   if (sent < 0)
+   tot_sent = sent;
+
+   mutex_unlock(&map->active.out_mutex);
+   return tot_sent;
+}
+
 int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 {
struct pvcalls_bedata 

[PATCH v2 07/13] xen/pvcalls: implement accept command

2017-07-25 Thread Stefano Stabellini
Send PVCALLS_ACCEPT to the backend. Allocate a new active socket. Make
sure that only one accept command is executed at any given time by
setting PVCALLS_FLAG_ACCEPT_INFLIGHT and waiting on the
inflight_accept_req waitqueue.

sock->sk->sk_send_head is not used for ip sockets: reuse the field to
store a pointer to the struct sock_mapping corresponding to the socket.

Convert the new struct socket pointer into an uint64_t and use it as id
for the new socket to pass to the backend.
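
In condensed form, the scheme used in the diff below is:

    /* the socket pointer doubles as the id the backend will echo back */
    req->u.accept.id_new = (uint64_t)newsock;
    /* the new active mapping is reachable from the socket itself */
    WRITE_ONCE(newsock->sk->sk_send_head, (void *)map2);

    /* later, any frontend call recovers its state from the socket */
    map = (struct sock_mapping *)READ_ONCE(sock->sk->sk_send_head);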

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 79 +
 drivers/xen/pvcalls-front.h |  3 ++
 2 files changed, 82 insertions(+)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index 3b5d50e..b8c4538 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -413,6 +413,85 @@ int pvcalls_front_listen(struct socket *sock, int backlog)
return ret;
 }
 
+int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+   struct pvcalls_bedata *bedata;
+   struct sock_mapping *map;
+   struct sock_mapping *map2 = NULL;
+   struct xen_pvcalls_request *req;
+   int notify, req_id, ret, evtchn;
+
+   if (!pvcalls_front_dev)
+   return -ENOTCONN;
+   bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+   map = (struct sock_mapping *) READ_ONCE(sock->sk->sk_send_head);
+   if (!map)
+   return -ENOTSOCK;
+
+   if (map->passive.status != PVCALLS_STATUS_LISTEN)
+   return -EINVAL;
+
+   /*
+* Backend only supports 1 inflight accept request, will return
+* errors for the others
+*/
+   if (test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+(void *)&map->passive.flags)) {
+   if (wait_event_interruptible(map->passive.inflight_accept_req,
+   !test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+ (void *)&map->passive.flags))
+   != 0)
+   return -EINTR;
+   }
+
+
+   newsock->sk = kzalloc(sizeof(*newsock->sk), GFP_KERNEL);
+   if (newsock->sk == NULL)
+   return -ENOMEM;
+
+   spin_lock(&bedata->pvcallss_lock);
+   req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1);
+   if (RING_FULL(&bedata->ring) ||
+   READ_ONCE(bedata->rsp[req_id].req_id) != PVCALLS_INVALID_ID) {
+   kfree(newsock->sk);
+   spin_unlock(&bedata->pvcallss_lock);
+   return -EAGAIN;
+   }
+
+   map2 = create_active(&evtchn);
+
+   req = RING_GET_REQUEST(&bedata->ring, req_id);
+   req->req_id = req_id;
+   req->cmd = PVCALLS_ACCEPT;
+   req->u.accept.id = (uint64_t) sock;
+   req->u.accept.ref = map2->active.ref;
+   req->u.accept.id_new = (uint64_t) newsock;
+   req->u.accept.evtchn = evtchn;
+
+   list_add_tail(&map2->list, &bedata->socket_mappings);
+   WRITE_ONCE(newsock->sk->sk_send_head, (void *)map2);
+   map2->sock = newsock;
+
+   bedata->ring.req_prod_pvt++;
+   RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+   spin_unlock(&bedata->pvcallss_lock);
+   if (notify)
+   notify_remote_via_irq(bedata->irq);
+
+   wait_event(bedata->inflight_req,
+  READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
+
+   clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags);
+   wake_up(&map->passive.inflight_accept_req);
+
+   ret = bedata->rsp[req_id].ret;
+   /* read ret, then set this rsp slot to be reused */
+   smp_mb();
+   WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID);
+   return ret;
+}
+
 static const struct xenbus_device_id pvcalls_front_ids[] = {
{ "pvcalls" },
{ "" }
diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h
index aa8fe10..ab4f1da 100644
--- a/drivers/xen/pvcalls-front.h
+++ b/drivers/xen/pvcalls-front.h
@@ -10,5 +10,8 @@ int pvcalls_front_bind(struct socket *sock,
   struct sockaddr *addr,
   int addr_len);
 int pvcalls_front_listen(struct socket *sock, int backlog);
+int pvcalls_front_accept(struct socket *sock,
+struct socket *newsock,
+int flags);
 
 #endif
-- 
1.9.1



[PATCH v2 12/13] xen/pvcalls: implement frontend disconnect

2017-07-25 Thread Stefano Stabellini
Implement pvcalls frontend removal function. Go through the list of
active and passive sockets and free them all, one at a time.

Signed-off-by: Stefano Stabellini 
Reviewed-by: Juergen Gross 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/pvcalls-front.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index 5a4040e..b3d4675 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -911,6 +911,34 @@ int pvcalls_front_release(struct socket *sock)
 
 static int pvcalls_front_remove(struct xenbus_device *dev)
 {
+   struct pvcalls_bedata *bedata;
+   struct sock_mapping *map = NULL, *n;
+
+   bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+   list_for_each_entry_safe(map, n, &bedata->socket_mappings, list) {
+   mutex_lock(&map->active.in_mutex);
+   mutex_lock(&map->active.out_mutex);
+   pvcalls_front_free_map(bedata, map);
+   mutex_unlock(&map->active.out_mutex);
+   mutex_unlock(&map->active.in_mutex);
+   kfree(map);
+   }
+   list_for_each_entry_safe(map, n, &bedata->socketpass_mappings, list) {
+   spin_lock(&bedata->pvcallss_lock);
+   list_del_init(&map->list);
+   spin_unlock(&bedata->pvcallss_lock);
+   kfree(map);
+   }
+   if (bedata->irq > 0)
+   unbind_from_irqhandler(bedata->irq, dev);
+   if (bedata->ref >= 0)
+   gnttab_end_foreign_access(bedata->ref, 0, 0);
+   kfree(bedata->ring.sring);
+   kfree(bedata);
+   dev_set_drvdata(&dev->dev, NULL);
+   xenbus_switch_state(dev, XenbusStateClosed);
+   pvcalls_front_dev = NULL;
return 0;
 }
 
-- 
1.9.1



[PATCH v2 13/13] xen: introduce a Kconfig option to enable the pvcalls frontend

2017-07-25 Thread Stefano Stabellini
Also add pvcalls-front to the Makefile.

Signed-off-by: Stefano Stabellini 
CC: boris.ostrov...@oracle.com
CC: jgr...@suse.com
---
 drivers/xen/Kconfig  | 9 +
 drivers/xen/Makefile | 1 +
 2 files changed, 10 insertions(+)

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 4545561..0b2c828 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -196,6 +196,15 @@ config XEN_PCIDEV_BACKEND
 
  If in doubt, say m.
 
+config XEN_PVCALLS_FRONTEND
+   tristate "XEN PV Calls frontend driver"
+   depends on INET && XEN
+   help
+ Experimental frontend for the Xen PV Calls protocol
+ (https://xenbits.xen.org/docs/unstable/misc/pvcalls.html). It
+ sends a small set of POSIX calls to the backend, which
+ implements them.
+
 config XEN_PVCALLS_BACKEND
bool "XEN PV Calls backend driver"
depends on INET && XEN && XEN_BACKEND
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 480b928..afb9e03 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_XEN_EFI) += efi.o
 obj-$(CONFIG_XEN_SCSI_BACKEND) += xen-scsiback.o
 obj-$(CONFIG_XEN_AUTO_XLATE)   += xlate_mmu.o
 obj-$(CONFIG_XEN_PVCALLS_BACKEND)  += pvcalls-back.o
+obj-$(CONFIG_XEN_PVCALLS_FRONTEND) += pvcalls-front.o
 xen-evtchn-y   := evtchn.o
 xen-gntdev-y   := gntdev.o
 xen-gntalloc-y := gntalloc.o
-- 
1.9.1



[PATCH v2 00/13] introduce the Xen PV Calls frontend

2017-07-25 Thread Stefano Stabellini
Hi all,

this series introduces the frontend for the newly introduced PV Calls
protocol.

PV Calls is a paravirtualized protocol that allows the implementation of
a set of POSIX functions in a different domain. The PV Calls frontend
sends POSIX function calls to the backend, which acts on them and
returns the result to the frontend.

For more information about PV Calls, please read:

https://xenbits.xen.org/docs/unstable/misc/pvcalls.html

This patch series only implements the frontend driver. It doesn't
attempt to redirect POSIX calls to it. The functions exported in
pvcalls-front.h are meant to be used for that. A separate patch series
will be sent to use them and hook them into the system.
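
As a rough illustration only (a hypothetical caller, not part of this
series), a passive socket would be driven through the exported functions
along these lines:

    /* hypothetical consumer of pvcalls-front.h */
    #include "pvcalls-front.h"

    static int pvcalls_demo_listen(struct socket *sock, struct sockaddr *addr,
                                   int addr_len, struct socket *newsock)
    {
        int ret;

        ret = pvcalls_front_bind(sock, addr, addr_len);
        if (ret < 0)
            return ret;

        ret = pvcalls_front_listen(sock, 5);
        if (ret < 0)
            return ret;

        /* blocks until the backend reports an incoming connection */
        ret = pvcalls_front_accept(sock, newsock, 0);
        if (ret < 0)
            return ret;

        /* ... exchange data with pvcalls_front_sendmsg/recvmsg ... */

        return pvcalls_front_release(newsock);
    }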


Changes in v2:
- use xenbus_read_unsigned when possible
- call dev_set_drvdata earlier in pvcalls_front_probe not to dereference
  a NULL pointer in the error path
- set ret appropriately in pvcalls_front_probe
- include pvcalls-front.h in pvcalls-front.c
- call wake_up only once after the consuming loop in pvcalls_front_event_handler
- don't leak "bytes" in case of errors in create_active
- call spin_unlock appropriately in case of errors in create_active
- remove all BUG_ONs
- don't leak newsock->sk in pvcalls_front_accept in case of errors
- rename PVCALLS_FRON_MAX_SPIN to PVCALLS_FRONT_MAX_SPIN
- return bool from pvcalls_front_read_todo
- add a barrier after setting PVCALLS_FLAG_POLL_RET in
  pvcalls_front_event_handler
- remove outdated comment in pvcalls_front_free_map
- clear sock->sk->sk_send_head later in pvcalls_front_release
- make XEN_PVCALLS_FRONTEND tristate
- don't add an empty resume function


Stefano Stabellini (13):
  xen/pvcalls: introduce the pvcalls xenbus frontend
  xen/pvcalls: connect to the backend
  xen/pvcalls: implement socket command and handle events
  xen/pvcalls: implement connect command
  xen/pvcalls: implement bind command
  xen/pvcalls: implement listen command
  xen/pvcalls: implement accept command
  xen/pvcalls: implement sendmsg
  xen/pvcalls: implement recvmsg
  xen/pvcalls: implement poll command
  xen/pvcalls: implement release command
  xen/pvcalls: implement frontend disconnect
  xen: introduce a Kconfig option to enable the pvcalls frontend

 drivers/xen/Kconfig |9 +
 drivers/xen/Makefile|1 +
 drivers/xen/pvcalls-front.c | 1103 +++
 drivers/xen/pvcalls-front.h |   28 ++
 4 files changed, 1141 insertions(+)
 create mode 100644 drivers/xen/pvcalls-front.c
 create mode 100644 drivers/xen/pvcalls-front.h


[GIT PULL] (xen) stable/for-jens-4.13

2017-07-25 Thread Konrad Rzeszutek Wilk
Hi Jens,

Please pull the following branch into your "for-linus" branch; it is
based on 765e40b675a9566459ddcb8358ad16f3b8344bbe
"blk-mq: map queues to all present CPUs":

 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git 
stable/for-jens-4.13
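
i.e., from a checkout of that branch, roughly:

    git pull git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git stable/for-jens-4.13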

It has two bug-fixes for the xen-blkfront driver.

Thank you!


drivers/block/xen-blkfront.c | 21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

Dongli Zhang (1):
  xen/blkfront: always allocate grants first from per-queue persistent 
grants

Junxiao Bi (1):
  xen-blkfront: fix mq start/stop race



[PATCH 3.18 52/60] drm/mst: Fix error handling during MST sideband message reception

2017-07-25 Thread Greg Kroah-Hartman
3.18-stable review patch.  If anyone has any objections, please let me know.

--

From: Imre Deak 

commit 448421b5e93b9177c5698f0cf6f5e72d2995eeca upstream.

Handle any error due to partial reads, timeouts etc. to avoid parsing
uninitialized data subsequently. Also bail out if the parsing itself
fails.

Cc: Dave Airlie 
Cc: Lyude 
Cc: Daniel Vetter 
Signed-off-by: Imre Deak 
Reviewed-by: Lyude 
Signed-off-by: Daniel Vetter 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20170719114330.26540-2-imre.d...@intel.com
Signed-off-by: Greg Kroah-Hartman 

---
 drivers/gpu/drm/drm_dp_mst_topology.c |   10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

--- a/drivers/gpu/drm/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/drm_dp_mst_topology.c
@@ -2081,11 +2081,17 @@ static void drm_dp_get_one_sb_msg(struct
ret = drm_dp_dpcd_read(mgr->aux, basereg + curreply,
replyblock, len);
if (ret != len) {
-   DRM_DEBUG_KMS("failed to read a chunk\n");
+   DRM_DEBUG_KMS("failed to read a chunk (len %d, ret %d)\n",
+ len, ret);
+   return;
}
+
ret = drm_dp_sideband_msg_build(msg, replyblock, len, false);
-   if (ret == false)
+   if (!ret) {
DRM_DEBUG_KMS("failed to build sideband msg\n");
+   return;
+   }
+
curreply += len;
replylen -= len;
}




[PATCH 4.4 00/83] 4.4.79-stable review

2017-07-25 Thread Greg Kroah-Hartman
This is the start of the stable review cycle for the 4.4.79 release.
There are 83 patches in this series; all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Thu Jul 27 19:16:34 UTC 2017.
Anything received after that time might be too late.

The whole patch series can be found in one patch at:
kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.4.79-rc1.gz
or in the git tree and branch at:
  git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
linux-4.4.y
and the diffstat can be found below.
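
For example, the review tree can be fetched with something like:

    git fetch git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.4.y
    git checkout FETCH_HEAD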

thanks,

greg k-h

-
Pseudo-Shortlog of commits:

Greg Kroah-Hartman 
Linux 4.4.79-rc1

Greg Hackmann 
alarmtimer: don't rate limit one-shot timers

Chunyu Hu 
tracing: Fix kmemleak in instance_rmdir

Bjorn Andersson 
spmi: Include OF based modalias in device uevent

Stephen Boyd 
of: device: Export of_device_{get_modalias, uvent_modalias} to modules

Imre Deak 
drm/mst: Avoid processing partially received up/down message transactions

Imre Deak 
drm/mst: Avoid dereferencing a NULL mstb in drm_dp_mst_handle_up_req()

Imre Deak 
drm/mst: Fix error handling during MST sideband message reception

Ismail, Mustafa 
RDMA/core: Initialize port_num in qp_attr

Yan, Zheng 
ceph: fix race in concurrent readdir

Michael Gugino 
staging: rtl8188eu: add TL-WN722N v2 support

Ingo Molnar 
Revert "perf/core: Drop kernel samples even though :u is specified"

Jin Yao 
perf annotate: Fix broken arrow at row 0 connecting jmp instruction to its 
target

Jiang Yi 
target: Fix COMPARE_AND_WRITE caw_sem leak during se_cmd quiesce

Jan Kara 
udf: Fix deadlock between writeback and udf_setsize()

NeilBrown 
NFS: only invalidate dentrys that are clearly invalid.

Chen Hong 
Input: i8042 - fix crash at boot time

Maciej W. Rozycki 
MIPS: Fix a typo: s/preset/present/ in r2-to-r6 emulation error message

Maciej W. Rozycki 
MIPS: Send SIGILL for linked branches in `__compute_return_epc_for_insn'

Maciej W. Rozycki 
MIPS: Rename `sigill_r6' to `sigill_r2r6' in `__compute_return_epc_for_insn'

Maciej W. Rozycki 
MIPS: Send SIGILL for BPOSGE32 in `__compute_return_epc_for_insn'

Maciej W. Rozycki 
MIPS: math-emu: Prevent wrong ISA mode instruction emulation

Maciej W. Rozycki 
MIPS: Fix unaligned PC interpretation in `compute_return_epc'

Maciej W. Rozycki 
MIPS: Actually decode JALX in `__compute_return_epc_for_insn'

James Hogan 
MIPS: Save static registers before sysmips

Maciej W. Rozycki 
MIPS: Fix MIPS I ISA /proc/cpuinfo reporting

Seunghun Han 
x86/ioapic: Pass the correct data to unmask_ioapic_irq()

Seunghun Han 
x86/acpi: Prevent out of bound access caused by broken ACPI tables

James Hogan 
MIPS: Negate error syscall return in trace

James Hogan 
MIPS: Fix mips_atomic_set() with EVA

James Hogan 
MIPS: Fix mips_atomic_set() retry condition

Dan Carpenter 
ftrace: Fix uninitialized variable in match_records()

Alex Williamson 
vfio: New external user group/file match

Alex Williamson 
vfio: Fix group release deadlock

Jaegeuk Kim 
f2fs: Don't clear SGID when inheriting ACLs

Corey Minyard 
ipmi:ssif: Add missing unlock in error branch

Tony Camuso 
ipmi: use rcu lock around call to intf->handlers->sender()

Mario Kleiner 
drm/radeon: Fix eDP for single-display iMac10,1 (v2)

Alex Deucher 
drm/radeon/ci: disable mclk switching for high refresh rates (v2)

Tom St Denis 
drm/amd/amdgpu: Return error if initiating read out of range on vram

Jiri Olsa 
s390/syscalls: Fix out of bounds arguments access

Xiao Ni 
Raid5 should update rdev->sectors after reshape

Devin Heitmueller 
cx88: Fix regression in initial video standard setting

Marek Marczykowski-Górecki 
x86/xen: allow userspace access during hypercalls

Mikulas Patocka 
md: don't use flush_signals in userspace processes

Yoshihiro Shimoda 
usb: renesas_usbhs: gadget: disable all eps when the driver stops

Yoshihiro Shimoda 
usb: renesas_usbhs: fix usbhsc_resume() for !USBHSF_RUNTIME_PWCTRL

Johan Hovold 
USB: cdc-acm: add device-id for quirky printer

Colin Ian King 
usb: storage: return on error to avoid a null pointer dereference

Mathias Nyman 
xhci: Fix NULL pointer dereference when cleaning up streams for removed host

Mathias Nyman 
xhci: fix 2ms port resume timeout

Julian Anastasov 
ipvs: SNAT packet replies only for NATed connections

Chen Yu 
PCI/PM: Restore the status of PCI devices across hibernation

Herbert Xu 
af_key: Fix sadb_x_ipsecrequest parsing

Oliver O'Halloran 
powerpc/asm: Mark cr0 as clobbered in mftb()

Anton Blanchard 
powerpc: Fix emulation of mfocrf in emulate_step()

Anton Blanchard 
powerpc: Fix emulation of mcrf in emulate_step()

Michael Ellerman 
powerpc/64: Fix 

[PATCH 4.4 04/83] thermal: cpu_cooling: Avoid accessing potentially freed structures

2017-07-25 Thread Greg Kroah-Hartman
4.4-stable review patch.  If anyone has any objections, please let me know.

--

From: Viresh Kumar 

commit 289d72afddf83440117c35d864bf0c6309c1d011 upstream.

After the lock is dropped, it is possible that the cpufreq_dev gets
freed before we call get_level() and that can cause kernel to crash.

Drop the lock after we are done using the structure.

Fixes: 02373d7c69b4 ("thermal: cpu_cooling: fix lockdep problems in 
cpu_cooling")
Signed-off-by: Viresh Kumar 
Reviewed-by: Lukasz Luba 
Tested-by: Lukasz Luba 
Signed-off-by: Eduardo Valentin 
Signed-off-by: Greg Kroah-Hartman 

---
 drivers/thermal/cpu_cooling.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -191,8 +191,10 @@ unsigned long cpufreq_cooling_get_level(
mutex_lock(&cooling_list_lock);
list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
if (cpumask_test_cpu(cpu, &cpufreq_dev->allowed_cpus)) {
+   unsigned long level = get_level(cpufreq_dev, freq);
+
mutex_unlock(&cooling_list_lock);
-   return get_level(cpufreq_dev, freq);
+   return level;
}
}
mutex_unlock(&cooling_list_lock);




[PATCH 4.4 03/83] [media] s5p-jpeg: don't return a random width/height

2017-07-25 Thread Greg Kroah-Hartman
4.4-stable review patch.  If anyone has any objections, please let me know.

--

From: Mauro Carvalho Chehab 

commit a16e37726c444cbda91e73ed5f742e717bfe866f upstream.

Gcc 7.1 complains about:

drivers/media/platform/s5p-jpeg/jpeg-core.c: In function 
's5p_jpeg_parse_hdr.isra.9':
drivers/media/platform/s5p-jpeg/jpeg-core.c:1207:12: warning: 'width' may be 
used uninitialized in this function [-Wmaybe-uninitialized]
  result->w = width;
  ~~^~~
drivers/media/platform/s5p-jpeg/jpeg-core.c:1208:12: warning: 'height' may be 
used uninitialized in this function [-Wmaybe-uninitialized]
  result->h = height;
  ~~^~~~

Indeed the code would allow it to return a random value (although
it shouldn't happen, in practice). So, explicitly set both to zero,
just in case.
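
The pattern gcc is flagging reduces to something like the sketch below
(illustrative only, not the driver code): the variable is written only on
one branch inside the parsing loop, so a stream that never hits that
branch would otherwise return whatever happened to be on the stack.

/* Reduced illustration of the -Wmaybe-uninitialized pattern. 'width' is
 * only written when a SOF0 marker is found, so initializing it to 0 up
 * front silences the warning and gives a defined result for malformed
 * streams. */
static unsigned int parse_width(const unsigned char *buf, int len)
{
	unsigned int width = 0;	/* the fix: explicit initialization */
	int i;

	for (i = 0; i + 8 < len; i++) {
		if (buf[i] == 0xff && buf[i + 1] == 0xc0)	/* SOF0 marker */
			width = (buf[i + 7] << 8) | buf[i + 8];
	}

	return width;
}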

Acked-by: Andrzej Pietrasiewicz 
Signed-off-by: Mauro Carvalho Chehab 
Signed-off-by: Greg Kroah-Hartman 

---
 drivers/media/platform/s5p-jpeg/jpeg-core.c |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

--- a/drivers/media/platform/s5p-jpeg/jpeg-core.c
+++ b/drivers/media/platform/s5p-jpeg/jpeg-core.c
@@ -1098,10 +1098,10 @@ static bool s5p_jpeg_parse_hdr(struct s5
   struct s5p_jpeg_ctx *ctx)
 {
int c, components = 0, notfound, n_dht = 0, n_dqt = 0;
-   unsigned int height, width, word, subsampling = 0, sos = 0, sof = 0,
-sof_len = 0;
-   unsigned int dht[S5P_JPEG_MAX_MARKER], dht_len[S5P_JPEG_MAX_MARKER],
-dqt[S5P_JPEG_MAX_MARKER], dqt_len[S5P_JPEG_MAX_MARKER];
+   unsigned int height = 0, width = 0, word, subsampling = 0;
+   unsigned int sos = 0, sof = 0, sof_len = 0;
+   unsigned int dht[S5P_JPEG_MAX_MARKER], dht_len[S5P_JPEG_MAX_MARKER];
+   unsigned int dqt[S5P_JPEG_MAX_MARKER], dqt_len[S5P_JPEG_MAX_MARKER];
long length;
struct s5p_jpeg_buffer jpeg_buffer;
 




Re: [PATCH] mm, oom: allow oom reaper to race with exit_mmap

2017-07-25 Thread Andrea Arcangeli
On Tue, Jul 25, 2017 at 06:04:00PM +0200, Michal Hocko wrote:
> - down_write(&mm->mmap_sem);
> + if (tsk_is_oom_victim(current))
> + down_write(&mm->mmap_sem);
>   free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
>   tlb_finish_mmu(&tlb, 0, -1);
>  
> @@ -3012,7 +3014,8 @@ void exit_mmap(struct mm_struct *mm)
>   }
>   mm->mmap = NULL;
>   vm_unacct_memory(nr_accounted);
> - up_write(&mm->mmap_sem);
> + if (tsk_is_oom_victim(current))
> + up_write(&mm->mmap_sem);

How is this possibly safe? mark_oom_victim can run while exit_mmap is
running. Even if you cache the first read in the local stack, failure
to notice you marked it, could lead to use after free. Or at least
there's no comment on which lock should prevent the use after free
with the above.


Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option

2017-07-25 Thread Paul E. McKenney
On Tue, Jul 25, 2017 at 10:24:51PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
> 
> > There are a lot of variations, to be sure.  For whatever it is worth,
> > the original patch that started this uses mprotect():
> > 
> > https://github.com/msullivan/userspace-rcu/commit/04656b468d418efbc5d934ab07954eb8395a7ab0
> 
> FWIW that will not work on s390 (and maybe others), they don't in fact
> require IPIs for remote TLB invalidation.

Nor will it for ARM.  Nor (I think) for PowerPC.  But that is in fact
what people are doing right now in real life.  Hence my renewed interest
in sys_membarrier().
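
For reference, the userspace side under discussion is just the
membarrier() system call (available since Linux 4.3); a minimal sketch,
with error handling reduced to a query check:

#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch only: a process-wide barrier on the update side of a
 * userspace-RCU style algorithm, instead of the mprotect()/IPI trick
 * mentioned above. */
static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	if (membarrier(MEMBARRIER_CMD_QUERY, 0) < 0) {
		perror("membarrier");
		return 1;
	}

	/* The writer publishes its update with ordinary stores, then
	 * forces ordering against all CPUs running this process. */
	membarrier(MEMBARRIER_CMD_SHARED, 0);
	return 0;
}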

> > > Which are those? I thought we significantly reduced those with the nohz
> > > full work. Most IPI uses now first check if a CPU actually needs the IPI
> > > before sending it IIRC.
> > 
> > If the task being awakened is higher priority than the task currently
> > running on a given CPU, that CPU still gets an IPI, right?  Or am I
> > completely confused?
> 
> I was thinking of things like on_each_cpu_cond().

Fair enough.

But it would not be hard for userspace code to force IPIs by repeatedly
awakening higher-priority threads that sleep immediately after being
awakened, right?

> > Unlike userspace preempt disable, in this case we get the abuse anyway
> > via existing mechanisms, as in they are already being abused.  If we
> > provide a mechanism for this purpose, we at least have the potential
> > for handling the abuse, for example:
> > 
> > o   "Defanging" sys_membarrier() on systems that are sensitive to
> > latency.  For example, this patch can be defanged by booting
> > with the rcupdate.rcu_normal=1 kernel boot parameter, which
> > causes requests for expedited grace periods to instead use
> > normal grace periods.
> > 
> > o   Detecting and responding to abuse.  For example, perhaps if there
> > are more than (say) 50 expedited sys_membarrier()s within a given
> > jiffy, the excess sys_membarrier()s are non-expedited.
> > 
> > o   Batching optimizations allow large number of concurrent requests
> > to be handled with fewer grace periods -- and both normal and
> > expedited grace periods already do exactly this.
> > 
> > This horse is already out, so trying to shut the gate won't be effective.
> 
> Well, I'm not sure there is an easy means of doing machine wide IPIs for
> !root out there. This would be a first.
> 
> Something along the lines of:
> 
> void dummy(void *arg)
> {
>   /* IPIs are assumed to be serializing */
> }
> 
> void ipi_mm(struct mm_struct *mm)
> {
>   cpumask_var_t cpus;
>   int cpu;
> 
>   zalloc_cpumask_var(&cpus, GFP_KERNEL);
> 
>   for_each_cpu(cpu, mm_cpumask(mm)) {
>   struct task_struct *p;
> 
>   /*
>* If the current task of @cpu isn't of this @mm, then
>* it needs a context switch to become one, which will
>* provide the ordering we require.
>*/
>   rcu_read_lock();
>   p = task_rcu_dereference(&cpu_curr(cpu));
>   if (p && p->mm == mm)
>   __cpumask_set_cpu(cpu, cpus);
>   rcu_read_unlock();
>   }
> 
>   on_each_cpu_mask(cpus, dummy, NULL, 1);
> }
> 
> Would appear to be minimally invasive and only shoot at CPUs we're
> currently running our process on, which greatly reduces the impact.

I am good with this approach as well, and I do very much like that it
avoids IPIing CPUs that aren't running our process (at least in the
common case).  But don't we also need added memory ordering?  It is
sort of OK to IPI a CPU that just now switched away from our process,
but not so good to miss IPIing a CPU that switched to our process just
a little before sys_membarrier().

I was intending to base this on the last few versions of a 2010 patch,
but maybe things have changed:

https://marc.info/?l=linux-kernel&m=126358017229620&w=2
https://marc.info/?l=linux-kernel&m=126436996014016&w=2
https://marc.info/?l=linux-kernel&m=126601479802978&w=2
https://marc.info/?l=linux-kernel&m=126970692903302&w=2

Discussion here:

https://marc.info/?l=linux-kernel&m=126349766324224&w=2

The discussion led to acquiring the runqueue locks, as there was
otherwise a need to add code to the scheduler fastpaths.

There was a desire to make this work automatically among multiple
processes sharing some memory, but I believe that in this case
the user is going to have to track the multiple processes anyway,
and so can simply do sys_membarrier from each:

https://marc.info/?l=linux-arch&m=126686393820832&w=2

Some architectures are less precise than others in tracking which
CPUs are running a given process due to ASIDs, though this is
thought to be a non-problem:

https://marc.info/?l=linux-arch&m=126716090413065&w=2
https://marc.info/?l=linux-arch&m=126716262815202&w=2

Thoughts?

   

[PATCH 18/28] x86/intel_rdt: Prepare for RDT monitor data support

2017-07-25 Thread Vikas Shivappa
Rename the intel_rdt_schemata file to intel_rdt_ctrlmondata as we now
want to add support for RDT monitoring data for the events that are
supported in later patches.

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/Makefile|   2 +-
 arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 286 
 arch/x86/kernel/cpu/intel_rdt_schemata.c| 286 
 3 files changed, 287 insertions(+), 287 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
 delete mode 100644 arch/x86/kernel/cpu/intel_rdt_schemata.c

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 81b0060..1245f98 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -32,7 +32,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)   += umc.o
 
-obj-$(CONFIG_INTEL_RDT)+= intel_rdt.o intel_rdt_rdtgroup.o 
intel_rdt_schemata.o intel_rdt_monitor.o
+obj-$(CONFIG_INTEL_RDT)+= intel_rdt.o intel_rdt_rdtgroup.o 
intel_rdt_monitor.o intel_rdt_ctrlmondata.o
 
 obj-$(CONFIG_X86_MCE)  += mcheck/
 obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c 
b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
new file mode 100644
index 000..952156c
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -0,0 +1,286 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ *Fenghua Yu 
+ *Tony Luck 
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt)KBUILD_MODNAME ": " fmt
+
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include "intel_rdt.h"
+
+/*
+ * Check whether MBA bandwidth percentage value is correct. The value is
+ * checked against the minimum and max bandwidth values specified by the
+ * hardware. The allocated bandwidth percentage is rounded to the next
+ * control step available on the hardware.
+ */
+static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+{
+   unsigned long bw;
+   int ret;
+
+   /*
+* Only linear delay values is supported for current Intel SKUs.
+*/
+   if (!r->membw.delay_linear)
+   return false;
+
+   ret = kstrtoul(buf, 10, &bw);
+   if (ret)
+   return false;
+
+   if (bw < r->membw.min_bw || bw > r->default_ctrl)
+   return false;
+
+   *data = roundup(bw, (unsigned long)r->membw.bw_gran);
+   return true;
+}
+
+int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+{
+   unsigned long data;
+
+   if (d->have_new_ctrl)
+   return -EINVAL;
+
+   if (!bw_validate(buf, &data, r))
+   return -EINVAL;
+   d->new_ctrl = data;
+   d->have_new_ctrl = true;
+
+   return 0;
+}
+
+/*
+ * Check whether a cache bit mask is valid. The SDM says:
+ * Please note that all (and only) contiguous '1' combinations
+ * are allowed (e.g. H, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ */
+static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource 
*r)
+{
+   unsigned long first_bit, zero_bit, val;
+   unsigned int cbm_len = r->cache.cbm_len;
+   int ret;
+
+   ret = kstrtoul(buf, 16, &val);
+   if (ret)
+   return false;
+
+   if (val == 0 || val > r->default_ctrl)
+   return false;
+
+   first_bit = find_first_bit(&val, cbm_len);
+   zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
+
+   if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)
+   return false;
+
+   if ((zero_bit - first_bit) < r->cache.min_cbm_bits)
+   return false;
+
+   *data = val;
+   return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+{
+   unsigned long data;
+
+   if (d->have_new_ctrl)
+   return -EINVAL;
+
+   if(!cbm_validate(buf, &data, r))
+   return -EINVAL;
+   d->new_ctrl = data;
+   d->have_new_ctrl = true;
+
+   return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ * 

[PATCH 16/28] x86/intel_rdt: Prepare to add RDT monitor cpus file support

2017-07-25 Thread Vikas Shivappa
Separate the ctrl cpus file handling from the generic cpus file handling
and convert the per cpu closid from u32 to a struct which will be used
later to add rmid to the same struct. Also cleanup some name space.

Signed-off-by: Vikas Shivappa 
---
 arch/x86/include/asm/intel_rdt_sched.h   |   4 +-
 arch/x86/kernel/cpu/intel_rdt.c  |   6 +-
 arch/x86/kernel/cpu/intel_rdt.h  |   2 -
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 103 +--
 4 files changed, 62 insertions(+), 53 deletions(-)

diff --git a/arch/x86/include/asm/intel_rdt_sched.h 
b/arch/x86/include/asm/intel_rdt_sched.h
index 1d8d45a..2c704d2 100644
--- a/arch/x86/include/asm/intel_rdt_sched.h
+++ b/arch/x86/include/asm/intel_rdt_sched.h
@@ -26,7 +26,7 @@ struct intel_pqr_state {
 };
 
 DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
-DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+DECLARE_PER_CPU_READ_MOSTLY(struct intel_pqr_state, rdt_cpu_default);
 DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
 
 /*
@@ -54,7 +54,7 @@ static inline void intel_rdt_sched_in(void)
 */
closid = current->closid;
if (closid == 0)
-   closid = this_cpu_read(cpu_closid);
+   closid = this_cpu_read(rdt_cpu_default.closid);
 
if (closid != state->closid) {
state->closid = closid;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index eab2467..cd48ec9 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -39,8 +39,6 @@
 /* Mutex to protect rdtgroup access. */
 DEFINE_MUTEX(rdtgroup_mutex);
 
-DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
-
 /*
  * The cached intel_pqr_state is strictly per CPU and can never be
  * updated from a remote CPU. Functions which modify the state
@@ -49,6 +47,8 @@
  */
 DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
 
+DEFINE_PER_CPU_READ_MOSTLY(struct intel_pqr_state, rdt_cpu_default);
+
 /*
  * Used to store the max resource name width and max resource data width
  * to display the schemata in a tabular format
@@ -500,7 +500,7 @@ static void clear_closid(int cpu)
 {
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
 
-   per_cpu(cpu_closid, cpu) = 0;
+   per_cpu(rdt_cpu_default.closid, cpu) = 0;
state->closid = 0;
wrmsr(IA32_PQR_ASSOC, state->rmid, 0);
 }
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index b2a2de3..6f07047 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -320,8 +320,6 @@ enum {
unsigned int full;
 };
 
-DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
-
 void rdt_ctrl_update(void *arg);
 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
 void rdtgroup_kn_unlock(struct kernfs_node *kn);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index d02596e9..7f8f52d 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -181,13 +181,16 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 /*
  * This is safe against intel_rdt_sched_in() called from __switch_to()
  * because __switch_to() is executed with interrupts disabled. A local call
- * from rdt_update_closid() is proteced against __switch_to() because
+ * from update_closid() is proteced against __switch_to() because
  * preemption is disabled.
  */
-static void rdt_update_cpu_closid(void *closid)
+static void update_cpu_closid(void *info)
 {
-   if (closid)
-   this_cpu_write(cpu_closid, *(int *)closid);
+   struct rdtgroup *r = info;
+
+   if (r)
+   this_cpu_write(rdt_cpu_default.closid, r->closid);
+
/*
 * We cannot unconditionally write the MSR because the current
 * executing task might have its own closid selected. Just reuse
@@ -199,28 +202,62 @@ static void rdt_update_cpu_closid(void *closid)
 /*
  * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
  *
- * Per task closids must have been set up before calling this function.
- *
- * The per cpu closids are updated with the smp function call, when @closid
- * is not NULL. If @closid is NULL then all affected percpu closids must
- * have been set up before calling this function.
+ * Per task closids/rmids must have been set up before calling this function.
  */
 static void
-rdt_update_closid(const struct cpumask *cpu_mask, int *closid)
+update_closid(const struct cpumask *cpu_mask, struct rdtgroup *r)
 {
int cpu = get_cpu();
 
if (cpumask_test_cpu(cpu, cpu_mask))
-   rdt_update_cpu_closid(closid);
-   smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1);
+   update_cpu_closid(r);
+   smp_call_function_many(cpu_mask, update_cpu_closid, r, 1);
put_cpu();
 }
 
+static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,

[PATCH 15/28] x86/intel_rdt/cqm: Add tasks file support

2017-07-25 Thread Vikas Shivappa
The root directory, ctrl_mon and monitor groups are populated
with a read/write file named "tasks". When read, it shows all the task
IDs assigned to the resource group.

Tasks can be added to groups by writing the PID to the file. A task can
be present in one "ctrl_mon" group "and" one "monitor" group. IOW a
PID_x can be seen in a ctrl_mon group and a monitor group at the same
time. When a task is added to a ctrl_mon group, it is automatically
removed from the previous ctrl_mon group where it belonged. Similarly, if
a task is moved to a monitor group it is removed from the previous
monitor group. Also, since the monitor groups can only have a subset of
the tasks of the parent ctrl_mon group, a task can be moved to a monitor
group only if it is already present in the parent ctrl_mon group.

Task membership is indicated by a new field in the task_struct "u32
rmid" which holds the RMID for the task. RMID=0 is reserved for the
default root group where the tasks belong to at mount.
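
A minimal userspace sketch of the resulting interface (the group names
and the /sys/fs/resctrl mount point are assumptions, and error handling
is trimmed): moving a task is just writing its PID to a group's "tasks"
file.

#include <stdio.h>

/* Illustrative only: assumes resctrl is mounted at /sys/fs/resctrl and
 * that a ctrl_mon group "grp0" with a monitor group "mon0" exists. */
static int move_task(int pid)
{
	FILE *f;

	/* put the task in the ctrl_mon group (sets its closid and rmid) */
	f = fopen("/sys/fs/resctrl/grp0/tasks", "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", pid);
	fclose(f);

	/* then narrow monitoring via one of its monitor groups (rmid only) */
	f = fopen("/sys/fs/resctrl/grp0/mon_groups/mon0/tasks", "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", pid);
	fclose(f);

	return 0;
}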

[tony: zero the rmid if rdtgroup was deleted when task was being moved]

Signed-off-by: Tony Luck 
Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 19 +--
 include/linux/sched.h|  1 +
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 843a131..d02596e9 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -314,6 +314,7 @@ static void move_myself(struct callback_head *head)
if (atomic_dec_and_test(&rdtgrp->waitcount) &&
(rdtgrp->flags & RDT_DELETED)) {
current->closid = 0;
+   current->rmid = 0;
kfree(rdtgrp);
}
 
@@ -352,7 +353,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
atomic_dec(&rdtgrp->waitcount);
kfree(callback);
} else {
-   tsk->closid = rdtgrp->closid;
+   /*
+* For ctrl_mon groups move both closid and rmid.
+* For monitor groups, can move the tasks only from
+* their parent CTRL group.
+*/
+   if (rdtgrp->type == RDTCTRL_GROUP) {
+   tsk->closid = rdtgrp->closid;
+   tsk->rmid = rdtgrp->mon.rmid;
+   } else if (rdtgrp->type == RDTMON_GROUP) {
+   if (rdtgrp->mon.parent->closid == tsk->closid)
+   tsk->rmid = rdtgrp->mon.rmid;
+   else
+   ret = -EINVAL;
+   }
}
return ret;
 }
@@ -432,7 +446,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct 
seq_file *s)
 
rcu_read_lock();
for_each_process_thread(p, t) {
-   if (t->closid == r->closid)
+   if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
+   (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
seq_printf(s, "%d\n", t->pid);
}
rcu_read_unlock();
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b8d0d5c..cc47923 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -866,6 +866,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_INTEL_RDT
u32 closid;
+   u32 rmid;
 #endif
 #ifdef CONFIG_FUTEX
struct robust_list_head __user  *robust_list;
-- 
1.9.1



[PATCH 13/28] x86/intel_rdt/cqm: Add mkdir support for RDT monitoring

2017-07-25 Thread Vikas Shivappa
Resource control groups can be created using mkdir in resctrl
fs(rdtgroup). In order to extend the resctrl interface to support
monitoring the control groups, extend the current mkdir to support
resource monitoring also.

This allows the rdtgroup created under the root directory to be able to
both control and monitor resources (ctrl_mon group). The ctrl_mon groups
are associated with one CLOSID like the legacy rdtgroups and one
RMID(Resource monitoring ID) as well. Hardware uses RMID to track the
resource usage. Once either of the CLOSID or RMID are exhausted, the
mkdir fails with -ENOSPC. If there are RMIDs in limbo list but not free
an -EBUSY is returned. User can also monitor a subset of the ctrl_mon
rdtgroup's tasks/cpus using the monitor groups. The monitor groups are
created using mkdir under the "mon_groups" directory in every ctrl_mon
group.
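
As a userspace illustration (the paths and group names here are
assumptions, not part of this patch), the two kinds of groups map to
plain mkdir() calls on a mounted resctrl filesystem:

#include <sys/stat.h>
#include <sys/types.h>

/* Sketch only. A ctrl_mon group allocates a CLOSID and an RMID (and can
 * fail with ENOSPC or EBUSY as described above); a monitor group under
 * its mon_groups directory allocates only an RMID. */
static int create_groups(void)
{
	if (mkdir("/sys/fs/resctrl/grp0", 0755) < 0)
		return -1;

	if (mkdir("/sys/fs/resctrl/grp0/mon_groups/mon0", 0755) < 0)
		return -1;

	return 0;
}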

[Merged tony's code:
Removed a lot of common mkdir code, a fix to handling of the list of the
child rdtgroups and some cleanups in list traversal. Also the changes to
have similar alloc and free for CLOS/RMID and return -EBUSY when RMIDs
are in limbo and not free]

Signed-off-by: Tony Luck 
Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt.h  |  26 ++
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 134 +--
 2 files changed, 152 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 4051f5e..e8fb08f 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -37,6 +37,25 @@ struct mon_evt {
 extern bool rdt_alloc_capable;
 extern bool rdt_mon_capable;
 extern unsigned int rdt_mon_features;
+
+enum rdt_group_type {
+   RDTCTRL_GROUP = 0,
+   RDTMON_GROUP,
+   RDT_NUM_GROUP,
+};
+
+/**
+ * struct mongroup - store mon group's data in resctrl fs.
+ * @parent:parent rdtgrp
+ * @crdtgrp_list:  child rdtgroup node list
+ * @rmid:  rmid for this rdtgroup
+ */
+struct mongroup {
+   struct rdtgroup *parent;
+   struct list_headcrdtgrp_list;
+   u32 rmid;
+};
+
 /**
  * struct rdtgroup - store rdtgroup's data in resctrl file system.
  * @kn:kernfs node
@@ -46,6 +65,9 @@ struct mon_evt {
  * @flags: status bits
  * @waitcount: how many cpus expect to find this
  * group when they acquire rdtgroup_mutex
+ * @type:  indicates type of this rdtgroup - either
+ * monitor only or ctrl_mon group
+ * @mon:   mongroup related data
  */
 struct rdtgroup {
struct kernfs_node  *kn;
@@ -54,6 +76,8 @@ struct rdtgroup {
struct cpumask  cpu_mask;
int flags;
atomic_twaitcount;
+   enum rdt_group_type type;
+   struct mongroup mon;
 };
 
 /* rdtgroup.flags */
@@ -306,6 +330,8 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 int rdtgroup_schemata_show(struct kernfs_open_file *of,
   struct seq_file *s, void *v);
 struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
+int alloc_rmid(void);
+void free_rmid(u32 rmid);
 int rdt_get_mon_l3_config(struct rdt_resource *r);
 
 #endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 64d4963..cfb2a89 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -759,6 +759,39 @@ static int rdtgroup_create_info_dir(struct kernfs_node 
*parent_kn)
return ret;
 }
 
+static int
+mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
+   char *name, struct kernfs_node **dest_kn)
+{
+   struct kernfs_node *kn;
+   int ret;
+
+   /* create the directory */
+   kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+   if (IS_ERR(kn))
+   return PTR_ERR(kn);
+
+   if (dest_kn)
+   *dest_kn = kn;
+
+   /*
+* This extra ref will be put in kernfs_remove() and guarantees
+* that @rdtgrp->kn is always accessible.
+*/
+   kernfs_get(kn);
+
+   ret = rdtgroup_kn_set_ugid(kn);
+   if (ret)
+   goto out_destroy;
+
+   kernfs_activate(kn);
+
+   return 0;
+
+out_destroy:
+   kernfs_remove(kn);
+   return ret;
+}
 static void l3_qos_cfg_update(void *arg)
 {
bool *enable = arg;
@@ -1086,7 +1119,7 @@ static void rdt_kill_sb(struct super_block *sb)
 static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
 struct kernfs_node *prgrp_kn,
 const char *name, umode_t mode,
-struct rdtgroup **r)
+enum 


[PATCH 14/28] x86/intel_rdt: Change closid type from int to u32

2017-07-25 Thread Vikas Shivappa
OS associates a CLOSid(Class of service id) to a task by writing the
high 32 bits of per CPU IA32_PQR_ASSOC MSR when a task is scheduled in.
CPUID.(EAX=10H, ECX=1):EDX[15:0] enumerates the max CLOSID supported and
it is zero indexed. Hence change the type to u32 from int.
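
Put concretely (a sketch using the kernel-internal names visible
elsewhere in this series, not new code): the MSR write takes the RMID as
the low 32 bits and the CLOSID as the high 32 bits, which is why an
unsigned 32-bit type is the natural fit.

/* Sketch: IA32_PQR_ASSOC layout as described above - RMID in the low
 * word, CLOSID in the high word of the 64-bit MSR value. */
static inline void pqr_assoc_write(u32 rmid, u32 closid)
{
	wrmsr(IA32_PQR_ASSOC, rmid, closid);
}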

Signed-off-by: Vikas Shivappa 
---
 arch/x86/include/asm/intel_rdt_sched.h   | 2 +-
 arch/x86/kernel/cpu/intel_rdt.h  | 2 +-
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +-
 include/linux/sched.h| 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/intel_rdt_sched.h 
b/arch/x86/include/asm/intel_rdt_sched.h
index 4dee77b..1d8d45a 100644
--- a/arch/x86/include/asm/intel_rdt_sched.h
+++ b/arch/x86/include/asm/intel_rdt_sched.h
@@ -46,7 +46,7 @@ static inline void intel_rdt_sched_in(void)
 {
if (static_branch_likely(&rdt_alloc_enable_key)) {
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-   int closid;
+   u32 closid;
 
/*
 * If this task has a closid assigned, use it.
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index e8fb08f..b2a2de3 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -72,7 +72,7 @@ struct mongroup {
 struct rdtgroup {
struct kernfs_node  *kn;
struct list_headrdtgroup_list;
-   int closid;
+   u32 closid;
struct cpumask  cpu_mask;
int flags;
atomic_twaitcount;
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index cfb2a89..843a131 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -77,7 +77,7 @@ static void closid_init(void)
 
 static int closid_alloc(void)
 {
-   int closid = ffs(closid_free_map);
+   u32 closid = ffs(closid_free_map);
 
if (closid == 0)
return -ENOSPC;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9e31b3d..b8d0d5c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -865,7 +865,7 @@ struct task_struct {
struct list_headcg_list;
 #endif
 #ifdef CONFIG_INTEL_RDT
-   int closid;
+   u32 closid;
 #endif
 #ifdef CONFIG_FUTEX
struct robust_list_head __user  *robust_list;
-- 
1.9.1



[PATCH 10/28] x86/intel_rdt: Simplify info and base file lists

2017-07-25 Thread Vikas Shivappa
From: Tony Luck 

The info directory files and base files need to be different for each
resource, such as cache and memory bandwidth. Within each resource, the
files are further different for monitoring and ctrl. This leads to
a lot of different static array declarations given that we are adding
resctrl monitoring.

Simplify this to one common list of files, and then declare a set of
flags to choose the files based on the resource, whether the file is info
or base, and whether it is a control-type file. This is in preparation
for including monitoring-based info and base files.

No functional change.

[Vikas: Extended the flags to have a few bits per category, such as
resource, info/base, etc.]
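
The effect of the flags is essentially a filtered walk over the single
table. A hedged sketch of the idea (struct rftype, res_common_files and
the fflags field come from this patch; the helper names here are
illustrative, not the patch's exact code):

/*
 * Illustrative only: create just the files whose fflags are fully
 * contained in the requested mask, e.g. RF_CTRL_INFO or RF_CTRL_BASE.
 */
static int add_files_matching(struct kernfs_node *kn, unsigned long fflags)
{
        struct rftype *rft;
        int ret;

        for (rft = res_common_files;
             rft < res_common_files + ARRAY_SIZE(res_common_files); rft++) {
                if ((fflags & rft->fflags) == rft->fflags) {
                        ret = rdtgroup_add_file(kn, rft);
                        if (ret)
                                return ret;
                }
        }
        return 0;
}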

Signed-off-by: Tony Luck 
Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt.c  |   7 +-
 arch/x86/kernel/cpu/intel_rdt.h  |  22 +++-
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 191 +++
 3 files changed, 115 insertions(+), 105 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index fe6dc75..eab2467 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -83,6 +83,7 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval  = parse_cbm,
.format_str = "%d=%0*x",
+   .fflags = RFTYPE_RES_CACHE,
},
[RDT_RESOURCE_L3DATA] =
{
@@ -98,6 +99,7 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval  = parse_cbm,
.format_str = "%d=%0*x",
+   .fflags = RFTYPE_RES_CACHE,
},
[RDT_RESOURCE_L3CODE] =
{
@@ -113,6 +115,7 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval  = parse_cbm,
.format_str = "%d=%0*x",
+   .fflags = RFTYPE_RES_CACHE,
},
[RDT_RESOURCE_L2] =
{
@@ -128,6 +131,7 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval  = parse_cbm,
.format_str = "%d=%0*x",
+   .fflags = RFTYPE_RES_CACHE,
},
[RDT_RESOURCE_MBA] =
{
@@ -138,6 +142,7 @@ struct rdt_resource rdt_resources_all[] = {
.cache_level= 3,
.parse_ctrlval  = parse_bw,
.format_str = "%d=%*d",
+   .fflags = RFTYPE_RES_MB,
},
 };
 
@@ -233,7 +238,6 @@ static bool rdt_get_mem_config(struct rdt_resource *r)
return false;
}
r->data_width = 3;
-   rdt_get_mba_infofile(r);
 
r->alloc_capable = true;
r->alloc_enabled = true;
@@ -252,7 +256,6 @@ static void rdt_get_cache_alloc_config(int idx, struct 
rdt_resource *r)
r->cache.cbm_len = eax.split.cbm_len + 1;
r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
r->data_width = (r->cache.cbm_len + 3) / 4;
-   rdt_get_cache_infofile(r);
r->alloc_capable = true;
r->alloc_enabled = true;
 }
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 35bf8eb..aecbe77 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -62,6 +62,18 @@ struct rdtgroup {
 /* rftype.flags */
 #define RFTYPE_FLAGS_CPUS_LIST 1
 
+/*
+ * Define the file type flags for base and info directories.
+ */
+#define RFTYPE_INFOBIT(0)
+#define RFTYPE_BASEBIT(1)
+#define RF_CTRLSHIFT   4
+#define RFTYPE_CTRLBIT(RF_CTRLSHIFT)
+#define RFTYPE_RES_CACHE   BIT(8)
+#define RFTYPE_RES_MB  BIT(9)
+#define RF_CTRL_INFO   (RFTYPE_INFO | RFTYPE_CTRL)
+#define RF_CTRL_BASE   (RFTYPE_BASE | RFTYPE_CTRL)
+
 /* List of all resource groups */
 extern struct list_head rdt_all_groups;
 
@@ -75,6 +87,7 @@ struct rdtgroup {
  * @mode:  Access mode
  * @kf_ops:File operations
  * @flags: File specific RFTYPE_FLAGS_* flags
+ * @fflags:File specific RF_* or RFTYPE_* flags
  * @seq_show:  Show content of the file
  * @write: Write to the file
  */
@@ -83,6 +96,7 @@ struct rftype {
umode_t mode;
struct kernfs_ops   *kf_ops;
unsigned long   flags;
+   unsigned long   fflags;
 
int (*seq_show)(struct kernfs_open_file *of,
struct seq_file *sf, void *v);
@@ -181,13 +195,12 @@ static inline bool is_llc_occupancy_enabled(void)
  * @data_width:Character width of data when displaying
  * @domains:   All domains for this resource
  * @cache: 

[PATCH 11/28] x86/intel_rdt/cqm: Add info files for RDT monitoring

2017-07-25 Thread Vikas Shivappa
Add info directory files specific to RDT monitoring.

 num_rmids:
The number of RMIDs which are valid for the resource.

 mon_features:
Lists the monitoring events if monitoring is enabled for the
resource.

 max_threshold_occupancy:
This is specific to llc_occupancy monitoring and is used to
determine if an RMID can be reused. Provides an upper bound on the
threshold and is shown to the user in bytes, though the internal
value will be rounded to the scaling factor supported by the hardware.
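
As a worked example of that rounding (the scaling factor is SKU-specific;
65536 below is only an assumed value): writing 1000000 to
max_threshold_occupancy stores 1000000 / 65536 = 15 internally, and
reading the file back then reports 15 * 65536 = 983040 bytes.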

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt.h  |  8 
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 82 
 2 files changed, 90 insertions(+)

diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index aecbe77..4051f5e 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -68,10 +68,13 @@ struct rdtgroup {
 #define RFTYPE_INFOBIT(0)
 #define RFTYPE_BASEBIT(1)
 #define RF_CTRLSHIFT   4
+#define RF_MONSHIFT5
 #define RFTYPE_CTRLBIT(RF_CTRLSHIFT)
+#define RFTYPE_MON BIT(RF_MONSHIFT)
 #define RFTYPE_RES_CACHE   BIT(8)
 #define RFTYPE_RES_MB  BIT(9)
 #define RF_CTRL_INFO   (RFTYPE_INFO | RFTYPE_CTRL)
+#define RF_MON_INFO(RFTYPE_INFO | RFTYPE_MON)
 #define RF_CTRL_BASE   (RFTYPE_BASE | RFTYPE_CTRL)
 
 /* List of all resource groups */
@@ -264,6 +267,11 @@ enum {
 r++) \
if (r->alloc_enabled)
 
+#define for_each_mon_enabled_rdt_resource(r) \
+   for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+r++) \
+   if (r->mon_enabled)
+
 /* CPUID.(EAX=10H, ECX=ResID=1).EAX */
 union cpuid_0x10_1_eax {
struct {
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 20dbabd..d121339 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -490,6 +490,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of,
return 0;
 }
 
+static int rdt_num_rmids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+   struct rdt_resource *r = of->kn->parent->priv;
+
+   seq_printf(seq, "%d\n", r->num_rmid);
+
+   return 0;
+}
+
+static int rdt_mon_features_show(struct kernfs_open_file *of,
+struct seq_file *seq, void *v)
+{
+   struct rdt_resource *r = of->kn->parent->priv;
+   struct mon_evt *mevt;
+
+   list_for_each_entry(mevt, >evt_list, list)
+   seq_printf(seq, "%s\n", mevt->name);
+
+   return 0;
+}
+
 static int rdt_bw_gran_show(struct kernfs_open_file *of,
 struct seq_file *seq, void *v)
 {
@@ -508,6 +530,35 @@ static int rdt_delay_linear_show(struct kernfs_open_file 
*of,
return 0;
 }
 
+static int max_threshold_occ_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+   struct rdt_resource *r = of->kn->parent->priv;
+
+   seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
+
+   return 0;
+}
+
+static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
+  char *buf, size_t nbytes, loff_t off)
+{
+   struct rdt_resource *r = of->kn->parent->priv;
+   unsigned int bytes;
+   int ret;
+
+   ret = kstrtouint(buf, 0, );
+   if (ret)
+   return ret;
+
+   if (bytes > (boot_cpu_data.x86_cache_size * 1024))
+   return -EINVAL;
+
+   intel_cqm_threshold = bytes / r->mon_scale;
+
+   return ret ?: nbytes;
+}
+
 /* rdtgroup information files for one cache resource. */
 static struct rftype res_common_files[] = {
{
@@ -518,6 +569,20 @@ static int rdt_delay_linear_show(struct kernfs_open_file 
*of,
.fflags = RF_CTRL_INFO,
},
{
+   .name   = "mon_features",
+   .mode   = 0444,
+   .kf_ops = _kf_single_ops,
+   .seq_show   = rdt_mon_features_show,
+   .fflags = RF_MON_INFO,
+   },
+   {
+   .name   = "num_rmids",
+   .mode   = 0444,
+   .kf_ops = _kf_single_ops,
+   .seq_show   = rdt_num_rmids_show,
+   .fflags = RF_MON_INFO,
+   },
+   {
.name   = "cbm_mask",
.mode   = 0444,
.kf_ops = _kf_single_ops,
@@ -553,6 


[PATCH 12/28] x86/intel_rdt: Prepare for RDT monitoring mkdir support

2017-07-25 Thread Vikas Shivappa
Separate the ctrl mkdir code from the rest in order to prepare for
adding RDT monitoring mkdir support as well.

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 112 ++-
 1 file changed, 80 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index d121339..64d4963 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -1083,46 +1083,35 @@ static void rdt_kill_sb(struct super_block *sb)
.kill_sb = rdt_kill_sb,
 };
 
-static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
+static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+struct kernfs_node *prgrp_kn,
+const char *name, umode_t mode,
+struct rdtgroup **r)
 {
-   struct rdtgroup *parent, *rdtgrp;
+   struct rdtgroup *prdtgrp, *rdtgrp;
struct kernfs_node *kn;
-   int ret, closid;
-
-   /* Only allow mkdir in the root directory */
-   if (parent_kn != rdtgroup_default.kn)
-   return -EPERM;
-
-   /* Do not accept '\n' to avoid unparsable situation. */
-   if (strchr(name, '\n'))
-   return -EINVAL;
+   uint files = 0;
+   int ret;
 
-   parent = rdtgroup_kn_lock_live(parent_kn);
-   if (!parent) {
+   prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
+   if (!prdtgrp) {
ret = -ENODEV;
goto out_unlock;
}
 
-   ret = closid_alloc();
-   if (ret < 0)
-   goto out_unlock;
-   closid = ret;
-
/* allocate the rdtgroup. */
rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
if (!rdtgrp) {
ret = -ENOSPC;
-   goto out_closid_free;
+   goto out_unlock;
}
-   rdtgrp->closid = closid;
-   list_add(>rdtgroup_list, _all_groups);
+   *r = rdtgrp;
 
/* kernfs creates the directory for rdtgrp */
-   kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp);
+   kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
-   goto out_cancel_ref;
+   goto out_free_rgrp;
}
rdtgrp->kn = kn;
 
@@ -1138,27 +1127,86 @@ static int rdtgroup_mkdir(struct kernfs_node 
*parent_kn, const char *name,
if (ret)
goto out_destroy;
 
-   ret = rdtgroup_add_files(kn, RF_CTRL_BASE);
+   files = RFTYPE_BASE | RFTYPE_CTRL;
+   ret = rdtgroup_add_files(kn, files);
if (ret)
goto out_destroy;
 
kernfs_activate(kn);
 
-   ret = 0;
-   goto out_unlock;
+   /*
+* The caller unlocks the prgrp_kn upon success.
+*/
+   return 0;
 
 out_destroy:
kernfs_remove(rdtgrp->kn);
-out_cancel_ref:
-   list_del(>rdtgroup_list);
+out_free_rgrp:
kfree(rdtgrp);
-out_closid_free:
-   closid_free(closid);
 out_unlock:
-   rdtgroup_kn_unlock(parent_kn);
+   rdtgroup_kn_unlock(prgrp_kn);
+   return ret;
+}
+
+static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
+{
+   kernfs_remove(rgrp->kn);
+   kfree(rgrp);
+}
+
+/*
+ * These are rdtgroups created under the root directory. Can be used
+ * to allocate resources.
+ */
+static int rdtgroup_mkdir_ctrl(struct kernfs_node *parent_kn,
+  struct kernfs_node *prgrp_kn,
+  const char *name, umode_t mode)
+{
+   struct rdtgroup *rdtgrp;
+   struct kernfs_node *kn;
+   u32 closid;
+   int ret;
+
+   ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, );
+   if (ret)
+   return ret;
+
+   kn = rdtgrp->kn;
+   ret = closid_alloc();
+   if (ret < 0)
+   goto out_common_fail;
+   closid = ret;
+
+   rdtgrp->closid = closid;
+   list_add(>rdtgroup_list, _all_groups);
+
+   goto out_unlock;
+
+out_common_fail:
+   mkdir_rdt_prepare_clean(rdtgrp);
+out_unlock:
+   rdtgroup_kn_unlock(prgrp_kn);
return ret;
 }
 
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+   /* Do not accept '\n' to avoid unparsable situation. */
+   if (strchr(name, '\n'))
+   return -EINVAL;
+
+   /*
+* If the parent directory is the root directory and RDT
+* allocation is supported, add a control rdtgroup.
+*/
+   if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
+   return rdtgroup_mkdir_ctrl(parent_kn, parent_kn, name,
+  mode);
+
+   return -EPERM;
+}
+
 static int rdtgroup_rmdir(struct kernfs_node *kn)
 {
int ret, 


[PATCH 06/28] x86/intel_rdt: Cleanup namespace to support RDT monitoring

2017-07-25 Thread Vikas Shivappa
A few of the data structures have generic names although they are RDT
allocation specific. Rename them to be allocation specific in order to
accommodate RDT monitoring, e.g. s/enabled/alloc_enabled/.

No functional change.

Signed-off-by: Vikas Shivappa 
---
 arch/x86/include/asm/intel_rdt_sched.h   |  4 ++--
 arch/x86/kernel/cpu/intel_rdt.c  | 26 -
 arch/x86/kernel/cpu/intel_rdt.h  | 18 -
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 33 
 arch/x86/kernel/cpu/intel_rdt_schemata.c |  8 
 5 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/arch/x86/include/asm/intel_rdt_sched.h 
b/arch/x86/include/asm/intel_rdt_sched.h
index 62a70bc..4dee77b 100644
--- a/arch/x86/include/asm/intel_rdt_sched.h
+++ b/arch/x86/include/asm/intel_rdt_sched.h
@@ -27,7 +27,7 @@ struct intel_pqr_state {
 
 DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
 DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
-DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
 
 /*
  * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
@@ -44,7 +44,7 @@ struct intel_pqr_state {
  */
 static inline void intel_rdt_sched_in(void)
 {
-   if (static_branch_likely(_enable_key)) {
+   if (static_branch_likely(_alloc_enable_key)) {
struct intel_pqr_state *state = this_cpu_ptr(_state);
int closid;
 
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 08872e9..835e1ff 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -173,8 +173,8 @@ static inline bool cache_alloc_hsw_probe(void)
r->default_ctrl = max_cbm;
r->cache.cbm_len = 20;
r->cache.min_cbm_bits = 2;
-   r->capable = true;
-   r->enabled = true;
+   r->alloc_capable = true;
+   r->alloc_enabled = true;
 
return true;
}
@@ -224,8 +224,8 @@ static bool rdt_get_mem_config(struct rdt_resource *r)
r->data_width = 3;
rdt_get_mba_infofile(r);
 
-   r->capable = true;
-   r->enabled = true;
+   r->alloc_capable = true;
+   r->alloc_enabled = true;
 
return true;
 }
@@ -242,8 +242,8 @@ static void rdt_get_cache_config(int idx, struct 
rdt_resource *r)
r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
r->data_width = (r->cache.cbm_len + 3) / 4;
rdt_get_cache_infofile(r);
-   r->capable = true;
-   r->enabled = true;
+   r->alloc_capable = true;
+   r->alloc_enabled = true;
 }
 
 static void rdt_get_cdp_l3_config(int type)
@@ -255,12 +255,12 @@ static void rdt_get_cdp_l3_config(int type)
r->cache.cbm_len = r_l3->cache.cbm_len;
r->default_ctrl = r_l3->default_ctrl;
r->data_width = (r->cache.cbm_len + 3) / 4;
-   r->capable = true;
+   r->alloc_capable = true;
/*
 * By default, CDP is disabled. CDP can be enabled by mount parameter
 * "cdp" during resctrl file system mount time.
 */
-   r->enabled = false;
+   r->alloc_enabled = false;
 }
 
 static int get_cache_id(int cpu, int level)
@@ -422,7 +422,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 
d->id = id;
 
-   if (domain_setup_ctrlval(r, d)) {
+   if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
kfree(d);
return;
}
@@ -464,7 +464,7 @@ static int intel_rdt_online_cpu(unsigned int cpu)
struct rdt_resource *r;
 
mutex_lock(_mutex);
-   for_each_capable_rdt_resource(r)
+   for_each_alloc_capable_rdt_resource(r)
domain_add_cpu(cpu, r);
/* The cpu is set in default rdtgroup after online. */
cpumask_set_cpu(cpu, _default.cpu_mask);
@@ -480,7 +480,7 @@ static int intel_rdt_offline_cpu(unsigned int cpu)
struct rdt_resource *r;
 
mutex_lock(_mutex);
-   for_each_capable_rdt_resource(r)
+   for_each_alloc_capable_rdt_resource(r)
domain_remove_cpu(cpu, r);
list_for_each_entry(rdtgrp, _all_groups, rdtgroup_list) {
if (cpumask_test_and_clear_cpu(cpu, >cpu_mask))
@@ -501,7 +501,7 @@ static __init void rdt_init_padding(void)
struct rdt_resource *r;
int cl;
 
-   for_each_capable_rdt_resource(r) {
+   for_each_alloc_capable_rdt_resource(r) {
cl = strlen(r->name);
if (cl > max_name_width)
max_name_width = cl;
@@ -565,7 +565,7 @@ static int __init intel_rdt_late_init(void)
return ret;
}
 
-   for_each_capable_rdt_resource(r)
+   for_each_alloc_capable_rdt_resource(r)
pr_info("Intel RDT %s allocation detected\n", r->name);
 
return 0;
diff --git a/arch/x86/kernel/cpu/intel_rdt.h 


[PATCH 08/28] x86/intel_rdt/cqm: Add RDT monitoring initialization

2017-07-25 Thread Vikas Shivappa
Add common data structures for RDT resource monitoring and perform the
RDT-monitoring-related data structure initializations, which include
setting up the RMID (Resource Monitoring ID) lists and the event list
that the resource supports.

[Tony: some cleanup to make adding MBM easier later, remove "cqm"
from some names, make some data structures local to intel_rdt_monitor.c
static. Add copyright header]
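
As a rough sketch of the bookkeeping this initialization sets up (the
names below are illustrative assumptions, not necessarily the exact ones
introduced in intel_rdt_monitor.c):

#include <linux/list.h>
#include <linux/types.h>

/*
 * Illustrative only: each RMID is tracked by a small entry that sits
 * either on a free list (available for allocation) or on a limbo list
 * (freed, but still showing residual cache occupancy).
 */
struct rmid_entry {
        u32                     rmid;
        struct list_head        list;
};

static LIST_HEAD(rmid_free_lru);        /* RMIDs ready to hand out */
static LIST_HEAD(rmid_limbo_lru);       /* freed RMIDs still draining */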

Signed-off-by: Tony Luck 
Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/Makefile|   2 +-
 arch/x86/kernel/cpu/intel_rdt.c |  46 -
 arch/x86/kernel/cpu/intel_rdt.h |  44 +
 arch/x86/kernel/cpu/intel_rdt_monitor.c | 161 
 4 files changed, 248 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/intel_rdt_monitor.c

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a576121..81b0060 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -32,7 +32,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)   += umc.o
 
-obj-$(CONFIG_INTEL_RDT)+= intel_rdt.o intel_rdt_rdtgroup.o 
intel_rdt_schemata.o
+obj-$(CONFIG_INTEL_RDT)+= intel_rdt.o intel_rdt_rdtgroup.o 
intel_rdt_schemata.o intel_rdt_monitor.o
 
 obj-$(CONFIG_X86_MCE)  += mcheck/
 obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 98715c5..36e6454 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -55,6 +55,12 @@
  */
 int max_name_width, max_data_width;
 
+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
 static void
 mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
 static void
@@ -235,7 +241,7 @@ static bool rdt_get_mem_config(struct rdt_resource *r)
return true;
 }
 
-static void rdt_get_cache_config(int idx, struct rdt_resource *r)
+static void rdt_get_cache_alloc_config(int idx, struct rdt_resource *r)
 {
union cpuid_0x10_1_eax eax;
union cpuid_0x10_x_edx edx;
@@ -516,7 +522,7 @@ static __init void rdt_init_padding(void)
}
 }
 
-static __init bool get_rdt_resources(void)
+static __init bool get_rdt_alloc_resources(void)
 {
bool ret = false;
 
@@ -527,7 +533,8 @@ static __init bool get_rdt_resources(void)
return false;
 
if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
-   rdt_get_cache_config(1, _resources_all[RDT_RESOURCE_L3]);
+   rdt_get_cache_alloc_config(1,
+  _resources_all[RDT_RESOURCE_L3]);
if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
@@ -536,7 +543,8 @@ static __init bool get_rdt_resources(void)
}
if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
/* CPUID 0x10.2 fields are same format at 0x10.1 */
-   rdt_get_cache_config(2, _resources_all[RDT_RESOURCE_L2]);
+   rdt_get_cache_alloc_config(2,
+  _resources_all[RDT_RESOURCE_L2]);
ret = true;
}
 
@@ -548,6 +556,33 @@ static __init bool get_rdt_resources(void)
return ret;
 }
 
+static __init bool get_rdt_mon_resources(void)
+{
+   if (boot_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
+   rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
+   if (boot_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
+   rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
+   if (boot_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
+   rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
+
+   if (rdt_mon_features) {
+   if (!rdt_get_mon_l3_config(_resources_all[RDT_RESOURCE_L3]))
+   return true;
+   else
+   return false;
+   }
+
+   return false;
+}
+
+static __init bool get_rdt_resources(void)
+{
+   rdt_alloc_capable = get_rdt_alloc_resources();
+   rdt_mon_capable = get_rdt_mon_resources();
+
+   return (rdt_mon_capable || rdt_alloc_capable);
+}
+
 static int __init intel_rdt_late_init(void)
 {
struct rdt_resource *r;
@@ -573,6 +608,9 @@ static int __init intel_rdt_late_init(void)
for_each_alloc_capable_rdt_resource(r)
pr_info("Intel RDT %s allocation detected\n", r->name);
 
+   for_each_mon_capable_rdt_resource(r)
+   pr_info("Intel RDT %s monitoring detected\n", r->name);
+
return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 29630af..993ab9d 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ 

[PATCH 09/28] x86/intel_rdt/cqm: Add RMID(Resource monitoring ID) management

2017-07-25 Thread Vikas Shivappa
Hardware uses an RMID (Resource Monitoring ID) to keep track of each of the
RDT events associated with tasks. The number of RMIDs is dependent on
the SKU and is enumerated via CPUID. We add support to manage the RMIDs,
which includes managing the RMID allocation and reading LLC occupancy
for an RMID.

RMID allocation is managed by keeping a free list which is initialized
to all available RMIDs except for RMID 0, which is always reserved for the
root group. RMIDs go to a limbo list once they are
freed, since the RMIDs are still tagged to cache lines of the tasks which
were using them - thereby still having some occupancy. They remain on the
limbo list until their occupancy drops below threshold_occupancy, a
user-configurable value.
The OS uses the IA32_QM_CTR MSR to read the occupancy associated with an RMID
after programming the IA32_QM_EVTSEL MSR with the RMID.

[Tony: Improved limbo search]
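
A hedged sketch of the MSR sequence described above (the MSR names and
the RMID_VAL_* bits are existing kernel definitions; the function body
is illustrative rather than a copy of the patch):

#include <asm/msr.h>    /* wrmsr(), rdmsrl(), MSR_IA32_QM_EVTSEL/QM_CTR */

static u64 rmid_read_occupancy(u32 rmid, u32 eventid)
{
        u64 val;

        /* Event id goes in the low dword, the RMID in the high dword. */
        wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
        rdmsrl(MSR_IA32_QM_CTR, val);

        /*
         * Bit 63 (RMID_VAL_ERROR) and bit 62 (RMID_VAL_UNAVAIL) flag an
         * invalid read; otherwise val is the raw count, to be scaled by
         * the CPUID-enumerated factor before reporting in bytes.
         */
        return val;
}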

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt.c |  32 +
 arch/x86/kernel/cpu/intel_rdt.h |   6 +
 arch/x86/kernel/cpu/intel_rdt_monitor.c | 213 
 3 files changed, 251 insertions(+)

diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 36e6454..fe6dc75 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -320,6 +320,19 @@ static u32 delay_bw_map(unsigned long bw, struct 
rdt_resource *r)
wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]);
 }
 
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
+{
+   struct rdt_domain *d;
+
+   list_for_each_entry(d, >domains, list) {
+   /* Find the domain that contains this CPU */
+   if (cpumask_test_cpu(cpu, >cpu_mask))
+   return d;
+   }
+
+   return NULL;
+}
+
 void rdt_ctrl_update(void *arg)
 {
struct msr_param *m = arg;
@@ -397,6 +410,19 @@ static int domain_setup_ctrlval(struct rdt_resource *r, 
struct rdt_domain *d)
return 0;
 }
 
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+{
+   if (is_llc_occupancy_enabled()) {
+   d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid),
+  sizeof(unsigned long),
+  GFP_KERNEL);
+   if (!d->rmid_busy_llc)
+   return -ENOMEM;
+   }
+
+   return 0;
+}
+
 /*
  * domain_add_cpu - Add a cpu to a resource's domain list.
  *
@@ -438,6 +464,11 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
}
 
+   if (r->mon_capable && domain_setup_mon_state(r, d)) {
+   kfree(d);
+   return;
+   }
+
cpumask_set_cpu(cpu, >cpu_mask);
list_add_tail(>list, add_pos);
 }
@@ -456,6 +487,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource 
*r)
cpumask_clear_cpu(cpu, >cpu_mask);
if (cpumask_empty(>cpu_mask)) {
kfree(d->ctrl_val);
+   kfree(d->rmid_busy_llc);
list_del(>list);
kfree(d);
}
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 993ab9d..35bf8eb 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -19,6 +19,8 @@
 #define QOS_L3_OCCUP_EVENT_ID  0x01
 #define QOS_L3_MBM_TOTAL_EVENT_ID  0x02
 #define QOS_L3_MBM_LOCAL_EVENT_ID  0x03
+#define RMID_VAL_ERROR BIT_ULL(63)
+#define RMID_VAL_UNAVAIL   BIT_ULL(62)
 
 /**
  * struct mon_evt - Entry in the event list of a resource
@@ -98,6 +100,8 @@ struct rftype {
  * @list:  all instances of this resource
  * @id:unique id for this instance
  * @cpu_mask:  which cpus share this resource
+ * @rmid_busy_llc:
+ * bitmap of which limbo RMIDs are above threshold
  * @ctrl_val:  array of cache or mem ctrl values (indexed by CLOSID)
  * @new_ctrl:  new ctrl value to be loaded
  * @have_new_ctrl: did user provide new_ctrl for this domain
@@ -106,6 +110,7 @@ struct rdt_domain {
struct list_headlist;
int id;
struct cpumask  cpu_mask;
+   unsigned long   *rmid_busy_llc;
u32 *ctrl_val;
u32 new_ctrl;
boolhave_new_ctrl;
@@ -282,6 +287,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
 int rdtgroup_schemata_show(struct kernfs_open_file *of,
   struct seq_file *s, void *v);
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
 int rdt_get_mon_l3_config(struct rdt_resource *r);
 
 #endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c 


[PATCH 02/28] x86/intel_rdt/cqm: Documentation for resctrl based RDT Monitoring

2017-07-25 Thread Vikas Shivappa
Add a description of the resctrl-based RDT (Resource Director Technology)
monitoring extension and its usage.

[Tony: Added descriptions for how monitoring and allocation are measured
and some cleanups]

Signed-off-by: Tony Luck 
Signed-off-by: Vikas Shivappa 
---
 Documentation/x86/intel_rdt_ui.txt | 316 -
 1 file changed, 278 insertions(+), 38 deletions(-)

diff --git a/Documentation/x86/intel_rdt_ui.txt 
b/Documentation/x86/intel_rdt_ui.txt
index c491a1b..76f21e2 100644



[PATCH 02/28] x86/intel_rdt/cqm: Documentation for resctrl based RDT Monitoring

2017-07-25 Thread Vikas Shivappa
Add a description of the resctrl based RDT (Resource Director Technology)
monitoring extension and its usage.

[Tony: Added descriptions for how monitoring and allocation are measured
and some cleanups]
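
As a minimal usage sketch (illustrative only, not part of the patch), the
snippet below reads one monitoring file through the interface described
here. It assumes resctrl is mounted at /sys/fs/resctrl and that a "p1"
group and an L3 domain 0 directory exist; the actual names depend on the
machine.

/*
 * Illustrative userspace sketch: read the llc_occupancy counter of a
 * monitoring group.  Path components are assumptions, not guarantees.
 */
#include <stdio.h>

int main(void)
{
        const char *path =
                "/sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy";
        unsigned long long bytes;
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%llu", &bytes) != 1) {
                fclose(f);
                return 1;
        }
        fclose(f);
        printf("L3 occupancy of group p1 on domain 0: %llu bytes\n", bytes);
        return 0;
}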

Signed-off-by: Tony Luck 
Signed-off-by: Vikas Shivappa 
---
 Documentation/x86/intel_rdt_ui.txt | 316 -
 1 file changed, 278 insertions(+), 38 deletions(-)

diff --git a/Documentation/x86/intel_rdt_ui.txt 
b/Documentation/x86/intel_rdt_ui.txt
index c491a1b..76f21e2 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -6,8 +6,8 @@ Fenghua Yu 
 Tony Luck 
 Vikas Shivappa 
 
-This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the
-X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3".
+This feature is enabled by the CONFIG_INTEL_RDT Kconfig and the
+X86 /proc/cpuinfo flag bits "rdt", "cqm", "cat_l3" and "cdp_l3".
 
 To use the feature mount the file system:
 
@@ -17,6 +17,13 @@ mount options are:
 
 "cdp": Enable code/data prioritization in L3 cache allocations.
 
+RDT features are orthogonal. A particular system may support only
+monitoring, only control, or both monitoring and control.
+
+The mount succeeds if either of allocation or monitoring is present, but
+only those files and directories supported by the system will be created.
+For more details on the behavior of the interface during monitoring
+and allocation, see the "Resource alloc and monitor groups" section.
 
 Info directory
 --
@@ -24,7 +31,12 @@ Info directory
 The 'info' directory contains information about the enabled
 resources. Each resource has its own subdirectory. The subdirectory
 names reflect the resource names.
-Cache resource(L3/L2)  subdirectory contains the following files:
+
+Each subdirectory contains the following files with respect to
+allocation:
+
+Cache resource(L3/L2)  subdirectory contains the following files
+related to allocation:
 
 "num_closids": The number of CLOSIDs which are valid for this
resource. The kernel uses the smallest number of
@@ -36,7 +48,8 @@ Cache resource(L3/L2)  subdirectory contains the following 
files:
 "min_cbm_bits":The minimum number of consecutive bits which
must be set when writing a mask.
 
-Memory bandwitdh(MB) subdirectory contains the following files:
+Memory bandwitdh(MB) subdirectory contains the following files
+with respect to allocation:
 
 "min_bandwidth":   The minimum memory bandwidth percentage which
user can request.
@@ -52,48 +65,152 @@ Memory bandwitdh(MB) subdirectory contains the following 
files:
non-linear. This field is purely informational
only.
 
-Resource groups

+If RDT monitoring is available there will be an "L3_MON" directory
+with the following files:
+
+"num_rmids":   The number of RMIDs available. This is the
+   upper bound for how many "CTRL_MON" + "MON"
+   groups can be created.
+
+"mon_features":Lists the monitoring events if
+   monitoring is enabled for the resource.
+
+"max_threshold_occupancy":
+   Read/write file provides the largest value (in
+   bytes) at which a previously used LLC_occupancy
+   counter can be considered for re-use.
+
+
+Resource alloc and monitor groups
+-
+
 Resource groups are represented as directories in the resctrl file
-system. The default group is the root directory. Other groups may be
-created as desired by the system administrator using the "mkdir(1)"
-command, and removed using "rmdir(1)".
+system.  The default group is the root directory which, immediately
+after mounting, owns all the tasks and cpus in the system and can make
+full use of all resources.
+
+On a system with RDT control features additional directories can be
+created in the root directory that specify different amounts of each
+resource (see "schemata" below). The root and these additional top level
+directories are referred to as "CTRL_MON" groups below.
+
+On a system with RDT monitoring the root directory and other top level
+directories contain a directory named "mon_groups" in which additional
+directories can be created to monitor subsets of tasks in the CTRL_MON
+group that is their ancestor. These are called "MON" groups in the rest
+of this document.
+
+Removing a directory will move all tasks and cpus owned by the group it
+represents to the parent. Removing one of the created CTRL_MON groups
+will automatically remove all MON groups below it.
+
+All groups contain the following files:
+
+"tasks":
+   Reading this file shows the list of all tasks that belong to
+   this group. Writing a task id to the file will add a task to the
+   group. If the group is a CTRL_MON group the task is removed from
+   whichever 

[PATCH 01/28] x86/perf/cqm: Wipe out perf based cqm

2017-07-25 Thread Vikas Shivappa
'perf cqm' never worked due to the incompatibility between the perf
infrastructure and the cqm hardware support. The hardware uses RMIDs to
track the llc occupancy of tasks, and these RMIDs are per package. This
makes it hard to monitor a hierarchy such as a cgroup while also
monitoring tasks separately, and several patches sent to lkml to fix
this were NACKed. Furthermore, the following issues in the current perf
cqm make it almost unusable:

1. No support to monitor the same group of tasks for which we do
allocation using resctrl.

2. It gives random and inaccurate data (mostly 0s) once we run out
of RMIDs due to issues in recycling.

3. Recycling results in inaccurate data because we cannot guarantee
that the RMID was stolen from a task only when it was not pulling
data into the cache, or even when it pulled the least data. Also,
for monitoring llc_occupancy, if we stop using an RMID_x and then
start using an RMID_y after we reclaim an RMID from another event,
we miss accounting for all the occupancy that was tagged to RMID_x
at a later perf_count.

4. The recycling code makes the monitoring code complex, including
scheduling, because an event can lose its RMID at any time. Since MBM
counters measure bandwidth over a period of time by taking snapshots
of total bytes at two different times, recycling complicates the way
we count MBM in a hierarchy. We also need a spin lock while we do the
processing to account for MBM counter overflow, and we currently use
a spin lock in scheduling to prevent the RMID from being taken away.

5. Lack of support when we run different kinds of events like task,
system-wide and cgroup events together. Data mostly prints 0s. This
is also because only one RMID can be tied to a cpu, as defined by the
cqm hardware, while perf can tie multiple events to that cpu during
one sched_in.

6. No support for monitoring a group of tasks. There is partial support
for cgroups, but it does not work once there is a hierarchy of cgroups
or if we want to monitor a task in a cgroup and the cgroup itself.

7. No support for monitoring tasks for their lifetime without perf
overhead.

8. It reported the aggregate cache occupancy or memory bandwidth over
all sockets. But most cloud and VMM based use cases want to know the
individual per-socket usage.
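
A small editorial sketch of the constraint in point 5, with a 64-bit
variable standing in for the per-CPU IA32_PQR_ASSOC MSR (RMID in the low
bits, CLOSID in bits 32-63 per the SDM; the helper and values are
illustrative only):

/*
 * Only one RMID can be active per CPU: the tag lives in a single
 * per-CPU MSR, so the last writer wins.
 */
#include <stdint.h>
#include <stdio.h>

#define PQR_RMID_MASK   0x3ffULL        /* RMID: bits 0-9 */

static uint64_t pqr_assoc;              /* stand-in for the per-CPU MSR */

static void pqr_write(uint32_t rmid, uint32_t closid)
{
        pqr_assoc = (rmid & PQR_RMID_MASK) | ((uint64_t)closid << 32);
}

int main(void)
{
        pqr_write(5, 1);        /* event A wants RMID 5 */
        pqr_write(9, 1);        /* event B, scheduled next, overwrites it */
        printf("active RMID = %llu\n",
               (unsigned long long)(pqr_assoc & PQR_RMID_MASK));
        return 0;
}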

Signed-off-by: Vikas Shivappa 
---
 arch/x86/events/intel/Makefile  |2 +-
 arch/x86/events/intel/cqm.c | 1766 ---
 arch/x86/include/asm/intel_rdt_common.h |2 -
 arch/x86/kernel/cpu/intel_rdt.c |8 +
 include/linux/perf_event.h  |   18 -
 kernel/events/core.c|   11 +-
 kernel/trace/bpf_trace.c|2 +-
 7 files changed, 11 insertions(+), 1798 deletions(-)
 delete mode 100644 arch/x86/events/intel/cqm.c

diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
index 06c2baa..e9d8520 100644
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_CPU_SUP_INTEL)+= core.o bts.o cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)+= core.o bts.o
 obj-$(CONFIG_CPU_SUP_INTEL)+= ds.o knc.o
 obj-$(CONFIG_CPU_SUP_INTEL)+= lbr.o p4.o p6.o pt.o
 obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL)   += intel-rapl-perf.o
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
deleted file mode 100644
index 8c00dc0..000
--- a/arch/x86/events/intel/cqm.c
+++ /dev/null
@@ -1,1766 +0,0 @@
-/*
- * Intel Cache Quality-of-Service Monitoring (CQM) support.
- *
- * Based very, very heavily on work by Peter Zijlstra.
- */
-
-#include 
-#include 
-#include 
-#include 
-#include "../perf_event.h"
-
-#define MSR_IA32_QM_CTR0x0c8e
-#define MSR_IA32_QM_EVTSEL 0x0c8d
-
-#define MBM_CNTR_WIDTH 24
-/*
- * Guaranteed time in ms as per SDM where MBM counters will not overflow.
- */
-#define MBM_CTR_OVERFLOW_TIME  1000
-
-static u32 cqm_max_rmid = -1;
-static unsigned int cqm_l3_scale; /* supposedly cacheline size */
-static bool cqm_enabled, mbm_enabled;
-unsigned int mbm_socket_max;
-
-/*
- * The cached intel_pqr_state is strictly per CPU and can never be
- * updated from a remote CPU. Both functions which modify the state
- * (intel_cqm_event_start and intel_cqm_event_stop) are called with
- * interrupts disabled, which is sufficient for the protection.
- */
-DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
-static struct hrtimer *mbm_timers;
-/**
- * struct sample - mbm event's (local or total) data
- * @total_bytes#bytes since we began monitoring
- * @prev_msr   previous value of MSR
- */
-struct sample {
-   u64 total_bytes;
-   u64 prev_msr;
-};
-
-/*
- * samples profiled for total memory bandwidth type events
- */
-static struct sample *mbm_total;
-/*
- * samples profiled for local memory bandwidth type events
- */

[PATCH 17/28] x86/intel_rdt/cqm: Add cpus file support

2017-07-25 Thread Vikas Shivappa
The cpus file is extended to support resource monitoring. This is used
to override the RMID of the default group when running on specific
CPUs. It works similarly to resource control. The "cpus" and
"cpus_list" files are present in the default group, ctrl_mon groups and
monitor groups.

Reading a "cpus" or "cpus_list" file shows a cpumask or list of which
CPUs belong to the resource group. By default all online cpus belong to
the default root group. A CPU can be present in one "ctrl_mon" and one
"monitor" group simultaneously. CPUs can be added to a resource group by
writing them to the file. When a CPU is added to a ctrl_mon group it
is automatically removed from the previous ctrl_mon group. A CPU can be
added to a monitor group only if it is present in the parent ctrl_mon
group, and when a CPU is added to a monitor group it is automatically
removed from the previous monitor group. When CPUs go offline, they are
automatically removed from the ctrl_mon and monitor groups.
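
The following editorial sketch models these rules for a MON group with
plain 64-bit masks standing in for cpumasks (the names and the sibling
handling are simplified; the real code walks the parent's list of child
MON groups):

#include <stdint.h>
#include <stdio.h>

struct grp {
        uint64_t cpus;
};

/* Returns 0 on success, -1 if newmask is not a subset of the parent. */
static int mon_cpus_write(struct grp *parent, struct grp *mon,
                          struct grp *sibling, uint64_t newmask)
{
        uint64_t dropped, added;

        if (newmask & ~parent->cpus)    /* must stay within the parent */
                return -1;

        dropped = mon->cpus & ~newmask; /* dropped cpus revert to parent */
        added = newmask & ~mon->cpus;   /* added cpus leave sibling groups */

        sibling->cpus &= ~added;
        mon->cpus = newmask;
        (void)dropped;  /* parent already owns them; only the RMID changes */
        return 0;
}

int main(void)
{
        struct grp parent = { .cpus = 0xff };
        struct grp m1 = { .cpus = 0x0f };
        struct grp m2 = { .cpus = 0xf0 };

        if (!mon_cpus_write(&parent, &m1, &m2, 0x3f))
                printf("m1=%#llx m2=%#llx\n",
                       (unsigned long long)m1.cpus,
                       (unsigned long long)m2.cpus);
        return 0;
}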

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 110 ++-
 1 file changed, 93 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 7f8f52d..abc06ea 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -181,15 +181,17 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 /*
  * This is safe against intel_rdt_sched_in() called from __switch_to()
  * because __switch_to() is executed with interrupts disabled. A local call
- * from update_closid() is proteced against __switch_to() because
+ * from update_closid_rmid() is proteced against __switch_to() because
  * preemption is disabled.
  */
-static void update_cpu_closid(void *info)
+static void update_cpu_closid_rmid(void *info)
 {
struct rdtgroup *r = info;
 
-   if (r)
+   if (r) {
this_cpu_write(rdt_cpu_default.closid, r->closid);
+   this_cpu_write(rdt_cpu_default.rmid, r->mon.rmid);
+   }
 
/*
 * We cannot unconditionally write the MSR because the current
@@ -205,20 +207,72 @@ static void update_cpu_closid(void *info)
  * Per task closids/rmids must have been set up before calling this function.
  */
 static void
-update_closid(const struct cpumask *cpu_mask, struct rdtgroup *r)
+update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
 {
int cpu = get_cpu();
 
if (cpumask_test_cpu(cpu, cpu_mask))
-   update_cpu_closid(r);
-   smp_call_function_many(cpu_mask, update_cpu_closid, r, 1);
+   update_cpu_closid_rmid(r);
+   smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
put_cpu();
 }
 
+static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+ cpumask_var_t tmpmask)
+{
+   struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
+   struct list_head *head;
+
+   /* Check whether cpus belong to parent ctrl group */
+   cpumask_andnot(tmpmask, newmask, >cpu_mask);
+   if (cpumask_weight(tmpmask))
+   return -EINVAL;
+
+   /* Check whether cpus are dropped from this group */
+   cpumask_andnot(tmpmask, >cpu_mask, newmask);
+   if (cpumask_weight(tmpmask)) {
+   /* Give any dropped cpus to parent rdtgroup */
+   cpumask_or(>cpu_mask, >cpu_mask, tmpmask);
+   update_closid_rmid(tmpmask, prgrp);
+   }
+
+   /*
+* If we added cpus, remove them from previous group that owned them
+* and update per-cpu rmid
+*/
+   cpumask_andnot(tmpmask, newmask, >cpu_mask);
+   if (cpumask_weight(tmpmask)) {
+   head = >mon.crdtgrp_list;
+   list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+   if (crgrp == rdtgrp)
+   continue;
+   cpumask_andnot(>cpu_mask, >cpu_mask,
+  tmpmask);
+   }
+   update_closid_rmid(tmpmask, rdtgrp);
+   }
+
+   /* Done pushing/pulling - update this group with new mask */
+   cpumask_copy(>cpu_mask, newmask);
+
+   return 0;
+}
+
+static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
+{
+   struct rdtgroup *crgrp;
+
+   cpumask_andnot(>cpu_mask, >cpu_mask, m);
+   /* update the child mon group masks as well*/
+   list_for_each_entry(crgrp, >mon.crdtgrp_list, mon.crdtgrp_list)
+   cpumask_and(>cpu_mask, >cpu_mask, >cpu_mask);
+}
+
 static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
-  cpumask_var_t tmpmask)
+  cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
 {
-   struct rdtgroup *r;
+   struct rdtgroup *r, *crgrp;
+   struct list_head *head;
 
/* Check whether cpus are dropped from 

[PATCH 05/28] x86/intel_rdt: Mark rdt_root and closid_alloc as static

2017-07-25 Thread Vikas Shivappa
From: Reinette Chatre 

Sparse reports that both of these can be static.

Make it so.

Signed-off-by: Reinette Chatre 
Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index fab8811..3273e88 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -36,7 +36,7 @@
 #include "intel_rdt.h"
 
 DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
-struct kernfs_root *rdt_root;
+static struct kernfs_root *rdt_root;
 struct rdtgroup rdtgroup_default;
 LIST_HEAD(rdt_all_groups);
 
@@ -75,7 +75,7 @@ static void closid_init(void)
closid_free_map &= ~1;
 }
 
-int closid_alloc(void)
+static int closid_alloc(void)
 {
int closid = ffs(closid_free_map);
 
-- 
1.9.1





[PATCH 22/28] x86/intel_rdt/cqm: Add mount,umount support

2017-07-25 Thread Vikas Shivappa
Add monitoring support during mount and unmount. Since the root directory
is a "ctrl_mon" directory which can both control and monitor resources,
create the "mon_groups" directory, which can hold monitor groups, and a
"mon_data" directory, which holds all monitoring data, just like the rest
of the resource groups.

The mount succeeds if either monitoring or control/allocation is
enabled. If only monitoring is enabled, the user can still create monitor
groups under "/sys/fs/resctrl/mon_groups/", but any mkdir under the root
would fail. If only control/allocation is enabled, none of the monitoring
related directories/files exist and resctrl works in legacy mode.
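
As an illustrative userspace check (not part of the patch), a program can
detect which half of the interface a mounted resctrl provides by probing
for the monitoring directory; the mount point below is an assumption:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        struct stat st;

        /* "mon_groups" only exists when RDT monitoring is supported */
        if (stat("/sys/fs/resctrl/mon_groups", &st) == 0 &&
            S_ISDIR(st.st_mode))
                printf("RDT monitoring is available\n");
        else
                printf("allocation-only (legacy) resctrl mount\n");
        return 0;
}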

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt.h  |  4 +++
 arch/x86/kernel/cpu/intel_rdt_monitor.c  |  1 +
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 61 +---
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 7fcaa5f..92a5d30 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -22,6 +22,8 @@
 #define RMID_VAL_ERROR BIT_ULL(63)
 #define RMID_VAL_UNAVAIL   BIT_ULL(62)
 
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
 /**
  * struct mon_evt - Entry in the event list of a resource
  * @evtid: event id
@@ -59,6 +61,8 @@ struct rmid_read {
 extern bool rdt_mon_capable;
 extern unsigned int rdt_mon_features;
 
+DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+
 enum rdt_group_type {
RDTCTRL_GROUP = 0,
RDTMON_GROUP,
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c 
b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index f8f06f5..6ae5cf5 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -343,6 +343,7 @@ void mon_event_count(void *info)
}
}
 }
+
 static int dom_data_init(struct rdt_resource *r)
 {
struct rmid_entry *entry = NULL;
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index a5e6d60..37698fb 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -35,6 +35,8 @@
 #include 
 #include "intel_rdt.h"
 
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
 DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
 static struct kernfs_root *rdt_root;
 struct rdtgroup rdtgroup_default;
@@ -43,6 +45,12 @@
 /* Kernel fs node for "info" directory under root */
 static struct kernfs_node *kn_info;
 
+/* Kernel fs node for "mon_groups" directory under root */
+static struct kernfs_node *kn_mongrp;
+
+/* Kernel fs node for "mon_data" directory under root */
+static struct kernfs_node *kn_mondata;
+
 /*
  * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
  * we can keep a bitmap of free CLOSIDs in a single integer.
@@ -1045,6 +1053,10 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
}
 }
 
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+struct rdtgroup *prgrp,
+struct kernfs_node **mon_data_kn);
+
 static struct dentry *rdt_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
@@ -1056,7 +1068,7 @@ static struct dentry *rdt_mount(struct file_system_type 
*fs_type,
/*
 * resctrl file system can only be mounted once.
 */
-   if (static_branch_unlikely(_alloc_enable_key)) {
+   if (static_branch_unlikely(_enable_key)) {
dentry = ERR_PTR(-EBUSY);
goto out;
}
@@ -1075,15 +1087,47 @@ static struct dentry *rdt_mount(struct file_system_type 
*fs_type,
goto out_cdp;
}
 
+   if (rdt_mon_capable) {
+   ret = mongroup_create_dir(rdtgroup_default.kn,
+ NULL, "mon_groups",
+ _mongrp);
+   if (ret) {
+   dentry = ERR_PTR(ret);
+   goto out_info;
+   }
+   kernfs_get(kn_mongrp);
+
+   ret = mkdir_mondata_all(rdtgroup_default.kn,
+   _default, _mondata);
+   if (ret) {
+   dentry = ERR_PTR(ret);
+   goto out_mongrp;
+   }
+   kernfs_get(kn_mondata);
+   rdtgroup_default.mon.mon_data_kn = kn_mondata;
+   }
+
dentry = kernfs_mount(fs_type, flags, rdt_root,
  RDTGROUP_SUPER_MAGIC, NULL);
if (IS_ERR(dentry))
-   goto out_destroy;
+   goto out_mondata;
 
-   static_branch_enable(_alloc_enable_key);
+   if (rdt_alloc_capable)
+   static_branch_enable(_alloc_enable_key);
+   if 


[PATCH 21/28] x86/intel_rdt/cqm: Add rmdir support

2017-07-25 Thread Vikas Shivappa
Resource groups (ctrl_mon and monitor groups) are represented by
directories in the resctrl fs. Add support for removing the directories.

When a ctrl_mon directory is removed, all the cpus and tasks are assigned
back to the root rdtgroup. When a monitor group is removed, the cpus and
tasks are returned to the parent control group.
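
An editorial sketch of these two cases, with plain structs standing in
for struct task_struct and struct rdtgroup (field names are simplified):

#include <stdio.h>

struct task { int closid, rmid; };
struct group { int closid, rmid; };

/* rmdir of a CTRL_MON group: tasks get the root group's closid and rmid */
static void move_to_ctrl(struct task *t, const struct group *to)
{
        t->closid = to->closid;
        t->rmid = to->rmid;
}

/* rmdir of a MON group: only the rmid changes, the control group stays */
static void move_to_mon_parent(struct task *t, const struct group *parent)
{
        t->rmid = parent->rmid;
}

int main(void)
{
        struct group root = { .closid = 0, .rmid = 0 };
        struct group ctrl = { .closid = 2, .rmid = 7 };
        struct task t = { .closid = 2, .rmid = 9 };     /* in a MON child */

        move_to_mon_parent(&t, &ctrl);  /* remove the MON group */
        move_to_ctrl(&t, &root);        /* then remove the CTRL_MON group */
        printf("closid=%d rmid=%d\n", t.closid, t.rmid);
        return 0;
}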

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 90 ++--
 1 file changed, 86 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 59251592..a5e6d60 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -1131,6 +1131,18 @@ static int reset_all_ctrls(struct rdt_resource *r)
return 0;
 }
 
+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
+{
+   return (rdt_alloc_capable &&
+   (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
+}
+
+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
+{
+   return (rdt_mon_capable &&
+   (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
+}
+
 /*
  * Move tasks from one to the other group. If @from is NULL, then all tasks
  * in the systems are moved unconditionally (used for teardown).
@@ -1146,8 +1158,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, 
struct rdtgroup *to,
 
read_lock(_lock);
for_each_process_thread(p, t) {
-   if (!from || t->closid == from->closid) {
+   if (!from || is_closid_match(t, from) ||
+   is_rmid_match(t, from)) {
t->closid = to->closid;
+   t->rmid = to->mon.rmid;
+
 #ifdef CONFIG_SMP
/*
 * This is safe on x86 w/o barriers as the ordering
@@ -1166,6 +1181,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, 
struct rdtgroup *to,
read_unlock(_lock);
 }
 
+static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
+{
+   struct rdtgroup *sentry, *stmp;
+   struct list_head *head;
+
+   head = >mon.crdtgrp_list;
+   list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
+   free_rmid(sentry->mon.rmid);
+   list_del(>mon.crdtgrp_list);
+   kfree(sentry);
+   }
+}
+
 /*
  * Forcibly remove all of subdirectories under root.
  */
@@ -1568,6 +1596,44 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, 
const char *name,
return -EPERM;
 }
 
+static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+ cpumask_var_t tmpmask)
+{
+   struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+   int cpu;
+
+   /* Give any tasks back to the parent group */
+   rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
+
+   /* Update per cpu rmid of the moved CPUs first */
+   for_each_cpu(cpu, >cpu_mask)
+   per_cpu(rdt_cpu_default.rmid, cpu) = prdtgrp->mon.rmid;
+   /*
+* Update the MSR on moved CPUs and CPUs which have moved
+* task running on them.
+*/
+   cpumask_or(tmpmask, tmpmask, >cpu_mask);
+   update_closid_rmid(tmpmask, NULL);
+
+   rdtgrp->flags = RDT_DELETED;
+   free_rmid(rdtgrp->mon.rmid);
+
+   /*
+* Remove the rdtgrp from the parent ctrl_mon group's list
+*/
+   WARN_ON(list_empty(>mon.crdtgrp_list));
+   list_del(>mon.crdtgrp_list);
+
+   /*
+* one extra hold on this, will drop when we kfree(rdtgrp)
+* in rdtgroup_kn_unlock()
+*/
+   kernfs_get(kn);
+   kernfs_remove(rdtgrp->kn);
+
+   return 0;
+}
+
 static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
   cpumask_var_t tmpmask)
 {
@@ -1580,9 +1646,12 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, 
struct rdtgroup *rdtgrp,
cpumask_or(_default.cpu_mask,
   _default.cpu_mask, >cpu_mask);
 
-   /* Update per cpu closid of the moved CPUs first */
-   for_each_cpu(cpu, >cpu_mask)
+   /* Update per cpu closid and rmid of the moved CPUs first */
+   for_each_cpu(cpu, >cpu_mask) {
per_cpu(rdt_cpu_default.closid, cpu) = rdtgroup_default.closid;
+   per_cpu(rdt_cpu_default.rmid, cpu) = rdtgroup_default.mon.rmid;
+   }
+
/*
 * Update the MSR on moved CPUs and CPUs which have moved
 * task running on them.
@@ -1592,6 +1661,13 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, 
struct rdtgroup *rdtgrp,
 
rdtgrp->flags = RDT_DELETED;
closid_free(rdtgrp->closid);
+   free_rmid(rdtgrp->mon.rmid);
+
+   /*
+* Free all the child monitor group rmids.
+*/
+   free_all_child_rdtgrp(rdtgrp);
+
list_del(>rdtgroup_list);
 
/*
@@ -1622,10 +1698,16 @@ static int 


[PATCH 20/28] x86/intel_rdt: Separate the ctrl bits from rmdir

2017-07-25 Thread Vikas Shivappa
Refactor the code to separate ctrl group removal from rmdir in
preparation for adding RDT monitoring group removal.

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 48 ++--
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 8f9b54e..59251592 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -1568,20 +1568,10 @@ static int rdtgroup_mkdir(struct kernfs_node 
*parent_kn, const char *name,
return -EPERM;
 }
 
-static int rdtgroup_rmdir(struct kernfs_node *kn)
+static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+  cpumask_var_t tmpmask)
 {
-   int ret, cpu, closid = rdtgroup_default.closid;
-   struct rdtgroup *rdtgrp;
-   cpumask_var_t tmpmask;
-
-   if (!zalloc_cpumask_var(, GFP_KERNEL))
-   return -ENOMEM;
-
-   rdtgrp = rdtgroup_kn_lock_live(kn);
-   if (!rdtgrp) {
-   ret = -EPERM;
-   goto out;
-   }
+   int cpu;
 
/* Give any tasks back to the default group */
rdt_move_group_tasks(rdtgrp, _default, tmpmask);
@@ -1592,7 +1582,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
 
/* Update per cpu closid of the moved CPUs first */
for_each_cpu(cpu, >cpu_mask)
-   per_cpu(rdt_cpu_default.closid, cpu) = closid;
+   per_cpu(rdt_cpu_default.closid, cpu) = rdtgroup_default.closid;
/*
 * Update the MSR on moved CPUs and CPUs which have moved
 * task running on them.
@@ -1610,7 +1600,35 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
 */
kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
-   ret = 0;
+
+   return 0;
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+   struct kernfs_node *parent_kn = kn->parent;
+   struct rdtgroup *rdtgrp;
+   cpumask_var_t tmpmask;
+   int ret = 0;
+
+   if (!zalloc_cpumask_var(, GFP_KERNEL))
+   return -ENOMEM;
+
+   rdtgrp = rdtgroup_kn_lock_live(kn);
+   if (!rdtgrp) {
+   ret = -EPERM;
+   goto out;
+   }
+
+   /*
+* If the rdtgroup is a ctrl_mon group and parent directory
+* is the root directory, remove the ctrl group.
+*/
+   if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn)
+   ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
+   else
+   ret = -EPERM;
+
 out:
rdtgroup_kn_unlock(kn);
free_cpumask_var(tmpmask);
-- 
1.9.1



[PATCH 19/28] x86/intel_rdt/cqm: Add mon_data

2017-07-25 Thread Vikas Shivappa
Add a mon_data directory for the root rdtgroup and all other rdtgroups.
The directory holds all of the monitored data for all domains and events
of all resources being monitored.

The mon_data directory itself has a list of directories in the format
mon_<domain_name>_<domain_id>. Each of these subdirectories contains one
file per event in the mode "0444". Reading the file displays a snapshot
of the monitored data for the event the file represents.

For example, on a 2-socket Broadwell with llc_occupancy being
monitored, the mon_data contents look as below:

$ ls /sys/fs/resctrl/p1/mon_data/
mon_L3_00
mon_L3_01

Each domain directory has one file per event:
$ ls /sys/fs/resctrl/p1/mon_data/mon_L3_00/
llc_occupancy

To read the current llc_occupancy of ctrl_mon group p1:
$ cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
33789096

[This patch idea is based on Tony's sample patches to organise data in a
per domain directory and have one file per event (and use the fp->priv to
store mon data bits)]
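
The per-file private data is small enough to live directly in the kernfs
priv pointer. The standalone sketch below mirrors the union mon_data_bits
packing from this patch (the event and domain values are illustrative):

#include <assert.h>
#include <stdio.h>

union mon_data_bits {
        void *priv;
        struct {
                unsigned int rid        : 10;
                unsigned int evtid      : 8;
                unsigned int domid      : 14;
        } u;
};

int main(void)
{
        union mon_data_bits md = { .priv = NULL };

        md.u.rid = 1;           /* e.g. the L3 resource */
        md.u.evtid = 1;         /* e.g. an occupancy event id */
        md.u.domid = 0;         /* cache domain (socket) 0 */

        /* the packed ids fit inside the pointer, no extra allocation */
        assert(sizeof(md.u) <= sizeof(void *));
        printf("rid=%u evtid=%u domid=%u priv=%p\n",
               md.u.rid, md.u.evtid, md.u.domid, md.priv);
        return 0;
}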

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt.c |   9 +-
 arch/x86/kernel/cpu/intel_rdt.h |  29 ++
 arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c |  55 ++-
 arch/x86/kernel/cpu/intel_rdt_monitor.c |  49 ++
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c| 145 
 5 files changed, 284 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index cd48ec9..81d1cd3 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -71,6 +71,7 @@
 struct rdt_resource rdt_resources_all[] = {
[RDT_RESOURCE_L3] =
{
+   .rid= RDT_RESOURCE_L3,
.name   = "L3",
.domains= domain_init(RDT_RESOURCE_L3),
.msr_base   = IA32_L3_CBM_BASE,
@@ -87,6 +88,7 @@ struct rdt_resource rdt_resources_all[] = {
},
[RDT_RESOURCE_L3DATA] =
{
+   .rid= RDT_RESOURCE_L3DATA,
.name   = "L3DATA",
.domains= domain_init(RDT_RESOURCE_L3DATA),
.msr_base   = IA32_L3_CBM_BASE,
@@ -103,6 +105,7 @@ struct rdt_resource rdt_resources_all[] = {
},
[RDT_RESOURCE_L3CODE] =
{
+   .rid= RDT_RESOURCE_L3CODE,
.name   = "L3CODE",
.domains= domain_init(RDT_RESOURCE_L3CODE),
.msr_base   = IA32_L3_CBM_BASE,
@@ -119,6 +122,7 @@ struct rdt_resource rdt_resources_all[] = {
},
[RDT_RESOURCE_L2] =
{
+   .rid= RDT_RESOURCE_L2,
.name   = "L2",
.domains= domain_init(RDT_RESOURCE_L2),
.msr_base   = IA32_L2_CBM_BASE,
@@ -135,6 +139,7 @@ struct rdt_resource rdt_resources_all[] = {
},
[RDT_RESOURCE_MBA] =
{
+   .rid= RDT_RESOURCE_MBA,
.name   = "MB",
.domains= domain_init(RDT_RESOURCE_MBA),
.msr_base   = IA32_MBA_THRTL_BASE,
@@ -362,8 +367,8 @@ void rdt_ctrl_update(void *arg)
  * caller, return the first domain whose id is bigger than the input id.
  * The domain list is sorted by id in ascending order.
  */
-static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos)
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+  struct list_head **pos)
 {
struct rdt_domain *d;
struct list_head *l;
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 6f07047..7fcaa5f 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -33,6 +33,27 @@ struct mon_evt {
struct list_headlist;
 };
 
+/**
+ * struct mon_data_bits - Monitoring details for each event file
+ * @rid:   Resource id associated with the event file.
+ * @evtid: Event id associated with the event file
+ * @domid: The domain to which the event file belongs
+ */
+union mon_data_bits {
+   void *priv;
+   struct {
+   unsigned int rid: 10;
+   unsigned int evtid  : 8;
+   unsigned int domid  : 14;
+   } u;
+};
+
+struct rmid_read {
+   struct rdtgroup *rgrp;
+   int evtid;
+   u64 val;
+};
+
 extern unsigned int intel_cqm_threshold;
 extern bool rdt_alloc_capable;
 extern bool rdt_mon_capable;
@@ -46,11 +67,13 @@ enum rdt_group_type {
 
 /**
  * struct mongroup - store mon group's data in resctrl fs.
+ * @mon_data_kn  



[PATCH 23/28] x86/intel_rdt: Introduce rdt_enable_key for scheduling

2017-07-25 Thread Vikas Shivappa
Introduce the use of rdt_enable_key in the sched_in code in preparation
for adding RDT monitoring support to sched_in.
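
An editorial sketch of the resulting call pattern, with plain booleans
standing in for the static keys (in the kernel, static_branch_likely()
compiles to a patched jump, so the disabled case costs almost nothing on
the scheduler hot path):

#include <stdbool.h>
#include <stdio.h>

static bool rdt_enable_key;             /* alloc or mon enabled */
static bool rdt_alloc_enable_key;       /* allocation in use */

static void __rdt_sched_in(void)
{
        if (rdt_alloc_enable_key)
                printf("write CLOSID to IA32_PQR_MSR\n");
        /* a later patch adds the RMID update here */
}

static void rdt_sched_in(void)          /* called from __switch_to() */
{
        if (rdt_enable_key)
                __rdt_sched_in();
}

int main(void)
{
        rdt_sched_in();                 /* nothing: keys disabled */
        rdt_enable_key = rdt_alloc_enable_key = true;
        rdt_sched_in();                 /* now takes the slow path */
        return 0;
}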

Signed-off-by: Vikas Shivappa 
---
 arch/x86/include/asm/intel_rdt_sched.h | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/intel_rdt_sched.h 
b/arch/x86/include/asm/intel_rdt_sched.h
index 2c704d2..8c5be01 100644
--- a/arch/x86/include/asm/intel_rdt_sched.h
+++ b/arch/x86/include/asm/intel_rdt_sched.h
@@ -27,10 +27,12 @@ struct intel_pqr_state {
 
 DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
 DECLARE_PER_CPU_READ_MOSTLY(struct intel_pqr_state, rdt_cpu_default);
+
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
 DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
 
 /*
- * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ * __intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
  *
  * Following considerations are made so that this has minimal impact
  * on scheduler hot path:
@@ -42,7 +44,7 @@ struct intel_pqr_state {
  *
  * Must be called with preemption disabled.
  */
-static inline void intel_rdt_sched_in(void)
+static inline void __intel_rdt_sched_in(void)
 {
if (static_branch_likely(_alloc_enable_key)) {
struct intel_pqr_state *state = this_cpu_ptr(_state);
@@ -63,6 +65,12 @@ static inline void intel_rdt_sched_in(void)
}
 }
 
+static inline void intel_rdt_sched_in(void)
+{
+   if (static_branch_likely(_enable_key))
+   __intel_rdt_sched_in();
+}
+
 #else
 
 static inline void intel_rdt_sched_in(void) {}
-- 
1.9.1



[PATCH 26/28] x86/intel_rdt/mbm: Basic counting of MBM events (total and local)

2017-07-25 Thread Vikas Shivappa
From: Tony Luck 

Check CPUID bits for whether each of the MBM events is supported.
Allocate space, per RMID and per counter, in each domain to save the
previous MSR counter value and the running total of data.
Create files in each of the monitor directories.
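
Concretely, the saved state is one struct mbm_state per RMID, per event, per
domain. A minimal allocation/indexing sketch (the helper name is invented;
num_rmid stands for r->num_rmid):

static int example_alloc_mbm_state(struct rdt_domain *d, u32 num_rmid)
{
	/* One slot per RMID for each enabled MBM event. */
	d->mbm_total = kcalloc(num_rmid, sizeof(*d->mbm_total), GFP_KERNEL);
	d->mbm_local = kcalloc(num_rmid, sizeof(*d->mbm_local), GFP_KERNEL);
	if (!d->mbm_total || !d->mbm_local) {
		kfree(d->mbm_total);
		kfree(d->mbm_local);
		return -ENOMEM;
	}
	return 0;
}

/*
 * A later read for a group's RMID then only needs:
 *	d->mbm_total[rmid].chunks     running total, in hardware chunks
 *	d->mbm_total[rmid].prev_msr   raw IA32_QM_CTR value at the last read
 */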

Signed-off-by: Tony Luck 
Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt.c | 23 +++-
 arch/x86/kernel/cpu/intel_rdt.h | 33 +
 arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c |  1 +
 arch/x86/kernel/cpu/intel_rdt_monitor.c | 31 ++-
 4 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 83f35e3..767b4c3 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -420,6 +420,8 @@ static int domain_setup_ctrlval(struct rdt_resource *r, 
struct rdt_domain *d)
 
 static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
 {
+   size_t tsize;
+
if (is_llc_occupancy_enabled()) {
d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid),
   sizeof(unsigned long),
@@ -427,6 +429,23 @@ static int domain_setup_mon_state(struct rdt_resource *r, 
struct rdt_domain *d)
if (!d->rmid_busy_llc)
return -ENOMEM;
}
+   if (is_mbm_total_enabled()) {
+   tsize = sizeof(*d->mbm_total);
+   d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+   if (!d->mbm_total) {
+   kfree(d->rmid_busy_llc);
+   return -ENOMEM;
+   }
+   }
+   if (is_mbm_local_enabled()) {
+   tsize = sizeof(*d->mbm_local);
+   d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+   if (!d->mbm_local) {
+   kfree(d->rmid_busy_llc);
+   kfree(d->mbm_total);
+   return -ENOMEM;
+   }
+   }
 
return 0;
 }
@@ -466,6 +485,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
 
d->id = id;
+   cpumask_set_cpu(cpu, >cpu_mask);
 
if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
kfree(d);
@@ -477,7 +497,6 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
}
 
-   cpumask_set_cpu(cpu, >cpu_mask);
list_add_tail(>list, add_pos);
 
/*
@@ -509,6 +528,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource 
*r)
rmdir_mondata_subdir_allrdtgrp(r, d->id);
kfree(d->ctrl_val);
kfree(d->rmid_busy_llc);
+   kfree(d->mbm_total);
+   kfree(d->mbm_local);
list_del(>list);
kfree(d);
}
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 2660d15..2137d5e 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -19,6 +19,9 @@
 #define QOS_L3_OCCUP_EVENT_ID  0x01
 #define QOS_L3_MBM_TOTAL_EVENT_ID  0x02
 #define QOS_L3_MBM_LOCAL_EVENT_ID  0x03
+
+#define MBM_CNTR_WIDTH 24
+
 #define RMID_VAL_ERROR BIT_ULL(63)
 #define RMID_VAL_UNAVAIL   BIT_ULL(62)
 
@@ -50,6 +53,7 @@ struct mon_evt {
 
 struct rmid_read {
struct rdtgroup *rgrp;
+   struct rdt_domain   *d;
int evtid;
u64 val;
 };
@@ -160,12 +164,24 @@ struct rftype {
 };
 
 /**
+ * struct mbm_state - status for each MBM counter in each domain
+ * @chunks:Total data moved (multiply by rdt_group.mon_scale to get bytes)
+ * @prev_msr   Value of IA32_QM_CTR for this RMID last time we read it
+ */
+struct mbm_state {
+   u64 chunks;
+   u64 prev_msr;
+};
+
+/**
  * struct rdt_domain - group of cpus sharing an RDT resource
  * @list:  all instances of this resource
  * @id:unique id for this instance
  * @cpu_mask:  which cpus share this resource
  * @rmid_busy_llc:
  * bitmap of which limbo RMIDs are above threshold
+ * @mbm_total: saved state for MBM total bandwidth
+ * @mbm_local: saved state for MBM local bandwidth
  * @ctrl_val:  array of cache or mem ctrl values (indexed by CLOSID)
  * @new_ctrl:  new ctrl value to be loaded
  * @have_new_ctrl: did user provide new_ctrl for this domain
@@ -175,6 +191,8 @@ struct rdt_domain {
int id;
struct cpumask  cpu_mask;
unsigned long   *rmid_busy_llc;
+   struct mbm_state*mbm_total;
+   struct mbm_state*mbm_local;
u32 *ctrl_val;
u32 new_ctrl;
boolhave_new_ctrl;
@@ -230,6 +248,21 @@ static inline bool 

[PATCH 25/28] x86/intel_rdt/cqm: Add hotcpu support

2017-07-25 Thread Vikas Shivappa
Resource groups have a per-domain directory under "mon_data". Add or
remove these directories as domains come online and go offline. Also
update the per-CPU RMIDs and the cached MSR state when CPUs come online
or go offline.
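
The callbacks doing this are intel_rdt_online_cpu()/intel_rdt_offline_cpu()
shown below; for orientation, they hang off the usual cpuhp registration
pattern, roughly like this (the state name and placement are illustrative
only):

#include <linux/cpuhotplug.h>

static int __init example_register_rdt_hotcpu(void)
{
	int state;

	/* Callbacks run on the CPU that is coming online / going offline. */
	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rdt:online",
				  intel_rdt_online_cpu, intel_rdt_offline_cpu);
	return state < 0 ? state : 0;
}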

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt.c  | 41 +++-
 arch/x86/kernel/cpu/intel_rdt.h  |  9 ++
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 47 
 3 files changed, 90 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 81d1cd3..83f35e3 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -479,6 +479,13 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 
cpumask_set_cpu(cpu, >cpu_mask);
list_add_tail(>list, add_pos);
+
+   /*
+* If resctrl is mounted, add
+* per domain monitor data directories.
+*/
+   if (static_branch_unlikely(_mon_enable_key))
+   mkdir_mondata_subdir_allrdtgrp(r, d);
 }
 
 static void domain_remove_cpu(int cpu, struct rdt_resource *r)
@@ -494,6 +501,12 @@ static void domain_remove_cpu(int cpu, struct rdt_resource 
*r)
 
cpumask_clear_cpu(cpu, >cpu_mask);
if (cpumask_empty(>cpu_mask)) {
+   /*
+* If resctrl is mounted, remove all the
+* per domain monitor data directories.
+*/
+   if (static_branch_unlikely(_mon_enable_key))
+   rmdir_mondata_subdir_allrdtgrp(r, d->id);
kfree(d->ctrl_val);
kfree(d->rmid_busy_llc);
list_del(>list);
@@ -501,13 +514,14 @@ static void domain_remove_cpu(int cpu, struct 
rdt_resource *r)
}
 }
 
-static void clear_closid(int cpu)
+static void clear_closid_rmid(int cpu)
 {
struct intel_pqr_state *state = this_cpu_ptr(_state);
 
per_cpu(rdt_cpu_default.closid, cpu) = 0;
state->closid = 0;
-   wrmsr(IA32_PQR_ASSOC, state->rmid, 0);
+   state->rmid = 0;
+   wrmsr(IA32_PQR_ASSOC, 0, 0);
 }
 
 static int intel_rdt_online_cpu(unsigned int cpu)
@@ -515,29 +529,42 @@ static int intel_rdt_online_cpu(unsigned int cpu)
struct rdt_resource *r;
 
mutex_lock(_mutex);
-   for_each_alloc_capable_rdt_resource(r)
+   for_each_capable_rdt_resource(r)
domain_add_cpu(cpu, r);
/* The cpu is set in default rdtgroup after online. */
cpumask_set_cpu(cpu, _default.cpu_mask);
-   clear_closid(cpu);
+   clear_closid_rmid(cpu);
mutex_unlock(_mutex);
 
return 0;
 }
 
+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
+{
+   struct rdtgroup *cr;
+
+   list_for_each_entry(cr, >mon.crdtgrp_list, mon.crdtgrp_list) {
+   if (cpumask_test_and_clear_cpu(cpu, >cpu_mask)) {
+   break;
+   }
+   }
+}
+
 static int intel_rdt_offline_cpu(unsigned int cpu)
 {
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
 
mutex_lock(_mutex);
-   for_each_alloc_capable_rdt_resource(r)
+   for_each_capable_rdt_resource(r)
domain_remove_cpu(cpu, r);
list_for_each_entry(rdtgrp, _all_groups, rdtgroup_list) {
-   if (cpumask_test_and_clear_cpu(cpu, >cpu_mask))
+   if (cpumask_test_and_clear_cpu(cpu, >cpu_mask)) {
+   clear_childcpus(rdtgrp, cpu);
break;
+   }
}
-   clear_closid(cpu);
+   clear_closid_rmid(cpu);
mutex_unlock(_mutex);
 
return 0;
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 7fcaa5f..2660d15 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -301,6 +301,11 @@ enum {
RDT_NUM_RESOURCES,
 };
 
+#define for_each_capable_rdt_resource(r) \
+   for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+r++) \
+   if (r->alloc_capable || r->mon_capable)
+
 #define for_each_alloc_capable_rdt_resource(r)   \
for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
 r++) \
@@ -360,5 +365,9 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
 int rdt_get_mon_l3_config(struct rdt_resource *r);
 void mon_event_count(void *info);
 int rdtgroup_mondata_show(struct seq_file *m, void *arg);
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+   unsigned int dom_id);
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+   struct rdt_domain *d);
 
 #endif /* _ASM_X86_INTEL_RDT_H */

[PATCH 24/28] x86/intel_rdt/cqm: Add sched_in support

2017-07-25 Thread Vikas Shivappa
The OS associates an RMID/CLOSid with a task by writing the per-CPU
IA32_PQR_ASSOC MSR when the task is scheduled in.

The sched_in code stays a no-op unless we are running on an Intel SKU
that supports resource control or monitoring and they are enabled by
mounting the resctrl fs.  The per-CPU CLOSid/RMID values are cached and
the MSR write is performed only when a task with a different
CLOSid/RMID is scheduled in.
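
To make the MSR layout concrete: IA32_PQR_ASSOC (0x0c8f) carries the RMID in
its low bits and the CLOSID in its upper 32 bits, so the cached state maps
directly onto the wrmsr(msr, lo, hi) form. A small sketch (the struct layout
and helper name here are for illustration only):

#include <linux/types.h>
#include <asm/msr.h>

#define EXAMPLE_IA32_PQR_ASSOC	0x0c8f

struct example_pqr_state {
	u32 rmid;	/* low 32 bits of the MSR  */
	u32 closid;	/* high 32 bits of the MSR */
};

/* Any update rewrites both halves, hence the need to cache them. */
static void example_write_pqr(const struct example_pqr_state *s)
{
	wrmsr(EXAMPLE_IA32_PQR_ASSOC, s->rmid, s->closid);
}

Comparing the cached (closid, rmid) pair against the incoming task's values
before calling this is what keeps the common case down to a couple of loads.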

Signed-off-by: Vikas Shivappa 
---
 arch/x86/include/asm/intel_rdt_sched.h | 50 --
 arch/x86/kernel/cpu/intel_rdt.h|  4 ---
 2 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/intel_rdt_sched.h 
b/arch/x86/include/asm/intel_rdt_sched.h
index 8c5be01..3badc0a 100644
--- a/arch/x86/include/asm/intel_rdt_sched.h
+++ b/arch/x86/include/asm/intel_rdt_sched.h
@@ -15,7 +15,8 @@
  *
  * The upper 32 bits of IA32_PQR_ASSOC contain closid and the
  * lower 10 bits rmid. The update to IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
+ * contains both parts, so we need to cache them. This also
+ * stores the user configured per cpu CLOSID and RMID.
  *
  * The cache also helps to avoid pointless updates if the value does
  * not change.
@@ -30,38 +31,45 @@ struct intel_pqr_state {
 
 DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
 DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
 
 /*
- * __intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
  *
  * Following considerations are made so that this has minimal impact
  * on scheduler hot path:
  * - This will stay as no-op unless we are running on an Intel SKU
- *   which supports resource control and we enable by mounting the
- *   resctrl file system.
- * - Caches the per cpu CLOSid values and does the MSR write only
- *   when a task with a different CLOSid is scheduled in.
- *
+ *   which supports resource control or monitoring and we enable by
+ *   mounting the resctrl file system.
+ * - Caches the per cpu CLOSid/RMID values and does the MSR write only
+ *   when a task with a different CLOSid/RMID is scheduled in.
+ * - We allocate RMIDs/CLOSids globally in order to keep this as
+ *   simple as possible.
  * Must be called with preemption disabled.
  */
-static inline void __intel_rdt_sched_in(void)
+static void __intel_rdt_sched_in(void)
 {
+   struct intel_pqr_state newstate = this_cpu_read(rdt_cpu_default);
+   struct intel_pqr_state *curstate = this_cpu_ptr(_state);
+
+   /*
+* If this task has a closid/rmid assigned, use it.
+* Else use the closid/rmid assigned to this cpu.
+*/
if (static_branch_likely(_alloc_enable_key)) {
-   struct intel_pqr_state *state = this_cpu_ptr(_state);
-   u32 closid;
+   if (current->closid)
+   newstate.closid = current->closid;
+   }
 
-   /*
-* If this task has a closid assigned, use it.
-* Else use the closid assigned to this cpu.
-*/
-   closid = current->closid;
-   if (closid == 0)
-   closid = this_cpu_read(rdt_cpu_default.closid);
+   if (static_branch_likely(_mon_enable_key)) {
+   if (current->rmid)
+   newstate.rmid = current->rmid;
+   }
 
-   if (closid != state->closid) {
-   state->closid = closid;
-   wrmsr(IA32_PQR_ASSOC, state->rmid, closid);
-   }
+   if (newstate.closid != curstate->closid ||
+   newstate.rmid != curstate->rmid) {
+   *curstate = newstate;
+   wrmsr(IA32_PQR_ASSOC, newstate.rmid, newstate.closid);
}
 }
 
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 92a5d30..7fcaa5f 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -22,8 +22,6 @@
 #define RMID_VAL_ERROR BIT_ULL(63)
 #define RMID_VAL_UNAVAIL   BIT_ULL(62)
 
-DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
-
 /**
  * struct mon_evt - Entry in the event list of a resource
  * @evtid: event id
@@ -61,8 +59,6 @@ struct rmid_read {
 extern bool rdt_mon_capable;
 extern unsigned int rdt_mon_features;
 
-DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
-
 enum rdt_group_type {
RDTCTRL_GROUP = 0,
RDTMON_GROUP,
-- 
1.9.1



[PATCH 04/28] x86/intel_rdt: Change file names to accommodate RDT monitor code

2017-07-25 Thread Vikas Shivappa
Because the "perf cqm" and resctrl code were added separately and are
individually configurable, there is separate context-switch code and
there are declarations in a global header which are not really needed.

Move only the scheduling-specific code and definitions to
<asm/intel_rdt_sched.h> and put all the other declarations into a
local intel_rdt.h.

h/t to Reinette Chatre for pointing out that we should separate the
public interfaces used by other parts of the kernel from private
objects shared between the various files comprising RDT.

No functional change.
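
After the move, the only interface the rest of the kernel needs is the sched
hook; the context-switch paths (process_32.c/process_64.c in the diffstat)
simply include the new header, roughly:

/* sketch of the expected call site, not a quote of process_64.c */
#include <asm/intel_rdt_sched.h>

	/* ...at the tail of __switch_to(), with preemption disabled... */
	intel_rdt_sched_in();

Everything else (struct rdtgroup, struct rdt_domain, struct rftype, ...)
becomes private to arch/x86/kernel/cpu/intel_rdt.h.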

Signed-off-by: Vikas Shivappa 
---
 MAINTAINERS  |   2 +-
 arch/x86/include/asm/intel_rdt.h | 286 ---
 arch/x86/include/asm/intel_rdt_common.h  |  25 ---
 arch/x86/include/asm/intel_rdt_sched.h   |  72 
 arch/x86/kernel/cpu/intel_rdt.c  |   5 +-
 arch/x86/kernel/cpu/intel_rdt.h  | 243 ++
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c |   4 +-
 arch/x86/kernel/cpu/intel_rdt_schemata.c |   2 +-
 arch/x86/kernel/process_32.c |   2 +-
 arch/x86/kernel/process_64.c |   2 +-
 10 files changed, 324 insertions(+), 319 deletions(-)
 delete mode 100644 arch/x86/include/asm/intel_rdt.h
 delete mode 100644 arch/x86/include/asm/intel_rdt_common.h
 create mode 100644 arch/x86/include/asm/intel_rdt_sched.h
 create mode 100644 arch/x86/kernel/cpu/intel_rdt.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 767e9d2..8388699 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10733,7 +10733,7 @@ M:  Fenghua Yu 
 L: linux-kernel@vger.kernel.org
 S: Supported
 F: arch/x86/kernel/cpu/intel_rdt*
-F: arch/x86/include/asm/intel_rdt*
+F: arch/x86/include/asm/intel_rdt_sched.h
 F: Documentation/x86/intel_rdt*
 
 READ-COPY UPDATE (RCU)
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
deleted file mode 100644
index ae1efc3..000
--- a/arch/x86/include/asm/intel_rdt.h
+++ /dev/null
@@ -1,286 +0,0 @@
-#ifndef _ASM_X86_INTEL_RDT_H
-#define _ASM_X86_INTEL_RDT_H
-
-#ifdef CONFIG_INTEL_RDT
-
-#include 
-#include 
-#include 
-
-#include 
-
-#define IA32_L3_QOS_CFG0xc81
-#define IA32_L3_CBM_BASE   0xc90
-#define IA32_L2_CBM_BASE   0xd10
-#define IA32_MBA_THRTL_BASE0xd50
-
-#define L3_QOS_CDP_ENABLE  0x01ULL
-
-/**
- * struct rdtgroup - store rdtgroup's data in resctrl file system.
- * @kn:kernfs node
- * @rdtgroup_list: linked list for all rdtgroups
- * @closid:closid for this rdtgroup
- * @cpu_mask:  CPUs assigned to this rdtgroup
- * @flags: status bits
- * @waitcount: how many cpus expect to find this
- * group when they acquire rdtgroup_mutex
- */
-struct rdtgroup {
-   struct kernfs_node  *kn;
-   struct list_headrdtgroup_list;
-   int closid;
-   struct cpumask  cpu_mask;
-   int flags;
-   atomic_twaitcount;
-};
-
-/* rdtgroup.flags */
-#defineRDT_DELETED 1
-
-/* rftype.flags */
-#define RFTYPE_FLAGS_CPUS_LIST 1
-
-/* List of all resource groups */
-extern struct list_head rdt_all_groups;
-
-extern int max_name_width, max_data_width;
-
-int __init rdtgroup_init(void);
-
-/**
- * struct rftype - describe each file in the resctrl file system
- * @name:  File name
- * @mode:  Access mode
- * @kf_ops:File operations
- * @flags: File specific RFTYPE_FLAGS_* flags
- * @seq_show:  Show content of the file
- * @write: Write to the file
- */
-struct rftype {
-   char*name;
-   umode_t mode;
-   struct kernfs_ops   *kf_ops;
-   unsigned long   flags;
-
-   int (*seq_show)(struct kernfs_open_file *of,
-   struct seq_file *sf, void *v);
-   /*
-* write() is the generic write callback which maps directly to
-* kernfs write operation and overrides all other operations.
-* Maximum write size is determined by ->max_write_len.
-*/
-   ssize_t (*write)(struct kernfs_open_file *of,
-char *buf, size_t nbytes, loff_t off);
-};
-
-/**
- * struct rdt_domain - group of cpus sharing an RDT resource
- * @list:  all instances of this resource
- * @id:unique id for this instance
- * @cpu_mask:  which cpus share this resource
- * @ctrl_val:  array of cache or mem ctrl values (indexed by CLOSID)
- * @new_ctrl:  new ctrl value to be loaded
- * @have_new_ctrl: did user provide new_ctrl for this domain
- */
-struct rdt_domain {
-   struct list_headlist;
-   int id;
-   struct cpumask  cpu_mask;
-   u32 *ctrl_val;
-   u32 new_ctrl;
-   boolhave_new_ctrl;
-};
-
-/**

[PATCH 28/28] x86/intel_rdt/mbm: Handle counter overflow

2017-07-25 Thread Vikas Shivappa
Set up a delayed work queue for each domain that will read all
the MBM counters of active RMIDs once per second to make sure
that they don't wrap around between reads from users.

[Tony: Added the initializations for the work structure and completed
the patch]
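
A rough sketch of the arm/re-arm pattern (helper names are prefixed example_;
the real handler walks the rdtgroups under rdtgroup_mutex rather than a bare
RMID loop):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static u32 example_num_rmid;	/* stand-in for r->num_rmid */

static void example_setup_overflow_handler(struct rdt_domain *dom)
{
	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);

	/* Pin the worker to a CPU that belongs to this domain. */
	dom->mbm_work_cpu = cpumask_any(&dom->cpu_mask);
	schedule_delayed_work_on(dom->mbm_work_cpu, &dom->mbm_over, delay);
}

static void example_handle_overflow(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct rdt_domain *d = container_of(dwork, struct rdt_domain, mbm_over);
	u32 rmid;

	/* Fold every RMID's raw counter into its 64-bit running total. */
	for (rmid = 0; rmid < example_num_rmid; rmid++)
		mbm_update(d, rmid);

	/* Re-arm so no counter goes unread for more than ~1 second. */
	schedule_delayed_work_on(d->mbm_work_cpu, dwork,
				 msecs_to_jiffies(MBM_OVERFLOW_INTERVAL));
}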

Signed-off-by: Tony Luck 
Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt.c  | 21 ---
 arch/x86/kernel/cpu/intel_rdt.h  | 10 +
 arch/x86/kernel/cpu/intel_rdt_monitor.c  | 63 
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c |  9 +
 4 files changed, 97 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 767b4c3..b906e0e 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -348,12 +348,10 @@ void rdt_ctrl_update(void *arg)
int cpu = smp_processor_id();
struct rdt_domain *d;
 
-   list_for_each_entry(d, >domains, list) {
-   /* Find the domain that contains this CPU */
-   if (cpumask_test_cpu(cpu, >cpu_mask)) {
-   r->msr_update(d, m, r);
-   return;
-   }
+   d = get_domain_from_cpu(cpu, r);
+   if (d) {
+   r->msr_update(d, m, r);
+   return;
}
pr_warn_once("cpu %d not found in any domain for resource %s\n",
 cpu, r->name);
@@ -447,6 +445,11 @@ static int domain_setup_mon_state(struct rdt_resource *r, 
struct rdt_domain *d)
}
}
 
+   if (is_mbm_enabled()) {
+   INIT_DELAYED_WORK(>mbm_over, mbm_handle_overflow);
+   mbm_setup_overflow_handler(d);
+   }
+
return 0;
 }
 
@@ -531,7 +534,13 @@ static void domain_remove_cpu(int cpu, struct rdt_resource 
*r)
kfree(d->mbm_total);
kfree(d->mbm_local);
list_del(>list);
+   if (is_mbm_enabled())
+   cancel_delayed_work(>mbm_over);
kfree(d);
+   } else if (r == _resources_all[RDT_RESOURCE_L3] &&
+  cpu == d->mbm_work_cpu && is_mbm_enabled()) {
+   cancel_delayed_work(>mbm_over);
+   mbm_setup_overflow_handler(d);
}
 }
 
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index f160403..94e488a 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -21,10 +21,13 @@
 #define QOS_L3_MBM_LOCAL_EVENT_ID  0x03
 
 #define MBM_CNTR_WIDTH 24
+#define MBM_OVERFLOW_INTERVAL  1000
 
 #define RMID_VAL_ERROR BIT_ULL(63)
 #define RMID_VAL_UNAVAIL   BIT_ULL(62)
 
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
 /**
  * struct mon_evt - Entry in the event list of a resource
  * @evtid: event id
@@ -183,6 +186,9 @@ struct mbm_state {
  * bitmap of which limbo RMIDs are above threshold
  * @mbm_total: saved state for MBM total bandwidth
  * @mbm_local: saved state for MBM local bandwidth
+ * @mbm_over:  worker to periodically read MBM h/w counters
+ * @mbm_work_cpu:
+ * worker cpu for MBM h/w counters
  * @ctrl_val:  array of cache or mem ctrl values (indexed by CLOSID)
  * @new_ctrl:  new ctrl value to be loaded
  * @have_new_ctrl: did user provide new_ctrl for this domain
@@ -194,6 +200,8 @@ struct rdt_domain {
unsigned long   *rmid_busy_llc;
struct mbm_state*mbm_total;
struct mbm_state*mbm_local;
+   struct delayed_work mbm_over;
+   int mbm_work_cpu;
u32 *ctrl_val;
u32 new_ctrl;
boolhave_new_ctrl;
@@ -411,5 +419,7 @@ void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
struct rdt_domain *d);
 void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
struct rdtgroup *rdtgrp, int evtid, int first);
+void mbm_setup_overflow_handler(struct rdt_domain *dom);
+void mbm_handle_overflow(struct work_struct *work);
 
 #endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c 
b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 383a023..d6bfdfd 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -366,6 +366,69 @@ void mon_event_count(void *info)
}
 }
 
+static void mbm_update(struct rdt_domain *d, int rmid)
+{
+   struct rmid_read rr;
+
+   rr.first = false;
+   rr.d = d;
+
+   /*
+* This is protected from concurrent reads from user
+* as both the user and we hold the global mutex.
+*/
+   if (is_mbm_total_enabled()) {
+   rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
+   __mon_event_count(rmid, );
+   }
+   if (is_mbm_local_enabled()) {
+   rr.evtid = 

[PATCH 27/28] x86/intel_rdt/mbm: Add mbm counter initialization

2017-07-25 Thread Vikas Shivappa
MBM counters are monotonically increasing counts representing the total
memory bytes seen up to a particular time. In order to calculate
total_bytes for an rdtgroup, we store the value of the counter when we
create an rdtgroup or when a new domain comes online.

When the total_bytes (all memory controller bytes) or local_bytes (local
memory controller bytes) file in "mon_data" is read, it shows the total
bytes for that rdtgroup since its creation. The user can snapshot this
at different time intervals to obtain bytes/second.
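
The wrap-safe arithmetic behind those totals (visible in the monitor.c hunk
below) is worth spelling out: the raw counter is only MBM_CNTR_WIDTH = 24
bits wide, so the delta is formed in the top bits of a u64 and shifted back
down, which absorbs a single wrap between reads. A standalone illustration in
plain C (the values are invented):

#include <stdint.h>
#include <stdio.h>

#define MBM_CNTR_WIDTH	24

/* Wrap-safe delta of a 24-bit counter, in hardware "chunks". */
static uint64_t mbm_delta(uint64_t prev_msr, uint64_t cur_msr)
{
	unsigned int shift = 64 - MBM_CNTR_WIDTH;

	return ((cur_msr << shift) - (prev_msr << shift)) >> shift;
}

int main(void)
{
	/* The counter wrapped: 0xfffff0 -> 0x000010 is 0x20 chunks of traffic. */
	printf("%llu\n", (unsigned long long)mbm_delta(0xfffff0, 0x000010));
	return 0;
}

The kernel multiplies these chunk counts by the CPUID-reported scaling factor
(mon_scale) to report bytes, and a user dividing two snapshots of total_bytes
by the elapsed time gets bandwidth in bytes/second.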

Signed-off-by: Vikas Shivappa 
---
 arch/x86/kernel/cpu/intel_rdt.h | 9 +
 arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 5 +++--
 arch/x86/kernel/cpu/intel_rdt_monitor.c | 7 +++
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c| 4 
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 2137d5e..f160403 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -55,6 +55,7 @@ struct rmid_read {
struct rdtgroup *rgrp;
struct rdt_domain   *d;
int evtid;
+   boolfirst;
u64 val;
 };
 
@@ -263,6 +264,12 @@ static inline bool is_mbm_enabled(void)
return (is_mbm_total_enabled() || is_mbm_local_enabled());
 }
 
+static inline bool is_mbm_event(int e)
+{
+   return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
+   e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
 /**
  * struct rdt_resource - attributes of an RDT resource
  * @rid:   The index of the resource
@@ -402,5 +409,7 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
unsigned int dom_id);
 void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
struct rdt_domain *d);
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+   struct rdtgroup *rdtgrp, int evtid, int first);
 
 #endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c 
b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index cf8e2c7..f6ea94f 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -287,7 +287,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
 }
 
 void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
-   struct rdtgroup *rdtgrp, int evtid)
+   struct rdtgroup *rdtgrp, int evtid, int first)
 {
/*
 * setup the parameters to send to the IPI to read the data.
@@ -296,6 +296,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_domain 
*d,
rr->evtid = evtid;
rr->d = d;
rr->val = 0;
+   rr->first = first;
 
smp_call_function_any(>cpu_mask, mon_event_count, rr, 1);
 }
@@ -325,7 +326,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
goto out;
}
 
-   mon_event_read(, d, rdtgrp, evtid);
+   mon_event_read(, d, rdtgrp, evtid, false);
 
if (rr.val & RMID_VAL_ERROR)
seq_puts(m, "Error\n");
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c 
b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index ef0358b..383a023 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -321,6 +321,13 @@ static int __mon_event_count(u32 rmid, struct rmid_read 
*rr)
 */
return -EINVAL;
}
+
+   if (rr->first) {
+   m->prev_msr = tval;
+   m->chunks = 0;
+   return 0;
+   }
+
shift = 64 - MBM_CNTR_WIDTH;
chunks = (tval << shift) - (m->prev_msr << shift);
chunks >>= shift;
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 90a22d3..50b5d03 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -1352,6 +1352,7 @@ static int mkdir_mondata_subdir(struct kernfs_node 
*parent_kn,
union mon_data_bits priv;
struct kernfs_node *kn;
struct mon_evt *mevt;
+   struct rmid_read rr;
char name[32];
int ret;
 
@@ -1382,6 +1383,9 @@ static int mkdir_mondata_subdir(struct kernfs_node 
*parent_kn,
ret = mon_addfile(kn, mevt->name, priv.priv);
if (ret)
goto out_destroy;
+
+   if (is_mbm_event(mevt->evtid))
+   mon_event_read(, d, prgrp, mevt->evtid, true);
}
kernfs_activate(kn);
return 0;
-- 
1.9.1


