Attached is an early version of the ioctl-based CAT interface we
have been working on.

NOTE: it does not compile and there is no locking yet, but it should
be sufficient for interested people to comment on.


diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db3622f..293726b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -757,6 +757,14 @@ config HPET_EMULATE_RTC
        def_bool y
        depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || 
RTC_DRV_CMOS=y)
 
+config CACHE_RESERVATION
+       tristate "Cache Reservation Support"
+       default n
+       ---help---
+         This feature makes use of Intel's Cache Allocation Technology to
+         allow the reservation of portions of the L3 cache to specific
+         tasks. Please see Documentation/x86/cache-reservation.txt for
+         more information.
+
 config APB_TIMER
        def_bool y if X86_INTEL_MID
        prompt "Intel MID APB Timer Support" if X86_INTEL_MID
diff --git a/arch/x86/include/uapi/asm/cache_reservation.h 
b/arch/x86/include/uapi/asm/cache_reservation.h
new file mode 100644
index 0000000..c4dcc95
--- /dev/null
+++ b/arch/x86/include/uapi/asm/cache_reservation.h
@@ -0,0 +1,64 @@
+/* Flag bits for the "flags" member of struct cat_reservation. */
+enum cache_rsvt_flags {
+       CACHE_RSVT_ROUND_DOWN   =       (1 << 0),  /* round kbytes down */
+};
+
+/* Values for the "type" member of struct cat_reservation. */
+enum cache_rsvt_type {
+       CACHE_RSVT_TYPE_CODE = 0, /* cache reservation is for code */
+       CACHE_RSVT_TYPE_DATA,     /* cache reservation is for data */
+       CACHE_RSVT_TYPE_BOTH,     /* cache reservation is for both */
+};
+
+/*
+ * Description of one cache reservation, exchanged with user space.
+ * On creation the kernel writes the rounded size and the assigned tcrid
+ * back into this structure.
+ */
+struct cat_reservation {
+       __u64 kbytes;   /* reservation size in kbytes */
+       __u32 type;     /* one of enum cache_rsvt_type */
+       __u32 flags;    /* 0 or CACHE_RSVT_ROUND_DOWN */
+       __u32 tcrid;    /* reservation id, filled in by the kernel */
+       __u32 pad[11];  /* presumably reserved for future use - TODO confirm */
+};
+
+/*
+ * Argument of CAT_CREATE_RESERVATION: a reservation plus the set of CPUs
+ * it applies to.
+ *
+ * NOTE(review): size_t and a raw pointer have different sizes for 32-bit
+ * and 64-bit user space, and cpu_set_t is a user-space (libc) type;
+ * confirm how compat callers are meant to be handled.
+ */
+struct cat_reservation_cpumask {
+       size_t cpusetsize;              /* size in bytes of *mask */
+       cpu_set_t *mask;                /* user pointer to the CPU set */
+       struct cat_reservation res;     /* the reservation itself */
+};
+
+/*
+ * Argument of CAT_ATTACH_RESERVATION / CAT_DETACH_RESERVATION:
+ * names a reservation and the task it is attached to or detached from.
+ */
+struct pid_cat_reservation {
+       __u32 tcrid;    /* reservation id */
+       __s32 pid; 
+       __u32 pad[8];   /* presumably reserved for future use - TODO confirm */
+};
+
+/* Argument of CAT_DELETE_RESERVATION: the id of the reservation to delete. */
+struct cat_tcrid {
+       __u32 tcrid;    /* reservation id */
+       __u32 pad[7];   /* presumably reserved for future use - TODO confirm */
+};
+
+/*
+ * Argument of CAT_GET_RESERVATIONS: user-supplied buffers that receive
+ * the list of existing reservations and one CPU mask per reservation.
+ */
+struct cat_reservation_list {
+       /* -- input -- */
+       struct cat_reservation *res;
+       /* how many bytes allocated for list */
+       size_t cat_res_size;
+       cpu_set_t *mask;
+       /* how many bytes allocated for mask */
+       size_t cpusetsize;
+
+       /* -- output -- */
+       /* size of each cpu_set_t entry copied to 
+         * cpu_set_t *mask
+         */
+       size_t cpumask_size;
+       __u32 pad[11];
+};
+
+/*
+ * Argument of CAT_GET_TCRID_TASKS.
+ * NOTE(review): no handler for this ioctl is visible in this patch; the
+ * member descriptions below are inferred from their names - confirm.
+ */
+struct cat_tcrid_tasks {
+       __u32 tcrid;            /* reservation id */
+       size_t nr_entries;      /* presumably the capacity of *list */
+       /* pid_t is a typedef, not a struct tag */
+       pid_t *list;
+};
+
+/*
+ * ioctl numbers of the cache reservation device.
+ *
+ * NOTE(review): CATIO (the ioctl magic) is not defined anywhere in this
+ * patch. The direction bits also look inconsistent with the handlers:
+ * CAT_CREATE_RESERVATION and CAT_GET_RESERVATIONS copy data back to user
+ * space (suggesting _IOWR), while CAT_DELETE_RESERVATION only reads its
+ * argument (suggesting _IOW). They are left untouched here because they
+ * define the ioctl numbers; settle them before the ABI is frozen.
+ */
+#define CAT_CREATE_RESERVATION _IOW(CATIO, 0x00, struct cat_reservation_cpumask)
+#define CAT_DELETE_RESERVATION _IOR(CATIO,  0x01, struct cat_tcrid)
+#define CAT_ATTACH_RESERVATION _IOW(CATIO, 0x02, struct pid_cat_reservation)
+#define CAT_DETACH_RESERVATION _IOW(CATIO, 0x03, struct pid_cat_reservation)
+#define CAT_GET_RESERVATIONS   _IOW(CATIO, 0x04, struct cat_reservation_list)
+#define CAT_GET_TCRID_TASKS     _IOW(CATIO, 0x05, struct cat_tcrid_tasks)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b1b78ff..57129d6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -110,6 +110,8 @@ obj-$(CONFIG_EFI)                   += sysfb_efi.o
 obj-$(CONFIG_PERF_EVENTS)              += perf_regs.o
 obj-$(CONFIG_TRACING)                  += tracepoint.o
 
+obj-$(CONFIG_CACHE_RESERVATION)        += cat/ 
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cat/Makefile b/arch/x86/kernel/cat/Makefile
new file mode 100644
index 0000000..031fd64
--- /dev/null
+++ b/arch/x86/kernel/cat/Makefile
@@ -0,0 +1 @@
+obj-y += cache_reservation.o
diff --git a/arch/x86/kernel/cat/cache_reservation.c 
b/arch/x86/kernel/cat/cache_reservation.c
new file mode 100644
index 0000000..4187a57
--- /dev/null
+++ b/arch/x86/kernel/cat/cache_reservation.c
@@ -0,0 +1,1244 @@
+
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <linux/kernel.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpumask.h>
+#include <linux/topology.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include "cache_reservation.h"
+#include <uapi/asm/cache_reservation.h>
+#include <asm/uaccess.h>
+
+/*
+ *
+ * There are two main data structures: tcrid entries, and tcrid lists.
+ * A tcrid entry contains size,type information and is used 
+ * to identify a cache allocation reservation.
+ * One task should not allocate more than one tcrid per type 
+ * unless that tcrid is to be shared with a different task.
+ * A tcrid list is a set of tcrid entries, and is mapped to (used by)
+ * one or more tasks.
+ * Each task is mapped to only one tcrid list.
+ * A tcrid entry can be in one or more tcrid lists at the same time.
+ *
+ * Mapping to Intel CAT:
+ *     * tcrid list maps one-to-one to a COS-ID.
+ *     * tcrid entry represents a range of bits 
+ *       in a number of (one or more) Cache Capacity Bitmasks,
+ *       which are specified in HW via IA32_L3_MASK_n MSRs.
+ *     * one tcrid entry can be in different locations 
+ *       in different sockets.
+ *     * tcrid entries of a tcrid list must be mapped contiguously
+ *       in hardware.
+ *
+ */
+
+/* NOTE(review): not referenced in the visible part of this file. */
+unsigned long *closmap; 
+
+/* All tcr_list objects, linked through tcr_list.global_link. */
+LIST_HEAD(tcr_global_list);
+/* Held while tcrlist ids and tcr_global_list are modified. */
+DEFINE_MUTEX(tcr_list_mutex);
+
+/* Bit N set means tcrid_table[N] is an allocated reservation. */
+DECLARE_BITMAP(tcrid_used_bitmap, CBM_LEN);
+/* Table of reservations, indexed by tcrid. */
+struct tcr_entry *tcrid_table;
+/* NOTE(review): not referenced in the visible part of this file. */
+static unsigned int total_tcrentry_bits;
+
+/* Upper bound for a reservation; compared against kbytes values. */
+static unsigned int l3_cache_size;
+//static u32 max_closid;
+/* Number of bits in a layout's closmap bitmap. */
+static u32 max_cbm_len;
+/* Cache size, in kbytes, represented by one CBM bit. */
+static unsigned int kbytes_per_cbm_bit;
+/* Number of CBM bits that can be handed out per socket. */
+static unsigned int l3_nr_cbm_bits;
+
+/* Used to size the per-tcrlist synced_to_socket bitmap. */
+static unsigned int max_sockets;
+
+/*
+ * One distinct cache layout, identified by the set of cache ways that is
+ * shared with hardware. Layouts are looked up and created by
+ * find_create_layout().
+ */
+struct cache_layout {
+       unsigned long *closmap;         /* bitmap of CBM bits already handed out */
+       u32 hw_shared_bitmask;          /* cache ways shared with hw */
+       int id;                         /* index taken from cache_layout_ids */
+       struct list_head link;          /* entry in layout_list */
+       int nr_users;                   /* incremented by find_create_layout() */
+};
+
+/* All known cache layouts; additions are protected by cache_layout_lock. */
+LIST_HEAD(layout_list);
+
+/* Per-socket state, obtained through get_socket_data(socket). */
+struct per_socket_data {
+       /* start, end of shared region with HW */
+       u32 hw_shared_bitmask;
+       int initialized;
+       /* written to the CBM MSR of index 0 when a task without tcrlist runs */
+       unsigned long *cosidzeromask;
+       /* cache layout used by this socket */
+       struct cache_layout *layout;
+       /* number of CBM bits currently accounted to reservations */
+       unsigned int occupied_cbm_bits;
+};
+
+/* NOTE(review): presumably the array behind get_socket_data() - confirm. */
+struct per_socket_data *psd;
+static unsigned int psd_size;
+
+/*
+ * CDP capable hardware: CDP-on by default.
+ * The read-only "cdp" module parameter selects the mode; setting it to 0
+ * switches to plain CAT.
+ * NOTE(review): the parameter used to be registered under the name "ept"
+ * (a copy-paste leftover) while this comment advertised a non-existent
+ * "intel_cat_mode=cat" option - confirm the intended parameter name.
+ */
+static bool __read_mostly enable_cdp = true;
+module_param_named(cdp, enable_cdp, bool, S_IRUGO);
+
+/* protects addition to layout_list */
+static DEFINE_RAW_SPINLOCK(cache_layout_lock);
+
+/* Bit N set means layout id N is in use. */
+DECLARE_BITMAP(cache_layout_ids, MAX_LAYOUTS);
+
+/*
+ * Look up the cache layout whose hardware-shared bitmask equals
+ * @hw_shared_bitmask, creating and registering a new one if none exists.
+ * The nr_users count of the returned layout is incremented.
+ *
+ * When all MAX_LAYOUTS ids are taken, an error is logged and the first
+ * layout on layout_list is handed out instead.
+ * NOTE(review): the original comment said "reuse id 0", but list_add()
+ * inserts at the head, so the first entry is the most recently added
+ * layout - confirm which one is intended.
+ * NOTE(review): layout->closmap is not allocated here although
+ * check_contiguous_region() reads it - confirm where it is set up.
+ *
+ * Panics if the allocation fails (unchanged from the original).
+ */
+struct cache_layout *find_create_layout(u32 hw_shared_bitmask)
+{
+       struct cache_layout *l;
+       unsigned int id;
+
+       raw_spin_lock(&cache_layout_lock);
+
+       /* only a layout with a matching bitmask may be reused */
+       list_for_each_entry(l, &layout_list, link) {
+               if (l->hw_shared_bitmask == hw_shared_bitmask)
+                       goto out_ref;
+       }
+
+       /* pick the id before allocating so that exhaustion cannot leak */
+       id = find_first_zero_bit(cache_layout_ids, MAX_LAYOUTS);
+       if (id >= MAX_LAYOUTS) {
+               printk(KERN_ERR "intel_cat: MAX_LAYOUTS exceeded\n");
+               l = list_first_entry(&layout_list, struct cache_layout, link);
+               goto out_ref;
+       }
+
+       /* kzalloc() takes (size, flags); atomic because the lock is held */
+       l = kzalloc(sizeof(*l), GFP_ATOMIC);
+       if (!l)
+               panic("%s alloc failed", __func__);
+
+       l->hw_shared_bitmask = hw_shared_bitmask;
+       l->id = id;
+       set_bit(id, cache_layout_ids);
+       list_add(&l->link, &layout_list);
+out_ref:
+       l->nr_users++;
+       raw_spin_unlock(&cache_layout_lock);
+       return l;
+}
+
+/* Width, in bits, of each entry's tcrlist_bmap. */
+u32 maxtcrlist_id;
+
+/*
+ * Allocate tcrid_table with CBM_LEN entries and, for every entry, a
+ * zeroed bitmap of maxtcrlist_id bits recording which tcrlists reference
+ * it. maxtcrlist_id is taken from the boot CPU's x86_cache_max_closid.
+ *
+ * Returns 0 on success or -ENOMEM, in which case nothing stays allocated.
+ */
+int alloc_tcrid_table(void)
+{
+       struct cpuinfo_x86 *c = &boot_cpu_data;
+       size_t bmap_size;
+       int i;
+
+       maxtcrlist_id = c->x86_cache_max_closid;
+       bmap_size = BITS_TO_LONGS(maxtcrlist_id) * sizeof(unsigned long);
+
+       /* CBM_LEN entries, not CBM_LEN bytes */
+       tcrid_table = kcalloc(CBM_LEN, sizeof(*tcrid_table), GFP_KERNEL);
+       if (!tcrid_table)
+               return -ENOMEM;
+
+       for (i = 0; i < CBM_LEN; i++) {
+               struct tcr_entry *e = &tcrid_table[i];
+
+               e->tcrid = i;
+               e->tcrlist_bmap = kzalloc(bmap_size, GFP_KERNEL);
+               if (!e->tcrlist_bmap)
+                       goto out_err;
+       }
+
+       return 0;
+
+out_err:
+       /* entries [0, i) own a bitmap; entry i failed */
+       while (i-- > 0)
+               kfree(tcrid_table[i].tcrlist_bmap);
+       kfree(tcrid_table);
+       tcrid_table = NULL;
+       return -ENOMEM;
+}
+
+
+#define reserved_cbm_bits 2
+/*
+ * Charge @cbm_bits CBM bits to the socket of every CPU in the mask of
+ * @crmask. Nothing is charged unless every CPU is online and every
+ * socket has enough free bits.
+ *
+ * Returns 0 on success and 1 on failure.
+ *
+ * NOTE(review): crmask->mask is the pointer received from user space;
+ * it must be replaced by the kernel copy of the mask before it can be
+ * iterated with for_each_cpu().
+ * NOTE(review): a socket is charged once per CPU of the mask that
+ * belongs to it, not once per socket - confirm this is intended.
+ */
+int account_cbm_bits(struct cat_reservation_cpumask *crmask,
+                    unsigned int cbm_bits)
+{
+       unsigned int cpu;
+
+       /* first pass: validate without modifying anything */
+       for_each_cpu(cpu, crmask->mask) {
+               struct per_socket_data *sd;
+               unsigned int free_cbm_bits;
+
+               if (!cpu_online(cpu))
+                       return 1;
+
+               sd = get_socket_data(topology_physical_package_id(cpu));
+               free_cbm_bits = l3_nr_cbm_bits - sd->occupied_cbm_bits;
+               if (cbm_bits > free_cbm_bits)
+                       return 1;
+       }
+
+       /* second pass: commit */
+       for_each_cpu(cpu, crmask->mask) {
+               struct per_socket_data *sd;
+
+               sd = get_socket_data(topology_physical_package_id(cpu));
+               sd->occupied_cbm_bits += cbm_bits;
+       }
+       return 0;
+}
+
+/*
+ * Give back the CBM bits of @e to the socket of every CPU in its mask.
+ *
+ * Returns 0 on success, or 1 as soon as an offline CPU is met; sockets
+ * visited before that point have already been credited.
+ */
+int deaccount_cbm_bits(struct tcr_entry *e)
+{
+       unsigned int cpu;
+
+       /*
+        * The mask is attached only after the entry has been created, so
+        * an early error path can get here without one.
+        * NOTE(review): in that case the bits charged by
+        * account_cbm_bits() are not returned - needs a proper fix.
+        */
+       if (!e->cpumask)
+               return 0;
+
+       /* the rest of the file names this member "cpumask", not "mask" */
+       for_each_cpu(cpu, e->cpumask) {
+               struct per_socket_data *sd;
+
+               /* FIXME:
+                * 
+                *   1) alloc reservation
+                *   2) cpu offline
+                *   3) dealloc reservation
+                *   4) cpu online
+                */
+               if (!cpu_online(cpu))
+                       return 1;
+
+               sd = get_socket_data(topology_physical_package_id(cpu));
+               sd->occupied_cbm_bits -= e->cbm_bits;
+       }
+       return 0;
+}
+
+/*
+ * Reserve the first free slot of tcrid_table and charge @cbm_bits to the
+ * sockets covered by @crmask.
+ *
+ * Returns the entry, or ERR_PTR(-ENOMEM) when no slot is free or the
+ * bits cannot be accounted.
+ */
+struct tcr_entry *alloc_tcr_entry(struct cat_reservation_cpumask *crmask,
+                                 unsigned int cbm_bits)
+{
+       int i;
+
+       i = find_first_zero_bit(tcrid_used_bitmap, CBM_LEN);
+       if (i >= CBM_LEN)
+               return ERR_PTR(-ENOMEM);
+
+       /* the argument is called crmask; "cpumask" does not exist here */
+       if (account_cbm_bits(crmask, cbm_bits))
+               return ERR_PTR(-ENOMEM);
+
+       set_bit(i, tcrid_used_bitmap);
+       return &tcrid_table[i];
+}
+
+/*
+ * Translate @tcrid into its table entry.
+ * Returns ERR_PTR(-EINVAL) if the id is out of range or not in use.
+ */
+struct tcr_entry *find_tcr_entry(u32 tcrid)
+{
+       /* the range check must come first: it guards the bitmap lookup */
+       if (tcrid >= CBM_LEN || !test_bit(tcrid, tcrid_used_bitmap))
+               return ERR_PTR(-EINVAL);
+
+       return &tcrid_table[tcrid];
+}
+
+/*
+ * Release the table slot of @e: mark the id unused, warn if a tcrlist
+ * still references the entry, return its CBM bits and drop its CPU mask.
+ */
+void free_tcr_entry(struct tcr_entry *e)
+{
+       clear_bit(e->tcrid, tcrid_used_bitmap);
+       WARN_ON(!bitmap_empty(e->tcrlist_bmap, maxtcrlist_id));
+       deaccount_cbm_bits(e);
+
+       if (e->cpumask != NULL)
+               free_cpumask_var(e->cpumask);
+       e->cpumask = NULL;
+}
+
+/* Non-zero when tcrlist @l references entry @e. */
+int tcrentry_in_tcrlist(struct tcr_entry *e, struct tcr_list *l)
+{
+       int referenced = test_bit(l->id, e->tcrlist_bmap);
+
+       return referenced;
+}
+
+
+#if 0
+/*
+ * Disabled: meant to forget on which sockets @l has been synced.
+ * NOTE(review): before enabling, check the bitmap_clear() call below -
+ * it looks like it is missing its start argument, and "size" is a byte
+ * count rather than a number of bits.
+ */
+void tcrlist_changed(struct tcr_list *l)
+{
+       unsigned int size = BITS_TO_LONGS(max_sockets * NR_CPUS) * sizeof(long);
+       bitmap_clear(l->synced_to_socket, size);
+}
+#endif
+
+/*
+ * Record the association between entry @e and list @l in both bitmaps.
+ * Always returns 0.
+ */
+int add_tcrentry_to_tcrlist(struct tcr_entry *e, struct tcr_list *l)
+{
+       /* entry side: which lists reference me */
+       set_bit(l->id, e->tcrlist_bmap);
+       /* list side: which entries do I contain */
+       set_bit(e->tcrid, l->tcrentry_bmap);
+
+       return 0;
+}
+
+/*
+ * Drop the association between entry @e and list @l.
+ *
+ * When no list references @e any more, the cache regions are released;
+ * when @l contains no entry any more, it is unlinked from its tasks.
+ * NOTE(review): unlinking the last task frees @l through
+ * dec_use_count(), so callers must not touch @l afterwards.
+ *
+ * Always returns 0.
+ */
+int remove_tcrentry_from_tcrlist(struct tcr_entry *e, struct tcr_list *l)
+{
+       clear_bit(l->id, e->tcrlist_bmap);
+       clear_bit(e->tcrid, l->tcrentry_bmap);
+
+       /*
+        * no more tcrlists referencing this tcrentry: undo allocation
+        * on the cache layouts. tcrlist_bmap is already a pointer to the
+        * bitmap, so it is passed as is.
+        */
+       if (bitmap_empty(e->tcrlist_bmap, maxtcrlist_id))
+               dealloc_contiguous_regions(e, l);
+
+       /* no more tcrentries on this tcrlist: unlink it from task */
+       if (bitmap_empty(l->tcrentry_bmap, CBM_LEN))
+               unlink_tcrlist_from_tasks(l);
+
+       return 0;
+}
+
+/*
+ * Create a reservation from the request in @crmask->res.
+ *
+ * @crmask: kernel copy of the user's struct cat_reservation_cpumask
+ * @argp:   user address of that same structure
+ *
+ * The requested size is rounded to a multiple of kbytes_per_cbm_bit (down
+ * if CACHE_RSVT_ROUND_DOWN is set in flags, up otherwise). On success the
+ * actual size, type and tcrid are written to @crmask->res and copied to
+ * the "res" member of the user structure.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if a code/data-only reservation is
+ * requested without CDP, -ENOSPC if the request exceeds the L3 size,
+ * -EINVAL if it rounds to zero bits, -ENOMEM if no entry or cache space
+ * is available, -EFAULT if the result cannot be copied out.
+ *
+ * NOTE(review): the reported and stored type is forced to
+ * CACHE_RSVT_TYPE_BOTH as in the original - confirm this is a placeholder.
+ */
+int __create_cache_reservation(struct cat_reservation_cpumask *crmask,
+                              unsigned long argp)
+{
+       struct cat_reservation *cr = &crmask->res;
+       /* only the "res" member is written back, not the whole argument */
+       void __user *ures = (void __user *)argp +
+                       offsetof(struct cat_reservation_cpumask, res);
+       u64 requested = cr->kbytes;
+       struct tcr_entry *e;
+       unsigned int cbm_bits;
+       unsigned int kbytes;
+
+       /* -ENOTSUPP is kernel-internal and must not reach user space */
+       if (cr->type != CACHE_RSVT_TYPE_BOTH && !enable_cdp)
+               return -EOPNOTSUPP;
+
+       /* reject before narrowing the 64-bit request to unsigned int */
+       if (requested > l3_cache_size)
+               return -ENOSPC;
+       kbytes = requested;
+
+       /*
+        * The rounding request lives in flags, not in type. roundup() and
+        * rounddown() are used because round_up()/round_down() are only
+        * valid for power-of-two steps.
+        */
+       if (cr->flags & CACHE_RSVT_ROUND_DOWN)
+               kbytes = rounddown(kbytes, kbytes_per_cbm_bit);
+       else
+               kbytes = roundup(kbytes, kbytes_per_cbm_bit);
+
+       if (kbytes > l3_cache_size)
+               return -ENOSPC;
+
+       cbm_bits = kbytes / kbytes_per_cbm_bit;
+       if (!cbm_bits)
+               return -EINVAL;
+
+       e = alloc_tcr_entry(crmask, cbm_bits);
+       if (IS_ERR(e))
+               return PTR_ERR(e);
+
+       /*
+        * Fill in the entry before anything can fail, so that the error
+        * path below gives back exactly the bits that were charged.
+        */
+       e->user_kbytes = requested;
+       e->rounded_kbytes = kbytes;
+       e->cbm_bits = cbm_bits;
+       e->type = CACHE_RSVT_TYPE_BOTH;
+
+       /* fix up the cr with the info we got and copy to user */
+       cr->kbytes = kbytes;
+       cr->type = CACHE_RSVT_TYPE_BOTH;
+       cr->flags = 0;
+       cr->tcrid = e->tcrid;
+       if (copy_to_user(ures, cr, sizeof(*cr))) {
+               free_tcr_entry(e);
+               return -EFAULT;
+       }
+
+       return 0;
+}
+
+/*
+ * Handle CAT_CREATE_RESERVATION.
+ *
+ * @crmask: kernel copy of the user's argument
+ * @arg:    user address of the argument
+ *
+ * Reads the CPU mask from user space, creates the reservation, attaches
+ * the mask to the new entry and copies the mask back to the user buffer.
+ *
+ * Returns the number of mask bytes copied back on success, a negative
+ * errno otherwise. On failure no reservation is left behind.
+ *
+ * NOTE(review): account_cbm_bits() still iterates the user pointer in
+ * crmask->mask rather than new_mask; passing new_mask down requires an
+ * interface change.
+ */
+int create_cache_reservation(struct cat_reservation_cpumask *crmask,
+                            unsigned long arg)
+{
+       cpumask_var_t new_mask;
+       struct tcr_entry *e;
+       size_t retlen;
+       int ret;
+
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+               return -ENOMEM;
+
+       ret = get_user_cpu_mask(crmask->mask, crmask->cpusetsize,
+                               new_mask);
+       if (ret)
+               goto out_free_mask;
+
+       ret = __create_cache_reservation(crmask, arg);
+       if (ret)
+               goto out_free_mask;
+
+       /* the mask belongs to the entry, not to the uapi structure */
+       e = find_tcr_entry(crmask->res.tcrid);
+       if (IS_ERR(e)) {
+               ret = PTR_ERR(e);
+               goto out_free_mask;
+       }
+       e->cpumask = new_mask;
+
+       retlen = min_t(size_t, crmask->cpusetsize, cpumask_size());
+       if (copy_to_user(crmask->mask, new_mask, retlen)) {
+               /* also frees the mask now owned by the entry */
+               free_tcr_entry(e);
+               return -EFAULT;
+       }
+       return retlen;
+
+out_free_mask:
+       free_cpumask_var(new_mask);
+       return ret;
+}
+
+/*
+ * TCRentry -> TCRlist mapping:
+ * Each TCRlist is assigned an id from [0, ..., maxclosid]
+ * id_to_tcrlist[] is indexed by that id and holds a pointer to the
+ * corresponding tcrlist structure (NULL when the id is free), hence the
+ * pointer-to-pointer type.
+ * TCRentries contains a bitmap[0, ..., maxclosid]. A bit
+ * set in this bitmap represents the fact that particular
+ * tcrlist references the tcrentry.
+ *
+ * NOTE(review): no allocation of id_to_tcrlist is visible in this part
+ * of the file - it needs room for TCRLIST_ID_SZ pointers.
+ */
+struct tcr_list **id_to_tcrlist;
+#define TCRLIST_ID_SZ 128
+DECLARE_BITMAP(tcrlist_ids, TCRLIST_ID_SZ);
+
+/*
+ * Reserve a free tcrlist id.
+ *
+ * Ids are limited to maxtcrlist_id because each entry's tcrlist_bmap is
+ * only maxtcrlist_id bits wide. Returns TCRLIST_ID_SZ when no id is free.
+ *
+ * NOTE(review): id 0 can be handed out although sync_to_msr() treats
+ * CBM index 0 as the global pool - confirm whether 0 must be reserved.
+ */
+static unsigned int alloc_tcrlist_id(void)
+{
+       unsigned int limit = min_t(unsigned int, maxtcrlist_id,
+                                  TCRLIST_ID_SZ);
+       unsigned int id;
+
+       /* tcrlist_ids is an array: it decays to the pointer we need */
+       id = find_first_zero_bit(tcrlist_ids, limit);
+       if (id >= limit)
+               return TCRLIST_ID_SZ;
+
+       set_bit(id, tcrlist_ids);
+       return id;
+}
+
+/* Forget the list registered under @id and make the id available again. */
+static void free_tcrlist_id(unsigned int id)
+{
+       /* drop the pointer first so a recycled id never maps to a stale list */
+       id_to_tcrlist[id] = NULL;
+       clear_bit(id, tcrlist_ids);
+}
+
+
+/*
+ * Allocate an empty tcrlist, give it an id and add it to tcr_global_list.
+ *
+ * Returns the list, or ERR_PTR(-ENOMEM) if memory or ids are exhausted;
+ * nothing stays allocated on failure.
+ */
+struct tcr_list *alloc_tcrlist(void)
+{
+       struct tcr_list *l;
+       unsigned int id;
+       size_t size;
+
+       l = kzalloc(sizeof(*l), GFP_KERNEL);
+       if (!l)
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&l->global_link);
+       INIT_LIST_HEAD(&l->tcr_list);
+       /* tasks are linked here by link_tcrlist_to_task() */
+       INIT_LIST_HEAD(&l->tasks);
+
+       /* kzalloc() takes (size, flags) */
+       size = BITS_TO_LONGS(max_sockets * NR_CPUS) * sizeof(long);
+       l->synced_to_socket = kzalloc(size, GFP_KERNEL);
+       if (!l->synced_to_socket)
+               goto out_free_list;
+
+       mutex_lock(&tcr_list_mutex);
+       id = alloc_tcrlist_id();
+       if (id >= TCRLIST_ID_SZ) {
+               mutex_unlock(&tcr_list_mutex);
+               goto out_free_bitmap;
+       }
+       l->id = id;
+       id_to_tcrlist[id] = l;
+       list_add(&l->global_link, &tcr_global_list);
+       mutex_unlock(&tcr_list_mutex);
+
+       return l;
+
+out_free_bitmap:
+       kfree(l->synced_to_socket);
+out_free_list:
+       kfree(l);
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Find the tcrlist whose set of entries equals @cmp_bmap, a bitmap of
+ * CBM_LEN bits indexed by tcrid. Returns NULL if there is none.
+ *
+ * NOTE(review): tcr_global_list is walked without tcr_list_mutex.
+ */
+struct tcr_list *find_tcrlist(unsigned long *cmp_bmap)
+{
+       struct tcr_list *l;
+
+       list_for_each_entry(l, &tcr_global_list, global_link) {
+               if (bitmap_equal(l->tcrentry_bmap, cmp_bmap, CBM_LEN))
+                       return l;
+       }
+       return NULL;
+}
+
+/*
+ * Destroy a tcrlist obtained from alloc_tcrlist(): release its id, take
+ * it off tcr_global_list and free its memory.
+ */
+void free_tcrlist(struct tcr_list *l)
+{
+       mutex_lock(&tcr_list_mutex);
+       free_tcrlist_id(l->id);
+       /* otherwise find_tcrlist() would walk into freed memory */
+       list_del(&l->global_link);
+       mutex_unlock(&tcr_list_mutex);
+
+       kfree(l->synced_to_socket);
+       kfree(l);
+}
+
+/*
+ * tcrlist is created when attaching a tcrentry to a task.
+ * 
+ * destroyed when either task count goes to zero, 
+ * or tcrentry count goes to zero.
+ *
+ */
+/* Account one more task using @l. */
+static void inc_use_count(struct tcr_list *l)
+{
+       ++l->nr_tasks;
+}
+
+/* Account one task less using @l; the list is freed with its last user. */
+static void dec_use_count(struct tcr_list *l)
+{
+       if (--l->nr_tasks == 0)
+               free_tcrlist(l);
+}
+
+/*
+ * Make @l the tcrlist of task @t: take a use count on the list, publish
+ * the pointer and add the task to the list's task list.
+ * Always returns 0.
+ */
+int link_tcrlist_to_task(struct task_struct *t, struct tcr_list *l)
+{
+       inc_use_count(l);
+       rcu_assign_pointer(t->tcrlist, l);
+       list_add(&t->tcrlist_link, &l->tasks);
+
+       return 0;
+}
+
+/*
+ * Detach task @t from its tcrlist @l. The pointer is cleared and a grace
+ * period is waited for before the use count is dropped, which may free
+ * @l. Always returns 0.
+ */
+int unlink_tcrlist_from_task(struct task_struct *t, struct tcr_list *l)
+{
+       rcu_assign_pointer(t->tcrlist, NULL);
+       /* the kernel primitive is synchronize_rcu() */
+       synchronize_rcu();
+       list_del(&t->tcrlist_link);
+       dec_use_count(l);
+
+       return 0;
+}
+
+/*
+ * Detach every task from @l. Tasks are kicked so that they reschedule
+ * without the list. @l is freed when its last task is dropped.
+ */
+void unlink_tcrlist_from_tasks(struct tcr_list *l)
+{
+       struct task_struct *tsk, *tmp;
+
+       if (list_empty(&l->tasks))
+               return;
+
+       list_for_each_entry(tsk, &l->tasks, tcrlist_link) {
+               rcu_assign_pointer(tsk->tcrlist, NULL);
+               kick_task(tsk);
+       }
+       synchronize_rcu();
+
+       /*
+        * Hold an extra count while walking: dropping the last task frees
+        * @l, and the iterator still reads the list head after that.
+        */
+       inc_use_count(l);
+       list_for_each_entry_safe(tsk, tmp, &l->tasks, tcrlist_link) {
+               list_del(&tsk->tcrlist_link);
+               dec_use_count(l);
+       }
+       dec_use_count(l);
+}
+
+/*
+ * Handle CAT_DELETE_RESERVATION: remove the reservation named by @i from
+ * every tcrlist that references it, then free it.
+ *
+ * Returns 0 on success or -EINVAL if the tcrid does not name a
+ * reservation.
+ */
+int delete_cache_reservation(struct cat_tcrid *i)
+{
+       struct tcr_entry *e;
+       int bit;
+
+       e = find_tcr_entry(i->tcrid);
+       if (IS_ERR(e))
+               return PTR_ERR(e);
+
+       for_each_set_bit(bit, e->tcrlist_bmap, maxtcrlist_id) {
+               struct tcr_list *l = id_to_tcrlist[bit];
+               bool last_entry;
+
+               if (!l) {
+                       /* stale reference: complain, drop it, carry on */
+                       WARN_ON(1);
+                       clear_bit(bit, e->tcrlist_bmap);
+                       continue;
+               }
+
+               /*
+                * Removing the last entry unlinks (and kicks) all tasks
+                * and may free the list, so it must not be used after.
+                */
+               last_entry = bitmap_weight(l->tcrentry_bmap, CBM_LEN) == 1;
+               remove_tcrentry_from_tcrlist(e, l);
+               if (!last_entry)
+                       kick_tasks(l);
+       }
+       free_tcr_entry(e);
+       return 0;
+}
+
+
+/*
+ * Check whether list @l can grow by entry @e on cache layout @layout.
+ *
+ * Works on a scratch copy of the layout's bitmap: ways shared with
+ * hardware are marked busy, the region @l currently owns is marked free,
+ * and a contiguous free area large enough for the current region plus
+ * the bits of @e is searched. Nothing is modified.
+ *
+ * Returns the first bit of the area and stores its length in @size_p,
+ * or -ENOMEM / -EBUSY on failure.
+ */
+int check_contiguous_region(struct tcr_entry *e, struct tcr_list *l,
+                           struct cache_layout *layout, int *size_p)
+{
+       struct tcr_list_per_socket *psd = l->psd[layout->id];
+       u32 size = BITS_TO_LONGS(max_cbm_len) * sizeof(unsigned long);
+       /* bitmap helpers operate on unsigned long, not on u32 */
+       unsigned long hw_shared = layout->hw_shared_bitmask;
+       unsigned long *temp_closmap;
+       unsigned long s;
+       u32 cbm_bits = 0;
+
+       temp_closmap = kzalloc(size, GFP_KERNEL);
+       if (!temp_closmap)
+               return -ENOMEM;
+
+       memcpy(temp_closmap, layout->closmap, size);
+       /* mark cache ways shared with hw as busy */
+       bitmap_or(temp_closmap, temp_closmap, &hw_shared,
+                 min_t(u32, max_cbm_len, 32));
+
+       if (psd->cbm_end_bit) {
+               /* cbm_end_bit is exclusive, see alloc_contiguous_region() */
+               cbm_bits = psd->cbm_end_bit - psd->cbm_start_bit;
+               bitmap_clear(temp_closmap, psd->cbm_start_bit, cbm_bits);
+       }
+
+       cbm_bits += e->cbm_bits;
+       s = bitmap_find_next_zero_area(temp_closmap, max_cbm_len, 0,
+                                      cbm_bits, 0);
+       /* the scratch copy is no longer needed, whatever the outcome */
+       kfree(temp_closmap);
+
+       if (s >= max_cbm_len)
+               return -EBUSY;
+
+       *size_p = cbm_bits;
+       return s;
+}
+
+/*
+ * Grow the region of list @l on @layout so that it also covers entry @e.
+ * The region is recorded as [cbm_start_bit, cbm_end_bit) and marked busy
+ * in the layout's bitmap.
+ *
+ * Returns 0 on success or the error of check_contiguous_region().
+ */
+int alloc_contiguous_region(struct tcr_entry *e, struct tcr_list *l,
+                           struct cache_layout *layout)
+{
+       struct tcr_list_per_socket *psd = l->psd[layout->id];
+       int size, start;
+
+       start = check_contiguous_region(e, l, layout, &size);
+       if (start < 0)
+               return start;
+
+       /* the new area may be elsewhere: give the old one back first */
+       if (psd->cbm_end_bit)
+               bitmap_clear(layout->closmap, psd->cbm_start_bit,
+                            psd->cbm_end_bit - psd->cbm_start_bit);
+
+       psd->cbm_start_bit = start;
+       psd->cbm_end_bit = start + size;
+       bitmap_set(layout->closmap, start, size);
+
+       return 0;
+}
+
+/*
+ * Grow the region of list @l by entry @e on every cache layout.
+ *
+ * All layouts are checked before any of them is modified, so a failure
+ * leaves every layout untouched.
+ *
+ * Returns 0 on success, -ENOSPC if some layout has no room (the value
+ * the callers test for) or -ENOMEM.
+ */
+int alloc_contiguous_regions(struct tcr_entry *e, struct tcr_list *l)
+{
+       struct cache_layout *clayout;
+       int size, r;
+
+       list_for_each_entry(clayout, &layout_list, link) {
+               r = check_contiguous_region(e, l, clayout, &size);
+               if (r == -EBUSY)
+                       return -ENOSPC;
+               if (r < 0)
+                       return r;
+       }
+
+       list_for_each_entry(clayout, &layout_list, link) {
+               r = alloc_contiguous_region(e, l, clayout);
+               /* cannot fail for lack of room after the check above */
+               WARN_ON(r);
+       }
+
+       return 0;
+}
+
+/*
+ * Release the region owned by list @l on every cache layout.
+ * Always returns 0.
+ *
+ * NOTE(review): @e is unused and the whole region of @l is released, not
+ * only the bits of @e - confirm this is intended when @l still contains
+ * other entries.
+ */
+int dealloc_contiguous_regions(struct tcr_entry *e, struct tcr_list *l)
+{
+       struct cache_layout *clayout;
+
+       list_for_each_entry(clayout, &layout_list, link) {
+               struct tcr_list_per_socket *psd = l->psd[clayout->id];
+
+               if (psd->cbm_end_bit > psd->cbm_start_bit)
+                       bitmap_clear(clayout->closmap, psd->cbm_start_bit,
+                                    psd->cbm_end_bit - psd->cbm_start_bit);
+
+               /* a zero end bit means "no region" to the allocator */
+               psd->cbm_start_bit = 0;
+               psd->cbm_end_bit = 0;
+       }
+
+       return 0;
+}
+
+/* Flag @tsk for rescheduling, then kick it with kick_process(). */
+void kick_task(struct task_struct *tsk)
+{
+       set_tsk_need_resched(tsk);
+
+       kick_process(tsk);
+}
+
+/*
+ * Kick every task attached to @l.
+ *
+ * When attach returns, any task attached to a modified tcrlist must:
+ *     Task Running) sync_to_msr.
+ *     Task Not Running) nothing, as long as sync_to_msr is performed
+ *     when it is scheduled in.
+ */
+void kick_tasks(struct tcr_list *l)
+{
+       struct task_struct *t;
+
+       list_for_each_entry(t, &l->tasks, tcrlist_link)
+               kick_task(t);
+}
+
+/*
+ * Handle CAT_ATTACH_RESERVATION: add the reservation pcr->tcrid to the
+ * set of reservations of task pcr->pid.
+ *
+ * A task without tcrlist gets the existing list that contains exactly
+ * this reservation, or a new one. A list shared with other tasks is not
+ * modified; the task is moved to a list holding the old entries plus the
+ * new one. A list used by this task only is extended in place.
+ *
+ * Returns 0 on success, -EINVAL if the reservation does not exist, is
+ * already attached or the task is gone, -ESRCH if the pid is unknown,
+ * -ENOMEM / -ENOSPC if a list or cache space cannot be allocated.
+ *
+ * NOTE(review): for a copied list only the bits of the new entry are
+ * allocated on the layouts, as in the original - confirm.
+ */
+int attach_cache_reservation(struct pid_cat_reservation *pcr)
+{
+       DECLARE_BITMAP(want, CBM_LEN);
+       struct tcr_list *l, *lnew, *fresh = NULL;
+       struct task_struct *task;
+       struct tcr_entry *e;
+       struct pid *pid;
+       int bit, ret;
+
+       e = find_tcr_entry(pcr->tcrid);
+       if (IS_ERR(e))
+               return PTR_ERR(e);
+
+       pid = find_get_pid(pcr->pid);
+       if (!pid)
+               return -ESRCH;
+
+       task = get_pid_task(pid, PIDTYPE_PID);
+       if (!task) {
+               ret = -EINVAL;
+               goto out_put_pid;
+       }
+
+       l = task->tcrlist;
+       if (!l) {
+               /* is there already a list made of just this entry? */
+               bitmap_zero(want, CBM_LEN);
+               __set_bit(e->tcrid, want);
+
+               l = find_tcrlist(want);
+               if (l) {
+                       link_tcrlist_to_task(task, l);
+                       kick_task(task);
+                       ret = 0;
+                       goto out_put_task;
+               }
+
+               fresh = alloc_tcrlist();
+               if (IS_ERR(fresh)) {
+                       ret = PTR_ERR(fresh);
+                       goto out_put_task;
+               }
+               l = fresh;
+       }
+
+       if (tcrentry_in_tcrlist(e, l)) {
+               ret = -EINVAL;
+               goto out_put_task;
+       }
+
+       if (l->nr_tasks > 1) {
+               /* shared list: move this task to "old entries + e" */
+               bitmap_copy(want, l->tcrentry_bmap, CBM_LEN);
+               __set_bit(e->tcrid, want);
+
+               lnew = find_tcrlist(want);
+               if (lnew) {
+                       unlink_tcrlist_from_task(task, l);
+                       link_tcrlist_to_task(task, lnew);
+                       kick_task(task);
+                       ret = 0;
+                       goto out_put_task;
+               }
+
+               lnew = alloc_tcrlist();
+               if (IS_ERR(lnew)) {
+                       ret = PTR_ERR(lnew);
+                       goto out_put_task;
+               }
+
+               ret = alloc_contiguous_regions(e, lnew);
+               if (ret < 0) {
+                       free_tcrlist(lnew);
+                       goto out_put_task;
+               }
+               for_each_set_bit(bit, l->tcrentry_bmap, CBM_LEN)
+                       add_tcrentry_to_tcrlist(&tcrid_table[bit], lnew);
+
+               unlink_tcrlist_from_task(task, l);
+               link_tcrlist_to_task(task, lnew);
+               l = lnew;
+       } else {
+               ret = alloc_contiguous_regions(e, l);
+               if (ret < 0) {
+                       if (fresh)
+                               free_tcrlist(fresh);
+                       goto out_put_task;
+               }
+       }
+
+       add_tcrentry_to_tcrlist(e, l);
+       /* a list created above has no user yet: hand it to the task */
+       if (fresh)
+               link_tcrlist_to_task(task, fresh);
+       kick_tasks(l);
+       ret = 0;
+
+out_put_task:
+       put_task_struct(task);
+out_put_pid:
+       put_pid(pid);
+       return ret;
+}
+
+/*
+ * Handle CAT_DETACH_RESERVATION: remove the reservation pcr->tcrid from
+ * the set of reservations of task pcr->pid.
+ *
+ * A list used by this task only is modified in place. A list shared with
+ * other tasks is left alone; the task is moved to a list holding the
+ * remaining entries, or to no list at all if nothing remains.
+ *
+ * Returns 0 on success, -EINVAL if the reservation does not exist, is
+ * not attached to the task or the task is gone, -ESRCH if the pid is
+ * unknown, -ENOMEM if a new list cannot be allocated.
+ *
+ * NOTE(review): no cache regions are allocated for a newly created
+ * list, as in the original - confirm.
+ */
+int detach_cache_reservation(struct pid_cat_reservation *pcr)
+{
+       DECLARE_BITMAP(want, CBM_LEN);
+       struct tcr_list *l, *lnew;
+       struct task_struct *task;
+       struct tcr_entry *e;
+       struct pid *pid;
+       int bit, err;
+
+       e = find_tcr_entry(pcr->tcrid);
+       if (IS_ERR(e))
+               return PTR_ERR(e);
+
+       pid = find_get_pid(pcr->pid);
+       if (!pid)
+               return -ESRCH;
+
+       task = get_pid_task(pid, PIDTYPE_PID);
+       if (!task) {
+               err = -EINVAL;
+               goto out_put_pid;
+       }
+
+       l = task->tcrlist;
+       if (!l || !tcrentry_in_tcrlist(e, l)) {
+               err = -EINVAL;
+               goto out_put_task;
+       }
+
+       err = 0;
+       if (l->nr_tasks <= 1) {
+               /* may unlink the task and free the list */
+               remove_tcrentry_from_tcrlist(e, l);
+               goto out_put_task;
+       }
+
+       /* shared list: this task needs "old entries - e" */
+       bitmap_copy(want, l->tcrentry_bmap, CBM_LEN);
+       __clear_bit(e->tcrid, want);
+
+       if (bitmap_empty(want, CBM_LEN)) {
+               /* nothing left: the task simply goes back to no list */
+               unlink_tcrlist_from_task(task, l);
+               kick_task(task);
+               goto out_put_task;
+       }
+
+       lnew = find_tcrlist(want);
+       if (!lnew) {
+               lnew = alloc_tcrlist();
+               if (IS_ERR(lnew)) {
+                       err = PTR_ERR(lnew);
+                       goto out_put_task;
+               }
+               for_each_set_bit(bit, want, CBM_LEN)
+                       add_tcrentry_to_tcrlist(&tcrid_table[bit], lnew);
+       }
+
+       unlink_tcrlist_from_task(task, l);
+       link_tcrlist_to_task(task, lnew);
+       kick_task(task);
+
+out_put_task:
+       put_task_struct(task);
+out_put_pid:
+       put_pid(pid);
+       return err;
+}
+
+/*
+ * Enforce in hardware, on the current CPU's socket, that the CBM bits
+ * [@start, @end] (both inclusive) are cleared for CBM index 0 and set
+ * for the index of list @l, then mark @l as synced to this socket.
+ * An empty or out-of-range interval is ignored.
+ *
+ * NOTE(review): the allocator records an exclusive cbm_end_bit while
+ * @end is inclusive here - the caller has to convert.
+ */
+void sync_to_msr(struct task_struct *task, struct tcr_list *l,
+                unsigned int start, unsigned int end)
+{
+       unsigned int this_socket =
+               topology_physical_package_id(smp_processor_id());
+       unsigned long bitmask;
+       unsigned int len;
+       u64 msr;
+
+       /* keeps both shifts below within the width of the type */
+       if (end < start || end >= BITS_PER_LONG)
+               return;
+
+       len = end - start + 1;
+       bitmask = (~0UL >> (BITS_PER_LONG - len)) << start;
+
+       /* check and enforce cosidzero has [s,e] == 0 */
+       rdmsrl(CBM_FROM_INDEX(0), msr);
+       if (msr & bitmask)
+               wrmsrl(CBM_FROM_INDEX(0), msr & ~bitmask);
+
+       /* check and enforce this cosid has [s,e] == 1. */
+       rdmsrl(CBM_FROM_INDEX(l->id), msr);
+       if ((msr & bitmask) != bitmask)
+               wrmsrl(CBM_FROM_INDEX(l->id), msr | bitmask);
+
+       set_bit(this_socket, l->synced_to_socket);
+}
+
+/*
+ * Scheduler hook: bring the CBM MSRs of the current socket in line with
+ * the tcrlist of the task being scheduled in.
+ *
+ * NOTE(review): known problems of this draft, left untouched because the
+ * structures involved are not visible here:
+ *  - psd->cosidzeromask is a pointer, yet it is passed as the MSR value;
+ *  - this_socket is an integer, so this_socket->msr_cbm_lock cannot
+ *    work, and struct per_socket_data has no such lock member;
+ *  - "tcrlist" is not declared; task->tcrlist is presumably meant;
+ *  - cbm_start/cbm_end are called cbm_start_bit/cbm_end_bit elsewhere,
+ *    and psd[] is dereferenced with "->" elsewhere;
+ *  - start and psd are declared twice, after a statement.
+ */
+void __intel_rdt_sched_in(void)
+{
+       struct task_struct *task = current;
+       unsigned int cpu = smp_processor_id();
+       unsigned int this_socket = topology_physical_package_id(cpu);
+       unsigned int start, end;
+       struct per_socket_data *psd = get_socket_data(this_socket);
+
+       /*
+        * The CBM bitmask for a particular task is enforced
+        * on sched-in to a given processor, and only for the 
+        * range (cbm_start_bit,cbm_end_bit) which the 
+        * tcr_list (COSid) owns.
+        * This way we allow COSid0 (global task pool) to use
+        * reserved L3 cache on sockets where the tasks that
+        * reserve the cache have not been scheduled.
+        *
+        * Since reading the MSRs is slow, it is necessary to
+        * cache the MSR CBM map on each socket.
+        *
+        */
+
+       /* task without reservations: restore the mask of CBM index 0 */
+       if (task->tcrlist == NULL) {
+               wrmsrl(CBM_FROM_INDEX(0), psd->cosidzeromask);
+       }
+       /* list not yet synced to this socket: program its range */
+       else if (test_bit(this_socket,
+                    task->tcrlist->synced_to_socket) == 0) {
+               spin_lock(&this_socket->msr_cbm_lock);
+               unsigned int start;
+               struct per_socket_data *psd = get_socket_data(this_socket);
+               struct cache_layout *layout = psd->layout;
+
+               start = task->tcrlist->psd[layout->id].cbm_start;
+               end = task->tcrlist->psd[layout->id].cbm_end;
+               sync_to_msr(task, tcrlist, start, end);
+               // barrier
+               spin_unlock(&this_socket->msr_cbm_lock);
+       }
+
+}
+
+/*
+ * Handle CAT_GET_RESERVATIONS.
+ *
+ * @in:  kernel copy of the user's struct cat_reservation_list
+ * @arg: user address of that structure
+ *
+ * Copies one struct cat_reservation per existing reservation to in->res
+ * and the matching CPU mask (cpumask_size() bytes each) to in->mask. The
+ * per-entry mask size is reported in in->cpumask_size.
+ *
+ * Returns the number of reservations copied, -ENOSPC if either user
+ * buffer is too small, or -EFAULT.
+ *
+ * NOTE(review): an entry whose cpumask has not been attached yet would
+ * be copied from a NULL pointer - make sure entries only become visible
+ * once their mask is set.
+ */
+static int get_reservations(struct cat_reservation_list *in,
+                           unsigned long arg)
+{
+       char __user *res_uptr = (char __user *)in->res;
+       char __user *mask_uptr = (char __user *)in->mask;
+       const size_t masksz = cpumask_size();
+       struct cat_reservation cr;
+       unsigned int nr, copied = 0;
+       int bit;
+
+       /* both buffers must hold one record per reservation in use */
+       nr = bitmap_weight(tcrid_used_bitmap, CBM_LEN);
+       if (nr * sizeof(cr) > in->cat_res_size)
+               return -ENOSPC;
+       if (nr * masksz > in->cpusetsize)
+               return -ENOSPC;
+
+       in->cpumask_size = masksz;
+       if (copy_to_user((void __user *)arg, in, sizeof(*in)))
+               return -EFAULT;
+
+       for_each_set_bit(bit, tcrid_used_bitmap, CBM_LEN) {
+               struct tcr_entry *e = &tcrid_table[bit];
+
+               /* start from zeroes so that padding never leaks */
+               memset(&cr, 0, sizeof(cr));
+               cr.kbytes = e->rounded_kbytes;
+               cr.type = e->type;
+               cr.flags = 0;
+               cr.tcrid = e->tcrid;
+
+               if (copy_to_user(res_uptr, &cr, sizeof(cr)))
+                       return -EFAULT;
+               res_uptr += sizeof(cr);
+
+               if (copy_to_user(mask_uptr, e->cpumask, masksz))
+                       return -EFAULT;
+               mask_uptr += masksz;
+
+               copied++;
+       }
+
+       return copied;
+}
+
+/*
+ * Sanity-check a reservation request coming from user space.
+ * Returns 0 if the type is one of the known reservation types and flags
+ * is either 0 or exactly CACHE_RSVT_ROUND_DOWN, -EINVAL otherwise.
+ */
+static int basic_cr_checks(struct cat_reservation *cr)
+{
+       switch (cr->type) {
+       case CACHE_RSVT_TYPE_CODE:
+       case CACHE_RSVT_TYPE_DATA:
+       case CACHE_RSVT_TYPE_BOTH:
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (cr->flags == 0 || cr->flags == CACHE_RSVT_ROUND_DOWN)
+               return 0;
+
+       return -EINVAL;
+}
+
+/*
+ * ioctl entry point of the intel_cat misc device.
+ *
+ * @filp:  open file the ioctl was issued on (unused).
+ * @ioctl: command number, one of the CAT_* commands handled below.
+ * @arg:   user-space address of the command's argument structure.
+ *
+ * Each command copies its argument structure in from user space and
+ * hands it to the matching helper. Returns the helper's result, -EFAULT
+ * if the argument could not be copied in, or -EINVAL for an unknown
+ * command.
+ */
+static long intelcat_ioctl(struct file *filp,
+                          unsigned int ioctl, unsigned long arg)
+{
+       void __user *argp = (void __user *)arg;
+       long r = -EINVAL;
+
+       switch (ioctl) {
+       case CAT_CREATE_RESERVATION: {
+               struct cat_reservation_cpumask crmask;
+
+               if (copy_from_user(&crmask, argp, sizeof(crmask)))
+                       return -EFAULT;
+
+               r = basic_cr_checks(&crmask.res);
+               if (r)
+                       break;
+
+               r = create_cache_reservation(&crmask, arg);
+               break;
+       }
+       case CAT_DELETE_RESERVATION: {
+               struct cat_tcrid tcrid;
+
+               /* Copy exactly one struct cat_tcrid, not some other size. */
+               if (copy_from_user(&tcrid, argp, sizeof(tcrid)))
+                       return -EFAULT;
+
+               r = delete_cache_reservation(&tcrid);
+               break;
+       }
+       case CAT_ATTACH_RESERVATION:
+       case CAT_DETACH_RESERVATION: {
+               struct pid_cat_reservation pcr;
+
+               if (copy_from_user(&pcr, argp, sizeof(pcr)))
+                       return -EFAULT;
+
+               if (ioctl == CAT_ATTACH_RESERVATION)
+                       r = attach_cache_reservation(&pcr);
+               else
+                       r = detach_cache_reservation(&pcr);
+               break;
+       }
+       case CAT_GET_RESERVATIONS: {
+               struct cat_reservation_list in;
+
+               /*
+                * The request header lives in a kernel copy; the user
+                * address is passed along as well for the copy-out.
+                */
+               if (copy_from_user(&in, argp, sizeof(in)))
+                       return -EFAULT;
+
+               r = get_reservations(&in, argp);
+               break;
+       }
+       default:
+               break;
+       }
+
+       return r;
+}
+
+/*
+ * File operations of the intel_cat misc device; all requests go through
+ * intelcat_ioctl(). The table is never modified, hence const, and .owner
+ * ties open files to the module since the driver can be built modular.
+ *
+ * NOTE(review): .compat_ioctl reuses the native handler, but
+ * struct cat_reservation_cpumask carries a size_t and a pointer, whose
+ * sizes differ for 32-bit callers - confirm whether a translating compat
+ * handler is needed.
+ */
+static const struct file_operations intelcat_chardev_ops = {
+       .owner          = THIS_MODULE,
+       .unlocked_ioctl = intelcat_ioctl,
+       .compat_ioctl   = intelcat_ioctl,
+       .llseek         = noop_llseek,
+};
+
+/* Misc character device on which the CAT ioctls are issued. */
+static struct miscdevice intel_cat_misc = {
+       .minor = INTEL_CAT_MINOR,
+       .name  = "intel_cat",
+       .fops  = &intelcat_chardev_ops,
+};
+
+/*
+ * Look up the size of the level-3 cache as reported for CPU 0 and store
+ * it in l3_cache_size.
+ *
+ * The cacheinfo leaves are searched by their level field rather than by
+ * a fixed array index, because the position of a leaf in info_list does
+ * not equal its cache level.
+ *
+ * Returns 0 on success, -EINVAL when no cache information or no level-3
+ * leaf is available.
+ */
+static int get_l3_cache_size(void)
+{
+       struct cpu_cacheinfo *cinfo = get_cpu_cacheinfo(0);
+       unsigned int leaf;
+
+       if (!cinfo || !cinfo->info_list)
+               return -EINVAL;
+
+       for (leaf = 0; leaf < cinfo->num_leaves; leaf++) {
+               struct cacheinfo *ci = &cinfo->info_list[leaf];
+
+               if (ci->level != 3)
+                       continue;
+
+               l3_cache_size = ci->size;
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * Return the per-socket data slot for @socket, or NULL (with a one-time
+ * warning) when @socket lies outside the psd_size entries of psd.
+ */
+static struct per_socket_data *get_socket_data(int socket)
+{
+       if (WARN_ON_ONCE(socket < 0 || socket >= psd_size))
+               return NULL;
+
+       return &psd[socket];
+}
+
+/*
+ * Allocate the zero-initialised per-socket data array, one entry per
+ * possible socket (max_sockets), and record its length in psd_size.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int __init alloc_init_per_socket_data(void)
+{
+       /* kcalloc checks the count * size multiplication for overflow. */
+       psd = kcalloc(max_sockets, sizeof(*psd), GFP_KERNEL);
+       if (!psd)
+               return -ENOMEM;
+
+       psd_size = max_sockets;
+       return 0;
+}
+
+/*
+ * Initialise the per-socket data of the socket the executing CPU belongs
+ * to. Meant to be run through the SMP cross-call helpers, hence the
+ * (unused) void * argument; the caller decides which CPU executes it.
+ *
+ * Only the first invocation per socket does any work. It records EBX of
+ * CPUID leaf 0x10, sub-leaf 1 as the socket's hw_shared_bitmask, sizes
+ * the host reservation, looks up the layout for that bitmask and
+ * allocates cosidzeromask with every bit set.
+ *
+ * Panics if the mask cannot be allocated.
+ */
+static void percpu_init_hw_shared_zone(void *unused)
+{
+       unsigned int cpu = smp_processor_id();
+       unsigned int this_socket = topology_physical_package_id(cpu);
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+       struct per_socket_data *sd;
+       u32 eax, ebx, ecx, edx;
+       u32 size;
+
+       sd = get_socket_data(this_socket);
+       if (!sd)
+               return;
+
+       /*
+        * First caller per socket wins; everyone else backs off.
+        * Assumes sd->initialized is an atomic_t - TODO confirm against
+        * the definition of struct per_socket_data.
+        */
+       if (atomic_xchg(&sd->initialized, 1))
+               return;
+
+       cpuid_count(0x00000010, 1, &eax, &ebx, &ecx, &edx);
+       sd->hw_shared_bitmask = ebx;
+
+       /*
+        * Reserve 10% of the cache ways for the host, but never fewer
+        * than the number of bits set in the hardware-shared bitmask.
+        */
+       sd->reserved_for_host = max_t(unsigned int,
+                                     c->x86_cache_max_cbm_len / 10,
+                                     hweight32(ebx));
+       sd->layout = find_create_layout(sd->hw_shared_bitmask);
+
+       size = BITS_TO_LONGS(c->x86_cache_max_cbm_len) * sizeof(unsigned long);
+       if (cdp_enabled)
+               size = 2 * size;
+
+       /* GFP_ATOMIC: this may run from a cross-call with irqs disabled. */
+       sd->cosidzeromask = kzalloc(size, GFP_ATOMIC);
+       if (!sd->cosidzeromask)
+               panic("%s allocation failed\n", __func__);
+
+       /* 0xff sets every bit; a fill value of 1 would set one bit per byte. */
+       memset(sd->cosidzeromask, 0xff, size);
+}
+
+/*
+ * CPU hotplug callback. When a CPU has come online (including the
+ * CPU_TASKS_FROZEN variant raised during resume), run the per-socket
+ * initialisation on that CPU so that a newly populated socket gets set
+ * up. Always returns NOTIFY_OK.
+ */
+static int cat_cpu_notifier(struct notifier_block *nfb,
+                           unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_ONLINE:
+               /*
+                * The init routine works on the socket of the CPU it
+                * executes on, so it has to run on @cpu itself rather
+                * than on the CPU delivering this notification.
+                */
+               smp_call_function_single(cpu, percpu_init_hw_shared_zone,
+                                        NULL, 1);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+/*
+ * Notifier block that routes CPU hotplug events to cat_cpu_notifier(),
+ * registered with priority -INT_MAX.
+ */
+static struct notifier_block cat_cpu_notifier_block = {
+       .priority       = -INT_MAX,
+       .notifier_call  = cat_cpu_notifier,
+};
+
+/*
+ * Run percpu_init_hw_shared_zone() once per populated socket.
+ *
+ * The first online CPU found for each physical package id is picked as
+ * that socket's representative, and the init routine is then executed on
+ * all representatives, waiting for completion.
+ *
+ * Returns 0 on success or -ENOMEM if a scratch allocation fails.
+ */
+static int init_hw_shared_zone(void)
+{
+       unsigned int nbits = max_sockets * NR_CPUS;
+       unsigned long *topology_bmap;
+       cpumask_var_t leaders;
+       int cpu, phys_id;
+
+       /* A cpumask can be too large for the stack on big NR_CPUS builds. */
+       if (!zalloc_cpumask_var(&leaders, GFP_KERNEL))
+               return -ENOMEM;
+
+       topology_bmap = kcalloc(BITS_TO_LONGS(nbits), sizeof(long), GFP_KERNEL);
+       if (!topology_bmap) {
+               free_cpumask_var(leaders);
+               return -ENOMEM;
+       }
+
+       for_each_online_cpu(cpu) {
+               phys_id = topology_physical_package_id(cpu);
+               /* Never index the bitmap with an out-of-range package id. */
+               if (phys_id < 0 || (unsigned int)phys_id >= nbits)
+                       continue;
+               if (test_and_set_bit(phys_id, topology_bmap))
+                       continue;
+               cpumask_set_cpu(cpu, leaders);
+       }
+
+       /*
+        * on_each_cpu_mask() also runs the function on the calling CPU
+        * when it is part of the mask; smp_call_function_many() would
+        * skip it and leave that socket uninitialised.
+        */
+       on_each_cpu_mask(leaders, percpu_init_hw_shared_zone, NULL, true);
+
+       kfree(topology_bmap);
+       free_cpumask_var(leaders);
+
+       return 0;
+}
+
+
+/*
+ * Allocate and initialise the driver's global state: the CLOSID bitmap,
+ * the id -> tcr_list table, the tcrid table, the L3 geometry values and
+ * the per-socket data.
+ *
+ * Returns 0 on success. Returns -ENODEV when the boot CPU reports a zero
+ * capacity-bitmask length or zero CLOSIDs, -EINVAL when the L3 geometry
+ * is unusable, or the error of the failing allocation/helper.
+ *
+ * NOTE(review): whatever alloc_tcrid_table() allocates is not released on
+ * the later error paths; no matching release helper is visible here.
+ */
+static int __init intel_cat_mem_init(void)
+{
+       struct cpuinfo_x86 *c = &boot_cpu_data;
+       u32 maxid;
+       int err;
+
+       max_cbm_len = c->x86_cache_max_cbm_len;
+       maxid = max_closid = c->x86_cache_max_closid;
+
+       /* Both values are used as divisors / allocation counts below. */
+       if (!max_cbm_len || !maxid)
+               return -ENODEV;
+
+       err = -ENOMEM;
+       closmap = kcalloc(BITS_TO_LONGS(maxid), sizeof(long), GFP_KERNEL);
+       if (!closmap)
+               goto err_out;
+
+       id_to_tcrlist = kcalloc(maxid, sizeof(struct tcr_list *), GFP_KERNEL);
+       if (!id_to_tcrlist)
+               goto err_out;
+
+       err = alloc_tcrid_table();
+       if (err)
+               goto err_out;
+
+       err = get_l3_cache_size();
+       if (err)
+               goto err_out;
+
+       /* kbytes per cbm bit =
+        * L3 cache size in kbytes / capacity bitmask length.
+        */
+       kbytes_per_cbm_bit = (l3_cache_size >> 10) / max_cbm_len;
+       if (!kbytes_per_cbm_bit) {
+               /* Cache smaller than one kbyte per bit: cannot divide. */
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       /* L3 cache size in kbytes / kbytes per cbm bit =
+        * cbm bits in L3 cache.
+        */
+       l3_nr_cbm_bits = (l3_cache_size >> 10) / kbytes_per_cbm_bit;
+
+       err = alloc_init_per_socket_data();
+       if (err)
+               goto err_out;
+
+       err = init_hw_shared_zone();
+       if (err)
+               goto err_free_psd;
+
+       /* bit 0 is reserved for global task pool */
+       set_bit(0, &tcrlist_ids);
+
+       return 0;
+
+err_free_psd:
+       kfree(psd);
+       psd = NULL;
+       psd_size = 0;
+err_out:
+       /* Clear the globals so nothing is left pointing at freed memory. */
+       kfree(id_to_tcrlist);
+       id_to_tcrlist = NULL;
+       kfree(closmap);
+       closmap = NULL;
+       return err;
+}
+
+/*
+ * Driver initialisation: derive the socket geometry from the current
+ * CPU's core mask, register the misc device, set up the global state and
+ * finally hook into CPU hotplug.
+ *
+ * Returns 0 on success or the error from misc_register() /
+ * intel_cat_mem_init(); on failure nothing is left registered.
+ *
+ * NOTE(review): the misc device becomes visible before
+ * intel_cat_mem_init() has run, so an early ioctl could see
+ * uninitialised state - consider registering the device last once a
+ * teardown helper for the memory state exists.
+ */
+static int __init intel_cat_init(void)
+{
+       int r;
+       int cpu;
+
+       preempt_disable();
+       cpu = smp_processor_id();
+       cpus_per_socket = cpumask_weight(topology_core_cpumask(cpu));
+       max_sockets = NR_CPUS/cpus_per_socket;
+       preempt_enable();
+
+       r = misc_register(&intel_cat_misc);
+       if (r) {
+               printk(KERN_ERR "intel_cat: misc_register error = %d\n",r);
+               return r;
+       }
+
+       r = intel_cat_mem_init();
+       if (r) {
+               /* Bail out here: do not register a hotplug notifier. */
+               misc_unregister(&intel_cat_misc);
+               return r;
+       }
+
+       cpu_notifier_register_begin();
+       __register_hotcpu_notifier(&cat_cpu_notifier_block);
+       cpu_notifier_register_done();
+
+       return 0;
+}
+
diff --git a/arch/x86/kernel/cat/cache_reservation.h 
b/arch/x86/kernel/cat/cache_reservation.h
new file mode 100644
index 0000000..e8146a0
--- /dev/null
+++ b/arch/x86/kernel/cat/cache_reservation.h
@@ -0,0 +1,47 @@
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+/*
+ * One cache reservation entry.
+ *
+ * NOTE(review): the field notes marked "presumably" are inferred from the
+ * field names only - confirm against the code that fills the entries in.
+ */
+struct tcr_entry {
+       /* presumably the id of this reservation */
+       unsigned int tcrid;
+
+       /* bitmap, presumably of the tcr_lists this entry is part of */
+       unsigned long *tcrlist_bmap;
+
+       /* presumably the size requested by user space, before rounding */
+       u64 user_kbytes;
+       /* size in kbytes that is reported back to user space */
+       u64 rounded_kbytes;
+       /* presumably the number of capacity bitmask bits in use */
+       unsigned int cbm_bits;
+
+       /* reservation type as reported back to user space */
+       u32 type;
+
+       /* CPU mask of the reservation, copied out to user space on query */
+       cpumask_var_t *cpumask;
+};
+
+/* Number of bits in the tcrentry_bmap bitmap of struct tcr_list. */
+#define CBM_LEN 64
+/* Number of entries in the psd array of struct tcr_list. */
+#define MAX_LAYOUTS 10
+
+/*
+ * Bit range of a cache allocation, stored once per array slot in
+ * struct tcr_list.
+ *
+ * NOTE(review): whether cbm_end_bit is inclusive cannot be told from
+ * this header - confirm against the users.
+ */
+struct tcr_list_per_socket {
+       int cbm_start_bit, cbm_end_bit;
+};
+
+/*
+ * A list of tcr entries, identified by @id, together with the tasks that
+ * reference it and the cache allocation bit ranges assigned to it.
+ */
+struct tcr_list {
+       /* cache allocation */
+       struct tcr_list_per_socket psd[MAX_LAYOUTS];
+
+       /* bitmap indicating whether cap_bitmask is synced to a given socket */
+       unsigned long *synced_to_socket;
+
+       /* TCRlist id */
+       unsigned int id;
+
+       /* One bit per tcrentry. */
+       DECLARE_BITMAP(tcrentry_bmap, CBM_LEN);
+       
+       /* link in global tcrlist list */
+       struct list_head global_link; 
+       /* list of tasks referencing this tcr_list */
+       struct list_head tasks;
+       /* nr of tasks referencing this tcr_list */
+       unsigned int nr_tasks;
+};
+
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to