[PATCH v2 2/7] hugetlb: support migrate charging for surplus hugepages

2018-05-17 Thread TSUKADA Koutaro
Surplus hugepages allocated for migration are also charged to the memory cgroup.

Signed-off-by: TSUKADA Koutaro 
---
 hugetlb.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 679c151f..2e7b543 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1687,6 +1687,8 @@ static struct page *alloc_migrate_huge_page(struct hstate 
*h, gfp_t gfp_mask,
if (!page)
return NULL;

+   surplus_hugepage_set_charge(h, page);
+
/*
 * We do not account these pages as surplus because they are only
 * temporary and will be released properly on the last reference

-- 
Tsukada

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 7/7] memcg: supports movement of surplus hugepages statistics

2018-05-17 Thread TSUKADA Koutaro
When the task that charged surplus hugepages moves memory cgroup, it
updates the statistical information correctly.

Signed-off-by: TSUKADA Koutaro 
---
 memcontrol.c |   99 +++
 1 file changed, 99 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a8f1ff8..63f0922 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4698,12 +4698,110 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t 
*pmd,
return 0;
 }

+#ifdef CONFIG_HUGETLB_PAGE
+static enum mc_target_type get_mctgt_type_hugetlb(struct vm_area_struct *vma,
+   unsigned long addr, pte_t *pte, union mc_target *target)
+{
+   struct page *page = NULL;
+   pte_t entry;
+   enum mc_target_type ret = MC_TARGET_NONE;
+
+   if (!(mc.flags & MOVE_ANON))
+   return ret;
+
+   entry = huge_ptep_get(pte);
+   if (!pte_present(entry))
+   return ret;
+
+   page = pte_page(entry);
+   VM_BUG_ON_PAGE(!page || !PageHead(page), page);
+   if (likely(!PageSurplusCharge(page)))
+   return ret;
+   if (page->mem_cgroup == mc.from) {
+   ret = MC_TARGET_PAGE;
+   if (target) {
+   get_page(page);
+   target->page = page;
+   }
+   }
+
+   return ret;
+}
+
+static int hugetlb_count_precharge_pte_range(pte_t *pte, unsigned long hmask,
+   unsigned long addr, unsigned long end,
+   struct mm_walk *walk)
+{
+   struct vm_area_struct *vma = walk->vma;
+   struct mm_struct *mm = walk->mm;
+   spinlock_t *ptl;
+   union mc_target target;
+
+   ptl = huge_pte_lock(hstate_vma(vma), mm, pte);
+   if (get_mctgt_type_hugetlb(vma, addr, pte, ) == MC_TARGET_PAGE) {
+   mc.precharge += (1 << compound_order(target.page));
+   put_page(target.page);
+   }
+   spin_unlock(ptl);
+
+   return 0;
+}
+
+static int hugetlb_move_charge_pte_range(pte_t *pte, unsigned long hmask,
+   unsigned long addr, unsigned long end,
+   struct mm_walk *walk)
+{
+   struct vm_area_struct *vma = walk->vma;
+   struct mm_struct *mm = walk->mm;
+   spinlock_t *ptl;
+   enum mc_target_type target_type;
+   union mc_target target;
+   struct page *page;
+   unsigned long nr_pages;
+
+   ptl = huge_pte_lock(hstate_vma(vma), mm, pte);
+   target_type = get_mctgt_type_hugetlb(vma, addr, pte, );
+   if (target_type == MC_TARGET_PAGE) {
+   page = target.page;
+   nr_pages = (1 << compound_order(page));
+   if (mc.precharge < nr_pages) {
+   put_page(page);
+   goto unlock;
+   }
+   if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) {
+   mc.precharge -= nr_pages;
+   mc.moved_charge += nr_pages;
+   }
+   put_page(page);
+   }
+unlock:
+   spin_unlock(ptl);
+
+   return 0;
+}
+#else
+static int hugetlb_count_precharge_pte_range(pte_t *pte, unsigned long hmask,
+   unsigned long addr, unsigned long end,
+   struct mm_walk *walk)
+{
+   return 0;
+}
+
+static int hugetlb_move_charge_pte_range(pte_t *pte, unsigned long hmask,
+   unsigned long addr, unsigned long end,
+   struct mm_walk *walk)
+{
+   return 0;
+}
+#endif
+
 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 {
unsigned long precharge;

struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
+   .hugetlb_entry = hugetlb_count_precharge_pte_range,
.mm = mm,
};
down_read(>mmap_sem);
@@ -4981,6 +5079,7 @@ static void mem_cgroup_move_charge(void)
 {
struct mm_walk mem_cgroup_move_charge_walk = {
.pmd_entry = mem_cgroup_move_charge_pte_range,
+   .hugetlb_entry = hugetlb_move_charge_pte_range,
.mm = mc.mm,
};

-- 
Tsukada

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 6/7] Documentation, hugetlb: describe about charge_surplus_hugepages,

2018-05-17 Thread TSUKADA Koutaro
Add a description about charge_surplus_hugepages.

Signed-off-by: TSUKADA Koutaro 
---
 hugetlbpage.txt |6 ++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index faf077d..af8d112 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -129,6 +129,11 @@ number of "surplus" huge pages from the kernel's normal 
page pool, when the
 persistent huge page pool is exhausted. As these surplus huge pages become
 unused, they are freed back to the kernel's normal page pool.

+/proc/sys/vm/charge_surplus_hugepages indicates to charge "surplus" huge pages
+obtained from the normal page pool to memory cgroup. If true, the amount to be
+overcommitted is limited within memory usage allowed by the memory cgroup to
+which the task belongs. The default value is false.
+
 When increasing the huge page pool size via nr_hugepages, any existing surplus
 pages will first be promoted to persistent huge pages.  Then, additional
 huge pages will be allocated, if necessary and if possible, to fulfill
@@ -169,6 +174,7 @@ Inside each of these directories, the same set of files 
will exist:
free_hugepages
resv_hugepages
surplus_hugepages
+   charge_surplus_hugepages

 which function as described above for the default huge page-sized case.

-- 
Tsukada


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 5/7] hugetlb: add charge_surplus_hugepages attribute

2018-05-17 Thread TSUKADA Koutaro
Add an entry for charge_surplus_hugepages to sysfs.

Signed-off-by: TSUKADA Koutaro 
---
 hugetlb.c |   25 +
 1 file changed, 25 insertions(+)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9a9549c..2f9bdbc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2662,6 +2662,30 @@ static ssize_t surplus_hugepages_show(struct kobject 
*kobj,
 }
 HSTATE_ATTR_RO(surplus_hugepages);

+static ssize_t charge_surplus_hugepages_show(struct kobject *kobj,
+   struct kobj_attribute *attr, char *buf)
+{
+   struct hstate *h = kobj_to_hstate(kobj, NULL);
+   return sprintf(buf, "%d\n", h->charge_surplus_huge_pages);
+}
+
+static ssize_t charge_surplus_hugepages_store(struct kobject *kobj,
+   struct kobj_attribute *attr, const char *buf, size_t len)
+{
+   int err;
+   unsigned long input;
+   struct hstate *h = kobj_to_hstate(kobj, NULL);
+
+   err = kstrtoul(buf, 10, );
+   if (err)
+   return err;
+
+   h->charge_surplus_huge_pages = input ? true : false;
+
+   return len;
+}
+HSTATE_ATTR(charge_surplus_hugepages);
+
 static struct attribute *hstate_attrs[] = {
_hugepages_attr.attr,
_overcommit_hugepages_attr.attr,
@@ -2671,6 +2695,7 @@ static ssize_t surplus_hugepages_show(struct kobject 
*kobj,
 #ifdef CONFIG_NUMA
_hugepages_mempolicy_attr.attr,
 #endif
+   _surplus_hugepages_attr.attr,
NULL,
 };

-- 
Tsukada


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/7] hugetlb: introduce charge_surplus_huge_pages to struct hstate

2018-05-17 Thread TSUKADA Koutaro
The charge_surplus_huge_pages indicates to charge surplus huge pages
obtained from the normal page pool to memory cgroup. The default value is
false.

This patch implements the core part of charging surplus hugepages. Use the
private and mem_cgroup member of the second entry of compound hugepage for
surplus hugepage charging.

Mark when surplus hugepage is obtained from normal pool, and charge to
memory cgroup at alloc_huge_page. Once the mapping of the page is decided,
commit the charge. surplus hugepages will uncharge or cancel at
free_huge_page.

Signed-off-by: TSUKADA Koutaro 
---
 include/linux/hugetlb.h |2
 mm/hugetlb.c|  100 
 2 files changed, 102 insertions(+)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 36fa6a2..33fe5be 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -158,6 +158,7 @@ unsigned long hugetlb_change_protection(struct 
vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot);

 bool is_hugetlb_entry_migration(pte_t pte);
+bool PageSurplusCharge(struct page *page);

 #else /* !CONFIG_HUGETLB_PAGE */

@@ -338,6 +339,7 @@ struct hstate {
unsigned int nr_huge_pages_node[MAX_NUMNODES];
unsigned int free_huge_pages_node[MAX_NUMNODES];
unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+   bool charge_surplus_huge_pages; /* default to off */
 #ifdef CONFIG_CGROUP_HUGETLB
/* cgroup control files */
struct cftype cgroup_files[5];
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2186791..679c151f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "internal.h"

 int hugetlb_max_hstate __read_mostly;
@@ -1236,6 +1237,90 @@ static inline void ClearPageHugeTemporary(struct page 
*page)
page[2].mapping = NULL;
 }

+#define HUGETLB_SURPLUS_CHARGE 1UL
+
+bool PageSurplusCharge(struct page *page)
+{
+   if (!PageHuge(page))
+   return false;
+   return page[1].private == HUGETLB_SURPLUS_CHARGE;
+}
+
+static inline void SetPageSurplusCharge(struct page *page)
+{
+   page[1].private = HUGETLB_SURPLUS_CHARGE;
+}
+
+static inline void ClearPageSurplusCharge(struct page *page)
+{
+   page[1].private = 0;
+}
+
+static inline void
+set_surplus_hugepage_memcg(struct page *page, struct mem_cgroup *memcg)
+{
+   page[1].mem_cgroup = memcg;
+}
+
+static inline struct mem_cgroup *get_surplus_hugepage_memcg(struct page *page)
+{
+   return page[1].mem_cgroup;
+}
+
+static void surplus_hugepage_set_charge(struct hstate *h, struct page *page)
+{
+   if (likely(!h->charge_surplus_huge_pages))
+   return;
+   if (unlikely(!page))
+   return;
+   SetPageSurplusCharge(page);
+}
+
+static int surplus_hugepage_try_charge(struct page *page, struct mm_struct *mm)
+{
+   struct mem_cgroup *memcg;
+
+   if (likely(!PageSurplusCharge(page)))
+   return 0;
+
+   if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, , true)) {
+   /* mem_cgroup oom invoked */
+   ClearPageSurplusCharge(page);
+   return -ENOMEM;
+   }
+   set_surplus_hugepage_memcg(page, memcg);
+
+   return 0;
+}
+
+static void surplus_hugepage_commit_charge(struct page *page)
+{
+   struct mem_cgroup *memcg;
+
+   if (likely(!PageSurplusCharge(page)))
+   return;
+
+   memcg = get_surplus_hugepage_memcg(page);
+   mem_cgroup_commit_charge(page, memcg, false, true);
+   set_surplus_hugepage_memcg(page, NULL);
+}
+
+static void surplus_hugepage_finalize_charge(struct page *page)
+{
+   struct mem_cgroup *memcg;
+
+   if (likely(!PageSurplusCharge(page)))
+   return;
+
+   memcg = get_surplus_hugepage_memcg(page);
+   if (memcg)
+   mem_cgroup_cancel_charge(page, memcg, true);
+   else
+   mem_cgroup_uncharge(page);
+   set_surplus_hugepage_memcg(page, NULL);
+   ClearPageSurplusCharge(page);
+}
+
 void free_huge_page(struct page *page)
 {
/*
@@ -1248,6 +1333,8 @@ void free_huge_page(struct page *page)
(struct hugepage_subpool *)page_private(page);
bool restore_reserve;

+   surplus_hugepage_finalize_charge(page);
+
set_page_private(page, 0);
page->mapping = NULL;
VM_BUG_ON_PAGE(page_count(page), page);
@@ -1583,6 +1670,8 @@ static struct page *alloc_surplus_huge_page(struct hstate 
*h, gfp_t gfp_mask,
 out_unlock:
spin_unlock(_lock);

+   surplus_hugepage_set_charge(h, page);
+
return page;
 }

@@ -2062,6 +2151,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
spin_unlock(_lock);

+   if (unlikely(surplus_hugepage_try_charge(page, vma->vm_mm))) {
+  

[PATCH v2 0/7] mm: pages for hugetlb's overcommit may be able to charge to memcg

2018-05-17 Thread TSUKADA Koutaro
Thanks to Mike Kravetz for comment on the previous version patch.

The purpose of this patch-set is to make it possible to control whether or
not to charge surplus hugetlb pages obtained by overcommitting to memory
cgroup. In the future, I am trying to accomplish limiting the memory usage
of applications that use both normal pages and hugetlb pages by the memory
cgroup(not use the hugetlb cgroup).

Applications that use shared libraries like libhugetlbfs.so use both normal
pages and hugetlb pages, but we do not know how much to use each. Please
suppose you want to manage the memory usage of such applications by cgroup.
How do you set the memory cgroup and hugetlb cgroup limit when you want to
limit memory usage to 10GB?

If you set a limit of 10GB for each, the user can use a total of 20GB of
memory and can not limit it well. Since it is difficult to estimate the
ratio used by user of normal pages and hugetlb pages, setting limits of 2GB
to memory cgroup and 8GB to hugetlb cgroup is not very good idea. In such a
case, I thought that by using my patch-set, we could manage resources just
by setting 10GB as the limit of memory cgroup (there is no limit to hugetlb
cgroup).

In this patch-set, introduce the charge_surplus_huge_pages(boolean) to
struct hstate. If it is true, it charges to the memory cgroup to which the
task that obtained surplus hugepages belongs. If it is false, do nothing as
before, and the default value is false. The charge_surplus_huge_pages can
be controlled procfs or sysfs interfaces.

Since THP is very effective in environments with kernel page size of 4KB,
such as x86, there is no reason to positively use HugeTLBfs, so I think
that there is no situation to enable charge_surplus_huge_pages. However, in
some distributions such as arm64, the page size of the kernel is 64KB, and
the size of THP is too huge as 512MB, making it difficult to use. HugeTLBfs
may support multiple huge page sizes, and in such a special environment
there is a desire to use HugeTLBfs.

The patch set is for 4.17.0-rc3+. I don't know whether patch-set are
acceptable or not, so I just done a simple test.

Thanks,
Tsukada

TSUKADA Koutaro (7):
  hugetlb: introduce charge_surplus_huge_pages to struct hstate
  hugetlb: supports migrate charging for surplus hugepages
  memcg: use compound_order rather than hpage_nr_pages
  mm, sysctl: make charging surplus hugepages controllable
  hugetlb: add charge_surplus_hugepages attribute
  Documentation, hugetlb: describe about charge_surplus_hugepages
  memcg: supports movement of surplus hugepages statistics

 Documentation/vm/hugetlbpage.txt |6 +
 include/linux/hugetlb.h  |4 +
 kernel/sysctl.c  |7 +
 mm/hugetlb.c |  148 +++
 mm/memcontrol.c  |  109 +++-
 5 files changed, 269 insertions(+), 5 deletions(-)

-- 
Tsukada


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 4/7] mm, sysctl: make charging surplus hugepages controllable

2018-05-17 Thread TSUKADA Koutaro
Make charging of the default hugetlb surplus hugepages controllable by
/proc/sys/vm/charge_surplus_hugepages.

Signed-off-by: TSUKADA Koutaro 
---
 include/linux/hugetlb.h |2 ++
 kernel/sysctl.c |7 +++
 mm/hugetlb.c|   21 +
 3 files changed, 30 insertions(+)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 33fe5be..9314b07 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -80,6 +80,8 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate 
*h, long max_hpages,
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, 
loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t 
*, loff_t *);
+int hugetlb_charge_surplus_handler(struct ctl_table *, int, void __user *,
+   size_t *, loff_t *);
 int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, 
size_t *, loff_t *);

 #ifdef CONFIG_NUMA
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6a78cf7..d562d64 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1394,6 +1394,13 @@ static int sysrq_sysctl_handler(struct ctl_table *table, 
int write,
.mode   = 0644,
.proc_handler   = hugetlb_overcommit_handler,
},
+   {
+   .procname   = "charge_surplus_hugepages",
+   .data   = NULL,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = hugetlb_charge_surplus_handler,
+   },
 #endif
{
.procname   = "lowmem_reserve_ratio",
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2e7b543..9a9549c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3069,6 +3069,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, 
int write,
return ret;
 }

+int hugetlb_charge_surplus_handler(struct ctl_table *table, int write,
+   void __user *buffer, size_t *length, loff_t *ppos)
+{
+   struct hstate *h = _hstate;
+   int tmp, ret;
+
+   if (!hugepages_supported())
+   return -EOPNOTSUPP;
+
+   tmp = h->charge_surplus_huge_pages ? 1 : 0;
+   table->data = 
+   table->maxlen = sizeof(int);
+   ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+   if (ret)
+   goto out;
+
+   if (write)
+   h->charge_surplus_huge_pages = tmp ? true : false;
+out:
+   return ret;
+}
 #endif /* CONFIG_SYSCTL */

 void hugetlb_report_meminfo(struct seq_file *m)

--
Tsukada

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/7] memcg: use compound_order rather than hpage_nr_pages

2018-05-17 Thread TSUKADA Koutaro
The current memcg implementation assumes that the compound page is THP.
In order to be able to charge surplus hugepage, we use compound_order.

Signed-off-by: TSUKADA Koutaro 
---
 memcontrol.c |   10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd3df3..a8f1ff8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4483,7 +4483,7 @@ static int mem_cgroup_move_account(struct page *page,
   struct mem_cgroup *to)
 {
unsigned long flags;
-   unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
+   unsigned int nr_pages = compound ? (1 << compound_order(page)) : 1;
int ret;
bool anon;

@@ -5417,7 +5417,7 @@ int mem_cgroup_try_charge(struct page *page, struct 
mm_struct *mm,
  bool compound)
 {
struct mem_cgroup *memcg = NULL;
-   unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
+   unsigned int nr_pages = compound ? (1 << compound_order(page)) : 1;
int ret = 0;

if (mem_cgroup_disabled())
@@ -5478,7 +5478,7 @@ int mem_cgroup_try_charge(struct page *page, struct 
mm_struct *mm,
 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
  bool lrucare, bool compound)
 {
-   unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
+   unsigned int nr_pages = compound ? (1 << compound_order(page)) : 1;

VM_BUG_ON_PAGE(!page->mapping, page);
VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
@@ -5522,7 +5522,7 @@ void mem_cgroup_commit_charge(struct page *page, struct 
mem_cgroup *memcg,
 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
bool compound)
 {
-   unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
+   unsigned int nr_pages = compound ? (1 << compound_order(page)) : 1;

if (mem_cgroup_disabled())
return;
@@ -5729,7 +5729,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page 
*newpage)

/* Force-charge the new page. The old one will be freed soon */
compound = PageTransHuge(newpage);
-   nr_pages = compound ? hpage_nr_pages(newpage) : 1;
+   nr_pages = compound ? (1 << compound_order(newpage)) : 1;

page_counter_charge(>memory, nr_pages);
if (do_memsw_account())

-- 
Tsukada

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[no subject]

2018-05-17 Thread Sherri Gallagher
Please reply me back
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 2/6] cpuset: Add new v2 cpuset.sched.domain flag

2018-05-17 Thread Waiman Long
A new cpuset.sched.domain boolean flag is added to cpuset v2. This new
flag indicates that the CPUs in the current cpuset should be treated
as a separate scheduling domain. This new flag is owned by the parent
and will cause the CPUs in the cpuset to be removed from the effective
CPUs of its parent.

This is implemented internally by adding a new isolated_cpus mask that
holds the CPUs belonging to child scheduling domain cpusets so that:

isolated_cpus | effective_cpus = cpus_allowed
isolated_cpus & effective_cpus = 0

This new flag can only be turned on in a cpuset if its parent is either
root or a scheduling domain itself with non-empty cpu list. The state
of this flag cannot be changed if the cpuset has children.

Signed-off-by: Waiman Long 
---
 Documentation/cgroup-v2.txt |  22 
 kernel/cgroup/cpuset.c  | 237 +++-
 2 files changed, 256 insertions(+), 3 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index cf7bac6..54d9e22 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1514,6 +1514,28 @@ Cpuset Interface Files
it is a subset of "cpuset.mems".  Its value will be affected
by memory nodes hotplug events.
 
+  cpuset.sched.domain
+   A read-write single value file which exists on non-root
+   cpuset-enabled cgroups.  It is a binary value flag that accepts
+   either "0" (off) or a non-zero value (on).  This flag is set
+   by the parent and is not delegatable.
+
+   If set, it indicates that the CPUs in the current cgroup will
+   be the root of a scheduling domain.  The root cgroup is always
+   a scheduling domain.  There are constraints on where this flag
+   can be set.  It can only be set in a cgroup if all the following
+   conditions are true.
+
+   1) The parent cgroup is also a scheduling domain with a non-empty
+  cpu list.
+   2) The list of CPUs are exclusive, i.e. they are not shared by
+  any of its siblings.
+   3) There is no child cgroups with cpuset enabled.
+
+   Setting this flag will take the CPUs away from the effective
+   CPUs of the parent cgroup. Once it is set, this flag cannot be
+   cleared if there are any child cgroups with cpuset enabled.
+
 
 Device controller
 -
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 419b758..e1a1af0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -109,6 +109,9 @@ struct cpuset {
cpumask_var_t effective_cpus;
nodemask_t effective_mems;
 
+   /* Isolated CPUs for scheduling domain children */
+   cpumask_var_t isolated_cpus;
+
/*
 * This is old Memory Nodes tasks took on.
 *
@@ -134,6 +137,9 @@ struct cpuset {
 
/* for custom sched domain */
int relax_domain_level;
+
+   /* for isolated_cpus */
+   int isolation_count;
 };
 
 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
@@ -175,6 +181,7 @@ static inline bool task_has_mempolicy(struct task_struct 
*task)
CS_SCHED_LOAD_BALANCE,
CS_SPREAD_PAGE,
CS_SPREAD_SLAB,
+   CS_SCHED_DOMAIN,
 } cpuset_flagbits_t;
 
 /* convenient tests for these bits */
@@ -203,6 +210,11 @@ static inline int is_sched_load_balance(const struct 
cpuset *cs)
return test_bit(CS_SCHED_LOAD_BALANCE, >flags);
 }
 
+static inline int is_sched_domain(const struct cpuset *cs)
+{
+   return test_bit(CS_SCHED_DOMAIN, >flags);
+}
+
 static inline int is_memory_migrate(const struct cpuset *cs)
 {
return test_bit(CS_MEMORY_MIGRATE, >flags);
@@ -220,7 +232,7 @@ static inline int is_spread_slab(const struct cpuset *cs)
 
 static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
- (1 << CS_MEM_EXCLUSIVE)),
+ (1 << CS_MEM_EXCLUSIVE) | (1 << CS_SCHED_DOMAIN)),
 };
 
 /**
@@ -902,7 +914,19 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
 
-   cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+   /*
+* If parent has isolated CPUs, include them in the list
+* of allowable CPUs.
+*/
+   if (parent->isolation_count) {
+   cpumask_or(new_cpus, parent->effective_cpus,
+  parent->isolated_cpus);
+   cpumask_and(new_cpus, new_cpus, cpu_online_mask);
+   cpumask_and(new_cpus, new_cpus, cp->cpus_allowed);
+   } else {
+   cpumask_and(new_cpus, cp->cpus_allowed,
+   parent->effective_cpus);
+   }
 
/*
 * If it 

[PATCH v8 3/6] cpuset: Add cpuset.sched.load_balance flag to v2

2018-05-17 Thread Waiman Long
The sched.load_balance flag is needed to enable CPU isolation similar to
what can be done with the "isolcpus" kernel boot parameter. Its value
can only be changed in a scheduling domain with no child cpusets. On
a non-scheduling domain cpuset, the value of sched.load_balance is
inherited from its parent.

This flag is set by the parent and is not delegatable.

Signed-off-by: Waiman Long 
---
 Documentation/cgroup-v2.txt | 24 
 kernel/cgroup/cpuset.c  | 53 +
 2 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 54d9e22..071b634d 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1536,6 +1536,30 @@ Cpuset Interface Files
CPUs of the parent cgroup. Once it is set, this flag cannot be
cleared if there are any child cgroups with cpuset enabled.
 
+   A parent cgroup cannot distribute all its CPUs to child
+   scheduling domain cgroups unless its load balancing flag is
+   turned off.
+
+  cpuset.sched.load_balance
+   A read-write single value file which exists on non-root
+   cpuset-enabled cgroups.  It is a binary value flag that accepts
+   either "0" (off) or a non-zero value (on).  This flag is set
+   by the parent and is not delegatable.
+
+   When it is on, tasks within this cpuset will be load-balanced
+   by the kernel scheduler.  Tasks will be moved from CPUs with
+   high load to other CPUs within the same cpuset with less load
+   periodically.
+
+   When it is off, there will be no load balancing among CPUs on
+   this cgroup.  Tasks will stay in the CPUs they are running on
+   and will not be moved to other CPUs.
+
+   The initial value of this flag is "1".  This flag is then
+   inherited by child cgroups with cpuset enabled.  Its state
+   can only be changed on a scheduling domain cgroup with no
+   cpuset-enabled children.
+
 
 Device controller
 -
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e1a1af0..368e1b7 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -510,7 +510,7 @@ static int validate_change(struct cpuset *cur, struct 
cpuset *trial)
 
par = parent_cs(cur);
 
-   /* On legacy hiearchy, we must be a subset of our parent cpuset. */
+   /* On legacy hierarchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
@@ -1061,6 +1061,14 @@ static int update_isolated_cpumask(struct cpuset *cpuset,
goto out;
 
/*
+* A parent can't distribute all its CPUs to child scheduling
+* domain cpusets unless load balancing is off.
+*/
+   if (adding & !deleting && is_sched_load_balance(parent) &&
+   cpumask_equal(addmask, parent->effective_cpus))
+   goto out;
+
+   /*
 * Check if any CPUs in addmask or delmask are in a sibling cpuset.
 * An empty sibling cpus_allowed means it is the same as parent's
 * effective_cpus. This checking is skipped if the cpuset is dying.
@@ -1531,6 +1539,16 @@ static int update_flag(cpuset_flagbits_t bit, struct 
cpuset *cs,
 
domain_flag_changed = (is_sched_domain(cs) != is_sched_domain(trialcs));
 
+   /*
+* On default hierarchy, a load balance flag change is only allowed
+* in a scheduling domain with no child cpuset.
+*/
+   if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && balance_flag_changed &&
+  (!is_sched_domain(cs) || css_has_online_children(>css))) {
+   err = -EINVAL;
+   goto out;
+   }
+
if (domain_flag_changed) {
err = turning_on
? update_isolated_cpumask(cs, NULL, cs->cpus_allowed)
@@ -2187,6 +2205,14 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state 
*css, struct cftype *cft)
.flags = CFTYPE_NOT_ON_ROOT,
},
 
+   {
+   .name = "sched.load_balance",
+   .read_u64 = cpuset_read_u64,
+   .write_u64 = cpuset_write_u64,
+   .private = FILE_SCHED_LOAD_BALANCE,
+   .flags = CFTYPE_NOT_ON_ROOT,
+   },
+
{ } /* terminate */
 };
 
@@ -2200,19 +2226,38 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state 
*css, struct cftype *cft)
 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 {
struct cpuset *cs;
+   struct cgroup_subsys_state *errptr = ERR_PTR(-ENOMEM);
 
if (!parent_css)
return _cpuset.css;
 
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
-   return ERR_PTR(-ENOMEM);
+   return errptr;
if (!alloc_cpumask_var(>cpus_allowed, GFP_KERNEL))
goto free_cs;
if 

[PATCH v8 4/6] cpuset: Make generate_sched_domains() recognize isolated_cpus

2018-05-17 Thread Waiman Long
The generate_sched_domains() function and the hotplug code are modified
to make them use the newly introduced isolated_cpus mask for schedule
domains generation.

Signed-off-by: Waiman Long 
---
 kernel/cgroup/cpuset.c | 33 +
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 368e1b7..0e75f83 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -672,13 +672,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
int ndoms = 0;  /* number of sched domains in result */
int nslot;  /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
+   bool root_load_balance = is_sched_load_balance(_cpuset);
 
doms = NULL;
dattr = NULL;
csa = NULL;
 
/* Special case for the 99% of systems with one, full, sched domain */
-   if (is_sched_load_balance(_cpuset)) {
+   if (root_load_balance && !top_cpuset.isolation_count) {
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -701,6 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
csn = 0;
 
rcu_read_lock();
+   if (root_load_balance)
+   csa[csn++] = _cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, _cpuset) {
if (cp == _cpuset)
continue;
@@ -711,6 +714,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
 * parent's cpus, so just skip them, and then we call
 * update_domain_attr_tree() to calc relax_domain_level of
 * the corresponding sched domain.
+*
+* If root is load-balancing, we can skip @cp if it
+* is a subset of the root's effective_cpus.
 */
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
@@ -718,11 +724,16 @@ static int generate_sched_domains(cpumask_var_t **domains,
 housekeeping_cpumask(HK_FLAG_DOMAIN
continue;
 
+   if (root_load_balance &&
+   cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
+   continue;
+
if (is_sched_load_balance(cp))
csa[csn++] = cp;
 
-   /* skip @cp's subtree */
-   pos_css = css_rightmost_descendant(pos_css);
+   /* skip @cp's subtree if not a scheduling domain */
+   if (!is_sched_domain(cp))
+   pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
 
@@ -849,7 +860,12 @@ static void rebuild_sched_domains_locked(void)
 * passing doms with offlined cpu to partition_sched_domains().
 * Anyways, hotplug work item will rebuild sched domains.
 */
-   if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+   if (!top_cpuset.isolation_count &&
+   !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+   goto out;
+
+   if (top_cpuset.isolation_count &&
+  !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
 
/* Generate domain masks and attrs */
@@ -2624,6 +2640,11 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
cpumask_copy(_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
 
+   /*
+* If isolated_cpus is populated, it is likely that the check below
+* will produce a false positive on cpus_updated when the cpu list
+* isn't changed. It is extra work, but it is better to be safe.
+*/
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, _cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
@@ -2632,6 +2653,10 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
spin_lock_irq(_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, _cpus);
+
+   if (top_cpuset.isolation_count)
+   cpumask_andnot(_cpus, _cpus,
+   top_cpuset.isolated_cpus);
cpumask_copy(top_cpuset.effective_cpus, _cpus);
spin_unlock_irq(_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 1/6] cpuset: Enable cpuset controller in default hierarchy

2018-05-17 Thread Waiman Long
Given the fact that thread mode had been merged into 4.14, it is now
time to enable cpuset to be used in the default hierarchy (cgroup v2)
as it is clearly threaded.

The cpuset controller had experienced feature creep since its
introduction more than a decade ago. Besides the core cpus and mems
control files to limit cpus and memory nodes, there are a bunch of
additional features that can be controlled from the userspace. Some of
the features are of doubtful usefulness and may not be actively used.

This patch enables cpuset controller in the default hierarchy with
a minimal set of features, namely just the cpus and mems and their
effective_* counterparts.  We can certainly add more features to the
default hierarchy in the future if there is a real user need for them
later on.

Alternatively, with the unified hierarchy, it may make more sense
to move some of those additional cpuset features, if desired, to
memory controller or may be to the cpu controller instead of staying
with cpuset.

Signed-off-by: Waiman Long 
---
 Documentation/cgroup-v2.txt | 90 ++---
 kernel/cgroup/cpuset.c  | 48 ++--
 2 files changed, 130 insertions(+), 8 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 74cdeae..cf7bac6 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -53,11 +53,13 @@ v1 is available under Documentation/cgroup-v1/.
5-3-2. Writeback
  5-4. PID
5-4-1. PID Interface Files
- 5-5. Device
- 5-6. RDMA
-   5-6-1. RDMA Interface Files
- 5-7. Misc
-   5-7-1. perf_event
+ 5-5. Cpuset
+   5.5-1. Cpuset Interface Files
+ 5-6. Device
+ 5-7. RDMA
+   5-7-1. RDMA Interface Files
+ 5-8. Misc
+   5-8-1. perf_event
  5-N. Non-normative information
5-N-1. CPU controller root cgroup process behaviour
5-N-2. IO controller root cgroup process behaviour
@@ -1435,6 +1437,84 @@ through fork() or clone(). These will return -EAGAIN if 
the creation
 of a new process would cause a cgroup policy to be violated.
 
 
+Cpuset
+--
+
+The "cpuset" controller provides a mechanism for constraining
+the CPU and memory node placement of tasks to only the resources
+specified in the cpuset interface files in a task's current cgroup.
+This is especially valuable on large NUMA systems where placing jobs
+on properly sized subsets of the systems with careful processor and
+memory placement to reduce cross-node memory access and contention
+can improve overall system performance.
+
+The "cpuset" controller is hierarchical.  That means the controller
+cannot use CPUs or memory nodes not allowed in its parent.
+
+
+Cpuset Interface Files
+~~
+
+  cpuset.cpus
+   A read-write multiple values file which exists on non-root
+   cpuset-enabled cgroups.
+
+   It lists the CPUs allowed to be used by tasks within this
+   cgroup.  The CPU numbers are comma-separated numbers or
+   ranges.  For example:
+
+ # cat cpuset.cpus
+ 0-4,6,8-10
+
+   An empty value indicates that the cgroup is using the same
+   setting as the nearest cgroup ancestor with a non-empty
+   "cpuset.cpus" or all the available CPUs if none is found.
+
+   The value of "cpuset.cpus" stays constant until the next update
+   and won't be affected by any CPU hotplug events.
+
+  cpuset.cpus.effective
+   A read-only multiple values file which exists on non-root
+   cpuset-enabled cgroups.
+
+   It lists the onlined CPUs that are actually allowed to be
+   used by tasks within the current cgroup.  If "cpuset.cpus"
+   is empty, it shows all the CPUs from the parent cgroup that
+   will be available to be used by this cgroup.  Otherwise, it is
+   a subset of "cpuset.cpus".  Its value will be affected by CPU
+   hotplug events.
+
+  cpuset.mems
+   A read-write multiple values file which exists on non-root
+   cpuset-enabled cgroups.
+
+   It lists the memory nodes allowed to be used by tasks within
+   this cgroup.  The memory node numbers are comma-separated
+   numbers or ranges.  For example:
+
+ # cat cpuset.mems
+ 0-1,3
+
+   An empty value indicates that the cgroup is using the same
+   setting as the nearest cgroup ancestor with a non-empty
+   "cpuset.mems" or all the available memory nodes if none
+   is found.
+
+   The value of "cpuset.mems" stays constant until the next update
+   and won't be affected by any memory nodes hotplug events.
+
+  cpuset.mems.effective
+   A read-only multiple values file which exists on non-root
+   cpuset-enabled cgroups.
+
+   It lists the onlined memory nodes that are actually allowed to
+   be used by tasks within the current cgroup.  If "cpuset.mems"
+   is empty, it shows all the memory nodes from the parent cgroup
+   

[PATCH v8 6/6] cpuset: Allow reporting of sched domain generation info

2018-05-17 Thread Waiman Long
This patch enables us to report sched domain generation information.

If DYNAMIC_DEBUG is enabled, issuing the following command

  echo "file cpuset.c +p" > /sys/kernel/debug/dynamic_debug/control

and setting loglevel to 8 will allow the kernel to show what scheduling
domain changes are being made.

Signed-off-by: Waiman Long 
---
 kernel/cgroup/cpuset.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index fb8aa82b..8f586e8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -820,6 +820,12 @@ static int generate_sched_domains(cpumask_var_t **domains,
}
BUG_ON(nslot != ndoms);
 
+#ifdef CONFIG_DEBUG_KERNEL
+   for (i = 0; i < ndoms; i++)
+   pr_debug("generate_sched_domains dom %d: %*pbl\n", i,
+cpumask_pr_args(doms[i]));
+#endif
+
 done:
kfree(csa);
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 0/6] Enable cpuset controller in default hierarchy

2018-05-17 Thread Waiman Long
v8:
 - Remove cpuset.cpus.isolated and add a new cpuset.sched.domain flag
   and rework the code accordingly.

v7:
 - Add a root-only cpuset.cpus.isolated control file for CPU isolation.
 - Enforce that load_balancing can only be turned off on cpusets with
   CPUs from the isolated list.
 - Update sched domain generation to allow cpusets with CPUs only
   from the isolated CPU list to be in separate root domains.

v6:
 - Hide cpuset control knobs in root cgroup.
 - Rename effective_cpus and effective_mems to cpus.effective and
   mems.effective respectively.
 - Remove cpuset.flags and add cpuset.sched_load_balance instead
   as the behavior of sched_load_balance has changed and so is
   not a simple flag.
 - Update cgroup-v2.txt accordingly.

v5:
 - Add patch 2 to provide the cpuset.flags control knob for the
   sched_load_balance flag which should be the only feature that is
   essential as a replacement of the "isolcpus" kernel boot parameter.

v4:
 - Further minimize the feature set by removing the flags control knob.

v3:
 - Further trim the additional features down to just memory_migrate.
 - Update Documentation/cgroup-v2.txt.

v6 patch: https://lkml.org/lkml/2018/3/21/530
v7 patch: https://lkml.org/lkml/2018/4/19/448

The purpose of this patchset is to provide a basic set of cpuset control
files for cgroup v2. This basic set includes the non-root "cpus",
"mems", "sched.load_balance" and "sched.domain". The "cpus.effective"
and "mems.effective" will appear in all cpuset-enabled cgroups.

The new control file that is unique to v2 is "sched.domain". It is a
boolean flag file that designates if a cgroup is a scheduling domain
with its own set of unique list of CPUs from scheduling perspective
disjointed from other scheduling domains. The root cgroup is always a
scheduling domain. Multiple levels of scheduling domains are supported
with some limitations. So a container scheduling domain root can behave
like a real root.

When a scheduling domain cgroup is removed, its list of exclusive CPUs
will be returned to the parent's cpus.effective automatically.

The "sched.load_balance" flag can only be changed in a scheduling domain
with no child cpuset-enabled cgroups.

This patchset supports isolated CPUs in a child scheduling domain with
load balancing off. It also allows easy setup of multiple scheduling
domains without requiring the trick of turning load balancing off in the
root cgroup.

This patchset does not exclude the possibility of adding more features
in the future after careful consideration.

Patch 1 enables cpuset in cgroup v2 with cpus, mems and their
effective counterparts.

Patch 2 adds a new "sched.domain" control file for setting up multiple
scheduling domains. A scheduling domain implies cpu_exclusive.

Patch 3 adds a "sched.load_balance" flag to turn off load balancing in
a scheduling domain.

Patch 4 updates the scheduling domain generation code to work with
the new scheduling domain feature.

Patch 5 exposes cpus.effective and mems.effective to the root cgroup as
enabling child scheduling domains will take CPUs away from the root cgroup.
So it will be nice to monitor what CPUs are left there.

Patch 6 enables the printing the debug information about scheduling
domain generation.

Waiman Long (6):
  cpuset: Enable cpuset controller in default hierarchy
  cpuset: Add new v2 cpuset.sched.domain flag
  cpuset: Add cpuset.sched.load_balance flag to v2
  cpuset: Make generate_sched_domains() recognize isolated_cpus
  cpuset: Expose cpus.effective and mems.effective on cgroup v2 root
  cpuset: Allow reporting of sched domain generation info

 Documentation/cgroup-v2.txt | 136 +++-
 kernel/cgroup/cpuset.c  | 375 ++--
 2 files changed, 492 insertions(+), 19 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 5/6] cpuset: Expose cpus.effective and mems.effective on cgroup v2 root

2018-05-17 Thread Waiman Long
Because of the fact that setting the "cpuset.sched.domain" in a direct
child of root can remove CPUs from the root's effective CPU list, it
makes sense to know what CPUs are left in the root cgroup for scheduling
purpose. So the "cpuset.cpus.effective" control file is now exposed in
the v2 cgroup root.

For consistency, the "cpuset.mems.effective" control file is exposed
as well.

Signed-off-by: Waiman Long 
---
 Documentation/cgroup-v2.txt | 4 ++--
 kernel/cgroup/cpuset.c  | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 071b634d..8739b10 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1474,7 +1474,7 @@ Cpuset Interface Files
and won't be affected by any CPU hotplug events.
 
   cpuset.cpus.effective
-   A read-only multiple values file which exists on non-root
+   A read-only multiple values file which exists on all
cpuset-enabled cgroups.
 
It lists the onlined CPUs that are actually allowed to be
@@ -1504,7 +1504,7 @@ Cpuset Interface Files
and won't be affected by any memory nodes hotplug events.
 
   cpuset.mems.effective
-   A read-only multiple values file which exists on non-root
+   A read-only multiple values file which exists on all
cpuset-enabled cgroups.
 
It lists the onlined memory nodes that are actually allowed to
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0e75f83..fb8aa82b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2203,14 +2203,12 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state 
*css, struct cftype *cft)
.name = "cpus.effective",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_CPULIST,
-   .flags = CFTYPE_NOT_ON_ROOT,
},
 
{
.name = "mems.effective",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_MEMLIST,
-   .flags = CFTYPE_NOT_ON_ROOT,
},
 
{
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: rename HINTS_DEDICATED to KVM_HINTS_REALTIME

2018-05-17 Thread Michael S. Tsirkin
On Thu, May 17, 2018 at 03:46:58PM -0300, Eduardo Habkost wrote:
> On Thu, May 17, 2018 at 05:54:24PM +0300, Michael S. Tsirkin wrote:
> > HINTS_DEDICATED seems to be somewhat confusing:
> > 
> > Guest doesn't really care whether it's the only task running on a host
> > CPU as long as it's not preempted.
> > 
> > And there are more reasons for Guest to be preempted than host CPU
> > sharing, for example, with memory overcommit it can get preempted on a
> > memory access, post copy migration can cause preemption, etc.
> > 
> > Let's call it KVM_HINTS_REALTIME which seems to better
> > match what guests expect.
> > 
> > Also, the flag must be set on all vCPUs - current guests assume this.
> > Note so in the documentation.
> > 
> > Signed-off-by: Michael S. Tsirkin 
> > ---
> >  Documentation/virtual/kvm/cpuid.txt  | 6 +++---
> >  arch/x86/include/uapi/asm/kvm_para.h | 2 +-
> >  arch/x86/kernel/kvm.c| 8 
> >  3 files changed, 8 insertions(+), 8 deletions(-)
> > 
> > diff --git a/Documentation/virtual/kvm/cpuid.txt 
> > b/Documentation/virtual/kvm/cpuid.txt
> > index d4f33eb8..ab022dc 100644
> > --- a/Documentation/virtual/kvm/cpuid.txt
> > +++ b/Documentation/virtual/kvm/cpuid.txt
> > @@ -72,8 +72,8 @@ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||24 || host will 
> > warn if no guest-side
> >  
> >  flag   || value || meaning
> >  
> > ==
> > -KVM_HINTS_DEDICATED|| 0 || guest checks this feature 
> > bit to
> > -   ||   || determine if there is vCPU 
> > pinning
> > -   ||   || and there is no vCPU 
> > over-commitment,
> > +KVM_HINTS_REALTIME || 0 || guest checks this feature 
> > bit to
> > +   ||   || determine that vCPUs are 
> > never
> > +   ||   || preempted for an unlimited 
> > time,
> > ||   || allowing optimizations
> 
> My understanding of the original patch is that the intention is
> to tell the guest that it is very unlikely to be preempted,
> so it
> can choose a more appropriate spinlock implementation.  This
> description implies that the guest will never be preempted, which
> is a much stronger guarantee.

Note:

...  for an unlimited time.

> 
> Isn't this new description incompatible with existing usage of
> the hint, which might include people who just use vCPU pinning
> but no mlock?

Without mlock you should always use pv spinlocks.

Otherwise you risk blocking on a lock taken by
a VCPU that is in turn blocked on IO, where the IO
is not completing because CPU is being used up
spinning.

> -- 
> Eduardo
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: rename HINTS_DEDICATED to KVM_HINTS_REALTIME

2018-05-17 Thread Eduardo Habkost
On Thu, May 17, 2018 at 05:54:24PM +0300, Michael S. Tsirkin wrote:
> HINTS_DEDICATED seems to be somewhat confusing:
> 
> Guest doesn't really care whether it's the only task running on a host
> CPU as long as it's not preempted.
> 
> And there are more reasons for Guest to be preempted than host CPU
> sharing, for example, with memory overcommit it can get preempted on a
> memory access, post copy migration can cause preemption, etc.
> 
> Let's call it KVM_HINTS_REALTIME which seems to better
> match what guests expect.
> 
> Also, the flag must be set on all vCPUs - current guests assume this.
> Note so in the documentation.
> 
> Signed-off-by: Michael S. Tsirkin 
> ---
>  Documentation/virtual/kvm/cpuid.txt  | 6 +++---
>  arch/x86/include/uapi/asm/kvm_para.h | 2 +-
>  arch/x86/kernel/kvm.c| 8 
>  3 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/cpuid.txt 
> b/Documentation/virtual/kvm/cpuid.txt
> index d4f33eb8..ab022dc 100644
> --- a/Documentation/virtual/kvm/cpuid.txt
> +++ b/Documentation/virtual/kvm/cpuid.txt
> @@ -72,8 +72,8 @@ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||24 || host will 
> warn if no guest-side
>  
>  flag   || value || meaning
>  
> ==
> -KVM_HINTS_DEDICATED|| 0 || guest checks this feature bit 
> to
> -   ||   || determine if there is vCPU 
> pinning
> -   ||   || and there is no vCPU 
> over-commitment,
> +KVM_HINTS_REALTIME || 0 || guest checks this feature bit 
> to
> +   ||   || determine that vCPUs are never
> +   ||   || preempted for an unlimited 
> time,
> ||   || allowing optimizations

My understanding of the original patch is that the intention is
to tell the guest that it is very unlikely to be preempted, so it
can choose a more appropriate spinlock implementation.  This
description implies that the guest will never be preempted, which
is a much stronger guarantee.

Isn't this new description incompatible with existing usage of
the hint, which might include people who just use vCPU pinning
but no mlock?

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 1/3] drm: Add writeback connector type

2018-05-17 Thread Liviu Dudau
From: Brian Starkey 

Writeback connectors represent writeback engines which can write the
CRTC output to a memory framebuffer. Add a writeback connector type and
related support functions.

Drivers should initialize a writeback connector with
drm_writeback_connector_init() which takes care of setting up all the
writeback-specific details on top of the normal functionality of
drm_connector_init().

Writeback connectors have a WRITEBACK_FB_ID property, used to set the
output framebuffer, and a WRITEBACK_PIXEL_FORMATS blob used to expose the
supported writeback formats to userspace.

When a framebuffer is attached to a writeback connector with the
WRITEBACK_FB_ID property, it is used only once (for the commit in which
it was included), and userspace can never read back the value of
WRITEBACK_FB_ID. WRITEBACK_FB_ID can only be set if the connector is
attached to a CRTC.

Changes since v1:
 - Added drm_writeback.c + documentation
 - Added helper to initialize writeback connector in one go
 - Added core checks
 - Squashed into a single commit
 - Dropped the client cap
 - Writeback framebuffers are no longer persistent

Changes since v2:
 Daniel Vetter:
 - Subclass drm_connector to drm_writeback_connector
 - Relax check to allow CRTC to be set without an FB
 - Add some writeback_ prefixes
 - Drop PIXEL_FORMATS_SIZE property, as it was unnecessary
 Gustavo Padovan:
 - Add drm_writeback_job to handle writeback signalling centrally

Changes since v3:
 - Rebased
 - Rename PIXEL_FORMATS -> WRITEBACK_PIXEL_FORMATS

Changes since v4:
 - Embed a drm_encoder inside the drm_writeback_connector to
   reduce the amount of boilerplate code required from the drivers
   that are using it.

Changes since v5:
 - Added Rob Clark's atomic_commit() vfunc to connector helper
   funcs, so that writeback jobs are committed from atomic helpers
 - Updated create_writeback_properties() signature to return an
   error code rather than a boolean false for failure.
 - Free writeback job with the connector state rather than when
   doing the cleanup_work()

Cc: linux-doc@vger.kernel.org
Signed-off-by: Brian Starkey 
[rebased and fixed conflicts]
Signed-off-by: Mihail Atanassov 
[rebased and added atomic_commit() vfunc for writeback jobs]
Signed-off-by: Rob Clark 
Signed-off-by: Liviu Dudau 
---
 Documentation/gpu/drm-kms.rst|   9 +
 drivers/gpu/drm/Makefile |   2 +-
 drivers/gpu/drm/drm_atomic.c | 128 
 drivers/gpu/drm/drm_atomic_helper.c  |  30 +++
 drivers/gpu/drm/drm_connector.c  |   4 +-
 drivers/gpu/drm/drm_writeback.c  | 256 +++
 include/drm/drm_atomic.h |   3 +
 include/drm/drm_connector.h  |  13 ++
 include/drm/drm_mode_config.h|  15 ++
 include/drm/drm_modeset_helper_vtables.h |  11 +
 include/drm/drm_writeback.h  |  88 
 include/uapi/drm/drm_mode.h  |   1 +
 12 files changed, 558 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/drm_writeback.c
 create mode 100644 include/drm/drm_writeback.h

diff --git a/Documentation/gpu/drm-kms.rst b/Documentation/gpu/drm-kms.rst
index 1dffd1ac4cd44..809d403087f95 100644
--- a/Documentation/gpu/drm-kms.rst
+++ b/Documentation/gpu/drm-kms.rst
@@ -373,6 +373,15 @@ Connector Functions Reference
 .. kernel-doc:: drivers/gpu/drm/drm_connector.c
:export:
 
+Writeback Connectors
+
+
+.. kernel-doc:: drivers/gpu/drm/drm_writeback.c
+  :doc: overview
+
+.. kernel-doc:: drivers/gpu/drm/drm_writeback.c
+  :export:
+
 Encoder Abstraction
 ===
 
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 50093ff4479b4..3d708959b224c 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -18,7 +18,7 @@ drm-y   :=drm_auth.o drm_bufs.o drm_cache.o \
drm_encoder.o drm_mode_object.o drm_property.o \
drm_plane.o drm_color_mgmt.o drm_print.o \
drm_dumb_buffers.o drm_mode_config.o drm_vblank.o \
-   drm_syncobj.o drm_lease.o
+   drm_syncobj.o drm_lease.o drm_writeback.o
 
 drm-$(CONFIG_DRM_LIB_RANDOM) += lib/drm_random.o
 drm-$(CONFIG_DRM_VM) += drm_vm.o
diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
index 7d25c42f22dbc..3f1e4b894803b 100644
--- a/drivers/gpu/drm/drm_atomic.c
+++ b/drivers/gpu/drm/drm_atomic.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "drm_crtc_internal.h"
@@ -668,6 +669,45 @@ static void drm_atomic_crtc_print_state(struct drm_printer 
*p,
crtc->funcs->atomic_print_state(p, state);
 }
 
+/**
+ * drm_atomic_connector_check - check connector state
+ * @connector: connector to check
+ * @state: connector state to check
+ *
+ * Provides core sanity checks for connector state.
+ *
+ 

[PATCH v2 2/7] PCI: dwc: Add MSI-X callbacks handler

2018-05-17 Thread Gustavo Pimentel
Change pcie_raise_irq() signature, namely the interrupt_num variable type
from u8 to u16 to accommodate 2048 maximum MSI-X interrupts.

Add PCIe config space capability search function.

Add sysfs set/get interface to allow the change of EP MSI-X maximum number.

Add EP MSI-X callback for triggering interruptions.

Signed-off-by: Gustavo Pimentel 
---
Change v1->v2:
 - Nothing changed, just to follow the patch set version.

 drivers/pci/dwc/pci-dra7xx.c   |   2 +-
 drivers/pci/dwc/pcie-artpec6.c |   2 +-
 drivers/pci/dwc/pcie-designware-ep.c   | 146 -
 drivers/pci/dwc/pcie-designware-plat.c |   4 +-
 drivers/pci/dwc/pcie-designware.h  |  14 +++-
 5 files changed, 163 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/dwc/pci-dra7xx.c b/drivers/pci/dwc/pci-dra7xx.c
index f688204..bdf948b 100644
--- a/drivers/pci/dwc/pci-dra7xx.c
+++ b/drivers/pci/dwc/pci-dra7xx.c
@@ -370,7 +370,7 @@ static void dra7xx_pcie_raise_msi_irq(struct dra7xx_pcie 
*dra7xx,
 }
 
 static int dra7xx_pcie_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-enum pci_epc_irq_type type, u8 interrupt_num)
+enum pci_epc_irq_type type, u16 interrupt_num)
 {
struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pci);
diff --git a/drivers/pci/dwc/pcie-artpec6.c b/drivers/pci/dwc/pcie-artpec6.c
index 321b56c..9a2474b 100644
--- a/drivers/pci/dwc/pcie-artpec6.c
+++ b/drivers/pci/dwc/pcie-artpec6.c
@@ -428,7 +428,7 @@ static void artpec6_pcie_ep_init(struct dw_pcie_ep *ep)
 }
 
 static int artpec6_pcie_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
- enum pci_epc_irq_type type, u8 interrupt_num)
+ enum pci_epc_irq_type type, u16 interrupt_num)
 {
struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
diff --git a/drivers/pci/dwc/pcie-designware-ep.c 
b/drivers/pci/dwc/pcie-designware-ep.c
index 1eec441..e5f2377 100644
--- a/drivers/pci/dwc/pcie-designware-ep.c
+++ b/drivers/pci/dwc/pcie-designware-ep.c
@@ -40,6 +40,39 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum 
pci_barno bar)
__dw_pcie_ep_reset_bar(pci, bar, 0);
 }
 
+u8 __dw_pcie_ep_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
+ u8 cap)
+{
+   u8 cap_id, next_cap_ptr;
+   u16 reg;
+
+   reg = dw_pcie_readw_dbi(pci, cap_ptr);
+   next_cap_ptr = (reg & 0xff00) >> 8;
+   cap_id = (reg & 0x00ff);
+
+   if (!next_cap_ptr || cap_id > PCI_CAP_ID_MAX)
+   return 0;
+
+   if (cap_id == cap)
+   return cap_ptr;
+
+   return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap);
+}
+
+u8 dw_pcie_ep_find_capability(struct dw_pcie *pci, u8 cap)
+{
+   u8 next_cap_ptr;
+   u16 reg;
+
+   reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST);
+   next_cap_ptr = (reg & 0x00ff);
+
+   if (!next_cap_ptr)
+   return 0;
+
+   return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap);
+}
+
 static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 func_no,
   struct pci_epf_header *hdr)
 {
@@ -241,8 +274,47 @@ static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 
func_no, u8 encode_int)
return 0;
 }
 
+static int dw_pcie_ep_get_msix(struct pci_epc *epc, u8 func_no)
+{
+   struct dw_pcie_ep *ep = epc_get_drvdata(epc);
+   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+   u32 val, reg;
+
+   if (!ep->msix_cap)
+   return 0;
+
+   reg = ep->msix_cap + PCI_MSIX_FLAGS;
+   val = dw_pcie_readw_dbi(pci, reg);
+   if (!(val & PCI_MSIX_FLAGS_ENABLE))
+   return -EINVAL;
+
+   val &= PCI_MSIX_FLAGS_QSIZE;
+
+   return val;
+}
+
+static int dw_pcie_ep_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts)
+{
+   struct dw_pcie_ep *ep = epc_get_drvdata(epc);
+   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+   u32 val, reg;
+
+   if (!ep->msix_cap)
+   return 0;
+
+   reg = ep->msix_cap + PCI_MSIX_FLAGS;
+   val = dw_pcie_readw_dbi(pci, reg);
+   val &= ~PCI_MSIX_FLAGS_QSIZE;
+   val |= interrupts;
+   dw_pcie_dbi_ro_wr_en(pci);
+   dw_pcie_writew_dbi(pci, reg, val);
+   dw_pcie_dbi_ro_wr_dis(pci);
+
+   return 0;
+}
+
 static int dw_pcie_ep_raise_irq(struct pci_epc *epc, u8 func_no,
-   enum pci_epc_irq_type type, u8 interrupt_num)
+   enum pci_epc_irq_type type, u16 interrupt_num)
 {
struct dw_pcie_ep *ep = epc_get_drvdata(epc);
 
@@ -282,6 +354,8 @@ static const struct pci_epc_ops epc_ops = {
.unmap_addr = dw_pcie_ep_unmap_addr,
.set_msi= dw_pcie_ep_set_msi,
.get_msi= dw_pcie_ep_get_msi,
+   .set_msix   = dw_pcie_ep_set_msix,
+ 

[PATCH v2 7/7] tools: PCI: Add MSI-X support

2018-05-17 Thread Gustavo Pimentel
Add MSI-X support to pcitest tool.

Add 2 new IOCTL commands:
 - Allow reconfiguring the driver IRQ type at runtime.
 - Allow retrieving the currently configured driver IRQ type.

Modify pcitest.sh script to accommodate MSI-X interrupt tests.

Signed-off-by: Gustavo Pimentel 
---
Change v1->v2:
 - Allow IRQ type driver reconfiguring at runtime, following Kishon's
suggestion.

 include/uapi/linux/pcitest.h |  3 +++
 tools/pci/pcitest.c  | 51 +++-
 tools/pci/pcitest.sh | 15 +
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h
index 953cf03..cbf422e 100644
--- a/include/uapi/linux/pcitest.h
+++ b/include/uapi/linux/pcitest.h
@@ -16,5 +16,8 @@
 #define PCITEST_WRITE  _IOW('P', 0x4, unsigned long)
 #define PCITEST_READ   _IOW('P', 0x5, unsigned long)
 #define PCITEST_COPY   _IOW('P', 0x6, unsigned long)
+#define PCITEST_MSIX   _IOW('P', 0x7, int)
+#define PCITEST_SET_IRQTYPE_IOW('P', 0x8, int)
+#define PCITEST_GET_IRQTYPE_IO('P', 0x9)
 
 #endif /* __UAPI_LINUX_PCITEST_H */
diff --git a/tools/pci/pcitest.c b/tools/pci/pcitest.c
index 9074b47..af146bb 100644
--- a/tools/pci/pcitest.c
+++ b/tools/pci/pcitest.c
@@ -31,12 +31,17 @@
 #define BILLION 1E9
 
 static char *result[] = { "NOT OKAY", "OKAY" };
+static char *irq[] = { "LEGACY", "MSI", "MSI-X" };
 
 struct pci_test {
char*device;
charbarnum;
boollegacyirq;
unsigned intmsinum;
+   unsigned intmsixnum;
+   int irqtype;
+   boolset_irqtype;
+   boolget_irqtype;
boolread;
boolwrite;
boolcopy;
@@ -65,6 +70,24 @@ static int run_test(struct pci_test *test)
fprintf(stdout, "%s\n", result[ret]);
}
 
+   if (test->set_irqtype) {
+   ret = ioctl(fd, PCITEST_SET_IRQTYPE, test->irqtype);
+   fprintf(stdout, "SET IRQ TYPE TO %s:\t\t", irq[test->irqtype]);
+   if (ret < 0)
+   fprintf(stdout, "FAILED\n");
+   else
+   fprintf(stdout, "%s\n", result[ret]);
+   }
+
+   if (test->get_irqtype) {
+   ret = ioctl(fd, PCITEST_GET_IRQTYPE);
+   fprintf(stdout, "GET IRQ TYPE:\t\t");
+   if (ret < 0)
+   fprintf(stdout, "FAILED\n");
+   else
+   fprintf(stdout, "%s\n", irq[ret]);
+   }
+
if (test->legacyirq) {
ret = ioctl(fd, PCITEST_LEGACY_IRQ, 0);
fprintf(stdout, "LEGACY IRQ:\t");
@@ -83,6 +106,15 @@ static int run_test(struct pci_test *test)
fprintf(stdout, "%s\n", result[ret]);
}
 
+   if (test->msixnum > 0 && test->msixnum <= 2048) {
+   ret = ioctl(fd, PCITEST_MSIX, test->msixnum);
+   fprintf(stdout, "MSI-X%d:\t\t", test->msixnum);
+   if (ret < 0)
+   fprintf(stdout, "TEST FAILED\n");
+   else
+   fprintf(stdout, "%s\n", result[ret]);
+   }
+
if (test->write) {
ret = ioctl(fd, PCITEST_WRITE, test->size);
fprintf(stdout, "WRITE (%7ld bytes):\t\t", test->size);
@@ -133,7 +165,7 @@ int main(int argc, char **argv)
/* set default endpoint device */
test->device = "/dev/pci-endpoint-test.0";
 
-   while ((c = getopt(argc, argv, "D:b:m:lrwcs:")) != EOF)
+   while ((c = getopt(argc, argv, "D:b:m:x:i:Ilrwcs:")) != EOF)
switch (c) {
case 'D':
test->device = optarg;
@@ -151,6 +183,20 @@ int main(int argc, char **argv)
if (test->msinum < 1 || test->msinum > 32)
goto usage;
continue;
+   case 'x':
+   test->msixnum = atoi(optarg);
+   if (test->msixnum < 1 || test->msixnum > 2048)
+   goto usage;
+   continue;
+   case 'i':
+   test->irqtype = atoi(optarg);
+   if (test->irqtype < 0 || test->irqtype > 2)
+   goto usage;
+   test->set_irqtype = true;
+   continue;
+   case 'I':
+   test->get_irqtype = true;
+   continue;
case 'r':
test->read = true;
continue;
@@ -173,6 +219,9 @@ int main(int argc, char **argv)
"\t-D  PCI endpoint test device 
{default: /dev/pci-endpoint-test.0}\n"
"\t-b  BAR test (bar number between 
0..5)\n"
"\t-m  MSI test (msi number between 
1..32)\n"
+   "\t-x \tMSI-X test (msix number 
between 1..2048)\n"
+  

[PATCH v2 5/7] PCI: dwc: Add legacy interrupt callback handler

2018-05-17 Thread Gustavo Pimentel
Add a legacy interrupt callback handler. Currently the DesignWare IP doesn't
allow triggering legacy interrupts.

Signed-off-by: Gustavo Pimentel 
---
Change v1->v2:
 - Nothing changed, just to follow the patch set version.

 drivers/pci/dwc/pcie-designware-ep.c   | 10 ++
 drivers/pci/dwc/pcie-designware-plat.c |  3 +--
 drivers/pci/dwc/pcie-designware.h  |  6 ++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/dwc/pcie-designware-ep.c 
b/drivers/pci/dwc/pcie-designware-ep.c
index a4baa0d..9822127 100644
--- a/drivers/pci/dwc/pcie-designware-ep.c
+++ b/drivers/pci/dwc/pcie-designware-ep.c
@@ -370,6 +370,16 @@ static const struct pci_epc_ops epc_ops = {
.stop   = dw_pcie_ep_stop,
 };
 
+int dw_pcie_ep_raise_legacy_irq(struct dw_pcie_ep *ep, u8 func_no)
+{
+   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+   struct device *dev = pci->dev;
+
+   dev_err(dev, "EP cannot trigger legacy IRQs\n");
+
+   return -EINVAL;
+}
+
 int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no,
 u8 interrupt_num)
 {
diff --git a/drivers/pci/dwc/pcie-designware-plat.c 
b/drivers/pci/dwc/pcie-designware-plat.c
index 654dcb5..90a8c95 100644
--- a/drivers/pci/dwc/pcie-designware-plat.c
+++ b/drivers/pci/dwc/pcie-designware-plat.c
@@ -84,8 +84,7 @@ static int dw_plat_pcie_ep_raise_irq(struct dw_pcie_ep *ep, 
u8 func_no,
 
switch (type) {
case PCI_EPC_IRQ_LEGACY:
-   dev_err(pci->dev, "EP cannot trigger legacy IRQs\n");
-   return -EINVAL;
+   return dw_pcie_ep_raise_legacy_irq(ep, func_no);
case PCI_EPC_IRQ_MSI:
return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
case PCI_EPC_IRQ_MSIX:
diff --git a/drivers/pci/dwc/pcie-designware.h 
b/drivers/pci/dwc/pcie-designware.h
index a0ab12f..69e6e17 100644
--- a/drivers/pci/dwc/pcie-designware.h
+++ b/drivers/pci/dwc/pcie-designware.h
@@ -350,6 +350,7 @@ static inline int dw_pcie_allocate_domains(struct pcie_port 
*pp)
 void dw_pcie_ep_linkup(struct dw_pcie_ep *ep);
 int dw_pcie_ep_init(struct dw_pcie_ep *ep);
 void dw_pcie_ep_exit(struct dw_pcie_ep *ep);
+int dw_pcie_ep_raise_legacy_irq(struct dw_pcie_ep *ep, u8 func_no);
 int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no,
 u8 interrupt_num);
 int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
@@ -369,6 +370,11 @@ static inline void dw_pcie_ep_exit(struct dw_pcie_ep *ep)
 {
 }
 
+static inline int dw_pcie_ep_raise_legacy_irq(struct dw_pcie_ep *ep, u8 
func_no)
+{
+   return 0;
+}
+
 static inline int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no,
   u8 interrupt_num)
 {
-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 6/7] misc: pci_endpoint_test: Add MSI-X support

2018-05-17 Thread Gustavo Pimentel
Add MSI-X support and update driver documentation accordingly.

Add new driver parameter to allow interrupt type selection.

Add 2 new IOCTL commands:
 - Allow to reconfigure driver IRQ type in runtime.
 - Allow to retrieve current driver IRQ type configured.

Change Legacy/MSI/MSI-X test process, by having in a BAR:
 - Interrupt type triggered (added).
 - Interrupt ID number (moved from the command section).

Signed-off-by: Gustavo Pimentel 
---
Change v1->v2:
 - Allow IRQ type driver reconfiguring in runtime, following Kishon's
suggestion.

 Documentation/misc-devices/pci-endpoint-test.txt |   6 +
 drivers/misc/pci_endpoint_test.c | 261 +--
 drivers/pci/endpoint/functions/pci-epf-test.c|  81 +--
 3 files changed, 260 insertions(+), 88 deletions(-)

diff --git a/Documentation/misc-devices/pci-endpoint-test.txt 
b/Documentation/misc-devices/pci-endpoint-test.txt
index 4ebc359..58ccca4 100644
--- a/Documentation/misc-devices/pci-endpoint-test.txt
+++ b/Documentation/misc-devices/pci-endpoint-test.txt
@@ -10,6 +10,7 @@ The PCI driver for the test device performs the following 
tests
*) verifying addresses programmed in BAR
*) raise legacy IRQ
*) raise MSI IRQ
+   *) raise MSI-X IRQ
*) read data
*) write data
*) copy data
@@ -25,6 +26,11 @@ ioctl
  PCITEST_LEGACY_IRQ: Tests legacy IRQ
  PCITEST_MSI: Tests message signalled interrupts. The MSI number
  to be tested should be passed as argument.
+ PCITEST_MSIX: Tests message signalled interrupts. The MSI-X number
+ to be tested should be passed as argument.
+ PCITEST_SET_IRQTYPE: Changes driver IRQ type configuration. The IRQ type
+ should be passed as argument (0: Legacy, 1:MSI, 2:MSI-X).
+ PCITEST_GET_IRQTYPE: Gets driver IRQ type configuration.
  PCITEST_WRITE: Perform write tests. The size of the buffer should be passed
as argument.
  PCITEST_READ: Perform read tests. The size of the buffer should be passed
diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 7b37046..df2017f 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -35,38 +35,44 @@
 
 #include 
 
-#define DRV_MODULE_NAME"pci-endpoint-test"
-
-#define PCI_ENDPOINT_TEST_MAGIC0x0
-
-#define PCI_ENDPOINT_TEST_COMMAND  0x4
-#define COMMAND_RAISE_LEGACY_IRQ   BIT(0)
-#define COMMAND_RAISE_MSI_IRQ  BIT(1)
-#define MSI_NUMBER_SHIFT   2
-/* 6 bits for MSI number */
-#define COMMAND_READBIT(8)
-#define COMMAND_WRITE   BIT(9)
-#define COMMAND_COPYBIT(10)
-
-#define PCI_ENDPOINT_TEST_STATUS   0x8
-#define STATUS_READ_SUCCESS BIT(0)
-#define STATUS_READ_FAILBIT(1)
-#define STATUS_WRITE_SUCCESSBIT(2)
-#define STATUS_WRITE_FAIL   BIT(3)
-#define STATUS_COPY_SUCCESS BIT(4)
-#define STATUS_COPY_FAILBIT(5)
-#define STATUS_IRQ_RAISED   BIT(6)
-#define STATUS_SRC_ADDR_INVALID BIT(7)
-#define STATUS_DST_ADDR_INVALID BIT(8)
-
-#define PCI_ENDPOINT_TEST_LOWER_SRC_ADDR   0xc
+#define DRV_MODULE_NAME"pci-endpoint-test"
+
+#define IRQ_TYPE_LEGACY0
+#define IRQ_TYPE_MSI   1
+#define IRQ_TYPE_MSIX  2
+
+#define PCI_ENDPOINT_TEST_MAGIC0x0
+
+#define PCI_ENDPOINT_TEST_COMMAND  0x4
+#define COMMAND_RAISE_LEGACY_IRQ   BIT(0)
+#define COMMAND_RAISE_MSI_IRQ  BIT(1)
+#define COMMAND_RAISE_MSIX_IRQ BIT(2)
+#define COMMAND_READ   BIT(3)
+#define COMMAND_WRITE  BIT(4)
+#define COMMAND_COPY   BIT(5)
+
+#define PCI_ENDPOINT_TEST_STATUS   0x8
+#define STATUS_READ_SUCCESSBIT(0)
+#define STATUS_READ_FAIL   BIT(1)
+#define STATUS_WRITE_SUCCESS   BIT(2)
+#define STATUS_WRITE_FAIL  BIT(3)
+#define STATUS_COPY_SUCCESSBIT(4)
+#define STATUS_COPY_FAIL   BIT(5)
+#define STATUS_IRQ_RAISED  BIT(6)
+#define STATUS_SRC_ADDR_INVALIDBIT(7)
+#define STATUS_DST_ADDR_INVALIDBIT(8)
+
+#define PCI_ENDPOINT_TEST_LOWER_SRC_ADDR   0x0c
 #define PCI_ENDPOINT_TEST_UPPER_SRC_ADDR   0x10
 
 #define PCI_ENDPOINT_TEST_LOWER_DST_ADDR   0x14
 #define PCI_ENDPOINT_TEST_UPPER_DST_ADDR   0x18
 
-#define PCI_ENDPOINT_TEST_SIZE 0x1c
-#define PCI_ENDPOINT_TEST_CHECKSUM 0x20
+#define PCI_ENDPOINT_TEST_SIZE 0x1c
+#define PCI_ENDPOINT_TEST_CHECKSUM 0x20
+
+#define 

[PATCH v2 0/7] Add MSI-X support on pcitest tool

2018-05-17 Thread Gustavo Pimentel
Patch series made against Lorenzo's branches and also depends of:
 - pci/dwc
 - pci/endpoint

Add MSI-X support on pcitest tool.

Add new callbacks methods and handlers to trigger the MSI-X interrupts
on the EP DesignWare IP driver.

Allow to set/get MSI-X EP maximum capability number.

Rework on set/get and triggering MSI methods on EP DesignWare IP driver.

Add a new input parameter (msix) to pcitest tool to test MSI-X feature.

Update the pcitest.sh script to support MSI-X feature tests.


Gustavo Pimentel (7):
  PCI: endpoint: Add MSI-X interfaces
  PCI: dwc: Add MSI-X callbacks handler
  PCI: cadence: Update cdns_pcie_ep_raise_irq function signature
  PCI: dwc: Rework MSI callbacks handler
  PCI: dwc: Add legacy interrupt callback handler
  misc: pci_endpoint_test: Add MSI-X support
  tools: PCI: Add MSI-X support

 Documentation/misc-devices/pci-endpoint-test.txt |   6 +
 drivers/misc/pci_endpoint_test.c | 261 +--
 drivers/pci/cadence/pcie-cadence-ep.c|   3 +-
 drivers/pci/dwc/pci-dra7xx.c |   2 +-
 drivers/pci/dwc/pcie-artpec6.c   |   2 +-
 drivers/pci/dwc/pcie-designware-ep.c | 205 --
 drivers/pci/dwc/pcie-designware-plat.c   |   7 +-
 drivers/pci/dwc/pcie-designware.h|  31 +--
 drivers/pci/endpoint/functions/pci-epf-test.c|  81 +--
 drivers/pci/endpoint/pci-ep-cfs.c|  24 +++
 drivers/pci/endpoint/pci-epc-core.c  |  62 +-
 include/linux/pci-epc.h  |  13 +-
 include/linux/pci-epf.h  |   1 +
 include/uapi/linux/pcitest.h |   3 +
 tools/pci/pcitest.c  |  51 -
 tools/pci/pcitest.sh |  15 ++
 16 files changed, 639 insertions(+), 128 deletions(-)

-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 4/7] PCI: dwc: Rework MSI callbacks handler

2018-05-17 Thread Gustavo Pimentel
Remove duplicate defines located on pcie-designware.h file already
available on /include/uapi/linux/pci-regs.h file.

Add pci_epc_set_msi() maximum 32 interrupts validation.

Signed-off-by: Gustavo Pimentel 
---
Change v1->v2:
 - Nothing changed, just to follow the patch set version.

 drivers/pci/dwc/pcie-designware-ep.c | 49 
 drivers/pci/dwc/pcie-designware.h| 11 
 drivers/pci/endpoint/pci-epc-core.c  |  3 ++-
 3 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/drivers/pci/dwc/pcie-designware-ep.c 
b/drivers/pci/dwc/pcie-designware-ep.c
index e5f2377..a4baa0d 100644
--- a/drivers/pci/dwc/pcie-designware-ep.c
+++ b/drivers/pci/dwc/pcie-designware-ep.c
@@ -246,29 +246,38 @@ static int dw_pcie_ep_map_addr(struct pci_epc *epc, u8 
func_no,
 
 static int dw_pcie_ep_get_msi(struct pci_epc *epc, u8 func_no)
 {
-   int val;
struct dw_pcie_ep *ep = epc_get_drvdata(epc);
struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+   u32 val, reg;
+
+   if (!ep->msi_cap)
+   return 0;
 
-   val = dw_pcie_readw_dbi(pci, MSI_MESSAGE_CONTROL);
-   if (!(val & MSI_CAP_MSI_EN_MASK))
+   reg = ep->msi_cap + PCI_MSI_FLAGS;
+   val = dw_pcie_readw_dbi(pci, reg);
+   if (!(val & PCI_MSI_FLAGS_ENABLE))
return -EINVAL;
 
-   val = (val & MSI_CAP_MME_MASK) >> MSI_CAP_MME_SHIFT;
+   val = (val & PCI_MSI_FLAGS_QSIZE) >> 4;
+
return val;
 }
 
-static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 func_no, u8 encode_int)
+static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts)
 {
-   int val;
struct dw_pcie_ep *ep = epc_get_drvdata(epc);
struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+   u32 val, reg;
 
-   val = dw_pcie_readw_dbi(pci, MSI_MESSAGE_CONTROL);
-   val &= ~MSI_CAP_MMC_MASK;
-   val |= (encode_int << MSI_CAP_MMC_SHIFT) & MSI_CAP_MMC_MASK;
+   if (!ep->msi_cap)
+   return 0;
+
+   reg = ep->msi_cap + PCI_MSI_FLAGS;
+   val = dw_pcie_readw_dbi(pci, reg);
+   val &= ~PCI_MSI_FLAGS_QMASK;
+   val |= (interrupts << 1) & PCI_MSI_FLAGS_QMASK;
dw_pcie_dbi_ro_wr_en(pci);
-   dw_pcie_writew_dbi(pci, MSI_MESSAGE_CONTROL, val);
+   dw_pcie_writew_dbi(pci, reg, val);
dw_pcie_dbi_ro_wr_dis(pci);
 
return 0;
@@ -367,21 +376,29 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 
func_no,
struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
struct pci_epc *epc = ep->epc;
u16 msg_ctrl, msg_data;
-   u32 msg_addr_lower, msg_addr_upper;
+   u32 msg_addr_lower, msg_addr_upper, reg;
u64 msg_addr;
bool has_upper;
int ret;
 
+   if (!ep->msi_cap)
+   return 0;
+
/* Raise MSI per the PCI Local Bus Specification Revision 3.0, 6.8.1. */
-   msg_ctrl = dw_pcie_readw_dbi(pci, MSI_MESSAGE_CONTROL);
+   reg = ep->msi_cap + PCI_MSI_FLAGS;
+   msg_ctrl = dw_pcie_readw_dbi(pci, reg);
has_upper = !!(msg_ctrl & PCI_MSI_FLAGS_64BIT);
-   msg_addr_lower = dw_pcie_readl_dbi(pci, MSI_MESSAGE_ADDR_L32);
+   reg = ep->msi_cap + PCI_MSI_ADDRESS_LO;
+   msg_addr_lower = dw_pcie_readl_dbi(pci, reg);
if (has_upper) {
-   msg_addr_upper = dw_pcie_readl_dbi(pci, MSI_MESSAGE_ADDR_U32);
-   msg_data = dw_pcie_readw_dbi(pci, MSI_MESSAGE_DATA_64);
+   reg = ep->msi_cap + PCI_MSI_ADDRESS_HI;
+   msg_addr_upper = dw_pcie_readl_dbi(pci, reg);
+   reg = ep->msi_cap + PCI_MSI_DATA_64;
+   msg_data = dw_pcie_readw_dbi(pci, reg);
} else {
msg_addr_upper = 0;
-   msg_data = dw_pcie_readw_dbi(pci, MSI_MESSAGE_DATA_32);
+   reg = ep->msi_cap + PCI_MSI_DATA_32;
+   msg_data = dw_pcie_readw_dbi(pci, reg);
}
msg_addr = ((u64) msg_addr_upper) << 32 | msg_addr_lower;
ret = dw_pcie_ep_map_addr(epc, func_no, ep->msi_mem_phys, msg_addr,
diff --git a/drivers/pci/dwc/pcie-designware.h 
b/drivers/pci/dwc/pcie-designware.h
index b22c5bb..a0ab12f 100644
--- a/drivers/pci/dwc/pcie-designware.h
+++ b/drivers/pci/dwc/pcie-designware.h
@@ -96,17 +96,6 @@
 #define PCIE_GET_ATU_INB_UNR_REG_OFFSET(region)
\
((0x3 << 20) | ((region) << 9) | (0x1 << 8))
 
-#define MSI_MESSAGE_CONTROL0x52
-#define MSI_CAP_MMC_SHIFT  1
-#define MSI_CAP_MMC_MASK   (7 << MSI_CAP_MMC_SHIFT)
-#define MSI_CAP_MME_SHIFT  4
-#define MSI_CAP_MSI_EN_MASK0x1
-#define MSI_CAP_MME_MASK   (7 << MSI_CAP_MME_SHIFT)
-#define MSI_MESSAGE_ADDR_L32   0x54
-#define MSI_MESSAGE_ADDR_U32   0x58
-#define MSI_MESSAGE_DATA_320x58
-#define MSI_MESSAGE_DATA_640x5C
-
 #define MAX_MSI_IRQS   256
 #define 

[PATCH v2 1/7] PCI: endpoint: Add MSI-X interfaces

2018-05-17 Thread Gustavo Pimentel
Add PCI_EPC_IRQ_MSIX type.

Add MSI-X callbacks signatures to the ops structure.

Add sysfs interface for set/get MSI-X capability maximum number.

Change pci_epc_raise_irq() signature, namely the interrupt_num variable type
from u8 to u16 to accommodate 2048 maximum MSI-X interrupts.

Signed-off-by: Gustavo Pimentel 
---
Change v1->v2:
 - Nothing changed, just to follow the patch set version.

 drivers/pci/endpoint/pci-ep-cfs.c   | 24 +++
 drivers/pci/endpoint/pci-epc-core.c | 59 -
 include/linux/pci-epc.h | 13 ++--
 include/linux/pci-epf.h |  1 +
 4 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/endpoint/pci-ep-cfs.c 
b/drivers/pci/endpoint/pci-ep-cfs.c
index 018ea34..d1288a0 100644
--- a/drivers/pci/endpoint/pci-ep-cfs.c
+++ b/drivers/pci/endpoint/pci-ep-cfs.c
@@ -286,6 +286,28 @@ static ssize_t pci_epf_msi_interrupts_show(struct 
config_item *item,
   to_pci_epf_group(item)->epf->msi_interrupts);
 }
 
+static ssize_t pci_epf_msix_interrupts_store(struct config_item *item,
+const char *page, size_t len)
+{
+   u16 val;
+   int ret;
+
+   ret = kstrtou16(page, 0, );
+   if (ret)
+   return ret;
+
+   to_pci_epf_group(item)->epf->msix_interrupts = val;
+
+   return len;
+}
+
+static ssize_t pci_epf_msix_interrupts_show(struct config_item *item,
+   char *page)
+{
+   return sprintf(page, "%d\n",
+  to_pci_epf_group(item)->epf->msix_interrupts);
+}
+
 PCI_EPF_HEADER_R(vendorid)
 PCI_EPF_HEADER_W_u16(vendorid)
 
@@ -327,6 +349,7 @@ CONFIGFS_ATTR(pci_epf_, subsys_vendor_id);
 CONFIGFS_ATTR(pci_epf_, subsys_id);
 CONFIGFS_ATTR(pci_epf_, interrupt_pin);
 CONFIGFS_ATTR(pci_epf_, msi_interrupts);
+CONFIGFS_ATTR(pci_epf_, msix_interrupts);
 
 static struct configfs_attribute *pci_epf_attrs[] = {
_epf_attr_vendorid,
@@ -340,6 +363,7 @@ static struct configfs_attribute *pci_epf_attrs[] = {
_epf_attr_subsys_id,
_epf_attr_interrupt_pin,
_epf_attr_msi_interrupts,
+   _epf_attr_msix_interrupts,
NULL,
 };
 
diff --git a/drivers/pci/endpoint/pci-epc-core.c 
b/drivers/pci/endpoint/pci-epc-core.c
index b0ee427..a23aa75 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -137,7 +137,7 @@ EXPORT_SYMBOL_GPL(pci_epc_start);
  * Invoke to raise an MSI or legacy interrupt
  */
 int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no,
- enum pci_epc_irq_type type, u8 interrupt_num)
+ enum pci_epc_irq_type type, u16 interrupt_num)
 {
int ret;
unsigned long flags;
@@ -218,6 +218,63 @@ int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 
interrupts)
 EXPORT_SYMBOL_GPL(pci_epc_set_msi);
 
 /**
+ * pci_epc_get_msix() - get the number of MSI-X interrupt numbers allocated
+ * @epc: the EPC device to which MSI-X interrupts was requested
+ * @func_no: the endpoint function number in the EPC device
+ *
+ * Invoke to get the number of MSI-X interrupts allocated by the RC
+ */
+int pci_epc_get_msix(struct pci_epc *epc, u8 func_no)
+{
+   int interrupt;
+   unsigned long flags;
+
+   if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions)
+   return 0;
+
+   if (!epc->ops->get_msix)
+   return 0;
+
+   spin_lock_irqsave(>lock, flags);
+   interrupt = epc->ops->get_msix(epc, func_no);
+   spin_unlock_irqrestore(>lock, flags);
+
+   if (interrupt < 0)
+   return 0;
+
+   return interrupt + 1;
+}
+EXPORT_SYMBOL_GPL(pci_epc_get_msix);
+
+/**
+ * pci_epc_set_msix() - set the number of MSI-X interrupt numbers required
+ * @epc: the EPC device on which MSI-X has to be configured
+ * @func_no: the endpoint function number in the EPC device
+ * @interrupts: number of MSI-X interrupts required by the EPF
+ *
+ * Invoke to set the required number of MSI-X interrupts.
+ */
+int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts)
+{
+   int ret;
+   unsigned long flags;
+
+   if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions ||
+   interrupts < 1 || interrupts > 2048)
+   return -EINVAL;
+
+   if (!epc->ops->set_msix)
+   return 0;
+
+   spin_lock_irqsave(>lock, flags);
+   ret = epc->ops->set_msix(epc, func_no, interrupts - 1);
+   spin_unlock_irqrestore(>lock, flags);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(pci_epc_set_msix);
+
+/**
  * pci_epc_unmap_addr() - unmap CPU address from PCI address
  * @epc: the EPC device on which address is allocated
  * @func_no: the endpoint function number in the EPC device
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 243eaa5..c73abc2 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ 

[PATCH v2 3/7] PCI: cadence: Update cdns_pcie_ep_raise_irq function signature

2018-05-17 Thread Gustavo Pimentel
Change cdns_pcie_ep_raise_irq() signature, namely the interrupt_num
variable type from u8 to u16 to accommodate 2048 maximum MSI-X
interrupts.

Signed-off-by: Gustavo Pimentel 
Acked-by: Alan Douglas 
---
Change v1->v2:
 - Nothing changed, just to follow the patch set version.

 drivers/pci/cadence/pcie-cadence-ep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/cadence/pcie-cadence-ep.c 
b/drivers/pci/cadence/pcie-cadence-ep.c
index 3d8283e..6b713ca 100644
--- a/drivers/pci/cadence/pcie-cadence-ep.c
+++ b/drivers/pci/cadence/pcie-cadence-ep.c
@@ -363,7 +363,8 @@ static int cdns_pcie_ep_send_msi_irq(struct cdns_pcie_ep 
*ep, u8 fn,
 }
 
 static int cdns_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn,
- enum pci_epc_irq_type type, u8 interrupt_num)
+ enum pci_epc_irq_type type,
+ u16 interrupt_num)
 {
struct cdns_pcie_ep *ep = epc_get_drvdata(epc);
 
-- 
2.7.4


--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: rename HINTS_DEDICATED to KVM_HINTS_REALTIME

2018-05-17 Thread Paolo Bonzini
On 17/05/2018 16:54, Michael S. Tsirkin wrote:
> HINTS_DEDICATED seems to be somewhat confusing:
> 
> Guest doesn't really care whether it's the only task running on a host
> CPU as long as it's not preempted.
> 
> And there are more reasons for Guest to be preempted than host CPU
> sharing, for example, with memory overcommit it can get preempted on a
> memory access, post copy migration can cause preemption, etc.
> 
> Let's call it KVM_HINTS_REALTIME which seems to better
> match what guests expect.
> 
> Also, the flag must be set on all vCPUs - current guests assume this.
> Note so in the documentation.
> 
> Signed-off-by: Michael S. Tsirkin 
> ---
>  Documentation/virtual/kvm/cpuid.txt  | 6 +++---
>  arch/x86/include/uapi/asm/kvm_para.h | 2 +-
>  arch/x86/kernel/kvm.c| 8 
>  3 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/cpuid.txt 
> b/Documentation/virtual/kvm/cpuid.txt
> index d4f33eb8..ab022dc 100644
> --- a/Documentation/virtual/kvm/cpuid.txt
> +++ b/Documentation/virtual/kvm/cpuid.txt
> @@ -72,8 +72,8 @@ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||24 || host will 
> warn if no guest-side
>  
>  flag   || value || meaning
>  
> ==
> -KVM_HINTS_DEDICATED|| 0 || guest checks this feature bit 
> to
> -   ||   || determine if there is vCPU 
> pinning
> -   ||   || and there is no vCPU 
> over-commitment,
> +KVM_HINTS_REALTIME || 0 || guest checks this feature bit 
> to
> +   ||   || determine that vCPUs are never
> +   ||   || preempted for an unlimited 
> time,
> ||   || allowing optimizations
>  
> --
> diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
> b/arch/x86/include/uapi/asm/kvm_para.h
> index 4c851eb..0ede697 100644
> --- a/arch/x86/include/uapi/asm/kvm_para.h
> +++ b/arch/x86/include/uapi/asm/kvm_para.h
> @@ -29,7 +29,7 @@
>  #define KVM_FEATURE_PV_TLB_FLUSH 9
>  #define KVM_FEATURE_ASYNC_PF_VMEXIT  10
>  
> -#define KVM_HINTS_DEDICATED  0
> +#define KVM_HINTS_REALTIME  0
>  
>  /* The last 8 bits are used to indicate how to interpret the flags field
>   * in pvclock structure. If no bits are set, all flags are ignored.
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 7867417..5b2300b 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -457,7 +457,7 @@ static void __init sev_map_percpu_data(void)
>  static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
>  {
>   native_smp_prepare_cpus(max_cpus);
> - if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
> + if (kvm_para_has_hint(KVM_HINTS_REALTIME))
>   static_branch_disable(_spin_lock_key);
>  }
>  
> @@ -553,7 +553,7 @@ static void __init kvm_guest_init(void)
>   }
>  
>   if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
> - !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
> + !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
>   kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
>   pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
>  
> @@ -649,7 +649,7 @@ static __init int kvm_setup_pv_tlb_flush(void)
>   int cpu;
>  
>   if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
> - !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
> + !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
>   kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
>   for_each_possible_cpu(cpu) {
>   zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, 
> cpu),
> @@ -745,7 +745,7 @@ void __init kvm_spinlock_init(void)
>   if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
>   return;
>  
> - if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
> + if (kvm_para_has_hint(KVM_HINTS_REALTIME))
>   return;
>  
>   __pv_init_lock_hash();
> 

Queued, thanks.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm: rename HINTS_DEDICATED to KVM_HINTS_REALTIME

2018-05-17 Thread Michael S. Tsirkin
HINTS_DEDICATED seems to be somewhat confusing:

Guest doesn't really care whether it's the only task running on a host
CPU as long as it's not preempted.

And there are more reasons for Guest to be preempted than host CPU
sharing, for example, with memory overcommit it can get preempted on a
memory access, post copy migration can cause preemption, etc.

Let's call it KVM_HINTS_REALTIME which seems to better
match what guests expect.

Also, the flag must be set on all vCPUs - current guests assume this.
Note so in the documentation.

Signed-off-by: Michael S. Tsirkin 
---
 Documentation/virtual/kvm/cpuid.txt  | 6 +++---
 arch/x86/include/uapi/asm/kvm_para.h | 2 +-
 arch/x86/kernel/kvm.c| 8 
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Documentation/virtual/kvm/cpuid.txt 
b/Documentation/virtual/kvm/cpuid.txt
index d4f33eb8..ab022dc 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -72,8 +72,8 @@ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||24 || host will warn 
if no guest-side
 
 flag   || value || meaning
 
==
-KVM_HINTS_DEDICATED|| 0 || guest checks this feature bit to
-   ||   || determine if there is vCPU 
pinning
-   ||   || and there is no vCPU 
over-commitment,
+KVM_HINTS_REALTIME || 0 || guest checks this feature bit to
+   ||   || determine that vCPUs are never
+   ||   || preempted for an unlimited time,
||   || allowing optimizations
 
--
diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
b/arch/x86/include/uapi/asm/kvm_para.h
index 4c851eb..0ede697 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -29,7 +29,7 @@
 #define KVM_FEATURE_PV_TLB_FLUSH   9
 #define KVM_FEATURE_ASYNC_PF_VMEXIT10
 
-#define KVM_HINTS_DEDICATED  0
+#define KVM_HINTS_REALTIME  0
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7867417..5b2300b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -457,7 +457,7 @@ static void __init sev_map_percpu_data(void)
 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
 {
native_smp_prepare_cpus(max_cpus);
-   if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+   if (kvm_para_has_hint(KVM_HINTS_REALTIME))
static_branch_disable(_spin_lock_key);
 }
 
@@ -553,7 +553,7 @@ static void __init kvm_guest_init(void)
}
 
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
-   !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
+   !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
 
@@ -649,7 +649,7 @@ static __init int kvm_setup_pv_tlb_flush(void)
int cpu;
 
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
-   !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
+   !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
for_each_possible_cpu(cpu) {
zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, 
cpu),
@@ -745,7 +745,7 @@ void __init kvm_spinlock_init(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
 
-   if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+   if (kvm_para_has_hint(KVM_HINTS_REALTIME))
return;
 
__pv_init_lock_hash();
-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/7] i2c: clean up include/linux/i2c-*

2018-05-17 Thread Wolfram Sang
On Thu, Apr 19, 2018 at 10:00:06PM +0200, Wolfram Sang wrote:
> Move all plain platform_data includes to the platform_data-dir
> (except for i2c-pnx which can be moved into the driver itself).
> 
> My preference is to take these patches via the i2c tree. I can provide an
> immutable branch if needed. But we can also discuss those going in via
> arch-trees if dependencies are against us.

All applied to for-next!

The immutable branch is here:

git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git 
i2c/platform_data-immutable

Thanks,

   Wolfram



signature.asc
Description: PGP signature


Re: [PATCH v5 2/2] ThunderX2: Add Cavium ThunderX2 SoC UNCORE PMU driver

2018-05-17 Thread John Garry

On 16/05/2018 05:55, Ganapatrao Kulkarni wrote:

This patch adds a perf driver for the PMU UNCORE devices DDR4 Memory
Controller(DMC) and Level 3 Cache(L3C).



Hi,

Just some coding comments below:


ThunderX2 has 8 independent DMC PMUs to capture performance events
corresponding to 8 channels of DDR4 Memory Controller and 16 independent
L3C PMUs to capture events corresponding to 16 tiles of L3 cache.
Each PMU supports up to 4 counters. All counters lack overflow interrupt
and are sampled periodically.

Signed-off-by: Ganapatrao Kulkarni 
---
 drivers/perf/Kconfig |   8 +
 drivers/perf/Makefile|   1 +
 drivers/perf/thunderx2_pmu.c | 965 +++
 include/linux/cpuhotplug.h   |   1 +
 4 files changed, 975 insertions(+)
 create mode 100644 drivers/perf/thunderx2_pmu.c

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 28bb5a0..eafd0fc 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -85,6 +85,14 @@ config QCOM_L3_PMU
   Adds the L3 cache PMU into the perf events subsystem for
   monitoring L3 cache events.

+config THUNDERX2_PMU
+bool "Cavium ThunderX2 SoC PMU UNCORE"
+depends on ARCH_THUNDER2 && PERF_EVENTS && ACPI


Is the explicit dependency for PERF_EVENTS required, since we're under 
the PERF_EVENTS menu?


And IIRC for other perf drivers we required a dependency on ARM64 - is 
that required here also? I see arm_smccc_smc() calls in the code...



+   help
+ Provides support for ThunderX2 UNCORE events.
+ The SoC has PMU support in its L3 cache controller (L3C) and
+ in the DDR4 Memory Controller (DMC).
+
 config XGENE_PMU
 depends on ARCH_XGENE
 bool "APM X-Gene SoC PMU"
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index b3902bd..909f27f 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -7,5 +7,6 @@ obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
+obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
 obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c
new file mode 100644
index 000..0401443
--- /dev/null
+++ b/drivers/perf/thunderx2_pmu.c
@@ -0,0 +1,965 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CAVIUM THUNDERX2 SoC PMU UNCORE
+ *
+ * Copyright (C) 2018 Cavium Inc.
+ * Author: Ganapatrao Kulkarni 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */


Isn't this the same as the SPDX?


+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* L3c and DMC has 16 and 8 channels per socket respectively.


L3C, right?


+ * Each Channel supports UNCORE PMU device and consists of
+ * 4 independent programmable counters. Counters are 32 bit
+ * and does not support overflow interrupt, they needs to be


/s/needs/need/, /s/does/do/


+ * sampled before overflow(i.e, at every 2 seconds).


how can you ensure that this value is low enough?

"I saw this comment in previous patch:
> Given that all channels compete for access to the muxed register
> interface, I suspect we need to try more often than once every 2
> seconds...

2 seconds seems to be sufficient. So far testing looks good."

Can you provide any more analytical reasoning than this?


+ */
+
+#define UNCORE_MAX_COUNTERS4
+#define UNCORE_L3_MAX_TILES16
+#define UNCORE_DMC_MAX_CHANNELS8
+
+#define UNCORE_HRTIMER_INTERVAL(2 * NSEC_PER_SEC)
+#define GET_EVENTID(ev)((ev->hw.config) & 0x1ff)
+#define GET_COUNTERID(ev)  ((ev->hw.idx) & 0xf)
+#define GET_CHANNELID(pmu_uncore)  (pmu_uncore->channel)
+#define DMC_EVENT_CFG(idx, val)((val) << (((idx) * 8) + 1))
+
+#define DMC_COUNTER_CTL0x234
+#define DMC_COUNTER_DATA   0x240
+#define L3C_COUNTER_CTL0xA8
+#define L3C_COUNTER_DATA   0xAC


I feel it's generally better to keep register offsets in numeric order 
(if indeed, that is what they are)



+
+#define THUNDERX2_SMC_CALL_ID  0xC200FF00
+#define THUNDERX2_SMC_SET_CHANNEL  0xB010
+
+enum thunderx2_uncore_l3_events {
+   L3_EVENT_NONE,
+   

Re: [PATCH v4 19/31] Documentation: kconfig: document a new Kconfig macro language

2018-05-17 Thread Masahiro Yamada
2018-05-17 15:38 GMT+09:00 Kees Cook :
> On Wed, May 16, 2018 at 11:16 PM, Masahiro Yamada
>  wrote:
>> Add a document for the macro language introduced to Kconfig.
>>
>> The motivation of this work is to move the compiler option tests to
>> Kconfig from Makefile.  A number of kernel features require the
>> compiler support.  Enabling such features blindly in Kconfig ends up
>> with a lot of nasty build-time testing in Makefiles.  If a chosen
>> feature turns out unsupported by the compiler, what the build system
>> can do is either to disable it (silently!) or to forcibly break the
>> build, despite Kconfig has let the user to enable it.  By moving the
>> compiler capability tests to Kconfig, features unsupported by the
>> compiler will be hidden automatically.
>>
>> This change was strongly prompted by Linus Torvalds.  You can find
>> his suggestions [1] [2] in ML.  The original idea was to add a new
>> attribute with 'option shell=...', but I found more generalized text
>> expansion would make Kconfig more powerful and lovely.  The basic
>> ideas are from Make, but there are some differences.
>>
>> [1]: https://lkml.org/lkml/2016/12/9/577
>> [2]: https://lkml.org/lkml/2018/2/7/527
>>
>> Signed-off-by: Masahiro Yamada 
>
> (Added Randy, Jon, and linux-doc to CC for more review)
>
> This should likely be written in .rst and linked to from the developer 
> index...
>
> https://www.kernel.org/doc/html/latest/doc-guide/sphinx.html#writing-documentation
>
> As for the content, though:
>
> Reviewed-by: Kees Cook 
>
> -Kees

At least, nothing in Documentation/kbuild/ has been
converted to ReST yet.





>> ---
>>
>> Changes in v4:
>>  - Update according to the syntax change
>>
>> Changes in v3:
>>  - Newly added
>>
>> Changes in v2: None
>>
>>  Documentation/kbuild/kconfig-macro-language.txt | 252 
>> 
>>  MAINTAINERS |   2 +-
>>  2 files changed, 253 insertions(+), 1 deletion(-)
>>  create mode 100644 Documentation/kbuild/kconfig-macro-language.txt
>>
>> diff --git a/Documentation/kbuild/kconfig-macro-language.txt 
>> b/Documentation/kbuild/kconfig-macro-language.txt
>> new file mode 100644
>> index 000..a8dc792
>> --- /dev/null
>> +++ b/Documentation/kbuild/kconfig-macro-language.txt
>> @@ -0,0 +1,252 @@
>> +Concept
>> +---
>> +
>> +The basic idea was inspired by Make. When we look at Make, we notice sort of
>> +two languages in one. One language describes dependency graphs consisting of
>> +targets and prerequisites. The other is a macro language for performing 
>> textual
>> +substitution.
>> +
>> +There is clear distinction between the two language stages. For example, you
>> +can write a makefile like follows:
>> +
>> +APP := foo
>> +SRC := foo.c
>> +CC := gcc
>> +
>> +$(APP): $(SRC)
>> +$(CC) -o $(APP) $(SRC)
>> +
>> +The macro language replaces the variable references with their expanded 
>> form,
>> +and handles as if the source file were input like follows:
>> +
>> +foo: foo.c
>> +gcc -o foo foo.c
>> +
>> +Then, Make analyzes the dependency graph and determines the targets to be
>> +updated.
>> +
>> +The idea is quite similar in Kconfig - it is possible to describe a Kconfig
>> +file like this:
>> +
>> +CC := gcc
>> +
>> +config CC_HAS_FOO
>> +def_bool $(shell, $(srctree)/scripts/gcc-check-foo.sh $(CC))
>> +
>> +The macro language in Kconfig processes the source file into the following
>> +intermediate:
>> +
>> +config CC_HAS_FOO
>> +def_bool y
>> +
>> +Then, Kconfig moves onto the evaluation stage to resolve inter-symbol
>> +dependency as explained in kconfig-language.txt.
>> +
>> +
>> +Variables
>> +-
>> +
>> +Like in Make, a variable in Kconfig works as a macro variable.  A macro
>> +variable is expanded "in place" to yield a text string that may then be
>> +expanded further. To get the value of a variable, enclose the variable name 
>> in
>> +$( ). The parentheses are required even for single-letter variable names; 
>> $X is
>> +a syntax error. The curly brace form as in ${CC} is not supported either.
>> +
>> +There are two types of variables: simply expanded variables and recursively
>> +expanded variables.
>> +
>> +A simply expanded variable is defined using the := assignment operator. Its
>> +righthand side is expanded immediately upon reading the line from the 
>> Kconfig
>> +file.
>> +
>> +A recursively expanded variable is defined using the = assignment operator.
>> +Its righthand side is simply stored as the value of the variable without
>> +expanding it in any way. Instead, the expansion is performed when the 
>> variable
>> +is used.
>> +
>> +There is another type of assignment operator; += is used to append text to a
>> +variable. The righthand side of += is expanded immediately if the lefthand
>> +side was originally 

Re: [PATCH v4 19/31] Documentation: kconfig: document a new Kconfig macro language

2018-05-17 Thread Kees Cook
On Wed, May 16, 2018 at 11:16 PM, Masahiro Yamada
 wrote:
> Add a document for the macro language introduced to Kconfig.
>
> The motivation of this work is to move the compiler option tests to
> Kconfig from Makefile.  A number of kernel features require the
> compiler support.  Enabling such features blindly in Kconfig ends up
> with a lot of nasty build-time testing in Makefiles.  If a chosen
> feature turns out unsupported by the compiler, what the build system
> can do is either to disable it (silently!) or to forcibly break the
> build, despite Kconfig has let the user to enable it.  By moving the
> compiler capability tests to Kconfig, features unsupported by the
> compiler will be hidden automatically.
>
> This change was strongly prompted by Linus Torvalds.  You can find
> his suggestions [1] [2] in ML.  The original idea was to add a new
> attribute with 'option shell=...', but I found more generalized text
> expansion would make Kconfig more powerful and lovely.  The basic
> ideas are from Make, but there are some differences.
>
> [1]: https://lkml.org/lkml/2016/12/9/577
> [2]: https://lkml.org/lkml/2018/2/7/527
>
> Signed-off-by: Masahiro Yamada 

(Added Randy, Jon, and linux-doc to CC for more review)

This should likely be written in .rst and linked to from the developer index...

https://www.kernel.org/doc/html/latest/doc-guide/sphinx.html#writing-documentation

As for the content, though:

Reviewed-by: Kees Cook 

-Kees

> ---
>
> Changes in v4:
>  - Update according to the syntax change
>
> Changes in v3:
>  - Newly added
>
> Changes in v2: None
>
>  Documentation/kbuild/kconfig-macro-language.txt | 252 
> 
>  MAINTAINERS |   2 +-
>  2 files changed, 253 insertions(+), 1 deletion(-)
>  create mode 100644 Documentation/kbuild/kconfig-macro-language.txt
>
> diff --git a/Documentation/kbuild/kconfig-macro-language.txt 
> b/Documentation/kbuild/kconfig-macro-language.txt
> new file mode 100644
> index 000..a8dc792
> --- /dev/null
> +++ b/Documentation/kbuild/kconfig-macro-language.txt
> @@ -0,0 +1,252 @@
> +Concept
> +---
> +
> +The basic idea was inspired by Make. When we look at Make, we notice sort of
> +two languages in one. One language describes dependency graphs consisting of
> +targets and prerequisites. The other is a macro language for performing 
> textual
> +substitution.
> +
> +There is clear distinction between the two language stages. For example, you
> +can write a makefile like follows:
> +
> +APP := foo
> +SRC := foo.c
> +CC := gcc
> +
> +$(APP): $(SRC)
> +$(CC) -o $(APP) $(SRC)
> +
> +The macro language replaces the variable references with their expanded form,
> +and handles as if the source file were input like follows:
> +
> +foo: foo.c
> +gcc -o foo foo.c
> +
> +Then, Make analyzes the dependency graph and determines the targets to be
> +updated.
> +
> +The idea is quite similar in Kconfig - it is possible to describe a Kconfig
> +file like this:
> +
> +CC := gcc
> +
> +config CC_HAS_FOO
> +def_bool $(shell, $(srctree)/scripts/gcc-check-foo.sh $(CC))
> +
> +The macro language in Kconfig processes the source file into the following
> +intermediate:
> +
> +config CC_HAS_FOO
> +def_bool y
> +
> +Then, Kconfig moves onto the evaluation stage to resolve inter-symbol
> +dependency as explained in kconfig-language.txt.
> +
> +
> +Variables
> +-
> +
> +Like in Make, a variable in Kconfig works as a macro variable.  A macro
> +variable is expanded "in place" to yield a text string that may then be
> +expanded further. To get the value of a variable, enclose the variable name 
> in
> +$( ). The parentheses are required even for single-letter variable names; $X 
> is
> +a syntax error. The curly brace form as in ${CC} is not supported either.
> +
> +There are two types of variables: simply expanded variables and recursively
> +expanded variables.
> +
> +A simply expanded variable is defined using the := assignment operator. Its
> +righthand side is expanded immediately upon reading the line from the Kconfig
> +file.
> +
> +A recursively expanded variable is defined using the = assignment operator.
> +Its righthand side is simply stored as the value of the variable without
> +expanding it in any way. Instead, the expansion is performed when the 
> variable
> +is used.
> +
> +There is another type of assignment operator; += is used to append text to a
> +variable. The righthand side of += is expanded immediately if the lefthand
> +side was originally defined as a simple variable. Otherwise, its evaluation 
> is
> +deferred.
> +
> +The variable reference can take parameters, in the following form:
> +
> +  $(name,arg1,arg2,arg3)
> +
> +You can consider the parameterized reference as a function. (more precisely,
> +"user-defined function" in the