Re: [PATCH 5.0 000/238] 5.0.4-stable review

2019-03-22 Thread Naresh Kamboju
On Fri, 22 Mar 2019 at 17:42, Greg Kroah-Hartman
 wrote:
>
> This is the start of the stable review cycle for the 5.0.4 release.
> There are 238 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Sun Mar 24 11:11:13 UTC 2019.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> 
> https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.0.4-rc1.gz
> or in the git tree and branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-5.0.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h
>

Results from Linaro’s test farm.
No regressions on arm64, arm, x86_64, and i386.

Summary


kernel: 5.0.4-rc1
git repo: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
git branch: linux-5.0.y
git commit: 6a3b25ca97204a3891527d88b6691b362fee82c8
git describe: v5.0.3-239-g6a3b25ca9720
Test details: 
https://qa-reports.linaro.org/lkft/linux-stable-rc-5.0-oe/build/v5.0.3-239-g6a3b25ca9720

No regressions (compared to build v5.0.3)


No fixes (compared to build v5.0.3)


Ran 23161 total tests in the following environments and test suites.

Environments
--
- dragonboard-410c
- hi6220-hikey
- i386
- juno-r2
- qemu_arm
- qemu_arm64
- qemu_i386
- qemu_x86_64
- x15
- x86

Test Suites
---
* boot
* install-android-platform-tools-r2600
* kselftest
* libhugetlbfs
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-cpuhotplug-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-timers-tests
* spectre-meltdown-checker-test
* ltp-fs-tests
* ltp-open-posix-tests
* kselftest-vsyscall-mode-native
* kselftest-vsyscall-mode-none

-- 
Linaro LKFT
https://lkft.linaro.org


Re: [PATCH 4.19 000/280] 4.19.31-stable review

2019-03-22 Thread Naresh Kamboju
On Fri, 22 Mar 2019 at 17:26, Greg Kroah-Hartman
 wrote:
>
> This is the start of the stable review cycle for the 4.19.31 release.
> There are 280 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Sun Mar 24 11:11:09 UTC 2019.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> 
> https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.31-rc1.gz
> or in the git tree and branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.19.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h
>

Results from Linaro’s test farm.
No regressions on arm64, arm, x86_64, and i386.

Summary


kernel: 4.19.31-rc1
git repo: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
git branch: linux-4.19.y
git commit: f4bc3dea377c7afbfe0a36d2c72e9031910349e9
git describe: v4.19.30-281-gf4bc3dea377c
Test details: 
https://qa-reports.linaro.org/lkft/linux-stable-rc-4.19-oe/build/v4.19.30-281-gf4bc3dea377c

No regressions (compared to build v4.19.30)


No fixes (compared to build v4.19.30)


Ran 23127 total tests in the following environments and test suites.

Environments
--
- dragonboard-410c - arm64
- hi6220-hikey - arm64
- i386
- juno-r2 - arm64
- qemu_arm
- qemu_arm64
- qemu_i386
- qemu_x86_64
- x15 - arm
- x86_64

Test Suites
---
* boot
* install-android-platform-tools-r2600
* kselftest
* libhugetlbfs
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-cpuhotplug-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-timers-tests
* spectre-meltdown-checker-test
* ltp-open-posix-tests
* kselftest-vsyscall-mode-native
* kselftest-vsyscall-mode-none

-- 
Linaro LKFT
https://lkft.linaro.org


Re: [PATCH 4.14 000/183] 4.14.108-stable review

2019-03-22 Thread Naresh Kamboju
On Fri, 22 Mar 2019 at 17:17, Greg Kroah-Hartman
 wrote:
>
> This is the start of the stable review cycle for the 4.14.108 release.
> There are 183 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Sun Mar 24 11:11:06 UTC 2019.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> 
> https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.14.108-rc1.gz
> or in the git tree and branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.14.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h

Results from Linaro’s test farm.
No regressions on arm64, arm, x86_64, and i386.

Summary


kernel: 4.14.108-rc1
git repo: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
git branch: linux-4.14.y
git commit: f85b59a43475b5bcca299c897549d4aff496dda2
git describe: v4.14.107-184-gf85b59a43475
Test details: 
https://qa-reports.linaro.org/lkft/linux-stable-rc-4.14-oe/build/v4.14.107-184-gf85b59a43475


No regressions (compared to build v4.14.107)

No fixes (compared to build v4.14.107)


Ran 22886 total tests in the following environments and test suites.

Environments
--
- dragonboard-410c - arm64
- hi6220-hikey - arm64
- i386
- juno-r2 - arm64
- qemu_arm
- qemu_arm64
- qemu_i386
- qemu_x86_64
- x15 - arm
- x86_64

Test Suites
---
* boot
* install-android-platform-tools-r2600
* kselftest
* libhugetlbfs
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-cpuhotplug-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-timers-tests
* spectre-meltdown-checker-test
* ltp-open-posix-tests
* kselftest-vsyscall-mode-native
* kselftest-vsyscall-mode-none

-- 
Linaro LKFT
https://lkft.linaro.org


Re: [PATCH 4.9 000/118] 4.9.165-stable review

2019-03-22 Thread Naresh Kamboju
On Fri, 22 Mar 2019 at 17:11, Greg Kroah-Hartman
 wrote:
>
> This is the start of the stable review cycle for the 4.9.165 release.
> There are 118 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Sun Mar 24 11:11:02 UTC 2019.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> 
> https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.9.165-rc1.gz
> or in the git tree and branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.9.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h
>

Results from Linaro’s test farm.
No regressions on arm64, arm, x86_64, and i386.

Summary


kernel: 4.9.165-rc1
git repo: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
git branch: linux-4.9.y
git commit: 7d2ac480f8d77207af9d191a51f9ae2036117da4
git describe: v4.9.164-119-g7d2ac480f8d7
Test details: 
https://qa-reports.linaro.org/lkft/linux-stable-rc-4.9-oe/build/v4.9.164-119-g7d2ac480f8d7


No regressions (compared to build v4.9.164)


No fixes (compared to build v4.9.164)

Ran 22660 total tests in the following environments and test suites.

Environments
--
- dragonboard-410c - arm64
- hi6220-hikey - arm64
- i386
- juno-r2 - arm64
- qemu_arm
- qemu_arm64
- qemu_i386
- qemu_x86_64
- x15 - arm
- x86_64

Test Suites
---
* boot
* install-android-platform-tools-r2600
* kselftest
* libhugetlbfs
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-cpuhotplug-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-timers-tests
* spectre-meltdown-checker-test
* ltp-open-posix-tests
* kselftest-vsyscall-mode-native
* kselftest-vsyscall-mode-none

-- 
Linaro LKFT
https://lkft.linaro.org


Re: [PATCH 4.4 000/230] 4.4.177-stable review

2019-03-22 Thread Naresh Kamboju
On Fri, 22 Mar 2019 at 16:57, Greg Kroah-Hartman
 wrote:
>
> This is the start of the stable review cycle for the 4.4.177 release.
> There are 230 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Sun Mar 24 11:10:58 UTC 2019.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> 
> https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.4.177-rc1.gz
> or in the git tree and branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.4.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h
>

Results from Linaro’s test farm.
No regressions on arm64, arm, x86_64, and i386.

Summary


kernel: 4.4.177-rc1
git repo: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
git branch: linux-4.4.y
git commit: 6926dee23fa096361e7d170c7fa21a8b51eb7673
git describe: v4.4.176-231-g6926dee23fa0
Test details: 
https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.176-231-g6926dee23fa0


No regressions (compared to build v4.4.176)

No fixes (compared to build v4.4.176)


Ran 17857 total tests in the following environments and test suites.

Environments
--
- i386
- juno-r2 - arm64
- qemu_arm
- qemu_i386
- qemu_x86_64
- x15 - arm
- x86_64

Test Suites
---
* boot
* kselftest
* libhugetlbfs
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-cpuhotplug-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-open-posix-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-timers-tests
* spectre-meltdown-checker-test
* install-android-platform-tools-r2600
* kselftest-vsyscall-mode-native
* kselftest-vsyscall-mode-none

Summary


kernel: 4.4.177-rc1
git repo: https://git.linaro.org/lkft/arm64-stable-rc.git
git branch: 4.4.177-rc1-hikey-20190322-403
git commit: aedc0057d8f3e9e59eea8f9ee59fa773bec7914b
git describe: 4.4.177-rc1-hikey-20190322-403
Test details: 
https://qa-reports.linaro.org/lkft/linaro-hikey-stable-rc-4.4-oe/build/4.4.177-rc1-hikey-20190322-403


No regressions (compared to build 4.4.177-rc1-hikey-20190321-402)


No fixes (compared to build 4.4.177-rc1-hikey-20190321-402)

Ran 3000 total tests in the following environments and test suites.

Environments
--
- hi6220-hikey - arm64
- qemu_arm64

Test Suites
---
* boot
* install-android-platform-tools-r2600
* kselftest
* libhugetlbfs
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-cpuhotplug-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-timers-tests
* spectre-meltdown-checker-test

-- 
Linaro LKFT
https://lkft.linaro.org


Re: [PATCH 4.19 000/280] 4.19.31-stable review

2019-03-22 Thread Guenter Roeck

On 3/22/19 4:12 AM, Greg Kroah-Hartman wrote:

This is the start of the stable review cycle for the 4.19.31 release.
There are 280 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sun Mar 24 11:11:09 UTC 2019.
Anything received after that time might be too late.



Build results:
total: 156 pass: 156 fail: 0
Qemu test results:
total: 345 pass: 345 fail: 0

Guenter


[PATCH 04/10] mm: numa: promote pages to DRAM when it is accessed twice

2019-03-22 Thread Yang Shi
NUMA balancing promotes a page to DRAM as soon as it is accessed, but that
access might be a one-off.  To reduce migration thrashing and memory
bandwidth pressure, introduce a PG_promote flag to mark promotion
candidates.  A page is promoted to DRAM only when it is accessed a second
time, which filters out those one-off accesses.

The PG_promote flag is inherited by tail pages when a THP is split, but it
is not copied to the new page once the migration is done.

This approach is definitely not the optimal way to distinguish hot pages
from cold ones; doing so accurately may require a much more sophisticated
algorithm.  The kernel may not be the best place to implement such an
algorithm given the complexity and potential overhead, but it still needs
the basic capability.

With NUMA balancing, the whole working set of a process may eventually be
promoted to DRAM.  This relies on page reclaim demoting inactive pages to
PMEM, which is implemented by a following patch.

Signed-off-by: Yang Shi 
---
 include/linux/page-flags.h |  4 
 include/trace/events/mmflags.h |  3 ++-
 mm/huge_memory.c   | 10 ++
 mm/memory.c|  8 
 4 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 9f8712a..2d53166 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -131,6 +131,7 @@ enum pageflags {
PG_young,
PG_idle,
 #endif
+   PG_promote, /* Promote candidate for NUMA balancing */
__NR_PAGEFLAGS,
 
/* Filesystems */
@@ -348,6 +349,9 @@ static inline void page_init_poison(struct page *page, 
size_t size)
 PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
 
+PAGEFLAG(Promote, promote, PF_ANY) __SETPAGEFLAG(Promote, promote, PF_ANY)
+   __CLEARPAGEFLAG(Promote, promote, PF_ANY)
+
 /*
  * Only test-and-set exist for PG_writeback.  The unconditional operators are
  * risky: they bypass page accounting.
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index a1675d4..f13c2a1 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -100,7 +100,8 @@
{1UL << PG_mappedtodisk,"mappedtodisk"  },  \
{1UL << PG_reclaim, "reclaim"   },  \
{1UL << PG_swapbacked,  "swapbacked"},  \
-   {1UL << PG_unevictable, "unevictable"   }   \
+   {1UL << PG_unevictable, "unevictable"   },  \
+   {1UL << PG_promote, "promote"   }   \
 IF_HAVE_PG_MLOCK(PG_mlocked,   "mlocked"   )   \
 IF_HAVE_PG_UNCACHED(PG_uncached,   "uncached"  )   \
 IF_HAVE_PG_HWPOISON(PG_hwpoison,   "hwpoison"  )   \
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 404acdc..8268a3c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1589,6 +1589,15 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
  haddr + HPAGE_PMD_SIZE);
}
 
+   /* Promote page to DRAM when referenced twice */
+   if (!(node_isset(page_nid, def_alloc_nodemask)) &&
+   !PagePromote(page)) {
+   SetPagePromote(page);
+   put_page(page);
+   page_nid = -1;
+   goto clear_pmdnuma;
+   }
+
/*
 * Migrate the THP to the requested node, returns with page unlocked
 * and access rights restored.
@@ -2396,6 +2405,7 @@ static void __split_huge_page_tail(struct page *head, int 
tail,
 (1L << PG_workingset) |
 (1L << PG_locked) |
 (1L << PG_unevictable) |
+(1L << PG_promote) |
 (1L << PG_dirty)));
 
/* ->mapping in first tail page is compound_mapcount */
diff --git a/mm/memory.c b/mm/memory.c
index 47fe250..2494c11 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3680,6 +3680,14 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
goto out;
}
 
+   /* Promote the non-DRAM page when it is referenced twice */
+   if (!(node_isset(page_nid, def_alloc_nodemask)) &&
+   !PagePromote(page)) {
+   SetPagePromote(page);
+   put_page(page);
+   goto out;
+   }
+
/* Migrate to the requested node */
migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated) {
-- 
1.8.3.1



[PATCH 10/10] doc: elaborate the PMEM allocation rule

2019-03-22 Thread Yang Shi
Non-DRAM nodes are excluded from the default allocation node mask;
elaborate on that rule in the documentation.

Signed-off-by: Yang Shi 
---
 Documentation/vm/numa.rst | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst
index 185d8a5..8c2fd5c 100644
--- a/Documentation/vm/numa.rst
+++ b/Documentation/vm/numa.rst
@@ -133,7 +133,7 @@ a subsystem allocates per CPU memory resources, for example.
 
 A typical model for making such an allocation is to obtain the node id of the
 node to which the "current CPU" is attached using one of the kernel's
-numa_node_id() or CPU_to_node() functions and then request memory from only
+numa_node_id() or cpu_to_node() functions and then request memory from only
 the node id returned.  When such an allocation fails, the requesting subsystem
 may revert to its own fallback path.  The slab kernel memory allocator is an
 example of this.  Or, the subsystem may choose to disable or not to enable
@@ -148,3 +148,8 @@ architectures transparently, kernel subsystems can use the 
numa_mem_id()
 or cpu_to_mem() function to locate the "local memory node" for the calling or
 specified CPU.  Again, this is the same node from which default, local page
 allocations will be attempted.
+
+If the architecture supports non-regular DRAM nodes, i.e. NVDIMM on x86, the
+non-DRAM nodes are hidden from default mode, IOW the default allocation
+would not end up on non-DRAM nodes, unless those nodes are specified
+explicitly by mempolicy. [see Documentation/vm/numa_memory_policy.txt.]
-- 
1.8.3.1
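
To see the documented behaviour in practice, one could query where a
default (no mempolicy) anonymous page lands; on a kernel with this series
applied it should report a DRAM node.  A minimal sketch using move_pages()
in query mode (an illustration only, assuming libnuma headers and linking
with -lnuma):

#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        void *buf = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        int status = -1;

        if (buf == MAP_FAILED)
                return 1;

        memset(buf, 1, psz);            /* fault the page in */

        /* move_pages() with a NULL nodes array only queries page locations. */
        if (move_pages(0, 1, &buf, NULL, &status, 0))
                perror("move_pages");
        else
                printf("default allocation landed on node %d\n", status);

        munmap(buf, psz);
        return 0;
}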



[PATCH 03/10] mm: mempolicy: promote page to DRAM for MPOL_HYBRID

2019-03-22 Thread Yang Shi
With MPOL_HYBRID, memory allocation may end up on a non-DRAM node, which
may not be optimal for performance.  Promote such pages to DRAM with NUMA
balancing for MPOL_HYBRID.

If DRAM nodes are specified, migrate to the specified nodes.  If no DRAM
node is specified, migrate to the local DRAM node.

Signed-off-by: Yang Shi 
---
 mm/mempolicy.c | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7d0a432..87bc691 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2339,6 +2339,7 @@ int mpol_misplaced(struct page *page, struct 
vm_area_struct *vma, unsigned long
struct zoneref *z;
int curnid = page_to_nid(page);
unsigned long pgoff;
+   nodemask_t nmask;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
int polnid = NUMA_NO_NODE;
@@ -2363,7 +2364,24 @@ int mpol_misplaced(struct page *page, struct 
vm_area_struct *vma, unsigned long
break;
 
case MPOL_HYBRID:
-   /* Fall through */
+   if (node_isset(curnid, pol->v.nodes) &&
+   node_isset(curnid, def_alloc_nodemask))
+   /* The page is already on DRAM node */
+   goto out;
+
+   /*
+* Promote to the DRAM node specified by the policy, or
+* the local DRAM node if no DRAM node is specified.
+*/
+   nodes_and(nmask, pol->v.nodes, def_alloc_nodemask);
+
+   z = first_zones_zonelist(
+   node_zonelist(numa_node_id(), GFP_HIGHUSER),
+   gfp_zone(GFP_HIGHUSER),
+   nodes_empty(nmask) ? &def_alloc_nodemask : &nmask);
+   polnid = z->zone->node;
+
+   break;
 
case MPOL_BIND:
 
-- 
1.8.3.1



Re: [PATCH 4.9 000/118] 4.9.165-stable review

2019-03-22 Thread Guenter Roeck

On 3/22/19 4:14 AM, Greg Kroah-Hartman wrote:

This is the start of the stable review cycle for the 4.9.165 release.
There are 118 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sun Mar 24 11:11:02 UTC 2019.
Anything received after that time might be too late.



Build results:
total: 172 pass: 172 fail: 0
Qemu test results:
total: 316 pass: 316 fail: 0

Guenter


Re: [PATCH 5.0 000/238] 5.0.4-stable review

2019-03-22 Thread Guenter Roeck

On 3/22/19 4:13 AM, Greg Kroah-Hartman wrote:

This is the start of the stable review cycle for the 5.0.4 release.
There are 238 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sun Mar 24 11:11:13 UTC 2019.
Anything received after that time might be too late.


Build results:
total: 159 pass: 159 fail: 0
Qemu test results:
total: 345 pass: 345 fail: 0

Guenter


Re: [PATCH 3.16 00/16] 3.16.64-rc1 review

2019-03-22 Thread Guenter Roeck

On 3/22/19 6:44 AM, Guenter Roeck wrote:

On 3/21/19 10:20 PM, Ben Hutchings wrote:

This is the start of the stable review cycle for the 3.16.64 release.
There are 16 patches in this series, which will be posted as responses
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Mon Mar 25 18:00:00 UTC 2019.
Anything received after that time might be too late.



Build results:
 total: 137 pass: 136 fail: 1
Failed builds:
 i386:tools/perf
Qemu test results:
 total: 222 pass: 221 fail: 1
Failed tests:
 arm:midway:multi_v7_defconfig:mem2G:ecx-2000:initrd

The failed qemu test simply hangs. The test passes with v3.16.63.
I started a bisect.



False positive.

Sorry for the noise.

Guenter


[PATCH 06/10] mm: vmscan: demote anon DRAM pages to PMEM node

2019-03-22 Thread Yang Shi
Since PMEM provides larger capacity than DRAM and much lower access
latency than disk, it is a good choice for a middle tier between DRAM and
disk in the page reclaim path.

With PMEM nodes, the demotion path of anonymous pages could be:

DRAM -> PMEM -> swap device

For the time being this patch demotes anonymous pages only, and demotes a
THP to PMEM as a whole.  This may cause expensive page reclaim and/or
compaction on the PMEM node if it comes under memory pressure.  But,
considering the capacity of PMEM and that allocation only happens on PMEM
when it is specified explicitly, such cases should not be that frequent.
So it seems worth keeping THPs whole instead of splitting them.

Demote pages to the closest non-DRAM node even when the system is
swapless.  The current page reclaim logic only scans the anon LRU when
swap is on and swappiness is set properly.  Demoting to PMEM does not
need to care whether swap is available, but reclaim from PMEM itself
still skips the anon LRU if swap is not available.

The demotion only happens between a DRAM node and its closest PMEM node;
demoting to a remote PMEM node is not allowed for now.

Also, define a new migration reason for demotion, called MR_DEMOTE.
Pages are demoted via async migration to avoid blocking.

Signed-off-by: Yang Shi 
---
 include/linux/migrate.h|  1 +
 include/trace/events/migrate.h |  3 +-
 mm/debug.c |  1 +
 mm/internal.h  | 22 ++
 mm/vmscan.c| 99 ++
 5 files changed, 107 insertions(+), 19 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index e13d9bf..78c8dda 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -25,6 +25,7 @@ enum migrate_reason {
MR_MEMPOLICY_MBIND,
MR_NUMA_MISPLACED,
MR_CONTIG_RANGE,
+   MR_DEMOTE,
MR_TYPES
 };
 
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 705b33d..c1d5b36 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -20,7 +20,8 @@
EM( MR_SYSCALL, "syscall_or_cpuset")\
EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind")  \
EM( MR_NUMA_MISPLACED,  "numa_misplaced")   \
-   EMe(MR_CONTIG_RANGE,"contig_range")
+   EM( MR_CONTIG_RANGE,"contig_range") \
+   EMe(MR_DEMOTE,  "demote")
 
 /*
  * First define the enums in the above macros to be exported to userspace
diff --git a/mm/debug.c b/mm/debug.c
index c0b31b6..cc0d7df 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -25,6 +25,7 @@
"mempolicy_mbind",
"numa_misplaced",
"cma",
+   "demote",
 };
 
 const struct trace_print_flags pageflag_names[] = {
diff --git a/mm/internal.h b/mm/internal.h
index 46ad0d8..0152300 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -303,6 +303,19 @@ static inline int find_next_best_node(int node, nodemask_t 
*used_node_mask,
 }
 #endif
 
+static inline bool has_nonram_online(void)
+{
+   int i = 0;
+
+   for_each_online_node(i) {
+   /* Have PMEM node online? */
+   if (!node_isset(i, def_alloc_nodemask))
+   return true;
+   }
+
+   return false;
+}
+
 /* mm/util.c */
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
@@ -565,5 +578,14 @@ static inline bool is_migrate_highatomic_page(struct page 
*page)
 }
 
 void setup_zone_pageset(struct zone *zone);
+
+#ifdef CONFIG_NUMA
 extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
+#else
+static inline struct page *alloc_new_node_page(struct page *page,
+  unsigned long node)
+{
+   return NULL;
+}
+#endif
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a5ad0b3..bdcab6b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1094,6 +1094,19 @@ static void page_check_dirty_writeback(struct page *page,
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
 }
 
+static inline bool is_demote_ok(struct pglist_data *pgdat)
+{
+   /* Current node is not DRAM node */
+   if (!node_isset(pgdat->node_id, def_alloc_nodemask))
+   return false;
+
+   /* No online PMEM node */
+   if (!has_nonram_online())
+   return false;
+
+   return true;
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -1106,6 +1119,7 @@ static unsigned long shrink_page_list(struct list_head 
*page_list,
 {
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
+   LIST_HEAD(demote_pages);
unsigned nr_reclaimed = 0;
 
memset(stat, 0, sizeof(*stat));
@@ -1262,6 +1276,22 @@ static unsigned long shrink_page_list(struct list_head 
*page_list,
}
 
/*
+* 

[PATCH 05/10] mm: page_alloc: make find_next_best_node could skip DRAM node

2019-03-22 Thread Yang Shi
We need to find the closest non-DRAM node to demote DRAM pages to.  Add a
"skip_ram_node" parameter to find_next_best_node() so that DRAM nodes can
be skipped on demand.

Signed-off-by: Yang Shi 
---
 mm/internal.h   | 11 +++
 mm/page_alloc.c | 15 +++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 9eeaf2b..46ad0d8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -292,6 +292,17 @@ static inline bool is_data_mapping(vm_flags_t flags)
return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
 }
 
+#ifdef CONFIG_NUMA
+extern int find_next_best_node(int node, nodemask_t *used_node_mask,
+  bool skip_ram_node);
+#else
+static inline int find_next_best_node(int node, nodemask_t *used_node_mask,
+ bool skip_ram_node)
+{
+   return 0;
+}
+#endif
+
 /* mm/util.c */
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 68ad8c6..07d767b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5375,6 +5375,7 @@ int numa_zonelist_order_handler(struct ctl_table *table, 
int write,
  * find_next_best_node - find the next node that should appear in a given 
node's fallback list
  * @node: node whose fallback list we're appending
  * @used_node_mask: nodemask_t of already used nodes
+ * @skip_ram_node: find next best non-DRAM node
  *
  * We use a number of factors to determine which is the next node that should
  * appear on a given node's fallback list.  The node should not have appeared
@@ -5386,7 +5387,8 @@ int numa_zonelist_order_handler(struct ctl_table *table, 
int write,
  *
  * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
  */
-static int find_next_best_node(int node, nodemask_t *used_node_mask)
+int find_next_best_node(int node, nodemask_t *used_node_mask,
+   bool skip_ram_node)
 {
int n, val;
int min_val = INT_MAX;
@@ -5394,13 +5396,19 @@ static int find_next_best_node(int node, nodemask_t 
*used_node_mask)
const struct cpumask *tmp = cpumask_of_node(0);
 
/* Use the local node if we haven't already */
-   if (!node_isset(node, *used_node_mask)) {
+   if (!node_isset(node, *used_node_mask) &&
+   !skip_ram_node) {
node_set(node, *used_node_mask);
return node;
}
 
for_each_node_state(n, N_MEMORY) {
 
+   /* Find next best non-DRAM node */
+   if (skip_ram_node &&
+   (node_isset(n, def_alloc_nodemask)))
+   continue;
+
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
continue;
@@ -5432,7 +5440,6 @@ static int find_next_best_node(int node, nodemask_t 
*used_node_mask)
return best_node;
 }
 
-
 /*
  * Build zonelists ordered by node and zones within node.
  * This results in maximum locality--normal zone overflows into local
@@ -5494,7 +5501,7 @@ static void build_zonelists(pg_data_t *pgdat)
nodes_clear(used_mask);
 
memset(node_order, 0, sizeof(node_order));
-   while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+   while ((node = find_next_best_node(local_node, &used_mask, false)) >= 
0) {
/*
 * We don't want to pressure a particular node.
 * So adding penalty to the first node in same
-- 
1.8.3.1



[PATCH 07/10] mm: vmscan: add page demotion counter

2019-03-22 Thread Yang Shi
Demoted pages are counted in reclaim_state->nr_demoted instead of
nr_reclaimed since they are not actually reclaimed.  They are still in
memory, just migrated to PMEM.

Add pgdemote_kswapd and pgdemote_direct VM counters, shown in
/proc/vmstat.

Signed-off-by: Yang Shi 
---
 include/linux/vm_event_item.h |  2 ++
 include/linux/vmstat.h|  1 +
 mm/vmscan.c   | 14 ++
 mm/vmstat.c   |  2 ++
 4 files changed, 19 insertions(+)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 47a3441..499a3aa 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -32,6 +32,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGREFILL,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
+   PGDEMOTE_KSWAPD,
+   PGDEMOTE_DIRECT,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_DIRECT_THROTTLE,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 2db8d60..eb5d21c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -29,6 +29,7 @@ struct reclaim_stat {
unsigned nr_activate;
unsigned nr_ref_keep;
unsigned nr_unmap_fail;
+   unsigned nr_demoted;
 };
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bdcab6b..3c7ba7e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1286,6 +1286,10 @@ static unsigned long shrink_page_list(struct list_head 
*page_list,
 
if (has_nonram_online()) {
list_add(&page->lru, &demote_pages);
+   if (PageTransHuge(page))
+   stat->nr_demoted += HPAGE_PMD_NR;
+   else
+   stat->nr_demoted++;
unlock_page(page);
continue;
}
@@ -1523,7 +1527,17 @@ static unsigned long shrink_page_list(struct list_head 
*page_list,
putback_movable_pages(_pages);
 
list_splice(_pages, _pages);
+
+   if (err > 0)
+   stat->nr_demoted -= err;
+   else
+   stat->nr_demoted = 0;
}
+
+   if (current_is_kswapd())
+   __count_vm_events(PGDEMOTE_KSWAPD, stat->nr_demoted);
+   else
+   __count_vm_events(PGDEMOTE_DIRECT, stat->nr_demoted);
}
 
mem_cgroup_uncharge_list(&free_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 36b56f8..0e863e7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1192,6 +1192,8 @@ int fragmentation_index(struct zone *zone, unsigned int 
order)
"pgrefill",
"pgsteal_kswapd",
"pgsteal_direct",
+   "pgdemote_kswapd",
+   "pgdemote_direct",
"pgscan_kswapd",
"pgscan_direct",
"pgscan_direct_throttle",
-- 
1.8.3.1
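
As a quick way to observe the new counters, the pgdemote_kswapd and
pgdemote_direct entries can be read straight from /proc/vmstat.  A minimal
user-space sketch (an illustration only, assuming a kernel with this
series applied; on other kernels it simply prints nothing):

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/vmstat", "r");
        char line[128];

        if (!f)
                return 1;

        /* Print only the demotion counters added by this patch. */
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "pgdemote_kswapd", 15) ||
                    !strncmp(line, "pgdemote_direct", 15))
                        fputs(line, stdout);
        }

        fclose(f);
        return 0;
}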



Re: [PATCH 4.14 000/183] 4.14.108-stable review

2019-03-22 Thread Guenter Roeck

On 3/22/19 4:13 AM, Greg Kroah-Hartman wrote:

This is the start of the stable review cycle for the 4.14.108 release.
There are 183 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sun Mar 24 11:11:06 UTC 2019.
Anything received after that time might be too late.



Build results:
total: 172 pass: 172 fail: 0
Qemu test results:
total: 329 pass: 329 fail: 0

Guenter


[PATCH 02/10] mm: mempolicy: introduce MPOL_HYBRID policy

2019-03-22 Thread Yang Shi
Introduce a new NUMA policy, MPOL_HYBRID.  It behaves like MPOL_BIND, but
since we need to migrate pages from a non-DRAM node (i.e. a PMEM node) to
a DRAM node on demand, MPOL_HYBRID does page migration on NUMA fault and
therefore has MPOL_F_MOF set by default.

NUMA balancing support is enabled in the following patch.

Signed-off-by: Yang Shi 
---
 include/uapi/linux/mempolicy.h |  1 +
 mm/mempolicy.c | 56 +-
 2 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 3354774..0fdc73d 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -22,6 +22,7 @@ enum {
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_LOCAL,
+   MPOL_HYBRID,
MPOL_MAX,   /* always last member of enum */
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index af171cc..7d0a432 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -31,6 +31,10 @@
  *but useful to set in a VMA when you have a non default
  *process policy.
  *
+ * hybrid Only allocate memory on specific set of nodes. If the set of
+ *nodes include non-DRAM nodes, NUMA balancing would promote
+ *the page to DRAM node.
+ *
  * defaultAllocate on the local node first, or when on a VMA
  *use the process policy. This is what Linux always did
  *   in a NUMA aware kernel and still does by, ahem, default.
@@ -191,6 +195,17 @@ static int mpol_new_bind(struct mempolicy *pol, const 
nodemask_t *nodes)
return 0;
 }
 
+static int mpol_new_hybrid(struct mempolicy *pol, const nodemask_t *nodes)
+{
+   if (nodes_empty(*nodes))
+   return -EINVAL;
+
+   /* Hybrid policy would promote pages in page fault */
+   pol->flags |= MPOL_F_MOF;
+   pol->v.nodes = *nodes;
+   return 0;
+}
+
 /*
  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
  * any, for the new policy.  mpol_new() has already validated the nodes
@@ -401,6 +416,10 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
.create = mpol_new_bind,
.rebind = mpol_rebind_nodemask,
},
+   [MPOL_HYBRID] = {
+   .create = mpol_new_hybrid,
+   .rebind = mpol_rebind_nodemask,
+   },
 };
 
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -782,6 +801,8 @@ static void get_policy_nodemask(struct mempolicy *p, 
nodemask_t *nodes)
return;
 
switch (p->mode) {
+   case MPOL_HYBRID:
+   /* Fall through */
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
@@ -1721,8 +1742,12 @@ static int apply_policy_zone(struct mempolicy *policy, 
enum zone_type zone)
  */
 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
-   /* Lower zones don't get a nodemask applied for MPOL_BIND */
-   if (unlikely(policy->mode == MPOL_BIND) &&
+   /*
+* Lower zones don't get a nodemask applied for MPOL_BIND
+* or MPOL_HYBRID.
+*/
+   if (unlikely((policy->mode == MPOL_BIND) ||
+   (policy->mode == MPOL_HYBRID)) &&
apply_policy_zone(policy, gfp_zone(gfp)) &&
cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
return &policy->v.nodes;
@@ -1742,7 +1767,9 @@ static int policy_node(gfp_t gfp, struct mempolicy 
*policy,
 * because we might easily break the expectation to stay on the
 * requested node and not break the policy.
 */
-   WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & 
__GFP_THISNODE));
+   WARN_ON_ONCE((policy->mode == MPOL_BIND ||
+policy->mode == MPOL_HYBRID) &&
+(gfp & __GFP_THISNODE));
}
 
return nd;
@@ -1786,6 +1813,8 @@ unsigned int mempolicy_slab_node(void)
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
 
+   case MPOL_HYBRID:
+   /* Fall through */
case MPOL_BIND: {
struct zoneref *z;
 
@@ -1856,7 +1885,8 @@ static inline unsigned interleave_nid(struct mempolicy 
*pol,
  * @addr: address in @vma for shared policy lookup and interleave policy
  * @gfp_flags: for requested zone
  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
+ * @nodemask: pointer to nodemask pointer for MPOL_BIND or MPOL_HYBRID
+ * nodemask
  *
  * Returns a nid suitable for a huge page allocation and a pointer
  * to the struct mempolicy for conditional unref after allocation.
@@ -1871,14 +1901,16 @@ int huge_node(struct vm_area_struct *vma, unsigned long 
addr, gfp_t gfp_flags,
int nid;
 
*mpol = 

[PATCH 08/10] mm: numa: add page promotion counter

2019-03-22 Thread Yang Shi
Add a counter for page promotions done by NUMA balancing.

Signed-off-by: Yang Shi 
---
 include/linux/vm_event_item.h | 1 +
 mm/huge_memory.c  | 4 
 mm/memory.c   | 4 
 mm/vmstat.c   | 1 +
 4 files changed, 10 insertions(+)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 499a3aa..9f52a62 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -51,6 +51,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
NUMA_HINT_FAULTS,
NUMA_HINT_FAULTS_LOCAL,
NUMA_PAGE_MIGRATE,
+   NUMA_PAGE_PROMOTE,
 #endif
 #ifdef CONFIG_MIGRATION
PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8268a3c..9d5f5ce 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1607,6 +1607,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
vmf->pmd, pmd, vmf->address, page, target_nid);
if (migrated) {
+   if (!node_isset(page_nid, def_alloc_nodemask) &&
+   node_isset(target_nid, def_alloc_nodemask))
+   count_vm_numa_events(NUMA_PAGE_PROMOTE, HPAGE_PMD_NR);
+
flags |= TNF_MIGRATED;
page_nid = target_nid;
} else
diff --git a/mm/memory.c b/mm/memory.c
index 2494c11..554191b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3691,6 +3691,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
/* Migrate to the requested node */
migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated) {
+   if (!node_isset(page_nid, def_alloc_nodemask) &&
+   node_isset(target_nid, def_alloc_nodemask))
+   count_vm_numa_event(NUMA_PAGE_PROMOTE);
+
page_nid = target_nid;
flags |= TNF_MIGRATED;
} else
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0e863e7..4b44fc8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1220,6 +1220,7 @@ int fragmentation_index(struct zone *zone, unsigned int 
order)
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
+   "numa_pages_promoted",
 #endif
 #ifdef CONFIG_MIGRATION
"pgmigrate_success",
-- 
1.8.3.1



Re: [PATCH 3.18 000/134] 3.18.137-stable review

2019-03-22 Thread Guenter Roeck

On 3/22/19 4:13 AM, Greg Kroah-Hartman wrote:

This is the start of the stable review cycle for the 3.18.137 release.
There are 134 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sun Mar 24 11:10:54 UTC 2019.
Anything received after that time might be too late.



Build results:
total: 155 pass: 155 fail: 0
Qemu test results:
total: 226 pass: 226 fail: 0

Guenter


[PATCH 09/10] doc: add description for MPOL_HYBRID mode

2019-03-22 Thread Yang Shi
Add description for MPOL_HYBRID mode in kernel documentation.

Signed-off-by: Yang Shi 
---
 Documentation/admin-guide/mm/numa_memory_policy.rst | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst 
b/Documentation/admin-guide/mm/numa_memory_policy.rst
index d78c5b3..3db8257 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -198,6 +198,16 @@ MPOL_BIND
the node in the set with sufficient free memory that is
closest to the node where the allocation takes place.
 
+MPOL_HYBRID
+This mode specifies that the page allocation must happen on the
+nodes specified by the policy.  If both DRAM and non-DRAM nodes
+are specified, NUMA balancing may promote the pages from non-DRAM
+nodes to the specified DRAM nodes.  If only non-DRAM nodes are
+specified, NUMA balancing may promote the pages to any available
+DRAM nodes.  Any other policy doesn't do such page promotion.  The
+default mode may do NUMA balancing, but non-DRAM nodes are masked
+off for default mode.
+
 MPOL_PREFERRED
This mode specifies that the allocation should be attempted
from the single node specified in the policy.  If that
-- 
1.8.3.1



[RFC PATCH 0/10] Another Approach to Use PMEM as NUMA Node

2019-03-22 Thread Yang Shi


With Dave Hansen's patches merged into Linus's tree

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c221c0b0308fd01d9fb33a16f64d2fd95f8830a4

PMEM can now be hot-plugged as a NUMA node.  But how to use PMEM as a NUMA
node effectively and efficiently is still an open question.

There have been a couple of proposals posted on the mailing list [1] [2].

This patchset tries a different approach from proposal [1] to using PMEM
as NUMA nodes.

The approach is designed to follow the below principles:

1. Use PMEM as normal NUMA node, no special gfp flag, zone, zonelist, etc.

2. DRAM first/by default. No surprises for existing applications and default
runs. PMEM will not be allocated unless its node is specified explicitly by
NUMA policy. Some applications may not be very sensitive to memory latency,
so they can be placed on PMEM nodes and have their hot pages promoted to
DRAM gradually.

3. Compatible with current NUMA policy semantics.

4. Don't assume hardware topology. The patchset does, however, still assume a
two-tier heterogeneous memory system. Generalizing to multi-tier heterogeneous
memory has been discussed before, and I agree that is preferred eventually,
but the kernel doesn't have such capability yet. When HMAT is fully ready we
could definitely extract the NUMA topology from it.

5. Control memory allocation and hot/cold pages promotion/demotion on per VMA
basis.

To achieve the above principles, the design can be summarized by the
following points:

1. Per-node global fallback zonelists (including both DRAM and PMEM); use
def_alloc_nodemask to exclude non-DRAM nodes from default allocation unless
they are specified by a mempolicy. Currently the kernel can only distinguish
volatile from non-volatile memory, so the nodemask is built from the SRAT
flag. In the future it may be better to build the nodemask from richer
exposed hardware information, e.g. HMAT attributes, so that it can be
extended to multi-tier memory systems easily.

2. Introduce a new mempolicy, MPOL_HYBRID, to keep the other mempolicy
semantics intact. We would like memory placement control at per-process or
even per-VMA granularity, so a mempolicy sounds more reasonable than madvise.
The new mempolicy is mainly used for launching processes on PMEM nodes and
then migrating hot pages to DRAM nodes via NUMA balancing. MPOL_BIND could
bind to PMEM nodes too, but migrating to DRAM nodes would break its
semantics, and MPOL_PREFERRED can't constrain the allocation to PMEM nodes.
So a new mempolicy seems to be needed to fulfill the use case.

3. The new mempolicy promotes pages to DRAM via NUMA balancing. IMHO, the
kernel is not a good place to implement a sophisticated hot/cold page
detection algorithm due to the complexity and overhead, but it should have
the basic capability. NUMA balancing sounds like a good starting point.

4. Promote a page only once it has faulted twice. Use PG_promote to track
whether a page has already faulted once. This is an optimization to NUMA
balancing to reduce migration thrashing and the overhead of migrating from
PMEM.

5. When DRAM is under memory pressure, demote pages to PMEM via the page
reclaim path. This is quite similar to other proposals. NUMA balancing will
then promote a page back to DRAM once it is referenced again. However, the
promotion/demotion still assumes two-tier main memory, and the demotion may
break mempolicy.

6. Anonymous pages only for the time being, since NUMA balancing can't
promote unmapped page cache.

The patchset still misses some pieces and is premature, but I would like to
post it to LKML to gather feedback and comments and to have more eyes on it
to make sure I'm on the right track.

Any comment is welcome.
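
For concreteness, a minimal user-space sketch of the intended MPOL_HYBRID
usage could look like the following. This is an illustration only:
MPOL_HYBRID is not in today's <numaif.h>, so the value below is assumed
from the enum position added in patch 02/10, node 2 is assumed to be a
PMEM node, and the program links against libnuma (-lnuma).

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef MPOL_HYBRID
#define MPOL_HYBRID 5   /* assumed value: added right after MPOL_LOCAL */
#endif

int main(void)
{
        /* Assume node 2 is a PMEM node; allow allocations from it only. */
        unsigned long nodemask = 1UL << 2;

        /*
         * Launch-time placement: subsequent anonymous allocations land on
         * the PMEM node, and NUMA balancing may later promote hot pages
         * to DRAM.
         */
        if (set_mempolicy(MPOL_HYBRID, &nodemask, 8 * sizeof(nodemask))) {
                perror("set_mempolicy(MPOL_HYBRID)");
                return EXIT_FAILURE;
        }

        /* ... run the workload: cold data stays on PMEM, hot pages get
         * promoted by NUMA balancing ... */
        return EXIT_SUCCESS;
}

mbind() could be used the same way for per-VMA placement rather than a
whole-process policy.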


TODO:

1. Promote page cache. There are a couple of ways to handle this in the
kernel, e.g. promote via the active LRU in the reclaim path on the PMEM
node, or promote in mark_page_accessed().

2. Promote/demote HugeTLB pages. HugeTLB pages are currently not on the LRU
and NUMA balancing just skips them.

3. Possibly place kernel pages (e.g. page tables, slabs) on DRAM only.

4. Support the new mempolicy in userspace tools, i.e. numactl.


[1]: https://lore.kernel.org/linux-mm/20181226131446.330864...@intel.com/
[2]: 
https://lore.kernel.org/linux-mm/20190321200157.29678-1-keith.bu...@intel.com/


Yang Shi (10):
  mm: control memory placement by nodemask for two tier main memory
  mm: mempolicy: introduce MPOL_HYBRID policy
  mm: mempolicy: promote page to DRAM for MPOL_HYBRID
  mm: numa: promote pages to DRAM when it is accessed twice
  mm: page_alloc: make find_next_best_node could skip DRAM node
  mm: vmscan: demote anon DRAM pages to PMEM node
  mm: vmscan: add page demotion counter
  mm: numa: add page promotion counter
  doc: add description for MPOL_HYBRID mode
  doc: elaborate the PMEM allocation rule

 Documentation/admin-guide/mm/numa_memory_policy.rst |  10 
 Documentation/vm/numa.rst   |   7 ++-
 arch/x86/mm/numa.c   

[PATCH 01/10] mm: control memory placement by nodemask for two tier main memory

2019-03-22 Thread Yang Shi
When running applications on a machine with NVDIMM exposed as a NUMA node,
memory allocations may end up on the NVDIMM node.  This may result in
silent performance degradation and regressions due to the difference in
hardware properties.

"DRAM first" should be obeyed to prevent surprising regressions: any
non-DRAM node should be excluded from default allocation.  Use a nodemask
to control memory placement.  Introduce def_alloc_nodemask, which has only
DRAM nodes set; any non-DRAM allocation must be requested explicitly via
NUMA policy.

In the future we may be able to extract the memory characteristics from
HMAT or another source to build up the default allocation nodemask.
For the time being, just distinguish DRAM and PMEM (non-DRAM) nodes by
the SRAT flag.

Signed-off-by: Yang Shi 
---
 arch/x86/mm/numa.c |  1 +
 drivers/acpi/numa.c|  8 
 include/linux/mmzone.h |  3 +++
 mm/page_alloc.c| 18 --
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index dfb6c4d..d9e0ca4 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -626,6 +626,7 @@ static int __init numa_init(int (*init_func)(void))
nodes_clear(numa_nodes_parsed);
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
+   nodes_clear(def_alloc_nodemask);
memset(_meminfo, 0, sizeof(numa_meminfo));
WARN_ON(memblock_set_node(0, ULLONG_MAX, &numa_meminfo,
  MAX_NUMNODES));
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 867f6e3..79dfedf 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -296,6 +296,14 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
*slit)
goto out_err_bad_srat;
}
 
+   /*
+* Non volatile memory is excluded from zonelist by default.
+* Only regular DRAM nodes are set in default allocation node
+* mask.
+*/
+   if (!(ma->flags & ACPI_SRAT_MEM_NON_VOLATILE))
+   node_set(node, def_alloc_nodemask);
+
node_set(node, numa_nodes_parsed);
 
pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fba7741..063c3b4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -927,6 +927,9 @@ extern int numa_zonelist_order_handler(struct ctl_table *, 
int,
 extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
 extern struct zone *next_zone(struct zone *zone);
 
+/* Regular DRAM nodes */
+extern nodemask_t def_alloc_nodemask;
+
 /**
  * for_each_online_pgdat - helper macro to iterate over all online nodes
  * @pgdat - pointer to a pg_data_t variable
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 03fcf73..68ad8c6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -134,6 +134,8 @@ struct pcpu_drain {
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
+nodemask_t def_alloc_nodemask __read_mostly;
+
 /*
  * A cached value of the page's pageblock's migratetype, used when the page is
  * put on a pcplist. Used to avoid the pageblock migratetype lookup when
@@ -4524,12 +4526,24 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, 
unsigned int order,
 {
ac->high_zoneidx = gfp_zone(gfp_mask);
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
-   ac->nodemask = nodemask;
ac->migratetype = gfpflags_to_migratetype(gfp_mask);
 
+   if (!nodemask) {
+   /* Non-DRAM node is preferred node */
+   if (!node_isset(preferred_nid, def_alloc_nodemask))
+   /*
+* With MPOL_PREFERRED policy, once PMEM is allowed,
+* can falback to all memory nodes.
+*/
+   ac->nodemask = &node_states[N_MEMORY];
+   else
+   ac->nodemask = &def_alloc_nodemask;
+   } else
+   ac->nodemask = nodemask;
+
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
-   if (!ac->nodemask)
+   if (nodes_equal(*ac->nodemask, def_alloc_nodemask))
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
-- 
1.8.3.1



Re: [PATCH 4.4 000/230] 4.4.177-stable review

2019-03-22 Thread Guenter Roeck

On 3/22/19 4:12 AM, Greg Kroah-Hartman wrote:

This is the start of the stable review cycle for the 4.4.177 release.
There are 230 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sun Mar 24 11:10:58 UTC 2019.
Anything received after that time might be too late.



Build results:
total: 171 pass: 171 fail: 0
Qemu test results:
total: 292 pass: 292 fail: 0

Guenter


[PATCH] rcutorture: Select from only online CPUs

2019-03-22 Thread Joel Fernandes (Google)
The rcutorture jitter.sh script selects a random CPU but does not check
if it is offline or online. This leads to taskset errors many times. On
my machine, hyper threading is disabled so half the cores are offline
causing taskset errors a lot of times. Let us fix this by checking from
only the online CPUs on the system.

Signed-off-by: Joel Fernandes (Google) 
---
 tools/testing/selftests/rcutorture/bin/jitter.sh | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh 
b/tools/testing/selftests/rcutorture/bin/jitter.sh
index 3633828375e3..53bf9d99b5cd 100755
--- a/tools/testing/selftests/rcutorture/bin/jitter.sh
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -47,10 +47,19 @@ do
exit 0;
fi
 
-   # Set affinity to randomly selected CPU
+   # Set affinity to randomly selected online CPU
cpus=`ls /sys/devices/system/cpu/*/online |
sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' |
grep -v '^0*$'`
+
+   for c in $cpus; do
+   if [ "$(cat /sys/devices/system/cpu/cpu$c/online)" == "1" ];
+   then
+   cpus_tmp="$cpus_tmp $c"
+   fi
+   done
+   cpus=$cpus_tmp
+
cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN {
srand(n + me + systime());
ncpus = split(cpus, ca);
-- 
2.21.0.392.gf8f6787159e-goog



Re: [PATCH] mfd: fix a potential NULL pointer dereference

2019-03-22 Thread Kangjie Lu
Hi Lee Jones,

Can you review this patch?

Thanks. 

> On Mar 9, 2019, at 2:04 AM, Kangjie Lu  wrote:
> 
> In case devm_kzalloc() fails, do a NULL check and return -ENOMEM upon
> failure so as to avoid a NULL pointer dereference.
> 
> Signed-off-by: Kangjie Lu 
> ---
> drivers/mfd/sm501.c | 3 +++
> 1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c
> index a530972c5a7e..e0173bf4b0dc 100644
> --- a/drivers/mfd/sm501.c
> +++ b/drivers/mfd/sm501.c
> @@ -1145,6 +1145,9 @@ static int sm501_register_gpio_i2c_instance(struct 
> sm501_devdata *sm,
>   lookup = devm_kzalloc(&pdev->dev,
> sizeof(*lookup) + 3 * sizeof(struct gpiod_lookup),
> GFP_KERNEL);
> + if (!lookup)
> + return -ENOMEM;
> +
>   lookup->dev_id = "i2c-gpio";
>   if (iic->pin_sda < 32)
>   lookup->table[0].chip_label = "SM501-LOW";
> -- 
> 2.17.1
> 



Re: [RFC PATCH] x86/cpu/hygon: Fix phys_proc_id calculation logic for multi-die processor

2019-03-22 Thread Pu Wen
On 2019/3/23 0:52, Peter Zijlstra wrote:
> On Fri, Mar 22, 2019 at 05:44:45PM +0100, Borislav Petkov wrote:
>> On Sat, Mar 23, 2019 at 12:19:01AM +0800, Pu Wen wrote:
>>> That 6 is not a magic number.
>>
>> Well, if I see a naked 6, then it is only magic to me. Now if it were a
>> proper define with a descriptive name...
> 
> Does AMD/Hygon not have a CPUID leaf to read useful things like this
> from?

ApicId[6] is read from CPUID_Fn0001_EBX[30].
Please see sections 2.1.11.1 and 2.1.10.2.1.3 of reference [1].

Reference:
[1] 
https://www.amd.com/system/files/TechDocs/54945_PPR_Family_17h_Models_00h-0Fh.pdf
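
To make the bit position concrete: the initial ApicId sits in CPUID leaf 1
EBX[31:24], so ApicId[6] corresponds to EBX[30].  A small user-space sketch
reading it via GCC's <cpuid.h> (an illustration only, not part of the
patch under discussion):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* CPUID leaf 0x1: the initial APIC ID lives in EBX[31:24]. */
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                return 1;

        unsigned int apic_id = ebx >> 24;
        unsigned int apic_id_bit6 = (ebx >> 30) & 1; /* ApicId[6] == EBX[30] */

        printf("initial ApicId = 0x%02x, ApicId[6] = %u\n", apic_id,
               apic_id_bit6);
        return 0;
}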

-- 
Regards,
Pu Wen


Re: [PATCH] memstick: fix a potential NULL pointer dereference

2019-03-22 Thread Kangjie Lu
Hi Maxim,

Can you review this patch? 

Thanks,

> On Mar 9, 2019, at 1:59 AM, Kangjie Lu  wrote:
> 
> In case alloc_ordered_workqueue() fails, return -ENOMEM to avoid a
> potential NULL pointer dereference.
> 
> Signed-off-by: Kangjie Lu 
> ---
> drivers/memstick/core/ms_block.c | 5 +
> 1 file changed, 5 insertions(+)
> 
> diff --git a/drivers/memstick/core/ms_block.c 
> b/drivers/memstick/core/ms_block.c
> index 82daccc9ea62..8e00de414567 100644
> --- a/drivers/memstick/core/ms_block.c
> +++ b/drivers/memstick/core/ms_block.c
> @@ -2149,6 +2149,11 @@ static int msb_init_disk(struct memstick_dev *card)
> 
>   msb->usage_count = 1;
>   msb->io_queue = alloc_ordered_workqueue("ms_block", WQ_MEM_RECLAIM);
> + if (!msb->io_queue) {
> + rc = -ENOMEM;
> + goto out_put_disk;
> + }
> +
>   INIT_WORK(>io_work, msb_io_work);
>   sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1);
> 
> -- 
> 2.17.1
> 



Re: [RFC 2/2] rcutree: Add checks for dynticks counters in rcu_is_cpu_rrupt_from_idle

2019-03-22 Thread Joel Fernandes
On Fri, Mar 22, 2019 at 09:29:39PM -0400, Joel Fernandes (Google) wrote:
> In the future we would like to combine the dynticks and dynticks_nesting
> counters thus leading to simplifying the code. At the moment we cannot
> do that due to concerns about usermode upcalls appearing to RCU as half
> of an interrupt. Byungchul tried to do it in [1] but the
> "half-interrupt" concern was raised. It is half because, what RCU
> expects is rcu_irq_enter() and rcu_irq_exit() pairs when the usermode
> exception happens. However, only rcu_irq_enter() is observed. This
> concern may not be valid anymore, but at least it used to be the case.
> 
> Out of abundance of caution, Paul added warnings [2] in the RCU code
> which if not fired by 2021 may allow us to assume that such
> half-interrupt scenario cannot happen any more, which can lead to
> simplification of this code.
> 
> Summary of the changes are the following:
> 
> (1) In preparation for this future combination of counters, we first
> need to be sure that rcu_rrupt_from_idle cannot be called from anywhere
> but a hard interrupt, because previously the comments suggested
> otherwise, so let us be sure. We discussed this here [3]. We use the
> services of lockdep to accomplish this.
> 
> (2) Further, rcu_rrupt_from_idle() is not explicit about how it uses
> the counters, which can lead to weird future bugs. This patch therefore
> makes it explicit about the specific counter values being tested.
> 
> (3) Lastly, we check for counter underflows just to be sure these are
> not happening, because the previous code in rcu_rrupt_from_idle()
> allowed the counters to underflow while the function still returned
> true. Now we are checking for specific values, so let us gain confidence
> through additional checking that such underflows don't happen. In any
> case, if they do, we should fix them, and the screaming warning is
> appropriate. All these checks are NOOPs if PROVE_RCU and PROVE_LOCKING
> are disabled.
> 
> [1] https://lore.kernel.org/patchwork/patch/952349/
> [2] Commit e11ec65cc8d6 ("rcu: Add warning to detect half-interrupts")
> [3] https://lore.kernel.org/lkml/20190312150514.gb249...@google.com/
> 
> Cc: byungchul.p...@lge.com
> Cc: kernel-t...@android.com
> Cc: r...@vger.kernel.org
> Signed-off-by: Joel Fernandes (Google) 
> ---
>  kernel/rcu/tree.c | 21 +
>  1 file changed, 17 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 9180158756d2..d94c8ed29f6b 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -381,16 +381,29 @@ static void __maybe_unused 
> rcu_momentary_dyntick_idle(void)
>  }
>  
>  /**
> - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from 
> idle
> + * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
>   *
> - * If the current CPU is idle or running at a first-level (not nested)
> + * If the current CPU is idle and running at a first-level (not nested)
>   * interrupt from idle, return true.  The caller must have at least
>   * disabled preemption.
>   */
>  static int rcu_is_cpu_rrupt_from_idle(void)
>  {
> - return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 &&
> -__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1;
> + /* Called only from within the scheduling-clock interrupt */
> + lockdep_assert_in_irq();
> +
> + /* Check for counter underflows */
> + RCU_LOCKDEP_WARN(
> + (__this_cpu_read(rcu_data.dynticks_nesting) < 0) &&
> + (__this_cpu_read(rcu_data.dynticks_nmi_nesting) < 0),

 
This condition for the warning is supposed to be || instead of &&. Sorry.

Or, I will just use 2 RCU_LOCKDEP_WARN(s) here, that's better.
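
Something like this, as a rough sketch (not the actual respin, just to
illustrate the split):

	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
			 "RCU dynticks_nesting counter underflow!");
	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) < 0,
			 "RCU dynticks_nmi_nesting counter underflow!");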

thanks,

 - Joel


[PATCH] kernel: watchdog: fixed spelling mistake

2019-03-22 Thread Arash Fotouhi
Fix a spelling mistake ("sucessfully" -> "successfully") in a comment in
kernel/watchdog.c.

Signed-off-by: Arash Fotouhi 
---
 kernel/watchdog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8fbfda9..7e23e5e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -588,7 +588,7 @@ static void lockup_detector_reconfigure(void)
  * Create the watchdog thread infrastructure and configure the detector(s).
  *
  * The threads are not unparked as watchdog_allowed_mask is empty.  When
- * the threads are sucessfully initialized, take the proper locks and
+ * the threads are successfully initialized, take the proper locks and
  * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
  */
 static __init void lockup_detector_setup(void)
-- 
2.7.4



Re: mount.nfs: Protocol error after upgrade to linux/master

2019-03-22 Thread Tetsuo Handa
On 2019/03/23 7:45, Kees Cook wrote:
> It breaks the backward-compat for the "security=" line. If a system is
> booted with CONFIG_LSM="minors...,apparmor" and "security=selinux",
> neither apparmor nor selinux will be initialized. The logic on
> "security=..." depends on the other LSMs being present in the list.

Really? The logic on "security=..." does not depend on LSM_FLAG_LEGACY_MAJOR
LSMs being present in the CONFIG_LSM= list, for ordered_lsm_parse() does

  (Step 1) Enable LSM_ORDER_FIRST module (i.e. capability).

  (Step 2) Disable LSM_FLAG_LEGACY_MAJOR modules which were not specified
   by the "security=" parameter when the "security=" parameter was specified.

  (Step 3) Enable modules specified by "lsm=" parameter (or CONFIG_LSM= settings
   if "lsm=" parameter was not specified).

  (Step 4) Enable up to one LSM_FLAG_LEGACY_MAJOR module which was specified
   by "security=" parameter when "security=" parameter was specified.

  (Step 5) Disable all unused modules.

and (Step 4) will compensate for lack of that module in (Step 3).
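
In pseudocode, the ordering is roughly the following (the helper names are
made up for illustration; this is not the actual ordered_lsm_parse() code):

	static void __init ordered_lsm_parse_sketch(const char *order,
						    const char *chosen_major)
	{
		enable_order_first_lsms();		/* Step 1: capability      */
		if (chosen_major)			/* Step 2                  */
			disable_legacy_majors_except(chosen_major);
		enable_lsms_listed_in(order);		/* Step 3: lsm=/CONFIG_LSM */
		if (chosen_major)			/* Step 4                  */
			enable_lsm_if_not_yet(chosen_major);
		disable_unused_lsms();			/* Step 5                  */
	}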


[PATCH] input: pm8xxx-vibrator: fix a potential NULL pointer dereference

2019-03-22 Thread Kangjie Lu
In case of_device_get_match_data() fails to find matching data, return
-ENODEV to avoid a NULL pointer dereference.

Signed-off-by: Kangjie Lu 
---
 drivers/input/misc/pm8xxx-vibrator.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/input/misc/pm8xxx-vibrator.c 
b/drivers/input/misc/pm8xxx-vibrator.c
index 7dd1c1fbe42a..740e59c11808 100644
--- a/drivers/input/misc/pm8xxx-vibrator.c
+++ b/drivers/input/misc/pm8xxx-vibrator.c
@@ -196,6 +196,8 @@ static int pm8xxx_vib_probe(struct platform_device *pdev)
vib->vib_input_dev = input_dev;
 
	regs = of_device_get_match_data(&pdev->dev);
+   if (unlikely(!regs))
+   return -ENODEV;
 
/* operate in manual mode */
	error = regmap_read(vib->regmap, regs->drv_addr, &val);
-- 
2.17.1



Re: [PATCH] infiniband: cxgb4: fix a potential NULL pointer dereference

2019-03-22 Thread Kangjie Lu



> On Mar 8, 2019, at 11:19 PM, Kangjie Lu  wrote:
> 
> get_skb() may fail and return NULL. The fix returns -ENOMEM
> when it fails, to avoid a NULL dereference.
> 
> Signed-off-by: Kangjie Lu 
> ---
> drivers/infiniband/hw/cxgb4/cm.c | 3 +++
> 1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/infiniband/hw/cxgb4/cm.c 
> b/drivers/infiniband/hw/cxgb4/cm.c
> index 8221813219e5..502a54d57e2c 100644
> --- a/drivers/infiniband/hw/cxgb4/cm.c
> +++ b/drivers/infiniband/hw/cxgb4/cm.c
> @@ -1919,6 +1919,9 @@ static int send_fw_act_open_req(struct c4iw_ep *ep, 
> unsigned int atid)
>   int win;
> 
>   skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
> + if (!skb)
> + return -ENOMEM;
> +

Can someone review this patch? Thanks.

>   req = __skb_put_zero(skb, sizeof(*req));
>   req->op_compl = htonl(WR_OP_V(FW_OFLD_CONNECTION_WR));
>   req->len16_pkd = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*req), 16)));
> -- 
> 2.17.1
> 



Re: [RFC PATCH] x86/cpu/hygon: Fix phys_proc_id calculation logic for multi-die processor

2019-03-22 Thread Pu Wen
On 2019/3/23 1:16, Borislav Petkov wrote:
> On Sat, Mar 23, 2019 at 12:19:01AM +0800, Pu Wen wrote:
>>> Sounds to me like you're programming the initial APIC ID not
>>> the same way as AMD do...
>>
>> In the same way.
> 
> So why do you need to do something different than what AMD does?

Current physical id is computed via "phys_proc_id = initial_apicid >>
bits".

For a 4-Die 2-socket system, the physical id of socket 2 is:
initial_apicid >> bits = 0b1xxxxxx >> 6 = 1.
That result is correct.

But for a 2-Die 2-socket system, the physical id of socket 2 is:
initial_apicid >> bits = 0b10xxxxx >> 5 = 2,
and for a 1-Die 2-socket system, the physical id of socket 2 is:
initial_apicid >> bits = 0b100xxxx >> 4 = 4.
Those results are not correct any more.

So an adjustment is needed for the 1-Die/2-Die 2-socket systems:
just use ApicId[6], which is already defined to carry the right
information, as the socket ID.
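
A minimal sketch of that adjustment (illustrative only, not the actual
patch; the macro name here is made up):

	#define APICID_SOCKET_ID_BIT	6

	/* On these parts the socket ID is ApicId[6], whatever the die count. */
	c->phys_proc_id = c->initial_apicid >> APICID_SOCKET_ID_BIT;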

-- 
Regards,
Pu Wen


Re: [PATCH] firmware: arm_scmi: check return value of idr_find

2019-03-22 Thread Kangjie Lu



> On Mar 8, 2019, at 10:02 PM, Kangjie Lu  wrote:
> 
> idr_find may return NULL, so check its return value and return an
> error code.

Can someone review this patch? Thanks.

> 
> Signed-off-by: Kangjie Lu 
> ---
> drivers/firmware/arm_scmi/driver.c | 2 ++
> 1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/firmware/arm_scmi/driver.c 
> b/drivers/firmware/arm_scmi/driver.c
> index 8f952f2f1a29..35faa203d549 100644
> --- a/drivers/firmware/arm_scmi/driver.c
> +++ b/drivers/firmware/arm_scmi/driver.c
> @@ -709,6 +709,8 @@ scmi_mbox_chan_setup(struct scmi_info *info, struct 
> device *dev, int prot_id)
> 
>   if (scmi_mailbox_check(np)) {
>   cinfo = idr_find(&info->tx_idr, SCMI_PROTOCOL_BASE);
> + if (!cinfo)
> + return -EINVAL;
>   goto idr_alloc;
>   }
> 
> -- 
> 2.17.1
> 



[rcu:dev.2019.03.20b 59/83] kernel/rcu/tree_plugin.h:612:8: error: 'rcu_softirq_enabled' undeclared; did you mean 'trace_softirqs_enabled'?

2019-03-22 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git 
dev.2019.03.20b
head:   6d4434b4b4df791620743178e1419de882b44c7b
commit: d0f41d341df0520e900cac03de05bbbd11abdcd9 [59/83] rcu: Enable 
elimination of Tree-RCU softirq processing
config: i386-randconfig-x002-201911 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
git checkout d0f41d341df0520e900cac03de05bbbd11abdcd9
# save the attached .config to linux build tree
make ARCH=i386 

Note: the rcu/dev.2019.03.20b HEAD 6d4434b4b4df791620743178e1419de882b44c7b 
builds fine.
  It only hurts bisectability.

All errors (new ones prefixed by >>):

   In file included from kernel/rcu/tree.c:3489:0:
   kernel/rcu/tree_plugin.h: In function 'rcu_read_unlock_special':
>> kernel/rcu/tree_plugin.h:612:8: error: 'rcu_softirq_enabled' undeclared 
>> (first use in this function); did you mean 'trace_softirqs_enabled'?
   if (rcu_softirq_enabled)
   ^~~
   trace_softirqs_enabled
   kernel/rcu/tree_plugin.h:612:8: note: each undeclared identifier is reported 
only once for each function it appears in

vim +612 kernel/rcu/tree_plugin.h

   588  
   589  /*
   590   * Handle special cases during rcu_read_unlock(), such as needing to
   591   * notify RCU core processing or task having blocked during the RCU
   592   * read-side critical section.
   593   */
   594  static void rcu_read_unlock_special(struct task_struct *t)
   595  {
   596  unsigned long flags;
   597  bool preempt_bh_were_disabled =
   598  !!(preempt_count() & (PREEMPT_MASK | 
SOFTIRQ_MASK));
   599  bool irqs_were_disabled;
   600  
   601  /* NMI handlers cannot block and cannot safely manipulate 
state. */
   602  if (in_nmi())
   603  return;
   604  
   605  local_irq_save(flags);
   606  irqs_were_disabled = irqs_disabled_flags(flags);
   607  if (preempt_bh_were_disabled || irqs_were_disabled) {
   608  WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, 
false);
   609  /* Need to defer quiescent state until everything is 
enabled. */
   610  if (irqs_were_disabled) {
   611  /* Enabling irqs does not reschedule, so... */
 > 612  if (rcu_softirq_enabled)
   613  raise_softirq_irqoff(RCU_SOFTIRQ);
   614  else
   615  invoke_rcu_core();
   616  } else {
   617  /* Enabling BH or preempt does reschedule, 
so... */
   618  set_tsk_need_resched(current);
   619  set_preempt_need_resched();
   620  }
   621  local_irq_restore(flags);
   622  return;
   623  }
   624  WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
   625  rcu_preempt_deferred_qs_irqrestore(t, flags);
   626  }
   627  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


[PATCH -next] ASoC: simple-card-utils: remove set but not used variable 'dai_name'

2019-03-22 Thread YueHaibing
Fixes gcc '-Wunused-but-set-variable' warning:

sound/soc/generic/simple-card-utils.c: In function 'asoc_simple_parse_clk':
sound/soc/generic/simple-card-utils.c:164:18: warning:
 parameter 'dai_name' set but not used [-Wunused-but-set-parameter]

It's not used since commit 0580dde59438 ("ASoC: simple-card-utils: add
asoc_simple_debug_info()"), so it can be removed.

Signed-off-by: YueHaibing 
---
 sound/soc/generic/simple-card-utils.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sound/soc/generic/simple-card-utils.c 
b/sound/soc/generic/simple-card-utils.c
index 4ed68348f939..db1458a19985 100644
--- a/sound/soc/generic/simple-card-utils.c
+++ b/sound/soc/generic/simple-card-utils.c
@@ -173,10 +173,8 @@ int asoc_simple_parse_clk(struct device *dev,
 * see
 *  soc-core.c :: snd_soc_init_multicodec()
 */
-   if (dlc) {
+   if (dlc)
dai_of_node = dlc->of_node;
-   dai_name= dlc->dai_name;
-   }
 
/*
 * Parse dai->sysclk come from "clocks = <>"



Re: [RFC PATCH] x86/cpu/hygon: Fix phys_proc_id calculation logic for multi-die processor

2019-03-22 Thread Pu Wen

On 2019/3/23 0:44, Borislav Petkov wrote:
> On Sat, Mar 23, 2019 at 12:19:01AM +0800, Pu Wen wrote:
>> That 6 is not a magic number.
>
> Well, if I see a naked 6, then it is only magic to me. Now if it were a
> proper define with a descriptive name...


So maybe define it as:
#define APICID_SOCKET_ID BIT(6)

--
Regards,
Pu Wen


[RFC 2/2] rcutree: Add checks for dynticks counters in rcu_is_cpu_rrupt_from_idle

2019-03-22 Thread Joel Fernandes (Google)
In the future we would like to combine the dynticks and dynticks_nesting
counters thus leading to simplifying the code. At the moment we cannot
do that due to concerns about usermode upcalls appearing to RCU as half
of an interrupt. Byungchul tried to do it in [1] but the
"half-interrupt" concern was raised. It is half because, what RCU
expects is rcu_irq_enter() and rcu_irq_exit() pairs when the usermode
exception happens. However, only rcu_irq_enter() is observed. This
concern may not be valid anymore, but at least it used to be the case.

Out of abundance of caution, Paul added warnings [2] in the RCU code
which if not fired by 2021 may allow us to assume that such
half-interrupt scenario cannot happen any more, which can lead to
simplification of this code.

Summary of the changes are the following:

(1) In preparation for this combination of counters in the future, we
first need to be sure that rcu_rrupt_from_idle cannot be called
from anywhere but a hard-interrupt because previously, the comments
suggested otherwise so let us be sure. We discussed this here [3]. We
use the services of lockdep to accomplish this.

(2) Further, rcu_rrupt_from_idle() is not explicit about how it is using
the counters, which can lead to weird future bugs. This patch therefore
makes it more explicit about the specific counter values being tested.

(3) Lastly, we check for counter underflows just to be sure these are
not happening, because the previous code in rcu_rrupt_from_idle() was
allowing the case where the counters can underflow, and the function
would still return true. Now we are checking for specific values, so let
us be confident, by additional checking, that such underflows don't
happen. In any case, if they do, we should fix them and the screaming
warning is appropriate. All these checks are NOOPs if PROVE_RCU
and PROVE_LOCKING are disabled.

[1] https://lore.kernel.org/patchwork/patch/952349/
[2] Commit e11ec65cc8d6 ("rcu: Add warning to detect half-interrupts")
[3] https://lore.kernel.org/lkml/20190312150514.gb249...@google.com/

Cc: byungchul.p...@lge.com
Cc: kernel-t...@android.com
Cc: r...@vger.kernel.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9180158756d2..d94c8ed29f6b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -381,16 +381,29 @@ static void __maybe_unused 
rcu_momentary_dyntick_idle(void)
 }
 
 /**
- * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from 
idle
+ * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
  *
- * If the current CPU is idle or running at a first-level (not nested)
+ * If the current CPU is idle and running at a first-level (not nested)
  * interrupt from idle, return true.  The caller must have at least
  * disabled preemption.
  */
 static int rcu_is_cpu_rrupt_from_idle(void)
 {
-   return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 &&
-  __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1;
+   /* Called only from within the scheduling-clock interrupt */
+   lockdep_assert_in_irq();
+
+   /* Check for counter underflows */
+   RCU_LOCKDEP_WARN(
+   (__this_cpu_read(rcu_data.dynticks_nesting) < 0) &&
+   (__this_cpu_read(rcu_data.dynticks_nmi_nesting) < 0),
+   "RCU dynticks nesting counters underflow!");
+
+   /* Are we at first interrupt nesting level? */
+   if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1)
+   return false;
+
+   /* Does CPU appear to be idle from an RCU standpoint? */
+   return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
 }
 
 #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */
-- 
2.21.0.392.gf8f6787159e-goog



[RFC 1/2] lockdep: Add assertion to check if in an interrupt

2019-03-22 Thread Joel Fernandes (Google)
In rcu_rrupt_from_idle, we want to check if it is called from within an
interrupt, but want to do such checking only for debug builds. lockdep
already tracks when we enter an interrupt. Let us expose it as an
assertion macro so it can be used to assert this.

Suggested-by: Steven Rostedt 
Cc: kernel-t...@android.com
Cc: r...@vger.kernel.org
Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/lockdep.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index c5335df2372f..d24f564823d3 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -601,11 +601,18 @@ do {  
\
  "IRQs not disabled as expected\n");   \
} while (0)
 
+#define lockdep_assert_in_irq() do {   \
+   WARN_ONCE(debug_locks && !current->lockdep_recursion && \
+ !current->hardirq_context,\
+ "Not in hardirq as expected\n");  \
+   } while (0)
+
 #else
 # define might_lock(lock) do { } while (0)
 # define might_lock_read(lock) do { } while (0)
 # define lockdep_assert_irqs_enabled() do { } while (0)
 # define lockdep_assert_irqs_disabled() do { } while (0)
+# define lockdep_assert_in_irq() do { } while (0)
 #endif
 
 #ifdef CONFIG_LOCKDEP
-- 
2.21.0.392.gf8f6787159e-goog



Re: [PATCH v3] rcu: Allow to eliminate softirq processing from rcutree

2019-03-22 Thread Joel Fernandes
On Fri, Mar 22, 2019 at 05:25:19PM -0700, Paul E. McKenney wrote:
> On Fri, Mar 22, 2019 at 07:48:19PM -0400, Joel Fernandes wrote:
> > On Wed, Mar 20, 2019 at 10:13:33PM +0100, Sebastian Andrzej Siewior wrote:
> > > Running RCU out of softirq is a problem for some workloads that would
> > > like to manage RCU core processing independently of other softirq
> > > work, for example, setting kthread priority.  This commit therefore
> > > introduces the `rcunosoftirq' option which moves the RCU core work
> > > from softirq to a per-CPU/per-flavor SCHED_OTHER kthread named rcuc.
> > > The SCHED_OTHER approach avoids the scalability problems that appeared
> > > with the earlier attempt to move RCU core processing to from softirq
> > > to kthreads.  That said, kernels built with RCU_BOOST=y will run the
> > > rcuc kthreads at the RCU-boosting priority.
> > [snip]
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index 0f31b79eb6761..05a1e42fdaf10 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -51,6 +51,12 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include "../time/tick-internal.h"
> > >  
> > >  #include "tree.h"
> > >  #include "rcu.h"
> > > @@ -92,6 +98,9 @@ struct rcu_state rcu_state = {
> > >  /* Dump rcu_node combining tree at boot to verify correct setup. */
> > >  static bool dump_tree;
> > >  module_param(dump_tree, bool, 0444);
> > > +/* Move RCU_SOFTIRQ to rcuc kthreads. */
> > > +static bool use_softirq = 1;
> > > +module_param(use_softirq, bool, 0444);
> > >  /* Control rcu_node-tree auto-balancing at boot time. */
> > >  static bool rcu_fanout_exact;
> > >  module_param(rcu_fanout_exact, bool, 0444);
> > > @@ -2253,7 +2262,7 @@ void rcu_force_quiescent_state(void)
> > >  EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
> > >  
> > >  /* Perform RCU core processing work for the current CPU.  */
> > > -static __latent_entropy void rcu_core(struct softirq_action *unused)
> > > +static __latent_entropy void rcu_core(void)
> > >  {
> > >   unsigned long flags;
> > >   struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
> > > @@ -2295,6 +2304,34 @@ static __latent_entropy void rcu_core(struct 
> > > softirq_action *unused)
> > >   trace_rcu_utilization(TPS("End RCU core"));
> > >  }
> > >  
> > > +static void rcu_core_si(struct softirq_action *h)
> > > +{
> > > + rcu_core();
> > > +}
> > > +
> > > +static void rcu_wake_cond(struct task_struct *t, int status)
> > > +{
> > > + /*
> > > +  * If the thread is yielding, only wake it when this
> > > +  * is invoked from idle
> > > +  */
> > > + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
> > > + wake_up_process(t);
> > > +}
> > > +
> > > +static void invoke_rcu_core_kthread(void)
> > > +{
> > > + struct task_struct *t;
> > > + unsigned long flags;
> > > +
> > > + local_irq_save(flags);
> > > + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
> > > + t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
> > > + if (t != NULL && t != current)
> > > + rcu_wake_cond(t, 
> > > __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
> > > + local_irq_restore(flags);
> > > +}
> > > +
> > >  /*
> > >   * Schedule RCU callback invocation.  If the running implementation of 
> > > RCU
> > >   * does not support RCU priority boosting, just do a direct call, 
> > > otherwise
> > > @@ -2306,19 +2343,95 @@ static void invoke_rcu_callbacks(struct rcu_data 
> > > *rdp)
> > >  {
> > >   if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
> > >   return;
> > > - if (likely(!rcu_state.boost)) {
> > > - rcu_do_batch(rdp);
> > > - return;
> > > - }
> > > - invoke_rcu_callbacks_kthread();
> > > + if (rcu_state.boost || !use_softirq)
> > > + invoke_rcu_core_kthread();
> > > + rcu_do_batch(rdp);
> > 
> > Shouldn't there be an else before the rcu_do_batch? If we are waking up the
> > rcuc thread, then that will do the rcu_do_batch when it runs right?
> > 
> > Something like:
> > if (rcu_state.boost || !use_softirq)
> > invoke_rcu_core_kthread();
> > else
> > rcu_do_batch(rdp);
> > 
> > Previous code similarly had a return; also.
> 
> I believe that you are correct, so I will give it a shot.  Good eyes!

Thanks! Also I am sending some the lockdep dyntick checking patches shortly :)

> > >  }
> > >  
> > > +/*
> > > + * Wake up this CPU's rcuc kthread to do RCU core processing.
> > > + */
> > >  static void invoke_rcu_core(void)
> > >  {
> > > - if (cpu_online(smp_processor_id()))
> > > + if (!cpu_online(smp_processor_id()))
> > > + return;
> > > + if (use_softirq)
> > >   raise_softirq(RCU_SOFTIRQ);
> > > + else
> > > + invoke_rcu_core_kthread();
> > >  }
> > >  
> > > +static void rcu_cpu_kthread_park(unsigned int cpu)
> > > +{
> > > + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
> > > +}
> > > +
> > > +static 

Re: [PATCH v3] staging: ralink-gdma: Use u32 over uint32_t

2019-03-22 Thread Joe Perches
On Fri, 2019-03-22 at 22:43 +0530, Bharath Vedartham wrote:
> This fixes the checkpatch.pl warning: "Prefer u32 over uint32_t"

Please run your proposed patches through checkpatch before
submitting them.

> Signed-off-by: Bharath Vedartham 
> ---
> Changes since v2
>   - Improved changelog
>   - Thanks for the good feedback. I am a beginner. I will learn
> and grow. :)

More feedback for you.

> diff --git a/drivers/staging/ralink-gdma/ralink-gdma.c 
> b/drivers/staging/ralink-gdma/ralink-gdma.c
[]
> @@ -157,14 +157,14 @@ static struct gdma_dma_desc *to_gdma_dma_desc(struct 
> virt_dma_desc *vdesc)
>   return container_of(vdesc, struct gdma_dma_desc, vdesc);
>  }
>  
> -static inline uint32_t gdma_dma_read(struct gdma_dma_dev *dma_dev,
> +static inline u32 gdma_dma_read(struct gdma_dma_dev *dma_dev,
>unsigned int reg)

Please realign the line following your change.




[PATCH v2] PCI/LINK: bw_notification: Do not leave interrupt handler NULL

2019-03-22 Thread Alexandru Gagniuc
A threaded IRQ with a NULL handler does not work with level-triggered
interrupts. request_threaded_irq() will return an error:

  genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq 16
  pcie_bw_notification: probe of :00:1b.0:pcie010 failed with error -22

For level interrupts we need to silence the interrupt before exiting
the IRQ handler, so just clear the PCI_EXP_LNKSTA_LBMS bit there.

Fixes: e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth 
notification")
Reported-by: Linus Torvalds 
Signed-off-by: Alexandru Gagniuc 
---
Changes since v1:
 - move pcie_update_link_speed() to irq to prevent duplicate read of link_status
 - Add Fixes: to commit message
 
 drivers/pci/pcie/bw_notification.c | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/pcie/bw_notification.c 
b/drivers/pci/pcie/bw_notification.c
index d2eae3b7cc0f..c48746f1cf3c 100644
--- a/drivers/pci/pcie/bw_notification.c
+++ b/drivers/pci/pcie/bw_notification.c
@@ -44,11 +44,10 @@ static void pcie_disable_link_bandwidth_notification(struct 
pci_dev *dev)
pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
 }
 
-static irqreturn_t pcie_bw_notification_handler(int irq, void *context)
+static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
 {
struct pcie_device *srv = context;
struct pci_dev *port = srv->port;
-   struct pci_dev *dev;
u16 link_status, events;
int ret;
 
@@ -58,6 +57,17 @@ static irqreturn_t pcie_bw_notification_handler(int irq, 
void *context)
if (ret != PCIBIOS_SUCCESSFUL || !events)
return IRQ_NONE;
 
+   pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
+   pcie_update_link_speed(port->subordinate, link_status);
+   return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t pcie_bw_notification_handler(int irq, void *context)
+{
+   struct pcie_device *srv = context;
+   struct pci_dev *port = srv->port;
+   struct pci_dev *dev;
+
/*
 * Print status from downstream devices, not this root port or
 * downstream switch port.
@@ -67,8 +77,6 @@ static irqreturn_t pcie_bw_notification_handler(int irq, void 
*context)
__pcie_print_link_status(dev, false);
> up_read(&pci_bus_sem);
 
-   pcie_update_link_speed(port->subordinate, link_status);
-   pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
return IRQ_HANDLED;
 }
 
@@ -80,7 +88,8 @@ static int pcie_bandwidth_notification_probe(struct 
pcie_device *srv)
if (!pcie_link_bandwidth_notification_supported(srv->port))
return -ENODEV;
 
-   ret = request_threaded_irq(srv->irq, NULL, pcie_bw_notification_handler,
+   ret = request_threaded_irq(srv->irq, pcie_bw_notification_irq,
+  pcie_bw_notification_handler,
   IRQF_SHARED, "PCIe BW notif", srv);
if (ret)
return ret;
-- 
2.19.2
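
For context, the split above follows the usual request_threaded_irq()
pattern for level-triggered, shared interrupts: the hard handler checks
whether the interrupt is ours, silences the source, and returns
IRQ_WAKE_THREAD; the threaded handler then does the slow work. A generic
sketch (all names below are made up; this is not the bw_notification code):

	static irqreturn_t foo_irq(int irq, void *dev_id)
	{
		struct foo *foo = dev_id;

		if (!foo_irq_pending(foo))	/* shared IRQ, not ours */
			return IRQ_NONE;

		foo_ack_irq(foo);		/* silence the level IRQ */
		return IRQ_WAKE_THREAD;
	}

	static irqreturn_t foo_irq_thread(int irq, void *dev_id)
	{
		foo_do_slow_work(dev_id);	/* may sleep here */
		return IRQ_HANDLED;
	}

	ret = request_threaded_irq(irq, foo_irq, foo_irq_thread,
				   IRQF_SHARED, "foo", foo);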



Re: [PATCH 1/2] platform/x86: intel_pmc_core: Convert to a platform_driver

2019-03-22 Thread Rajat Jain
Hi Rajneesh,



On Fri, Mar 22, 2019 at 12:56 PM Bhardwaj, Rajneesh
 wrote:
>
> Some suggestions below
>
> On 18-Mar-19 8:36 PM, Rajat Jain wrote:
>
> On Sat, Mar 16, 2019 at 1:30 AM Rajneesh Bhardwaj
>  wrote:
>
> On Wed, Mar 13, 2019 at 03:21:23PM -0700, Rajat Jain wrote:
>
> Convert the intel_pmc_core driver to a platform driver. There is no
> functional change. Some code that tries to determine what kind of
> CPU this is, has been moved code is moved from pmc_core_probe() to
>
> Possible typo here.
>
> Ummm, you mean grammar error I guess? Sure, I will rephrase.
>
> pmc_core_init().
>
> Signed-off-by: Rajat Jain 
>
> Thanks for sending this. This is certainly useful to support suspend-resume
> functionality for this driver which is otherwise only possible with PM
> notifiers otherwise and that is not desirable. Initially this was a PCI
> driver and after design discussion it was converted to module. I would like
> to consult Andy and Srinivas for their opinion about binding it to actual
> platform bus instead of the virtual bus as in its current form. In one of the
> internal versions, we used a known acpi PNP HID.
>
> Sure, if there is an established ACPI PNP HID, then we could bind it
> using that, on platforms where we are still developing BIOS /
> coreboot. However, this might not be possible for shipping systems
> (Kabylake / skylake) where there is no plan to change the BIOS.
>
> In one of our internal patches, I had used the HID of the power engine plugin. IIRC,
> during my testing it was working on KBL and CNL with a UEFI BIOS, but I highly
> recommend testing it.
>
> ---8<8<-
>
> +static const struct acpi_device_id pmc_acpi_ids[] = {
>
> + {"INT33A1", 0}, /* _HID for Intel Power Engine, _CID PNP0D80*/
>
> + { }
>
>  };

We do not have this device in any of our ACPI tables today. If Intel
can confirm that this is a well known HID to be used for attaching
this driver, we can start putting it on our platform's ACPI going
forward (Whiskeylake, Cometlake, Cannonlake, Icelake ...). But I
believe we also need to have this driver attach with the device on
older platforms (Skylake, Kabylake, Amberlake) that are already
shipping, and running a Non UEFI BIOS (that may not have this HID
since it is not published).

Currently the intel_pmc_core driver attaches itself to the following
table of CPU families, without regard to whether it has that HID in
the ACPI or not:

static const struct x86_cpu_id intel_pmc_core_ids[] = {
INTEL_CPU_FAM6(SKYLAKE_MOBILE, spt_reg_map),
INTEL_CPU_FAM6(SKYLAKE_DESKTOP, spt_reg_map),
INTEL_CPU_FAM6(KABYLAKE_MOBILE, spt_reg_map),
INTEL_CPU_FAM6(KABYLAKE_DESKTOP, spt_reg_map),
INTEL_CPU_FAM6(CANNONLAKE_MOBILE, cnp_reg_map),
INTEL_CPU_FAM6(ICELAKE_MOBILE, icl_reg_map),
{}
};

So to avoid a regression, I suggest that we still maintain the above
table (maybe eliminate a few entries) and always attach if the CPU is
in the table; if the CPU is not in the table, use the ACPI
HID to attach. I propose to attach to at least Skylake and Kabylake
systems using the table above, and for Cannonlake and Icelake and
newer, we can rely on the BIOS providing the ACPI HID. Of course I do not
know if all non-Google Cannonlake/Icelake platforms will have this HID
in their BIOS. If we are not sure, we can include Cannonlake and
Icelake in that list as well. Please let me know what you think.
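
Roughly, something like the sketch below is what I have in mind (rough
and untested; the duplicate-probe case when both the CPU table and the
ACPI HID match would still need handling):

	static struct platform_device *pmc_pdev;

	static int __init pmc_core_init(void)
	{
		int ret;

		ret = platform_driver_register(&pmc_plat_driver);
		if (ret)
			return ret;

		/*
		 * Older platforms (SKL/KBL/AML) ship without the ACPI HID,
		 * so create the platform device ourselves when the CPU
		 * matches; on newer platforms the ACPI match table binds
		 * the driver.
		 */
		if (x86_match_cpu(intel_pmc_core_ids)) {
			pmc_pdev = platform_device_register_simple("pmc_core_driver",
								   -1, NULL, 0);
			if (IS_ERR(pmc_pdev)) {
				platform_driver_unregister(&pmc_plat_driver);
				return PTR_ERR(pmc_pdev);
			}
		}

		return 0;
	}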

Thanks,

Rajat

>
>
>
> -builtin_pci_driver(intel_pmc_core_driver);
>
> +static struct platform_driver pmc_plat_driver = {
>
> + .remove = pmc_plat_remove,
>
> + .probe = pmc_plat_probe,
>
> + .driver = {
>
> + .name = "pmc_core_driver",
>
> + .acpi_match_table = ACPI_PTR(pmc_acpi_ids),
>
> + },
>
> +};
>
> ---
> This is rebased off
> git://git.infradead.org/linux-platform-drivers-x86.git/for-next
>
>  drivers/platform/x86/intel_pmc_core.c | 93 ---
>  1 file changed, 68 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/platform/x86/intel_pmc_core.c 
> b/drivers/platform/x86/intel_pmc_core.c
> index f2c621b55f49..55578d07610c 100644
> --- a/drivers/platform/x86/intel_pmc_core.c
> +++ b/drivers/platform/x86/intel_pmc_core.c
> @@ -19,6 +19,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>
>  #include 
> @@ -854,12 +855,59 @@ static const struct dmi_system_id pmc_core_dmi_table[]  
> = {
>   {}
>  };
>
> -static int __init pmc_core_probe(void)
> +static int pmc_core_probe(struct platform_device *pdev)
>  {
> - struct pmc_dev *pmcdev = &pmc;
> + struct pmc_dev *pmcdev = platform_get_drvdata(pdev);
> + int err;
> +
> + pmcdev->regbase = ioremap(pmcdev->base_addr,
> +   pmcdev->map->regmap_length);
> + if (!pmcdev->regbase)
> + return -ENOMEM;
> +
> + mutex_init(&pmcdev->lock);
> + pmcdev->pmc_xram_read_bit = 

Re: [PATCH v3] rcu: Allow to eliminate softirq processing from rcutree

2019-03-22 Thread Paul E. McKenney
On Fri, Mar 22, 2019 at 07:48:19PM -0400, Joel Fernandes wrote:
> On Wed, Mar 20, 2019 at 10:13:33PM +0100, Sebastian Andrzej Siewior wrote:
> > Running RCU out of softirq is a problem for some workloads that would
> > like to manage RCU core processing independently of other softirq
> > work, for example, setting kthread priority.  This commit therefore
> > introduces the `rcunosoftirq' option which moves the RCU core work
> > from softirq to a per-CPU/per-flavor SCHED_OTHER kthread named rcuc.
> > The SCHED_OTHER approach avoids the scalability problems that appeared
> > with the earlier attempt to move RCU core processing to from softirq
> > to kthreads.  That said, kernels built with RCU_BOOST=y will run the
> > rcuc kthreads at the RCU-boosting priority.
> [snip]
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 0f31b79eb6761..05a1e42fdaf10 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -51,6 +51,12 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include "../time/tick-internal.h"
> >  
> >  #include "tree.h"
> >  #include "rcu.h"
> > @@ -92,6 +98,9 @@ struct rcu_state rcu_state = {
> >  /* Dump rcu_node combining tree at boot to verify correct setup. */
> >  static bool dump_tree;
> >  module_param(dump_tree, bool, 0444);
> > +/* Move RCU_SOFTIRQ to rcuc kthreads. */
> > +static bool use_softirq = 1;
> > +module_param(use_softirq, bool, 0444);
> >  /* Control rcu_node-tree auto-balancing at boot time. */
> >  static bool rcu_fanout_exact;
> >  module_param(rcu_fanout_exact, bool, 0444);
> > @@ -2253,7 +2262,7 @@ void rcu_force_quiescent_state(void)
> >  EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
> >  
> >  /* Perform RCU core processing work for the current CPU.  */
> > -static __latent_entropy void rcu_core(struct softirq_action *unused)
> > +static __latent_entropy void rcu_core(void)
> >  {
> > unsigned long flags;
> > 	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
> > @@ -2295,6 +2304,34 @@ static __latent_entropy void rcu_core(struct 
> > softirq_action *unused)
> > trace_rcu_utilization(TPS("End RCU core"));
> >  }
> >  
> > +static void rcu_core_si(struct softirq_action *h)
> > +{
> > +   rcu_core();
> > +}
> > +
> > +static void rcu_wake_cond(struct task_struct *t, int status)
> > +{
> > +   /*
> > +* If the thread is yielding, only wake it when this
> > +* is invoked from idle
> > +*/
> > +   if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
> > +   wake_up_process(t);
> > +}
> > +
> > +static void invoke_rcu_core_kthread(void)
> > +{
> > +   struct task_struct *t;
> > +   unsigned long flags;
> > +
> > +   local_irq_save(flags);
> > +   __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
> > +   t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
> > +   if (t != NULL && t != current)
> > +   rcu_wake_cond(t, 
> > __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
> > +   local_irq_restore(flags);
> > +}
> > +
> >  /*
> >   * Schedule RCU callback invocation.  If the running implementation of RCU
> >   * does not support RCU priority boosting, just do a direct call, otherwise
> > @@ -2306,19 +2343,95 @@ static void invoke_rcu_callbacks(struct rcu_data 
> > *rdp)
> >  {
> > if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
> > return;
> > -   if (likely(!rcu_state.boost)) {
> > -   rcu_do_batch(rdp);
> > -   return;
> > -   }
> > -   invoke_rcu_callbacks_kthread();
> > +   if (rcu_state.boost || !use_softirq)
> > +   invoke_rcu_core_kthread();
> > +   rcu_do_batch(rdp);
> 
> Shouldn't there be an else before the rcu_do_batch? If we are waking up the
> rcuc thread, then that will do the rcu_do_batch when it runs right?
> 
> Something like:
>   if (rcu_state.boost || !use_softirq)
>   invoke_rcu_core_kthread();
>   else
>   rcu_do_batch(rdp);
> 
> Previous code similarly had a return; also.

I believe that you are correct, so I will give it a shot.  Good eyes!

> >  }
> >  
> > +/*
> > + * Wake up this CPU's rcuc kthread to do RCU core processing.
> > + */
> >  static void invoke_rcu_core(void)
> >  {
> > -   if (cpu_online(smp_processor_id()))
> > +   if (!cpu_online(smp_processor_id()))
> > +   return;
> > +   if (use_softirq)
> > raise_softirq(RCU_SOFTIRQ);
> > +   else
> > +   invoke_rcu_core_kthread();
> >  }
> >  
> > +static void rcu_cpu_kthread_park(unsigned int cpu)
> > +{
> > +   per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
> > +}
> > +
> > +static int rcu_cpu_kthread_should_run(unsigned int cpu)
> > +{
> > +   return __this_cpu_read(rcu_data.rcu_cpu_has_work);
> > +}
> > +
> > +/*
> > + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces
> > + * the RCU softirq used in configurations of RCU that do not support RCU
> > + * priority boosting.
> > + */
> > +static void 

Re: [PATCH v2 00/11] arm64: dts: g12a: Add boards peripherals

2019-03-22 Thread Kevin Hilman
Neil Armstrong  writes:

> Following [1], add regulators, bluetooth, UART and ADC keys on :
> - meson-g12a-x96-max
> - meson-g12a-u200
> - meson-g12a-sei510
>
> Changes since v1:
> - Collected martin's reviewed-bys
> - Added missing regulators in commit logs
> - fixed x96 dc_in regulator
> - fixed includes overall
> - removed dwc2 enable following disabled remove in [1]

I queued patches 1-5/11 for v5.2 (branch: v5.2/dt64).  The others have
some dependencies, or you mentioned you'll do a v3.

Kevin


Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

2019-03-22 Thread Subhra Mazumdar



On 3/21/19 2:20 PM, Julien Desfossez wrote:

On Tue, Mar 19, 2019 at 10:31 PM Subhra Mazumdar 
wrote:

On 3/18/19 8:41 AM, Julien Desfossez wrote:


On further investigation, we could see that the contention is mostly in the
way rq locks are taken. With this patchset, we lock the whole core if
cpu.tag is set for at least one cgroup. Due to this, __schedule() is more or
less serialized for the core and that attributes to the performance loss
that we are seeing. We also saw that newidle_balance() takes considerably
long time in load_balance() due to the rq spinlock contention. Do you think
it would help if the core-wide locking was only performed when absolutely
needed ?

Is the core wide lock primarily responsible for the regression? I ran up
to patch 12, which also has the core wide lock for tagged cgroups and
also calls newidle_balance() from pick_next_task(). I don't see any
regression. Of course the core sched version of pick_next_task() may be
doing more, but comparing with __pick_next_task() it doesn't look too
horrible.


Re: [PATCH v4 4/5] soc: qcom: socinfo: Expose custom attributes

2019-03-22 Thread Stephen Boyd
Quoting Vaishali Thakkar (2019-03-20 22:51:20)
> On Thu, 14 Mar 2019 at 21:28, Stephen Boyd  wrote:
> >
> > Quoting Vaishali Thakkar (2019-03-14 04:25:16)
> > > On Fri, 1 Mar 2019 at 03:02, Stephen Boyd  wrote:
> >
> > >
> > > In the case of converting it to cpu native during probe, I'll need to
> > > declare an extra struct with u32 being the parsed version for it to be
> > > correct. Wouldn't it add extra overhead?
> >
> > Yes it would be some small extra overhead that could be allocated on the
> > kernel's heap. What's the maximum size? A hundred bytes or so?
> >
> > I don't see much of a problem with this approach. It simplifies the
> > patch series because nothing new is introduced in debugfs core and the
> > endian conversion is done once in one place instead of being scattered
> > throughout the code. Sounds like a good improvement to me.
> >
> 
> Yes, it's true that this approach is better than introducing new endian
> functions in debugfs core but we should also keep in mind that this is
> applicable only for 4 use cases. For other usecases, we want to print
> string and hex values. So, I would either need new debugfs core
> functions for the same. I tried introducing debugfs_create_str for string
> values but we're ending up with introducing bunch of other helpers in
> the core as simple_attr_read expects integer values. Similarly, for hex
> values , I can't use debugfs_create_u32 as defined attributes in the
> core has unsigned int as a specifier, will need to introduce some extra
> helpers over there again.

I imagine there are other uses of printing a string and hex value in
debugfs. There's debugfs_create_x32() and debugfs_create_x64() for the
hex value printing part (if you want that format). There's also
debugfs_create_devm_seqfile() which may work to print a string from some
struct member. I'm not sure why you're using simple_attr_read(). Where
does that become important?
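
For instance, something along these lines (an untested sketch just to
illustrate; the struct and field names here are made up):

	/* A hex-formatted value can use the stock helper directly: */
	debugfs_create_x32("raw_id", 0444, qcom_socinfo_root, &soc_info->raw_id);

	/* A string can go through debugfs_create_devm_seqfile(): */
	static struct socinfo_data *soc_info;	/* stashed at probe time */

	static int qcom_show_build_id(struct seq_file *seq, void *p)
	{
		seq_printf(seq, "%s\n", soc_info->build_id);
		return 0;
	}

	debugfs_create_devm_seqfile(dev, "build_id", qcom_socinfo_root,
				    qcom_show_build_id);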

> 
> Also, in case of keeping all other cases as it is, it'll look quite
> asymmetric to use debugfs u32 function in init and using local macros
> for other cases. I can have DEBUGFS_UINT_ADD like wrapper
> macro for debugfs_create_u32 but again not sure if doing
> all of this looks better than what we have at the moment as just having
> 3 local macros covering our all cases without having lot of duplicated
> code.
> 
>  Let me know if about your opinion on the same. Thanks.

My opinion is still that it would be best to push things that aren't SoC
specific into the debugfs core and try to use as much from the core as
possible. There doesn't seem to be anything very SoC specific here so
I'm lost why this isn't doable.



Re: [PATCH REBASED] mm, memcg: Make scan aggression always exclude protection

2019-03-22 Thread Chris Down

Chris Down writes:
> Are you certain? If so, I don't see what you mean. This is how the
> code looks in Linus' tree after the fixups:


Hmm, apparently this actually didn't go into Linus' tree yet, so yeah, seems 
worth having as a fixup maybe indeed.


Re: [PATCH] genirq: Respect IRQCHIP_SKIP_SET_WAKE in irq_chip_set_wake_parent()

2019-03-22 Thread Stephen Boyd
Quoting Thomas Gleixner (2019-03-21 02:26:26)
> On Fri, 15 Mar 2019, Stephen Boyd wrote:
> 
> > This function returns an error if a child interrupt controller calls
> > irq_chip_set_wake_parent() but that parent interrupt controller has the
> > IRQCHIP_SKIP_SET_WAKE flag. Let's return 0 for success instead because
> > there isn't anything to do.
> > 
> > There's also the possibility that a parent indicates that we should skip
> > it, but the grandparent has an .irq_set_wake callback. Let's iterate
> > through the parent chain as long as the IRQCHIP_SKIP_SET_WAKE flag isn't
> > set so we can find the first parent that needs to handle the wake
> > configuration. This fixes a problem on my Qualcomm sdm845 device where
> > I'm trying to enable wake on an irq from the gpio controller that's a
> > child of the qcom pdc interrupt controller. The qcom pdc interrupt
> > controller has the IRQCHIP_SKIP_SET_WAKE flag set, and so does the
> > grandparent (ARM GIC), causing this function to return a failure because
> > the parent controller doesn't have the .irq_set_wake callback set.
> 
> It took me some time to distangle that changelog and I don't think that
> this is the right thing to do.

Yes, your diagram would be a useful addition to the commit text.

> 
> set_irq_wake_real() returns 0 when the topmost chip has SKIP_SET_WAKE set.

Just to confirm, the topmost chip would be chip B or chip C below?

> 
> So let's assume we have the following chains:
> 
>   chip A -> chip B 
> 
>   chip A -> chip B -> chip C
> 
> chip A has SKIP_SET_WAKE not set
> chip B has SKIP_SET_WAKE set
> chip C has SKIP_SET_WAKE not set and invokes irq_chip_set_wake_parent()
> 
> Now assume we have interrupt X connected to chip B and interrupt Y
> connected to chip C.
> 
> If irq_set_wake() is called for interrupt X, then the function returns
> without trying to invoke the set_wake() callback of chip A.

It's not clear to me that having SKIP_SET_WAKE set means "completely
ignore set wake for irqs from this domain" vs. "skip setting wake here
because the .irq_set_wake() is intentionally omitted for this chip".
Reading Santosh's reasoning in commit 60f96b41f71d ("genirq: Add
IRQCHIP_SKIP_SET_WAKE flag") just further confuses me because it sounds
like the latter.

> 
> If irq_set_wake() is called for interrupt Y, irq_chip_set_wake_parent() is
> invoked from chip C which then skips chip B, but tries to invoke the
> callback on chip A.
> 
> That's inconsistent and changes the existing behaviour. So IMO, the right
> thing to do is to return 0 from irq_chip_set_wake_parent() when the parent
> has SKIP_SET_WAKE set and not to try to follow the whole chain. That should
> fix your problem nicely w/o changing behaviour.

Ok. I understand that with hierarchical chips you want it to be explicit
in the code that a parent chip needs to be called or not. This works for
me, and it's actually how I had originally solved this problem. Will you
merge your patch or do you want me to resend it with some updated commit
text?
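
For reference, here's a minimal sketch of the behaviour you describe,
just to confirm I'm reading it right (untested):

	int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
	{
		data = data->parent_data;

		/* Nothing to do when the parent wants set_wake skipped. */
		if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE)
			return 0;

		if (data->chip->irq_set_wake)
			return data->chip->irq_set_wake(data, on);

		return -ENOSYS;
	}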



Re: [PATCH v3] rcu: Allow to eliminate softirq processing from rcutree

2019-03-22 Thread Joel Fernandes
On Wed, Mar 20, 2019 at 10:13:33PM +0100, Sebastian Andrzej Siewior wrote:
> Running RCU out of softirq is a problem for some workloads that would
> like to manage RCU core processing independently of other softirq
> work, for example, setting kthread priority.  This commit therefore
> introduces the `rcunosoftirq' option which moves the RCU core work
> from softirq to a per-CPU/per-flavor SCHED_OTHER kthread named rcuc.
> The SCHED_OTHER approach avoids the scalability problems that appeared
> with the earlier attempt to move RCU core processing to from softirq
> to kthreads.  That said, kernels built with RCU_BOOST=y will run the
> rcuc kthreads at the RCU-boosting priority.
[snip]
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 0f31b79eb6761..05a1e42fdaf10 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -51,6 +51,12 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "../time/tick-internal.h"
>  
>  #include "tree.h"
>  #include "rcu.h"
> @@ -92,6 +98,9 @@ struct rcu_state rcu_state = {
>  /* Dump rcu_node combining tree at boot to verify correct setup. */
>  static bool dump_tree;
>  module_param(dump_tree, bool, 0444);
> +/* Move RCU_SOFTIRQ to rcuc kthreads. */
> +static bool use_softirq = 1;
> +module_param(use_softirq, bool, 0444);
>  /* Control rcu_node-tree auto-balancing at boot time. */
>  static bool rcu_fanout_exact;
>  module_param(rcu_fanout_exact, bool, 0444);
> @@ -2253,7 +2262,7 @@ void rcu_force_quiescent_state(void)
>  EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
>  
>  /* Perform RCU core processing work for the current CPU.  */
> -static __latent_entropy void rcu_core(struct softirq_action *unused)
> +static __latent_entropy void rcu_core(void)
>  {
>   unsigned long flags;
>   struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
> @@ -2295,6 +2304,34 @@ static __latent_entropy void rcu_core(struct 
> softirq_action *unused)
>   trace_rcu_utilization(TPS("End RCU core"));
>  }
>  
> +static void rcu_core_si(struct softirq_action *h)
> +{
> + rcu_core();
> +}
> +
> +static void rcu_wake_cond(struct task_struct *t, int status)
> +{
> + /*
> +  * If the thread is yielding, only wake it when this
> +  * is invoked from idle
> +  */
> + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
> + wake_up_process(t);
> +}
> +
> +static void invoke_rcu_core_kthread(void)
> +{
> + struct task_struct *t;
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
> + t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
> + if (t != NULL && t != current)
> + rcu_wake_cond(t, 
> __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
> + local_irq_restore(flags);
> +}
> +
>  /*
>   * Schedule RCU callback invocation.  If the running implementation of RCU
>   * does not support RCU priority boosting, just do a direct call, otherwise
> @@ -2306,19 +2343,95 @@ static void invoke_rcu_callbacks(struct rcu_data *rdp)
>  {
>   if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
>   return;
> - if (likely(!rcu_state.boost)) {
> - rcu_do_batch(rdp);
> - return;
> - }
> - invoke_rcu_callbacks_kthread();
> + if (rcu_state.boost || !use_softirq)
> + invoke_rcu_core_kthread();
> + rcu_do_batch(rdp);

Shouldn't there be an else before the rcu_do_batch? If we are waking up the
rcuc thread, then that will do the rcu_do_batch when it runs right?

Something like:
if (rcu_state.boost || !use_softirq)
invoke_rcu_core_kthread();
else
rcu_do_batch(rdp);

Previous code similarly had a return; also.

>  }
>  
> +/*
> + * Wake up this CPU's rcuc kthread to do RCU core processing.
> + */
>  static void invoke_rcu_core(void)
>  {
> - if (cpu_online(smp_processor_id()))
> + if (!cpu_online(smp_processor_id()))
> + return;
> + if (use_softirq)
>   raise_softirq(RCU_SOFTIRQ);
> + else
> + invoke_rcu_core_kthread();
>  }
>  
> +static void rcu_cpu_kthread_park(unsigned int cpu)
> +{
> + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
> +}
> +
> +static int rcu_cpu_kthread_should_run(unsigned int cpu)
> +{
> + return __this_cpu_read(rcu_data.rcu_cpu_has_work);
> +}
> +
> +/*
> + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces
> + * the RCU softirq used in configurations of RCU that do not support RCU
> + * priority boosting.
> + */
> +static void rcu_cpu_kthread(unsigned int cpu)
> +{
> + unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
> + char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
> + int spincnt;
> +
> + for (spincnt = 0; spincnt < 10; spincnt++) {
> + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
> + 

Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

2019-03-22 Thread Tim Chen
On 3/22/19 4:28 PM, Tim Chen wrote:
> On 3/19/19 7:29 PM, Subhra Mazumdar wrote:
>>
>> On 3/18/19 8:41 AM, Julien Desfossez wrote:
>>> The case where we try to acquire the lock on 2 runqueues belonging to 2
>>> different cores requires the rq_lockp wrapper as well otherwise we
>>> frequently deadlock in there.
>>>
>>> This fixes the crash reported in
>>> 1552577311-8218-1-git-send-email-jdesfos...@digitalocean.com
>>>
>>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>>> index 76fee56..71bb71f 100644
>>> --- a/kernel/sched/sched.h
>>> +++ b/kernel/sched/sched.h
>>> @@ -2078,7 +2078,7 @@ static inline void double_rq_lock(struct rq *rq1, 
>>> struct rq *rq2)
>>>   raw_spin_lock(rq_lockp(rq1));
>>>   __acquire(rq2->lock);    /* Fake it out ;) */
>>>   } else {
>>> -    if (rq1 < rq2) {
>>> +    if (rq_lockp(rq1) < rq_lockp(rq2)) {
>>>   raw_spin_lock(rq_lockp(rq1));
>>>   raw_spin_lock_nested(rq_lockp(rq2), SINGLE_DEPTH_NESTING);
>>>   } else {
> 
> 
> Pawan was seeing occasional crashes and lock up that's avoided by doing the 
> following.
> We're trying to dig a little more tracing to see why pick_next_entity is 
> returning
> NULL.
> 

We found the root cause was a missing chunk when we ported Subhra's fix of
pick_next_entity:

 * Someone really wants this to run. If it's not unfair, run it.
*/
-   if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+   if (left && cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left)
+   < 1) 

That fixes the problem of pick_next_entity returning NULL.  sorry for the noise.

Tim


Re: [RFC][PATCH 03/16] sched: Wrap rq::lock access

2019-03-22 Thread Tim Chen
On 3/19/19 7:29 PM, Subhra Mazumdar wrote:
> 
> On 3/18/19 8:41 AM, Julien Desfossez wrote:
>> The case where we try to acquire the lock on 2 runqueues belonging to 2
>> different cores requires the rq_lockp wrapper as well otherwise we
>> frequently deadlock in there.
>>
>> This fixes the crash reported in
>> 1552577311-8218-1-git-send-email-jdesfos...@digitalocean.com
>>
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 76fee56..71bb71f 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -2078,7 +2078,7 @@ static inline void double_rq_lock(struct rq *rq1, 
>> struct rq *rq2)
>>   raw_spin_lock(rq_lockp(rq1));
>>   __acquire(rq2->lock);    /* Fake it out ;) */
>>   } else {
>> -    if (rq1 < rq2) {
>> +    if (rq_lockp(rq1) < rq_lockp(rq2)) {
>>   raw_spin_lock(rq_lockp(rq1));
>>   raw_spin_lock_nested(rq_lockp(rq2), SINGLE_DEPTH_NESTING);
>>   } else {


Pawan was seeing occasional crashes and lock up that's avoided by doing the 
following.
We're trying to dig a little more tracing to see why pick_next_entity is 
returning
NULL.

Tim

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5349ebedc645..4c7f353b8900 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7031,6 +7031,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct 
*prev, struct rq_flags *rf
}
 
se = pick_next_entity(cfs_rq, curr);
+   if (!se)
+   return NULL;
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
 
@@ -7070,6 +7072,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct 
*prev, struct rq_flags *rf
 
do {
se = pick_next_entity(cfs_rq, NULL);
+   if (!se)
+   return NULL;
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);


Re: [PATCH 04/10] x86, olpc: Use a correct version when making up a battery node

2019-03-22 Thread Thomas Gleixner
On Sun, 10 Mar 2019, Lubomir Rintel wrote:

Subject prefix ...

> The XO-1 and XO-1.5 batteries apparently differ in an ability to report
> ambient temperature. We need to use a different compatible string for the
> XO-1.5 battery.
> 
> Previously olpc_dt_fixup() used the presence od the battery node's

s/od/of/

>  
> +int olpc_dt_compatible_match(phandle node, const char *compat)
> +{
> + char buf[64];
> + int plen;
> + char *p;
> + int len;

Please coalesce variables of the same type. No point in wasting space.

char buf[64], *p;
int plen, len;

Hmm?

> +
> + if (olpc_dt_compatible_match(node, "olpc,xo1-battery")) {
> + /* If we have a olpc,xo1-battery compatible, then we're
> +  * running a new enough firmware that already has
> +  * the dcon node.
> +  */

Comment style:

 /*
  * This is a proper multi line comment even
  * if networking people use that horrible style
  * above.
  */

With those nitpicks fixed:

Acked-by: Thomas Gleixner 


[PATCH 1/8] vfio/mdev: Fix to not do put_device on device_register failure

2019-03-22 Thread Parav Pandit
device_register() performs put_device() if device_add() fails.
This balances with device_initialize().

mdev core performing put_device() when device_register() fails
is an error that puts an already released device again.
Therefore, don't put the device on error.

Fixes: 7b96953bc640 ("vfio: Mediated device Core driver")
Signed-off-by: Parav Pandit 
---
 drivers/vfio/mdev/mdev_core.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 0212f0e..3e5880a 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -318,10 +318,8 @@ int mdev_device_create(struct kobject *kobj, struct device 
*dev, uuid_le uuid)
dev_set_name(&mdev->dev, "%pUl", uuid.b);
 
ret = device_register(&mdev->dev);
-   if (ret) {
-   put_device(&mdev->dev);
+   if (ret)
goto mdev_fail;
-   }
 
ret = mdev_device_create_ops(kobj, mdev);
if (ret)
-- 
1.8.3.1



[PATCH 2/8] vfio/mdev: Avoid release parent reference during error path

2019-03-22 Thread Parav Pandit
During mdev parent registration in mdev_register_device(),
if the parent device is a duplicate, it releases the reference of the
existing parent device.
This is incorrect. The existing parent device should not be touched.

Fixes: 7b96953bc640 ("vfio: Mediated device Core driver")
Signed-off-by: Parav Pandit 
---
 drivers/vfio/mdev/mdev_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 3e5880a..4f213e4d 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -182,6 +182,7 @@ int mdev_register_device(struct device *dev, const struct 
mdev_parent_ops *ops)
/* Check for duplicate */
parent = __find_parent_device(dev);
if (parent) {
+   parent = NULL;
ret = -EEXIST;
goto add_dev_err;
}
-- 
1.8.3.1



[PATCH 5/8] vfio/mdev: Avoid masking error code to EBUSY

2019-03-22 Thread Parav Pandit
Instead of masking return error to -EBUSY, return actual error
returned by the driver.

Signed-off-by: Parav Pandit 
---
 drivers/vfio/mdev/mdev_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 3d91f62..ab05464 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -142,7 +142,7 @@ static int mdev_device_remove_ops(struct mdev_device *mdev, 
bool force_remove)
 */
ret = parent->ops->remove(mdev);
if (ret && !force_remove)
-   return -EBUSY;
+   return ret;
 
sysfs_remove_groups(&mdev->dev.kobj, parent->ops->mdev_attr_groups);
return 0;
-- 
1.8.3.1



[PATCH 7/8] vfio/mdev: Fix aborting mdev child device removal if one fails

2019-03-22 Thread Parav Pandit
device_for_each_child() stops executing the callback function for the
remaining child devices if the callback hits an error.
Each child mdev device is independent of the others.
While unregistering the parent device, the mdev core must remove all child
mdev devices.
Therefore, mdev_device_remove_cb() now always returns success so that
device_for_each_child() doesn't abort if one child removal hits an error.

While at it, simplify the remove and unregister functions as described below.

There is no need to pass a forced-flag pointer during mdev parent
removal, which invokes mdev_device_remove(). So simplify the flow.

mdev_device_remove() is called from two paths:
1. mdev_unregister_driver()
     mdev_device_remove_cb()
       mdev_device_remove()
2. remove_store()
     mdev_device_remove()

When a device is removed by the user using remove_store(), the device under
removal is an mdev device.
When a device is removed during parent device removal using the generic child
iterator, the mdev check is already done using dev_is_mdev().

Hence, remove the unnecessary loop in mdev_device_remove().

Fixes: 7b96953bc640 ("vfio: Mediated device Core driver")
Signed-off-by: Parav Pandit 
---
 drivers/vfio/mdev/mdev_core.c | 24 +---
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index ab05464..944a058 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -150,10 +150,10 @@ static int mdev_device_remove_ops(struct mdev_device 
*mdev, bool force_remove)
 
 static int mdev_device_remove_cb(struct device *dev, void *data)
 {
-   if (!dev_is_mdev(dev))
-   return 0;
+   if (dev_is_mdev(dev))
+   mdev_device_remove(dev, true);
 
-   return mdev_device_remove(dev, data ? *(bool *)data : true);
+   return 0;
 }
 
 /*
@@ -241,7 +241,6 @@ int mdev_register_device(struct device *dev, const struct 
mdev_parent_ops *ops)
 void mdev_unregister_device(struct device *dev)
 {
struct mdev_parent *parent;
-   bool force_remove = true;
 
mutex_lock(&parent_list_lock);
parent = __find_parent_device(dev);
@@ -255,8 +254,7 @@ void mdev_unregister_device(struct device *dev)
list_del(&parent->next);
class_compat_remove_link(mdev_bus_compat_class, dev, NULL);
 
-   device_for_each_child(dev, (void *)&force_remove,
- mdev_device_remove_cb);
+   device_for_each_child(dev, NULL, mdev_device_remove_cb);
 
parent_remove_sysfs_files(parent);
 
@@ -346,24 +344,12 @@ int mdev_device_create(struct kobject *kobj, struct 
device *dev, uuid_le uuid)
 
 int mdev_device_remove(struct device *dev, bool force_remove)
 {
-   struct mdev_device *mdev, *tmp;
+   struct mdev_device *mdev;
struct mdev_parent *parent;
struct mdev_type *type;
int ret;
 
mdev = to_mdev_device(dev);
-
-   mutex_lock(&mdev_list_lock);
-   list_for_each_entry(tmp, &mdev_list, next) {
-   if (tmp == mdev)
-   break;
-   }
-
-   if (tmp != mdev) {
-   mutex_unlock(&mdev_list_lock);
-   return -ENODEV;
-   }
-
if (!mdev->active) {
mutex_unlock(&mdev_list_lock);
return -EAGAIN;
-- 
1.8.3.1
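
The fix above relies on device_for_each_child() stopping as soon as the
callback returns a non-zero value; returning 0 unconditionally from
mdev_device_remove_cb() is what guarantees every child gets a removal
attempt. Purely as an illustrative aside (a standalone userspace toy with
made-up names, not kernel code), the iteration semantics look like this:

/* Toy model of device_for_each_child()'s early-stop semantics.
 * Build with: cc -o foreach_demo foreach_demo.c
 */
#include <stdio.h>

struct toy_device { const char *name; int remove_fails; };

/* Mimics the iterator: stop as soon as fn() returns non-zero. */
static int toy_for_each_child(struct toy_device *children, int n,
                              int (*fn)(struct toy_device *))
{
    int i, ret;

    for (i = 0; i < n; i++) {
        ret = fn(&children[i]);
        if (ret)
            return ret;   /* remaining children are skipped */
    }
    return 0;
}

/* Old style: propagate the error, so later children are never visited. */
static int remove_cb_propagate(struct toy_device *d)
{
    printf("removing %s\n", d->name);
    return d->remove_fails ? -1 : 0;
}

/* New style: always return 0 so every child gets a removal attempt. */
static int remove_cb_always_zero(struct toy_device *d)
{
    if (remove_cb_propagate(d))
        printf("  (removal of %s failed, continuing anyway)\n", d->name);
    return 0;
}

int main(void)
{
    struct toy_device kids[] = {
        { "mdev0", 0 }, { "mdev1", 1 }, { "mdev2", 0 },
    };

    printf("propagating errors:\n");
    toy_for_each_child(kids, 3, remove_cb_propagate);   /* mdev2 skipped */

    printf("always returning 0:\n");
    toy_for_each_child(kids, 3, remove_cb_always_zero); /* all visited */
    return 0;
}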



[PATCH 0/8] vfio/mdev: Improve vfio/mdev core module

2019-03-22 Thread Parav Pandit
We would like to use the mdev subsystem for a wider set of use cases, as
discussed in [1] and [2] and in an offline discussion.
This use case was also discussed with a wider forum in [4], in the track
'Lightweight NIC HW functions for container offload use cases'.

This series is prep work and improves the vfio/mdev module in the following ways.

Patches 1 and 2 fix releasing the parent device reference during error
unwinding of mdev create and mdev parent registration.
Patch 3 removes an unused kref from the mdev device.
Patch 4 drops the redundant extern prefix of exported symbols.
Patch 5 returns the right error code from the vendor driver.
Patch 6 fixes the code to use the right sysfs remove sequence.
Patch 7 fixes removing all child devices if one of them fails.
Patch 8 brings the following improvements to mdev.

1. Fix race conditions among the mdev parent's create(), remove() and
mdev parent unregistration routines that lead to call traces.

2. Set up the vendor mdev device before placing the device on the mdev bus.
This ensures that vfio_mdev, or any other module that accesses the mdev
in any of the callbacks of mdev_register_driver(), sees a fully
initialized device. This now follows the Linux driver model.
Similarly, follow the exact reverse remove sequence, i.e. take the
device off the bus first before removing the underlying hardware mdev.

This series was tested using
(a) mtty with a VM using the vfio_mdev driver for positive tests.
(b) mtty with vfio_mdev exercising error and race-condition cases of create,
remove and the mtty driver.
(c) the mlx5 core driver using the RFC patches [3] and internal patches.
The internal patches are large and cannot be combined with these
prep-work patches. They will be posted once the prep work completes.

[1] https://www.spinics.net/lists/netdev/msg556978.html
[2] https://lkml.org/lkml/2019/3/7/696
[3] https://lkml.org/lkml/2019/3/8/819
[4] https://netdevconf.org/0x13/session.html?workshop-hardware-offload


Parav Pandit (8):
  vfio/mdev: Fix to not do put_device on device_register failure
  vfio/mdev: Avoid release parent reference during error path
  vfio/mdev: Removed unused kref
  vfio/mdev: Drop redundant extern for exported symbols
  vfio/mdev: Avoid masking error code to EBUSY
  vfio/mdev: Follow correct remove sequence
  vfio/mdev: Fix aborting mdev child device removal if one fails
  vfio/mdev: Improve the create/remove sequence

 drivers/vfio/mdev/mdev_core.c| 164 +++
 drivers/vfio/mdev/mdev_private.h |   8 +-
 drivers/vfio/mdev/mdev_sysfs.c   |   8 +-
 include/linux/mdev.h |  21 +++--
 4 files changed, 98 insertions(+), 103 deletions(-)

-- 
1.8.3.1



[PATCH 8/8] vfio/mdev: Improve the create/remove sequence

2019-03-22 Thread Parav Pandit
There are five problems with the current code structure.
1. The mdev device is placed on the mdev bus before it is created in the
vendor driver. Once a device is placed on the mdev bus without creating
its supporting underlying vendor device, an open() can get triggered by
userspace on a partially initialized device.
The ladder diagram below highlights it.

  cpu-0   cpu-1
  -   -
   create_store()
 mdev_create_device()
   device_register()
  ...
 vfio_mdev_probe()
 ...creates char device
vfio_mdev_open()
  parent->ops->open(mdev)
vfio_ap_mdev_open()
  matrix_mdev = NULL
[...]
parent->ops->create()
  vfio_ap_mdev_create()
mdev_set_drvdata(mdev, matrix_mdev);
/* Valid pointer set above */

2. The current creation sequence is,
   parent->ops->create()
   groups_register()

The remove sequence is,
   parent->ops->remove()
   groups_unregister()
However, the remove sequence should be the exact mirror of the creation
sequence. Once this is achieved, all users of the mdev will be terminated
first, before removing the underlying vendor device.
(Follow the standard Linux driver model.)
At that point the vendor's remove() op shouldn't fail, because the device
has already been taken off the bus, which terminates its users.

3. Additionally, any new mdev driver that wants to work on the mdev device
during its probe() routine, registered using mdev_register_driver(), needs
to see a stable mdev structure.

4. In the following sequences, child devices created while removing the mdev
parent device can be left behind, or the removal may race with
half-initialized child mdev devices.

issue-1:

   cpu-0 cpu-1
   - -
  mdev_unregister_device()
 device_for_each_child()
mdev_device_remove_cb()
mdev_device_remove()
create_store()
  mdev_device_create()   [...]
   device_register()
  parent_remove_sysfs_files()
  /* BUG: device added by cpu-0
   * whose parent is getting removed.
   */

issue-2:

   cpu-0 cpu-1
   - -
create_store()
  mdev_device_create()   [...]
   device_register()

   [...]  mdev_unregister_device()
 device_for_each_child()
mdev_device_remove_cb()
mdev_device_remove()

   mdev_create_sysfs_files()
   /* BUG: create is adding
* sysfs files for a device
* which is undergoing removal.
*/
 parent_remove_sysfs_files()

5. The crash below is observed when a user-initiated remove is in progress
and mdev_unregister_driver() completes the parent unregistration.

   cpu-0 cpu-1
   - -
remove_store()
   mdev_device_remove()
   active = false;
  mdev_unregister_device()
remove type
   [...]
   mdev_remove_ops() crashes.

This is a race similar to create() racing with mdev_unregister_device().

mtty mtty: MDEV: Registered
iommu: Adding device 83b8f4f2-509f-382f-3c1e-e6bfe0fa1001 to group 57
vfio_mdev 83b8f4f2-509f-382f-3c1e-e6bfe0fa1001: MDEV: group_id = 57
mdev_device_remove sleep started
mtty mtty: MDEV: Unregistering
mtty_dev: Unloaded!
BUG: unable to handle kernel paging request at c027d668
PGD af9818067 P4D af9818067 PUD af981a067 PMD 8583c3067 PTE 0
Oops:  [#1] SMP PTI
CPU: 15 PID: 3517 Comm: bash Kdump: loaded Not tainted 5.0.0-rc7-vdevbus+ #2
Hardware name: Supermicro SYS-6028U-TR4+/X10DRU-i+, BIOS 2.0b 08/09/2016
RIP: 0010:mdev_device_remove_ops+0x1a/0x50 [mdev]
Call Trace:
 mdev_device_remove+0xef/0x130 [mdev]
 remove_store+0x77/0xa0 [mdev]
 kernfs_fop_write+0x113/0x1a0
 __vfs_write+0x33/0x1b0
 ? rcu_read_lock_sched_held+0x64/0x70
 ? rcu_sync_lockdep_assert+0x2a/0x50
 ? __sb_start_write+0x121/0x1b0
 ? vfs_write+0x17c/0x1b0
 vfs_write+0xad/0x1b0
 ? trace_hardirqs_on_thunk+0x1a/0x1c
 ksys_write+0x55/0xc0
 do_syscall_64+0x5a/0x210
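
The crash above is a symptom of the same underlying pattern: one CPU tears
down the parent while another still operates on it without a common
serialization point. Purely as an illustration (a userspace toy using
pthreads with made-up names, not the mdev code), the safe pattern of
checking a removal flag and updating child state under one lock looks
like this:

/* Toy model of serializing "create" against parent unregistration.
 * Build with: cc -pthread -o race_demo race_demo.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t parent_lock = PTHREAD_MUTEX_INITIALIZER;
static bool parent_removed;
static int nr_children;

/* Models create_store(): refuse to add a child once removal has started. */
static void *create_thread(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&parent_lock);
    if (parent_removed) {
        pthread_mutex_unlock(&parent_lock);
        printf("create: parent gone, bailing out\n");
        return NULL;
    }
    nr_children++;
    printf("create: child added, now %d\n", nr_children);
    pthread_mutex_unlock(&parent_lock);
    return NULL;
}

/* Models parent unregistration: mark removed and tear down children under
 * the same lock, so no new child can appear afterwards. */
static void *unregister_thread(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&parent_lock);
    parent_removed = true;
    printf("unregister: tearing down %d children\n", nr_children);
    nr_children = 0;
    pthread_mutex_unlock(&parent_lock);
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, create_thread, NULL);
    pthread_create(&b, NULL, unregister_thread, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}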

Therefore, the mdev core is improved in the following ways to overcome the
above issues.

1. Before placing mdev devices on the bus, perform the vendor driver's
creation which supports the mdev creation.
This ensures that all necessary mdev-specific fields are initialized
before a given mdev can be accessed by the bus driver.

2. During remove flow, first remove 

Re: [PATCH 4.4 000/230] 4.4.177-stable review

2019-03-22 Thread kernelci.org bot
stable-rc/linux-4.4.y boot: 42 boots: 3 failed, 39 passed 
(v4.4.176-231-g6926dee23fa0)

Full Boot Summary: 
https://kernelci.org/boot/all/job/stable-rc/branch/linux-4.4.y/kernel/v4.4.176-231-g6926dee23fa0/
Full Build Summary: 
https://kernelci.org/build/stable-rc/branch/linux-4.4.y/kernel/v4.4.176-231-g6926dee23fa0/

Tree: stable-rc
Branch: linux-4.4.y
Git Describe: v4.4.176-231-g6926dee23fa0
Git Commit: 6926dee23fa096361e7d170c7fa21a8b51eb7673
Git URL: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Tested: 20 unique boards, 11 SoC families, 8 builds out of 190

Boot Regressions Detected:

arm:

multi_v7_defconfig:
gcc-7:
  tegra124-nyan-big:
  lab-collabora: failing since 24 days (last pass: 
v4.4.174-119-gf5fe3003919f - first fail: v4.4.176-30-ge577ed472fce)

tegra_defconfig:
gcc-7:
  tegra124-nyan-big:
  lab-collabora: failing since 24 days (last pass: 
v4.4.174-119-gf5fe3003919f - first fail: v4.4.176-30-ge577ed472fce)

Boot Failures Detected:

arm:

multi_v7_defconfig:
gcc-7:
tegra124-nyan-big: 1 failed lab

tegra_defconfig:
gcc-7:
tegra124-nyan-big: 1 failed lab

arm64:

defconfig:
gcc-7:
qcom-qdf2400: 1 failed lab

---
For more info write to 


[PATCH 4/8] vfio/mdev: Drop redundant extern for exported symbols

2019-03-22 Thread Parav Pandit
There is no need to use 'extern' for exported functions.

Signed-off-by: Parav Pandit 
---
 include/linux/mdev.h | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index b6e048e..0924c48 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -118,21 +118,20 @@ struct mdev_driver {
 
 #define to_mdev_driver(drv)container_of(drv, struct mdev_driver, driver)
 
-extern void *mdev_get_drvdata(struct mdev_device *mdev);
-extern void mdev_set_drvdata(struct mdev_device *mdev, void *data);
-extern uuid_le mdev_uuid(struct mdev_device *mdev);
+void *mdev_get_drvdata(struct mdev_device *mdev);
+void mdev_set_drvdata(struct mdev_device *mdev, void *data);
+uuid_le mdev_uuid(struct mdev_device *mdev);
 
 extern struct bus_type mdev_bus_type;
 
-extern int  mdev_register_device(struct device *dev,
-const struct mdev_parent_ops *ops);
-extern void mdev_unregister_device(struct device *dev);
+int mdev_register_device(struct device *dev, const struct mdev_parent_ops 
*ops);
+void mdev_unregister_device(struct device *dev);
 
-extern int  mdev_register_driver(struct mdev_driver *drv, struct module 
*owner);
-extern void mdev_unregister_driver(struct mdev_driver *drv);
+int mdev_register_driver(struct mdev_driver *drv, struct module *owner);
+void mdev_unregister_driver(struct mdev_driver *drv);
 
-extern struct device *mdev_parent_dev(struct mdev_device *mdev);
-extern struct device *mdev_dev(struct mdev_device *mdev);
-extern struct mdev_device *mdev_from_dev(struct device *dev);
+struct device *mdev_parent_dev(struct mdev_device *mdev);
+struct device *mdev_dev(struct mdev_device *mdev);
+struct mdev_device *mdev_from_dev(struct device *dev);
 
 #endif /* MDEV_H */
-- 
1.8.3.1



[PATCH 6/8] vfio/mdev: Follow correct remove sequence

2019-03-22 Thread Parav Pandit
mdev_remove_sysfs_files() should follow exact mirror sequence of a
create, similar to what is followed in error unwinding path of
mdev_create_sysfs_files().

Fixes: 7b96953bc640 ("vfio: Mediated device Core driver")
Signed-off-by: Parav Pandit 
---
 drivers/vfio/mdev/mdev_sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index ce5dd21..c782fa9 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -280,7 +280,7 @@ int  mdev_create_sysfs_files(struct device *dev, struct 
mdev_type *type)
 
 void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type)
 {
+   sysfs_remove_files(&dev->kobj, mdev_device_attrs);
sysfs_remove_link(&dev->kobj, "mdev_type");
sysfs_remove_link(type->devices_kobj, dev_name(dev));
-   sysfs_remove_files(&dev->kobj, mdev_device_attrs);
 }
-- 
1.8.3.1
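
The rule applied above, namely tearing down in the exact mirror order of
creation, is the same one behind the usual goto-based error unwinding.
A minimal standalone sketch (illustrative only, generic C with made-up
resource names, not the mdev sysfs code):

/* Sketch of paired setup/teardown in mirror order with goto unwinding. */
#include <stdio.h>

static int setup_a(void) { printf("setup a\n"); return 0; }
static int setup_b(void) { printf("setup b\n"); return 0; }
static int setup_c(void) { printf("setup c\n"); return -1; /* pretend c fails */ }

static void teardown_a(void) { printf("teardown a\n"); }
static void teardown_b(void) { printf("teardown b\n"); }
static void teardown_c(void) { printf("teardown c\n"); }

static int create(void)
{
    int ret;

    ret = setup_a();
    if (ret)
        return ret;
    ret = setup_b();
    if (ret)
        goto err_b;
    ret = setup_c();
    if (ret)
        goto err_c;
    return 0;

err_c:              /* unwind in reverse creation order */
    teardown_b();
err_b:
    teardown_a();
    return ret;
}

/* The full remove path mirrors create exactly: c, then b, then a. */
static void destroy(void)
{
    teardown_c();
    teardown_b();
    teardown_a();
}

int main(void)
{
    if (create() == 0)
        destroy();  /* mirror order: c, then b, then a */
    else
        printf("create failed, partially unwound\n");
    return 0;
}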



[PATCH 3/8] vfio/mdev: Removed unused kref

2019-03-22 Thread Parav Pandit
Remove unused kref from the mdev_device structure.

Fixes: 7b96953bc640 ("vfio: Mediated device Core driver")
Signed-off-by: Parav Pandit 
---
 drivers/vfio/mdev/mdev_core.c| 1 -
 drivers/vfio/mdev/mdev_private.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 4f213e4d..3d91f62 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -311,7 +311,6 @@ int mdev_device_create(struct kobject *kobj, struct device 
*dev, uuid_le uuid)
mutex_unlock(_list_lock);
 
mdev->parent = parent;
-   kref_init(&mdev->ref);
 
mdev->dev.parent  = dev;
mdev->dev.bus = _bus_type;
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index b5819b7..84b2b6c 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -30,7 +30,6 @@ struct mdev_device {
struct mdev_parent *parent;
uuid_le uuid;
void *driver_data;
-   struct kref ref;
struct list_head next;
struct kobject *type_kobj;
bool active;
-- 
1.8.3.1



Re: [PATCH 03/10] x86, olpc: Trivial code move in DT fixup

2019-03-22 Thread Thomas Gleixner
On Sun, 10 Mar 2019, Lubomir Rintel wrote:

Same subject prefix as with previous patch please.

> This makes the following patch more concise.

Acked-by: Thomas Gleixner 


Re: [PATCH 02/10] x86, olpc: Don't split string literals when fixing up the DT

2019-03-22 Thread Thomas Gleixner
On Sun, 10 Mar 2019, Lubomir Rintel wrote:

Please use 'x86/platform/olpc:' as prefix in the subject.

> It was pointed out in a review, and checkpatch.pl complains about this.
> Breaking it down into multiple ofw evaluations works just as well, and
> perhaps even reads better.

perhaps?

Other than that:

Acked-by: Thomas Gleixner 


Re: [PATCH v2 0/8] arm64: dts: g12a: Add peripherals

2019-03-22 Thread Kevin Hilman
Neil Armstrong  writes:

> Add following peripherals :
> - AO Clock + Reset Controller
> - Pinctrl
> - UARTs
> - SAR-ADC
> - USB
> - Mali GPU
>
> USB depends on the PCIE clock and missing Reset bindings.

I left this one out for now.

> SARADC bindings is merged.

And this one, as it depends on new clock IDs.  I need a stable tag for
those.

> USB bindings has been reviewed, but not merged yet.

Also depends on a reset patch for the IDs used.  This was just merged by
Philipp, but I need a stable tag before I can merge this.

> Mali GPU bindings are still in review.

I've left the Mali nodes out for now also.

IOW, I queued patches 1-5/8 (branch: v5.2/dt64), and will wait on the
others until their dependencies are cleared up and/or I have stable tags
to use.

In the future, please note all the out-of-tree dependencies for the
series.

Thanks,

Kevin


Re: [PATCH 4.9 000/118] 4.9.165-stable review

2019-03-22 Thread kernelci.org bot
stable-rc/linux-4.9.y boot: 47 boots: 2 failed, 45 passed 
(v4.9.164-119-g7d2ac480f8d7)

Full Boot Summary: 
https://kernelci.org/boot/all/job/stable-rc/branch/linux-4.9.y/kernel/v4.9.164-119-g7d2ac480f8d7/
Full Build Summary: 
https://kernelci.org/build/stable-rc/branch/linux-4.9.y/kernel/v4.9.164-119-g7d2ac480f8d7/

Tree: stable-rc
Branch: linux-4.9.y
Git Describe: v4.9.164-119-g7d2ac480f8d7
Git Commit: 7d2ac480f8d77207af9d191a51f9ae2036117da4
Git URL: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Tested: 24 unique boards, 13 SoC families, 9 builds out of 197

Boot Regressions Detected:

arm:

multi_v7_defconfig:
gcc-7:
  tegra124-nyan-big:
  lab-collabora: failing since 24 days (last pass: v4.9.158 - first 
fail: v4.9.160-64-g0c0f9f653c9f)

tegra_defconfig:
gcc-7:
  tegra124-nyan-big:
  lab-collabora: failing since 24 days (last pass: v4.9.158 - first 
fail: v4.9.160-64-g0c0f9f653c9f)

Boot Failures Detected:

arm:

multi_v7_defconfig:
gcc-7:
tegra124-nyan-big: 1 failed lab

tegra_defconfig:
gcc-7:
tegra124-nyan-big: 1 failed lab

---
For more info write to 


Re: [PATCH 4.19 000/280] 4.19.31-stable review

2019-03-22 Thread kernelci.org bot
stable-rc/linux-4.19.y boot: 59 boots: 0 failed, 59 passed 
(v4.19.30-281-gf4bc3dea377c)

Full Boot Summary: 
https://kernelci.org/boot/all/job/stable-rc/branch/linux-4.19.y/kernel/v4.19.30-281-gf4bc3dea377c/
Full Build Summary: 
https://kernelci.org/build/stable-rc/branch/linux-4.19.y/kernel/v4.19.30-281-gf4bc3dea377c/

Tree: stable-rc
Branch: linux-4.19.y
Git Describe: v4.19.30-281-gf4bc3dea377c
Git Commit: f4bc3dea377c7afbfe0a36d2c72e9031910349e9
Git URL: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Tested: 34 unique boards, 16 SoC families, 10 builds out of 206

---
For more info write to 


Re: [PATCH REBASED] mm, memcg: Make scan aggression always exclude protection

2019-03-22 Thread Roman Gushchin
On Fri, Mar 22, 2019 at 10:49:46PM +, Chris Down wrote:
> Roman Gushchin writes:
> > I've noticed that the old version is just wrong: if cgroup_size is way 
> > smaller
> > than max(min, low), scan will be set to -lruvec_size.
> > Given that it's unsigned long, we'll end up with scanning the whole list
> > (due to clamp() below).
> 
> Are you certain? If so, I don't see what you mean. This is how the code
> looks in Linus' tree after the fixups:
> 
>unsigned long cgroup_size = mem_cgroup_size(memcg);
>unsigned long baseline = 0;
> 
>if (!sc->memcg_low_reclaim)
>baseline = lruvec_size;
>scan = lruvec_size * cgroup_size / protection - baseline;

> 
> This works correctly as far as I can tell:

I'm blaming the old version, not the new one.

New one is perfectly fine, thanks to these lines:
+   /* Avoid TOCTOU with earlier protection check */
+   cgroup_size = max(cgroup_size, protection);

The old one was racy.

Thanks!
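
For concreteness, the unsigned wrap-around being discussed (a "negative"
scan stored in an unsigned long, later clamped to the whole list) can be
reproduced in a standalone C snippet with made-up numbers, modelled on the
get_scan_count() arithmetic quoted above:

/* Illustration only; numbers are invented, not from a real workload. */
#include <stdio.h>

int main(void)
{
    unsigned long lruvec_size = 100000;
    unsigned long cgroup_size = 50;          /* way below the protection */
    unsigned long protection  = 2000;
    unsigned long baseline    = lruvec_size; /* !sc->memcg_low_reclaim case */
    unsigned long scan;

    /* without the "cgroup_size = max(cgroup_size, protection)" line: */
    scan = lruvec_size * cgroup_size / protection - baseline;
    printf("without max(): %lu (wrapped; clamp() then scans the whole list)\n",
           scan);

    /* with the fix, cgroup_size can never drop below the protection: */
    cgroup_size = cgroup_size > protection ? cgroup_size : protection;
    scan = lruvec_size * cgroup_size / protection - baseline;
    printf("with max():    %lu\n", scan);

    return 0;
}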


Re: [PATCH] EDAC, {skx|i10nm}_edac: Fix randconfig build error

2019-03-22 Thread Borislav Petkov
On Thu, Mar 21, 2019 at 03:13:39PM -0700, Luck, Tony wrote:
> 
> From: Qiuxu Zhuo 
> 
> Kbuild failed on the kernel configurations below:

...

I've massaged this into the below and running randconfigs now:

---
From: Qiuxu Zhuo 
Date: Thu, 21 Mar 2019 15:13:39 -0700
Subject: [PATCH] EDAC, skx, i10nm: Make skx_common.c a pure library

The following Kconfig constellations fail randconfig builds:

  CONFIG_ACPI_NFIT=y
  CONFIG_EDAC_DEBUG=y
  CONFIG_EDAC_SKX=m
  CONFIG_EDAC_I10NM=y

or

  CONFIG_ACPI_NFIT=y
  CONFIG_EDAC_DEBUG=y
  CONFIG_EDAC_SKX=y
  CONFIG_EDAC_I10NM=m

with:
  ...
  CC [M]  drivers/edac/skx_common.o
  ...
  .../skx_common.o:.../skx_common.c:672: undefined reference to `__this_module'

That is because if one of the two drivers - skx_edac or i10nm_edac - is
built-in and the other one is a module, the shared file skx_common.c
gets linked into a module object by kbuild. Therefore, when linking that
same file into vmlinux, the '__this_module' symbol used in debugfs isn't
defined, leading to the above error.

Fix it by moving all debugfs code from skx_common.c to both skx_base.c
and i10nm_base.c respectively. Thus, skx_common.c doesn't refer to
'__this_module' symbol anymore.

Clarify its purpose at the top of the file for future reference, while
at it.

 [ bp: Make text more readable. ]

Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server 
processors")
Reported-by: Arnd Bergmann 
Signed-off-by: Qiuxu Zhuo 
Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Cc: James Morse 
Cc: Mauro Carvalho Chehab 
Cc: linux-edac 
Link: https://lkml.kernel.org/r/20190321221339.GA32323@agluck-desk
---
 drivers/edac/i10nm_base.c | 52 +--
 drivers/edac/skx_base.c   | 50 +-
 drivers/edac/skx_common.c | 57 +++
 drivers/edac/skx_common.h |  8 --
 4 files changed, 109 insertions(+), 58 deletions(-)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index c334fb7c63df..6f06aec4877c 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -181,6 +181,54 @@ static struct notifier_block i10nm_mce_dec = {
.priority   = MCE_PRIO_EDAC,
 };
 
+#ifdef CONFIG_EDAC_DEBUG
+/*
+ * Debug feature.
+ * Exercise the address decode logic by writing an address to
+ * /sys/kernel/debug/edac/i10nm_test/addr.
+ */
+static struct dentry *i10nm_test;
+
+static int debugfs_u64_set(void *data, u64 val)
+{
+   struct mce m;
+
+   pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val);
+
+   memset(&m, 0, sizeof(m));
+   /* ADDRV + MemRd + Unknown channel */
+   m.status = MCI_STATUS_ADDRV + 0x90;
+   /* One corrected error */
+   m.status |= BIT_ULL(MCI_STATUS_CEC_SHIFT);
+   m.addr = val;
+   skx_mce_check_error(NULL, 0, &m);
+
+   return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
+
+static void setup_i10nm_debug(void)
+{
+   i10nm_test = edac_debugfs_create_dir("i10nm_test");
+   if (!i10nm_test)
+   return;
+
+   if (!edac_debugfs_create_file("addr", 0200, i10nm_test,
+ NULL, &fops_u64_wo)) {
+   debugfs_remove(i10nm_test);
+   i10nm_test = NULL;
+   }
+}
+
+static void teardown_i10nm_debug(void)
+{
+   debugfs_remove_recursive(i10nm_test);
+}
+#else
+static inline void setup_i10nm_debug(void) {}
+static inline void teardown_i10nm_debug(void) {}
+#endif /*CONFIG_EDAC_DEBUG*/
+
 static int __init i10nm_init(void)
 {
u8 mc = 0, src_id = 0, node_id = 0;
@@ -249,7 +297,7 @@ static int __init i10nm_init(void)
 
opstate_init();
mce_register_decode_chain(_mce_dec);
-   setup_skx_debug("i10nm_test");
+   setup_i10nm_debug();
 
i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION);
 
@@ -262,7 +310,7 @@ static int __init i10nm_init(void)
 static void __exit i10nm_exit(void)
 {
edac_dbg(2, "\n");
-   teardown_skx_debug();
+   teardown_i10nm_debug();
mce_unregister_decode_chain(_mce_dec);
skx_adxl_put();
skx_remove();
diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index adae4c848ca1..a5c8fa3a249a 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -540,6 +540,54 @@ static struct notifier_block skx_mce_dec = {
.priority   = MCE_PRIO_EDAC,
 };
 
+#ifdef CONFIG_EDAC_DEBUG
+/*
+ * Debug feature.
+ * Exercise the address decode logic by writing an address to
+ * /sys/kernel/debug/edac/skx_test/addr.
+ */
+static struct dentry *skx_test;
+
+static int debugfs_u64_set(void *data, u64 val)
+{
+   struct mce m;
+
+   pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val);
+
+   memset(&m, 0, sizeof(m));
+   /* ADDRV + MemRd + Unknown channel */
+   m.status = MCI_STATUS_ADDRV + 0x90;
+   /* One corrected error */
+   m.status |= BIT_ULL(MCI_STATUS_CEC_SHIFT);
+   



Re: [PATCH] dt-bindings: reset: meson-g12a: Add missing USB2 PHY resets

2019-03-22 Thread Kevin Hilman
Hi Philipp,

Philipp Zabel  writes:

> On Mon, 2019-03-04 at 11:49 +0100, Neil Armstrong wrote:
>> The G12A Documentation lacked these 2 reset lines, but they are present and
>> used for each USB 2 PHYs.
>> 
>> Add them to the dt-bindings for the upcoming USB support.
>> 
>> Fixes: dbfc54534dfc ("dt-bindings: reset: meson: add g12a bindings")
>> Signed-off-by: Neil Armstrong 
>> ---
>>  include/dt-bindings/reset/amlogic,meson-g12a-reset.h | 5 -
>>  1 file changed, 4 insertions(+), 1 deletion(-)
>> 
>> diff --git a/include/dt-bindings/reset/amlogic,meson-g12a-reset.h 
>> b/include/dt-bindings/reset/amlogic,meson-g12a-reset.h
>> index 8063e8314eef..6d487c5eba2c 100644
>> --- a/include/dt-bindings/reset/amlogic,meson-g12a-reset.h
>> +++ b/include/dt-bindings/reset/amlogic,meson-g12a-reset.h
>> @@ -51,7 +51,10 @@
>>  #define RESET_SD_EMMC_A 44
>>  #define RESET_SD_EMMC_B 45
>>  #define RESET_SD_EMMC_C 46
>> -/*  47-60 */
>> +/*  47  */
>> +#define RESET_USB_PHY20 48
>> +#define RESET_USB_PHY21 49
>> +/*  50-60   */
>>  #define RESET_AUDIO_CODEC   61
>>  /*  62-63   */
>>  /*  RESET2  */
>
> Thank you, applied to reset/fixes with Martin's review tag.

Could you make an immutable tag for this in your tree?  This is needed for
some upcoming DT users we'd like to queue for the next cycle.

Thanks,

Kevin


Re: [PATCH REBASED] mm, memcg: Make scan aggression always exclude protection

2019-03-22 Thread Chris Down

Roman Gushchin writes:

However, we can race with the emin/elow update and end up with negative scan,
especially if cgroup_size is about the effective protection size


Yeah, it's possible but unlikely, hence the TOCTOU check. :-)


[tip:timers/urgent] clocksource/drivers/clps711x: Make clps711x_clksrc_init() static

2019-03-22 Thread tip-bot for YueHaibing
Commit-ID:  d18a7408d7be0f34a120d99051ed5187d9727728
Gitweb: https://git.kernel.org/tip/d18a7408d7be0f34a120d99051ed5187d9727728
Author: YueHaibing 
AuthorDate: Fri, 22 Mar 2019 22:37:08 +0800
Committer:  Thomas Gleixner 
CommitDate: Fri, 22 Mar 2019 22:59:32 +0100

clocksource/drivers/clps711x: Make clps711x_clksrc_init() static

Fix sparse warning:

drivers/clocksource/clps711x-timer.c:96:13: warning:
 symbol 'clps711x_clksrc_init' was not declared. Should it be static?

Signed-off-by: YueHaibing 
Signed-off-by: Thomas Gleixner 
Cc: 
Cc: 
Cc: 
Link: https://lkml.kernel.org/r/20190322143708.12716-1-yuehaib...@huawei.com

---
 drivers/clocksource/clps711x-timer.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/clps711x-timer.c 
b/drivers/clocksource/clps711x-timer.c
index a8dd80576c95..cdc251524f5e 100644
--- a/drivers/clocksource/clps711x-timer.c
+++ b/drivers/clocksource/clps711x-timer.c
@@ -93,8 +93,9 @@ static int __init _clps711x_clkevt_init(struct clk *clock, 
void __iomem *base,
   "clps711x-timer", clkevt);
 }
 
-void __init clps711x_clksrc_init(void __iomem *tc1_base, void __iomem 
*tc2_base,
-unsigned int irq)
+static void __init clps711x_clksrc_init(void __iomem *tc1_base,
+   void __iomem *tc2_base,
+   unsigned int irq)
 {
struct clk *tc1 = clk_get_sys("clps711x-timer.0", NULL);
struct clk *tc2 = clk_get_sys("clps711x-timer.1", NULL);


Re: [PATCH REBASED] mm, memcg: Make scan aggression always exclude protection

2019-03-22 Thread Roman Gushchin
On Fri, Mar 22, 2019 at 03:29:10PM -0700, Roman Gushchin wrote:
> On Fri, Mar 22, 2019 at 04:03:07PM +, Chris Down wrote:
> > This patch is an incremental improvement on the existing
> > memory.{low,min} relative reclaim work to base its scan pressure
> > calculations on how much protection is available compared to the current
> > usage, rather than how much the current usage is over some protection
> > threshold.
> > 
> > Previously the way that memory.low protection works is that if you are
> > 50% over a certain baseline, you get 50% of your normal scan pressure.
> > This is certainly better than the previous cliff-edge behaviour, but it
> > can be improved even further by always considering memory under the
> > currently enforced protection threshold to be out of bounds. This means
> > that we can set relatively low memory.low thresholds for variable or
> > bursty workloads while still getting a reasonable level of protection,
> > whereas with the previous version we may still trivially hit the 100%
> > clamp. The previous 100% clamp is also somewhat arbitrary, whereas this
> > one is more concretely based on the currently enforced protection
> > threshold, which is likely easier to reason about.
> > 
> > There is also a subtle issue with the way that proportional reclaim
> > worked previously -- it promotes having no memory.low, since it makes
> > pressure higher during low reclaim. This happens because we base our
> > scan pressure modulation on how far memory.current is between memory.min
> > and memory.low, but if memory.low is unset, we only use the overage
> > method. In most cromulent configurations, this then means that we end up
> > with *more* pressure than with no memory.low at all when we're in low
> > reclaim, which is not really very usable or expected.
> > 
> > With this patch, memory.low and memory.min affect reclaim pressure in a
> > more understandable and composable way. For example, from a user
> > standpoint, "protected" memory now remains untouchable from a reclaim
> > aggression standpoint, and users can also have more confidence that
> > bursty workloads will still receive some amount of guaranteed
> > protection.
> > 
> > Signed-off-by: Chris Down 
> > Reviewed-by: Roman Gushchin 
> > Cc: Johannes Weiner 
> > Cc: Andrew Morton 
> > Cc: Michal Hocko 
> > Cc: Tejun Heo 
> > Cc: Roman Gushchin 
> > Cc: Dennis Zhou 
> > Cc: linux-kernel@vger.kernel.org
> > Cc: cgro...@vger.kernel.org
> > Cc: linux...@kvack.org
> > Cc: kernel-t...@fb.com
> > ---
> >  include/linux/memcontrol.h | 25 
> >  mm/vmscan.c| 61 +-
> >  2 files changed, 32 insertions(+), 54 deletions(-)
> > 
> > No functional changes, just rebased.
> > 
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index b226c4bafc93..799de23edfb7 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -333,17 +333,17 @@ static inline bool mem_cgroup_disabled(void)
> > return !cgroup_subsys_enabled(memory_cgrp_subsys);
> >  }
> >  
> > -static inline void mem_cgroup_protection(struct mem_cgroup *memcg,
> > -unsigned long *min, unsigned long *low)
> > +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
> > + bool in_low_reclaim)
> >  {
> > -   if (mem_cgroup_disabled()) {
> > -   *min = 0;
> > -   *low = 0;
> > -   return;
> > -   }
> > +   if (mem_cgroup_disabled())
> > +   return 0;
> > +
> > +   if (in_low_reclaim)
> > +   return READ_ONCE(memcg->memory.emin);
> >  
> > -   *min = READ_ONCE(memcg->memory.emin);
> > -   *low = READ_ONCE(memcg->memory.elow);
> > +   return max(READ_ONCE(memcg->memory.emin),
> > +  READ_ONCE(memcg->memory.elow));
> >  }
> >  
> >  enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> > @@ -845,11 +845,10 @@ static inline void memcg_memory_event_mm(struct 
> > mm_struct *mm,
> >  {
> >  }
> >  
> > -static inline void mem_cgroup_protection(struct mem_cgroup *memcg,
> > -unsigned long *min, unsigned long *low)
> > +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
> > + bool in_low_reclaim)
> >  {
> > -   *min = 0;
> > -   *low = 0;
> > +   return 0;
> >  }
> >  
> >  static inline enum mem_cgroup_protection mem_cgroup_protected(
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index f6b9b45f731d..d5daa224364d 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -2374,12 +2374,13 @@ static void get_scan_count(struct lruvec *lruvec, 
> > struct mem_cgroup *memcg,
> > int file = is_file_lru(lru);
> > unsigned long lruvec_size;
> > unsigned long scan;
> > -   unsigned long min, low;
> > +   unsigned long protection;
> >  
> > 

[tip:timers/urgent] clocksource/drivers/mips-gic-timer: Make gic_compare_irqaction static

2019-03-22 Thread tip-bot for YueHaibing
Commit-ID:  9039de4034775f4420bf01fa879f8c04b3cd6bba
Gitweb: https://git.kernel.org/tip/9039de4034775f4420bf01fa879f8c04b3cd6bba
Author: YueHaibing 
AuthorDate: Fri, 22 Mar 2019 22:43:59 +0800
Committer:  Thomas Gleixner 
CommitDate: Fri, 22 Mar 2019 22:59:33 +0100

clocksource/drivers/mips-gic-timer: Make gic_compare_irqaction static

Fix sparse warning:

drivers/clocksource/mips-gic-timer.c:70:18: warning:
 symbol 'gic_compare_irqaction' was not declared. Should it be static?

Signed-off-by: YueHaibing 
Signed-off-by: Thomas Gleixner 
Cc: 
Link: https://lkml.kernel.org/r/20190322144359.19516-1-yuehaib...@huawei.com

---
 drivers/clocksource/mips-gic-timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clocksource/mips-gic-timer.c 
b/drivers/clocksource/mips-gic-timer.c
index 54f8a331b53a..37671a5d4ed9 100644
--- a/drivers/clocksource/mips-gic-timer.c
+++ b/drivers/clocksource/mips-gic-timer.c
@@ -67,7 +67,7 @@ static irqreturn_t gic_compare_interrupt(int irq, void 
*dev_id)
return IRQ_HANDLED;
 }
 
-struct irqaction gic_compare_irqaction = {
+static struct irqaction gic_compare_irqaction = {
.handler = gic_compare_interrupt,
.percpu_dev_id = &gic_clockevent_device,
.flags = IRQF_PERCPU | IRQF_TIMER,


[tip:timers/urgent] clocksource/drivers/timer-ti-dm: Make omap_dm_timer_set_load_start() static

2019-03-22 Thread tip-bot for YueHaibing
Commit-ID:  008258d995a637c77c10a5d087d134eed49a6572
Gitweb: https://git.kernel.org/tip/008258d995a637c77c10a5d087d134eed49a6572
Author: YueHaibing 
AuthorDate: Fri, 22 Mar 2019 22:43:02 +0800
Committer:  Thomas Gleixner 
CommitDate: Fri, 22 Mar 2019 22:59:33 +0100

clocksource/drivers/timer-ti-dm: Make omap_dm_timer_set_load_start() static

Fix sparse warning:

drivers/clocksource/timer-ti-dm.c:589:5: warning:
 symbol 'omap_dm_timer_set_load_start' was not declared. Should it be static?

Signed-off-by: YueHaibing 
Signed-off-by: Thomas Gleixner 
Cc: 
Link: https://lkml.kernel.org/r/20190322144302.6704-1-yuehaib...@huawei.com

---
 drivers/clocksource/timer-ti-dm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/timer-ti-dm.c 
b/drivers/clocksource/timer-ti-dm.c
index c364027638e1..3352da6ed61f 100644
--- a/drivers/clocksource/timer-ti-dm.c
+++ b/drivers/clocksource/timer-ti-dm.c
@@ -586,8 +586,8 @@ static int omap_dm_timer_set_load(struct omap_dm_timer 
*timer, int autoreload,
 }
 
 /* Optimized set_load which removes costly spin wait in timer_start */
-int omap_dm_timer_set_load_start(struct omap_dm_timer *timer, int autoreload,
-unsigned int load)
+static int omap_dm_timer_set_load_start(struct omap_dm_timer *timer,
+   int autoreload, unsigned int load)
 {
u32 l;
 


[tip:perf/urgent] perf bpf: Extract logic to create program names from perf_event__synthesize_one_bpf_prog()

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  fc462ac75b36daaa61e9bda7fba66ed1b3a500b4
Gitweb: https://git.kernel.org/tip/fc462ac75b36daaa61e9bda7fba66ed1b3a500b4
Author: Song Liu 
AuthorDate: Tue, 19 Mar 2019 09:54:53 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 21 Mar 2019 11:27:04 -0300

perf bpf: Extract logic to create program names from 
perf_event__synthesize_one_bpf_prog()

Extract logic to create program names to synthesize_bpf_prog_name(), so
that it can be reused in header.c:print_bpf_prog_info().

This commit doesn't change the behavior.

Signed-off-by: Song Liu 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stanislav Fomichev 
Link: http://lkml.kernel.org/r/20190319165454.1298742-2-songliubrav...@fb.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/bpf-event.c | 62 +
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index 2a8c245ca942..d5b041649f26 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -111,6 +111,38 @@ static int perf_env__fetch_btf(struct perf_env *env,
return 0;
 }
 
+static int synthesize_bpf_prog_name(char *buf, int size,
+   struct bpf_prog_info *info,
+   struct btf *btf,
+   u32 sub_id)
+{
+   u8 (*prog_tags)[BPF_TAG_SIZE] = (void *)(uintptr_t)(info->prog_tags);
+   void *func_infos = (void *)(uintptr_t)(info->func_info);
+   u32 sub_prog_cnt = info->nr_jited_ksyms;
+   const struct bpf_func_info *finfo;
+   const char *short_name = NULL;
+   const struct btf_type *t;
+   int name_len;
+
+   name_len = snprintf(buf, size, "bpf_prog_");
+   name_len += snprintf_hex(buf + name_len, size - name_len,
+prog_tags[sub_id], BPF_TAG_SIZE);
+   if (btf) {
+   finfo = func_infos + sub_id * info->func_info_rec_size;
+   t = btf__type_by_id(btf, finfo->type_id);
+   short_name = btf__name_by_offset(btf, t->name_off);
+   } else if (sub_id == 0 && sub_prog_cnt == 1) {
+   /* no subprog */
+   if (info->name[0])
+   short_name = info->name;
+   } else
+   short_name = "F";
+   if (short_name)
+   name_len += snprintf(buf + name_len, size - name_len,
+"_%s", short_name);
+   return name_len;
+}
+
 /*
  * Synthesize PERF_RECORD_KSYMBOL and PERF_RECORD_BPF_EVENT for one bpf
  * program. One PERF_RECORD_BPF_EVENT is generated for the program. And
@@ -135,7 +167,6 @@ static int perf_event__synthesize_one_bpf_prog(struct 
perf_session *session,
struct bpf_prog_info_node *info_node;
struct bpf_prog_info *info;
struct btf *btf = NULL;
-   bool has_btf = false;
struct perf_env *env;
u32 sub_prog_cnt, i;
int err = 0;
@@ -189,19 +220,13 @@ static int perf_event__synthesize_one_bpf_prog(struct 
perf_session *session,
btf = NULL;
goto out;
}
-   has_btf = true;
perf_env__fetch_btf(env, info->btf_id, btf);
}
 
/* Synthesize PERF_RECORD_KSYMBOL */
for (i = 0; i < sub_prog_cnt; i++) {
-   u8 (*prog_tags)[BPF_TAG_SIZE] = (void 
*)(uintptr_t)(info->prog_tags);
-   __u32 *prog_lens  = (__u32 *)(uintptr_t)(info->jited_func_lens);
+   __u32 *prog_lens = (__u32 *)(uintptr_t)(info->jited_func_lens);
__u64 *prog_addrs = (__u64 *)(uintptr_t)(info->jited_ksyms);
-   void *func_infos  = (void *)(uintptr_t)(info->func_info);
-   const struct bpf_func_info *finfo;
-   const char *short_name = NULL;
-   const struct btf_type *t;
int name_len;
 
*ksymbol_event = (struct ksymbol_event){
@@ -214,26 +239,9 @@ static int perf_event__synthesize_one_bpf_prog(struct 
perf_session *session,
.ksym_type = PERF_RECORD_KSYMBOL_TYPE_BPF,
.flags = 0,
};
-   name_len = snprintf(ksymbol_event->name, KSYM_NAME_LEN,
-   "bpf_prog_");
-   name_len += snprintf_hex(ksymbol_event->name + name_len,
-KSYM_NAME_LEN - name_len,
-prog_tags[i], BPF_TAG_SIZE);
-   if (has_btf) {
-   finfo = func_infos + i * info->func_info_rec_size;
-   t = btf__type_by_id(btf, finfo->type_id);
-   short_name = btf__name_by_offset(btf, t->name_off);
-   } else if (i == 0 && sub_prog_cnt == 1) {
-   /* no subprog */
-   if 

[tip:perf/urgent] perf tools: Save bpf_prog_info and BTF of new BPF programs

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  d56354dc49091e33d9ffca732ac913ed2df70537
Gitweb: https://git.kernel.org/tip/d56354dc49091e33d9ffca732ac913ed2df70537
Author: Song Liu 
AuthorDate: Mon, 11 Mar 2019 22:30:51 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 21 Mar 2019 11:27:04 -0300

perf tools: Save bpf_prog_info and BTF of new BPF programs

To fully annotate BPF programs with source code mapping, 4 different
pieces of information are needed:

1) PERF_RECORD_KSYMBOL
2) PERF_RECORD_BPF_EVENT
3) bpf_prog_info
4) btf

This patch handles 3) and 4) for BPF programs loaded after 'perf
record|top'.

For timely process of these information, a dedicated event is added to
the side band evlist.

When PERF_RECORD_BPF_EVENT is received via the side band event, the
polling thread gathers 3) and 4) via sys_bpf and stores them in perf_env.

This information is saved to perf.data at the end of 'perf record'.

Committer testing:

The 'wakeup_watermark' member in 'struct perf_event_attr' is inside an
unnamed union, so it can't be used in a struct designated initialization
with older gccs; get it out of that, isolating it as
'attr.wakeup_watermark = 1;' to work with all gcc versions.

We also need to add '--no-bpf-event' to the 'perf record'
perf_event_attr tests in 'perf test': that test works by intercepting the
events being set up and checking whether they match the fields described
in the control files, and since it now finds the side band event used to
catch PERF_RECORD_BPF_EVENT first, they all fail.

With these issues fixed:

Same scenario as for testing BPF programs loaded before 'perf record' or
'perf top' starts, only start the BPF programs after 'perf record|top',
so that their information gets collected by the sideband threads; the rest
works as for the programs loaded before monitoring starts.

Add missing 'inline' to the bpf_event__add_sb_event() when
HAVE_LIBBPF_SUPPORT is not defined, fixing the build in systems without
binutils devel files installed.

Signed-off-by: Song Liu 
Reviewed-by: Jiri Olsa 
Tested-by: Arnaldo Carvalho de Melo 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stanislav Fomichev 
Link: http://lkml.kernel.org/r/20190312053051.2690567-16-songliubrav...@fb.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/builtin-record.c|   3 +
 tools/perf/builtin-top.c   |   3 +
 tools/perf/tests/attr/test-record-C0   |   2 +-
 tools/perf/tests/attr/test-record-basic|   2 +-
 tools/perf/tests/attr/test-record-branch-any   |   2 +-
 .../perf/tests/attr/test-record-branch-filter-any  |   2 +-
 .../tests/attr/test-record-branch-filter-any_call  |   2 +-
 .../tests/attr/test-record-branch-filter-any_ret   |   2 +-
 tools/perf/tests/attr/test-record-branch-filter-hv |   2 +-
 .../tests/attr/test-record-branch-filter-ind_call  |   2 +-
 tools/perf/tests/attr/test-record-branch-filter-k  |   2 +-
 tools/perf/tests/attr/test-record-branch-filter-u  |   2 +-
 tools/perf/tests/attr/test-record-count|   2 +-
 tools/perf/tests/attr/test-record-data |   2 +-
 tools/perf/tests/attr/test-record-freq |   2 +-
 tools/perf/tests/attr/test-record-graph-default|   2 +-
 tools/perf/tests/attr/test-record-graph-dwarf  |   2 +-
 tools/perf/tests/attr/test-record-graph-fp |   2 +-
 tools/perf/tests/attr/test-record-group|   2 +-
 tools/perf/tests/attr/test-record-group-sampling   |   2 +-
 tools/perf/tests/attr/test-record-group1   |   2 +-
 tools/perf/tests/attr/test-record-no-buffering |   2 +-
 tools/perf/tests/attr/test-record-no-inherit   |   2 +-
 tools/perf/tests/attr/test-record-no-samples   |   2 +-
 tools/perf/tests/attr/test-record-period   |   2 +-
 tools/perf/tests/attr/test-record-raw  |   2 +-
 tools/perf/util/bpf-event.c| 100 +
 tools/perf/util/bpf-event.h|  15 
 28 files changed, 145 insertions(+), 24 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 6f645fd72fed..4e2d953d4bc5 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1238,6 +1238,9 @@ static int __cmd_record(struct record *rec, int argc, 
const char **argv)
goto out_child;
}
 
+   if (!opts->no_bpf_event)
+   bpf_event__add_sb_event(&sb_evlist, &session->header.env);
+
if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
pr_debug("Couldn't start the BPF side band thread:\nBPF 
programs starting from now on won't be annotatable\n");
opts->no_bpf_event = true;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 3ce8a8db6c1d..1999d6533d12 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1637,6 +1637,9 @@ int cmd_top(int argc, const char **argv)
  

Re: [PATCH REBASED] mm, memcg: Make scan aggression always exclude protection

2019-03-22 Thread Chris Down

Roman Gushchin writes:

I've noticed that the old version is just wrong: if cgroup_size is way smaller
than max(min, low), scan will be set to -lruvec_size.
Given that it's unsigned long, we'll end up with scanning the whole list
(due to clamp() below).


Are you certain? If so, I don't see what you mean. This is how the code looks 
in Linus' tree after the fixups:


   unsigned long cgroup_size = mem_cgroup_size(memcg);
   unsigned long baseline = 0;

   if (!sc->memcg_low_reclaim)
   baseline = lruvec_size;
   scan = lruvec_size * cgroup_size / protection - baseline;

This works correctly as far as I can tell:

low reclaim case:

   In [1]: cgroup_size=50; lruvec_size=10; protection=2000; baseline=0; 
lruvec_size * cgroup_size // protection - baseline
   Out[1]: 0

normal case:

   In [2]: cgroup_size=3000; lruvec_size=10; protection=2000; 
baseline=lruvec_size; lruvec_size * cgroup_size // protection - baseline
   Out[2]: 5
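
The same two calculations, as a self-contained C program for anyone who
prefers to run it (same numbers as the interactive session above):

/* C translation of the two calculations above. */
#include <stdio.h>

static unsigned long scan(unsigned long lruvec_size, unsigned long cgroup_size,
                          unsigned long protection, unsigned long baseline)
{
    return lruvec_size * cgroup_size / protection - baseline;
}

int main(void)
{
    /* low reclaim case: baseline is 0 */
    printf("low reclaim: %lu\n", scan(10, 50, 2000, 0));    /* prints 0 */

    /* normal case: baseline is lruvec_size */
    printf("normal:      %lu\n", scan(10, 3000, 2000, 10)); /* prints 5 */
    return 0;
}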


Re: [PATCH] nvdimm: btt_devs: fix a NULL pointer dereference and a memory leak

2019-03-22 Thread Verma, Vishal L
On Tue, 2019-03-12 at 03:15 -0500, Kangjie Lu wrote:
> In case kmemdup fails, the fix releases resources and returns to
> avoid the NULL pointer dereference.
> Also, the error paths in the following code should release
> resources to avoid memory leaks.
> 
> Signed-off-by: Kangjie Lu 
> ---
>  drivers/nvdimm/btt_devs.c | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 

Looks good,
Reviewed-by: Vishal Verma 

> diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
> index 795ad4ff35ca..565ea0b6f765 100644
> --- a/drivers/nvdimm/btt_devs.c
> +++ b/drivers/nvdimm/btt_devs.c
> @@ -196,8 +196,13 @@ static struct device *__nd_btt_create(struct
> nd_region *nd_region,
>   }
>  
>   nd_btt->lbasize = lbasize;
> - if (uuid)
> + if (uuid) {
>   uuid = kmemdup(uuid, 16, GFP_KERNEL);
> + if (!uuid) {
> + kfree(nd_btt);
> + return NULL;
> + }
> + }
>   nd_btt->uuid = uuid;
>   dev = &nd_btt->dev;
>   dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id);
> @@ -209,6 +214,7 @@ static struct device *__nd_btt_create(struct
> nd_region *nd_region,
>   dev_dbg(&ndns->dev, "failed, already claimed by %s\n",
>   dev_name(ndns->claim));
>   put_device(dev);
> + kfree(uuid);
>   return NULL;
>   }
>   return dev;



[tip:timers/urgent] clocksource/drivers/tcb_clksrc: Make tc_clksrc_suspend/resume() static

2019-03-22 Thread tip-bot for YueHaibing
Commit-ID:  bddee90af621914f08a03d546419fc293e9140d8
Gitweb: https://git.kernel.org/tip/bddee90af621914f08a03d546419fc293e9140d8
Author: YueHaibing 
AuthorDate: Fri, 22 Mar 2019 22:39:40 +0800
Committer:  Thomas Gleixner 
CommitDate: Fri, 22 Mar 2019 22:59:33 +0100

clocksource/drivers/tcb_clksrc: Make tc_clksrc_suspend/resume() static

Fix sparse warnings:

drivers/clocksource/tcb_clksrc.c:74:6: warning:
 symbol 'tc_clksrc_suspend' was not declared. Should it be static?
drivers/clocksource/tcb_clksrc.c:89:6: warning:
 symbol 'tc_clksrc_resume' was not declared. Should it be static?

Signed-off-by: YueHaibing 
Signed-off-by: Thomas Gleixner 
Cc: 
Cc: 
Cc: 
Link: https://lkml.kernel.org/r/20190322143940.12396-1-yuehaib...@huawei.com

---
 drivers/clocksource/tcb_clksrc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
index 43f4d5c4d6fa..f987027ca566 100644
--- a/drivers/clocksource/tcb_clksrc.c
+++ b/drivers/clocksource/tcb_clksrc.c
@@ -71,7 +71,7 @@ static u64 tc_get_cycles32(struct clocksource *cs)
return readl_relaxed(tcaddr + ATMEL_TC_REG(0, CV));
 }
 
-void tc_clksrc_suspend(struct clocksource *cs)
+static void tc_clksrc_suspend(struct clocksource *cs)
 {
int i;
 
@@ -86,7 +86,7 @@ void tc_clksrc_suspend(struct clocksource *cs)
bmr_cache = readl(tcaddr + ATMEL_TC_BMR);
 }
 
-void tc_clksrc_resume(struct clocksource *cs)
+static void tc_clksrc_resume(struct clocksource *cs)
 {
int i;
 


[PATCH 2/2] cpufreq: intel_pstate: Also use cppc nominal_perf for base_frequency

2019-03-22 Thread Srinivas Pandruvada
The ACPI specification states that if the "Guaranteed Performance Register"
is not implemented, OSPM assumes guaranteed performance is always equal to
nominal performance. So for an invalid or unimplemented guaranteed
performance register, use nominal performance as the guaranteed performance.

This change will fall back to nominal_perf when guaranteed_perf is invalid.
If nominal_perf is also invalid, then fall back to the existing
implementation, which is to read from the HWP Capabilities MSR.

Fixes: 86d333a8cc7f ("cpufreq: intel_pstate: Add base_frequency attribute")
Signed-off-by: Srinivas Pandruvada 
Cc: 4.20+  # 4.20+
---
 drivers/cpufreq/intel_pstate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 7b4b0a7ac68b..e16dea241c55 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -385,6 +385,9 @@ static int intel_pstate_get_cppc_guranteed(int cpu)
if (ret)
return ret;
 
+   if (!cppc_perf.guaranteed_perf)
+   return cppc_perf.nominal_perf;
+
return cppc_perf.guaranteed_perf;
 }
 
-- 
2.17.2



[PATCH 1/2] ACPI / CPPC: Fix processing for guaranteed performance

2019-03-22 Thread Srinivas Pandruvada
As per the ACPI specification, the "Guaranteed Performance Register" is a
"Buffer" field. It can't be an "Integer" field. So treat the "Integer" type
as invalid and ignore the "Guaranteed Performance Register".
Also save one cpc_read() call when the "Guaranteed Performance Register" is
not present, which means the register is defined as:
"Register(SystemMemory, 0, 0, 0, 0)".

Fixes: 29523f095397 ("ACPI / CPPC: Add support for guaranteed performance")
Signed-off-by: Srinivas Pandruvada 
Cc: 4.20+  # 4.20+
---
 drivers/acpi/cppc_acpi.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 1b207fca1420..3f6c290e06af 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -1150,8 +1150,14 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps 
*perf_caps)
cpc_read(cpunum, nominal_reg, &nom);
perf_caps->nominal_perf = nom;
 
-   cpc_read(cpunum, guaranteed_reg, &guaranteed);
-   perf_caps->guaranteed_perf = guaranteed;
+   if (guaranteed_reg->type == ACPI_TYPE_INTEGER  ||
+   (guaranteed_reg->type == ACPI_TYPE_BUFFER &&
+IS_NULL_REG(&guaranteed_reg->cpc_entry.reg))) {
+   perf_caps->guaranteed_perf = 0;
+   } else {
+   cpc_read(cpunum, guaranteed_reg, &guaranteed);
+   perf_caps->guaranteed_perf = guaranteed;
+   }
 
cpc_read(cpunum, lowest_non_linear_reg, &min_nonlinear);
perf_caps->lowest_nonlinear_perf = min_nonlinear;
-- 
2.17.2



Re: INFO: task hung in vivid_stop_generating_vid_cap

2019-03-22 Thread syzbot

syzbot has bisected this bug to:

commit f2fe89061d79706eca5c47e4efdc09bbc171e74a
Author: Helen Koike 
Date:   Fri Apr 7 17:55:19 2017 +

[media] vimc: Virtual Media Controller core, capture and sensor

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=12ea247d20
start commit:   9f51ae62 Merge git://git.kernel.org/pub/scm/linux/kernel/g..
git tree:   upstream
final crash:https://syzkaller.appspot.com/x/report.txt?x=11ea247d20
console output: https://syzkaller.appspot.com/x/log.txt?x=16ea247d20
kernel config:  https://syzkaller.appspot.com/x/.config?x=62118286bb772a24
dashboard link: https://syzkaller.appspot.com/bug?extid=06283a66a648cd073885
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=15701a3340
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=154c8e4d40

Reported-by: syzbot+06283a66a648cd073...@syzkaller.appspotmail.com
Fixes: f2fe89061d79 ("[media] vimc: Virtual Media Controller core, capture  
and sensor")


For information about bisection process see: https://goo.gl/tpsmEJ#bisection


[tip:perf/urgent] perf annotate: Enable annotation of BPF programs

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  6987561c9e86eace45f2dbb0c564964a63f4150a
Gitweb: https://git.kernel.org/tip/6987561c9e86eace45f2dbb0c564964a63f4150a
Author: Song Liu 
AuthorDate: Mon, 11 Mar 2019 22:30:48 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Wed, 20 Mar 2019 16:43:15 -0300

perf annotate: Enable annotation of BPF programs

In symbol__disassemble(), DSO_BINARY_TYPE__BPF_PROG_INFO dso calls into
a new function symbol__disassemble_bpf(), where annotation line
information is filled based on the bpf_prog_info and btf data saved in
given perf_env.

symbol__disassemble_bpf() uses binutils's libopcodes to disassemble bpf
programs.

Committer testing:

After fixing this:

  -   u64 *addrs = (u64 *)(info_linear->info.jited_ksyms);
  +   u64 *addrs = (u64 
*)(uintptr_t)(info_linear->info.jited_ksyms);

Detected when crossbuilding to a 32-bit arch.

And making all this dependent on HAVE_LIBBFD_SUPPORT and
HAVE_LIBBPF_SUPPORT:

1) Have a BPF program running, one that has BTF info, etc, I used
   the tools/perf/examples/bpf/augmented_raw_syscalls.c put in place
   by 'perf trace'.

  # grep -B1 augmented_raw ~/.perfconfig
  [trace]
add_events = 
/home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.c
  #
  # perf trace -e *mmsg
  dnf/6245 sendmmsg(20, 0x7f5485a88030, 2, MSG_NOSIGNAL) = 2
  NetworkManager/10055 sendmmsg(22, 0x7f8126ad1bb0, 2, 
MSG_NOSIGNAL) = 2

2) Then do a 'perf record' system wide for a while:

  # perf record -a
  ^C[ perf record: Woken up 68 times to write data ]
  [ perf record: Captured and wrote 19.427 MB perf.data (366891 samples) ]
  #

3) Check that we captured BPF and BTF info in the perf.data file:

  # perf report --header-only | grep 'b[pt]f'
  # event : name = cycles:ppp, , id = { 294789, 294790, 294791, 294792, 294793, 
294794, 294795, 294796 }, size = 112, { sample_period, sample_freq } = 4000, 
sample_type = IP|TID|TIME|CPU|PERIOD, read_format = ID, disabled = 1, inherit = 
1, mmap = 1, comm = 1, freq = 1, task = 1, precise_ip = 3, sample_id_all = 1, 
exclude_guest = 1, mmap2 = 1, comm_exec = 1, ksymbol = 1, bpf_event = 1
  # bpf_prog_info of id 13
  # bpf_prog_info of id 14
  # bpf_prog_info of id 15
  # bpf_prog_info of id 16
  # bpf_prog_info of id 17
  # bpf_prog_info of id 18
  # bpf_prog_info of id 21
  # bpf_prog_info of id 22
  # bpf_prog_info of id 41
  # bpf_prog_info of id 42
  # btf info of id 2
  #

4) Check which programs got recorded:

   # perf report | grep bpf_prog | head
 0.16%  exe  bpf_prog_819967866022f1e1_sys_enter  [k] 
bpf_prog_819967866022f1e1_sys_enter
 0.14%  exe  bpf_prog_c1bd85c092d6e4aa_sys_exit   [k] 
bpf_prog_c1bd85c092d6e4aa_sys_exit
 0.08%  fuse-overlayfs   bpf_prog_819967866022f1e1_sys_enter  [k] 
bpf_prog_819967866022f1e1_sys_enter
 0.07%  fuse-overlayfs   bpf_prog_c1bd85c092d6e4aa_sys_exit   [k] 
bpf_prog_c1bd85c092d6e4aa_sys_exit
 0.01%  clang-4.0bpf_prog_c1bd85c092d6e4aa_sys_exit   [k] 
bpf_prog_c1bd85c092d6e4aa_sys_exit
 0.01%  clang-4.0bpf_prog_819967866022f1e1_sys_enter  [k] 
bpf_prog_819967866022f1e1_sys_enter
 0.00%  clangbpf_prog_c1bd85c092d6e4aa_sys_exit   [k] 
bpf_prog_c1bd85c092d6e4aa_sys_exit
 0.00%  runc bpf_prog_819967866022f1e1_sys_enter  [k] 
bpf_prog_819967866022f1e1_sys_enter
 0.00%  clangbpf_prog_819967866022f1e1_sys_enter  [k] 
bpf_prog_819967866022f1e1_sys_enter
 0.00%  sh   bpf_prog_c1bd85c092d6e4aa_sys_exit   [k] 
bpf_prog_c1bd85c092d6e4aa_sys_exit
  #

  This was with the default --sort order for 'perf report', which is:

--sort comm,dso,symbol

  If we just look for the symbol, for instance:

   # perf report --sort symbol | grep bpf_prog | head
 0.26%  [k] bpf_prog_819967866022f1e1_sys_enter-  -
 0.24%  [k] bpf_prog_c1bd85c092d6e4aa_sys_exit -  -
   #

  or the DSO:

   # perf report --sort dso | grep bpf_prog | head
 0.26%  bpf_prog_819967866022f1e1_sys_enter
 0.24%  bpf_prog_c1bd85c092d6e4aa_sys_exit
  #

We'll see the two BPF programs that augmented_raw_syscalls.o puts in
place,  one attached to the raw_syscalls:sys_enter and another to the
raw_syscalls:sys_exit tracepoints, as expected.

Now we can finally do, from the command line, annotation for one of
those two symbols, with the original BPF program source code intermixed
with the disassembled JITed code:

  # perf annotate --stdio2 bpf_prog_819967866022f1e1_sys_enter

  Samples: 950  of event 'cycles:ppp', 4000 Hz, Event count (approx.): 
553756947, [percent: local period]
  bpf_prog_819967866022f1e1_sys_enter() bpf_prog_819967866022f1e1_sys_enter
  Percent  int sys_enter(struct syscall_enter_args *args)
   53.41 push   %rbp

    0.63 mov    %rsp,%rbp
    0.31 sub    $0x170,%rsp
    1.93 sub    $0x28,%rbp
    7.02 mov    %rbx,0x0(%rbp)
3.20  

[tip:perf/urgent] perf bpf: Show more BPF program info in print_bpf_prog_info()

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  f8dfeae009effc0b6dac2741cf8d7cbb91edb982
Gitweb: https://git.kernel.org/tip/f8dfeae009effc0b6dac2741cf8d7cbb91edb982
Author: Song Liu 
AuthorDate: Tue, 19 Mar 2019 09:54:54 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 21 Mar 2019 11:27:04 -0300

perf bpf: Show more BPF program info in print_bpf_prog_info()

This patch enables showing bpf program name, address, and size in the
header.

Before the patch:

  perf report --header-only
  ...
  # bpf_prog_info of id 9
  # bpf_prog_info of id 10
  # bpf_prog_info of id 13

After the patch:

  # bpf_prog_info 9: bpf_prog_7be49e3934a125ba addr 0xa0024947 size 229
  # bpf_prog_info 10: bpf_prog_2a142ef67aaad174 addr 0xa007c94d size 229
  # bpf_prog_info 13: bpf_prog_47368425825d7384_task__task_newt addr 
0xa0251137 size 369

Committer notes:

Fix the fallback definition when HAVE_LIBBPF_SUPPORT is not defined,
i.e. add the missing 'static inline' and add the __maybe_unused to the
args. Also add stdio.h since we now use FILE * in bpf-event.h.

Signed-off-by: Song Liu 
Tested-by: Arnaldo Carvalho de Melo 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stanislav Fomichev 
Link: http://lkml.kernel.org/r/20190319165454.1298742-3-songliubrav...@fb.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/bpf-event.c | 40 
 tools/perf/util/bpf-event.h | 11 ++-
 tools/perf/util/header.c|  5 +++--
 3 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index d5b041649f26..2a4a0da35632 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -438,3 +438,43 @@ int bpf_event__add_sb_event(struct perf_evlist **evlist,
 
 return perf_evlist__add_sb_event(evlist, &attr, bpf_event__sb_cb, env);
 }
+
+void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info,
+   struct perf_env *env,
+   FILE *fp)
+{
+   __u32 *prog_lens = (__u32 *)(uintptr_t)(info->jited_func_lens);
+   __u64 *prog_addrs = (__u64 *)(uintptr_t)(info->jited_ksyms);
+   char name[KSYM_NAME_LEN];
+   struct btf *btf = NULL;
+   u32 sub_prog_cnt, i;
+
+   sub_prog_cnt = info->nr_jited_ksyms;
+   if (sub_prog_cnt != info->nr_prog_tags ||
+   sub_prog_cnt != info->nr_jited_func_lens)
+   return;
+
+   if (info->btf_id) {
+   struct btf_node *node;
+
+   node = perf_env__find_btf(env, info->btf_id);
+   if (node)
+   btf = btf__new((__u8 *)(node->data),
+  node->data_size);
+   }
+
+   if (sub_prog_cnt == 1) {
+   synthesize_bpf_prog_name(name, KSYM_NAME_LEN, info, btf, 0);
+   fprintf(fp, "# bpf_prog_info %u: %s addr 0x%llx size %u\n",
+   info->id, name, prog_addrs[0], prog_lens[0]);
+   return;
+   }
+
+   fprintf(fp, "# bpf_prog_info %u:\n", info->id);
+   for (i = 0; i < sub_prog_cnt; i++) {
+   synthesize_bpf_prog_name(name, KSYM_NAME_LEN, info, btf, i);
+
+   fprintf(fp, "# \tsub_prog %u: %s addr 0x%llx size %u\n",
+   i, name, prog_addrs[i], prog_lens[i]);
+   }
+}
diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h
index 8cb1189149ec..04c33b3bfe28 100644
--- a/tools/perf/util/bpf-event.h
+++ b/tools/perf/util/bpf-event.h
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include "event.h"
+#include 
 
 struct machine;
 union perf_event;
@@ -38,7 +39,9 @@ int perf_event__synthesize_bpf_events(struct perf_session 
*session,
  struct record_opts *opts);
 int bpf_event__add_sb_event(struct perf_evlist **evlist,
 struct perf_env *env);
-
+void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info,
+   struct perf_env *env,
+   FILE *fp);
 #else
 static inline int machine__process_bpf_event(struct machine *machine 
__maybe_unused,
 union perf_event *event 
__maybe_unused,
@@ -61,5 +64,11 @@ static inline int bpf_event__add_sb_event(struct perf_evlist 
**evlist __maybe_un
return 0;
 }
 
+static inline void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info 
__maybe_unused,
+ struct perf_env *env 
__maybe_unused,
+ FILE *fp __maybe_unused)
+{
+
+}
 #endif // HAVE_LIBBPF_SUPPORT
 #endif
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 01dda2f65d36..b9e693825873 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1468,8 +1468,9 @@ static void print_bpf_prog_info(struct feat_fd *ff, FILE 
*fp)
 
  

[PATCH 0/2] Correct the processing for base_frequency

2019-03-22 Thread Srinivas Pandruvada
The base_frequency display in cpufreq sysfs for intel_pstate gets the
guaranteed ratio by reading the CPPC guaranteed performance register as
a first preference before falling back to the x86 MSR for Hardware
P-state Capabilities. The current code in cppc_acpi.c assumed that the
"guaranteed performance register" can be an integer field, which is
invalid as per the ACPI spec. So this change explicitly checks for
INTEGER values, which indicate an invalid BIOS/firmware, and ignores
them.
Also, the guaranteed performance register field is optional; when it is
not present, nominal performance can be used as the guaranteed
performance. But the spec says this is true only in non-autonomous mode.
So, to avoid a dependency on autonomous vs. non-autonomous mode, no
change is made in cppc_acpi.c to treat nominal as guaranteed in this
case. Instead, a change is added to the intel_pstate driver, which is
x86-specific, to use nominal as guaranteed when the guaranteed
performance field is absent or has an invalid value.
We are also working to clarify this non-autonomous mode requirement
through the ACPI standards body.
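
As a rough illustration of that preference order (not the actual patch;
the helper name is hypothetical, and it assumes the existing
cppc_get_perf_caps() interface with the guaranteed_perf/nominal_perf
fields in struct cppc_perf_caps):

  #include <linux/errno.h>
  #include <acpi/cppc_acpi.h>

  /* Sketch: prefer a valid guaranteed_perf, else fall back to nominal_perf. */
  static int sketch_guaranteed_perf(int cpu)
  {
          struct cppc_perf_caps caps;

          if (cppc_get_perf_caps(cpu, &caps))
                  return -ENODEV;

          /* zero means the register was absent or encoded as an invalid type */
          if (caps.guaranteed_perf)
                  return caps.guaranteed_perf;

          return caps.nominal_perf;
  }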

Srinivas Pandruvada (2):
  ACPI / CPPC: Fix processing for guaranteed performance
  cpufreq: intel_pstate: Also use cppc nominal_perf for base_frequency

 drivers/acpi/cppc_acpi.c   | 10 --
 drivers/cpufreq/intel_pstate.c |  3 +++
 2 files changed, 11 insertions(+), 2 deletions(-)

-- 
2.17.2



Re: [PATCH 4.14 000/183] 4.14.108-stable review

2019-03-22 Thread kernelci.org bot
stable-rc/linux-4.14.y boot: 52 boots: 0 failed, 51 passed with 1 
untried/unknown (v4.14.107-184-gf85b59a43475)

Full Boot Summary: 
https://kernelci.org/boot/all/job/stable-rc/branch/linux-4.14.y/kernel/v4.14.107-184-gf85b59a43475/
Full Build Summary: 
https://kernelci.org/build/stable-rc/branch/linux-4.14.y/kernel/v4.14.107-184-gf85b59a43475/

Tree: stable-rc
Branch: linux-4.14.y
Git Describe: v4.14.107-184-gf85b59a43475
Git Commit: f85b59a43475b5bcca299c897549d4aff496dda2
Git URL: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Tested: 29 unique boards, 15 SoC families, 9 builds out of 201

---
For more info write to 


Re: mount.nfs: Protocol error after upgrade to linux/master

2019-03-22 Thread Kees Cook
On Thu, Mar 21, 2019 at 2:10 PM Tetsuo Handa
 wrote:
>
> On 2019/03/22 1:38, Kees Cook wrote:
> > This is mostly good. I'd like to keep the other LSMs listed though
> > (similar to what I had originally) so that if a legacy-major doesn't
> > initialize, later ones will be. I want to remove the concept of
> > "major" LSMs. The only thing that should matter is init order...
>
> Excuse me? Are you saying that
>
>   if a legacy-major (which is defined as the "Default security module")
>   doesn't initialize, later ones (any of selinux,smack,tomoyo,apparmor
>   except the one which is defined as "Default security module") will be
>   initialized
>
> ? That sounds strange to me. Any of selinux,smack,tomoyo,apparmor can be
> initialized when specified by lsm= kernel command line option (or security=
> kernel command line option if lsm= kernel command line option is not
> specified), won't it?

It breaks the backward-compat for the "security=" line. If a system is
booted with CONFIG_LSM="minors...,apparmor" and "security=selinux",
neither apparmor nor selinux will be initialized. The logic on
"security=..." depends on the other LSMs being present in the list.

-Kees

-- 
Kees Cook


[tip:perf/urgent] perf evlist: Introduce side band thread

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  657ee5531903339b06697581532ed32d4762526e
Gitweb: https://git.kernel.org/tip/657ee5531903339b06697581532ed32d4762526e
Author: Song Liu 
AuthorDate: Mon, 11 Mar 2019 22:30:50 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 21 Mar 2019 11:27:03 -0300

perf evlist: Introduce side band thread

This patch introduces side band thread that captures extended
information for events like PERF_RECORD_BPF_EVENT.

This new thread uses its own evlist that uses ring buffer with very low
watermark for lower latency.

To use side band thread, we need to:

1. add side band event(s) by calling perf_evlist__add_sb_event();
2. calls perf_evlist__start_sb_thread();
3. at the end of perf run, perf_evlist__stop_sb_thread().

In the next patch, we use this thread to handle PERF_RECORD_BPF_EVENT.
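
Strung together, those three steps look roughly like the sketch below
(not code from this patch; it assumes bpf_event__add_sb_event() from the
companion patch as the registration helper, and the usual perf tool
headers):

  #include "perf.h"
  #include "util/env.h"
  #include "util/evlist.h"
  #include "util/bpf-event.h"

  /* Sketch: run a perf session with the BPF side band thread around it. */
  static int sketch_run_with_sb_thread(struct record_opts *opts,
                                       struct perf_env *env)
  {
          struct perf_evlist *sb_evlist = NULL;

          /* 1. add the side band event(s) */
          if (bpf_event__add_sb_event(&sb_evlist, env))
                  return -1;

          /* 2. start the side band thread */
          if (perf_evlist__start_sb_thread(sb_evlist, &opts->target))
                  opts->no_bpf_event = true;

          /* ... normal record/top processing would run here ... */

          /* 3. stop the thread at the end of the run */
          if (!opts->no_bpf_event)
                  perf_evlist__stop_sb_thread(sb_evlist);

          return 0;
  }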

Committer notes:

Add fix by Jiri Olsa for when the sb_thread can't get started and then
at the end the stop_sb_thread() segfaults when joining the
(non-existing) thread.

That can happen when running 'perf top' or 'perf record' as a normal
user, for instance.

Further checks need to be done on top of this to more graciously handle
these possible failure scenarios.

Signed-off-by: Song Liu 
Reviewed-by: Jiri Olsa 
Tested-by: Arnaldo Carvalho de Melo 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stanislav Fomichev 
Link: http://lkml.kernel.org/r/20190312053051.2690567-15-songliubrav...@fb.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/builtin-record.c |   9 
 tools/perf/builtin-top.c|   9 
 tools/perf/util/evlist.c| 119 
 tools/perf/util/evlist.h|  12 +
 tools/perf/util/evsel.h |   6 +++
 5 files changed, 155 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index e79faccd7842..6f645fd72fed 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1137,6 +1137,7 @@ static int __cmd_record(struct record *rec, int argc, 
const char **argv)
 struct perf_data *data = &rec->data;
struct perf_session *session;
bool disabled = false, draining = false;
+   struct perf_evlist *sb_evlist = NULL;
int fd;
 
atexit(record__sig_exit);
@@ -1237,6 +1238,11 @@ static int __cmd_record(struct record *rec, int argc, 
const char **argv)
goto out_child;
}
 
+   if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
+   pr_debug("Couldn't start the BPF side band thread:\nBPF 
programs starting from now on won't be annotatable\n");
+   opts->no_bpf_event = true;
+   }
+
err = record__synthesize(rec, false);
if (err < 0)
goto out_child;
@@ -1487,6 +1493,9 @@ out_child:
 
 out_delete_session:
perf_session__delete(session);
+
+   if (!opts->no_bpf_event)
+   perf_evlist__stop_sb_thread(sb_evlist);
return status;
 }
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index c2ea22c4ea67..3ce8a8db6c1d 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1501,6 +1501,7 @@ int cmd_top(int argc, const char **argv)
"number of thread to run event synthesize"),
OPT_END()
};
+   struct perf_evlist *sb_evlist = NULL;
const char * const top_usage[] = {
"perf top []",
NULL
@@ -1636,8 +1637,16 @@ int cmd_top(int argc, const char **argv)
goto out_delete_evlist;
}
 
+   if (perf_evlist__start_sb_thread(sb_evlist, target)) {
+   pr_debug("Couldn't start the BPF side band thread:\nBPF 
programs starting from now on won't be annotatable\n");
+   opts->no_bpf_event = true;
+   }
+
status = __cmd_top();
 
+   if (!opts->no_bpf_event)
+   perf_evlist__stop_sb_thread(sb_evlist);
+
 out_delete_evlist:
perf_evlist__delete(top.evlist);
perf_session__delete(top.session);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index ed20f4379956..ec78e93085de 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -19,6 +19,7 @@
 #include "debug.h"
 #include "units.h"
 #include "asm/bug.h"
+#include "bpf-event.h"
 #include 
 #include 
 
@@ -1856,3 +1857,121 @@ struct perf_evsel *perf_evlist__reset_weak_group(struct 
perf_evlist *evsel_list,
}
return leader;
 }
+
+int perf_evlist__add_sb_event(struct perf_evlist **evlist,
+ struct perf_event_attr *attr,
+ perf_evsel__sb_cb_t cb,
+ void *data)
+{
+   struct perf_evsel *evsel;
+   bool new_evlist = (*evlist) == NULL;
+
+   if (*evlist == NULL)
+   *evlist = perf_evlist__new();
+   if (*evlist == NULL)
+   return -1;
+
+   if (!attr->sample_id_all) {
+   pr_warning("enabling 

[tip:perf/urgent] perf top: Add option --no-bpf-event

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  ee7a112fbcc8edb4cf2f84ce5fcc2da7818fd4b8
Gitweb: https://git.kernel.org/tip/ee7a112fbcc8edb4cf2f84ce5fcc2da7818fd4b8
Author: Song Liu 
AuthorDate: Mon, 11 Mar 2019 22:30:46 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 19 Mar 2019 16:52:07 -0300

perf top: Add option --no-bpf-event

This patch adds option --no-bpf-event to 'perf top', which is the same
as the option of 'perf record'.

The following patches will use this option.

Committer testing:

  # perf top -vv 2> /tmp/perf_event_attr.out
  # cat  /tmp/perf_event_attr.out
  
  perf_event_attr:
size 112
{ sample_period, sample_freq }   4000
sample_type  IP|TID|TIME|CPU|PERIOD
read_format  ID
disabled 1
inherit  1
mmap 1
comm 1
freq 1
task 1
precise_ip   3
sample_id_all1
exclude_guest1
mmap21
comm_exec1
ksymbol  1
bpf_event1
  
  #

After this patch:

  # perf top --no-bpf-event -vv 2> /tmp/perf_event_attr.out
  # cat  /tmp/perf_event_attr.out
  
  perf_event_attr:
size 112
{ sample_period, sample_freq }   4000
sample_type  IP|TID|TIME|CPU|PERIOD
read_format  ID
disabled 1
inherit  1
mmap 1
comm 1
freq 1
task 1
precise_ip   3
sample_id_all1
exclude_guest1
mmap21
comm_exec1
ksymbol  1
  
  #

Signed-off-by: Song Liu 
Tested-by: Arnaldo Carvalho de Melo 
Reviewed-by: Jiri Olsa 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stanislav Fomichev 
Cc: kernel-t...@fb.com
Link: http://lkml.kernel.org/r/20190312053051.2690567-11-songliubrav...@fb.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/builtin-top.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 77e6190211d2..c2ea22c4ea67 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1469,6 +1469,7 @@ int cmd_top(int argc, const char **argv)
"Display raw encoding of assembly instructions (default)"),
OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel,
"Enable kernel symbol demangling"),
+   OPT_BOOLEAN(0, "no-bpf-event", &top.record_opts.no_bpf_event, "do not record bpf events"),
OPT_STRING(0, "objdump", &top.annotation_opts.objdump_path, "path",
"objdump binary to use for disassembly and annotations"),
OPT_STRING('M', "disassembler-style", &top.annotation_opts.disassembler_style, "disassembler style",


[tip:perf/urgent] perf build: Check what binutils's 'disassembler()' signature to use

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  8a1b1718214cfd945fef14b3031e4e7262882a86
Gitweb: https://git.kernel.org/tip/8a1b1718214cfd945fef14b3031e4e7262882a86
Author: Song Liu 
AuthorDate: Mon, 11 Mar 2019 22:30:48 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Wed, 20 Mar 2019 16:42:10 -0300

perf build: Check what binutils's 'disassembler()' signature to use

Commit 003ca0fd2286 ("Refactor disassembler selection") in the binutils
repo changed the disassembler() function signature, so we must use the
feature test introduced in fb982666e380 ("tools/bpftool: fix bpftool
build with bintutils >= 2.9") to deal with that.
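
For reference, the feature test boils down to a compile/link check of
the four-argument signature, along the lines of
tools/build/feature/test-disassembler-four-args.c (paraphrased here,
treat it as a sketch rather than a verbatim copy):

  #include <stdio.h>
  #include <dis-asm.h>

  int main(void)
  {
          bfd *abfd = bfd_openr(NULL, NULL);

          /* only has to compile and link: probes the 4-argument signature */
          disassembler(bfd_get_arch(abfd),
                       bfd_big_endian(abfd),
                       bfd_get_mach(abfd),
                       abfd);

          return 0;
  }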

Committer testing:

After adding the missing function call to test-all.c, and:

  FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes

And the fallbacks for cases where we need -liberty and sometimes -lz to
tools/perf/Makefile.config, we get:

  $ make -C tools/perf O=/tmp/build/perf install-bin
  make: Entering directory '/home/acme/git/perf/tools/perf'
BUILD:   Doing 'make -j8' parallel build

  Auto-detecting system features:
  ... dwarf: [ on  ]
  ...dwarf_getlocations: [ on  ]
  ... glibc: [ on  ]
  ...  gtk2: [ on  ]
  ...  libaudit: [ on  ]
  ...libbfd: [ on  ]
  ...libelf: [ on  ]
  ...   libnuma: [ on  ]
  ...numa_num_possible_cpus: [ on  ]
  ...   libperl: [ on  ]
  ... libpython: [ on  ]
  ...  libslang: [ on  ]
  ... libcrypto: [ on  ]
  ... libunwind: [ on  ]
  ...libdw-dwarf-unwind: [ on  ]
  ...  zlib: [ on  ]
  ...  lzma: [ on  ]
  ... get_cpuid: [ on  ]
  ...   bpf: [ on  ]
  ...libaio: [ on  ]
  ...disassembler-four-args: [ on  ]
CC   /tmp/build/perf/jvmti/libjvmti.o
CC   /tmp/build/perf/builtin-bench.o
  
  $
  $

The feature detection test-all.bin gets successfully built and linked:

  $ ls -la /tmp/build/perf/feature/test-all.bin
  -rwxrwxr-x. 1 acme acme 2680352 Mar 19 11:07 
/tmp/build/perf/feature/test-all.bin
  $ nm /tmp/build/perf/feature/test-all.bin  | grep -w disassembler
  00061f90 T disassembler
  $

Time to move on to the patches that make use of this disassembler()
routine in binutils's libopcodes.

Signed-off-by: Song Liu 
Tested-by: Arnaldo Carvalho de Melo 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Jakub Kicinski 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Roman Gushchin 
Cc: Stanislav Fomichev 
Link: http://lkml.kernel.org/r/20190312053051.2690567-13-songliubrav...@fb.com
[ split from a larger patch, added missing 
FEATURE_CHECK_LDFLAGS-disassembler-four-args ]
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/build/Makefile.feature   | 6 --
 tools/build/feature/test-all.c | 5 +
 tools/perf/Makefile.config | 9 +
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 61e46d54a67c..8d3864b061f3 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -66,7 +66,8 @@ FEATURE_TESTS_BASIC :=  \
 sched_getcpu   \
 sdt\
 setns  \
-libaio
+libaio \
+disassembler-four-args
 
 # FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
 # of all feature tests
@@ -118,7 +119,8 @@ FEATURE_DISPLAY ?=  \
  lzma   \
  get_cpuid  \
  bpf   \
- libaio
+ libaio\
+ disassembler-four-args
 
 # Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features.
 # If in the future we need per-feature checks/flags for features not
diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c
index e903b86b742f..7853e6d91090 100644
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -178,6 +178,10 @@
 # include "test-reallocarray.c"
 #undef main
 
+#define main main_test_disassembler_four_args
+# include "test-disassembler-four-args.c"
+#undef main
+
 int main(int argc, char *argv[])
 {
main_test_libpython();
@@ -219,6 +223,7 @@ int main(int argc, char *argv[])
main_test_setns();
main_test_libaio();
main_test_reallocarray();
+   main_test_disassembler_four_args();
 
return 0;
 }
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index df4ad45599ca..fe3f97e342fa 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -227,6 +227,8 @@ FEATURE_CHECK_LDFLAGS-libpython-version := 
$(PYTHON_EMBED_LDOPTS)
 

[tip:perf/urgent] perf bpf: Process PERF_BPF_EVENT_PROG_LOAD for annotation

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  3ca3877a9732b68cf0289367a859f6c163a79bfa
Gitweb: https://git.kernel.org/tip/3ca3877a9732b68cf0289367a859f6c163a79bfa
Author: Song Liu 
AuthorDate: Mon, 11 Mar 2019 22:30:49 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 19 Mar 2019 16:52:07 -0300

perf bpf: Process PERF_BPF_EVENT_PROG_LOAD for annotation

This patch adds processing of PERF_BPF_EVENT_PROG_LOAD, which sets
proper DSO type/id/etc of memory regions mapped to BPF programs to
DSO_BINARY_TYPE__BPF_PROG_INFO.

Signed-off-by: Song Liu 
Reviewed-by: Jiri Olsa 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stanislav Fomichev 
Cc: kernel-t...@fb.com
Link: http://lkml.kernel.org/r/20190312053051.2690567-14-songliubrav...@fb.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/bpf-event.c | 54 +
 1 file changed, 54 insertions(+)

diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index a4fc52b4ffae..852e960692cb 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -12,6 +12,7 @@
 #include "machine.h"
 #include "env.h"
 #include "session.h"
+#include "map.h"
 
 #define ptr_to_u64(ptr)((__u64)(unsigned long)(ptr))
 
@@ -25,12 +26,65 @@ static int snprintf_hex(char *buf, size_t size, unsigned 
char *data, size_t len)
return ret;
 }
 
+static int machine__process_bpf_event_load(struct machine *machine,
+  union perf_event *event,
+  struct perf_sample *sample 
__maybe_unused)
+{
+   struct bpf_prog_info_linear *info_linear;
+   struct bpf_prog_info_node *info_node;
+   struct perf_env *env = machine->env;
+   int id = event->bpf_event.id;
+   unsigned int i;
+
+   /* perf-record, no need to handle bpf-event */
+   if (env == NULL)
+   return 0;
+
+   info_node = perf_env__find_bpf_prog_info(env, id);
+   if (!info_node)
+   return 0;
+   info_linear = info_node->info_linear;
+
+   for (i = 0; i < info_linear->info.nr_jited_ksyms; i++) {
+   u64 *addrs = (u64 *)(info_linear->info.jited_ksyms);
+   u64 addr = addrs[i];
+   struct map *map;
+
+   map = map_groups__find(&machine->kmaps, addr);
+
+   if (map) {
+   map->dso->binary_type = DSO_BINARY_TYPE__BPF_PROG_INFO;
+   map->dso->bpf_prog.id = id;
+   map->dso->bpf_prog.sub_id = i;
+   map->dso->bpf_prog.env = env;
+   }
+   }
+   return 0;
+}
+
 int machine__process_bpf_event(struct machine *machine __maybe_unused,
   union perf_event *event,
   struct perf_sample *sample __maybe_unused)
 {
if (dump_trace)
perf_event__fprintf_bpf_event(event, stdout);
+
+   switch (event->bpf_event.type) {
+   case PERF_BPF_EVENT_PROG_LOAD:
+   return machine__process_bpf_event_load(machine, event, sample);
+
+   case PERF_BPF_EVENT_PROG_UNLOAD:
+   /*
+* Do not free bpf_prog_info and btf of the program here,
+* as annotation still need them. They will be freed at
+* the end of the session.
+*/
+   break;
+   default:
+   pr_debug("unexpected bpf_event type of %d\n",
+event->bpf_event.type);
+   break;
+   }
return 0;
 }
 


[tip:perf/urgent] perf symbols: Introduce DSO_BINARY_TYPE__BPF_PROG_INFO

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  9b86d04d53b98399017fea44e9047165ffe12d42
Gitweb: https://git.kernel.org/tip/9b86d04d53b98399017fea44e9047165ffe12d42
Author: Song Liu 
AuthorDate: Mon, 11 Mar 2019 22:30:48 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 19 Mar 2019 16:52:07 -0300

perf symbols: Introduce DSO_BINARY_TYPE__BPF_PROG_INFO

Introduce a new dso type DSO_BINARY_TYPE__BPF_PROG_INFO for BPF programs. In
symbol__disassemble(), DSO_BINARY_TYPE__BPF_PROG_INFO dso will call into a new
function symbol__disassemble_bpf() in an upcoming patch, where annotation line
information is filled based on bpf_prog_info and btf saved in the given perf_env.

Committer notes:

Removed the unnamed union with 'bpf_prog' and 'cache' in 'struct dso',
to fix this bug when exiting 'perf top':

  # perf top
  perf: Segmentation fault
   backtrace 
  perf[0x5a785a]
  /lib64/libc.so.6(+0x385bf)[0x7fd68443c5bf]
  perf(rb_first+0x2b)[0x4d6eeb]
  perf(dso__delete+0xb7)[0x4dffb7]
  perf[0x4f9e37]
  perf(perf_session__delete+0x64)[0x504df4]
  perf(cmd_top+0x1957)[0x454467]
  perf[0x4aad18]
  perf(main+0x61c)[0x42ec7c]
  /lib64/libc.so.6(__libc_start_main+0xf2)[0x7fd684428412]
  perf(_start+0x2d)[0x42eead]
  #
  # addr2line -fe ~/bin/perf 0x4dffb7
  dso_cache__free
  /home/acme/git/perf/tools/perf/util/dso.c:713

That is trying to access the dso->data.cache, and that is not used with
BPF programs, so we end up accessing what is in bpf_prog.first_member,
b00m.

Signed-off-by: Song Liu 
Reviewed-by: Jiri Olsa 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stanislav Fomichev 
Cc: kernel-t...@fb.com
Link: http://lkml.kernel.org/r/20190312053051.2690567-13-songliubrav...@fb.com
[ split from a larger patch ]
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/dso.c| 1 +
 tools/perf/util/dso.h| 8 
 tools/perf/util/symbol.c | 1 +
 3 files changed, 10 insertions(+)

diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index ab8a455d2283..e059976d9d93 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -184,6 +184,7 @@ int dso__read_binary_type_filename(const struct dso *dso,
case DSO_BINARY_TYPE__KALLSYMS:
case DSO_BINARY_TYPE__GUEST_KALLSYMS:
case DSO_BINARY_TYPE__JAVA_JIT:
+   case DSO_BINARY_TYPE__BPF_PROG_INFO:
case DSO_BINARY_TYPE__NOT_FOUND:
ret = -1;
break;
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index bb417c54c25a..6e3f63781e51 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -14,6 +14,7 @@
 
 struct machine;
 struct map;
+struct perf_env;
 
 enum dso_binary_type {
DSO_BINARY_TYPE__KALLSYMS = 0,
@@ -35,6 +36,7 @@ enum dso_binary_type {
DSO_BINARY_TYPE__KCORE,
DSO_BINARY_TYPE__GUEST_KCORE,
DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO,
+   DSO_BINARY_TYPE__BPF_PROG_INFO,
DSO_BINARY_TYPE__NOT_FOUND,
 };
 
@@ -189,6 +191,12 @@ struct dso {
u64  debug_frame_offset;
u64  eh_frame_hdr_offset;
} data;
+   /* bpf prog information */
+   struct {
+   u32 id;
+   u32 sub_id;
+   struct perf_env *env;
+   } bpf_prog;
 
union { /* Tool specific area */
void *priv;
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 58442ca5e3c4..5cbad55cd99d 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1455,6 +1455,7 @@ static bool dso__is_compatible_symtab_type(struct dso 
*dso, bool kmod,
case DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO:
return true;
 
+   case DSO_BINARY_TYPE__BPF_PROG_INFO:
case DSO_BINARY_TYPE__NOT_FOUND:
default:
return false;


[tip:perf/urgent] perf bpf: Save BTF in a rbtree in perf_env

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  3792cb2ff43b1b193136a03ce1336462a827d792
Gitweb: https://git.kernel.org/tip/3792cb2ff43b1b193136a03ce1336462a827d792
Author: Song Liu 
AuthorDate: Mon, 11 Mar 2019 22:30:44 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 19 Mar 2019 16:52:07 -0300

perf bpf: Save BTF in a rbtree in perf_env

BTF contains information necessary to annotate BPF programs. This patch
saves BTF for BPF programs loaded in the system.

Signed-off-by: Song Liu 
Reviewed-by: Jiri Olsa 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stanislav Fomichev 
Cc: kernel-t...@fb.com
Link: http://lkml.kernel.org/r/20190312053051.2690567-9-songliubrav...@fb.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/bpf-event.c | 23 
 tools/perf/util/bpf-event.h |  7 +
 tools/perf/util/env.c   | 67 +
 tools/perf/util/env.h   |  5 
 4 files changed, 102 insertions(+)

diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index 37ee4e2a728a..a4fc52b4ffae 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -34,6 +34,28 @@ int machine__process_bpf_event(struct machine *machine 
__maybe_unused,
return 0;
 }
 
+static int perf_env__fetch_btf(struct perf_env *env,
+  u32 btf_id,
+  struct btf *btf)
+{
+   struct btf_node *node;
+   u32 data_size;
+   const void *data;
+
+   data = btf__get_raw_data(btf, &data_size);
+
+   node = malloc(data_size + sizeof(struct btf_node));
+   if (!node)
+   return -1;
+
+   node->id = btf_id;
+   node->data_size = data_size;
+   memcpy(node->data, data, data_size);
+
+   perf_env__insert_btf(env, node);
+   return 0;
+}
+
 /*
  * Synthesize PERF_RECORD_KSYMBOL and PERF_RECORD_BPF_EVENT for one bpf
  * program. One PERF_RECORD_BPF_EVENT is generated for the program. And
@@ -113,6 +135,7 @@ static int perf_event__synthesize_one_bpf_prog(struct 
perf_session *session,
goto out;
}
has_btf = true;
+   perf_env__fetch_btf(env, info->btf_id, btf);
}
 
/* Synthesize PERF_RECORD_KSYMBOL */
diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h
index fad932f7404f..b9ec394dc7c7 100644
--- a/tools/perf/util/bpf-event.h
+++ b/tools/perf/util/bpf-event.h
@@ -16,6 +16,13 @@ struct bpf_prog_info_node {
struct rb_node  rb_node;
 };
 
+struct btf_node {
+   struct rb_node  rb_node;
+   u32 id;
+   u32 data_size;
+   chardata[];
+};
+
 #ifdef HAVE_LIBBPF_SUPPORT
 int machine__process_bpf_event(struct machine *machine, union perf_event 
*event,
   struct perf_sample *sample);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index 98cd36f0e317..c6351b557bb0 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -64,6 +64,58 @@ struct bpf_prog_info_node 
*perf_env__find_bpf_prog_info(struct perf_env *env,
return node;
 }
 
+void perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node)
+{
+   struct rb_node *parent = NULL;
+   __u32 btf_id = btf_node->id;
+   struct btf_node *node;
+   struct rb_node **p;
+
+   down_write(&env->bpf_progs.lock);
+   p = &env->bpf_progs.btfs.rb_node;
+
+   while (*p != NULL) {
+   parent = *p;
+   node = rb_entry(parent, struct btf_node, rb_node);
+   if (btf_id < node->id) {
+   p = &(*p)->rb_left;
+   } else if (btf_id > node->id) {
+   p = &(*p)->rb_right;
+   } else {
+   pr_debug("duplicated btf %u\n", btf_id);
+   goto out;
+   }
+   }
+
+   rb_link_node(&btf_node->rb_node, parent, p);
+   rb_insert_color(&btf_node->rb_node, &env->bpf_progs.btfs);
+   env->bpf_progs.btfs_cnt++;
+out:
+   up_write(&env->bpf_progs.lock);
+}
+
+struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id)
+{
+   struct btf_node *node = NULL;
+   struct rb_node *n;
+
+   down_read(&env->bpf_progs.lock);
+   n = env->bpf_progs.btfs.rb_node;
+
+   while (n) {
+   node = rb_entry(n, struct btf_node, rb_node);
+   if (btf_id < node->id)
+   n = n->rb_left;
+   else if (btf_id > node->id)
+   n = n->rb_right;
+   else
+   break;
+   }
+
+   up_read(&env->bpf_progs.lock);
+   return node;
+}
+
 /* purge data in bpf_progs.infos tree */
 static void perf_env__purge_bpf(struct perf_env *env)
 {
@@ -86,6 +138,20 @@ static void perf_env__purge_bpf(struct perf_env *env)
 
env->bpf_progs.infos_cnt = 0;
 
+   root = &env->bpf_progs.btfs;
+   next = rb_first(root);
+
+   

[tip:perf/urgent] perf feature detection: Add -lopcodes to feature-libbfd

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  31be9478ed7f43d6351e0d5a2257ca76609c83d3
Gitweb: https://git.kernel.org/tip/31be9478ed7f43d6351e0d5a2257ca76609c83d3
Author: Song Liu 
AuthorDate: Mon, 11 Mar 2019 22:30:47 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 19 Mar 2019 16:52:07 -0300

perf feature detection: Add -lopcodes to feature-libbfd

Both libbfd and libopcodes are distributed with binutils-dev/devel. When
libbfd is present, it is OK to assume that libopcodes is also present. This
has been a safe assumption for bpftool.

This patch adds -lopcodes to perf/Makefile.config. libopcodes will be
used in the next commit for BPF annotation.

Signed-off-by: Song Liu 
Reviewed-by: Jiri Olsa 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stanislav Fomichev 
Cc: kernel-t...@fb.com
Link: http://lkml.kernel.org/r/20190312053051.2690567-12-songliubrav...@fb.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/Makefile.config | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 0f11d5891301..df4ad45599ca 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -713,7 +713,7 @@ else
 endif
 
 ifeq ($(feature-libbfd), 1)
-  EXTLIBS += -lbfd
+  EXTLIBS += -lbfd -lopcodes
 else
   # we are on a system that requires -liberty and (maybe) -lz
   # to link against -lbfd; test each case individually here
@@ -724,10 +724,10 @@ else
   $(call feature_check,libbfd-liberty-z)
 
   ifeq ($(feature-libbfd-liberty), 1)
-EXTLIBS += -lbfd -liberty
+EXTLIBS += -lbfd -lopcodes -liberty
   else
 ifeq ($(feature-libbfd-liberty-z), 1)
-  EXTLIBS += -lbfd -liberty -lz
+  EXTLIBS += -lbfd -lopcodes -liberty -lz
 endif
   endif
 endif


[tip:perf/urgent] perf bpf: Save BTF information as headers to perf.data

2019-03-22 Thread tip-bot for Song Liu
Commit-ID:  a70a1123174ab592c5fa8ecf09f9fad9b335b872
Gitweb: https://git.kernel.org/tip/a70a1123174ab592c5fa8ecf09f9fad9b335b872
Author: Song Liu 
AuthorDate: Mon, 11 Mar 2019 22:30:45 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 19 Mar 2019 16:52:07 -0300

perf bpf: Save BTF information as headers to perf.data

This patch enables 'perf record' to save BTF information as headers to
perf.data.

A new header type HEADER_BPF_BTF is introduced for this data.

Committer testing:

As root, being on the kernel sources top level directory, run:

# perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c -e *msg

Just to compile and load a BPF program that attaches to the
raw_syscalls:sys_{enter,exit} tracepoints to trace the syscalls ending
in "msg" (recvmsg, sendmsg, recvmmsg, sendmmsg, etc).

Make sure you have a recent enough clang, say version 9, to get the
BTF ELF sections needed for this testing:

  # clang --version | head -1
  clang version 9.0.0 (https://git.llvm.org/git/clang.git/ 
7906282d3afec5dfdc2b27943fd6c0309086c507) (https://git.llvm.org/git/llvm.git/ 
a1b5de1ff8ae8bc79dc8e86e1f82565229bd0500)
  # readelf -SW tools/perf/examples/bpf/augmented_raw_syscalls.o | grep BTF
[22] .BTF  PROGBITS 000ede 000b0e 00
  0   0  1
[23] .BTF.ext  PROGBITS 0019ec 0002a0 00
  0   0  1
[24] .rel.BTF.ext  REL  002fa8 000270 10
 30  23  8

Then do a systemwide perf record session for a few seconds:

  # perf record -a sleep 2s

Then look at:

  # perf report --header-only | grep b[pt]f
  # event : name = cycles:ppp, , id = { 1116204, 1116205, 1116206, 1116207, 
1116208, 1116209, 1116210, 1116211 }, size = 112, { sample_period, sample_freq 
} = 4000, sample_type = IP|TID|TIME|PERIOD, read_format = ID, disabled = 1, 
inherit = 1, mmap = 1, comm = 1, freq = 1, enable_on_exec = 1, task = 1, 
precise_ip = 3, sample_id_all = 1, exclude_guest = 1, mmap2 = 1, comm_exec = 1, 
ksymbol = 1, bpf_event = 1
  # bpf_prog_info of id 13
  # bpf_prog_info of id 14
  # bpf_prog_info of id 15
  # bpf_prog_info of id 16
  # bpf_prog_info of id 17
  # bpf_prog_info of id 18
  # bpf_prog_info of id 21
  # bpf_prog_info of id 22
  # bpf_prog_info of id 51
  # bpf_prog_info of id 52
  # btf info of id 8
  #

We need to show more info about these BPF and BTF entries, but that can
be done later.

Signed-off-by: Song Liu 
Reviewed-by: Jiri Olsa 
Tested-by: Arnaldo Carvalho de Melo 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stanislav Fomichev 
Cc: kernel-t...@fb.com
Link: http://lkml.kernel.org/r/20190312053051.2690567-10-songliubrav...@fb.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/header.c | 101 ++-
 tools/perf/util/header.h |   1 +
 2 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index e6a81af516f6..01dda2f65d36 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -928,6 +928,39 @@ static int write_bpf_prog_info(struct feat_fd *ff 
__maybe_unused,
 }
 #endif // HAVE_LIBBPF_SUPPORT
 
+static int write_bpf_btf(struct feat_fd *ff,
+struct perf_evlist *evlist __maybe_unused)
+{
+   struct perf_env *env = &ff->ph->env;
+   struct rb_root *root;
+   struct rb_node *next;
+   int ret;
+
+   down_read(&env->bpf_progs.lock);
+
+   ret = do_write(ff, &env->bpf_progs.btfs_cnt,
+  sizeof(env->bpf_progs.btfs_cnt));
+
+   if (ret < 0)
+   goto out;
+
+   root = &env->bpf_progs.btfs;
+   next = rb_first(root);
+   while (next) {
+   struct btf_node *node;
+
+   node = rb_entry(next, struct btf_node, rb_node);
+   next = rb_next(&node->rb_node);
+   ret = do_write(ff, &node->id,
+  sizeof(u32) * 2 + node->data_size);
+   if (ret < 0)
+   goto out;
+   }
+out:
+   up_read(&env->bpf_progs.lock);
+   return ret;
+}
+
 static int cpu_cache_level__sort(const void *a, const void *b)
 {
struct cpu_cache_level *cache_a = (struct cpu_cache_level *)a;
@@ -1442,6 +1475,28 @@ static void print_bpf_prog_info(struct feat_fd *ff, FILE 
*fp)
up_read(>bpf_progs.lock);
 }
 
+static void print_bpf_btf(struct feat_fd *ff, FILE *fp)
+{
+   struct perf_env *env = &ff->ph->env;
+   struct rb_root *root;
+   struct rb_node *next;
+
+   down_read(&env->bpf_progs.lock);
+
+   root = &env->bpf_progs.btfs;
+   next = rb_first(root);
+
+   while (next) {
+   struct btf_node *node;
+
+   node = rb_entry(next, struct btf_node, rb_node);
+   next = rb_next(&node->rb_node);
+   fprintf(fp, "# btf info of id %u\n", node->id);
+   }
+
+   up_read(&env->bpf_progs.lock);
+}
+
 static void 
