[tip:perf/core] perf top: Support lookup of symbols in other mount namespaces.

2017-07-26 Thread tip-bot for Krister Johansen
Commit-ID:  868a832918f621b7576655c00067f20326ef3931
Gitweb: http://git.kernel.org/tip/868a832918f621b7576655c00067f20326ef3931
Author: Krister Johansen 
AuthorDate: Wed, 5 Jul 2017 18:48:12 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 25 Jul 2017 22:43:16 -0300

perf top: Support lookup of symbols in other mount namespaces.

The perf top command needs to unshare its fs from the helper threads in
order to successfully setns(2) during its symbol lookup.  It also needs
to impelement a force flag to ignore ownership of perf-.map files.

Signed-off-by: Krister Johansen 
Cc: Alexander Shishkin 
Cc: Brendan Gregg 
Cc: Peter Zijlstra 
Cc: Thomas-Mich Richter 
Link: 
http://lkml.kernel.org/r/1499305693-1599-6-git-send-email-k...@templeofstupid.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/Documentation/perf-top.txt |  4 
 tools/perf/builtin-top.c  | 15 +++
 2 files changed, 19 insertions(+)

diff --git a/tools/perf/Documentation/perf-top.txt 
b/tools/perf/Documentation/perf-top.txt
index e71d638..d864ea6 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -237,6 +237,10 @@ Default is to monitor all CPUS.
 --hierarchy::
Enable hierarchy output.
 
+--force::
+   Don't do ownership validation.
+
+
 INTERACTIVE PROMPTING KEYS
 --
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index e5a8f24..ee954bd 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -587,6 +587,13 @@ static void *display_thread_tui(void *arg)
.refresh= top->delay_secs,
};
 
+   /* In order to read symbols from other namespaces perf to  needs to call
+* setns(2).  This isn't permitted if the struct_fs has multiple users.
+* unshare(2) the fs so that we may continue to setns into namespaces
+* that we're observing.
+*/
+   unshare(CLONE_FS);
+
perf_top__sort_new_samples(top);
 
/*
@@ -628,6 +635,13 @@ static void *display_thread(void *arg)
struct perf_top *top = arg;
int delay_msecs, c;
 
+   /* In order to read symbols from other namespaces perf to  needs to call
+* setns(2).  This isn't permitted if the struct_fs has multiple users.
+* unshare(2) the fs so that we may continue to setns into namespaces
+* that we're observing.
+*/
+   unshare(CLONE_FS);
+
display_setup_sig();
pthread__unblock_sigwinch();
 repeat:
@@ -1206,6 +1220,7 @@ int cmd_top(int argc, const char **argv)
"Show raw trace event output (do not use print fmt or 
plugins)"),
OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
"Show entries in a hierarchy"),
+   OPT_BOOLEAN(0, "force", &symbol_conf.force, "don't complain, do it"),
OPT_END()
};
const char * const top_usage[] = {


[tip:perf/core] perf probe: Allow placing uprobes in alternate namespaces.

2017-07-20 Thread tip-bot for Krister Johansen
Commit-ID:  544abd44c7064c8a58a6bd2073d757f6b91d98c5
Gitweb: http://git.kernel.org/tip/544abd44c7064c8a58a6bd2073d757f6b91d98c5
Author: Krister Johansen 
AuthorDate: Wed, 5 Jul 2017 18:48:10 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 18 Jul 2017 23:14:10 -0300

perf probe: Allow placing uprobes in alternate namespaces.

Teaches perf how to place a uprobe on a file that's in a different mount
namespace.  The user must add the probe using the --target-ns argument
to perf probe.  Once it has been placed, it may be recorded against
without further namespace-specific commands.

Signed-off-by: Krister Johansen 
Cc: Alexander Shishkin 
Cc: Brendan Gregg 
Cc: Peter Zijlstra 
Cc: Ravi Bangoria 
[ PPC build fixed by Ravi: ]
Link: 
http://lkml.kernel.org/r/1500287542-6219-1-git-send-email-ravi.bango...@linux.vnet.ibm.com
Cc: Thomas-Mich Richter 
[ Fix !HAVE_DWARF_SUPPORT build ]
Link: 
http://lkml.kernel.org/r/1499305693-1599-4-git-send-email-k...@templeofstupid.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/Documentation/perf-probe.txt |  9 
 tools/perf/arch/powerpc/util/sym-handling.c |  2 +-
 tools/perf/builtin-probe.c  | 43 ++--
 tools/perf/util/namespaces.c| 13 +
 tools/perf/util/namespaces.h|  2 +
 tools/perf/util/probe-event.c   | 80 +++--
 tools/perf/util/probe-event.h   | 10 ++--
 7 files changed, 125 insertions(+), 34 deletions(-)

diff --git a/tools/perf/Documentation/perf-probe.txt 
b/tools/perf/Documentation/perf-probe.txt
index 165c2b1..a42aabc 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -130,6 +130,11 @@ OPTIONS
 --max-probes=NUM::
Set the maximum number of probe points for an event. Default is 128.
 
+--target-ns=PID:
+   Obtain mount namespace information from the target pid.  This is
+   used when creating a uprobe for a process that resides in a
+   different mount namespace from the perf(1) utility.
+
 -x::
 --exec=PATH::
Specify path to the executable or shared library file for user
@@ -264,6 +269,10 @@ Add probes at malloc() function on libc
 
  ./perf probe -x /lib/libc.so.6 malloc or ./perf probe /lib/libc.so.6 malloc
 
+Add a uprobe to a target process running in a different mount namespace
+
+ ./perf probe --target-ns  -x /lib64/libc.so.6 malloc
+
 SEE ALSO
 
 linkperf:perf-trace[1], linkperf:perf-record[1], linkperf:perf-buildid-cache[1]
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c 
b/tools/perf/arch/powerpc/util/sym-handling.c
index bf9a259..9c4e23d 100644
--- a/tools/perf/arch/powerpc/util/sym-handling.c
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -126,7 +126,7 @@ void arch__post_process_probe_trace_events(struct 
perf_probe_event *pev,
struct rb_node *tmp;
int i = 0;
 
-   map = get_target_map(pev->target, pev->uprobes);
+   map = get_target_map(pev->target, pev->nsi, pev->uprobes);
if (!map || map__load(map) < 0)
return;
 
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index cf9f9e9..3fb98d5 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -58,6 +58,7 @@ static struct {
struct line_range line_range;
char *target;
struct strfilter *filter;
+   struct nsinfo *nsi;
 } params;
 
 /* Parse an event definition. Note that any error must die. */
@@ -80,6 +81,9 @@ static int parse_probe_event(const char *str)
params.target_used = true;
}
 
+   if (params.nsi)
+   pev->nsi = nsinfo__get(params.nsi);
+
/* Parse a perf-probe command into event */
ret = parse_perf_probe_command(str, pev);
pr_debug("%d arguments\n", pev->nargs);
@@ -189,7 +193,7 @@ static int opt_set_target(const struct option *opt, const 
char *str,
 
/* Expand given path to absolute path, except for modulename */
if (params.uprobes || strchr(str, '/')) {
-   tmp = realpath(str, NULL);
+   tmp = nsinfo__realpath(str, params.nsi);
if (!tmp) {
pr_warning("Failed to get the absolute path of 
%s: %m\n", str);
return ret;
@@ -208,6 +212,34 @@ static int opt_set_target(const struct option *opt, const 
char *str,
return ret;
 }
 
+static int opt_set_target_ns(const struct option *opt __maybe_unused,
+const char *str, int unset __maybe_unused)
+{
+   int ret = -ENOENT;
+   pid_t ns_pid;
+   struct nsinfo *nsip;
+
+   if (str) {
+   errno = 0;
+   ns_pid = (pid_t)strtol(str, NULL, 10);
+   if (errno != 0) {
+   ret = -errno;
+   

[tip:perf/core] perf buildid-cache: Cache debuginfo

2017-07-20 Thread tip-bot for Krister Johansen
Commit-ID:  d2396999c998b4e0006aef247e154eff0ed3d8f9
Gitweb: http://git.kernel.org/tip/d2396999c998b4e0006aef247e154eff0ed3d8f9
Author: Krister Johansen 
AuthorDate: Wed, 5 Jul 2017 18:48:13 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 18 Jul 2017 23:14:11 -0300

perf buildid-cache: Cache debuginfo

If a stripped binary is placed in the cache, the user is in a situation
where there's a cached elf file present, but it doesn't have any symtab
to use for name resolution.  Grab the debuginfo for binaries that don't
end in .ko.  This yields a better chance of resolving symbols from older
traces.

Signed-off-by: Krister Johansen 
Cc: Alexander Shishkin 
Cc: Brendan Gregg 
Cc: Peter Zijlstra 
Cc: Thomas-Mich Richter 
Link: 
http://lkml.kernel.org/r/1499305693-1599-7-git-send-email-k...@templeofstupid.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/builtin-buildid-cache.c |  2 +-
 tools/perf/util/annotate.c |  2 +-
 tools/perf/util/build-id.c | 72 +++---
 tools/perf/util/build-id.h |  3 +-
 tools/perf/util/dso.c  |  8 -
 tools/perf/util/dso.h  |  1 +
 tools/perf/util/machine.c  |  3 +-
 tools/perf/util/symbol.c   | 12 +--
 8 files changed, 90 insertions(+), 13 deletions(-)

diff --git a/tools/perf/builtin-buildid-cache.c 
b/tools/perf/builtin-buildid-cache.c
index d65bd86..e3eb624 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -243,7 +243,7 @@ static bool dso__missing_buildid_cache(struct dso *dso, int 
parm __maybe_unused)
char filename[PATH_MAX];
u8 build_id[BUILD_ID_SIZE];
 
-   if (dso__build_id_filename(dso, filename, sizeof(filename)) &&
+   if (dso__build_id_filename(dso, filename, sizeof(filename), false) &&
filename__read_build_id(filename, build_id,
sizeof(build_id)) != sizeof(build_id)) {
if (errno == ENOENT)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index ef434b5..1742510 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1347,7 +1347,7 @@ static int dso__disassemble_filename(struct dso *dso, 
char *filename, size_t fil
!dso__is_kcore(dso))
return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX;
 
-   build_id_filename = dso__build_id_filename(dso, NULL, 0);
+   build_id_filename = dso__build_id_filename(dso, NULL, 0, false);
if (build_id_filename) {
__symbol__join_symfs(filename, filename_size, 
build_id_filename);
free(build_id_filename);
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index f7bfd90..e966515 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -243,12 +243,15 @@ static bool build_id_cache__valid_id(char *sbuild_id)
return result;
 }
 
-static const char *build_id_cache__basename(bool is_kallsyms, bool is_vdso)
+static const char *build_id_cache__basename(bool is_kallsyms, bool is_vdso,
+   bool is_debug)
 {
-   return is_kallsyms ? "kallsyms" : (is_vdso ? "vdso" : "elf");
+   return is_kallsyms ? "kallsyms" : (is_vdso ? "vdso" : (is_debug ?
+   "debug" : "elf"));
 }
 
-char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
+char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size,
+bool is_debug)
 {
bool is_kallsyms = dso__is_kallsyms((struct dso *)dso);
bool is_vdso = dso__is_vdso((struct dso *)dso);
@@ -270,7 +273,8 @@ char *dso__build_id_filename(const struct dso *dso, char 
*bf, size_t size)
ret = asnprintf(&bf, size, "%s", linkname);
else
ret = asnprintf(&bf, size, "%s/%s", linkname,
-build_id_cache__basename(is_kallsyms, is_vdso));
+build_id_cache__basename(is_kallsyms, is_vdso,
+ is_debug));
if (ret < 0 || (!alloc && size < (unsigned int)ret))
bf = NULL;
free(linkname);
@@ -603,12 +607,40 @@ static int build_id_cache__add_sdt_cache(const char 
*sbuild_id,
 #define build_id_cache__add_sdt_cache(sbuild_id, realname, nsi) (0)
 #endif
 
+static char *build_id_cache__find_debug(const char *sbuild_id,
+   struct nsinfo *nsi)
+{
+   char *realname = NULL;
+   char *debugfile;
+   struct nscookie nsc;
+   size_t len = 0;
+
+   debugfile = calloc(1, PATH_MAX);
+   if (!debugfile)
+   goto out;
+
+   len = __symbol__join_symfs(debugfile, PATH_MAX,
+  "/usr/lib/debug/.build-id/");
+   snprintf(d

[tip:perf/core] perf buildid-cache: Support binary objects from other namespaces

2017-07-20 Thread tip-bot for Krister Johansen
Commit-ID:  f045b8c4b36baddcfbdd4d3d956446e688b0b3cd
Gitweb: http://git.kernel.org/tip/f045b8c4b36baddcfbdd4d3d956446e688b0b3cd
Author: Krister Johansen 
AuthorDate: Wed, 5 Jul 2017 18:48:11 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 18 Jul 2017 23:14:11 -0300

perf buildid-cache: Support binary objects from other namespaces

Teach buildid-cache how to add, remove, and update binary objects from
other mount namespaces.  Allow probe events tracing binaries in
different namespaces to add their objects to the probe and build-id
caches too.  As a handy side effect, this also lets us access SDT probes
in binaries from alternate mount namespaces.

Signed-off-by: Krister Johansen 
Tested-by: Brendan Gregg 
Cc: Alexander Shishkin 
Cc: Peter Zijlstra 
Cc: Thomas-Mich Richter 
Link: 
http://lkml.kernel.org/r/1499305693-1599-5-git-send-email-k...@templeofstupid.com
[ Add util/namespaces.c to tools/perf/util/python-ext-sources, to fix the 
python binding 'perf test' ]
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/Documentation/perf-buildid-cache.txt |  5 +++
 tools/perf/Documentation/perf-probe.txt |  5 +++
 tools/perf/builtin-buildid-cache.c  | 52 ++---
 tools/perf/builtin-probe.c  |  2 +-
 tools/perf/tests/sdt.c  |  4 +-
 tools/perf/util/build-id.c  | 47 ++
 tools/perf/util/build-id.h  |  9 +++--
 tools/perf/util/dso.c   | 12 +-
 tools/perf/util/parse-events.c  |  2 +-
 tools/perf/util/probe-event.c   |  6 +--
 tools/perf/util/probe-file.c| 19 +
 tools/perf/util/probe-file.h|  4 +-
 tools/perf/util/python-ext-sources  |  1 +
 tools/perf/util/symbol.c| 19 ++---
 tools/perf/util/util.c  | 34 +---
 tools/perf/util/util.h  |  2 +
 16 files changed, 160 insertions(+), 63 deletions(-)

diff --git a/tools/perf/Documentation/perf-buildid-cache.txt 
b/tools/perf/Documentation/perf-buildid-cache.txt
index 058064d..8468100 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -61,6 +61,11 @@ OPTIONS
 --verbose::
Be more verbose.
 
+--target-ns=PID:
+   Obtain mount namespace information from the target pid.  This is
+   used when creating a uprobe for a process that resides in a
+   different mount namespace from the perf(1) utility.
+
 SEE ALSO
 
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-buildid-list[1]
diff --git a/tools/perf/Documentation/perf-probe.txt 
b/tools/perf/Documentation/perf-probe.txt
index a42aabc..d7e4869 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -273,6 +273,11 @@ Add a uprobe to a target process running in a different 
mount namespace
 
  ./perf probe --target-ns  -x /lib64/libc.so.6 malloc
 
+Add a USDT probe to a target process running in a different mount namespace
+
+ ./perf probe --target-ns  -x 
/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.121-0.b13.el7_3.x86_64/jre/lib/amd64/server/libjvm.so
 %sdt_hotspot:thread__sleep__end
+
+
 SEE ALSO
 
 linkperf:perf-trace[1], linkperf:perf-record[1], linkperf:perf-buildid-cache[1]
diff --git a/tools/perf/builtin-buildid-cache.c 
b/tools/perf/builtin-buildid-cache.c
index 9eba7f1..d65bd86 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -14,6 +14,7 @@
 #include 
 #include "builtin.h"
 #include "perf.h"
+#include "namespaces.h"
 #include "util/cache.h"
 #include "util/debug.h"
 #include "util/header.h"
@@ -165,33 +166,41 @@ static int build_id_cache__add_kcore(const char 
*filename, bool force)
return 0;
 }
 
-static int build_id_cache__add_file(const char *filename)
+static int build_id_cache__add_file(const char *filename, struct nsinfo *nsi)
 {
char sbuild_id[SBUILD_ID_SIZE];
u8 build_id[BUILD_ID_SIZE];
int err;
+   struct nscookie nsc;
 
-   if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) 
{
+   nsinfo__mountns_enter(nsi, &nsc);
+   err = filename__read_build_id(filename, &build_id, sizeof(build_id));
+   nsinfo__mountns_exit(&nsc);
+   if (err < 0) {
pr_debug("Couldn't read a build-id in %s\n", filename);
return -1;
}
 
build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
-   err = build_id_cache__add_s(sbuild_id, filename,
+   err = build_id_cache__add_s(sbuild_id, filename, nsi,
false, false);
pr_debug("Adding %s %s: %s\n", sbuild_id, filename,
 err

[tip:perf/core] perf maps: Lookup maps in both intitial mountns and inner mountns.

2017-07-20 Thread tip-bot for Krister Johansen
Commit-ID:  bf2e710b3cb8445c052f2ff50b4875a2523bb279
Gitweb: http://git.kernel.org/tip/bf2e710b3cb8445c052f2ff50b4875a2523bb279
Author: Krister Johansen 
AuthorDate: Wed, 5 Jul 2017 18:48:09 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 18 Jul 2017 23:14:09 -0300

perf maps: Lookup maps in both intitial mountns and inner mountns.

If a process is in a mountns and has symbols in /tmp/perf-.map,
look first in the namespace using the tgid for the pidns that the
process might be in.  If the map isn't found there, try looking in the
mountns where perf is running, and use the tgid that's appropriate for
perf's pid namespace.  If all else fails, use the original pid.

This allows us to locate a symbol map file in the mount namespace, if it
was generated there.  However, we also try the tool's /tmp in case it's
there instead.

Signed-off-by: Krister Johansen 
Tested-by: Brendan Gregg 
Cc: Alexander Shishkin 
Cc: Peter Zijlstra 
Cc: Thomas-Mich Richter 
Link: 
http://lkml.kernel.org/r/1499305693-1599-3-git-send-email-k...@templeofstupid.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/machine.c|  4 +--
 tools/perf/util/map.c| 23 +---
 tools/perf/util/map.h|  2 +-
 tools/perf/util/namespaces.c | 83 
 tools/perf/util/namespaces.h |  5 ++-
 tools/perf/util/symbol.c | 70 ++---
 6 files changed, 160 insertions(+), 27 deletions(-)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 2e9eb6a..246b441 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1392,7 +1392,7 @@ int machine__process_mmap2_event(struct machine *machine,
 
map = map__new(machine, event->mmap2.start,
event->mmap2.len, event->mmap2.pgoff,
-   event->mmap2.pid, event->mmap2.maj,
+   event->mmap2.maj,
event->mmap2.min, event->mmap2.ino,
event->mmap2.ino_generation,
event->mmap2.prot,
@@ -1450,7 +1450,7 @@ int machine__process_mmap_event(struct machine *machine, 
union perf_event *event
 
map = map__new(machine, event->mmap.start,
event->mmap.len, event->mmap.pgoff,
-   event->mmap.pid, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0,
event->mmap.filename,
type, thread);
 
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 5dc60ca..bdaa0a4 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -146,11 +146,13 @@ void map__init(struct map *map, enum map_type type,
 }
 
 struct map *map__new(struct machine *machine, u64 start, u64 len,
-u64 pgoff, u32 pid, u32 d_maj, u32 d_min, u64 ino,
+u64 pgoff, u32 d_maj, u32 d_min, u64 ino,
 u64 ino_gen, u32 prot, u32 flags, char *filename,
 enum map_type type, struct thread *thread)
 {
struct map *map = malloc(sizeof(*map));
+   struct nsinfo *nsi = NULL;
+   struct nsinfo *nnsi;
 
if (map != NULL) {
char newfilename[PATH_MAX];
@@ -168,9 +170,11 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
map->ino_generation = ino_gen;
map->prot = prot;
map->flags = flags;
+   nsi = nsinfo__get(thread->nsinfo);
 
-   if ((anon || no_dso) && type == MAP__FUNCTION) {
-   snprintf(newfilename, sizeof(newfilename), 
"/tmp/perf-%d.map", pid);
+   if ((anon || no_dso) && nsi && type == MAP__FUNCTION) {
+   snprintf(newfilename, sizeof(newfilename),
+"/tmp/perf-%d.map", nsi->pid);
filename = newfilename;
}
 
@@ -180,6 +184,16 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
}
 
if (vdso) {
+   /* The vdso maps are always on the host and not the
+* container.  Ensure that we don't use setns to look
+* them up.
+*/
+   nnsi = nsinfo__copy(nsi);
+   if (nnsi) {
+   nsinfo__put(nsi);
+   nnsi->need_setns = false;
+   nsi = nnsi;
+   }
pgoff = 0;
dso = machine__findnew_vdso(machine, thread);
} else
@@ -201,11 +215,12 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
if (type != MAP__FUNCTION)

[tip:perf/core] perf symbols: Find symbols in different mount namespace

2017-07-20 Thread tip-bot for Krister Johansen
Commit-ID:  843ff37bb59edbe51d64e77ba1b3245a15a4dd9f
Gitweb: http://git.kernel.org/tip/843ff37bb59edbe51d64e77ba1b3245a15a4dd9f
Author: Krister Johansen 
AuthorDate: Wed, 5 Jul 2017 18:48:08 -0700
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 18 Jul 2017 23:14:09 -0300

perf symbols: Find symbols in different mount namespace

Teach perf how to resolve symbols from binaries that are in a different
mount namespace from the tool.  This allows perf to generate meaningful
stack traces even if the binary resides in a different mount namespace
from the tool.

Signed-off-by: Krister Johansen 
Tested-by: Brendan Gregg 
Cc: Alexander Shishkin 
Cc: Peter Zijlstra 
Cc: Thomas-Mich Richter 
Link: 
http://lkml.kernel.org/r/1499305693-1599-2-git-send-email-k...@templeofstupid.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/dso.c|   1 +
 tools/perf/util/dso.h|   2 +
 tools/perf/util/map.c|   2 +
 tools/perf/util/namespaces.c | 127 +++
 tools/perf/util/namespaces.h |  33 +++
 tools/perf/util/symbol.c |  11 
 tools/perf/util/thread.c |   3 +
 tools/perf/util/thread.h |   1 +
 8 files changed, 180 insertions(+)

diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 4e7ab61..beda40e 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1236,6 +1236,7 @@ void dso__delete(struct dso *dso)
dso_cache__free(dso);
dso__free_a2l(dso);
zfree(&dso->symsrc_filename);
+   nsinfo__zput(dso->nsinfo);
pthread_mutex_destroy(&dso->lock);
free(dso);
 }
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index bd061ba..78ec637 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include "map.h"
+#include "namespaces.h"
 #include "build-id.h"
 
 enum dso_binary_type {
@@ -187,6 +188,7 @@ struct dso {
void *priv;
u64  db_id;
};
+   struct nsinfo   *nsinfo;
refcount_t   refcnt;
char name[0];
 };
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 2179b2d..5dc60ca 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -16,6 +16,7 @@
 #include "machine.h"
 #include 
 #include "srcline.h"
+#include "namespaces.h"
 #include "unwind.h"
 
 static void __maps__insert(struct maps *maps, struct map *map);
@@ -200,6 +201,7 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
if (type != MAP__FUNCTION)
dso__set_loaded(dso, map->type);
}
+   dso->nsinfo = nsinfo__get(thread->nsinfo);
dso__put(dso);
}
return map;
diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c
index 67dcbcc..bcc6bb1 100644
--- a/tools/perf/util/namespaces.c
+++ b/tools/perf/util/namespaces.c
@@ -9,9 +9,13 @@
 #include "namespaces.h"
 #include "util.h"
 #include "event.h"
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
+#include 
 
 struct namespaces *namespaces__new(struct namespaces_event *event)
 {
@@ -35,3 +39,126 @@ void namespaces__free(struct namespaces *namespaces)
 {
free(namespaces);
 }
+
+void nsinfo__init(struct nsinfo *nsi)
+{
+   char oldns[PATH_MAX];
+   char *newns = NULL;
+   struct stat old_stat;
+   struct stat new_stat;
+
+   if (snprintf(oldns, PATH_MAX, "/proc/self/ns/mnt") >= PATH_MAX)
+   return;
+
+   if (asprintf(&newns, "/proc/%d/ns/mnt", nsi->pid) == -1)
+   return;
+
+   if (stat(oldns, &old_stat) < 0)
+   goto out;
+
+   if (stat(newns, &new_stat) < 0)
+   goto out;
+
+   /* Check if the mount namespaces differ, if so then indicate that we
+* want to switch as part of looking up dso/map data.
+*/
+   if (old_stat.st_ino != new_stat.st_ino) {
+   nsi->need_setns = true;
+   nsi->mntns_path = newns;
+   newns = NULL;
+   }
+
+out:
+   free(newns);
+}
+
+struct nsinfo *nsinfo__new(pid_t pid)
+{
+   struct nsinfo *nsi = calloc(1, sizeof(*nsi));
+
+   if (nsi != NULL) {
+   nsi->pid = pid;
+   nsi->need_setns = false;
+   nsinfo__init(nsi);
+   refcount_set(&nsi->refcnt, 1);
+   }
+
+   return nsi;
+}
+
+void nsinfo__delete(struct nsinfo *nsi)
+{
+   zfree(&nsi->mntns_path);
+   free(nsi);
+}
+
+struct nsinfo *nsinfo__get(struct nsinfo *nsi)
+{
+   if (nsi)
+   refcount_inc(&nsi->refcnt);
+   return nsi;
+}
+
+void nsinfo__put(struct nsinfo *nsi)
+{
+   if (nsi && refcount_dec_and_t

Re: [PATCH v2 tip/perf/core 1/6] perf symbols: find symbols in different mount namespace

2017-07-11 Thread Krister Johansen
On Tue, Jul 11, 2017 at 09:51:16AM -0300, Arnaldo Carvalho de Melo wrote:
> Right, we need to use the build-id and look it up in a database
> populated somehow.
> 
> perf right now, by default, collects the build-ids in a table, at the
> end of the recording session, trying not to disrupt the monitored
> workload by not processing anything, just reading from the buffers and
> dumping to a file.
> 
> It will also try to populate the build-id, trying first to make a
> hardlink and copying it if it fails.
> 
> If we can get the build-id at the time of the mmap(binary), as part of
> the loading of binaries, that would be ideal, as we're touching the file
> headers anyway and the build-id is a small enough cookie.
> 
> But again, we should first try to do as far as we can with the
> infrastructure we have in the kernel and tooling libraries, lots of
> workloads will be serviced just fine with that.

Sorry, I was slow to pick up on what you're saying here.  I agree that
getting the build-id delivered in an event from the kernel would
accelerate the process of determining whether you have or need to cache
a binary in the build-id cache.  Without such a modification, perf has
to look at each binary to determine whether the build ids match.  If we
got the build-id in the event payload, then we only need to look to see
if the build-id is cached.  If it's not, then we can undertake the more
complicated lookup path.

That would be an improvement over what we can do today.

-K


Re: [PATCH v2 tip/perf/core 1/6] perf symbols: find symbols in different mount namespace

2017-07-10 Thread Krister Johansen
On Mon, Jul 10, 2017 at 07:52:49PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Mon, Jul 10, 2017 at 03:39:25PM -0700, Krister Johansen escreveu:
> > On Mon, Jul 10, 2017 at 08:17:00AM +0200, Thomas-Mich Richter wrote:
> > > On 07/07/2017 09:36 PM, Krister Johansen wrote:
> > > > On Thu, Jul 06, 2017 at 04:41:30PM -0300, Arnaldo Carvalho de Melo 
> > > > wrote:
> > > >> Em Wed, Jul 05, 2017 at 06:48:08PM -0700, Krister Johansen escreveu:
> > > >>> Teach perf how to resolve symbols from binaries that are in a 
> > > >>> different
> > > >>> mount namespace from the tool.  This allows perf to generate 
> > > >>> meaningful
> > > >>> stack traces even if the binary resides in a different mount namespace
> > > >>> from the tool.
> > > >>
> > > >> I was trying to find a way to test after applying each of the patches 
> > > >> in
> > > >> this series, when it ocurred to me that if a process that appears on a
> > > >> perf.data file has exit, how can we access /proc/%ITS_PID/something?
> > > > 
> > > > You're correct.  We can't access /proc//whatever once the process
> > > > has exited.  That was the impeteus for patches 4 and 6, which allow us
> > > > to capture the binary (and debuginfo, if it exists) into the buildid
> > > > cache so that if we do have a trace that exists after a process or
> > > > container exists, we'll still be able to resolve some of the symbols.
> 
> > > Any ideas on how to extend this to be able to resolve symbols after
> > > the process/container exited?
> > > I believe it boils down on how to interpret the mnt inode number in the
> > > PERF_RECORD_NAMESPACE record...
> > > Can this be done post-mortem? Maybe the PERF_RECORD_NAMESPACE record
> > > has to contain more data than just the inode number?
>  
> > I think we're talking past one another.  If the container exits then the
> > inode numbers that identify mount namespace are referring to something
> > that is no longer valid.  There's no mount namespace to enter in order
> > to locate the binary objects.  They may be on a volume that's no longer
> > mounted.
>  
> > I have a pair of patches in the existing set that copies the binary
> > objects into the buildid cache.  This lets you resolve the symbols after
> > the container has exited, provided that you recorded the buildids during
> > the trace.
>  
> > If you apply all the patches in this set, you should be able to generate
> > traces that you can look at with script or report even after the process
> > has exited.  I've been able to do it in my tests, at least.
> 
> I will work on testing them soon, I just wanted this discussion to take
> place, what you did seems to be the best we can do with the existing
> kernel infrastructure, and is a clear advance, so we need to test and
> merge it.

Happy to have the discussion. Aplologies if having the patches
iteratively add to one another isn't the best way to have this reviewed
and understood.  If you just apply the first few, you don't get the
support to pull these into the build-id cache.

> Getting the build-ids for the binaries is the key here, then its just a
> matter of populating a database where to get the matching binaries, we
> wouldn't need even to copy the actual binaries at record time.

Unfortunately, it's not sufficient to save the path to the target binary
because it's possible that after the container exits, and the namespace
is destroyed, there may be no path that describes to the host how to
access the files in the container.  There are two different interactions
here that frustrate this:

1. Containers run under a pivoted root, so the containers view of the
path may be different from the host's view of the path.  E.g. /usr/bin/node
in the container may actually be /var/container_a/root/usr/bin/node, or
something like that.  However, see #2.

2. It's also entirely possible for a container to have mounted a
filesystem that's not accessible or mounted from the host.  If, for
example, you're using docker with the direct-lvm storage driver, then
your storage device may be mounted in the vfs attached to the container,
but have no mount in the host's vfs.  In a situation like this, once the
container exits, the that lvm filesystem is unmounted.  In order to
access the files in that container, you basically need to setns(2) into
the container's mount namespace and look up the files using the a path
that resolves in the mount namespace of perf's target.

-K


Re: [PATCH v2 tip/perf/core 1/6] perf symbols: find symbols in different mount namespace

2017-07-10 Thread Krister Johansen
On Mon, Jul 10, 2017 at 08:17:00AM +0200, Thomas-Mich Richter wrote:
> On 07/07/2017 09:36 PM, Krister Johansen wrote:
> > On Thu, Jul 06, 2017 at 04:41:30PM -0300, Arnaldo Carvalho de Melo wrote:
> >> Em Wed, Jul 05, 2017 at 06:48:08PM -0700, Krister Johansen escreveu:
> >>> Teach perf how to resolve symbols from binaries that are in a different
> >>> mount namespace from the tool.  This allows perf to generate meaningful
> >>> stack traces even if the binary resides in a different mount namespace
> >>> from the tool.
> >>
> >> I was trying to find a way to test after applying each of the patches in
> >> this series, when it ocurred to me that if a process that appears on a
> >> perf.data file has exit, how can we access /proc/%ITS_PID/something?
> > 
> > You're correct.  We can't access /proc//whatever once the process
> > has exited.  That was the impeteus for patches 4 and 6, which allow us
> > to capture the binary (and debuginfo, if it exists) into the buildid
> > cache so that if we do have a trace that exists after a process or
> > container exists, we'll still be able to resolve some of the symbols.
> 
> Any ideas on how to extend this to be able to resolve symbols after
> the process/container exited?
> I believe it boils down on how to interpret the mnt inode number in the
> PERF_RECORD_NAMESPACE record...
> Can this be done post-mortem? Maybe the PERF_RECORD_NAMESPACE record
> has to contain more data than just the inode number?

I think we're talking past one another.  If the container exits then the
inode numbers that identify mount namespace are referring to something
that is no longer valid.  There's no mount namespace to enter in order
to locate the binary objects.  They may be on a volume that's no longer
mounted.

I have a pair of patches in the existing set that copies the binary
objects into the buildid cache.  This lets you resolve the symbols after
the container has exited, provided that you recorded the buildids during
the trace.

If you apply all the patches in this set, you should be able to generate
traces that you can look at with script or report even after the process
has exited.  I've been able to do it in my tests, at least.

-K


Re: [PATCH 4.9 00/41] 4.9.37-stable review

2017-07-10 Thread Krister Johansen
Hey Greg,

On Mon, Jul 10, 2017 at 07:10:33PM +0200, Greg Kroah-Hartman wrote:
> This is the start of the stable review cycle for the 4.9.37 release.
> There are 41 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
> 
> Responses should be made by Wed Jul 12 17:10:36 UTC 2017.
> Anything received after that time might be too late.
> 
> The whole patch series can be found in one patch at:
>   kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.9.37-rc1.gz
> or in the git tree and branch at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.9.y
> and the diffstat can be found below.
> 
> thanks,
> 
> greg k-h


In 4.9.36 there were some patches to tools/perf that got pulled in that
break that part of build.  It looks like a couple of dependent patches
were omitted.  I don't see these patches included in 4.9.37.  Is it
possible to get those included so that tools/perf builds again?

Here's a link to the prior thread:

https://lkml.org/lkml/2017/7/5/537

Thanks,

-K



Re: [PATCH v2 tip/perf/core 1/6] perf symbols: find symbols in different mount namespace

2017-07-07 Thread Krister Johansen
On Thu, Jul 06, 2017 at 04:41:30PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Wed, Jul 05, 2017 at 06:48:08PM -0700, Krister Johansen escreveu:
> > Teach perf how to resolve symbols from binaries that are in a different
> > mount namespace from the tool.  This allows perf to generate meaningful
> > stack traces even if the binary resides in a different mount namespace
> > from the tool.
> 
> I was trying to find a way to test after applying each of the patches in
> this series, when it ocurred to me that if a process that appears on a
> perf.data file has exit, how can we access /proc/%ITS_PID/something?

You're correct.  We can't access /proc//whatever once the process
has exited.  That was the impeteus for patches 4 and 6, which allow us
to capture the binary (and debuginfo, if it exists) into the buildid
cache so that if we do have a trace that exists after a process or
container exists, we'll still be able to resolve some of the symbols.

-K


[PATCH v2 tip/perf/core 4/6] perf buildid-cache: support binary objects from other namespaces

2017-07-05 Thread Krister Johansen
Teach buildid-cache how to add, remove, and update binary objects from
other mount namespaces.  Allow probe events tracing binaries in
different namespaces to add their objects to the probe and build-id
caches too.  As a handy side effect, this also lets us access SDT probes
in binaries from alternate mount namespaces.

Signed-off-by: Krister Johansen 
---
 tools/perf/Documentation/perf-buildid-cache.txt |  5 +++
 tools/perf/Documentation/perf-probe.txt |  5 +++
 tools/perf/builtin-buildid-cache.c  | 52 ++---
 tools/perf/builtin-probe.c  |  2 +-
 tools/perf/tests/sdt.c  |  4 +-
 tools/perf/util/build-id.c  | 47 ++
 tools/perf/util/build-id.h  |  9 +++--
 tools/perf/util/dso.c   | 12 +-
 tools/perf/util/parse-events.c  |  2 +-
 tools/perf/util/probe-event.c   |  6 +--
 tools/perf/util/probe-file.c| 19 +
 tools/perf/util/probe-file.h|  4 +-
 tools/perf/util/symbol.c| 19 ++---
 tools/perf/util/util.c  | 34 +---
 tools/perf/util/util.h  |  2 +
 15 files changed, 159 insertions(+), 63 deletions(-)

diff --git a/tools/perf/Documentation/perf-buildid-cache.txt 
b/tools/perf/Documentation/perf-buildid-cache.txt
index 058064d..8468100 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -61,6 +61,11 @@ OPTIONS
 --verbose::
Be more verbose.
 
+--target-ns=PID:
+   Obtain mount namespace information from the target pid.  This is
+   used when creating a uprobe for a process that resides in a
+   different mount namespace from the perf(1) utility.
+
 SEE ALSO
 
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-buildid-list[1]
diff --git a/tools/perf/Documentation/perf-probe.txt 
b/tools/perf/Documentation/perf-probe.txt
index a42aabc..d7e4869 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -273,6 +273,11 @@ Add a uprobe to a target process running in a different 
mount namespace
 
  ./perf probe --target-ns  -x /lib64/libc.so.6 malloc
 
+Add a USDT probe to a target process running in a different mount namespace
+
+ ./perf probe --target-ns  -x 
/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.121-0.b13.el7_3.x86_64/jre/lib/amd64/server/libjvm.so
 %sdt_hotspot:thread__sleep__end
+
+
 SEE ALSO
 
 linkperf:perf-trace[1], linkperf:perf-record[1], linkperf:perf-buildid-cache[1]
diff --git a/tools/perf/builtin-buildid-cache.c 
b/tools/perf/builtin-buildid-cache.c
index 9eba7f1..d65bd86 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -14,6 +14,7 @@
 #include 
 #include "builtin.h"
 #include "perf.h"
+#include "namespaces.h"
 #include "util/cache.h"
 #include "util/debug.h"
 #include "util/header.h"
@@ -165,33 +166,41 @@ static int build_id_cache__add_kcore(const char 
*filename, bool force)
return 0;
 }
 
-static int build_id_cache__add_file(const char *filename)
+static int build_id_cache__add_file(const char *filename, struct nsinfo *nsi)
 {
char sbuild_id[SBUILD_ID_SIZE];
u8 build_id[BUILD_ID_SIZE];
int err;
+   struct nscookie nsc;
 
-   if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) 
{
+   nsinfo__mountns_enter(nsi, &nsc);
+   err = filename__read_build_id(filename, &build_id, sizeof(build_id));
+   nsinfo__mountns_exit(&nsc);
+   if (err < 0) {
pr_debug("Couldn't read a build-id in %s\n", filename);
return -1;
}
 
build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
-   err = build_id_cache__add_s(sbuild_id, filename,
+   err = build_id_cache__add_s(sbuild_id, filename, nsi,
false, false);
pr_debug("Adding %s %s: %s\n", sbuild_id, filename,
 err ? "FAIL" : "Ok");
return err;
 }
 
-static int build_id_cache__remove_file(const char *filename)
+static int build_id_cache__remove_file(const char *filename, struct nsinfo 
*nsi)
 {
u8 build_id[BUILD_ID_SIZE];
char sbuild_id[SBUILD_ID_SIZE];
+   struct nscookie nsc;
 
int err;
 
-   if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) 
{
+   nsinfo__mountns_enter(nsi, &nsc);
+   err = filename__read_build_id(filename, &build_id, sizeof(build_id));
+   nsinfo__mountns_exit(&nsc);
+   if (err < 0) {
pr_debug("Couldn't read a build-id in %s\n", filename);
return -1;
}
@@ -

[PATCH v2 tip/perf/core 3/6] perf probe: allow placing uprobes in alternate namespaces.

2017-07-05 Thread Krister Johansen
Teaches perf how to place a uprobe on a file that's in a different mount
namespace.  The user must add the probe using the --target-ns argument
to perf probe.  Once it has been placed, it may be recorded against
without further namespace-specific commands.

Signed-off-by: Krister Johansen 
---
 tools/perf/Documentation/perf-probe.txt |  9 
 tools/perf/builtin-probe.c  | 43 --
 tools/perf/util/namespaces.c| 13 ++
 tools/perf/util/namespaces.h|  2 +
 tools/perf/util/probe-event.c   | 79 ++---
 tools/perf/util/probe-event.h   | 10 +++--
 6 files changed, 123 insertions(+), 33 deletions(-)

diff --git a/tools/perf/Documentation/perf-probe.txt 
b/tools/perf/Documentation/perf-probe.txt
index 165c2b1..a42aabc 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -130,6 +130,11 @@ OPTIONS
 --max-probes=NUM::
Set the maximum number of probe points for an event. Default is 128.
 
+--target-ns=PID:
+   Obtain mount namespace information from the target pid.  This is
+   used when creating a uprobe for a process that resides in a
+   different mount namespace from the perf(1) utility.
+
 -x::
 --exec=PATH::
Specify path to the executable or shared library file for user
@@ -264,6 +269,10 @@ Add probes at malloc() function on libc
 
  ./perf probe -x /lib/libc.so.6 malloc or ./perf probe /lib/libc.so.6 malloc
 
+Add a uprobe to a target process running in a different mount namespace
+
+ ./perf probe --target-ns  -x /lib64/libc.so.6 malloc
+
 SEE ALSO
 
 linkperf:perf-trace[1], linkperf:perf-record[1], linkperf:perf-buildid-cache[1]
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index cf9f9e9..3fb98d5 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -58,6 +58,7 @@ static struct {
struct line_range line_range;
char *target;
struct strfilter *filter;
+   struct nsinfo *nsi;
 } params;
 
 /* Parse an event definition. Note that any error must die. */
@@ -80,6 +81,9 @@ static int parse_probe_event(const char *str)
params.target_used = true;
}
 
+   if (params.nsi)
+   pev->nsi = nsinfo__get(params.nsi);
+
/* Parse a perf-probe command into event */
ret = parse_perf_probe_command(str, pev);
pr_debug("%d arguments\n", pev->nargs);
@@ -189,7 +193,7 @@ static int opt_set_target(const struct option *opt, const 
char *str,
 
/* Expand given path to absolute path, except for modulename */
if (params.uprobes || strchr(str, '/')) {
-   tmp = realpath(str, NULL);
+   tmp = nsinfo__realpath(str, params.nsi);
if (!tmp) {
pr_warning("Failed to get the absolute path of 
%s: %m\n", str);
return ret;
@@ -208,6 +212,34 @@ static int opt_set_target(const struct option *opt, const 
char *str,
return ret;
 }
 
+static int opt_set_target_ns(const struct option *opt __maybe_unused,
+const char *str, int unset __maybe_unused)
+{
+   int ret = -ENOENT;
+   pid_t ns_pid;
+   struct nsinfo *nsip;
+
+   if (str) {
+   errno = 0;
+   ns_pid = (pid_t)strtol(str, NULL, 10);
+   if (errno != 0) {
+   ret = -errno;
+   pr_warning("Failed to parse %s as a pid: %s\n", str,
+  strerror(errno));
+   return ret;
+   }
+   nsip = nsinfo__new(ns_pid);
+   if (nsip && nsip->need_setns)
+   params.nsi = nsinfo__get(nsip);
+   nsinfo__put(nsip);
+
+   ret = 0;
+   }
+
+   return ret;
+}
+
+
 /* Command option callbacks */
 
 #ifdef HAVE_DWARF_SUPPORT
@@ -299,6 +331,7 @@ static void cleanup_params(void)
line_range__clear(¶ms.line_range);
free(params.target);
strfilter__delete(params.filter);
+   nsinfo__put(params.nsi);
memset(¶ms, 0, sizeof(params));
 }
 
@@ -554,6 +587,8 @@ __cmd_probe(int argc, const char **argv)
OPT_BOOLEAN(0, "cache", &probe_conf.cache, "Manipulate probe cache"),
OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
   "Look for files with symbols relative to this directory"),
+   OPT_CALLBACK(0, "target-ns", NULL, "pid",
+"target pid for namespace contexts", opt_set_target_ns),
OPT_END()
};
int ret;
@@ -634,15 +669,15 @@ __cmd_probe(int argc, const char **argv)
pr_err_with_code("  Error

[PATCH v2 tip/perf/core 6/6] perf buildid-cache: cache debuginfo

2017-07-05 Thread Krister Johansen
If a stripped binary is placed in the cache, the user is in a situation
where there's a cached elf file present, but it doesn't have any symtab
to use for name resolution.  Grab the debuginfo for binaries that don't
end in .ko.  This yields a better chance of resolving symbols from
older traces.

Signed-off-by: Krister Johansen 
---
 tools/perf/builtin-buildid-cache.c |  2 +-
 tools/perf/util/annotate.c |  2 +-
 tools/perf/util/build-id.c | 72 +++---
 tools/perf/util/build-id.h |  3 +-
 tools/perf/util/dso.c  |  8 -
 tools/perf/util/dso.h  |  1 +
 tools/perf/util/machine.c  |  3 +-
 tools/perf/util/symbol.c   | 12 +--
 8 files changed, 90 insertions(+), 13 deletions(-)

diff --git a/tools/perf/builtin-buildid-cache.c 
b/tools/perf/builtin-buildid-cache.c
index d65bd86..e3eb624 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -243,7 +243,7 @@ static bool dso__missing_buildid_cache(struct dso *dso, int 
parm __maybe_unused)
char filename[PATH_MAX];
u8 build_id[BUILD_ID_SIZE];
 
-   if (dso__build_id_filename(dso, filename, sizeof(filename)) &&
+   if (dso__build_id_filename(dso, filename, sizeof(filename), false) &&
filename__read_build_id(filename, build_id,
sizeof(build_id)) != sizeof(build_id)) {
if (errno == ENOENT)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index be1caab..7ce940d 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1327,7 +1327,7 @@ static int dso__disassemble_filename(struct dso *dso, 
char *filename, size_t fil
!dso__is_kcore(dso))
return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX;
 
-   build_id_filename = dso__build_id_filename(dso, NULL, 0);
+   build_id_filename = dso__build_id_filename(dso, NULL, 0, false);
if (build_id_filename) {
__symbol__join_symfs(filename, filename_size, 
build_id_filename);
free(build_id_filename);
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index f7bfd90..e966515 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -243,12 +243,15 @@ static bool build_id_cache__valid_id(char *sbuild_id)
return result;
 }
 
-static const char *build_id_cache__basename(bool is_kallsyms, bool is_vdso)
+static const char *build_id_cache__basename(bool is_kallsyms, bool is_vdso,
+   bool is_debug)
 {
-   return is_kallsyms ? "kallsyms" : (is_vdso ? "vdso" : "elf");
+   return is_kallsyms ? "kallsyms" : (is_vdso ? "vdso" : (is_debug ?
+   "debug" : "elf"));
 }
 
-char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
+char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size,
+bool is_debug)
 {
bool is_kallsyms = dso__is_kallsyms((struct dso *)dso);
bool is_vdso = dso__is_vdso((struct dso *)dso);
@@ -270,7 +273,8 @@ char *dso__build_id_filename(const struct dso *dso, char 
*bf, size_t size)
ret = asnprintf(&bf, size, "%s", linkname);
else
ret = asnprintf(&bf, size, "%s/%s", linkname,
-build_id_cache__basename(is_kallsyms, is_vdso));
+build_id_cache__basename(is_kallsyms, is_vdso,
+ is_debug));
if (ret < 0 || (!alloc && size < (unsigned int)ret))
bf = NULL;
free(linkname);
@@ -603,12 +607,40 @@ static int build_id_cache__add_sdt_cache(const char 
*sbuild_id,
 #define build_id_cache__add_sdt_cache(sbuild_id, realname, nsi) (0)
 #endif
 
+static char *build_id_cache__find_debug(const char *sbuild_id,
+   struct nsinfo *nsi)
+{
+   char *realname = NULL;
+   char *debugfile;
+   struct nscookie nsc;
+   size_t len = 0;
+
+   debugfile = calloc(1, PATH_MAX);
+   if (!debugfile)
+   goto out;
+
+   len = __symbol__join_symfs(debugfile, PATH_MAX,
+  "/usr/lib/debug/.build-id/");
+   snprintf(debugfile + len, PATH_MAX - len, "%.2s/%s.debug", sbuild_id,
+sbuild_id + 2);
+
+   nsinfo__mountns_enter(nsi, &nsc);
+   realname = realpath(debugfile, NULL);
+   if (realname && access(realname, R_OK))
+   zfree(&realname);
+   nsinfo__mountns_exit(&nsc);
+out:
+   free(debugfile);
+   return realname;
+}
+
 int build_id_cache__add_s(const char *sbuild_id, const char *name,
  struct nsinfo *nsi, bool is_kallsyms, bool is_vdso)

[PATCH v2 tip/perf/core 2/6] perf maps: lookup maps in both intitial mountns and inner mountns.

2017-07-05 Thread Krister Johansen
If a process is in a mountns and has symbols in /tmp/perf-.map,
look first in the namespace using the tgid for the pidns that the
process might be in.  If the map isn't found there, try looking in
the mountns where perf is running, and use the tgid that's appropriate
for perf's pid namespace.  If all else fails, use the original pid.

This allows us to locate a symbol map file in the mount namespace, if
it was generated there.  However, we also try the tool's /tmp in case
it's there instead.

Signed-off-by: Krister Johansen 
---
 tools/perf/util/machine.c|  4 +--
 tools/perf/util/map.c| 23 +---
 tools/perf/util/map.h|  2 +-
 tools/perf/util/namespaces.c | 83 
 tools/perf/util/namespaces.h |  5 ++-
 tools/perf/util/symbol.c | 70 ++---
 6 files changed, 160 insertions(+), 27 deletions(-)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index d7f31cb..4b0f16c 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1390,7 +1390,7 @@ int machine__process_mmap2_event(struct machine *machine,
 
map = map__new(machine, event->mmap2.start,
event->mmap2.len, event->mmap2.pgoff,
-   event->mmap2.pid, event->mmap2.maj,
+   event->mmap2.maj,
event->mmap2.min, event->mmap2.ino,
event->mmap2.ino_generation,
event->mmap2.prot,
@@ -1448,7 +1448,7 @@ int machine__process_mmap_event(struct machine *machine, 
union perf_event *event
 
map = map__new(machine, event->mmap.start,
event->mmap.len, event->mmap.pgoff,
-   event->mmap.pid, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0,
event->mmap.filename,
type, thread);
 
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 5dc60ca..bdaa0a4 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -146,11 +146,13 @@ void map__init(struct map *map, enum map_type type,
 }
 
 struct map *map__new(struct machine *machine, u64 start, u64 len,
-u64 pgoff, u32 pid, u32 d_maj, u32 d_min, u64 ino,
+u64 pgoff, u32 d_maj, u32 d_min, u64 ino,
 u64 ino_gen, u32 prot, u32 flags, char *filename,
 enum map_type type, struct thread *thread)
 {
struct map *map = malloc(sizeof(*map));
+   struct nsinfo *nsi = NULL;
+   struct nsinfo *nnsi;
 
if (map != NULL) {
char newfilename[PATH_MAX];
@@ -168,9 +170,11 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
map->ino_generation = ino_gen;
map->prot = prot;
map->flags = flags;
+   nsi = nsinfo__get(thread->nsinfo);
 
-   if ((anon || no_dso) && type == MAP__FUNCTION) {
-   snprintf(newfilename, sizeof(newfilename), 
"/tmp/perf-%d.map", pid);
+   if ((anon || no_dso) && nsi && type == MAP__FUNCTION) {
+   snprintf(newfilename, sizeof(newfilename),
+"/tmp/perf-%d.map", nsi->pid);
filename = newfilename;
}
 
@@ -180,6 +184,16 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
}
 
if (vdso) {
+   /* The vdso maps are always on the host and not the
+* container.  Ensure that we don't use setns to look
+* them up.
+*/
+   nnsi = nsinfo__copy(nsi);
+   if (nnsi) {
+   nsinfo__put(nsi);
+   nnsi->need_setns = false;
+   nsi = nnsi;
+   }
pgoff = 0;
dso = machine__findnew_vdso(machine, thread);
} else
@@ -201,11 +215,12 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
if (type != MAP__FUNCTION)
dso__set_loaded(dso, map->type);
}
-   dso->nsinfo = nsinfo__get(thread->nsinfo);
+   dso->nsinfo = nsi;
dso__put(dso);
}
return map;
 out_delete:
+   nsinfo__put(nsi);
free(map);
return NULL;
 }
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index f9e8ac8..73aacf7 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -141,7 +141,7 @@ struct thread;
 void map__init(struct map *map, enum map_type type,
   u64 start, u64 end, u64 pgof

[PATCH v2 tip/perf/core 5/6] perf top: support lookup of symbols in other mount namespaces.

2017-07-05 Thread Krister Johansen
The perf top command needs to unshare its fs from the helper threads
in order to successfully setns(2) during its symbol lookup.  It also
needs to impelement a force flag to ignore ownership of perf-.map
files.

Signed-off-by: Krister Johansen 
---
 tools/perf/Documentation/perf-top.txt |  4 
 tools/perf/builtin-top.c  | 15 +++
 2 files changed, 19 insertions(+)

diff --git a/tools/perf/Documentation/perf-top.txt 
b/tools/perf/Documentation/perf-top.txt
index e71d638..d864ea6 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -237,6 +237,10 @@ Default is to monitor all CPUS.
 --hierarchy::
Enable hierarchy output.
 
+--force::
+   Don't do ownership validation.
+
+
 INTERACTIVE PROMPTING KEYS
 --
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 6052376..ef2aef4 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -586,6 +586,13 @@ static void *display_thread_tui(void *arg)
.refresh= top->delay_secs,
};
 
+   /* In order to read symbols from other namespaces perf to  needs to call
+* setns(2).  This isn't permitted if the struct_fs has multiple users.
+* unshare(2) the fs so that we may continue to setns into namespaces
+* that we're observing.
+*/
+   (void) unshare(CLONE_FS);
+
perf_top__sort_new_samples(top);
 
/*
@@ -627,6 +634,13 @@ static void *display_thread(void *arg)
struct perf_top *top = arg;
int delay_msecs, c;
 
+   /* In order to read symbols from other namespaces perf to  needs to call
+* setns(2).  This isn't permitted if the struct_fs has multiple users.
+* unshare(2) the fs so that we may continue to setns into namespaces
+* that we're observing.
+*/
+   (void) unshare(CLONE_FS);
+
display_setup_sig();
pthread__unblock_sigwinch();
 repeat:
@@ -1205,6 +1219,7 @@ int cmd_top(int argc, const char **argv)
"Show raw trace event output (do not use print fmt or 
plugins)"),
OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
"Show entries in a hierarchy"),
+   OPT_BOOLEAN(0, "force", &symbol_conf.force, "don't complain, do it"),
OPT_END()
};
const char * const top_usage[] = {
-- 
2.7.4



[PATCH v2 tip/perf/core 1/6] perf symbols: find symbols in different mount namespace

2017-07-05 Thread Krister Johansen
Teach perf how to resolve symbols from binaries that are in a different
mount namespace from the tool.  This allows perf to generate meaningful
stack traces even if the binary resides in a different mount namespace
from the tool.

Signed-off-by: Krister Johansen 
---
 tools/perf/util/dso.c|   1 +
 tools/perf/util/dso.h|   2 +
 tools/perf/util/map.c|   2 +
 tools/perf/util/namespaces.c | 127 +++
 tools/perf/util/namespaces.h |  33 +++
 tools/perf/util/symbol.c |  11 
 tools/perf/util/thread.c |   3 +
 tools/perf/util/thread.h |   1 +
 8 files changed, 180 insertions(+)

diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 4e7ab61..beda40e 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1236,6 +1236,7 @@ void dso__delete(struct dso *dso)
dso_cache__free(dso);
dso__free_a2l(dso);
zfree(&dso->symsrc_filename);
+   nsinfo__zput(dso->nsinfo);
pthread_mutex_destroy(&dso->lock);
free(dso);
 }
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index bd061ba..78ec637 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include "map.h"
+#include "namespaces.h"
 #include "build-id.h"
 
 enum dso_binary_type {
@@ -187,6 +188,7 @@ struct dso {
void *priv;
u64  db_id;
};
+   struct nsinfo   *nsinfo;
refcount_t   refcnt;
char name[0];
 };
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 2179b2d..5dc60ca 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -16,6 +16,7 @@
 #include "machine.h"
 #include 
 #include "srcline.h"
+#include "namespaces.h"
 #include "unwind.h"
 
 static void __maps__insert(struct maps *maps, struct map *map);
@@ -200,6 +201,7 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
if (type != MAP__FUNCTION)
dso__set_loaded(dso, map->type);
}
+   dso->nsinfo = nsinfo__get(thread->nsinfo);
dso__put(dso);
}
return map;
diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c
index 67dcbcc..bcc6bb1 100644
--- a/tools/perf/util/namespaces.c
+++ b/tools/perf/util/namespaces.c
@@ -9,9 +9,13 @@
 #include "namespaces.h"
 #include "util.h"
 #include "event.h"
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
+#include 
 
 struct namespaces *namespaces__new(struct namespaces_event *event)
 {
@@ -35,3 +39,126 @@ void namespaces__free(struct namespaces *namespaces)
 {
free(namespaces);
 }
+
+void nsinfo__init(struct nsinfo *nsi)
+{
+   char oldns[PATH_MAX];
+   char *newns = NULL;
+   struct stat old_stat;
+   struct stat new_stat;
+
+   if (snprintf(oldns, PATH_MAX, "/proc/self/ns/mnt") >= PATH_MAX)
+   return;
+
+   if (asprintf(&newns, "/proc/%d/ns/mnt", nsi->pid) == -1)
+   return;
+
+   if (stat(oldns, &old_stat) < 0)
+   goto out;
+
+   if (stat(newns, &new_stat) < 0)
+   goto out;
+
+   /* Check if the mount namespaces differ, if so then indicate that we
+* want to switch as part of looking up dso/map data.
+*/
+   if (old_stat.st_ino != new_stat.st_ino) {
+   nsi->need_setns = true;
+   nsi->mntns_path = newns;
+   newns = NULL;
+   }
+
+out:
+   free(newns);
+}
+
+struct nsinfo *nsinfo__new(pid_t pid)
+{
+   struct nsinfo *nsi = calloc(1, sizeof(*nsi));
+
+   if (nsi != NULL) {
+   nsi->pid = pid;
+   nsi->need_setns = false;
+   nsinfo__init(nsi);
+   refcount_set(&nsi->refcnt, 1);
+   }
+
+   return nsi;
+}
+
+void nsinfo__delete(struct nsinfo *nsi)
+{
+   zfree(&nsi->mntns_path);
+   free(nsi);
+}
+
+struct nsinfo *nsinfo__get(struct nsinfo *nsi)
+{
+   if (nsi)
+   refcount_inc(&nsi->refcnt);
+   return nsi;
+}
+
+void nsinfo__put(struct nsinfo *nsi)
+{
+   if (nsi && refcount_dec_and_test(&nsi->refcnt))
+   nsinfo__delete(nsi);
+}
+
+void nsinfo__mountns_enter(struct nsinfo *nsi, struct nscookie *nc)
+{
+   char curpath[PATH_MAX];
+   int oldns = -1;
+   int newns = -1;
+
+   if (nc == NULL)
+   return;
+
+   nc->oldns = -1;
+   nc->newns = -1;
+
+   if (!nsi || !nsi->need_setns)
+   return;
+
+   if (snprintf(curpath, PATH_MAX, "/proc/self/ns/mnt") >= PATH_MAX)
+   return;
+
+   oldns = open(curpath, O_RDONLY);
+   if (oldns < 

[PATCH v2 tip/perf/core 0/6] namespace tracing improvements

2017-07-05 Thread Krister Johansen
This patch set is similar to a set of features I implemented for bcc
back in April.  At a high-level it does the following:

- Allow the tracing tools to resolve symbols if the traced process has a
  binary that is in a different mount namespace from perf.

- Allow the perf-.map files to live either in /tmp in the target
  process' mount namespace, or the tracing process's /tmp in its own
  mount namespace.

- Adds the ability to set and record against uprobes and USDT probes
  when the target process resides in a different mount namespace.

With these changes I can get pretty good insight into what containerized
processes on my systems are doing.  In addition to the above, I also
modified the builid cache code to cache relevant bits from the target
namespaces so that it's possible to preserve more information should a
container exit and take the mounted volumes along with it.

-K

Changes v1 -> v2:

- stylistic cleanups in namespace.c (Arnaldo)
- zfree instead of free in nsinfo__delete (Arnaldo)
- un-reflow map__new and dso__load (Arnaldo)
- introduce nsinfo__realpath (Arnaldo)
- move documentation into the patch that introduced the change. (Arnaldo)
- update help descriptions (Brendan)
- add perf-probe examples for uprobes and USDT (Brendan)


Krister Johansen (6):
  perf symbols: find symbols in different mount namespace
  perf maps: lookup maps in both intitial mountns and inner mountns.
  perf probe: allow placing uprobes in alternate namespaces.
  perf buildid-cache: support binary objects from other namespaces
  perf top: support lookup of symbols in other mount namespaces.
  perf buildid-cache: cache debuginfo

 tools/perf/Documentation/perf-buildid-cache.txt |   5 +
 tools/perf/Documentation/perf-probe.txt |  14 ++
 tools/perf/Documentation/perf-top.txt   |   4 +
 tools/perf/builtin-buildid-cache.c  |  54 --
 tools/perf/builtin-probe.c  |  45 -
 tools/perf/builtin-top.c|  15 ++
 tools/perf/tests/sdt.c  |   4 +-
 tools/perf/util/annotate.c  |   2 +-
 tools/perf/util/build-id.c  | 119 ++---
 tools/perf/util/build-id.h  |  12 +-
 tools/perf/util/dso.c   |  21 ++-
 tools/perf/util/dso.h   |   3 +
 tools/perf/util/machine.c   |   7 +-
 tools/perf/util/map.c   |  23 ++-
 tools/perf/util/map.h   |   2 +-
 tools/perf/util/namespaces.c| 211 
 tools/perf/util/namespaces.h|  38 +
 tools/perf/util/parse-events.c  |   2 +-
 tools/perf/util/probe-event.c   |  85 ++
 tools/perf/util/probe-event.h   |  10 +-
 tools/perf/util/probe-file.c|  19 ++-
 tools/perf/util/probe-file.h|   4 +-
 tools/perf/util/symbol.c|  92 +--
 tools/perf/util/thread.c|   3 +
 tools/perf/util/thread.h|   1 +
 tools/perf/util/util.c  |  34 +++-
 tools/perf/util/util.h  |   2 +
 27 files changed, 713 insertions(+), 118 deletions(-)

-- 
2.7.4



Re: [PATCH tip/perf/core 3/7] perf probe: allow placing uprobes in alternate namespaces.

2017-07-05 Thread Krister Johansen
On Mon, Jul 03, 2017 at 03:46:41PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Fri, Jun 30, 2017 at 07:18:55PM -0700, Krister Johansen escreveu:
> > Teaches perf how to place a uprobe on a file that's in a different mount
> > namespace.  The user must add the probe using the --target-ns argument
> > to perf probe.  Once it has been placed, it may be recorded against
> > without further namespace-specific commands.
> > 
> > Signed-off-by: Krister Johansen 
> > ---
> >  tools/perf/builtin-probe.c| 44 ++--
> >  tools/perf/util/probe-event.c | 79 
> > +--
> >  tools/perf/util/probe-event.h | 10 --
> >  3 files changed, 101 insertions(+), 32 deletions(-)
> > 
> > diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
> > index cf9f9e9..5ab2e00 100644
> > --- a/tools/perf/builtin-probe.c
> > +++ b/tools/perf/builtin-probe.c
> > @@ -58,6 +58,7 @@ static struct {
> > struct line_range line_range;
> > char *target;
> > struct strfilter *filter;
> > +   struct nsinfo *nsi;
> >  } params;
> >  
> >  /* Parse an event definition. Note that any error must die. */
> > @@ -80,6 +81,9 @@ static int parse_probe_event(const char *str)
> > params.target_used = true;
> > }
> >  
> > +   if (params.nsi)
> > +   pev->nsi = nsinfo__get(params.nsi);
> > +
> > /* Parse a perf-probe command into event */
> > ret = parse_perf_probe_command(str, pev);
> > pr_debug("%d arguments\n", pev->nargs);
> > @@ -178,6 +182,7 @@ static int opt_set_target(const struct option *opt, 
> > const char *str,
> >  {
> > int ret = -ENOENT;
> > char *tmp;
> > +   struct nscookie nsc;
> >  
> > if  (str) {
> > if (!strcmp(opt->long_name, "exec"))
> > @@ -189,7 +194,9 @@ static int opt_set_target(const struct option *opt, 
> > const char *str,
> >  
> > /* Expand given path to absolute path, except for modulename */
> > if (params.uprobes || strchr(str, '/')) {
> > +   nsinfo__mountns_enter(params.nsi, &nsc);
> > tmp = realpath(str, NULL);
> > +   nsinfo__mountns_exit(&nsc);
> 
> Perhaps have a nsinfo__realpath()? Don't know if this will be used
> elsewhere, but looks shorter.

Sure.  I think I do this in a few different places.  I'd be happy to
pull this into its own function and re-use instead.

> > if (!tmp) {
> > pr_warning("Failed to get the absolute path of 
> > %s: %m\n", str);
> > return ret;
> > @@ -208,6 +215,34 @@ static int opt_set_target(const struct option *opt, 
> > const char *str,
> > return ret;
> >  }
> >  
> > +static int opt_set_target_ns(const struct option *opt __maybe_unused,
> > +const char *str, int unset __maybe_unused)
> > +{
> > +   int ret = -ENOENT;
> > +   pid_t ns_pid;
> > +   struct nsinfo *nsip;
> > +
> > +   if (str) {
> > +   errno = 0;
> > +   ns_pid = (pid_t)strtol(str, NULL, 10);
> > +   if (errno != 0) {
> > +   ret = -errno;
> > +   pr_warning("Failed to parse %s as a pid: %s\n", str,
> > +  strerror(errno));
> > +   return ret;
> > +   }
> > +   nsip = nsinfo__new(ns_pid);
> > +   if (nsip && nsip->need_setns)
> > +   params.nsi = nsinfo__get(nsip);
> > +   nsinfo__put(nsip);
> > +
> > +   ret = 0;
> > +   }
> > +
> > +   return ret;
> > +}
> > +
> > +
> >  /* Command option callbacks */
> >  
> >  #ifdef HAVE_DWARF_SUPPORT
> > @@ -299,6 +334,7 @@ static void cleanup_params(void)
> > line_range__clear(¶ms.line_range);
> > free(params.target);
> > strfilter__delete(params.filter);
> > +   nsinfo__put(params.nsi);
> > memset(¶ms, 0, sizeof(params));
> >  }
> >  
> > @@ -554,6 +590,8 @@ __cmd_probe(int argc, const char **argv)
> > OPT_BOOLEAN(0, "cache", &probe_conf.cache, "Manipulate probe cache"),
> > OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
> >"Look for files with symbols relative to this dir

Re: [PATCH tip/perf/core 6/7] perf documentation: updates for target-ns.

2017-07-05 Thread Krister Johansen
On Mon, Jul 03, 2017 at 03:48:36PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Fri, Jun 30, 2017 at 07:18:58PM -0700, Krister Johansen escreveu:
> > Update the perf documentation to describe the --target-ns option to
> > probe and buildid-cache.  Note the existence of the new --force flag
> > to top.
> 
> Can you please chop this into pieces and add each bit to the previous
> patches where this option was introduced?

Yes; will do.  Once I have, I'll send out an updated version of the
patchset that includes this and the other changes you've requested.

> > Signed-off-by: Krister Johansen 
> > ---
> >  tools/perf/Documentation/perf-buildid-cache.txt | 5 +
> >  tools/perf/Documentation/perf-probe.txt | 5 +
> >  tools/perf/Documentation/perf-top.txt   | 4 
> >  3 files changed, 14 insertions(+)
> > 
> > diff --git a/tools/perf/Documentation/perf-buildid-cache.txt 
> > b/tools/perf/Documentation/perf-buildid-cache.txt
> > index 058064d..8468100 100644
> > --- a/tools/perf/Documentation/perf-buildid-cache.txt
> > +++ b/tools/perf/Documentation/perf-buildid-cache.txt
> > @@ -61,6 +61,11 @@ OPTIONS
> >  --verbose::
> > Be more verbose.
> >  
> > +--target-ns=PID:
> > +   Obtain mount namespace information from the target pid.  This is
> > +   used when creating a uprobe for a process that resides in a
> > +   different mount namespace from the perf(1) utility.
> > +
> >  SEE ALSO
> >  
> >  linkperf:perf-record[1], linkperf:perf-report[1], 
> > linkperf:perf-buildid-list[1]
> > diff --git a/tools/perf/Documentation/perf-probe.txt 
> > b/tools/perf/Documentation/perf-probe.txt
> > index 165c2b1..c1a126b 100644
> > --- a/tools/perf/Documentation/perf-probe.txt
> > +++ b/tools/perf/Documentation/perf-probe.txt
> > @@ -130,6 +130,11 @@ OPTIONS
> >  --max-probes=NUM::
> > Set the maximum number of probe points for an event. Default is 128.
> >  
> > +--target-ns=PID:
> > +   Obtain mount namespace information from the target pid.  This is
> > +   used when creating a uprobe for a process that resides in a
> > +   different mount namespace from the perf(1) utility.
> > +
> >  -x::
> >  --exec=PATH::
> > Specify path to the executable or shared library file for user
> > diff --git a/tools/perf/Documentation/perf-top.txt 
> > b/tools/perf/Documentation/perf-top.txt
> > index e71d638..d864ea6 100644
> > --- a/tools/perf/Documentation/perf-top.txt
> > +++ b/tools/perf/Documentation/perf-top.txt
> > @@ -237,6 +237,10 @@ Default is to monitor all CPUS.
> >  --hierarchy::
> > Enable hierarchy output.
> >  
> > +--force::
> > +   Don't do ownership validation.
> > +
> > +
> >  INTERACTIVE PROMPTING KEYS
> >  --
> >  
> > -- 
> > 2.7.4


Re: [PATCH tip/perf/core 1/7] perf symbols: find symbols in different mount namespace

2017-07-05 Thread Krister Johansen
On Mon, Jul 03, 2017 at 03:38:27PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Fri, Jun 30, 2017 at 07:18:53PM -0700, Krister Johansen escreveu:
> > Teach perf how to resolve symbols from binaries that are in a different
> > mount namespace from the tool.  This allows perf to generate meaningful
> > stack traces even if the binary resides in a different mount namespace
> > from the tool.
> 
> Clear implementation overall, followed tools/perf/ coding style, uses
> reference counts, etc, great! some comments below

Thanks!  Appreciate the review comments.

> > Signed-off-by: Krister Johansen 
> > ---
> >  tools/perf/util/dso.c|   1 +
> >  tools/perf/util/dso.h|   2 +
> >  tools/perf/util/map.c|   2 +
> >  tools/perf/util/namespaces.c | 127 
> > +++
> >  tools/perf/util/namespaces.h |  33 +++
> >  tools/perf/util/symbol.c |  11 
> >  tools/perf/util/thread.c |   3 +
> >  tools/perf/util/thread.h |   1 +
> >  8 files changed, 180 insertions(+)
> > 
> > diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
> > index 4e7ab61..beda40e 100644
> > --- a/tools/perf/util/dso.c
> > +++ b/tools/perf/util/dso.c
> > @@ -1236,6 +1236,7 @@ void dso__delete(struct dso *dso)
> > dso_cache__free(dso);
> > dso__free_a2l(dso);
> > zfree(&dso->symsrc_filename);
> > +   nsinfo__zput(dso->nsinfo);
> > pthread_mutex_destroy(&dso->lock);
> > free(dso);
> >  }
> > diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
> > index bd061ba..78ec637 100644
> > --- a/tools/perf/util/dso.h
> > +++ b/tools/perf/util/dso.h
> > @@ -10,6 +10,7 @@
> >  #include 
> >  #include 
> >  #include "map.h"
> > +#include "namespaces.h"
> >  #include "build-id.h"
> >  
> >  enum dso_binary_type {
> > @@ -187,6 +188,7 @@ struct dso {
> > void *priv;
> > u64  db_id;
> > };
> > +   struct nsinfo   *nsinfo;
> > refcount_t   refcnt;
> > char name[0];
> >  };
> > diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
> > index 2179b2d..5dc60ca 100644
> > --- a/tools/perf/util/map.c
> > +++ b/tools/perf/util/map.c
> > @@ -16,6 +16,7 @@
> >  #include "machine.h"
> >  #include 
> >  #include "srcline.h"
> > +#include "namespaces.h"
> >  #include "unwind.h"
> >  
> >  static void __maps__insert(struct maps *maps, struct map *map);
> > @@ -200,6 +201,7 @@ struct map *map__new(struct machine *machine, u64 
> > start, u64 len,
> > if (type != MAP__FUNCTION)
> > dso__set_loaded(dso, map->type);
> > }
> > +   dso->nsinfo = nsinfo__get(thread->nsinfo);
> > dso__put(dso);
> > }
> > return map;
> > diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c
> > index 67dcbcc..d7f31e0 100644
> > --- a/tools/perf/util/namespaces.c
> > +++ b/tools/perf/util/namespaces.c
> > @@ -9,9 +9,13 @@
> >  #include "namespaces.h"
> >  #include "util.h"
> >  #include "event.h"
> > +#include 
> > +#include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  
> >  struct namespaces *namespaces__new(struct namespaces_event *event)
> >  {
> > @@ -35,3 +39,126 @@ void namespaces__free(struct namespaces *namespaces)
> >  {
> > free(namespaces);
> >  }
> > +
> > +void nsinfo__init(struct nsinfo *nsi)
> > +{
> > +   char oldns[PATH_MAX];
> > +   char *newns = NULL;
> > +   struct stat old_stat;
> > +   struct stat new_stat;
> > +
> > +   if (snprintf(oldns, PATH_MAX, "/proc/self/ns/mnt") >= PATH_MAX)
> > +   return;
> > +
> > +   if (asprintf(&newns, "/proc/%d/ns/mnt", nsi->pid) == -1)
> > +   return;
> > +
> > +   if (stat(oldns, &old_stat) < 0)
> > +   goto out;
> > +
> > +   if (stat(newns, &new_stat) < 0)
> > +   goto out;
> > +
> > +   /* Check if the mount namespaces differ, if so then indicate that we
> > +* want to switch as part of looking up dso/map data.
> > +*/
> > +   if (old_stat.st_ino != new_stat.st_ino) {
> > +   nsi->need_setns = true;
> > +

Re: [PATCH 4.9 131/172] perf probe: Fix to probe on gcc generated functions in modules

2017-07-05 Thread Krister Johansen
Hey Greg,

> 4.9-stable review patch.  If anyone has any objections, please let me know.
> 
> --
> 
> From: Masami Hiramatsu 
> 
> 
> [ Upstream commit 613f050d68a8ed3c0b18b9568698908ef7bbc1f7 ]
> 
> Fix to probe on gcc generated functions on modules. Since
> probing on a module is based on its symbol name, it should
> be adjusted on actual symbols.
> 
> E.g. without this fix, perf probe shows probe definition
> on non-exist symbol as below.
> 
>   $ perf probe -m build-x86_64/net/netfilter/nf_nat.ko -F in_range*
>   in_range.isra.12
>   $ perf probe -m build-x86_64/net/netfilter/nf_nat.ko -D in_range
>   p:probe/in_range nf_nat:in_range+0
> 
> With this fix, perf probe correctly shows a probe on
> gcc-generated symbol.
> 
>   $ perf probe -m build-x86_64/net/netfilter/nf_nat.ko -D in_range
>   p:probe/in_range nf_nat:in_range.isra.12+0
> 
> This also fixes same problem on online module as below.
> 
>   $ perf probe -m i915 -D assert_plane
>   p:probe/assert_plane i915:assert_plane.constprop.134+0
> 
> Signed-off-by: Masami Hiramatsu 
> Tested-by: Arnaldo Carvalho de Melo 
> Cc: Jiri Olsa 
> Cc: Namhyung Kim 
> Cc: Peter Zijlstra 
> Link: 
> http://lkml.kernel.org/r/148411450673.9978.14905987549651656075.stgit@devbox
> Signed-off-by: Arnaldo Carvalho de Melo 
> Signed-off-by: Sasha Levin 
> Signed-off-by: Greg Kroah-Hartman 
> ---
>  tools/perf/util/probe-event.c  |   45 
> ++---
>  tools/perf/util/probe-finder.c |7 --
>  tools/perf/util/probe-finder.h |3 ++
>  3 files changed, 37 insertions(+), 18 deletions(-)
> 
> --- a/tools/perf/util/probe-event.c
> +++ b/tools/perf/util/probe-event.c
> @@ -645,18 +645,31 @@ static int add_exec_to_probe_trace_event
>   return ret;
>  }
>  
> -static int add_module_to_probe_trace_events(struct probe_trace_event *tevs,
> - int ntevs, const char *module)
> +static int
> +post_process_module_probe_trace_events(struct probe_trace_event *tevs,
> +int ntevs, const char *module,
> +struct debuginfo *dinfo)
>  {
> + Dwarf_Addr text_offs = 0;
>   int i, ret = 0;
>   char *mod_name = NULL;
> + struct map *map;
>  
>   if (!module)
>   return 0;
>  
> - mod_name = find_module_name(module);
> + map = get_target_map(module, false);
> + if (!map || debuginfo__get_text_offset(dinfo, &text_offs, true) < 0) {
> + pr_warning("Failed to get ELF symbols for %s\n", module);
> + return -EINVAL;
> + }
>  
> + mod_name = find_module_name(module);
>   for (i = 0; i < ntevs; i++) {
> + ret = post_process_probe_trace_point(&tevs[i].point,
> + map, (unsigned long)text_offs);
> + if (ret < 0)
> + break;
>   tevs[i].point.module =
>   strdup(mod_name ? mod_name : module);
>   if (!tevs[i].point.module) {
> @@ -666,6 +679,8 @@ static int add_module_to_probe_trace_eve
>   }
>  
>   free(mod_name);
> + map__put(map);
> +
>   return ret;
>  }
>  
> @@ -722,7 +737,7 @@ arch__post_process_probe_trace_events(st
>  static int post_process_probe_trace_events(struct perf_probe_event *pev,
>  struct probe_trace_event *tevs,
>  int ntevs, const char *module,
> -bool uprobe)
> +bool uprobe, struct debuginfo *dinfo)
>  {
>   int ret;
>  
> @@ -730,7 +745,8 @@ static int post_process_probe_trace_even
>   ret = add_exec_to_probe_trace_events(tevs, ntevs, module);
>   else if (module)
>   /* Currently ref_reloc_sym based probe is not for drivers */
> - ret = add_module_to_probe_trace_events(tevs, ntevs, module);
> + ret = post_process_module_probe_trace_events(tevs, ntevs,
> +  module, dinfo);
>   else
>   ret = post_process_kernel_probe_trace_events(tevs, ntevs);
>  
> @@ -774,30 +790,27 @@ static int try_to_find_probe_trace_event
>   }
>   }
>  
> - debuginfo__delete(dinfo);
> -
>   if (ntevs > 0) {/* Succeeded to find trace events */
>   pr_debug("Found %d probe_trace_events.\n", ntevs);
>   ret = post_process_probe_trace_events(pev, *tevs, ntevs,
> - pev->target, pev->uprobes);
> + pev->target, pev->uprobes, dinfo);
>   if (ret < 0 || ret == ntevs) {
> + pr_debug("Post processing failed or all events are 
> skipped. (%d)\n", ret);
>   clear_probe_trace_events(*tevs, ntevs);
>   zfree(tevs);
> + ntevs = 0;

[PATCH tip/perf/core 2/7] perf maps: lookup maps in both intitial mountns and inner mountns.

2017-06-30 Thread Krister Johansen
If a process is in a mountns and has symbols in /tmp/perf-.map,
look first in the namespace using the tgid for the pidns that the
process might be in.  If the map isn't found there, try looking in
the mountns where perf is running, and use the tgid that's appropriate
for perf's pid namespace.  If all else fails, use the original pid.

This allows us to locate a symbol map file in the mount namespace, if
it was generated there.  However, we also try the tool's /tmp in case
it's there instead.

Signed-off-by: Krister Johansen 
---
 tools/perf/util/machine.c| 19 --
 tools/perf/util/map.c| 29 
 tools/perf/util/map.h|  8 ++---
 tools/perf/util/namespaces.c | 83 
 tools/perf/util/namespaces.h |  5 ++-
 tools/perf/util/symbol.c | 71 ++---
 6 files changed, 172 insertions(+), 43 deletions(-)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index d7f31cb..ed98881 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1388,13 +1388,10 @@ int machine__process_mmap2_event(struct machine 
*machine,
else
type = MAP__FUNCTION;
 
-   map = map__new(machine, event->mmap2.start,
-   event->mmap2.len, event->mmap2.pgoff,
-   event->mmap2.pid, event->mmap2.maj,
-   event->mmap2.min, event->mmap2.ino,
-   event->mmap2.ino_generation,
-   event->mmap2.prot,
-   event->mmap2.flags,
+   map = map__new(machine, event->mmap2.start, event->mmap2.len,
+   event->mmap2.pgoff, event->mmap2.maj, event->mmap2.min,
+   event->mmap2.ino, event->mmap2.ino_generation,
+   event->mmap2.prot, event->mmap2.flags,
event->mmap2.filename, type, thread);
 
if (map == NULL)
@@ -1446,11 +1443,9 @@ int machine__process_mmap_event(struct machine *machine, 
union perf_event *event
else
type = MAP__FUNCTION;
 
-   map = map__new(machine, event->mmap.start,
-   event->mmap.len, event->mmap.pgoff,
-   event->mmap.pid, 0, 0, 0, 0, 0, 0,
-   event->mmap.filename,
-   type, thread);
+   map = map__new(machine, event->mmap.start, event->mmap.len,
+   event->mmap.pgoff, 0, 0, 0, 0, 0, 0,
+   event->mmap.filename, type, thread);
 
if (map == NULL)
goto out_problem_map;
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 5dc60ca..2bd20cb 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -145,12 +145,14 @@ void map__init(struct map *map, enum map_type type,
refcount_set(&map->refcnt, 1);
 }
 
-struct map *map__new(struct machine *machine, u64 start, u64 len,
-u64 pgoff, u32 pid, u32 d_maj, u32 d_min, u64 ino,
-u64 ino_gen, u32 prot, u32 flags, char *filename,
-enum map_type type, struct thread *thread)
+struct map *map__new(struct machine *machine, u64 start, u64 len, u64 pgoff,
+u32 d_maj, u32 d_min, u64 ino, u64 ino_gen, u32 prot,
+u32 flags, char *filename, enum map_type type,
+struct thread *thread)
 {
struct map *map = malloc(sizeof(*map));
+   struct nsinfo *nsi = NULL;
+   struct nsinfo *nnsi;
 
if (map != NULL) {
char newfilename[PATH_MAX];
@@ -168,9 +170,11 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
map->ino_generation = ino_gen;
map->prot = prot;
map->flags = flags;
+   nsi = nsinfo__get(thread->nsinfo);
 
-   if ((anon || no_dso) && type == MAP__FUNCTION) {
-   snprintf(newfilename, sizeof(newfilename), 
"/tmp/perf-%d.map", pid);
+   if ((anon || no_dso) && nsi && type == MAP__FUNCTION) {
+   snprintf(newfilename, sizeof(newfilename),
+"/tmp/perf-%d.map", nsi->pid);
filename = newfilename;
}
 
@@ -180,6 +184,16 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
}
 
if (vdso) {
+   /* The vdso maps are always on the host and not the
+* container.  Ensure that we don't use setns to look
+* them up.
+*/
+   nnsi = nsinfo__copy(nsi);
+   if (nnsi) {
+   nsinfo__put(nsi);
+  

[PATCH tip/perf/core 1/7] perf symbols: find symbols in different mount namespace

2017-06-30 Thread Krister Johansen
Teach perf how to resolve symbols from binaries that are in a different
mount namespace from the tool.  This allows perf to generate meaningful
stack traces even if the binary resides in a different mount namespace
from the tool.

Signed-off-by: Krister Johansen 
---
 tools/perf/util/dso.c|   1 +
 tools/perf/util/dso.h|   2 +
 tools/perf/util/map.c|   2 +
 tools/perf/util/namespaces.c | 127 +++
 tools/perf/util/namespaces.h |  33 +++
 tools/perf/util/symbol.c |  11 
 tools/perf/util/thread.c |   3 +
 tools/perf/util/thread.h |   1 +
 8 files changed, 180 insertions(+)

diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 4e7ab61..beda40e 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1236,6 +1236,7 @@ void dso__delete(struct dso *dso)
dso_cache__free(dso);
dso__free_a2l(dso);
zfree(&dso->symsrc_filename);
+   nsinfo__zput(dso->nsinfo);
pthread_mutex_destroy(&dso->lock);
free(dso);
 }
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index bd061ba..78ec637 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include "map.h"
+#include "namespaces.h"
 #include "build-id.h"
 
 enum dso_binary_type {
@@ -187,6 +188,7 @@ struct dso {
void *priv;
u64  db_id;
};
+   struct nsinfo   *nsinfo;
refcount_t   refcnt;
char name[0];
 };
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 2179b2d..5dc60ca 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -16,6 +16,7 @@
 #include "machine.h"
 #include 
 #include "srcline.h"
+#include "namespaces.h"
 #include "unwind.h"
 
 static void __maps__insert(struct maps *maps, struct map *map);
@@ -200,6 +201,7 @@ struct map *map__new(struct machine *machine, u64 start, 
u64 len,
if (type != MAP__FUNCTION)
dso__set_loaded(dso, map->type);
}
+   dso->nsinfo = nsinfo__get(thread->nsinfo);
dso__put(dso);
}
return map;
diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c
index 67dcbcc..d7f31e0 100644
--- a/tools/perf/util/namespaces.c
+++ b/tools/perf/util/namespaces.c
@@ -9,9 +9,13 @@
 #include "namespaces.h"
 #include "util.h"
 #include "event.h"
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
+#include 
 
 struct namespaces *namespaces__new(struct namespaces_event *event)
 {
@@ -35,3 +39,126 @@ void namespaces__free(struct namespaces *namespaces)
 {
free(namespaces);
 }
+
+void nsinfo__init(struct nsinfo *nsi)
+{
+   char oldns[PATH_MAX];
+   char *newns = NULL;
+   struct stat old_stat;
+   struct stat new_stat;
+
+   if (snprintf(oldns, PATH_MAX, "/proc/self/ns/mnt") >= PATH_MAX)
+   return;
+
+   if (asprintf(&newns, "/proc/%d/ns/mnt", nsi->pid) == -1)
+   return;
+
+   if (stat(oldns, &old_stat) < 0)
+   goto out;
+
+   if (stat(newns, &new_stat) < 0)
+   goto out;
+
+   /* Check if the mount namespaces differ, if so then indicate that we
+* want to switch as part of looking up dso/map data.
+*/
+   if (old_stat.st_ino != new_stat.st_ino) {
+   nsi->need_setns = true;
+   nsi->mntns_path = newns;
+   newns = NULL;
+   }
+
+out:
+   free(newns);
+}
+
+struct nsinfo *nsinfo__new(pid_t pid)
+{
+   struct nsinfo *nsi = calloc(1, sizeof(*nsi));
+
+   if (nsi != NULL) {
+   nsi->pid = pid;
+   nsi->need_setns = false;
+   nsinfo__init(nsi);
+   refcount_set(&nsi->refcnt, 1);
+   }
+
+   return nsi;
+}
+
+void nsinfo__delete(struct nsinfo *nsi)
+{
+   free(nsi->mntns_path);
+   free(nsi);
+}
+
+struct nsinfo *nsinfo__get(struct nsinfo *nsi)
+{
+   if (nsi)
+   refcount_inc(&nsi->refcnt);
+   return nsi;
+}
+
+void nsinfo__put(struct nsinfo *nsi)
+{
+   if (nsi && refcount_dec_and_test(&nsi->refcnt))
+   nsinfo__delete(nsi);
+}
+
+void nsinfo__mountns_enter(struct nsinfo *nsi, struct nscookie *nc)
+{
+   char curpath[PATH_MAX];
+   int oldns = -1;
+   int newns = -1;
+
+   if (nc == NULL)
+   return;
+
+   nc->oldns = -1;
+   nc->newns = -1;
+
+   if (!nsi || !nsi->need_setns)
+   return;
+
+   if (snprintf(curpath, PATH_MAX, "/proc/self/ns/mnt") >= PATH_MAX)
+   return;
+
+   oldns = open(curpath, O_RDONLY);
+   if (oldns < 0)
+   

[PATCH tip/perf/core 7/7] perf buildid-cache: cache debuginfo

2017-06-30 Thread Krister Johansen
If a stripped binary is placed in the cache, the user is in a situation
where there's a cached elf file present, but it doesn't have any symtab
to use for name resolution.  Grab the debuginfo for binaries that don't
end in .ko.  This yields a better chance of resolving symbols from
older traces.

Signed-off-by: Krister Johansen 
---
 tools/perf/builtin-buildid-cache.c |  2 +-
 tools/perf/util/annotate.c |  2 +-
 tools/perf/util/build-id.c | 72 +++---
 tools/perf/util/build-id.h |  3 +-
 tools/perf/util/dso.c  |  8 -
 tools/perf/util/dso.h  |  1 +
 tools/perf/util/machine.c  |  3 +-
 tools/perf/util/symbol.c   | 12 +--
 8 files changed, 90 insertions(+), 13 deletions(-)

diff --git a/tools/perf/builtin-buildid-cache.c 
b/tools/perf/builtin-buildid-cache.c
index 0b9a43e..2e35d80 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -243,7 +243,7 @@ static bool dso__missing_buildid_cache(struct dso *dso, int 
parm __maybe_unused)
char filename[PATH_MAX];
u8 build_id[BUILD_ID_SIZE];
 
-   if (dso__build_id_filename(dso, filename, sizeof(filename)) &&
+   if (dso__build_id_filename(dso, filename, sizeof(filename), false) &&
filename__read_build_id(filename, build_id,
sizeof(build_id)) != sizeof(build_id)) {
if (errno == ENOENT)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index be1caab..7ce940d 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1327,7 +1327,7 @@ static int dso__disassemble_filename(struct dso *dso, 
char *filename, size_t fil
!dso__is_kcore(dso))
return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX;
 
-   build_id_filename = dso__build_id_filename(dso, NULL, 0);
+   build_id_filename = dso__build_id_filename(dso, NULL, 0, false);
if (build_id_filename) {
__symbol__join_symfs(filename, filename_size, 
build_id_filename);
free(build_id_filename);
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index 0a58287..789d20a 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -243,12 +243,15 @@ static bool build_id_cache__valid_id(char *sbuild_id)
return result;
 }
 
-static const char *build_id_cache__basename(bool is_kallsyms, bool is_vdso)
+static const char *build_id_cache__basename(bool is_kallsyms, bool is_vdso,
+   bool is_debug)
 {
-   return is_kallsyms ? "kallsyms" : (is_vdso ? "vdso" : "elf");
+   return is_kallsyms ? "kallsyms" : (is_vdso ? "vdso" : (is_debug ?
+   "debug" : "elf"));
 }
 
-char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
+char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size,
+bool is_debug)
 {
bool is_kallsyms = dso__is_kallsyms((struct dso *)dso);
bool is_vdso = dso__is_vdso((struct dso *)dso);
@@ -270,7 +273,8 @@ char *dso__build_id_filename(const struct dso *dso, char 
*bf, size_t size)
ret = asnprintf(&bf, size, "%s", linkname);
else
ret = asnprintf(&bf, size, "%s/%s", linkname,
-build_id_cache__basename(is_kallsyms, is_vdso));
+build_id_cache__basename(is_kallsyms, is_vdso,
+ is_debug));
if (ret < 0 || (!alloc && size < (unsigned int)ret))
bf = NULL;
free(linkname);
@@ -606,12 +610,40 @@ static int build_id_cache__add_sdt_cache(const char 
*sbuild_id,
 #define build_id_cache__add_sdt_cache(sbuild_id, realname, nsi) (0)
 #endif
 
+static char *build_id_cache__find_debug(const char *sbuild_id,
+   struct nsinfo *nsi)
+{
+   char *realname = NULL;
+   char *debugfile;
+   struct nscookie nsc;
+   size_t len = 0;
+
+   debugfile = calloc(1, PATH_MAX);
+   if (!debugfile)
+   goto out;
+
+   len = __symbol__join_symfs(debugfile, PATH_MAX,
+  "/usr/lib/debug/.build-id/");
+   snprintf(debugfile + len, PATH_MAX - len, "%.2s/%s.debug", sbuild_id,
+sbuild_id + 2);
+
+   nsinfo__mountns_enter(nsi, &nsc);
+   realname = realpath(debugfile, NULL);
+   if (realname && access(realname, R_OK))
+   zfree(&realname);
+   nsinfo__mountns_exit(&nsc);
+out:
+   free(debugfile);
+   return realname;
+}
+
 int build_id_cache__add_s(const char *sbuild_id, const char *name,
  struct nsinfo *nsi, bool is_kallsyms, bool is_vdso)

[PATCH tip/perf/core 5/7] perf top: support lookup of symbols in other mount namespaces.

2017-06-30 Thread Krister Johansen
The perf top command needs to unshare its fs from the helper threads
in order to successfully setns(2) during its symbol lookup.  It also
needs to impelement a force flag to ignore ownership of perf-.map
files.

Signed-off-by: Krister Johansen 
---
 tools/perf/builtin-top.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 2bcfa46..c918fef 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -586,6 +586,13 @@ static void *display_thread_tui(void *arg)
.refresh= top->delay_secs,
};
 
+   /* In order to read symbols from other namespaces perf to  needs to call
+* setns(2).  This isn't permitted if the struct_fs has multiple users.
+* unshare(2) the fs so that we may continue to setns into namespaces
+* that we're observing.
+*/
+   (void) unshare(CLONE_FS);
+
perf_top__sort_new_samples(top);
 
/*
@@ -627,6 +634,13 @@ static void *display_thread(void *arg)
struct perf_top *top = arg;
int delay_msecs, c;
 
+   /* In order to read symbols from other namespaces perf to  needs to call
+* setns(2).  This isn't permitted if the struct_fs has multiple users.
+* unshare(2) the fs so that we may continue to setns into namespaces
+* that we're observing.
+*/
+   (void) unshare(CLONE_FS);
+
display_setup_sig();
pthread__unblock_sigwinch();
 repeat:
@@ -1205,6 +1219,7 @@ int cmd_top(int argc, const char **argv)
"Show raw trace event output (do not use print fmt or 
plugins)"),
OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
"Show entries in a hierarchy"),
+   OPT_BOOLEAN(0, "force", &symbol_conf.force, "don't complain, do it"),
OPT_END()
};
const char * const top_usage[] = {
-- 
2.7.4



[PATCH tip/perf/core 6/7] perf documentation: updates for target-ns.

2017-06-30 Thread Krister Johansen
Update the perf documentation to describe the --target-ns option to
probe and buildid-cache.  Note the existence of the new --force flag
to top.

Signed-off-by: Krister Johansen 
---
 tools/perf/Documentation/perf-buildid-cache.txt | 5 +
 tools/perf/Documentation/perf-probe.txt | 5 +
 tools/perf/Documentation/perf-top.txt   | 4 
 3 files changed, 14 insertions(+)

diff --git a/tools/perf/Documentation/perf-buildid-cache.txt 
b/tools/perf/Documentation/perf-buildid-cache.txt
index 058064d..8468100 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -61,6 +61,11 @@ OPTIONS
 --verbose::
Be more verbose.
 
+--target-ns=PID:
+   Obtain mount namespace information from the target pid.  This is
+   used when creating a uprobe for a process that resides in a
+   different mount namespace from the perf(1) utility.
+
 SEE ALSO
 
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-buildid-list[1]
diff --git a/tools/perf/Documentation/perf-probe.txt 
b/tools/perf/Documentation/perf-probe.txt
index 165c2b1..c1a126b 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -130,6 +130,11 @@ OPTIONS
 --max-probes=NUM::
Set the maximum number of probe points for an event. Default is 128.
 
+--target-ns=PID:
+   Obtain mount namespace information from the target pid.  This is
+   used when creating a uprobe for a process that resides in a
+   different mount namespace from the perf(1) utility.
+
 -x::
 --exec=PATH::
Specify path to the executable or shared library file for user
diff --git a/tools/perf/Documentation/perf-top.txt 
b/tools/perf/Documentation/perf-top.txt
index e71d638..d864ea6 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -237,6 +237,10 @@ Default is to monitor all CPUS.
 --hierarchy::
Enable hierarchy output.
 
+--force::
+   Don't do ownership validation.
+
+
 INTERACTIVE PROMPTING KEYS
 --
 
-- 
2.7.4



[PATCH tip/perf/core 3/7] perf probe: allow placing uprobes in alternate namespaces.

2017-06-30 Thread Krister Johansen
Teaches perf how to place a uprobe on a file that's in a different mount
namespace.  The user must add the probe using the --target-ns argument
to perf probe.  Once it has been placed, it may be recorded against
without further namespace-specific commands.

Signed-off-by: Krister Johansen 
---
 tools/perf/builtin-probe.c| 44 ++--
 tools/perf/util/probe-event.c | 79 +--
 tools/perf/util/probe-event.h | 10 --
 3 files changed, 101 insertions(+), 32 deletions(-)

diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index cf9f9e9..5ab2e00 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -58,6 +58,7 @@ static struct {
struct line_range line_range;
char *target;
struct strfilter *filter;
+   struct nsinfo *nsi;
 } params;
 
 /* Parse an event definition. Note that any error must die. */
@@ -80,6 +81,9 @@ static int parse_probe_event(const char *str)
params.target_used = true;
}
 
+   if (params.nsi)
+   pev->nsi = nsinfo__get(params.nsi);
+
/* Parse a perf-probe command into event */
ret = parse_perf_probe_command(str, pev);
pr_debug("%d arguments\n", pev->nargs);
@@ -178,6 +182,7 @@ static int opt_set_target(const struct option *opt, const 
char *str,
 {
int ret = -ENOENT;
char *tmp;
+   struct nscookie nsc;
 
if  (str) {
if (!strcmp(opt->long_name, "exec"))
@@ -189,7 +194,9 @@ static int opt_set_target(const struct option *opt, const 
char *str,
 
/* Expand given path to absolute path, except for modulename */
if (params.uprobes || strchr(str, '/')) {
+   nsinfo__mountns_enter(params.nsi, &nsc);
tmp = realpath(str, NULL);
+   nsinfo__mountns_exit(&nsc);
if (!tmp) {
pr_warning("Failed to get the absolute path of 
%s: %m\n", str);
return ret;
@@ -208,6 +215,34 @@ static int opt_set_target(const struct option *opt, const 
char *str,
return ret;
 }
 
+static int opt_set_target_ns(const struct option *opt __maybe_unused,
+const char *str, int unset __maybe_unused)
+{
+   int ret = -ENOENT;
+   pid_t ns_pid;
+   struct nsinfo *nsip;
+
+   if (str) {
+   errno = 0;
+   ns_pid = (pid_t)strtol(str, NULL, 10);
+   if (errno != 0) {
+   ret = -errno;
+   pr_warning("Failed to parse %s as a pid: %s\n", str,
+  strerror(errno));
+   return ret;
+   }
+   nsip = nsinfo__new(ns_pid);
+   if (nsip && nsip->need_setns)
+   params.nsi = nsinfo__get(nsip);
+   nsinfo__put(nsip);
+
+   ret = 0;
+   }
+
+   return ret;
+}
+
+
 /* Command option callbacks */
 
 #ifdef HAVE_DWARF_SUPPORT
@@ -299,6 +334,7 @@ static void cleanup_params(void)
line_range__clear(¶ms.line_range);
free(params.target);
strfilter__delete(params.filter);
+   nsinfo__put(params.nsi);
memset(¶ms, 0, sizeof(params));
 }
 
@@ -554,6 +590,8 @@ __cmd_probe(int argc, const char **argv)
OPT_BOOLEAN(0, "cache", &probe_conf.cache, "Manipulate probe cache"),
OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
   "Look for files with symbols relative to this directory"),
+   OPT_CALLBACK(0, "target-ns", NULL, "pid",
+"target pid for namespace information", opt_set_target_ns),
OPT_END()
};
int ret;
@@ -634,15 +672,15 @@ __cmd_probe(int argc, const char **argv)
pr_err_with_code("  Error: Failed to show event list.", 
ret);
return ret;
case 'F':
-   ret = show_available_funcs(params.target, params.filter,
-   params.uprobes);
+   ret = show_available_funcs(params.target, params.nsi,
+  params.filter, params.uprobes);
if (ret < 0)
pr_err_with_code("  Error: Failed to show functions.", 
ret);
return ret;
 #ifdef HAVE_DWARF_SUPPORT
case 'L':
ret = show_line_range(¶ms.line_range, params.target,
- params.uprobes);
+ params.nsi, params.uprobes);
if (ret < 0)
pr_err_with_code("  Error: Failed to show lines.", re

[PATCH tip/perf/core 0/7] namespace tracing improvements

2017-06-30 Thread Krister Johansen
This patch set is similar to a set of features I implemented for bcc
back in April.  At a high-level it does the following:

- Allow the tracing tools to resolve symbols if the traced process has a
  binary that is in a different mount namespace from perf.

- Allow the perf-.map files to live either in /tmp in the target
  process' mount namespace, or the tracing process's /tmp in its own
  mount namespace.

- Adds the ability to set and record against uprobes and USDT probes
  when the target process resides in a different mount namespace.

With these changes I can get pretty good insight into what containerized
processes on my systems are doing.  In addition to the above, I also
modified the builid cache code to cache relevant bits from the target
namespaces so that it's possible to preserve more information should a
container exit and take the mounted volumes along with it.

-K

Krister Johansen (7):
  perf symbols: find symbols in different mount namespace
  perf maps: lookup maps in both intitial mountns and inner mountns.
  perf probe: allow placing uprobes in alternate namespaces.
  perf buildid-cache: support binary objects from other namespaces
  perf top: support lookup of symbols in other mount namespaces.
  perf documentation: updates for target-ns.
  perf buildid-cache: cache debuginfo

 tools/perf/Documentation/perf-buildid-cache.txt |   5 +
 tools/perf/Documentation/perf-probe.txt |   5 +
 tools/perf/Documentation/perf-top.txt   |   4 +
 tools/perf/builtin-buildid-cache.c  |  54 +--
 tools/perf/builtin-probe.c  |  46 +-
 tools/perf/builtin-top.c|  15 ++
 tools/perf/tests/sdt.c  |   4 +-
 tools/perf/util/annotate.c  |   2 +-
 tools/perf/util/build-id.c  | 120 +++---
 tools/perf/util/build-id.h  |  12 +-
 tools/perf/util/dso.c   |  21 ++-
 tools/perf/util/dso.h   |   3 +
 tools/perf/util/machine.c   |  22 ++-
 tools/perf/util/map.c   |  29 +++-
 tools/perf/util/map.h   |   8 +-
 tools/perf/util/namespaces.c| 198 
 tools/perf/util/namespaces.h|  36 +
 tools/perf/util/parse-events.c  |   2 +-
 tools/perf/util/probe-event.c   |  85 ++
 tools/perf/util/probe-event.h   |  10 +-
 tools/perf/util/probe-file.c|  19 ++-
 tools/perf/util/probe-file.h|   4 +-
 tools/perf/util/symbol.c|  93 +--
 tools/perf/util/thread.c|   3 +
 tools/perf/util/thread.h|   1 +
 tools/perf/util/util.c  |  34 +++-
 tools/perf/util/util.h  |   2 +
 27 files changed, 706 insertions(+), 131 deletions(-)

-- 
2.7.4



[PATCH tip/perf/core 4/7] perf buildid-cache: support binary objects from other namespaces

2017-06-30 Thread Krister Johansen
Teach buildid-cache how to add, remove, and update binary objects from
other mount namespaces.  Allow probe events tracing binaries in
different namespaces to add their objects to the probe and build-id
caches too.  As a handy side effect, this also lets us access SDT probes
in binaries from alternate mount namespaces.

Signed-off-by: Krister Johansen 
---
 tools/perf/builtin-buildid-cache.c | 52 +++---
 tools/perf/builtin-probe.c |  2 +-
 tools/perf/tests/sdt.c |  4 +--
 tools/perf/util/build-id.c | 48 ---
 tools/perf/util/build-id.h |  9 ---
 tools/perf/util/dso.c  | 12 -
 tools/perf/util/parse-events.c |  2 +-
 tools/perf/util/probe-event.c  |  6 ++---
 tools/perf/util/probe-file.c   | 19 +-
 tools/perf/util/probe-file.h   |  4 +--
 tools/perf/util/symbol.c   | 19 ++
 tools/perf/util/util.c | 34 -
 tools/perf/util/util.h |  2 ++
 13 files changed, 152 insertions(+), 61 deletions(-)

diff --git a/tools/perf/builtin-buildid-cache.c 
b/tools/perf/builtin-buildid-cache.c
index 9eba7f1..0b9a43e 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -14,6 +14,7 @@
 #include 
 #include "builtin.h"
 #include "perf.h"
+#include "namespaces.h"
 #include "util/cache.h"
 #include "util/debug.h"
 #include "util/header.h"
@@ -165,33 +166,41 @@ static int build_id_cache__add_kcore(const char 
*filename, bool force)
return 0;
 }
 
-static int build_id_cache__add_file(const char *filename)
+static int build_id_cache__add_file(const char *filename, struct nsinfo *nsi)
 {
char sbuild_id[SBUILD_ID_SIZE];
u8 build_id[BUILD_ID_SIZE];
int err;
+   struct nscookie nsc;
 
-   if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) 
{
+   nsinfo__mountns_enter(nsi, &nsc);
+   err = filename__read_build_id(filename, &build_id, sizeof(build_id));
+   nsinfo__mountns_exit(&nsc);
+   if (err < 0) {
pr_debug("Couldn't read a build-id in %s\n", filename);
return -1;
}
 
build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
-   err = build_id_cache__add_s(sbuild_id, filename,
+   err = build_id_cache__add_s(sbuild_id, filename, nsi,
false, false);
pr_debug("Adding %s %s: %s\n", sbuild_id, filename,
 err ? "FAIL" : "Ok");
return err;
 }
 
-static int build_id_cache__remove_file(const char *filename)
+static int build_id_cache__remove_file(const char *filename, struct nsinfo 
*nsi)
 {
u8 build_id[BUILD_ID_SIZE];
char sbuild_id[SBUILD_ID_SIZE];
+   struct nscookie nsc;
 
int err;
 
-   if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) 
{
+   nsinfo__mountns_enter(nsi, &nsc);
+   err = filename__read_build_id(filename, &build_id, sizeof(build_id));
+   nsinfo__mountns_exit(&nsc);
+   if (err < 0) {
pr_debug("Couldn't read a build-id in %s\n", filename);
return -1;
}
@@ -204,13 +213,13 @@ static int build_id_cache__remove_file(const char 
*filename)
return err;
 }
 
-static int build_id_cache__purge_path(const char *pathname)
+static int build_id_cache__purge_path(const char *pathname, struct nsinfo *nsi)
 {
struct strlist *list;
struct str_node *pos;
int err;
 
-   err = build_id_cache__list_build_ids(pathname, &list);
+   err = build_id_cache__list_build_ids(pathname, nsi, &list);
if (err)
goto out;
 
@@ -256,24 +265,30 @@ static int build_id_cache__fprintf_missing(struct 
perf_session *session, FILE *f
return 0;
 }
 
-static int build_id_cache__update_file(const char *filename)
+static int build_id_cache__update_file(const char *filename, struct nsinfo 
*nsi)
 {
u8 build_id[BUILD_ID_SIZE];
char sbuild_id[SBUILD_ID_SIZE];
+   struct nscookie nsc;
 
-   int err = 0;
+   int err;
 
-   if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) 
{
+   nsinfo__mountns_enter(nsi, &nsc);
+   err = filename__read_build_id(filename, &build_id, sizeof(build_id));
+   nsinfo__mountns_exit(&nsc);
+   if (err < 0) {
pr_debug("Couldn't read a build-id in %s\n", filename);
return -1;
}
+   err = 0;
 
build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
if (build_id_cache__cached(sbuild_id))
err = build_id_cache__remove_s(sbuild_id);
 
if (!err)
-   err = build_

Re: [PATCH 2/4] swait: add the missing killable swaits

2017-06-30 Thread Krister Johansen
On Thu, Jun 29, 2017 at 09:03:42PM -0700, Linus Torvalds wrote:
> On Thu, Jun 29, 2017 at 12:15 PM, Marcelo Tosatti  wrote:
> > On Thu, Jun 29, 2017 at 09:13:29AM -0700, Linus Torvalds wrote:
> >>
> >> swait uses special locking and has odd semantics that are not at all
> >> the same as the default wait queue ones. It should not be used without
> >> very strong reasons (and honestly, the only strong enough reason seems
> >> to be "RT").
> >
> > Performance shortcut:
> >
> > https://lkml.org/lkml/2016/2/25/301
> 
> Now, admittedly I don't know the code and really may be entirely off,
> but looking at the commit (no need to go to the lkml archives - it's
> commit 8577370fb0cb ("KVM: Use simple waitqueue for vcpu->wq") in
> mainline), I really think the swait() use is simply not correct if
> there can be multiple waiters, exactly because swake_up() only wakes
> up a single entry.
> 
> So either there is only a single entry, or *all* the code like
> 
> dvcpu->arch.wait = 0;
> 
> -   if (waitqueue_active(&dvcpu->wq))
> -   wake_up_interruptible(&dvcpu->wq);
> +   if (swait_active(&dvcpu->wq))
> +   swake_up(&dvcpu->wq);
> 
> is simply wrong. If there are multiple blockers, and you just cleared
> "arch.wait", I think they should *all* be woken up. And that's not
> what swake_up() does.

Code like this is probably wrong for another reason too.  The
swait_active() is likely redudant, since swake_up() also calls
swait_active().  The check in swake_up() returns if it thinks there are
no active waiters.  However, the synchronization needed to ensure a
proper wakeup is left as an exercise to swake_up's caller.

There have been a couple of other discussions around this topic
recently:

https://lkml.org/lkml/2017/5/25/722
https://lkml.org/lkml/2017/6/8/1222

The above is better written as the following, but even then you still
have the single/multiple wakeup problem:

 -   if (waitqueue_active(&dvcpu->wq))
 -   wake_up_interruptible(&dvcpu->wq);
 +   smp_mb();
 +   swake_up(&dvcpu->wq);


Just to add to the confusion, the last time I checked, the semantics of
swake_up() even differ between RT Linux and mainline, which makes this
even more confusing.

-K


Re: [PATCH tip/sched/core] Add comments to aid in safer usage of swake_up.

2017-06-14 Thread Krister Johansen
On Wed, Jun 14, 2017 at 11:02:40AM -0400, Steven Rostedt wrote:
> On Wed, 14 Jun 2017 09:10:15 -0400
> Steven Rostedt  wrote:
> 
> > Now let's make it simpler. I'll even add the READ_ONCE and WRITE_ONCE
> > where applicable.
> > 
> > 
> > CPU0CPU1
> > 
> > LOCK(A)
> > 
> >  LOCK(B)
> >  WRITE_ONCE(X, INIT)
> > 
> >  (the cpu may postpone writing X)
> > 
> >  (the cpu can fetch wq list here)
> >   list_add(wq, q)
> > 
> >  UNLOCK(B)
> > 
> >  (the cpu may fetch old value of X)
> > 
> >  (write of X happens here)
> > 
> >  if (READ_ONCE(X) != init)
> >schedule();
> > 
> > UNLOCK(A)
> > 
> >  if (list_empty(wq))
> >return;
> > 
> > Tell me again how the READ_ONCE() and WRITE_ONCE() helps in this
> > scenario?
> > 
> > Because we are using spinlocks, this wont be an issue for most
> > architectures. The bug happens if the fetching of the list_empty()
> > leaks into before the UNLOCK(A).
> > 
> > If the reading/writing of the list and the reading/writing of gp_flags
> > gets reversed in either direction by the CPU, then we have a problem.
> 
> FYI..
> 
> Both sides need a memory barrier. Otherwise, even with a memory barrier
> on CPU1 we can still have:
> 
> 
>   CPU0CPU1
>   
> 
>   LOCK(A)
>  LOCK(B)
> 
>  list_add(wq, q)
> 
>  (cpu waits to write wq list)
> 
>  (cpu fetches X)
> 
>WRITE_ONCE(X, INIT)
> 
>   UNLOCK(A)
> 
>   smp_mb();
> 
>   if (list_empty(wq))
>  return;
> 
>  (cpu writes wq list)
> 
>  UNLOCK(B)
> 
>  if (READ_ONCE(X) != INIT)
>schedule()
> 
> 
> Luckily for us, there is a memory barrier on CPU0. In
> prepare_to_swait() we have:
> 
>   raw_spin_lock_irqsave(&q->lock, flags);
>   __prepare_to_swait(q, wait);
>   set_current_state(state);
>   raw_spin_unlock_irqrestore(&q->lock, flags);
> 
> And that set_current_state() call includes a memory barrier, which will
> prevent the above from happening, as the addition to the wq list must
> be flushed before fetching X.
> 
> I still strongly believe that the swait_active() requires a memory
> barrier.

FWLIW, I agree.  There was a smb_mb() in RT-linux's equivalent of
swait_activate().

https://www.spinics.net/lists/linux-rt-users/msg10340.html

If the barrier goes in swait_active() then we don't have to require all
of the callers of swait_active and swake_up to issue the barrier
instead.  Handling this in swait_active is likely to be less error
prone.  Though, we could also do something like wq_has_sleeper() and use
that preferentially in swake_up and its variants.

-K


[PATCH tip/sched/core] Add comments to aid in safer usage of swake_up.

2017-06-08 Thread Krister Johansen
The behavior of swake_up() differs from that of wake_up(), and from the
swake_up() that came from RT linux. A memory barrier, or some other
synchronization, is needed prior to a swake_up so that the waiter sees
the condition set by the waker, and so that the waker does not see an
empty wait list.

Signed-off-by: Krister Johansen 
---
 include/linux/swait.h | 27 +++
 kernel/sched/swait.c  | 18 +-
 2 files changed, 44 insertions(+), 1 deletion(-)

 This came out of a discussion that Paul McKenney and I had about
 whether other callers of swake_up() knew that they needed to issue
 some kind of barrier prior to invoking swake_up.  In the case of
 wake_up(), the caller will always wake any waiters.  With the simple
 queues, a swait_active occurs locklessly prior to the wakeup so
 additional synchronization may be needed on behalf of the caller.

diff --git a/include/linux/swait.h b/include/linux/swait.h
index c1f9c62..fede974 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -79,6 +79,33 @@ extern void __init_swait_queue_head(struct swait_queue_head 
*q, const char *name
DECLARE_SWAIT_QUEUE_HEAD(name)
 #endif
 
+/**
+ * swait_active -- locklessly test for waiters on the queue
+ * @q: the swait_queue to test for waiters
+ *
+ * returns true if the wait list is not empty
+ *
+ * NOTE: this function is lockless and requires care, incorrect usage _will_
+ * lead to sporadic and non-obvious failure.
+ *
+ * Use either while holding swait_queue_head_t::lock or when used for wakeups
+ * with an extra smp_mb() like:
+ *
+ *  CPU0 - wakerCPU1 - waiter
+ *
+ *  @cond = true;
+ *  smp_mb();
+ *  if (swait_active(wq))   swait_event(wq, cond);
+ *swake_up(wq);
+ *
+ *
+ * Because without the explicit smp_mb() it's possible for the
+ * swait_active() load to get hoisted over the @cond store such that we'll
+ * observe an empty wait list while the waiter might not observe @cond.
+ *
+ * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
+ * which (when the lock is uncontended) are of roughly equal cost.
+ */
 static inline int swait_active(struct swait_queue_head *q)
 {
return !list_empty(&q->task_list);
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 3d5610d..6e949a8 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -29,6 +29,16 @@ void swake_up_locked(struct swait_queue_head *q)
 }
 EXPORT_SYMBOL(swake_up_locked);
 
+/**
+ * swake_up - wake up a waiter on a simple waitqueue
+ * @q: the simple wait queue head
+ *
+ * In order for this to function properly, since it uses swait_active()
+ * internally, some kind of memory barrier must occur prior to calling this.
+ * Typically, this will be smp_mb__after_atomic(), but if the value is
+ * manipulated non-atomically, one may need to use a less regular barrier, such
+ * as smp_mb().  spin_unlock() does not guarantee a memory barrier.
+ */
 void swake_up(struct swait_queue_head *q)
 {
unsigned long flags;
@@ -45,7 +55,13 @@ EXPORT_SYMBOL(swake_up);
 /*
  * Does not allow usage from IRQ disabled, since we must be able to
  * release IRQs to guarantee bounded hold time.
- */
+ *
+ * In order for this to function properly, since it uses swait_active()
+ * internally, some kind of memory barrier must occur prior to calling this.
+ * Typically, this will be smp_mb__after_atomic(), but if the value is
+ * manipulated non-atomically, one may need to use a less regular barrier, such
+ * as smp_mb().  spin_unlock() does not guarantee a memory barrier.
+ */
 void swake_up_all(struct swait_queue_head *q)
 {
struct swait_queue *curr;
-- 
2.7.4



Re: [PATCH tip/core/rcu 45/88] rcu: Add memory barriers for NOCB leader wakeup

2017-06-08 Thread Krister Johansen
On Thu, Jun 08, 2017 at 04:47:43PM -0700, Paul E. McKenney wrote:
> On Thu, Jun 08, 2017 at 02:28:14PM -0700, Krister Johansen wrote:
> > On Thu, Jun 08, 2017 at 01:55:00PM -0700, Paul E. McKenney wrote:
> > > Interesting!  This is the first that I have heard that this was anything
> > > other than a theoretical bug.  To the comment in your second URL, it is
> > > wise to recall that a seismologist was in fact arrested for failing to
> > > predict an earthquake.  Later acquitted/pardoned/whatever, but arrested
> > > nonetheless.  ;-)
> > 
> > Point taken.  I do realize that we all make mistakes, and certainly I do
> > too.
> 
> Indeed!  Let's just say that the author of that email will have no
> trouble returning the favor, and sooner rather than later.  ;-)

No doubt he's polishing up an extra small extra tight pair of handcuffs
with my name on them.

> >   Perhaps I should have said that my survey of current callers of
> > swake_up() was enough to convince me that I didn't have an immediate
> > problem elsewhere, but that I'm not familiar enough with the code base
> > to make that statement with a lot of authority.  The concern being that if
> > the patch came from RT-linux where the barrier was present in
> > swake_up(), are there other places where swake_up() callers still assume
> > this is being handled on their behalf?
> > 
> > As part of this, I also pondered whether I should add a comment around
> > swake_up(), similar to what's already there for waitqueue_active.
> > I wasn't sure how subtle this is for other consumers, though.
> 
> In my case, I assume I need barriers for swake_up(), which is why I
> found this bug by inspection.  Still, I wouldn't mind a comment.
> Others might have other opinions.

Since you don't mind, I've prepared a small patch for those comments.  I'll
send that in a separate thread.

Thanks again,

-K


Re: [PATCH tip/core/rcu 45/88] rcu: Add memory barriers for NOCB leader wakeup

2017-06-08 Thread Krister Johansen
On Thu, Jun 08, 2017 at 01:55:00PM -0700, Paul E. McKenney wrote:
> On Thu, Jun 08, 2017 at 01:11:48PM -0700, Krister Johansen wrote:
> > May I impose upon you to CC this patch to stable, and tag it as fixing
> > abedf8e241?  I ran into this on a production 4.9 branch.  When I
> > debugged it, I discovered that it went all the way back to 4.6.  The
> > tl;dr is that at least for some environments, the missed wakeup
> > manifests itself as a series of hung-task warnings to console and if I'm
> > unlucky it can also generate a hang that can block interactive logins
> > via ssh.
> 
> Interesting!  This is the first that I have heard that this was anything
> other than a theoretical bug.  To the comment in your second URL, it is
> wise to recall that a seismologist was in fact arrested for failing to
> predict an earthquake.  Later acquitted/pardoned/whatever, but arrested
> nonetheless.  ;-)

Point taken.  I do realize that we all make mistakes, and certainly I do
too.  Perhaps I should have said that my survey of current callers of
swake_up() was enough to convince me that I didn't have an immediate
problem elsewhere, but that I'm not familiar enough with the code base
to make that statement with a lot of authority.  The concern being that if
the patch came from RT-linux where the barrier was present in
swake_up(), are there other places where swake_up() callers still assume
this is being handled on their behalf?

As part of this, I also pondered whether I should add a comment around
swake_up(), similar to what's already there for waitqueue_active.
I wasn't sure how subtle this is for other consumers, though.

> Silliness aside, does my patch actually fix your problem in practice as
> well as in theory?  If so, may I have your Tested-by?

Yes, it absolutely does.  Consider it given:

Tested-by: Krister Johansen 

> Impressive investigative effort, by the way!

Thanks!

-K


Re: [PATCH tip/core/rcu 45/88] rcu: Add memory barriers for NOCB leader wakeup

2017-06-08 Thread Krister Johansen
Hi Paul,

On Thu, May 25, 2017 at 02:59:18PM -0700, Paul E. McKenney wrote:
> Wait/wakeup operations do not guarantee ordering on their own.  Instead,
> either locking or memory barriers are required.  This commit therefore
> adds memory barriers to wake_nocb_leader() and nocb_leader_wait().
> 
> Signed-off-by: Paul E. McKenney 
> ---
>  kernel/rcu/tree_plugin.h | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> index 0b1042545116..573fbe9640a0 100644
> --- a/kernel/rcu/tree_plugin.h
> +++ b/kernel/rcu/tree_plugin.h
> @@ -1810,6 +1810,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool 
> force)
>   if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
>   /* Prior smp_mb__after_atomic() orders against prior enqueue. */
>   WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
> + smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
>   swake_up(&rdp_leader->nocb_wq);
>   }
>  }
> @@ -2064,6 +2065,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
>* nocb_gp_head, where they await a grace period.
>*/
>   gotcbs = false;
> + smp_mb(); /* wakeup before ->nocb_head reads. */
>   for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
>   rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
>   if (!rdp->nocb_gp_head)

May I impose upon you to CC this patch to stable, and tag it as fixing
abedf8e241?  I ran into this on a production 4.9 branch.  When I
debugged it, I discovered that it went all the way back to 4.6.  The
tl;dr is that at least for some environments, the missed wakeup
manifests itself as a series of hung-task warnings to console and if I'm
unlucky it can also generate a hang that can block interactive logins
via ssh.

In case it's useful, I'm attching the RCA that I sent out before I
circulated a similar patch for internal review.  You beat me to the
punch in terms of getting a patch out a public mailing list.

--- >8 cut here 8< ---

Production boxes go through fits where they generate spurts of hung task
warnings.  Sometimes, the slowdown is enough to ensnare a process
holding the rtnl_lock, or prevent systemd from letting ssh logins on to
the box.

Below is a splat from a relatively benign ocurrence:

   INFO: task uc-spawn:34443 blocked for more than 120 seconds.
 Not tainted 4.9.4-1.el7.x86_64 #1
   "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
   uc-spawnD0 34443  1 0x0082
883fec88e4c0  883ff1b4ac00 883ffddd9540
883fee999600 c9001c907c30 8175e3f6 0202
883fee999600 7fff 883fee999600 7fff
   Call Trace:
[] ? __schedule+0x1a6/0x650
[] schedule+0x36/0x80
[] schedule_timeout+0x21c/0x3a0
[] wait_for_completion+0xf2/0x130
[] ? wake_up_q+0x80/0x80
[] __wait_rcu_gp+0xd7/0x100
[] synchronize_sched+0x5e/0x80
[] ? __call_rcu+0x320/0x320
[] ? trace_raw_output_rcu_utilization+0x60/0x60
[] kern_unmount+0x2b/0x40
[] mq_put_mnt+0x15/0x20
[] put_ipc_ns+0x3e/0xa0
[] free_nsproxy+0x38/0x90
[] switch_task_namespaces+0x59/0x60
[] exit_task_namespaces+0x10/0x20
[] do_exit+0x2b7/0xac0
[] ? syscall_trace_enter+0x1d0/0x2b0
[] do_group_exit+0x3f/0xb0
[] SyS_exit_group+0x14/0x20
[] do_syscall_64+0x67/0x180
[] entry_SYSCALL64_slow_path+0x25/0x25

In order to debug this further, a kdump was obtained.

crash> ps 34443
   PIDPPID  CPU   TASKST  %MEM VSZRSS  COMM
  34443  1  33  883fee999600  UN   0.0   0  0  uc-spawn

crash> bt 34443
PID: 34443  TASK: 883fee999600  CPU: 33  COMMAND: "uc-spawn"
 #0 [c9001c907bb8] __schedule at 8175e3f6
 #1 [c9001c907c38] schedule at 8175e8d6
 #2 [c9001c907c50] schedule_timeout at 81761aec
 #3 [c9001c907cf0] wait_for_completion at 8175f3b2
 #4 [c9001c907d50] __wait_rcu_gp at 810eec27
 #5 [c9001c907da0] synchronize_sched at 810f264e
 #6 [c9001c907df0] kern_unmount at 8125673b
 #7 [c9001c907e08] mq_put_mnt at 812d6f85
 #8 [c9001c907e18] put_ipc_ns at 812d76de
 #9 [c9001c907e30] free_nsproxy at 810ab768
#10 [c9001c907e48] switch_task_namespaces at 810ab8c9
#11 [c9001c907e70] exit_task_namespaces at 810ab8e0
#12 [c9001c907e80] do_exit at 8108d787
#13 [c9001c907ee8] do_group_exit at 8108e00f
#14 [c9001c907f18] sys_exit_group at 8108e094
#15 [c9001c907f28] do_syscall_64 at 81003a47
RIP: 7f6b6e1f4209  RSP: 7ffc488028b8  RFLAGS: 0246
RAX: ffda  RBX: 0001  RCX: 7f6b6e1f4209
RDX: 0001  RSI:   RDI: 0001
RBP: 7f6b6e4ea840   R8: 003c   R9: 0

[tip:perf/urgent] perf callchain: Reference count maps

2017-02-03 Thread tip-bot for Krister Johansen
Commit-ID:  aa33b9b9a2ebb00d33c83a5312d4fbf2d5aeba36
Gitweb: http://git.kernel.org/tip/aa33b9b9a2ebb00d33c83a5312d4fbf2d5aeba36
Author: Krister Johansen 
AuthorDate: Thu, 5 Jan 2017 22:23:31 -0800
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 2 Feb 2017 11:39:09 -0300

perf callchain: Reference count maps

If dso__load_kcore frees all of the existing maps, but one has already
been attached to a callchain cursor node, then we can get a SIGSEGV in
any function that happens to try to use this invalid cursor.  Use the
existing map refcount mechanism to forestall cleanup of a map until the
cursor iterates past the node.

Signed-off-by: Krister Johansen 
Tested-by: Arnaldo Carvalho de Melo 
Cc: Frederic Weisbecker 
Cc: Masami Hiramatsu 
Cc: Namhyung Kim 
Cc: sta...@kernel.org
Fixes: 84c2cafa2889 ("perf tools: Reference count struct map")
Link: http://lkml.kernel.org/r/20170106062331.gb2...@templeofstupid.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/callchain.c | 11 +--
 tools/perf/util/callchain.h |  6 ++
 tools/perf/util/hist.c  |  7 +++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 4292251..8b610dd 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -437,7 +437,7 @@ fill_node(struct callchain_node *node, struct 
callchain_cursor *cursor)
}
call->ip = cursor_node->ip;
call->ms.sym = cursor_node->sym;
-   call->ms.map = cursor_node->map;
+   call->ms.map = map__get(cursor_node->map);
 
if (cursor_node->branch) {
call->branch_count = 1;
@@ -477,6 +477,7 @@ add_child(struct callchain_node *parent,
 
list_for_each_entry_safe(call, tmp, &new->val, list) {
list_del(&call->list);
+   map__zput(call->ms.map);
free(call);
}
free(new);
@@ -761,6 +762,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
list->ms.map, list->ms.sym,
false, NULL, 0, 0);
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
@@ -811,7 +813,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor,
}
 
node->ip = ip;
-   node->map = map;
+   map__zput(node->map);
+   node->map = map__get(map);
node->sym = sym;
node->branch = branch;
node->nr_loop_iter = nr_loop_iter;
@@ -1142,11 +1145,13 @@ static void free_callchain_node(struct callchain_node 
*node)
 
list_for_each_entry_safe(list, tmp, &node->parent_val, list) {
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
list_for_each_entry_safe(list, tmp, &node->val, list) {
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
@@ -1210,6 +1215,7 @@ int callchain_node__make_parent_list(struct 
callchain_node *node)
goto out;
*new = *chain;
new->has_children = false;
+   map__get(new->ms.map);
list_add_tail(&new->list, &head);
}
parent = parent->parent;
@@ -1230,6 +1236,7 @@ int callchain_node__make_parent_list(struct 
callchain_node *node)
 out:
list_for_each_entry_safe(chain, new, &head, list) {
list_del(&chain->list);
+   map__zput(chain->ms.map);
free(chain);
}
return -ENOMEM;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 35c8e37..4f4b60f 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include "event.h"
+#include "map.h"
 #include "symbol.h"
 
 #define HELP_PAD "\t\t\t\t"
@@ -184,8 +185,13 @@ int callchain_merge(struct callchain_cursor *cursor,
  */
 static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
 {
+   struct callchain_cursor_node *node;
+
cursor->nr = 0;
cursor->last = &cursor->first;
+
+   for (node = cursor->first; node != NULL; node = node->next)
+   map__zput(node->map);
 }
 
 int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 6770a96..7d1b7d3 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1,6 +1,7 @@
 #include "util

[tip:perf/core] perf callchain: Reference count maps

2017-02-01 Thread tip-bot for Krister Johansen
Commit-ID:  9c68ae98c6f714ef573826cfc9055af1bd5e97b1
Gitweb: http://git.kernel.org/tip/9c68ae98c6f714ef573826cfc9055af1bd5e97b1
Author: Krister Johansen 
AuthorDate: Thu, 5 Jan 2017 22:23:31 -0800
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 31 Jan 2017 16:19:06 -0300

perf callchain: Reference count maps

If dso__load_kcore frees all of the existing maps, but one has already
been attached to a callchain cursor node, then we can get a SIGSEGV in
any function that happens to try to use this invalid cursor.  Use the
existing map refcount mechanism to forestall cleanup of a map until the
cursor iterates past the node.

Signed-off-by: Krister Johansen 
Tested-by: Arnaldo Carvalho de Melo 
Cc: Frederic Weisbecker 
Cc: Masami Hiramatsu 
Cc: Namhyung Kim 
Cc: sta...@kernel.org
Fixes: 84c2cafa2889 ("perf tools: Reference count struct map")
Link: http://lkml.kernel.org/r/20170106062331.gb2...@templeofstupid.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/callchain.c | 11 +--
 tools/perf/util/callchain.h |  6 ++
 tools/perf/util/hist.c  |  7 +++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index e16db30..aba9534 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -449,7 +449,7 @@ fill_node(struct callchain_node *node, struct 
callchain_cursor *cursor)
}
call->ip = cursor_node->ip;
call->ms.sym = cursor_node->sym;
-   call->ms.map = cursor_node->map;
+   call->ms.map = map__get(cursor_node->map);
 
if (cursor_node->branch) {
call->branch_count = 1;
@@ -489,6 +489,7 @@ add_child(struct callchain_node *parent,
 
list_for_each_entry_safe(call, tmp, &new->val, list) {
list_del(&call->list);
+   map__zput(call->ms.map);
free(call);
}
free(new);
@@ -773,6 +774,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
list->ms.map, list->ms.sym,
false, NULL, 0, 0);
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
@@ -823,7 +825,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor,
}
 
node->ip = ip;
-   node->map = map;
+   map__zput(node->map);
+   node->map = map__get(map);
node->sym = sym;
node->branch = branch;
node->nr_loop_iter = nr_loop_iter;
@@ -1154,11 +1157,13 @@ static void free_callchain_node(struct callchain_node 
*node)
 
list_for_each_entry_safe(list, tmp, &node->parent_val, list) {
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
list_for_each_entry_safe(list, tmp, &node->val, list) {
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
@@ -1222,6 +1227,7 @@ int callchain_node__make_parent_list(struct 
callchain_node *node)
goto out;
*new = *chain;
new->has_children = false;
+   map__get(new->ms.map);
list_add_tail(&new->list, &head);
}
parent = parent->parent;
@@ -1242,6 +1248,7 @@ int callchain_node__make_parent_list(struct 
callchain_node *node)
 out:
list_for_each_entry_safe(chain, new, &head, list) {
list_del(&chain->list);
+   map__zput(chain->ms.map);
free(chain);
}
return -ENOMEM;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 35c8e37..4f4b60f 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include "event.h"
+#include "map.h"
 #include "symbol.h"
 
 #define HELP_PAD "\t\t\t\t"
@@ -184,8 +185,13 @@ int callchain_merge(struct callchain_cursor *cursor,
  */
 static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
 {
+   struct callchain_cursor_node *node;
+
cursor->nr = 0;
cursor->last = &cursor->first;
+
+   for (node = cursor->first; node != NULL; node = node->next)
+   map__zput(node->map);
 }
 
 int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index cff2e90..32c6a93 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1,6 +1,7 @@
 #include "util

Re: [PATCH v3 perf/core] perf script: fix a use after free crash.

2017-01-20 Thread Krister Johansen
Hey Arnaldo,

On Thu, Jan 05, 2017 at 10:23:31PM -0800, Krister Johansen wrote:
> If dso__load_kcore frees all of the existing maps, but one has already
> been attached to a callchain cursor node, then we can get a SIGSEGV in
> any function that happens to try to use this invalid cursor.  Use the
> existing map refcount mechanism to forestall cleanup of a map until the
> cursor iterates past the node.

It's been a couple of weeks since I sent you the v3 of this patch.  Last
time I fiddled with it, I was able to reproduce your 'perf top' core,
and was able to verify that the latest patch I sent out could survive
running 'perf top' through the course of a full kernel build.

Is there anything else I can do to help with this one?

Thanks,

-K


Re: [PATCH v2 perf/core] perf script: fix a use after free crash.

2017-01-05 Thread Krister Johansen
On Wed, Jan 04, 2017 at 12:37:39AM -0800, Krister Johansen wrote:
> On Mon, Jan 02, 2017 at 09:30:33PM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Mon, Jan 02, 2017 at 04:39:04PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > Em Mon, Jan 02, 2017 at 02:36:57PM -0300, Arnaldo Carvalho de Melo 
> > > escreveu:
> > > > Em Mon, Jan 02, 2017 at 02:35:30PM -0300, Arnaldo Carvalho de Melo 
> > > > escreveu:
> > > > > Em Mon, Jan 02, 2017 at 12:15:14PM -0300, Arnaldo Carvalho de Melo 
> > > > > escreveu:
> > > >  {
> > > > zfree(&iter->priv);
> > > > iter->he = NULL;
> > > > +   map__zput(al->map);
> > > 
> > > What this pairs to? I was expecting that since this is called via:
> > > 
> > >hist_entry_iter__add()
> > >{
> > >
> > >err2 = iter->ops->finish_entry(iter, al);
> > >}
> > > 
> > > Then it would have to match something done earlier in
> > > hist_entry_iter__add(), most likely by some iter->ops->() method, but I
> > > couldn'd find anything to that extent, can you clarify?
> > 
> > With the following patch it has been running all day, care to explain
> > why it is needed? I need to run this on valgrind or with Masami's
> > refcount debugger to get more clues :-\
>
> Let me try a version of this patch that dispenses with the code in both
> fill_callchain_info() and iter_finish_cumulative_entry.  However, before
> I do that I'll make sure I can figure out how to reproduce the core that
> you were seeing.  I don't want to send you yet another patch that
> doesn't run.  The busy system may be the clue I needed.  I had been
> running these on a mostly idle system.

I was able to reproduce your assertion failure using more load on a
system.  After fixing up the issues caused by incorrectly updating the
refcounts in the hist_entry_iter's function pointers, I re-ran my own
tests and found that I was still getting a failure for 'perf report'.

The problem there seemed to be that a map was getting into a hist entry,
but was freed before the entry was actually created.  The call to
hist_entry__init() that took a reference on the map was actually getting
freed memory by the time it tried to increment the reference count.

To address that, I modified hist_entry_iter__add() to take a reference
on the incoming al->map, if one exists.  It drops that reference on the
way out of the function.  With that change, I'm able to pass both your
'perf top' test under load and my own tests against a kernel without
debug info, but a kmod that has it.

I'll send out a v3 in just a moment.

Thanks again,

-K


[PATCH v3 perf/core] perf script: fix a use after free crash.

2017-01-05 Thread Krister Johansen
If dso__load_kcore frees all of the existing maps, but one has already
been attached to a callchain cursor node, then we can get a SIGSEGV in
any function that happens to try to use this invalid cursor.  Use the
existing map refcount mechanism to forestall cleanup of a map until the
cursor iterates past the node.

Signed-off-by: Krister Johansen 
---
 tools/perf/util/callchain.c | 11 +--
 tools/perf/util/callchain.h |  6 ++
 tools/perf/util/hist.c  | 10 ++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 4292251..8b610dd 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -437,7 +437,7 @@ fill_node(struct callchain_node *node, struct 
callchain_cursor *cursor)
}
call->ip = cursor_node->ip;
call->ms.sym = cursor_node->sym;
-   call->ms.map = cursor_node->map;
+   call->ms.map = map__get(cursor_node->map);
 
if (cursor_node->branch) {
call->branch_count = 1;
@@ -477,6 +477,7 @@ add_child(struct callchain_node *parent,
 
list_for_each_entry_safe(call, tmp, &new->val, list) {
list_del(&call->list);
+   map__zput(call->ms.map);
free(call);
}
free(new);
@@ -761,6 +762,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
list->ms.map, list->ms.sym,
false, NULL, 0, 0);
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
@@ -811,7 +813,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor,
}
 
node->ip = ip;
-   node->map = map;
+   map__zput(node->map);
+   node->map = map__get(map);
node->sym = sym;
node->branch = branch;
node->nr_loop_iter = nr_loop_iter;
@@ -1142,11 +1145,13 @@ static void free_callchain_node(struct callchain_node 
*node)
 
list_for_each_entry_safe(list, tmp, &node->parent_val, list) {
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
list_for_each_entry_safe(list, tmp, &node->val, list) {
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
@@ -1210,6 +1215,7 @@ int callchain_node__make_parent_list(struct 
callchain_node *node)
goto out;
*new = *chain;
new->has_children = false;
+   map__get(new->ms.map);
list_add_tail(&new->list, &head);
}
parent = parent->parent;
@@ -1230,6 +1236,7 @@ int callchain_node__make_parent_list(struct 
callchain_node *node)
 out:
list_for_each_entry_safe(chain, new, &head, list) {
list_del(&chain->list);
+   map__zput(chain->ms.map);
free(chain);
}
return -ENOMEM;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 35c8e37..4f4b60f 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include "event.h"
+#include "map.h"
 #include "symbol.h"
 
 #define HELP_PAD "\t\t\t\t"
@@ -184,8 +185,13 @@ int callchain_merge(struct callchain_cursor *cursor,
  */
 static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
 {
+   struct callchain_cursor_node *node;
+
cursor->nr = 0;
cursor->last = &cursor->first;
+
+   for (node = cursor->first; node != NULL; node = node->next)
+   map__zput(node->map);
 }
 
 int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 6770a96..9dabbae 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1,6 +1,7 @@
 #include "util.h"
 #include "build-id.h"
 #include "hist.h"
+#include "map.h"
 #include "session.h"
 #include "sort.h"
 #include "evlist.h"
@@ -1019,6 +1020,12 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, 
struct addr_location *al,
 int max_stack_depth, void *arg)
 {
int err, err2;
+   struct map *alm;
+
+   alm = NULL;
+
+   if (al && al->map)
+   alm = map__get(al->map);
 
err = sample__resolve_callchain(iter->sample, &callchain_cursor, 
&iter->parent,
iter->evsel, al, max_stack_depth);
@@ -1058,6 +1065,9 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, 
struct addr_location *al,
if (!err)
err = err2;
 
+   if (alm)
+   map__put(alm);
+
return err;
 }
 
-- 
2.7.4



Re: [PATCH v2 perf/core] perf script: fix a use after free crash.

2017-01-04 Thread Krister Johansen
On Mon, Jan 02, 2017 at 09:30:33PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Mon, Jan 02, 2017 at 04:39:04PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Mon, Jan 02, 2017 at 02:36:57PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > Em Mon, Jan 02, 2017 at 02:35:30PM -0300, Arnaldo Carvalho de Melo 
> > > escreveu:
> > > > Em Mon, Jan 02, 2017 at 12:15:14PM -0300, Arnaldo Carvalho de Melo 
> > > > escreveu:
> > >  {
> > > zfree(&iter->priv);
> > > iter->he = NULL;
> > > +   map__zput(al->map);
> > 
> > What this pairs to? I was expecting that since this is called via:
> > 
> >hist_entry_iter__add()
> >{
> >
> >err2 = iter->ops->finish_entry(iter, al);
> >}
> > 
> > Then it would have to match something done earlier in
> > hist_entry_iter__add(), most likely by some iter->ops->() method, but I
> > couldn'd find anything to that extent, can you clarify?
> 
> With the following patch it has been running all day, care to explain
> why it is needed? I need to run this on valgrind or with Masami's
> refcount debugger to get more clues :-\
> 
> - Arnaldo
> 
> diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> index 72f5c82798e9..c27bda16e9cd 100644
> --- a/tools/perf/util/hist.c
> +++ b/tools/perf/util/hist.c
> @@ -980,7 +980,6 @@ iter_finish_cumulative_entry(struct hist_entry_iter *iter,
>  {
>   zfree(&iter->priv);
>   iter->he = NULL;
> - map__zput(al->map);
>  
>   return 0;
>  }

Thanks for taking the time to debug it this far.  I certainly appreciate
it.

The map_zput() in iter_finish_cumulative_entry() was intended to offset
the map_get() in iter_next_cumulative_entry() -> fill_callchain_info().
The call to fill_callchain_info should perform a map__get() on the
addr_location that gets passed in.  However, it looks like I mistakenly
assumed that fill_callcahin_info would always get called from
iter_next_cumulative_entry when clearly that doesn't happen if
callchain_cursor_current returns a NULL node.

Looking back, it's not entirely clear to me that the map__get() in
fill_callchain_info is necessary either, since its only caller is the
hist_iter_cumulative used by hist_entry_iter__add.

Let me try a version of this patch that dispenses with the code in both
fill_callchain_info() and iter_finish_cumulative_entry.  However, before
I do that I'll make sure I can figure out how to reproduce the core that
you were seeing.  I don't want to send you yet another patch that
doesn't run.  The busy system may be the clue I needed.  I had been
running these on a mostly idle system.

Thanks again, and apologies for all of the inconvenience.

-K


Re: [PATCH v4 0/3] perf: add support for analyzing events for containers

2017-01-04 Thread Krister Johansen
On Tue, Jan 03, 2017 at 04:57:54PM +0530, Hari Bathini wrote:
> On Thursday 29 December 2016 07:11 AM, Krister Johansen wrote:
> >On Fri, Dec 16, 2016 at 12:06:55AM +0530, Hari Bathini wrote:
> >>This patch-set overcomes this limitation by using cgroup identifier as
> >>container unique identifier. A new PERF_RECORD_NAMESPACES event that
> >>records namespaces related info is introduced, from which the cgroup
> >>namespace's device & inode numbers are used as cgroup identifier. This
> >>is based on the assumption that each container is created with it's own
> >>cgroup namespace allowing assessment/analysis of multiple containers
> >>using cgroup identifier.
> >Why choose cgroups when the kernel dispenses namespace-unique
> >identifiers. Cgroup membership can be arbitrary.  Moreover, cgroup and
> 
> Agreed. But doesn't that hold for any other namespace or a combination
> of namespaces as well?

I guess that's part of my concern.  There is no container-unique
identifier on the system, since the notion of containers is a construct
of higer-level software.  You're depending on the fact that some popular
container software packages put their processes in separate cgroups.
Some of the stranger problems I've debugged with containers involve
abuses of nsenter(1) and shared subtrees.  In cases like that, if you
filter by cgroup you may miss other interfering processes that are in
one or more of the namespaces associated with the container, but not its
cgroup.  It's possible I misunderstood.  Is the cgroup id being used to
filter events, or just for display purposes?

-K


Re: [PATCH v4 0/3] perf: add support for analyzing events for containers

2016-12-28 Thread Krister Johansen
On Fri, Dec 16, 2016 at 12:06:55AM +0530, Hari Bathini wrote:
> This patch-set overcomes this limitation by using cgroup identifier as
> container unique identifier. A new PERF_RECORD_NAMESPACES event that
> records namespaces related info is introduced, from which the cgroup
> namespace's device & inode numbers are used as cgroup identifier. This
> is based on the assumption that each container is created with it's own
> cgroup namespace allowing assessment/analysis of multiple containers
> using cgroup identifier.

Why choose cgroups when the kernel dispenses namespace-unique
identifiers. Cgroup membership can be arbitrary.  Moreover, cgroup and
namespace destruction are handled by separate subsystems.  It's possible
to have a cgroup notifier run prior to network namespace teardown
occurring.

If it were me, I'd re-use existing convention to identify the namespaces
you want to monitor.  The code in nsenter(1) can take a namespace that's
been bind mount'd on a file, or extract the ns information from a task
in /procfs.

My biggest concern is how the sample data is handled after it has been
collected.  Both namespaces and cgroups don't survive reboots.  Will the
records will contain all the persistent state needed to run a report or
script command at a later date?

Does this code attempt to enter alternate namespaces in order to record
stack/symbol information for a '-g' style trace?  If so, how are you
holding on to that information?  There's no guarantee that a particular
container will be alive or have its filesystems reachable from the host
if the trace data is evaluated at a later time.

-K


Re: [PATCH v2 perf/core] perf script: fix a use after free crash.

2016-12-28 Thread Krister Johansen
On Tue, Nov 22, 2016 at 04:01:06PM -0300, Arnaldo Carvalho de Melo wrote:
> Sorry for the overly long delay, trying it now after fixing up a
> conflict with a recent patchkit (branch stuff) I tested it by running
> 'perf top -g' and I'm getting some assertion bugs:
> 
> 
> # perf top -g
>1.34% filemap_map_pages
>  - 0.59% alloc_pages_vma
>   1.20% __alloc_pages_nodemask
> -5.87% 0.45%  [kernel][k] handle_mm_fault
>- 1.94% handle_mm_fault
> 1.34% filemap_map_pages
>   - 0.59% alloc_pages_vma
>1.22% __alloc_pages_nodemask
> +5.75% 0.03%  perf[.] 
> hist_entry_iter__add
> +4.46% 0.00%  [unknown]   [.] 
> -4.06% 2.74%  libc-2.23.so[.] _int_malloc
>- 1.95% 0
> 1.94% _int_malloc
> -3.20% 0.23%  perf[.] 
> iter_add_next_cumulative_entry
>- 1.49% iter_add_next_cumulative_entry
>   - 1.43% __hists__add_entry
>  2.58% 0.01%  [kernel][k] 
> return_from_SYSCALL_64
>  2.57% 2.55%  libperl.so.5.22.2   [.] Perl_fbm_instr
> -2.54% 2.51%  liblzma.so.5.2.2[.] lzma_decode
>- 2.51% lzma_decode
>  2.33% 0.00%  ld-2.23.so  [.] _dl_sysdep_start
> +2.24% 0.04%  ld-2.23.so  [.] dl_main
>  2.13% 0.03%  [kernel][k] ext4_readdir
>  2.09% 0.01%  [kernel][k] sys_newstat
>  2.08% 0.04%  [kernel][k] vfs_fstatat
>  2.07% 0.02%  [kernel][k] SYSC_newstat
>  2.02% 0.01%  [kernel][k] iterate_dir
> -1.96% 0.17%  [kernel][k] 
> __alloc_pages_nodemask
>- 1.37% __alloc_pages_nodemask
> perf: util/map.c:246: map__exit: Assertion 
> `!(!((&map->rb_node)->__rb_parent_color == (unsigned long)(&map->rb_node)))' 
> failed.
>   
>  Aborted (core dumped)
> [root@jouet ~]# 
> 
> 
> I'll try to investigate this further later/tomorrow, find the updated patch 
> below.
> 
> - Arnaldo
> 
> commit af04d2c4a5d1f6bd7f4971118e4e1153cc7c2506
> Author: Krister Johansen 
> Date:   Tue Oct 11 02:28:39 2016 -0700
> 
> perf callchain: Fix a use after free crash due to refcounting bug
> 
> If dso__load_kcore frees all of the existing maps, but one has already
> been attached to a callchain cursor node, then we can get a SIGSEGV in
> any function that happens to try to use this invalid cursor.  Use the
> existing map refcount mechanism to forestall cleanup of a map until the
> cursor iterates past the node.
> 
> Signed-off-by: Krister Johansen 
> Cc: Frederic Weisbecker 
> Cc: Masami Hiramatsu 
> Cc: Namhyung Kim 
> Link: http://lkml.kernel.org/r/20161011092839.gc7...@templeofstupid.com
> Signed-off-by: Arnaldo Carvalho de Melo 
> 
> diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
> index 823befd8209a..18bb7caee535 100644
> --- a/tools/perf/util/callchain.c
> +++ b/tools/perf/util/callchain.c
> @@ -437,7 +437,7 @@ fill_node(struct callchain_node *node, struct 
> callchain_cursor *cursor)
>   }
>   call->ip = cursor_node->ip;
>   call->ms.sym = cursor_node->sym;
> - call->ms.map = cursor_node->map;
> + call->ms.map = map__get(cursor_node->map);
>  
>   if (cursor_node->branch) {
>   call->branch_count = 1;
> @@ -477,6 +477,7 @@ add_child(struct callchain_node *parent,
>  
>   list_for_each_entry_safe(call, tmp, &new->val, list) {
>   list_del(&call->list);
> + map__zput(call->ms.map);
>   free(call);
>   }
>   free(new);
> @@ -761,6 +762,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
>   list->ms.map, list->ms.sym,
>   false, NULL, 0, 0);
>   list_del(&list->list);
> + map__zput(list->ms.map);
>   free(list);
>   }
>  
> @@ -811,7 +813,8 @@ int callchain_cursor_append(struct callchain_cursor 
> *cursor,
>   }
>

Re: [PATCH v2 perf/core] perf script: fix a use after free crash.

2016-12-01 Thread Krister Johansen
Hey Arnaldo,

On Tue, Nov 22, 2016 at 04:01:06PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Thu, Nov 10, 2016 at 04:40:46PM -0800, Krister Johansen escreveu:
> > Thanks.  As part of processing this did you run into any problems?
> > Would you like me to rebase against the latest perf/core and re-send the
> > patch?
> 
> Sorry for the overly long delay, trying it now after fixing up a
> conflict with a recent patchkit (branch stuff) I tested it by running
> 'perf top -g' and I'm getting some assertion bugs:

I appreciate you taking another stab at pulling this in.  My turn to
apologize for the delay.

> # perf top -g
>1.34% filemap_map_pages
>  - 0.59% alloc_pages_vma
>   1.20% __alloc_pages_nodemask
> -5.87% 0.45%  [kernel][k] handle_mm_fault
>- 1.94% handle_mm_fault
> 1.34% filemap_map_pages
>   - 0.59% alloc_pages_vma
>1.22% __alloc_pages_nodemask
> +5.75% 0.03%  perf[.] 
> hist_entry_iter__add
> +4.46% 0.00%  [unknown]   [.] 
> -4.06% 2.74%  libc-2.23.so[.] _int_malloc
>- 1.95% 0
> 1.94% _int_malloc
> -3.20% 0.23%  perf[.] 
> iter_add_next_cumulative_entry
>- 1.49% iter_add_next_cumulative_entry
>   - 1.43% __hists__add_entry
>  2.58% 0.01%  [kernel][k] 
> return_from_SYSCALL_64
>  2.57% 2.55%  libperl.so.5.22.2   [.] Perl_fbm_instr
> -2.54% 2.51%  liblzma.so.5.2.2[.] lzma_decode
>- 2.51% lzma_decode
>  2.33% 0.00%  ld-2.23.so  [.] _dl_sysdep_start
> +2.24% 0.04%  ld-2.23.so  [.] dl_main
>  2.13% 0.03%  [kernel][k] ext4_readdir
>  2.09% 0.01%  [kernel][k] sys_newstat
>  2.08% 0.04%  [kernel][k] vfs_fstatat
>  2.07% 0.02%  [kernel][k] SYSC_newstat
>  2.02% 0.01%  [kernel][k] iterate_dir
> -1.96% 0.17%  [kernel][k] 
> __alloc_pages_nodemask
>- 1.37% __alloc_pages_nodemask
> perf: util/map.c:246: map__exit: Assertion 
> `!(!((&map->rb_node)->__rb_parent_color == (unsigned long)(&map->rb_node)))' 
> failed.

Assuming that I'd failed to test 'perf top -g' I went ahead and re-ran
this with the last version of the patch I sent out parented against the
4.8 STABLE branch.  That didn't trigger any assertion failures for me.

Is this branch that gave you merge conflicts now in perf/core or
otherwise publicly avilable?  If so, I'd be happy to try to resolve any
conflicts and re-test against it.  The copy of the patch you sent out
didn't look obviously incorrect.

Thanks,

-K


Re: [PATCH v2 perf/core] perf script: fix a use after free crash.

2016-11-10 Thread Krister Johansen
On Wed, Oct 26, 2016 at 11:44:53AM -0200, Arnaldo Carvalho de Melo wrote:
> Em Tue, Oct 25, 2016 at 05:20:10PM -0700, Krister Johansen escreveu:
> > On Tue, Oct 11, 2016 at 02:28:39AM -0700, Krister Johansen wrote:
> > > If dso__load_kcore frees all of the existing maps, but one has already
> > > been attached to a callchain cursor node, then we can get a SIGSEGV in
> > > any function that happens to try to use this invalid cursor.  Use the
> > > existing map refcount mechanism to forestall cleanup of a map until the
> > > cursor iterates past the node.
> > 
> > It has been a couple of weeks since I sent out v2 of this patch.  I
> > understand that folks here have plenty of irons in the fire, but I
> > wanted to double-check that nobody was waiting on me for additional
> > information or changes.
> 
> It was a mix of waiting for more people to review it, or for Masami to
> run its refcount debugger on it, ended up falling thru the cracks.
> 
> I'll try to process it now.

Thanks.  As part of processing this did you run into any problems?
Would you like me to rebase against the latest perf/core and re-send the
patch?

-K


Re: [PATCH v2 perf/core] perf script: fix a use after free crash.

2016-10-25 Thread Krister Johansen
On Tue, Oct 11, 2016 at 02:28:39AM -0700, Krister Johansen wrote:
> If dso__load_kcore frees all of the existing maps, but one has already
> been attached to a callchain cursor node, then we can get a SIGSEGV in
> any function that happens to try to use this invalid cursor.  Use the
> existing map refcount mechanism to forestall cleanup of a map until the
> cursor iterates past the node.

It has been a couple of weeks since I sent out v2 of this patch.  I
understand that folks here have plenty of irons in the fire, but I
wanted to double-check that nobody was waiting on me for additional
information or changes.

Thanks,

-K


Re: [PATCH v4 3/3] Make core_pattern support namespace

2016-10-25 Thread Krister Johansen
On Tue, Oct 25, 2016 at 03:28:56PM +0800, Cao Shufeng wrote:
> From: Zhao Lei 
> It will bring us following benefit:
> 1: Each container can change their own coredump setting
>based on operation on /proc/sys/kernel/core_pattern
> 2: Coredump setting changed in host will not affect
>running containers.
> 3: Support both case of "putting coredump in guest" and
>"putting curedump in host".

Would you explain more about case #3 here?  In particular, I'm curious
what the impact is for systems that have already configured core_pattern
with the understanding that the program might be invoked to handle
either a host or a container core.  In particular, is there any way to
specify that the container handler fall back to the host handler?

On the systems that I've configured, /proc/sys is mounted read-only in the
container.  The host has a special program run from core_pattern that
determines which container generated the core.  It then stores the cores
in a directory that uniquely identifies the container.  The cores are
isolated on their own filesystem, and given a quota per-container.  The
eventual goal is to have a service evacuate the cores to an object store
where we can make them available to the customer via a web service.

Does your change still allow a global handler in the host to process
cores from containers?  Or is that behavior removed completely?

-K


[PATCH v2 perf/core] perf script: fix a use after free crash.

2016-10-11 Thread Krister Johansen
If dso__load_kcore frees all of the existing maps, but one has already
been attached to a callchain cursor node, then we can get a SIGSEGV in
any function that happens to try to use this invalid cursor.  Use the
existing map refcount mechanism to forestall cleanup of a map until the
cursor iterates past the node.

Signed-off-by: Krister Johansen 
---
 tools/perf/util/callchain.c | 13 +++--
 tools/perf/util/callchain.h |  6 ++
 tools/perf/util/hist.c  |  2 ++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 07fd30b..002bf5d 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -439,7 +439,7 @@ fill_node(struct callchain_node *node, struct 
callchain_cursor *cursor)
}
call->ip = cursor_node->ip;
call->ms.sym = cursor_node->sym;
-   call->ms.map = cursor_node->map;
+   call->ms.map = map__get(cursor_node->map);
list_add_tail(&call->list, &node->val);
 
callchain_cursor_advance(cursor);
@@ -464,6 +464,7 @@ add_child(struct callchain_node *parent,
 
list_for_each_entry_safe(call, tmp, &new->val, list) {
list_del(&call->list);
+   map__zput(call->ms.map);
free(call);
}
free(new);
@@ -732,6 +733,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
callchain_cursor_append(cursor, list->ip,
list->ms.map, list->ms.sym);
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
@@ -780,7 +782,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor,
}
 
node->ip = ip;
-   node->map = map;
+   map__zput(node->map);
+   node->map = map__get(map);
node->sym = sym;
 
cursor->nr++;
@@ -830,6 +833,8 @@ int fill_callchain_info(struct addr_location *al, struct 
callchain_cursor_node *
goto out;
}
 
+   map__get(al->map);
+
if (al->map->groups == &al->machine->kmaps) {
if (machine__is_host(al->machine)) {
al->cpumode = PERF_RECORD_MISC_KERNEL;
@@ -947,11 +952,13 @@ static void free_callchain_node(struct callchain_node 
*node)
 
list_for_each_entry_safe(list, tmp, &node->parent_val, list) {
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
list_for_each_entry_safe(list, tmp, &node->val, list) {
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
@@ -1015,6 +1022,7 @@ int callchain_node__make_parent_list(struct 
callchain_node *node)
goto out;
*new = *chain;
new->has_children = false;
+   map__get(new->ms.map);
list_add_tail(&new->list, &head);
}
parent = parent->parent;
@@ -1035,6 +1043,7 @@ int callchain_node__make_parent_list(struct 
callchain_node *node)
 out:
list_for_each_entry_safe(chain, new, &head, list) {
list_del(&chain->list);
+   map__zput(chain->ms.map);
free(chain);
}
return -ENOMEM;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 13e7554..798ba81 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include "event.h"
+#include "map.h"
 #include "symbol.h"
 
 #define HELP_PAD "\t\t\t\t"
@@ -178,8 +179,13 @@ int callchain_merge(struct callchain_cursor *cursor,
  */
 static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
 {
+   struct callchain_cursor_node *node;
+
cursor->nr = 0;
cursor->last = &cursor->first;
+
+   for (node = cursor->first; node != NULL; node = node->next)
+   map__zput(node->map);
 }
 
 int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index b02992e..609b365 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1,6 +1,7 @@
 #include "util.h"
 #include "build-id.h"
 #include "hist.h"
+#include "map.h"
 #include "session.h"
 #include "sort.h"
 #include "evlist.h"
@@ -979,6 +980,7 @@ iter_finish_cumulative_entry(struct hist_entry_iter *iter,
 {
zfree(&iter->priv);
iter->he = NULL;
+   map__zput(al->map);
 
return 0;
 }
-- 
2.7.4



Re: callchain map refcounting fixes was Re: [PATCH perf/core] perf script: fix a use after free crash.

2016-10-11 Thread Krister Johansen
On Sat, Oct 08, 2016 at 11:13:21PM -0700, Krister Johansen wrote:
> On Fri, Oct 07, 2016 at 11:22:00AM +0900, Namhyung Kim wrote:
> > On Wed, Oct 05, 2016 at 08:45:24AM -0300, Arnaldo Carvalho de Melo wrote:
> > > Em Sat, Oct 01, 2016 at 08:13:36PM -0700, Krister Johansen escreveu:
> > > > diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> > > > index b02992e..f8335e8 100644
> > > > --- a/tools/perf/util/hist.c
> > > > +++ b/tools/perf/util/hist.c
> > > > @@ -979,6 +982,7 @@ iter_finish_cumulative_entry(struct hist_entry_iter 
> > > > *iter,
> > > >  {
> > > > zfree(&iter->priv);
> > > > iter->he = NULL;
> > > > +   map__zput(al->map);
> > 
> > What is this needed?  Why other places like iter_finish_normal_entry
> > isn't?
> 
> I put a map__zput() here because iter_next_cumulative_entry() calls
> fill_callchain_info(), which takes a reference on the al->map in the
> struct addr_location that it returns.  I had forgotten all of this by
> the time you asked.  When I went back to work out my rationale, I
> stumbled across addr_location__put().  Think it would make sense to
> relocate the put of al->map there instead?

I gave the addr_location__put() approach a try, but it caused me to
remember why I had kept this small.  There are certain circumstances
where callers that lookup maps don't take references, seemingly because
the map is already contained within a map group.  If I move this to
addr_location__put(), then I need to expand the scope of this change.
Right now, it's just focusing on making sure that any map that gets
embedded into a callchain cursor, or retrieved from one and held onto,
gets referenced.

In other words, moving this to addr_location__put() frees a bunch of
maps that are acquired from elsewhere without their reference counts
incremented.

I made the rest of the modifications we discussed.  I'll send a v2 patch
in a separate e-mail.

-K


Re: callchain map refcounting fixes was Re: [PATCH perf/core] perf script: fix a use after free crash.

2016-10-08 Thread Krister Johansen
Hi Namhyung,

Thanks for looking this over.

On Fri, Oct 07, 2016 at 11:22:00AM +0900, Namhyung Kim wrote:
> On Wed, Oct 05, 2016 at 08:45:24AM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Sat, Oct 01, 2016 at 08:13:36PM -0700, Krister Johansen escreveu:

> > > diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
> > > index 07fd30b..15c89b2 100644
> > > --- a/tools/perf/util/callchain.c
> > > +++ b/tools/perf/util/callchain.c
> > > @@ -439,7 +439,7 @@ fill_node(struct callchain_node *node, struct 
> > > callchain_cursor *cursor)
> > >   }
> > >   call->ip = cursor_node->ip;
> > >   call->ms.sym = cursor_node->sym;
> > > - call->ms.map = cursor_node->map;
> > > + call->ms.map = map__get(cursor_node->map);
> > >   list_add_tail(&call->list, &node->val);
> > >  
> > >   callchain_cursor_advance(cursor);
> > > @@ -464,6 +464,7 @@ add_child(struct callchain_node *parent,
> > >  
> > >   list_for_each_entry_safe(call, tmp, &new->val, list) {
> > >   list_del(&call->list);
> > > + map__zput(call->ms.map);
> > >   free(call);
> > >   }
> > >   free(new);
> > > @@ -732,6 +733,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
> > >   callchain_cursor_append(cursor, list->ip,
> > >   list->ms.map, list->ms.sym);
> > >   list_del(&list->list);
> > > + map__zput(list->ms.map);
> > >   free(list);
> > >   }
> > >  
> > > @@ -780,7 +782,8 @@ int callchain_cursor_append(struct callchain_cursor 
> > > *cursor,
> > >   }
> > >  
> > >   node->ip = ip;
> > > - node->map = map;
> > > + map__zput(node->map);
> > > + node->map = map__get(map);
> > >   node->sym = sym;
> > >  
> > >   cursor->nr++;
> > > @@ -830,6 +833,8 @@ int fill_callchain_info(struct addr_location *al, 
> > > struct callchain_cursor_node *
> > >   goto out;
> > >   }
> > >  
> > > + map__get(al->map);
> > > +
> > >   if (al->map->groups == &al->machine->kmaps) {
> > >   if (machine__is_host(al->machine)) {
> > >   al->cpumode = PERF_RECORD_MISC_KERNEL;
> > > @@ -947,11 +952,13 @@ static void free_callchain_node(struct 
> > > callchain_node *node)
> > >  
> > >   list_for_each_entry_safe(list, tmp, &node->parent_val, list) {
> > >   list_del(&list->list);
> > > + map__zput(list->ms.map);
> > >   free(list);
> > >   }
> > >  
> > >   list_for_each_entry_safe(list, tmp, &node->val, list) {
> > >   list_del(&list->list);
> > > + map__zput(list->ms.map);
> > >   free(list);
> > >   }
> > >  
> > > @@ -1035,6 +1042,7 @@ int callchain_node__make_parent_list(struct 
> > > callchain_node *node)
> > >  out:
> > >   list_for_each_entry_safe(chain, new, &head, list) {
> > >   list_del(&chain->list);
> > > + map__zput(chain->ms.map);
> 
> I think you need to grab the refcnt in the "while (parent)" loop above.


Thanks; good catch.  I'll fix this.

> > > diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> > > index b02992e..f8335e8 100644
> > > --- a/tools/perf/util/hist.c
> > > +++ b/tools/perf/util/hist.c
> > > @@ -1,6 +1,7 @@
> > >  #include "util.h"
> > >  #include "build-id.h"
> > >  #include "hist.h"
> > > +#include "map.h"
> > >  #include "session.h"
> > >  #include "sort.h"
> > >  #include "evlist.h"
> > > @@ -970,6 +971,8 @@ iter_add_next_cumulative_entry(struct hist_entry_iter 
> > > *iter,
> > >  
> > >   if (symbol_conf.use_callchain)
> > >   callchain_append(he->callchain, &cursor, sample->period);
> > > + /* Cleanup temporary cursor. */
> > > + callchain_cursor_snapshot_rele(&cursor);
> 
> This callchain shotshot is used in a short period of time, and it's
> guaranteed that the maps in callchains will not freed due to refcnt in
> the orignal callchain cursor.  So I think we can skip to get/put
> refcnt on the snapshot cursor.  Also "rele" seems not a good name..

Ok. I'll remove the get/put from the snapshot stuff, and will excise the
rele function.

> > >   return 0;
> > >  }
> > >  
> > > @@ -979,6 +982,7 @@ iter_finish_cumulative_entry(struct hist_entry_iter 
> > > *iter,
> > >  {
> > >   zfree(&iter->priv);
> > >   iter->he = NULL;
> > > + map__zput(al->map);
> 
> What is this needed?  Why other places like iter_finish_normal_entry
> isn't?

I put a map__zput() here because iter_next_cumulative_entry() calls
fill_callchain_info(), which takes a reference on the al->map in the
struct addr_location that it returns.  I had forgotten all of this by
the time you asked.  When I went back to work out my rationale, I
stumbled across addr_location__put().  Think it would make sense to
relocate the put of al->map there instead?

Thanks for the feedback.  I'll send out a v2 once I get your changes
incorporated and tested.

-K


Re: callchain map refcounting fixes was Re: [PATCH perf/core] perf script: fix a use after free crash.

2016-10-05 Thread Krister Johansen
On Wed, Oct 05, 2016 at 08:45:24AM -0300, Arnaldo Carvalho de Melo wrote:
> Em Sat, Oct 01, 2016 at 08:13:36PM -0700, Krister Johansen escreveu:
> > If dso__load_kcore frees all of the existing maps, but one has already
> > been attached to a callchain cursor node, then we can get a SIGSEGV in
> > any function that happens to try to use this cursor with the invalid
> > map.  Use the existing map refcount mechanism to forestall cleanup of a
> > map until the cursor iterates past the node.
> 
> Seems ok, thanks for working on this! Can you provide a test case that
> causes the SEGV so that I can, in addition to reviewing your changes and
> auditing the code to check if all cases ara plugged, to reproduce the
> problem?

Absolutely.  Thanks for taking the time to look it over.

AFIACT, this occurs when you're probing a .ko with its own debug
information, but when the kernel has none.  The kernel and all of the
in-tree modules were built through an RPM build that strips out
debuginfo into a separate package.  On this particular system, the
kernel debuginfo packages were not installed.  However, I had recently
changed a dkms build of an external module to use -g and to not strip.
We had one lonely .ko with all of its debug information inside, and a
kernel that perf was going to need to use kallsyms.  I was able to
insert the kprobe into the module and record events.  It was just script
and report that caused the core.

It should be possible to reproduce this by running script against a
trace that was recorded from kprobes in a module that has debug
inforation, but while running a kernel that does not.  I didn't specify
any special options for lookup of vmlinux.  I just let the tool figure
it out.

If you think it'd be useful, I can send along the notes that I took when
I debugged this.  They're about 15k, which is why I would hesitate to
send it to the entire list unsolicited.

Thanks again,

-K


[PATCH perf/core] perf script: fix a use after free crash.

2016-10-01 Thread Krister Johansen
If dso__load_kcore frees all of the existing maps, but one has already
been attached to a callchain cursor node, then we can get a SIGSEGV in
any function that happens to try to use this cursor with the invalid
map.  Use the existing map refcount mechanism to forestall cleanup of a
map until the cursor iterates past the node.

Signed-off-by: Krister Johansen 
---
 tools/perf/util/callchain.c | 12 ++--
 tools/perf/util/callchain.h | 20 
 tools/perf/util/hist.c  |  4 
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 07fd30b..15c89b2 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -439,7 +439,7 @@ fill_node(struct callchain_node *node, struct 
callchain_cursor *cursor)
}
call->ip = cursor_node->ip;
call->ms.sym = cursor_node->sym;
-   call->ms.map = cursor_node->map;
+   call->ms.map = map__get(cursor_node->map);
list_add_tail(&call->list, &node->val);
 
callchain_cursor_advance(cursor);
@@ -464,6 +464,7 @@ add_child(struct callchain_node *parent,
 
list_for_each_entry_safe(call, tmp, &new->val, list) {
list_del(&call->list);
+   map__zput(call->ms.map);
free(call);
}
free(new);
@@ -732,6 +733,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
callchain_cursor_append(cursor, list->ip,
list->ms.map, list->ms.sym);
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
@@ -780,7 +782,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor,
}
 
node->ip = ip;
-   node->map = map;
+   map__zput(node->map);
+   node->map = map__get(map);
node->sym = sym;
 
cursor->nr++;
@@ -830,6 +833,8 @@ int fill_callchain_info(struct addr_location *al, struct 
callchain_cursor_node *
goto out;
}
 
+   map__get(al->map);
+
if (al->map->groups == &al->machine->kmaps) {
if (machine__is_host(al->machine)) {
al->cpumode = PERF_RECORD_MISC_KERNEL;
@@ -947,11 +952,13 @@ static void free_callchain_node(struct callchain_node 
*node)
 
list_for_each_entry_safe(list, tmp, &node->parent_val, list) {
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
list_for_each_entry_safe(list, tmp, &node->val, list) {
list_del(&list->list);
+   map__zput(list->ms.map);
free(list);
}
 
@@ -1035,6 +1042,7 @@ int callchain_node__make_parent_list(struct 
callchain_node *node)
 out:
list_for_each_entry_safe(chain, new, &head, list) {
list_del(&chain->list);
+   map__zput(chain->ms.map);
free(chain);
}
return -ENOMEM;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 13e7554..0d944ef 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include "event.h"
+#include "map.h"
 #include "symbol.h"
 
 #define HELP_PAD "\t\t\t\t"
@@ -178,8 +179,13 @@ int callchain_merge(struct callchain_cursor *cursor,
  */
 static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
 {
+   struct callchain_cursor_node *node;
+
cursor->nr = 0;
cursor->last = &cursor->first;
+
+   for (node = cursor->first; node != NULL; node = node->next)
+   map__zput(node->map);
 }
 
 int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
@@ -238,12 +244,26 @@ int perf_callchain_config(const char *var, const char 
*value);
 static inline void callchain_cursor_snapshot(struct callchain_cursor *dest,
 struct callchain_cursor *src)
 {
+   struct callchain_cursor_node *node;
+
*dest = *src;
 
dest->first = src->curr;
dest->nr -= src->pos;
+
+   for (node = dest->first; node != NULL; node = node->next)
+   map__get(node->map);
 }
 
+static inline void callchain_cursor_snapshot_rele(struct callchain_cursor 
*curs)
+{
+   struct callchain_cursor_node *node;
+
+   for (node = curs->first; node != NULL; node = node->next)
+   map__put(node->map);
+}
+
+
 #ifdef HAVE_SKIP_CALLCHAIN_IDX
 int arch_skip_callchain_idx(struct thread *thread, struct ip_callchain *chain);
 #else
diff --