Andi Kleen <a...@firstfloor.org> writes: Ping^2 for the patch series!
> Andi Kleen <a...@firstfloor.org> writes: > > Ping for the patch series! > >> From: Andi Kleen <a...@linux.intel.com> >> >> Using autofdo is currently somewhat difficult. It requires using the >> model-specific branches taken event, which differs on different CPUs. >> The example shown in the manual requires a special patched version of >> perf that is non-standard, and also will likely not work everywhere. >> >> This patch adds a new gcc-auto-profile script that figures out the >> correct event and runs perf. The script is installed on Linux systems. >> >> Since maintaining the script would be somewhat tedious (needs changes >> every time a new CPU comes out) I auto-generated it from the online >> Intel event database. The script to do that is in contrib and can be >> rerun. >> >> Right now there is no test if perf works in configure. This >> would vary depending on the build and target system, and since >> it currently doesn't work in virtualization and needs an up-to-date >> kernel it may often fail in common distribution build setups. >> >> So Linux just hardcodes installing the script, but it may fail at runtime. >> >> This is needed to actually make use of autofdo in a generic way >> in the build system and in the test suite. >> >> So far the script is not installed. >> >> gcc/: >> 2016-03-27 Andi Kleen <a...@linux.intel.com> >> >> * doc/invoke.texi: Document gcc-auto-profile >> * gcc-auto-profile: Create. >> >> contrib/: >> >> 2016-03-27 Andi Kleen <a...@linux.intel.com> >> >> * gen_autofdo_event.py: New file to regenerate >> gcc-auto-profile. 
>> --- >> contrib/gen_autofdo_event.py | 155 >> +++++++++++++++++++++++++++++++++++++++++++ >> gcc/doc/invoke.texi | 31 +++++++-- >> gcc/gcc-auto-profile | 70 +++++++++++++++++++ >> 3 files changed, 251 insertions(+), 5 deletions(-) >> create mode 100755 contrib/gen_autofdo_event.py >> create mode 100755 gcc/gcc-auto-profile >> >> diff --git a/contrib/gen_autofdo_event.py b/contrib/gen_autofdo_event.py >> new file mode 100755 >> index 0000000..db4db33 >> --- /dev/null >> +++ b/contrib/gen_autofdo_event.py >> @@ -0,0 +1,155 @@ >> +#!/usr/bin/python >> +# generate Intel taken branches Linux perf event script for autofdo >> profiling >> + >> +# Copyright (C) 2016 Free Software Foundation, Inc. >> +# >> +# GCC is free software; you can redistribute it and/or modify it under >> +# the terms of the GNU General Public License as published by the Free >> +# Software Foundation; either version 3, or (at your option) any later >> +# version. >> +# >> +# GCC is distributed in the hope that it will be useful, but WITHOUT ANY >> +# WARRANTY; without even the implied warranty of MERCHANTABILITY or >> +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License >> +# for more details. >> +# >> +# You should have received a copy of the GNU General Public License >> +# along with GCC; see the file COPYING3. If not see >> +# <http://www.gnu.org/licenses/>. */ >> + >> +# run it with perf record -b -e EVENT program ... >> +# The Linux Kernel needs to support the PMU of the current CPU, and >> +# it will likely not work in VMs. >> +# add --all to print for all cpus, otherwise for current cpu >> +# add --script to generate shell script to run correct event >> +# >> +# requires internet (https) access. this may require setting up a proxy >> +# with export https_proxy=... 
>> +# >> +import urllib2 >> +import sys >> +import json >> +import argparse >> +import collections >> + >> +baseurl = "https://download.01.org/perfmon" >> + >> +target_events = (u'BR_INST_RETIRED.NEAR_TAKEN', >> + u'BR_INST_EXEC.TAKEN', >> + u'BR_INST_RETIRED.TAKEN_JCC', >> + u'BR_INST_TYPE_RETIRED.COND_TAKEN') >> + >> +ap = argparse.ArgumentParser() >> +ap.add_argument('--all', '-a', help='Print for all CPUs', >> action='store_true') >> +ap.add_argument('--script', help='Generate shell script', >> action='store_true') >> +args = ap.parse_args() >> + >> +eventmap = collections.defaultdict(list) >> + >> +def get_cpu_str(): >> + with open('/proc/cpuinfo', 'r') as c: >> + vendor, fam, model = None, None, None >> + for j in c: >> + n = j.split() >> + if n[0] == 'vendor_id': >> + vendor = n[2] >> + elif n[0] == 'model' and n[1] == ':': >> + model = int(n[2]) >> + elif n[0] == 'cpu' and n[1] == 'family': >> + fam = int(n[3]) >> + if vendor and fam and model: >> + return "%s-%d-%X" % (vendor, fam, model), model >> + return None, None >> + >> +def find_event(eventurl, model): >> + print >>sys.stderr, "Downloading", eventurl >> + u = urllib2.urlopen(eventurl) >> + events = json.loads(u.read()) >> + u.close() >> + >> + found = 0 >> + for j in events: >> + if j[u'EventName'] in target_events: >> + event = "cpu/event=%s,umask=%s/" % (j[u'EventCode'], >> j[u'UMask']) >> + if u'PEBS' in j and j[u'PEBS'] > 0: >> + event += "p" >> + if args.script: >> + eventmap[event].append(model) >> + else: >> + print j[u'EventName'], "event for model", model, "is", event >> + found += 1 >> + return found >> + >> +if not args.all: >> + cpu, model = get_cpu_str() >> + if not cpu: >> + sys.exit("Unknown CPU type") >> + >> +url = baseurl + "/mapfile.csv" >> +print >>sys.stderr, "Downloading", url >> +u = urllib2.urlopen(url) >> +found = 0 >> +cpufound = 0 >> +for j in u: >> + n = j.rstrip().split(',') >> + if len(n) >= 4 and (args.all or n[0] == cpu) and n[3] == "core": >> + if args.all: >> + 
vendor, fam, model = n[0].split("-") >> + model = int(model, 16) >> + cpufound += 1 >> + found += find_event(baseurl + n[2], model) >> +u.close() >> + >> +if args.script: >> + print '''#!/bin/sh >> +# profile workload for gcc profile feedback (autofdo) using Linux perf >> +# auto generated. to regenerate for new CPUs run >> +# contrib/gen_autofdo_event.py --shell --all in gcc source >> + >> +# usages: >> +# gcc-auto-profile program (profile program and children) >> +# gcc-auto-profile -a sleep X (profile all for X secs, may need >> root) >> +# gcc-auto-profile -p PID sleep X (profile PID) >> +# gcc-auto-profile --kernel -a sleep X (profile kernel) >> +# gcc-auto-profile --all -a sleep X (profile kernel and user space) >> + >> +# identify branches taken event for CPU >> +# >> + >> +FLAGS=u >> + >> +if [ "$1" = "--kernel" ] ; then >> + FLAGS=k >> + shift >> +fi >> +if [ "$1" == "--all" ] ; then >> + FLAGS=uk >> + shift >> +fi >> + >> +if ! grep -q Intel /proc/cpuinfo ] ; then >> + echo >&2 "Only Intel CPUs supported" >> + exit 1 >> +fi >> + >> +if grep -q hypervisor /proc/cpuinfo ; then >> + echo >&2 "Warning: branch profiling may not be functional in VMs" >> +fi >> + >> +case `egrep -q "^cpu family\s*: 6" /proc/cpuinfo && >> + egrep "^model\s*:" /proc/cpuinfo | head -1` in''' >> + for event, mod in eventmap.iteritems(): >> + for m in mod[:-1]: >> + print "model*:\ %s|\\" % m >> + print 'model*:\ %s) E="%s$FLAGS" ;;' % (mod[-1], event) >> + print '''*) >> +echo >&2 "Unknown CPU. Run contrib/gen_autofdo_event.py --all --script to >> update script." 
>> + exit 1 ;;''' >> + print "esac" >> + print 'exec perf record -e $E -b "$@"' >> + >> +if cpufound == 0 and not args.all: >> + sys.exit('CPU %s not found' % cpu) >> + >> +if found == 0: >> + sys.exit('Branch event not found') >> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi >> index 9e54bb7..427d89a 100644 >> --- a/gcc/doc/invoke.texi >> +++ b/gcc/doc/invoke.texi >> @@ -8249,13 +8249,34 @@ which are generally profitable only with profile >> feedback available: >> If omitted, it defaults to @file{fbdata.afdo} in the current directory. >> >> Producing an AutoFDO profile data file requires running your program >> -with the @command{perf} utility on a supported GNU/Linux target system. >> +with the @command{gcc-auto-profile} utility on a supported GNU/Linux target >> system. @command{gcc-auto-profile} calls the @command{perf} utility. >> +It also requires Last-Branch-Record support, which typically requires >> +a new enough kernel not running virtualized. >> +@command{gcc-auto-profile} accepts the same arguments as @command{perf >> record}. >> For more information, see @uref{https://perf.wiki.kernel.org/}. >> >> -E.g. >> @smallexample >> -perf record -e br_inst_retired:near_taken -b -o perf.data \ >> - -- your_program >> +gcc-auto-profile your_program >> +@end smallexample >> + >> +On larger programs the resulting perf.data file may be very large. >> +In this case it can be better to reduce the sampling rate. 
>> +Collect samples every million taken branches: >> + >> +@smallexample >> +gcc-auto-profile -c 1000000 program >> +@end smallexample >> + >> +Or only profile representative run intervals of the program: >> + >> +@smallexample >> +gcc-auto-profile -p PID-OF-PROGRAM sleep 5 >> +@end smallexample >> + >> +Profile complete system for 10 seconds (may require root) >> + >> +@smallexample >> +gcc-auto-profile -a sleep 10 >> @end smallexample >> >> Then use the @command{create_gcov} tool to convert the raw profile data >> @@ -8266,7 +8287,7 @@ See @uref{https://github.com/google/autofdo}. >> E.g. >> @smallexample >> create_gcov --binary=your_program.unstripped --profile=perf.data \ >> - --gcov=profile.afdo >> + --gcov=profile.afdo -gcov_version 1 >> @end smallexample >> @end table >> >> diff --git a/gcc/gcc-auto-profile b/gcc/gcc-auto-profile >> new file mode 100755 >> index 0000000..c6712b2 >> --- /dev/null >> +++ b/gcc/gcc-auto-profile >> @@ -0,0 +1,70 @@ >> +#!/bin/sh >> +# profile workload for gcc profile feedback (autofdo) using Linux perf >> +# auto generated. to regenerate for new CPUs run >> +# contrib/gen_autofdo_event.py --shell --all in gcc source >> + >> +# usages: >> +# gcc-auto-profile program (profile program and children) >> +# gcc-auto-profile -a sleep X (profile all for X secs, may need >> root) >> +# gcc-auto-profile -p PID sleep X (profile PID) >> +# gcc-auto-profile --kernel -a sleep X (profile kernel) >> +# gcc-auto-profile --all -a sleep X (profile kernel and user space) >> + >> +# identify branches taken event for CPU >> +# >> + >> +FLAGS=u >> + >> +if [ "$1" = "--kernel" ] ; then >> + FLAGS=k >> + shift >> +fi >> +if [ "$1" == "--all" ] ; then >> + FLAGS=uk >> + shift >> +fi >> + >> +if ! 
grep -q Intel /proc/cpuinfo ] ; then >> + echo >&2 "Only Intel CPUs supported" >> + exit 1 >> +fi >> + >> +if grep -q hypervisor /proc/cpuinfo ; then >> + echo >&2 "Warning: branch profiling may not be functional in VMs" >> +fi >> + >> +case `egrep -q "^cpu family\s*: 6" /proc/cpuinfo && >> + egrep "^model\s*:" /proc/cpuinfo | head -1` in >> +model*:\ 55|\ >> +model*:\ 77|\ >> +model*:\ 76) E="cpu/event=0xC4,umask=0xFE/p$FLAGS" ;; >> +model*:\ 42|\ >> +model*:\ 45|\ >> +model*:\ 58|\ >> +model*:\ 62|\ >> +model*:\ 60|\ >> +model*:\ 69|\ >> +model*:\ 70|\ >> +model*:\ 63|\ >> +model*:\ 61|\ >> +model*:\ 71|\ >> +model*:\ 86|\ >> +model*:\ 78|\ >> +model*:\ 94) E="cpu/event=0xC4,umask=0x20/p$FLAGS" ;; >> +model*:\ 46|\ >> +model*:\ 30|\ >> +model*:\ 31|\ >> +model*:\ 26|\ >> +model*:\ 47|\ >> +model*:\ 37|\ >> +model*:\ 44) E="cpu/event=0x88,umask=0x40/p$FLAGS" ;; >> +model*:\ 28|\ >> +model*:\ 38|\ >> +model*:\ 39|\ >> +model*:\ 54|\ >> +model*:\ 53) E="cpu/event=0x88,umask=0x41/p$FLAGS" ;; >> +*) >> +echo >&2 "Unknown CPU. Run contrib/gen_autofdo_event.py --all --script to >> update script." >> + exit 1 ;; >> +esac >> +exec perf record -e $E -b "$@" -- a...@linux.intel.com -- Speaking for myself only