Good day - I'd much appreciate some advice as to why, on my Intel x86_64 (DisplayFamily_DisplayModel: 06_3CH), running either Linux 4.12.10 or Linux 3.10.0, any attempt to count both PERF_COUNT_HW_BRANCH_INSTRUCTIONS (or raw config 0xC4) and PERF_COUNT_HW_BRANCH_MISSES (or raw config 0xC5) in combination with PERF_COUNT_HW_CACHE_REFERENCES (or raw config 0x4F2E) and PERF_COUNT_HW_CACHE_MISSES (or raw config 0x412E) results in ALL COUNTERS BEING 0 in a read of the group FD or of the mmap sample area.
This is demonstrated by the example program, which uses perf_event_open() to create a group leader FD for the first event and associates all other events with that event group, so that all events can be read on the group FD. The perf_event_open() calls and the ioctl(event_fd, PERF_EVENT_IOC_ID, &id) calls all return successfully, but if I combine ANY of (PERF_COUNT_HW_BRANCH_INSTRUCTIONS, PERF_COUNT_HW_BRANCH_MISSES) with any of (PERF_COUNT_HW_CACHE_REFERENCES, PERF_COUNT_HW_CACHE_MISSES) in the event group, ALL events have a '0' event->value.

Demo:

1. Compile the program to use the kernel-mapped Generic Events:

$ gcc -std=gnu11 -o perf_bug perf_bug.c

Running the program shows that all counters have 0 values, since both CACHE & BRANCH hits+misses are being requested:

$ ./perf_bug
EVENT: Branch Instructions : 0
EVENT: Branch Misses : 0
EVENT: Instructions : 0
EVENT: CPU Cycles : 0
EVENT: Ref. CPU Cycles : 0
EVENT: Bus Cycles : 0
EVENT: Cache References : 0
EVENT: Cache Misses : 0

NOT registering interest in EITHER the BRANCH counters OR the CACHE counters fixes the problem.

Compile without registering for BRANCH_INSTRUCTIONS or BRANCH_MISSES:

$ gcc -std=gnu11 -DNO_BUG_NO_BRANCH -o perf_bug perf_bug.c
$ ./perf_bug
EVENT: Instructions : 914
EVENT: CPU Cycles : 4110
EVENT: Ref. CPU Cycles : 4437
EVENT: Bus Cycles : 152
EVENT: Cache References : 1
EVENT: Cache Misses : 1

Compile without registering for CACHE_REFERENCES or CACHE_MISSES:

$ gcc -std=gnu11 -DNO_BUG_NO_CACHE -o perf_bug perf_bug.c
$ ./perf_bug
EVENT: Branch Instructions : 106
EVENT: Branch Misses : 6
EVENT: Instructions : 914
EVENT: CPU Cycles : 4132
EVENT: Ref. CPU Cycles : 8526
EVENT: Bus Cycles : 295

The same thing happens if I do not use Generic Events, but rather "dynamic raw PMU" events, by putting the hex values from /sys/bus/event_source/devices/cpu/events/* into the perf_event_attr config, OR'ed with (1<<63), and using the PERF_TYPE_RAW perf_event_attr type value:

$ gcc -DUSE_RAW_PMU -o perf_bug perf_bug.c
$ ./perf_bug
EVENT: Branch Instructions : 0
EVENT: Branch Misses : 0
EVENT: Instructions : 0
EVENT: CPU Cycles : 0
EVENT: Ref. CPU Cycles : 0
EVENT: Bus Cycles : 0
EVENT: Cache References : 0
EVENT: Cache Misses : 0

$ gcc -DUSE_RAW_PMU -DNO_BUG_NO_BRANCH -o perf_bug perf_bug.c
$ ./perf_bug
EVENT: Instructions : 914
EVENT: CPU Cycles : 4102
EVENT: Ref. CPU Cycles : 4959
EVENT: Bus Cycles : 171
EVENT: Cache References : 2
EVENT: Cache Misses : 2

$ gcc -DUSE_RAW_PMU -DNO_BUG_NO_CACHE -o perf_bug perf_bug.c
$ ./perf_bug
EVENT: Branch Instructions : 106
EVENT: Branch Misses : 6
EVENT: Instructions : 914
EVENT: CPU Cycles : 4108
EVENT: Ref. CPU Cycles : 10817
EVENT: Bus Cycles : 373

The perf tool itself seems to have the same issue. With both CACHE & BRANCH counters it does not work:

$ perf stat -e '{r0c4,r0c5,r0c0,r03c,r0300,r013c,r04F2E,r0412E}:SIu' sleep 1

Performance counter stats for 'sleep 1':

<not counted> r0c4 (0.00%)
<not counted> r0c5 (0.00%)
<not counted> r0c0 (0.00%)
<not counted> r03c (0.00%)
<not counted> r0300 (0.00%)
<not counted> r013c (0.00%)
<not counted> r04F2E (0.00%)
<not supported> r0412E

1.001652932 seconds time elapsed

Some events weren't counted. Try disabling the NMI watchdog:
echo 0 > /proc/sys/kernel/nmi_watchdog
perf stat ...
echo 1 > /proc/sys/kernel/nmi_watchdog

Disabling the NMI watchdog makes no difference. It is very strange that perf thinks 'r0412E' is not supported:

$ cat /sys/bus/event_source/devices/cpu/events/cache-misses
event=0x2e,umask=0x41

The kernel should not be advertising an unsupported event in a /sys/bus/event_source/devices/cpu/events/ file, should it?
So perf stat has the same problem - without either the Cache or the Branch counters it seems to work fine.

Without cache:

$ perf stat -e '{r0c4,r0c5,r0c0,r03c,r0300,r013c}:SIu' sleep 1

Performance counter stats for 'sleep 1':

37740 r0c4
3557 r0c5
188552 r0c0
311684 r03c
360963 r0300
12461 r013c

1.001508109 seconds time elapsed

Without branch:

$ perf stat -e '{r0c0,r03c,r0300,r013c,r04F2E,r0412E}:SIu' sleep 1

Performance counter stats for 'sleep 1':

188554 r0c0
320242 r03c
452748 r0300
15633 r013c
4145 r04F2E
3022 r0412E

1.001810421 seconds time elapsed

proving again that perf's claim that 'r0412E' is not supported is bogus.

The Intel SDM's Table 19-1, Architectural Events, which ALL Intel CPUs are meant to support, does include 'Event: 2EH | Umask: 4FH : LLC Reference' and 'Event: 2EH | Umask: 41H : LLC Miss', as well as 'Event: C4H | Umask: 00H : Branch Instructions Retired' and 'Event: C5H | Umask: 00H : Branch Misses Retired'. So why can't perf count them all in the same group?

Please, can anyone enlighten me as to what is going on here? Why can't I count all of (BRANCH_INSTRUCTIONS, BRANCH_MISSES, CACHE_REFERENCES, CACHE_MISSES) in the same perf event group?

Thanks in advance for any replies,
Best Regards,
Jason
/*
 * Demonstration of a Linux perf_event group whose counters all read back 0.
 *
 * The program opens a set of hardware events as ONE event group (the first
 * event opened is the group leader), counts them in user space across a short
 * computation, then fetches every counter with a single read() on the leader
 * and prints them.
 *
 * Build variants:
 *   (default)            generic PERF_TYPE_HARDWARE events
 *   -DUSE_RAW_PMU        the same events expressed as PERF_TYPE_RAW codes
 *   -DNO_BUG_NO_BRANCH   leave the two branch events out of the group
 *   -DNO_BUG_NO_CACHE    leave the two cache events out of the group
 *
 * Exit status: 0 if at least one counter is non-zero, 1 if every counter is
 * zero or if any system call fails.
 */
#include <sys/types.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdbool.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <linux/perf_event.h>

/*
 * Thin wrapper around the perf_event_open system call (the call is made via
 * syscall() directly here).  Arguments are passed through unchanged; returns
 * the new event file descriptor, or -1 with errno set on failure.
 */
static int perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
                           int cpu, int group_fd, unsigned long flags)
{
    return (int) syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}

int main( int argc, const char *const* argv)
{
    (void) argc;
    (void) argv;

/*
 * Bit 63 is OR'ed into every raw event code, exactly as the original program
 * did.  UINT64_C keeps the shift well-defined where unsigned long is 32 bits.
 * NOTE(review): the purpose of this bit is not established here - confirm it
 * is actually required by the kernel.
 */
#define RAW_CFG(code) ((UINT64_C(1) << 63) | UINT64_C(code))

    /* One entry per event in the group; fd and id are filled in at open time. */
    struct evcfg {
        uint64_t perf_type;   /* perf_event_attr.type   */
        uint64_t perf_cfg;    /* perf_event_attr.config */
        const char *name;     /* label used in the output */
        int fd;               /* event fd, -1 until opened */
        uint64_t id;          /* kernel-assigned ID from PERF_EVENT_IOC_ID */
    } pe[] = {
#ifndef USE_RAW_PMU
        /* Generic events, mapped to hardware events by the kernel. */
# ifndef NO_BUG_NO_BRANCH
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "Branch Instructions", -1, 0 },
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES,       "Branch Misses",       -1, 0 },
# endif
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS,        "Instructions",        -1, 0 },
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES,          "CPU Cycles",          -1, 0 },
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES,      "Ref. CPU Cycles",     -1, 0 },
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES,          "Bus Cycles",          -1, 0 },
# ifndef NO_BUG_NO_CACHE
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES,    "Cache References",    -1, 0 },
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES,        "Cache Misses",        -1, 0 },
# endif
#else
        /*
         * Raw PMU codes - these come from the Intel SDM, Chapter 19,
         * Table 19-1, and were checked by the author against the files in
         * /sys/bus/event_source/devices/cpu/events/.
         */
# ifndef NO_BUG_NO_BRANCH
        { PERF_TYPE_RAW, RAW_CFG(0xC4),    "Branch Instructions", -1, 0 }, /* branch_instructions */
        { PERF_TYPE_RAW, RAW_CFG(0xC5),    "Branch Misses",       -1, 0 }, /* branch_misses */
# endif
        { PERF_TYPE_RAW, RAW_CFG(0xC0),    "Instructions",        -1, 0 }, /* instructions */
        { PERF_TYPE_RAW, RAW_CFG(0x3C),    "CPU Cycles",          -1, 0 }, /* cpu cycles */
        { PERF_TYPE_RAW, RAW_CFG(0x0300),  "Ref. CPU Cycles",     -1, 0 }, /* ref cpu cycles */
        { PERF_TYPE_RAW, RAW_CFG(0x013C),  "Bus Cycles",          -1, 0 }, /* bus cycles */
# ifndef NO_BUG_NO_CACHE
        { PERF_TYPE_RAW, RAW_CFG(0x04F2E), "Cache References",    -1, 0 }, /* cache references */
        { PERF_TYPE_RAW, RAW_CFG(0x0412E), "Cache Misses",        -1, 0 }, /* cache misses */
# endif
#endif
    };
#define N_EV (sizeof(pe)/sizeof(pe[0]))

    /*
     * Layout returned by read() on the group leader for the read_format
     * requested below (GROUP | ID | TOTAL_TIME_ENABLED | TOTAL_TIME_RUNNING).
     */
    struct {
        uint64_t nr, time_enabled, time_running;
        struct event {
            uint64_t value, id;
        } ev[N_EV];
    } events;

    struct perf_event_attr pea;
    int rc = 1;                    /* pessimistic: every early exit is a failure */
    int group_fd = -1;             /* -1 makes the first event the group leader */
    const pid_t pid = getpid();
    bool non_zero_event = false;
    ssize_t got;

    /*
     * Work to be measured.  volatile prevents an optimising compiler from
     * discarding the divisions, whose results are otherwise unused.
     */
    volatile uint64_t a_num = 0x0102030405060708;
    volatile uint64_t b_num = ~a_num;
    int cnt = 100;

    /* Open every event into the same group, initially disabled. */
    for (size_t i = 0; i < N_EV; i++) {
        memset(&pea, 0, sizeof pea);
        /* sizeof matches whatever header we were built against. */
        pea.size = sizeof pea;
        pea.type = (uint32_t) pe[i].perf_type;
        pea.config = pe[i].perf_cfg;
        pea.read_format = PERF_FORMAT_GROUP
                        | PERF_FORMAT_ID
                        | PERF_FORMAT_TOTAL_TIME_ENABLED
                        | PERF_FORMAT_TOTAL_TIME_RUNNING;
        pea.disabled = 1;
        pea.exclude_kernel = 1;
        pea.exclude_idle = 1;
        pea.exclude_hv = 1;

        pe[i].fd = perf_event_open(&pea, pid, -1, group_fd, 0);
        if (pe[i].fd == -1) {
            fprintf(stderr, "perf_event_open failed : %d : '%s'.\n", errno, strerror(errno));
            goto out;
        }
        if (group_fd == -1) {
            group_fd = pe[i].fd;   /* this is the Group Leader FD */
        }
        if (ioctl(pe[i].fd, PERF_EVENT_IOC_ID, &pe[i].id) != 0) {
            fprintf(stderr, "ioctl(fd, PERF_EVENT_IOC_ID) failed for #%zu : %d : '%s'.\n",
                    i, errno, strerror(errno));
            goto out;
        }
    }

    if (ioctl(group_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) != 0) {
        fprintf(stderr, "ioctl(fd, PERF_EVENT_IOC_RESET) failed : %d : '%s'.\n",
                errno, strerror(errno));
        goto out;
    }

    if (ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) != 0) {
        fprintf(stderr, "ioctl(fd, PERF_EVENT_IOC_ENABLE) failed : %d : '%s'.\n",
                errno, strerror(errno));
        goto out;
    }

    /* do something to measure - let's try 100 long divisions: */
    do {
        b_num /= a_num;
        a_num = b_num;
    } while (--cnt);

    if (ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) != 0) {
        fprintf(stderr, "ioctl(fd, PERF_EVENT_IOC_DISABLE) failed : %d : '%s'.\n",
                errno, strerror(errno));
        goto out;
    }

    got = read(group_fd, &events, sizeof events);
    if (got == -1) {
        fprintf(stderr, "read of event group leader FD failed : %d : '%s'.\n",
                errno, strerror(errno));
        goto out;
    }
    if (got != (ssize_t) sizeof events) {
        /* errno is not meaningful for a short read, so report the sizes. */
        fprintf(stderr, "read of event group leader FD returned %zd bytes, expected %zu.\n",
                got, sizeof events);
        goto out;
    }
    if (events.nr != N_EV) {
        fprintf(stderr, "unexpected number of events read: %" PRIu64 "\n", events.nr);
        goto out;
    }

    /*
     * time_running == 0 means the group was never actually counting, so the
     * zero values printed below are "not counted", not genuine counts.
     */
    if (events.time_running == 0) {
        fprintf(stderr,
                "warning: event group was never scheduled on the PMU "
                "(time_enabled=%" PRIu64 " ns, time_running=0); "
                "the group probably needs more hardware counters than can be "
                "used at once - try fewer events per group.\n",
                events.time_enabled);
    }

    /* Match each returned (value, id) pair to its event by ID and print it. */
    for (uint64_t k = 0; k < events.nr; k++) {
        const struct event *ev = &events.ev[k];
        const char *name = NULL;

        for (size_t i = 0; i < N_EV; i++) {
            if (pe[i].id == ev->id) {
                name = pe[i].name;
                break;
            }
        }
        if (name == NULL) {
            fprintf(stderr, "Kernel returned unknown event ID: %" PRIu64 "\n", ev->id);
            goto out;
        }
        printf("EVENT: %s : %" PRIu64 "\n", name, ev->value);
        if (ev->value != 0) {
            non_zero_event = true;
        }
    }

    rc = non_zero_event ? 0 : 1;

out:
    /* Release every event fd that was successfully opened. */
    for (size_t i = 0; i < N_EV; i++) {
        if (pe[i].fd != -1) {
            close(pe[i].fd);
        }
    }
    return rc;
}