Signed-off-by: Tom St Denis <tom.stde...@amd.com>
---
 doc/sphinx/source/index.rst    |   1 +
 doc/sphinx/source/profiler.rst |  36 ++++++++++++
 doc/umr.1                      |   4 ++
 src/app/CMakeLists.txt         |   1 +
 src/app/main.c                 |  15 ++++-
 src/app/print_waves.c          |   4 +-
 src/app/profile.c              | 128 +++++++++++++++++++++++++++++++++++++++++
 src/app/ring_read.c            |  12 +++-
 src/lib/dump_ib.c              |   4 +-
 src/lib/umr_llvm_disasm.c      |  48 +++++++++++++++-
 src/umr.h                      |   3 +-
 src/umrapp.h                   |   4 +-
 12 files changed, 246 insertions(+), 14 deletions(-)
 create mode 100644 doc/sphinx/source/profiler.rst
 create mode 100644 src/app/profile.c

diff --git a/doc/sphinx/source/index.rst b/doc/sphinx/source/index.rst
index fd8b2561e570..fec89140db70 100644
--- a/doc/sphinx/source/index.rst
+++ b/doc/sphinx/source/index.rst
@@ -15,6 +15,7 @@ UMR: User Mode Register Debugger
    basic
    register_access
    wave_status
+   profiler
    vm_decoding
    ring
    top
diff --git a/doc/sphinx/source/profiler.rst b/doc/sphinx/source/profiler.rst
new file mode 100644
index 000000000000..0e44cfd2825d
--- /dev/null
+++ b/doc/sphinx/source/profiler.rst
@@ -0,0 +1,36 @@
+=========
+Profiling
+=========
+
+When testing a shader compiler and/or a shader, a profile of where
+the GPU tends to spend time can be generated with the umr
+"--profiler" command:
+
+::
+
+       --profiler <nsamples> <usec_delay>
+
+This captures 'nsamples' wave samples with a delay of at least
+'usec_delay' microseconds between them.  The output is the list of
+sampled addresses and opcodes, sorted by descending hit count.
+For example,
+
+::
+
+        2865 hits (13 %)       2@0x100009c68    0xc4001c0f 0x00000100          exp mrt0 v0, v0, v1, v1 done compr vm
+        1199 hits ( 5 %)       2@0x1055e9724    0xc40008cf 0x0f090706          exp pos0 v6, v7, v9, v15 done
+        1155 hits ( 5 %)       2@0x100009c48    0xbf8c0f70 0x16000080          s_waitcnt vmcnt(0)
+         710 hits ( 3 %)       2@0x10000acf0    0xc4001c0f 0x00000100          exp mrt0 v0, v0, v1, v1 done compr vm
+         633 hits ( 3 %)       2@0x1023f14c4    0xc400040f 0x00000100          exp mrt0 v0, v0, v1, v1 compr
+         633 hits ( 3 %)       2@0x100008d64    0xbf8c0f70 0x0a161b12          s_waitcnt vmcnt(0)
+         617 hits ( 2 %)       2@0x10000a238    0xf0800700 0x00020400          image_sample v[4:6], v0, s[8:15], s[0:3] dmask:0x7
+       ...<snip>...
+
+The first line indicates that waves were found halted at the opcode at
+VMID 2, address 0x100009C68, 2865 times (13% of all captured wave data).
+The next two columns are the raw opcode data and the last column is the
+LLVM disassembly of the opcode.
+
+When testing a known shader this can be used to determine where
+the bulk of the processing time is spent.
+
diff --git a/doc/umr.1 b/doc/umr.1
index f1f5fec55946..a777d9312054 100644
--- a/doc/umr.1
+++ b/doc/umr.1
@@ -118,6 +118,10 @@ from stdin.
 Disassemble 'size' bytes (in hex) from a given address (in hex).  The size can 
be
 specified as zero to have umr try and compute the shader size.
 
+.IP "--profiler, -prof <nsamples> <usec_delay>"
+Capture 'nsamples' samples of wave data with at least usec_delay microseconds
+between captures.
+
 .IP "--update, -u" <filename>
 Specify update file to add, change, or delete registers from the register
 database.  Useful for adding registers that are not including in the kernel 
headers.
diff --git a/src/app/CMakeLists.txt b/src/app/CMakeLists.txt
index 4dceebb00e0d..7512a54f68bf 100644
--- a/src/app/CMakeLists.txt
+++ b/src/app/CMakeLists.txt
@@ -6,6 +6,7 @@ project(umr)
 add_library(umrapp
   print.c
   print_config.c
+  profile.c
   ring_read.c
   scan.c
   scan_log.c
diff --git a/src/app/main.c b/src/app/main.c
index 600f3ca02988..d6571e77b74d 100644
--- a/src/app/main.c
+++ b/src/app/main.c
@@ -495,13 +495,23 @@ int main(int argc, char **argv)
                                        shader.addr = address;
                                        size = umr_compute_shader_size(asic, 
&shader);
                                }
-                               umr_vm_disasm(asic, vmid, address, 0, size);
+                               umr_vm_disasm(asic, vmid, address, 0, size, 
NULL);
 
                                i += 2;
                        } else {
                                printf("--vm-disasm requires two parameters\n");
                                return EXIT_FAILURE;
                        }
+               } else if (!strcmp(argv[i], "-prof") || !strcmp(argv[i], 
"--profiler")) {
+                       if (i + 2 < argc) {
+                               if (!asic)
+                                       asic = get_asic();
+                               umr_profiler(asic, atoi(argv[i+1]), 
atoi(argv[i+2]));
+                               i += 2;
+                       } else {
+                               printf("--profiler requires two parameters\n");
+                               return EXIT_FAILURE;
+                       }
                } else if (!strcmp(argv[i], "--option") || !strcmp(argv[i], 
"-O")) {
                        if (i + 1 < argc) {
                                parse_options(argv[i+1]);
@@ -581,6 +591,9 @@ int main(int argc, char **argv)
 "\n\t--vm-disasm, -vdis [<vmid>@]<address> <size>"
        "\n\t\tDisassemble 'size' bytes (in hex) from a given address (in hex). 
 The size can"
        "\n\t\tbe specified as zero to have umr try and compute the shader 
size.\n"
+"\n\t--profiler, -prof <nsamples> <usec_delay>"
+       "\n\t\tCapture 'nsamples' samples of wave data with at least usec_delay"
+       "\n\t\tmicroseconds between captures.\n"
 "\n\t--option -O <string>[,<string>,...]\n\t\tEnable various flags: bits, 
bitsfull, empty_log, follow, no_follow_ib, named, many,"
        "\n\t\tuse_pci, use_colour, read_smc, quiet, no_kernel, verbose, 
halt_waves, disasm_early_term.\n"
 "\n\n", UMR_BUILD_VER, UMR_BUILD_REV);
diff --git a/src/app/print_waves.c b/src/app/print_waves.c
index d901bc902ff3..6965f7f31854 100644
--- a/src/app/print_waves.c
+++ b/src/app/print_waves.c
@@ -100,7 +100,7 @@ void umr_print_waves(struct umr_asic *asic)
                        }
 
                        pgm_addr = (((uint64_t)wd->ws.pc_hi << 32) | 
wd->ws.pc_lo) - (NUM_OPCODE_WORDS*4)/2;
-                       umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, 
(((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4);
+                       umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, 
(((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4, NULL);
                } else {
                        first = 0;
                        
printf("\n------------------------------------------------------\nse%u.sh%u.cu%u.simd%u.wave%u\n",
@@ -222,7 +222,7 @@ void umr_print_waves(struct umr_asic *asic)
 
                        printf("\n\nPGM_MEM:\n");
                        pgm_addr = (((uint64_t)wd->ws.pc_hi << 32) | 
wd->ws.pc_lo) - (NUM_OPCODE_WORDS*4)/2;
-                       umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, 
(((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4);
+                       umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, 
(((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4, NULL);
 
                        Hv("LDS_ALLOC", wd->ws.lds_alloc.value);
                        PP(lds_alloc, lds_base);
diff --git a/src/app/profile.c b/src/app/profile.c
new file mode 100644
index 000000000000..3ba3b36efe64
--- /dev/null
+++ b/src/app/profile.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Tom St Denis <tom.stde...@amd.com>
+ *
+ */
+#include "umrapp.h"
+
+/* One sampled wave: its VMID, the two instruction dwords at its PC, and the PC. */
+struct umr_profiler_hit {
+       uint32_t
+               vmid,
+               inst_dw0,
+               inst_dw1;
+
+       uint64_t
+               pc;
+};
+
+/* A distinct hit together with the number of samples that matched it. */
+struct umr_profiler_rle {
+       struct umr_profiler_hit data;
+       uint32_t cnt;
+};
+
+/*
+ * qsort() comparator that groups identical hits.  Members are compared one
+ * by one because the struct may contain padding bytes that are never
+ * initialised (e.g. in storage obtained from realloc()), which makes a
+ * whole-struct memcmp() unreliable.
+ */
+static int comp_hits(const void *A, const void *B)
+{
+       const struct umr_profiler_hit *a = A, *b = B;
+
+       if (a->vmid != b->vmid)
+               return a->vmid < b->vmid ? -1 : 1;
+       if (a->pc != b->pc)
+               return a->pc < b->pc ? -1 : 1;
+       if (a->inst_dw0 != b->inst_dw0)
+               return a->inst_dw0 < b->inst_dw0 ? -1 : 1;
+       if (a->inst_dw1 != b->inst_dw1)
+               return a->inst_dw1 < b->inst_dw1 ? -1 : 1;
+       return 0;
+}
+
+/* qsort() comparator: highest hit count first, safe against unsigned wrap. */
+static int comp_rle(const void *A, const void *B)
+{
+       const struct umr_profiler_rle *a = A, *b = B;
+
+       return (b->cnt > a->cnt) - (b->cnt < a->cnt);
+}
+
+/*
+ * Capture 'samples' rounds of halted-wave data, letting the waves run for
+ * 'delay' microseconds before each halt, then print every distinct
+ * (vmid, pc, opcode) tuple with its hit count, most frequent first.
+ * Does nothing when 'samples' is not positive; waves are resumed on exit.
+ */
+void umr_profiler(struct umr_asic *asic, int samples, int delay)
+{
+       struct umr_profiler_hit *phit, *tmp;
+       struct umr_profiler_rle *prle = NULL;
+       struct umr_wave_data *owd, *wd = NULL;
+       unsigned nitems = 0, nmax, x, y, z = 0;
+
+       if (samples <= 0)
+               return;
+
+       nmax = samples;
+       phit = calloc(nmax, sizeof *phit);
+       if (!phit)
+               goto oom;
+
+       while (samples--) {
+               fprintf(stderr, "%5u samples left\r", (unsigned)samples);
+               fflush(stderr);
+               do { // retry until at least one wave is captured
+                       umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME);
+                       if (delay > 0)
+                               usleep(delay);
+                       umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_HALT);
+                       wd = umr_scan_wave_data(asic);
+               } while (!wd);
+
+               // record each wave, freeing the list as it is consumed
+               while (wd) {
+                       if (nitems == nmax) {
+                               tmp = realloc(phit, (nmax + 1000) * sizeof(*phit));
+                               if (!tmp)
+                                       goto oom;
+                               phit = tmp;
+                               nmax += 1000;
+                       }
+                       phit[nitems].vmid = wd->ws.hw_id.vm_id;
+                       phit[nitems].inst_dw0 = wd->ws.wave_inst_dw0;
+                       phit[nitems].inst_dw1 = wd->ws.wave_inst_dw1;
+                       phit[nitems++].pc = ((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo;
+
+                       owd = wd->next;
+                       free(wd);
+                       wd = owd;
+               }
+       }
+       umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME);
+
+       qsort(phit, nitems, sizeof(*phit), comp_hits);
+       prle = calloc(nitems, sizeof *prle);
+       if (!prle)
+               goto oom;
+
+       // run-length encode the sorted hits; x == nitems flushes the last run
+       for (y = 0, x = 1; x <= nitems; x++) {
+               if (x == nitems || comp_hits(&phit[x], &phit[y])) {
+                       prle[z].data = phit[y];
+                       prle[z++].cnt = x - y;
+                       y = x;
+               }
+       }
+
+       qsort(prle, z, sizeof(*prle), comp_rle);
+       for (x = 0; x < z; x++) {
+               // NOTE(review): two slots kept from the original; presumably the
+               // disassembler may fill one string per dword -- confirm.
+               char *str[2] = { NULL, NULL };
+               unsigned char buf[8];
+
+               memcpy(buf, &prle[x].data.inst_dw0, 4);
+               memcpy(buf + 4, &prle[x].data.inst_dw1, 4);
+               umr_llvm_disasm(asic, buf, 8, 0, &str[0]);
+
+               printf("%5u hits (%2u %%)\t%u@0x%llx\t 0x%08lx 0x%08lx\t%s\n",
+                       (unsigned)prle[x].cnt,
+                       (unsigned)(((uint64_t)prle[x].cnt * 100) / nitems),
+                       (unsigned)prle[x].data.vmid,
+                       (unsigned long long)prle[x].data.pc,
+                       (unsigned long)prle[x].data.inst_dw0,
+                       (unsigned long)prle[x].data.inst_dw1,
+                       str[0] ? str[0] : "");
+               free(str[0]);
+               free(str[1]);
+       }
+       goto out;
+
+oom:
+       fprintf(stderr, "umr_profiler: out of memory\n");
+       while (wd) { // drop waves not yet recorded
+               owd = wd->next;
+               free(wd);
+               wd = owd;
+       }
+       umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME);
+out:
+       free(prle);
+       free(phit);
+}
diff --git a/src/app/ring_read.c b/src/app/ring_read.c
index 3ccec1be6d90..112e9f0414ad 100644
--- a/src/app/ring_read.c
+++ b/src/app/ring_read.c
@@ -32,6 +32,7 @@ void umr_read_ring(struct umr_asic *asic, char *ringpath)
        uint32_t wptr, rptr, drv_wptr, ringsize, start, end, value,
                 *ring_data;
        struct umr_ring_decoder decoder, *pdecoder, *ppdecoder;
+       struct umr_wave_data *wd;
 
        memset(ringname, 0, sizeof ringname);
        memset(from, 0, sizeof from);
@@ -146,18 +147,25 @@ void umr_read_ring(struct umr_asic *asic, char *ringpath)
        free(ring_data);
        printf("\n");
 
-       umr_dump_shaders(asic, &decoder);
+       wd = umr_scan_wave_data(asic);
+       umr_dump_shaders(asic, &decoder, wd);
        pdecoder = decoder.next_ib;
        while (pdecoder) {
                if (asic->options.follow_ib) {
                        umr_dump_ib(asic, pdecoder);
-                       umr_dump_shaders(asic, pdecoder);
+                       umr_dump_shaders(asic, pdecoder, wd);
                }
                ppdecoder = pdecoder->next_ib;
                free(pdecoder);
                pdecoder = ppdecoder;
        }
 
+       while (wd) {
+               struct umr_wave_data *pwd = wd->next;
+               free(wd);
+               wd = pwd;
+       }
+
 end:
        if (asic->options.halt_waves)
                umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME);
diff --git a/src/lib/dump_ib.c b/src/lib/dump_ib.c
index cdcbb8a70edd..d5e68d6981a0 100644
--- a/src/lib/dump_ib.c
+++ b/src/lib/dump_ib.c
@@ -67,7 +67,7 @@ void umr_dump_ib(struct umr_asic *asic, struct 
umr_ring_decoder *decoder)
        printf("End of IB\n\n");
 }
 
-void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder)
+void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder, 
struct umr_wave_data *wd)
 {
        struct umr_shaders_pgm *pshader, *shader;
 
@@ -79,7 +79,7 @@ void umr_dump_shaders(struct umr_asic *asic, struct 
umr_ring_decoder *decoder)
                                BLUE, (unsigned)shader->vmid, RST,
                                YELLOW, (unsigned long 
long)shader->src.ib_base, RST,
                                YELLOW, (unsigned)shader->src.ib_offset * 4, 
RST);
-               umr_vm_disasm(asic, shader->vmid, shader->addr, 0, 
shader->size);
+               umr_vm_disasm(asic, shader->vmid, shader->addr, 0, 
shader->size, wd);
                printf("\n");
                pshader = shader->next;
                free(shader);
diff --git a/src/lib/umr_llvm_disasm.c b/src/lib/umr_llvm_disasm.c
index 68f23f990fd2..5e1adf39a262 100644
--- a/src/lib/umr_llvm_disasm.c
+++ b/src/lib/umr_llvm_disasm.c
@@ -85,10 +85,31 @@ int umr_llvm_disasm(struct umr_asic *asic,
        return 0;
 }
 
-void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, 
uint64_t PC, uint32_t size)
+static struct umr_wave_data *find_wave(struct umr_wave_data *wd, unsigned 
vmid, uint64_t addr)
 {
-       uint32_t *opcodes, x;
+       while (wd) {
+               uint64_t PC;
+               PC = ((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo;
+               if (wd->ws.hw_id.vm_id == vmid && addr == PC)
+                       break;
+               wd = wd->next;
+       }
+       return wd;
+}
+
+
+void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, 
uint64_t PC, uint32_t size, struct umr_wave_data *wd)
+{
+       uint32_t *opcodes, x, nwave, wavehits;
        char **opcode_strs = NULL;
+       struct umr_wave_data *pwd;
+
+       wavehits = nwave = 0;
+       pwd = wd;
+       while (pwd) {
+               ++nwave;
+               pwd = pwd->next;
+       }
 
        opcodes = calloc(size/4, sizeof(*opcodes));
        if (!opcodes)
@@ -106,14 +127,35 @@ void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, 
uint64_t addr, uint64_t
                        printf(" * ");
                else
                        printf("   ");
-               printf("pgm[%s%lu%s@%s0x%llx%s + %s0x%-4x%s] = 
%s0x%08lx%s\t%s%s%s\n",
+               printf("pgm[%s%lu%s@%s0x%llx%s + %s0x%-4x%s] = 
%s0x%08lx%s\t%s%-60s%s\t",
                        BLUE, (unsigned long)vmid, RST,
                        YELLOW, (unsigned long long)addr, RST,
                        YELLOW, (unsigned)x * 4, RST,
                        BLUE, (unsigned long)opcodes[x], RST,
                        GREEN, opcode_strs[x], RST);
                free(opcode_strs[x]);
+
+               if (wd) {
+                       unsigned n;
+                       pwd = find_wave(wd, vmid, addr + x * 4);
+                       n = 0;
+                       while (pwd) {
+                               ++n;
+                               ++wavehits;
+                               if (asic->options.bitfields)
+                                       printf("[se%u.sh%u.cu%u.simd%u.wave%u] 
",
+                                               (unsigned)pwd->se, 
(unsigned)pwd->sh, (unsigned)pwd->cu, (unsigned)pwd->ws.hw_id.simd_id, 
(unsigned)pwd->ws.hw_id.wave_id);
+                               pwd = find_wave(pwd->next, vmid, addr + x * 4);
+                       }
+                       if (n)
+                               printf("[%3u waves (%3u %%)]", n, (n * 100) / 
nwave);
+               }
+               printf("\n");
        }
+       printf("End of disassembly.\n");
+
+       if (wd && wavehits)
+               printf("\t%u waves in this shader (out of %u active waves)\n", 
wavehits, nwave);
 
        free(opcode_strs);
        free(opcodes);
diff --git a/src/umr.h b/src/umr.h
index e99ee965527e..f026e82be98e 100644
--- a/src/umr.h
+++ b/src/umr.h
@@ -621,12 +621,13 @@ int umr_sq_cmd_halt_waves(struct umr_asic *asic, enum 
umr_sq_cmd_halt_resume mod
 /* IB/ring decoding/dumping/etc */
 void umr_print_decode(struct umr_asic *asic, struct umr_ring_decoder *decoder, 
uint32_t ib);
 void umr_dump_ib(struct umr_asic *asic, struct umr_ring_decoder *decoder);
-void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder);
+void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder, 
struct umr_wave_data *wd);
 
 int umr_llvm_disasm(struct umr_asic *asic,
                                        uint8_t *inst, unsigned inst_bytes,
                                        uint64_t PC,
                                        char **disasm_text);
+void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, 
uint64_t PC, uint32_t size, struct umr_wave_data *wd);
 uint32_t umr_compute_shader_size(struct umr_asic *asic,
                                                                 struct 
umr_shaders_pgm *shader);
 
diff --git a/src/umrapp.h b/src/umrapp.h
index 2f52d3093abe..e11a7d6e53f5 100644
--- a/src/umrapp.h
+++ b/src/umrapp.h
@@ -48,6 +48,4 @@ void umr_top(struct umr_asic *asic);
 
 void umr_print_config(struct umr_asic *asic);
 void umr_print_waves(struct umr_asic *asic);
-
-void umr_app_disasm(struct umr_asic *asic);
-void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, 
uint64_t PC, uint32_t size);
+void umr_profiler(struct umr_asic *asic, int samples, int delay);
-- 
2.14.3

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to