Re: [Mesa-dev] [PATCH] nvc0: add MP performance counters for SM35 (GK110:GM107)

2016-02-16 Thread Samuel Pitoiset



On 02/16/2016 10:04 PM, Ilia Mirkin wrote:

On Tue, Feb 16, 2016 at 3:59 PM, Samuel Pitoiset
 wrote:

  static inline const struct nvc0_hw_sm_query_cfg **
  nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
  {
+   const struct nvc0_hw_sm_query_cfg **queries = NULL;
 struct nouveau_device *dev = screen->base.device;

-   if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
-  return sm20_hw_sm_queries;
-   return sm21_hw_sm_queries;
+   switch (dev->chipset & ~0xf) {
+   case 0xc0:
+   case 0xd0:
+  if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
+ queries = sm20_hw_sm_queries;
+  else
+ queries = sm21_hw_sm_queries;
+  break;
+   case 0xe0:
+  queries = sm30_hw_sm_queries;
+  break;
+   case 0xf0:
+   case 0x100:
+  queries = sm35_hw_sm_queries;
+  break;
+   default:
+  break;
+   }
+   return queries;
  }


This might be wiser to do based on 3d class. For example GK20A (aka
0xea chipset) uses SM35.


Yeah, maybe this could improve readability.
Anyway, once all the performance counters are upstream, I think it would
be good to refactor the code (or at least try to).



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] nvc0: add MP performance counters for SM35 (GK110:GM107)

2016-02-16 Thread Ilia Mirkin
On Tue, Feb 16, 2016 at 3:59 PM, Samuel Pitoiset
 wrote:
>  static inline const struct nvc0_hw_sm_query_cfg **
>  nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
>  {
> +   const struct nvc0_hw_sm_query_cfg **queries = NULL;
> struct nouveau_device *dev = screen->base.device;
>
> -   if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
> -  return sm20_hw_sm_queries;
> -   return sm21_hw_sm_queries;
> +   switch (dev->chipset & ~0xf) {
> +   case 0xc0:
> +   case 0xd0:
> +  if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
> + queries = sm20_hw_sm_queries;
> +  else
> + queries = sm21_hw_sm_queries;
> +  break;
> +   case 0xe0:
> +  queries = sm30_hw_sm_queries;
> +  break;
> +   case 0xf0:
> +   case 0x100:
> +  queries = sm35_hw_sm_queries;
> +  break;
> +   default:
> +  break;
> +   }
> +   return queries;
>  }

This might be wiser to do based on 3d class. For example GK20A (aka
0xea chipset) uses SM35.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] nvc0: add MP performance counters for SM35 (GK110:GM107)

2016-02-16 Thread Samuel Pitoiset
Because compute support is not enabled by default for these chipsets,
NVF0_COMPUTE=1 needs to be used, along with GALLIUM_HUD to enable
performance counters.

Signed-off-by: Samuel Pitoiset 
---
 .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c| 755 ++---
 .../drivers/nouveau/nvc0/nvc0_query_hw_sm.h|   2 +
 .../drivers/nouveau/nvc0/nve4_compute.xml.h|   4 +
 3 files changed, 667 insertions(+), 94 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c 
b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
index 68c8ff5..b584532 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
@@ -65,6 +65,7 @@ static const char *nve4_hw_sm_query_names[] =
"local_load_transactions",
"local_store",
"local_store_transactions",
+   "not_predicated_off_thread_inst_executed",
"prof_trigger_00",
"prof_trigger_01",
"prof_trigger_02",
@@ -78,6 +79,7 @@ static const char *nve4_hw_sm_query_names[] =
"shared_store",
"shared_store_replay",
"sm_cta_launched",
+   "thread_inst_executed",
"threads_launched",
"uncached_global_load_transaction",
"warps_launched",
@@ -169,6 +171,49 @@ static const uint64_t nve4_read_hw_sm_counters_code[] =
0x80001de7ULL
 };
 
+static const uint64_t nvf0_read_hw_sm_counters_code[] =
+{
+   /* Same kernel as GK104:GK110 */
+   0x0880808080808080ULL,
+   0x8640109c0022ULL,
+   0x8640019c0032ULL,
+   0x8640021c0002ULL,
+   0x8640029c0006ULL,
+   0x8640031c000aULL,
+   0x8640039c000eULL,
+   0x8640041c0012ULL,
+   0x08ac1080108c8080ULL,
+   0x8640049c0016ULL,
+   0x8640051c001aULL,
+   0x8640059c001eULL,
+   0xdb201c007f9c201eULL,
+   0x64c03c1c002aULL,
+   0xc0020a1c3021ULL,
+   0x64c03c9c002eULL,
+   0x0810a0808010b810ULL,
+   0xc001041c3025ULL,
+   0x1820003cULL,
+   0xdb201c007f9c243eULL,
+   0xc1c0301c2021ULL,
+   0xc1c0081c2431ULL,
+   0xc1c0021c2435ULL,
+   0xe080069c2026ULL,
+   0x08b010b010b010a0ULL,
+   0xe080061c2022ULL,
+   0xe4c03c00051c0032ULL,
+   0xe084041c282aULL,
+   0xe4c03c00059c0036ULL,
+   0xe08040007f9c2c2eULL,
+   0xe084049c3032ULL,
+   0xfe80001c2800ULL,
+   0x08b81080b010ULL,
+   0x64c03c00011c0002ULL,
+   0xe08040007f9c3436ULL,
+   0xfe8020043010ULL,
+   0xfc80281c3000ULL,
+   0x181c003cULL,
+};
+
 /* For simplicity, we will allocate as many group slots as we allocate counter
  * slots. This means that a single counter which wants to source from 2 groups
  * will have to be declared as using 2 counter slots. This shouldn't really be
@@ -192,64 +237,539 @@ struct nvc0_hw_sm_query_cfg
uint8_t norm[2]; /* normalization num,denom */
 };
 
-#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, 
NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, 
{}, {}, {} }, 1, { nu, dn } }
-#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, 
NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, 
{}, {}, {} }, 1, { nu, dn } }
+#define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 
NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }
+#define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, 
NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }
+#define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c
+
+/*  Compute capability 3.0 (GK104:GK110)  */
+static const struct nvc0_hw_sm_query_cfg
+sm30_active_cycles =
+{
+   .ctr[0]   = _CB(0x0001, B6, WARP, 0x),
+   .num_counters = 1,
+   .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm30_active_warps =
+{
+   .ctr[0]   = _CB(0x003f, B6, WARP, 0x31483104),
+   .num_counters = 1,
+   .norm = { 2, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm30_atom_cas_count =
+{
+   .ctr[0]   = _CA(0x0001, B6, BRANCH, 0x4),
+   .num_counters = 1,
+   .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm30_atom_count =
+{
+   .ctr[0]   = _CA(0x0001, B6, BRANCH, 0x),
+   .num_counters = 1,
+   .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm30_branch =
+{
+   .ctr[0]   = _CA(0x0001, B6, BRANCH, 0x000c),
+   .num_counters = 1,
+   .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm30_divergent_branch =
+{
+   .ctr[0]   = _CA(0x0001, B6, BRANCH, 0x0010),
+   .num_counters = 1,
+   .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm30_gld_request =
+{
+   .ctr[0]   = _CA(0x0001, B6, LDST, 0x0010),
+   .num_counters = 1,
+   .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm30_gld_mem_div_replay =
+{
+   .ctr[0]   = _CB(0x0001, B6, REPLAY,