Re: [Nouveau] [RFC PATCH 6/8] nv50: add support for compute/graphics global performance counters

2015-06-25 Thread Ilia Mirkin
What's with the \%'s everywhere?

On Mon, Jun 22, 2015 at 4:53 PM, Samuel Pitoiset
 wrote:
> This commit adds support for both compute and graphics global
> performance counters which have been reverse engineered with
> CUPTI (Linux) and PerfKit (Windows).
>
> Currently, only one query type can be monitored at the same time because
> the Gallium's HUD doesn't fit pretty well. This will be improved later.
>
> Signed-off-by: Samuel Pitoiset 
> ---
>  src/gallium/drivers/nouveau/nv50/nv50_query.c  | 1057 
> +++-
>  src/gallium/drivers/nouveau/nv50/nv50_screen.h |   35 +
>  2 files changed, 1087 insertions(+), 5 deletions(-)
>
> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c 
> b/src/gallium/drivers/nouveau/nv50/nv50_query.c
> index 1162110..b9d2914 100644
> --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
> +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
> @@ -27,6 +27,8 @@
>  #include "nv50/nv50_context.h"
>  #include "nv_object.xml.h"
>
> +#include "nouveau_perfmon.h"
> +
>  #define NV50_QUERY_STATE_READY   0
>  #define NV50_QUERY_STATE_ACTIVE  1
>  #define NV50_QUERY_STATE_ENDED   2
> @@ -51,10 +53,25 @@ struct nv50_query {
> boolean is64bit;
> struct nouveau_mm_allocation *mm;
> struct nouveau_fence *fence;
> +   struct nouveau_object *perfdom;
>  };
>
>  #define NV50_QUERY_ALLOC_SPACE 256
>
> +#ifdef DEBUG
> +static void nv50_hw_pm_dump_perfdom(struct nvif_perfdom_v0 *args);
> +#endif
> +
> +static boolean
> +nv50_hw_pm_query_create(struct nv50_context *, struct nv50_query *);
> +static void
> +nv50_hw_pm_query_destroy(struct nv50_context *, struct nv50_query *);
> +static boolean
> +nv50_hw_pm_query_begin(struct nv50_context *, struct nv50_query *);
> +static void nv50_hw_pm_query_end(struct nv50_context *, struct nv50_query *);
> +static boolean nv50_hw_pm_query_result(struct nv50_context *,
> +struct nv50_query *, boolean, void *);
> +
>  static INLINE struct nv50_query *
>  nv50_query(struct pipe_query *pipe)
>  {
> @@ -96,12 +113,18 @@ nv50_query_allocate(struct nv50_context *nv50, struct 
> nv50_query *q, int size)
>  static void
>  nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
>  {
> +   struct nv50_context *nv50 = nv50_context(pipe);
> +   struct nv50_query *q = nv50_query(pq);
> +
> if (!pq)
>return;
>
> -   nv50_query_allocate(nv50_context(pipe), nv50_query(pq), 0);
> -   nouveau_fence_ref(NULL, &nv50_query(pq)->fence);
> -   FREE(nv50_query(pq));
> +   if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST))
> +  nv50_hw_pm_query_destroy(nv50, q);
> +
> +   nv50_query_allocate(nv50, q, 0);
> +   nouveau_fence_ref(NULL, &q->fence);
> +   FREE(q);
>  }
>
>  static struct pipe_query *
> @@ -130,6 +153,11 @@ nv50_query_create(struct pipe_context *pipe, unsigned 
> type, unsigned index)
>q->data -= 32 / sizeof(*q->data); /* we advance before query_begin ! */
> }
>
> +   if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) 
> {
> +  if (!nv50_hw_pm_query_create(nv50, q))
> + return NULL;
> +   }
> +
> return (struct pipe_query *)q;
>  }
>
> @@ -154,6 +182,7 @@ nv50_query_begin(struct pipe_context *pipe, struct 
> pipe_query *pq)
> struct nv50_context *nv50 = nv50_context(pipe);
> struct nouveau_pushbuf *push = nv50->base.pushbuf;
> struct nv50_query *q = nv50_query(pq);
> +   boolean ret = TRUE;
>
> if (!pq)
>return FALSE;
> @@ -211,10 +240,13 @@ nv50_query_begin(struct pipe_context *pipe, struct 
> pipe_query *pq)
>nv50_query_get(push, q, 0x10, 0x5002);
>break;
> default:
> +  if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= 
> NV50_HW_PM_QUERY_LAST)) {
> + ret = nv50_hw_pm_query_begin(nv50, q);
> +  }
>break;
> }
> q->state = NV50_QUERY_STATE_ACTIVE;
> -   return true;
> +   return ret;
>  }
>
>  static void
> @@ -274,7 +306,9 @@ nv50_query_end(struct pipe_context *pipe, struct 
> pipe_query *pq)
>q->state = NV50_QUERY_STATE_READY;
>break;
> default:
> -  assert(0);
> +  if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= 
> NV50_HW_PM_QUERY_LAST)) {
> + nv50_hw_pm_query_end(nv50, q);
> +  }
>break;
> }
>
> @@ -309,6 +343,10 @@ nv50_query_result(struct pipe_context *pipe, struct 
> pipe_query *pq,
> if (!pq)
>return FALSE;
>
> +   if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) 
> {
> +  return nv50_hw_pm_query_result(nv50, q, wait, result);
> +   }
> +
> if (q->state != NV50_QUERY_STATE_READY)
>nv50_query_update(q);
>
> @@ -488,6 +526,1015 @@ nva0_so_target_save_offset(struct pipe_context *pipe,
> nv50_query_end(pipe, targ->pq);
>  }
>
> +/* === HARDWARE GLOBAL PERFORMANCE COUNTERS for NV50 === */
> +
> +struct nv50_hw_pm_source_cfg
> +{
> +   const char *name;
> +   uint64_t value;
> +};
> +
> +struct nv50_hw_pm_signal_cfg

Re: [Nouveau] [RFC PATCH 4/8] nv50: configure the ring buffer for reading back PM counters

2015-06-25 Thread Ilia Mirkin
Yeah, this whole thing has to be guarded by a drm version check,
otherwise it'll end up with errors in dmesg I assume. Perhaps only
allocate screen->query when the drm version matches, and gate things
on that for the rest of the code?

On Mon, Jun 22, 2015 at 4:53 PM, Samuel Pitoiset
 wrote:
> To write data at the right offset, the kernel has to know some
> parameters of this ring buffer, like the number of domains and the
> maximum number of queries.
>
> Signed-off-by: Samuel Pitoiset 
> ---
>  src/gallium/drivers/nouveau/nv50/nv50_screen.c | 7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c 
> b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
> index 3a99cc8..53817c0 100644
> --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
> +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
> @@ -441,6 +441,13 @@ nv50_screen_init_hwctx(struct nv50_screen *screen)
>
> BEGIN_NV04(push, SUBC_SW(NV01_SUBCHAN_OBJECT), 1);
> PUSH_DATA (push, screen->sw->handle);
> +   BEGIN_NV04(push, SUBC_SW(0x0190), 1);
> +   PUSH_DATA (push, screen->query->handle);
> +   // XXX: Maybe add a check for DRM version here ?
> +   BEGIN_NV04(push, SUBC_SW(0x0600), 1);
> +   PUSH_DATA (push, NV50_HW_PM_RING_BUFFER_MAX_QUERIES);
> +   BEGIN_NV04(push, SUBC_SW(0x0604), 1);
> +   PUSH_DATA (push, NV50_HW_PM_RING_BUFFER_NUM_DOMAINS);

FYI you can do BEGIN_NV04(..., 2), since they're sequential.

>
> BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
> PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS);
> --
> 2.4.4
>
> ___
> Nouveau mailing list
> Nouveau@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/nouveau
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [RFC PATCH 3/8] nv50: allocate and map a notifier buffer object for PM

2015-06-25 Thread Ilia Mirkin
On Mon, Jun 22, 2015 at 4:53 PM, Samuel Pitoiset
 wrote:
> This notifier buffer object will be used to read back global performance
> counters results written by the kernel.
>
> For each domain, we will store the handle of the perfdom object, an
> array of 4 counters and the number of cycles. Like the Gallium's HUD,
> we keep a list of busy queries in a ring in order to prevent stalls
> when reading queries.
>
> Signed-off-by: Samuel Pitoiset 
> ---
>  src/gallium/drivers/nouveau/nv50/nv50_screen.c | 29 
> ++
>  src/gallium/drivers/nouveau/nv50/nv50_screen.h |  6 ++
>  2 files changed, 35 insertions(+)
>
> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c 
> b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
> index c985344..3a99cc8 100644
> --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
> +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
> @@ -368,6 +368,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
> nouveau_object_del(&screen->m2mf);
> nouveau_object_del(&screen->sync);
> nouveau_object_del(&screen->sw);
> +   nouveau_object_del(&screen->query);
>
> nouveau_screen_fini(&screen->base);
>
> @@ -699,9 +700,11 @@ nv50_screen_create(struct nouveau_device *dev)
> struct nv50_screen *screen;
> struct pipe_screen *pscreen;
> struct nouveau_object *chan;
> +   struct nv04_fifo *fifo;
> uint64_t value;
> uint32_t tesla_class;
> unsigned stack_size;
> +   uint32_t length;
> int ret;
>
> screen = CALLOC_STRUCT(nv50_screen);
> @@ -727,6 +730,7 @@ nv50_screen_create(struct nouveau_device *dev)
> screen->base.pushbuf->rsvd_kick = 5;
>
> chan = screen->base.channel;
> +   fifo = chan->data;
>
> pscreen->destroy = nv50_screen_destroy;
> pscreen->context_create = nv50_create;
> @@ -772,6 +776,23 @@ nv50_screen_create(struct nouveau_device *dev)
>goto fail;
> }
>
> +   /* Compute size (in bytes) of the notifier buffer object which is used
> +* in order to read back global performance counters results written
> +* by the kernel. For each domain, we store the handle of the perfdom
> +* object, an array of 4 counters and the number of cycles. Like for
> +* the Gallium's HUD, we keep a list of busy queries in a ring in order
> +* to prevent stalls when reading queries. */
> +   length = (1 + (NV50_HW_PM_RING_BUFFER_NUM_DOMAINS * 6) *
> +  NV50_HW_PM_RING_BUFFER_MAX_QUERIES) * 4;

This calculation may become apparent to me later, but it certainly
isn't now. What's the *6? You refer to an array of 4 counters...
should that have been 6 counters? Or should this have been a 4?

> +
> +   ret = nouveau_object_new(chan, 0xbeef0302, NOUVEAU_NOTIFIER_CLASS,
> +&(struct nv04_notify){ .length = length },
> +sizeof(struct nv04_notify), &screen->query);
> +   if (ret) {
> +   NOUVEAU_ERR("Failed to allocate notifier object for PM: %d\n", ret);
> +   goto fail;
> +   }
> +
> ret = nouveau_object_new(chan, 0xbeef506e, 0x506e,
>  NULL, 0, &screen->sw);
> if (ret) {
> @@ -845,6 +866,14 @@ nv50_screen_create(struct nouveau_device *dev)
> nouveau_heap_init(&screen->gp_code_heap, 0, 1 << NV50_CODE_BO_SIZE_LOG2);
> nouveau_heap_init(&screen->fp_code_heap, 0, 1 << NV50_CODE_BO_SIZE_LOG2);
>
> +   ret = nouveau_bo_wrap(screen->base.device, fifo->notify, 
> &screen->notify_bo);
> +   if (ret == 0)
> +  nouveau_bo_map(screen->notify_bo, 0, screen->base.client);

ret = ...

> +   if (ret) {
> +  NOUVEAU_ERR("Failed to map notifier object for PM: %d\n", ret);
> +  goto fail;
> +   }
> +
> nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
>
> screen->TPs = util_bitcount(value & 0xffff);
> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h 
> b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
> index 69fdfdb..71a5247 100644
> --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
> +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
> @@ -59,6 +59,7 @@ struct nv50_screen {
> struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */
> struct nouveau_bo *stack_bo;
> struct nouveau_bo *tls_bo;
> +   struct nouveau_bo *notify_bo;
>
> unsigned TPs;
> unsigned MPsInTP;
> @@ -89,6 +90,7 @@ struct nv50_screen {
> } fence;
>
> struct nouveau_object *sync;
> +   struct nouveau_object *query;
>
> struct nouveau_object *tesla;
> struct nouveau_object *eng2d;
> @@ -96,6 +98,10 @@ struct nv50_screen {
> struct nouveau_object *sw;
>  };
>
> +/* Parameters of the ring buffer used to read back global PM counters. */
> +#define NV50_HW_PM_RING_BUFFER_NUM_DOMAINS 8
> +#define NV50_HW_PM_RING_BUFFER_MAX_QUERIES 9 /* HUD_NUM_QUERIES + 1 */
> +
>  static INLINE struct nv50_screen *
>  nv50_screen(struct pipe_screen *screen)
>  {
> --
> 2.4.4
>
> ___
> Nouveau mailing list
> Nouveau@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/nouveau

[Nouveau] [Bug 70354] [NVE6, NVE7] HUB_INIT timeout on graph init, blob fw doesn't help

2015-06-25 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=70354

--- Comment #55 from buhman  ---
If anyone wants to play with bskeggs' hack:

http://cgit.freedesktop.org/~darktama/nouveau/commit/?h=hack-gk106m

He's tested W541; I've tested W540 (K2100). You'll likely need nouveau.runpm=0
to avoid hanging. PRIME works. vdpau "works". 3D performance at pstate 0a is
something like 50% of nvidia.

-- 
You are receiving this mail because:
You are the assignee for the bug.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] What are the restrictions around loading indirect constbuf values

2015-06-25 Thread Ilia Mirkin
Hello,

We recently tracked down a bug on Tesla GPUs (i.e. G80-GT218) whereby
it appears that instructions like

0028: b5000409 08000780 add rn f32 $r2 $r2 neg c0[$a1]
0040: b500060d 08004780 add rn f32 $r3 $r3 neg c0[$a1+0x4]

or with nvdisasm:

.headerflags@"EF_CUDA_SM12 EF_CUDA_PTX_SM(EF_CUDA_SM12)"
/**/ FADD R2, R2, -c[0x0][A1+0x0];  /* 0x08000780b5000409 */
/*0008*/ FADD R3, R3, -c[0x0][A1+0x1];  /* 0x08004780b500060d */

don't appear to execute properly. However just MOV'ing the values into
registers works fine. This was observed on a G92 chip. See bug
https://bugs.freedesktop.org/show_bug.cgi?id=91056.

I was hoping you could save me some time and let me know what
instructions can load things like c0[$a1+4] (or maybe it's only in
combination with the modifier?), and which Tesla-family GPU's have
those restrictions.

Thanks,

  -ilia
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [Bug 91056] The Bard's Tale (2005, native) has rendering issues

2015-06-25 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=91056

--- Comment #4 from Ilia Mirkin  ---
Created attachment 116707
  --> https://bugs.freedesktop.org/attachment.cgi?id=116707&action=edit
provisional fix

OK, so this patch appears to fix it. The shaders at the end of that opt trace
have

0028: b5000409 08000780 add rn f32 $r2 $r2 neg c0[$a1]
0040: b500060d 08004780 add rn f32 $r3 $r3 neg c0[$a1+0x4]

etc in them. Which seems innocuous enough, but... something about it is bad.
Perhaps the neg + indirect. Perhaps just indirect. Perhaps... who knows. Needs
testing.

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau