Re: [Nouveau] [RFC PATCH 6/8] nv50: add support for compute/graphics global performance counters
What's with the \%'s everywhere? On Mon, Jun 22, 2015 at 4:53 PM, Samuel Pitoiset wrote: > This commit adds support for both compute and graphics global > performance counters which have been reverse engineered with > CUPTI (Linux) and PerfKit (Windows). > > Currently, only one query type can be monitored at the same time because > the Gallium's HUD doesn't fit pretty well. This will be improved later. > > Signed-off-by: Samuel Pitoiset > --- > src/gallium/drivers/nouveau/nv50/nv50_query.c | 1057 > +++- > src/gallium/drivers/nouveau/nv50/nv50_screen.h | 35 + > 2 files changed, 1087 insertions(+), 5 deletions(-) > > diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c > b/src/gallium/drivers/nouveau/nv50/nv50_query.c > index 1162110..b9d2914 100644 > --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c > +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c > @@ -27,6 +27,8 @@ > #include "nv50/nv50_context.h" > #include "nv_object.xml.h" > > +#include "nouveau_perfmon.h" > + > #define NV50_QUERY_STATE_READY 0 > #define NV50_QUERY_STATE_ACTIVE 1 > #define NV50_QUERY_STATE_ENDED 2 > @@ -51,10 +53,25 @@ struct nv50_query { > boolean is64bit; > struct nouveau_mm_allocation *mm; > struct nouveau_fence *fence; > + struct nouveau_object *perfdom; > }; > > #define NV50_QUERY_ALLOC_SPACE 256 > > +#ifdef DEBUG > +static void nv50_hw_pm_dump_perfdom(struct nvif_perfdom_v0 *args); > +#endif > + > +static boolean > +nv50_hw_pm_query_create(struct nv50_context *, struct nv50_query *); > +static void > +nv50_hw_pm_query_destroy(struct nv50_context *, struct nv50_query *); > +static boolean > +nv50_hw_pm_query_begin(struct nv50_context *, struct nv50_query *); > +static void nv50_hw_pm_query_end(struct nv50_context *, struct nv50_query *); > +static boolean nv50_hw_pm_query_result(struct nv50_context *, > +struct nv50_query *, boolean, void *); > + > static INLINE struct nv50_query * > nv50_query(struct pipe_query *pipe) > { > @@ -96,12 +113,18 @@ nv50_query_allocate(struct 
nv50_context *nv50, struct > nv50_query *q, int size) > static void > nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) > { > + struct nv50_context *nv50 = nv50_context(pipe); > + struct nv50_query *q = nv50_query(pq); > + > if (!pq) >return; > > - nv50_query_allocate(nv50_context(pipe), nv50_query(pq), 0); > - nouveau_fence_ref(NULL, &nv50_query(pq)->fence); > - FREE(nv50_query(pq)); > + if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) > + nv50_hw_pm_query_destroy(nv50, q); > + > + nv50_query_allocate(nv50, q, 0); > + nouveau_fence_ref(NULL, &q->fence); > + FREE(q); > } > > static struct pipe_query * > @@ -130,6 +153,11 @@ nv50_query_create(struct pipe_context *pipe, unsigned > type, unsigned index) >q->data -= 32 / sizeof(*q->data); /* we advance before query_begin ! */ > } > > + if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) > { > + if (!nv50_hw_pm_query_create(nv50, q)) > + return NULL; > + } > + > return (struct pipe_query *)q; > } > > @@ -154,6 +182,7 @@ nv50_query_begin(struct pipe_context *pipe, struct > pipe_query *pq) > struct nv50_context *nv50 = nv50_context(pipe); > struct nouveau_pushbuf *push = nv50->base.pushbuf; > struct nv50_query *q = nv50_query(pq); > + boolean ret = TRUE; > > if (!pq) >return FALSE; > @@ -211,10 +240,13 @@ nv50_query_begin(struct pipe_context *pipe, struct > pipe_query *pq) >nv50_query_get(push, q, 0x10, 0x5002); >break; > default: > + if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= > NV50_HW_PM_QUERY_LAST)) { > + ret = nv50_hw_pm_query_begin(nv50, q); > + } >break; > } > q->state = NV50_QUERY_STATE_ACTIVE; > - return true; > + return ret; > } > > static void > @@ -274,7 +306,9 @@ nv50_query_end(struct pipe_context *pipe, struct > pipe_query *pq) >q->state = NV50_QUERY_STATE_READY; >break; > default: > - assert(0); > + if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= > NV50_HW_PM_QUERY_LAST)) { > + nv50_hw_pm_query_end(nv50, q); > + } >break; > } > > @@ 
-309,6 +343,10 @@ nv50_query_result(struct pipe_context *pipe, struct > pipe_query *pq, > if (!pq) >return FALSE; > > + if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) > { > + return nv50_hw_pm_query_result(nv50, q, wait, result); > + } > + > if (q->state != NV50_QUERY_STATE_READY) >nv50_query_update(q); > > @@ -488,6 +526,1015 @@ nva0_so_target_save_offset(struct pipe_context *pipe, > nv50_query_end(pipe, targ->pq); > } > > +/* === HARDWARE GLOBAL PERFORMANCE COUNTERS for NV50 === */ > + > +struct nv50_hw_pm_source_cfg > +{ > + const char *name; > + uint64_t value; > +}; > + > +struct nv50_hw_pm_signal_cf
Re: [Nouveau] [RFC PATCH 4/8] nv50: configure the ring buffer for reading back PM counters
Yeah, this whole thing has to be guarded by a drm version check, otherwise it'll end up with errors in dmesg I assume. Perhaps only allocate screen->query when the drm version matches, and gate things on that for the rest of the code? On Mon, Jun 22, 2015 at 4:53 PM, Samuel Pitoiset wrote: > To write data at the right offset, the kernel has to know some > parameters of this ring buffer, like the number of domains and the > maximum number of queries. > > Signed-off-by: Samuel Pitoiset > --- > src/gallium/drivers/nouveau/nv50/nv50_screen.c | 7 +++ > 1 file changed, 7 insertions(+) > > diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c > b/src/gallium/drivers/nouveau/nv50/nv50_screen.c > index 3a99cc8..53817c0 100644 > --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c > +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c > @@ -441,6 +441,13 @@ nv50_screen_init_hwctx(struct nv50_screen *screen) > > BEGIN_NV04(push, SUBC_SW(NV01_SUBCHAN_OBJECT), 1); > PUSH_DATA (push, screen->sw->handle); > + BEGIN_NV04(push, SUBC_SW(0x0190), 1); > + PUSH_DATA (push, screen->query->handle); > + // XXX: Maybe add a check for DRM version here ? > + BEGIN_NV04(push, SUBC_SW(0x0600), 1); > + PUSH_DATA (push, NV50_HW_PM_RING_BUFFER_MAX_QUERIES); > + BEGIN_NV04(push, SUBC_SW(0x0604), 1); > + PUSH_DATA (push, NV50_HW_PM_RING_BUFFER_NUM_DOMAINS); FYI you can do BEGIN_NV04(..., 2), since they're sequential. > > BEGIN_NV04(push, NV50_3D(COND_MODE), 1); > PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS); > -- > 2.4.4 > > ___ > Nouveau mailing list > Nouveau@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/nouveau ___ Nouveau mailing list Nouveau@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/nouveau
Re: [Nouveau] [RFC PATCH 3/8] nv50: allocate and map a notifier buffer object for PM
On Mon, Jun 22, 2015 at 4:53 PM, Samuel Pitoiset wrote: > This notifier buffer object will be used to read back global performance > counters results written by the kernel. > > For each domain, we will store the handle of the perfdom object, an > array of 4 counters and the number of cycles. Like the Gallium's HUD, > we keep a list of busy queries in a ring in order to prevent stalls > when reading queries. > > Signed-off-by: Samuel Pitoiset > --- > src/gallium/drivers/nouveau/nv50/nv50_screen.c | 29 > ++ > src/gallium/drivers/nouveau/nv50/nv50_screen.h | 6 ++ > 2 files changed, 35 insertions(+) > > diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c > b/src/gallium/drivers/nouveau/nv50/nv50_screen.c > index c985344..3a99cc8 100644 > --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c > +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c > @@ -368,6 +368,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen) > nouveau_object_del(&screen->m2mf); > nouveau_object_del(&screen->sync); > nouveau_object_del(&screen->sw); > + nouveau_object_del(&screen->query); > > nouveau_screen_fini(&screen->base); > > @@ -699,9 +700,11 @@ nv50_screen_create(struct nouveau_device *dev) > struct nv50_screen *screen; > struct pipe_screen *pscreen; > struct nouveau_object *chan; > + struct nv04_fifo *fifo; > uint64_t value; > uint32_t tesla_class; > unsigned stack_size; > + uint32_t length; > int ret; > > screen = CALLOC_STRUCT(nv50_screen); > @@ -727,6 +730,7 @@ nv50_screen_create(struct nouveau_device *dev) > screen->base.pushbuf->rsvd_kick = 5; > > chan = screen->base.channel; > + fifo = chan->data; > > pscreen->destroy = nv50_screen_destroy; > pscreen->context_create = nv50_create; > @@ -772,6 +776,23 @@ nv50_screen_create(struct nouveau_device *dev) >goto fail; > } > > + /* Compute size (in bytes) of the notifier buffer object which is used > +* in order to read back global performance counters results written > +* by the kernel. 
For each domain, we store the handle of the perfdom > +* object, an array of 4 counters and the number of cycles. Like for > +* the Gallium's HUD, we keep a list of busy queries in a ring in order > +* to prevent stalls when reading queries. */ > + length = (1 + (NV50_HW_PM_RING_BUFFER_NUM_DOMAINS * 6) * > + NV50_HW_PM_RING_BUFFER_MAX_QUERIES) * 4; This calculation may become apparent to me later, but it certainly isn't now. What's the *6? You refer to an array of 4 counters... should that have been 6 counters? Or should this have been a 4? > + > + ret = nouveau_object_new(chan, 0xbeef0302, NOUVEAU_NOTIFIER_CLASS, > +&(struct nv04_notify){ .length = length }, > +sizeof(struct nv04_notify), &screen->query); > + if (ret) { > + NOUVEAU_ERR("Failed to allocate notifier object for PM: %d\n", ret); > + goto fail; > + } > + > ret = nouveau_object_new(chan, 0xbeef506e, 0x506e, > NULL, 0, &screen->sw); > if (ret) { > @@ -845,6 +866,14 @@ nv50_screen_create(struct nouveau_device *dev) > nouveau_heap_init(&screen->gp_code_heap, 0, 1 << NV50_CODE_BO_SIZE_LOG2); > nouveau_heap_init(&screen->fp_code_heap, 0, 1 << NV50_CODE_BO_SIZE_LOG2); > > + ret = nouveau_bo_wrap(screen->base.device, fifo->notify, > &screen->notify_bo); > + if (ret == 0) > + nouveau_bo_map(screen->notify_bo, 0, screen->base.client); ret = ... 
> + if (ret) { > + NOUVEAU_ERR("Failed to map notifier object for PM: %d\n", ret); > + goto fail; > + } > + > nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value); > > screen->TPs = util_bitcount(value & 0xffff); > diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h > b/src/gallium/drivers/nouveau/nv50/nv50_screen.h > index 69fdfdb..71a5247 100644 > --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h > +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h > @@ -59,6 +59,7 @@ struct nv50_screen { > struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */ > struct nouveau_bo *stack_bo; > struct nouveau_bo *tls_bo; > + struct nouveau_bo *notify_bo; > > unsigned TPs; > unsigned MPsInTP; > @@ -89,6 +90,7 @@ struct nv50_screen { > } fence; > > struct nouveau_object *sync; > + struct nouveau_object *query; > > struct nouveau_object *tesla; > struct nouveau_object *eng2d; > @@ -96,6 +98,10 @@ struct nv50_screen { > struct nouveau_object *sw; > }; > > +/* Parameters of the ring buffer used to read back global PM counters. */ > +#define NV50_HW_PM_RING_BUFFER_NUM_DOMAINS 8 > +#define NV50_HW_PM_RING_BUFFER_MAX_QUERIES 9 /* HUD_NUM_QUERIES + 1 */ > + > static INLINE struct nv50_screen * > nv50_screen(struct pipe_screen *screen) > { > -- > 2.4.4 > > ___ > Nouveau mailing list > Nouveau@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/nouveau ___ Nouveau mailing list Nouveau@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/nouveau
[Nouveau] [Bug 70354] [NVE6, NVE7] HUB_INIT timeout on graph init, blob fw doesn't help
https://bugs.freedesktop.org/show_bug.cgi?id=70354 --- Comment #55 from buhman --- If anyone wants to play with bskeggs' hack: http://cgit.freedesktop.org/~darktama/nouveau/commit/?h=hack-gk106m He's tested W541; I've tested W540 (K2100). You'll likely need nouveau.runpm=0 to avoid hanging. PRIME works. vdpau "works". 3D performance at pstate 0a is something like 50% of nvidia. -- You are receiving this mail because: You are the assignee for the bug. ___ Nouveau mailing list Nouveau@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/nouveau
[Nouveau] What are the restrictions around loading indirect constbuf values
Hello, We recently tracked down a bug on Tesla GPUs (i.e. G80-GT218) whereby it appears that instructions like 0028: b5000409 08000780 add rn f32 $r2 $r2 neg c0[$a1] 0040: b500060d 08004780 add rn f32 $r3 $r3 neg c0[$a1+0x4] or with nvdisasm: .headerflags@"EF_CUDA_SM12 EF_CUDA_PTX_SM(EF_CUDA_SM12)" /**/ FADD R2, R2, -c[0x0][A1+0x0]; /* 0x08000780b5000409 */ /*0008*/ FADD R3, R3, -c[0x0][A1+0x1]; /* 0x08004780b500060d */ don't appear to execute properly. However just MOV'ing the values into registers works fine. This was observed on a G92 chip. See bug https://bugs.freedesktop.org/show_bug.cgi?id=91056. I was hoping you could save me some time and let me know what instructions can load things like c0[$a1+4] (or maybe it's only in combination with the modifier?), and which Tesla-family GPU's have those restrictions. Thanks, -ilia ___ Nouveau mailing list Nouveau@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/nouveau
[Nouveau] [Bug 91056] The Bard's Tale (2005, native) has rendering issues
https://bugs.freedesktop.org/show_bug.cgi?id=91056 --- Comment #4 from Ilia Mirkin --- Created attachment 116707 --> https://bugs.freedesktop.org/attachment.cgi?id=116707&action=edit provisional fix OK, so this patch appears to fix it. The shaders at the end of that opt trace have 0028: b5000409 08000780 add rn f32 $r2 $r2 neg c0[$a1] 0040: b500060d 08004780 add rn f32 $r3 $r3 neg c0[$a1+0x4] etc in them. Which seems innocuous enough, but... something about it is bad. Perhaps the neg + indirect. Perhaps just indirect. Perhaps... who knows. Needs testing. -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ Nouveau mailing list Nouveau@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/nouveau