[Nouveau] [PATCH] nouveau: expose BO domain in the public API

2010-08-12 Thread Luca Barbieri
This can allow drivers to make better choices.

Since it is just a field appended to a struct, compatibility is preserved.
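
For illustration only, a minimal sketch of the kind of decision a driver
could base on the new field (the helper and the policy are made up; only
nouveau_bo::domain and the NOUVEAU_GEM_DOMAIN_* flags come from the
existing API plus this patch):

/* Hypothetical example: prefer a staging copy for CPU reads when the
 * last known placement of the BO is VRAM, where CPU access is slow.
 * domain == 0 means the kernel has not reported a placement yet. */
static int
example_prefer_staging_read(struct nouveau_bo *bo)
{
        return (bo->domain & NOUVEAU_GEM_DOMAIN_VRAM) != 0;
}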
---
 nouveau/nouveau_bo.c  |4 ++--
 nouveau/nouveau_bo.h  |3 +++
 nouveau/nouveau_private.h |1 -
 nouveau/nouveau_pushbuf.c |2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/nouveau/nouveau_bo.c b/nouveau/nouveau_bo.c
index 32b23b6..28cf5b3 100644
--- a/nouveau/nouveau_bo.c
+++ b/nouveau/nouveau_bo.c
@@ -47,7 +47,7 @@ static int
 nouveau_bo_info(struct nouveau_bo_priv *nvbo, struct drm_nouveau_gem_info *arg)
 {
 	nvbo->handle = nvbo->base.handle = arg->handle;
-	nvbo->domain = arg->domain;
+	nvbo->base.domain = arg->domain;
 	nvbo->size = arg->size;
 	nvbo->offset = arg->offset;
 	nvbo->map_handle = arg->map_handle;
@@ -534,7 +534,7 @@ nouveau_bo_emit_buffer(struct nouveau_channel *chan, struct nouveau_bo *bo)
 	pbbo->valid_domains = NOUVEAU_GEM_DOMAIN_VRAM | NOUVEAU_GEM_DOMAIN_GART;
 	pbbo->read_domains = 0;
 	pbbo->write_domains = 0;
-	pbbo->presumed.domain = nvbo->domain;
+	pbbo->presumed.domain = nvbo->base.domain;
 	pbbo->presumed.offset = nvbo->offset;
 	pbbo->presumed.valid = 1;
 	return pbbo;
diff --git a/nouveau/nouveau_bo.h b/nouveau/nouveau_bo.h
index 1e77ab0..932f189 100644
--- a/nouveau/nouveau_bo.h
+++ b/nouveau/nouveau_bo.h
@@ -48,6 +48,9 @@ struct nouveau_bo {
 
uint32_t tile_mode;
uint32_t tile_flags;
+
+   /* last known information from kernel */
+   uint32_t domain;
 };
 
 int
diff --git a/nouveau/nouveau_private.h b/nouveau/nouveau_private.h
index 53928d2..312fe70 100644
--- a/nouveau/nouveau_private.h
+++ b/nouveau/nouveau_private.h
@@ -122,7 +122,6 @@ struct nouveau_bo_priv {
 
/* Last known information from kernel on buffer status */
uint64_t offset;
-   uint32_t domain;
 };
 #define nouveau_bo(n) ((struct nouveau_bo_priv *)(n))
 
diff --git a/nouveau/nouveau_pushbuf.c b/nouveau/nouveau_pushbuf.c
index 53da8cf..27f0df2 100644
--- a/nouveau/nouveau_pushbuf.c
+++ b/nouveau/nouveau_pushbuf.c
@@ -211,7 +211,7 @@ nouveau_pushbuf_bo_unref(struct nouveau_pushbuf_priv *nvpb, int index)
 		return;
 
 	if (pbbo->presumed.valid == 0) {
-		nvbo->domain = pbbo->presumed.domain;
+		bo->domain = pbbo->presumed.domain;
 		nvbo->offset = pbbo->presumed.offset;
 	}
 
-- 
1.7.0.4



Re: [Nouveau] nvfx

2010-07-24 Thread Luca Barbieri
On Fri, Jul 23, 2010 at 7:01 PM, Patrice Mandin
mandin.patr...@orange.fr wrote:
 On Fri, 18 Jun 2010 18:43:27 +0200,
 Marek Olšák mar...@gmail.com wrote:

 On Fri, Jun 18, 2010 at 6:05 PM, Patrice Mandin 
 mandin.patr...@orange.frwrote:

  On Thu, 17 Jun 2010 03:35:19 +0200,
  Marek Olšák mar...@gmail.com wrote:
 
   On Fri, Jun 11, 2010 at 3:37 PM, Xavier Chantry 
  chantry.xav...@gmail.comwrote:
  
Hi Marek
   
Thanks a lot for your rebasing work.
Here is my report :
   
- all my games that broke with temporaries patch (they were either
completely black or lot of black screen flash every frame) behave
badly, but in different ways :
* etracer is very slow and often crash in ttm code [1] (I think this
is an old bug that just resurrected, no idea why)
* foobillard is very slow and still flash a bit
* strangely, neverball seems to work, I get similar results than with
old nvfx-next-6b branch with temporaries reverted. no black flash
while playing.
* glest segfault [2]
   
I also compared with piglit the old nvfx branch with the new merged one
  :
114/174 vs 113/174
That looks quite good with 3 new pass, but 4 new fail :
* fbo-copypix
Returncode was -6
* glean pbo
   
  ../../../../src/gallium/auxiliary/util/u_inlines.h:77:pipe_reference:
Assertion `pipe_is_referenced(reference)\\\' failed.
* texCombine4:  FAIL rgba8, db, z24, s8, win+pmap, id 33
* fp-long-alu
   
  
   Hi Xavier,
  
   Sorry for the late reply.
  
   The assertion in pipe_reference can be fixed quite easily I think. There
  is
   pipe_*_reference missing somewhere.
  
   Concerning fp-long-alu, there are new CAPs for shader limits which should
  be
   filled out but I don't know what values to put there. If the two get
  fixed,
   it will hopefully be just 2 new failures along with 3 that pass.
  
  
There is just fp-long-alu that is for sure a regression caused by new
master code (some gallium changes). I don't know about the 3 others.
   
It might be worth to re-test everything on your new branch with this
patch reverted :
nvfx: rewrite render temporaries code, also affecting 2D and resource
  code
   
  
   Here is the tree with the commit reverted:
  
    git://anongit.freedesktop.org/~mareko/mesa  (branch nvfx-next-6b-notemps)
  
   It is compile-tested so if it does not work, there is nothing I can do
  about
   it.
  
   -Marek
 
  I just tested the new tree nvfx-next-6b-notemps; I think we should go
  some commits further back, because Luca removed the check in fbo setup
  for a difference in bit depth between the colour and depth/stencil
  buffers, and nv30 hw does not support that (they must be equal), so
  doing that simply hangs the gpu.
 
  Commit 4787059263755fb92b2bb09ac71658d9b4cc9368 'nvfx: new 2D: add
  support for render temporaries' removed this check and fbo tests
  trigger the bug. Maybe Luca was planning to use temporaries to avoid
  this check, but unfortunately we do not know if he finished it or not.

 I do not want to guess here. I would like to hear something from Luca before
 doing anything else with his code.

 It has been 3 months since Luca's last sign of activity, and I'm afraid
 it will be a while before that changes :-(.

 Marek, if you are still interested in merging stuff from nvfx-next-6b,
 I would like to propose up to commit
 9a4c66b0d1963c4d90fccac41e7aa48105835857 included (as I said before,
 the following ones bring back nasty regression regarding fbo stuff,
 hanging the gpu hard).

 Luca did many fixes in his branch, so it's really bad that these are not
 merged back to master. Also, I don't feel like touching the nvfx
 backend in master till these fixes are merged.

I got busy with other things and then essentially lost interest;
furthermore, my nv40 is not even working right now.

As for the code, sure, feel free to make it work and merge it :)
The new temporary code did indeed screw things up, and I think it also
had other stuff mixed in that I didn't factor out.
However, the old temporary code is wrong because the temporary is
per-surface, while it should be attached to the resource, so that
multiple surfaces, and direct use of the resource itself, work
properly.

nv40 should support mismatching color/depth buffers as long as both are
linear and not swizzled.
I'm not sure whether that is also the case on nv30, or whether there the
restriction applies to swizzled and linear surfaces alike.

There are also two branches starting with "RFC" in Mesa git that add
correct nv30 ARB_texture_rectangle support and correct GLSL support;
they require minor Gallium changes, but IIRC I never got answers back
from the Gallium maintainers on those.

There should also be code in other branches to add control flow for
nv40 fragment programs (and perhaps other features I don't remember
right now).


Re: [Nouveau] [PATCH] Support writing out the pushbuffer in renouveau trace format (v2)

2010-04-13 Thread Luca Barbieri
Simply putting the dump in the renouveau directory where a renouveau
dump was taken previously seems to work for me (probably because we
use the same handle values as nVidia?).

But yes, the tools should be improved here and dumping the objclass of
the grobjs would be necessary for that.


[Nouveau] [PATCH] Support writing out the pushbuffer in renouveau trace format

2010-04-12 Thread Luca Barbieri
This patch causes libdrm, when NOUVEAU_DUMP=1 is set, to write the
pushbuffer to stdout instead of submitting it to the card.

renouveau-parse can then be used to parse it and obtain a readable
trace.

This is very useful for debugging and optimizing the Gallium driver.
---
 nouveau/nouveau_private.h |2 ++
 nouveau/nouveau_pushbuf.c |   29 +
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/nouveau/nouveau_private.h b/nouveau/nouveau_private.h
index 5a952f7..aa8a9c8 100644
--- a/nouveau/nouveau_private.h
+++ b/nouveau/nouveau_private.h
@@ -40,6 +40,8 @@
 #define CALPB_BUFFERS 4
 #define CALPB_BUFSZ   16384
 struct nouveau_pushbuf_priv {
+   char dump;
+   char no_submit;
uint32_t cal_suffix0;
uint32_t cal_suffix1;
struct nouveau_bo *buffer[CALPB_BUFFERS];
diff --git a/nouveau/nouveau_pushbuf.c b/nouveau/nouveau_pushbuf.c
index 28b8018..8afdfdc 100644
--- a/nouveau/nouveau_pushbuf.c
+++ b/nouveau/nouveau_pushbuf.c
@@ -124,6 +124,9 @@ nouveau_pushbuf_init(struct nouveau_channel *chan)
 	if (ret)
 		return ret;
 
+	nvpb->dump = !!getenv("NOUVEAU_DUMP");
+	nvpb->no_submit = !!getenv("NOUVEAU_NO_SUBMIT");
+
 	ret = nouveau_pushbuf_space(chan, 0);
 	if (ret)
 		return ret;
@@ -235,6 +238,22 @@ nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min)
 	if (!nvpb->nr_push)
 		return 0;
 
+	if(nvpb->dump) {
+		unsigned i;
+		for(i = 0; i < nvpb->nr_push; ++i) {
+			uint32_t *p, *pend;
+			struct nouveau_bo *bo = (struct nouveau_bo *)nvpb->buffers[nvpb->push[i].bo_index].user_priv;
+			if(!bo->map)
+				nouveau_bo_map(bo, NOUVEAU_BO_RD);
+			p = bo->map + nvpb->push[i].offset;
+			pend = (char*)p + nvpb->push[i].length;
+			printf("# pb #%i offset %i dwords %i\n", (int)i, (int)nvpb->push[i].offset, (int)(pend - p));
+			for(; p != pend; ++p)
+				printf("%08x\n", *p);
+			printf("# end\n");
+		}
+	}
+
 	req.channel = chan->id;
 	req.nr_push = nvpb->nr_push;
 	req.push = (uint64_t)(unsigned long)nvpb->push;
@@ -245,10 +264,12 @@ nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min)
 	req.suffix0 = nvpb->cal_suffix0;
 	req.suffix1 = nvpb->cal_suffix1;
 
-	do {
-		ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_GEM_PUSHBUF,
-					  &req, sizeof(req));
-	} while (ret == -EAGAIN);
+	if(!nvpb->no_submit) {
+		do {
+			ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_GEM_PUSHBUF,
+						  &req, sizeof(req));
+		} while (ret == -EAGAIN);
+	}
 	nvpb->cal_suffix0 = req.suffix0;
 	nvpb->cal_suffix1 = req.suffix1;
 	nvdev->base.vm_vram_size = req.vram_available;
-- 
1.7.0.1.147.g6d84b



[Nouveau] [PATCH] Support writing out the pushbuffer in renouveau trace format (v2)

2010-04-12 Thread Luca Barbieri
Changes in v2:
- Unmap buffers we mapped, avoid assertion
- Silence warnings

This patch causes libdrm, when NOUVEAU_DUMP=1 is set, to write the
pushbuffer to stdout instead of submitting it to the card.

renouveau-parse can then be used to parse it and obtain a readable
trace.

This is very useful for debugging and optimizing the Gallium driver.
---
 nouveau/nouveau_private.h |2 ++
 nouveau/nouveau_pushbuf.c |   35 +++
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/nouveau/nouveau_private.h b/nouveau/nouveau_private.h
index 5a952f7..aa8a9c8 100644
--- a/nouveau/nouveau_private.h
+++ b/nouveau/nouveau_private.h
@@ -40,6 +40,8 @@
 #define CALPB_BUFFERS 4
 #define CALPB_BUFSZ   16384
 struct nouveau_pushbuf_priv {
+   char dump;
+   char no_submit;
uint32_t cal_suffix0;
uint32_t cal_suffix1;
struct nouveau_bo *buffer[CALPB_BUFFERS];
diff --git a/nouveau/nouveau_pushbuf.c b/nouveau/nouveau_pushbuf.c
index 28b8018..ac8ca2f 100644
--- a/nouveau/nouveau_pushbuf.c
+++ b/nouveau/nouveau_pushbuf.c
@@ -124,6 +124,9 @@ nouveau_pushbuf_init(struct nouveau_channel *chan)
 	if (ret)
 		return ret;
 
+	nvpb->dump = !!getenv("NOUVEAU_DUMP");
+	nvpb->no_submit = !!getenv("NOUVEAU_NO_SUBMIT");
+
 	ret = nouveau_pushbuf_space(chan, 0);
 	if (ret)
 		return ret;
@@ -235,6 +238,28 @@ nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min)
 	if (!nvpb->nr_push)
 		return 0;
 
+	if(nvpb->dump) {
+		unsigned i;
+		for(i = 0; i < nvpb->nr_push; ++i) {
+			uint32_t *p, *pend;
+			struct nouveau_bo *bo = (struct nouveau_bo *)(unsigned long)nvpb->buffers[nvpb->push[i].bo_index].user_priv;
+			int mapped = 0;
+			if(!bo->map)
+			{
+				mapped = 1;
+				nouveau_bo_map(bo, NOUVEAU_BO_RD);
+			}
+			p = (uint32_t*)((char*)bo->map + nvpb->push[i].offset);
+			pend = (uint32_t*)((char*)p + nvpb->push[i].length);
+			printf("# pb #%i offset %i dwords %i\n", (int)i, (int)nvpb->push[i].offset, (int)(pend - p));
+			for(; p < pend; ++p)
+				printf("%08x\n", *p);
+			printf("# end\n");
+			if(mapped)
+				nouveau_bo_unmap(bo);
+		}
+	}
+
 	req.channel = chan->id;
 	req.nr_push = nvpb->nr_push;
 	req.push = (uint64_t)(unsigned long)nvpb->push;
@@ -245,10 +270,12 @@ nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min)
 	req.suffix0 = nvpb->cal_suffix0;
 	req.suffix1 = nvpb->cal_suffix1;
 
-	do {
-		ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_GEM_PUSHBUF,
-					  &req, sizeof(req));
-	} while (ret == -EAGAIN);
+	if(!nvpb->no_submit) {
+		do {
+			ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_GEM_PUSHBUF,
+						  &req, sizeof(req));
+		} while (ret == -EAGAIN);
+	}
 	nvpb->cal_suffix0 = req.suffix0;
 	nvpb->cal_suffix1 = req.suffix1;
 	nvdev->base.vm_vram_size = req.vram_available;
-- 
1.7.0.1.147.g6d84b



Re: [Nouveau] [Mesa3d-dev] [radeonhd] Re: Status of s3tc patent in respect to open-source drivers and workarounds

2010-03-29 Thread Luca Barbieri
Interestingly, the post-trial judge opinion at
http://wi.findacase.com/research/wfrmDocViewer.aspx/xq/fac.%5CFDCT%5CWWI%5C2008%5C20080801_734.WWI.htm/qx
contains the following text:


Plaintiff’s expert, Dr. Stevenson, testified that the ‘327 patent is
directed to “a special purpose hardware component designed and optimized
specifically for high speed graphics processing.”
The specification makes it plain that the invention does not relate to
software for graphics. As the inventors noted, such programs “are well
known in the art.”
[...]
Claim 17 does not say in so many words that the method it discloses is
a rasterization circuit operating on a floating point format, but that
is what it describes.
Reading the disputed claims as disclosing hardware is not reading a
preferred embodiment in the claims; it is simply reading the claims as
the person of ordinary skill would read a patent directed to special
purpose hardware.


This seems to indicate that it would be safe to implement floating
point textures/framebuffers in Mesa, as SGI, ATI, and the court all
seemed to agree that the patent applies specifically to hardware.


Re: [Nouveau] [Mesa3d-dev] Status of s3tc patent in respect to open-source drivers and workarounds

2010-03-28 Thread Luca Barbieri
If the application provides s3tc-encoded data through
glCompressedTexImage (usually loaded from a pre-compressed texture
stored on disk), Mesa will pass it unaltered to the graphics card (as
long as the driver/card supports DXT* format ids) and will not need to
use any encoding or decoding algorithms.

The problem is that if the application supplies uncompressed data,
Mesa would need to run an encoding algorithm to be able to use
compressed textures.

Conversely, if software rendering is necessary, and the application
provides compressed textures, Mesa will need to run a decoding
algorithm to be able to sample from the texture.

So the workaround (and what commercial games usually do) is to ship
pre-compressed textures along with the game, as well as uncompressed
textures in case the card/renderer does not support texture compression.
An end-user side solution is to download, compile and install
libtxc_dxtn.so, which Mesa will use if present to decode and encode
compressed textures.

Furthermore, a GPU can be used to do decoding using its native
sampling support, and some may also support encoding.
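
To make the two paths concrete, here is a minimal sketch in plain OpenGL
(nothing Mesa-specific; the texture data pointers are assumed to come from
the application):

#define GL_GLEXT_PROTOTYPES 1
#include <GL/gl.h>
#include <GL/glext.h>

/* Path 1: the application ships DXT5 blocks compressed offline; Mesa can
 * hand these straight to the card and no encoder is needed. */
void upload_precompressed(GLsizei w, GLsizei h, const void *dxt5_blocks)
{
        /* 16 bytes per 4x4 DXT5 block */
        GLsizei size = ((w + 3) / 4) * ((h + 3) / 4) * 16;
        glCompressedTexImage2D(GL_TEXTURE_2D, 0,
                               GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,
                               w, h, 0, size, dxt5_blocks);
}

/* Path 2: the application supplies raw RGBA but asks for a compressed
 * internal format; this is where Mesa needs an encoder such as
 * libtxc_dxtn. */
void upload_uncompressed(GLsizei w, GLsizei h, const void *rgba)
{
        glTexImage2D(GL_TEXTURE_2D, 0, GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,
                     w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, rgba);
}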


[Nouveau] [PATCH] nv40: remove leftover nv40_transfer.c from unification into nvfx

2010-03-15 Thread Luca Barbieri
---
 src/gallium/drivers/nv40/nv40_transfer.c |  181 --
 1 files changed, 0 insertions(+), 181 deletions(-)
 delete mode 100644 src/gallium/drivers/nv40/nv40_transfer.c

diff --git a/src/gallium/drivers/nv40/nv40_transfer.c b/src/gallium/drivers/nv40/nv40_transfer.c
deleted file mode 100644
index 3d8c8e8..000
--- a/src/gallium/drivers/nv40/nv40_transfer.c
+++ /dev/null
@@ -1,181 +0,0 @@
-#include "pipe/p_state.h"
-#include "pipe/p_defines.h"
-#include "util/u_inlines.h"
-#include "util/u_format.h"
-#include "util/u_memory.h"
-#include "util/u_math.h"
-#include "nouveau/nouveau_winsys.h"
-#include "nv40_context.h"
-#include "nvfx_screen.h"
-#include "nvfx_state.h"
-
-struct nv40_transfer {
-	struct pipe_transfer base;
-	struct pipe_surface *surface;
-	boolean direct;
-};
-
-static void
-nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
-			     struct pipe_texture *template)
-{
-	memset(template, 0, sizeof(struct pipe_texture));
-	template->target = pt->target;
-	template->format = pt->format;
-	template->width0 = width;
-	template->height0 = height;
-	template->depth0 = 1;
-	template->last_level = 0;
-	template->nr_samples = pt->nr_samples;
-
-	template->tex_usage = PIPE_TEXTURE_USAGE_DYNAMIC |
-	                      NOUVEAU_TEXTURE_USAGE_LINEAR;
-}
-
-static struct pipe_transfer *
-nv40_transfer_new(struct pipe_context *pcontext, struct pipe_texture *pt,
-		  unsigned face, unsigned level, unsigned zslice,
-		  enum pipe_transfer_usage usage,
-		  unsigned x, unsigned y, unsigned w, unsigned h)
-{
-	struct pipe_screen *pscreen = pcontext->screen;
-	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
-	struct nv40_transfer *tx;
-	struct pipe_texture tx_tex_template, *tx_tex;
-
-	tx = CALLOC_STRUCT(nv40_transfer);
-	if (!tx)
-		return NULL;
-
-	pipe_texture_reference(&tx->base.texture, pt);
-	tx->base.x = x;
-	tx->base.y = y;
-	tx->base.width = w;
-	tx->base.height = h;
-	tx->base.stride = mt->level[level].pitch;
-	tx->base.usage = usage;
-	tx->base.face = face;
-	tx->base.level = level;
-	tx->base.zslice = zslice;
-
-	/* Direct access to texture */
-	if ((pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC ||
-	     debug_get_bool_option("NOUVEAU_NO_TRANSFER", TRUE/*XXX:FALSE*/)) &&
-	    pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)
-	{
-		tx->direct = true;
-		tx->surface = pscreen->get_tex_surface(pscreen, pt,
-						       face, level, zslice,
-						       pipe_transfer_buffer_flags(&tx->base));
-		return &tx->base;
-	}
-
-	tx->direct = false;
-
-	nv40_compatible_transfer_tex(pt, w, h, &tx_tex_template);
-
-	tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
-	if (!tx_tex)
-	{
-		FREE(tx);
-		return NULL;
-	}
-
-	tx->base.stride = ((struct nvfx_miptree *)tx_tex)->level[0].pitch;
-
-	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
-					       0, 0, 0,
-					       pipe_transfer_buffer_flags(&tx->base));
-
-	pipe_texture_reference(&tx_tex, NULL);
-
-	if (!tx->surface)
-	{
-		pipe_surface_reference(&tx->surface, NULL);
-		FREE(tx);
-		return NULL;
-	}
-
-	if (usage & PIPE_TRANSFER_READ) {
-		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
-		struct pipe_surface *src;
-
-		src = pscreen->get_tex_surface(pscreen, pt,
-					       face, level, zslice,
-					       PIPE_BUFFER_USAGE_GPU_READ);
-
-		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
-		/* TODO: Check if SIFM can un-swizzle */
-		nvscreen->eng2d->copy(nvscreen->eng2d,
-				      tx->surface, 0, 0,
-				      src, x, y,
-				      w, h);
-
-		pipe_surface_reference(&src, NULL);
-	}
-
-	return &tx->base;
-}
-
-static void
-nv40_transfer_del(struct pipe_context *pcontext, struct pipe_transfer *ptx)
-{
-	struct nv40_transfer *tx = (struct nv40_transfer *)ptx;
-
-	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
-		struct pipe_screen *pscreen = pcontext->screen;
-		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
-		struct pipe_surface *dst;
-
-		dst = pscreen->get_tex_surface(pscreen, ptx->texture,
-					       ptx->face, ptx->level, ptx->zslice,
-

Re: [Nouveau] Interrupt setting

2010-03-13 Thread Luca Barbieri
 So a GPU itself updates the sequence # of each fence in a specific register, 
 and we can let the Nouveau driver wait for a target
 value to be written.
 Do you know when the value is actually written?

When the FIFO command instructing the GPU to do the write is executed.

 If it is written when a DMA transfer is done, we don't know exactly when the 
 corresponding GPU operation is finished.
 Do you think it is possible to wait for a completion of a GPU operation?

The current assumption is that FIFO commands are executed
synchronously, so when the FIFO executes the command to update the
fence value, all previous FIFO commands should have been completed.

The current driver just does a CPU busy loop, continuously reading the
fence register until the value read is large enough.

There should be some work by Francisco Jerez and perhaps Ben Skeggs on
using an interrupt-based mechanism instead (the one you described,
most likely), but I'm not sure what the status of that is.
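
For illustration, the shape of that busy loop (names made up; the real
code lives in the kernel's nouveau_fence.c):

#include <stdint.h>

/* The GPU bumps a sequence number when it executes the fence command;
 * the CPU spins until the value read is large enough. The signed
 * difference handles 32-bit sequence wrap-around. */
static void example_fence_wait(volatile const uint32_t *fence_reg,
                               uint32_t wanted)
{
        while ((int32_t)(*fence_reg - wanted) < 0)
                ; /* spin */
}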


Re: [Nouveau] Interrupt setting

2010-03-13 Thread Luca Barbieri
 Since you create one fence object for each pushbuf, I thought that we can
 synchronize only with the last command.
 Not sure if my assumption is correct...
All the commands in the pushbuffer are executed sequentially and the
fence setting command is written at the end of the pushbuffer, so when
the fence register is updated, all commands will have been already
executed.

However, this does indeed mean that to wait for the completion of any
command, we also need to wait for the completion of all the other
commands in the pushbuffer sent in the pushbuf ioctl call.

This could be improved, but I doubt it is worth the significant
additional complexity and CPU performance costs. Userspace can always
send commands in smaller chunks if it wants to.


[Nouveau] [PATCH] nv30/nv40 Gallium drivers unification

2010-03-13 Thread Luca Barbieri
Currently the nv30 and nv40 Gallium drivers are very similar, and
contain about 5000 lines of essentially duplicate code.

I prepared a patchset (which can be found at
http://repo.or.cz/w/mesa/mesa-lb.git/shortlog/refs/heads/unification+fixes)
which gradually unifies the drivers, one file per commit.

A new nvfx directory is created, and unified files are put there one by one.
After all patches are applied, the nv30 and nv40 directories are
removed and the only the new nvfx directory remains.

The first patches unify the engine naming (s/curie/eng3d/g;
s/rankine/eng3d/g), and switch nv40 to use the NV34TCL_ constants.
Initial versions of this work changed renouveau.xml to create a new
NVFXTCL object, but the current version doesn't need any
renouveau.xml modification at all.

The unification+fixes branch referenced above is the one that should
be tested.
The unification branch contains just the unification, with no
behavior changes, while unification+fixes also fixes swtnl and quad
rendering, allowing to better test the unification. Some cleanups on
top of the unfication are also included.

That same repository also contains other branches with significant
improvements on top of the unification, but I'm still not proposing
them for inclusion as they need more testing and some fixes.

While there are some branches in the Mesa repository that would
conflict with this, such branches seem to be popping up continuously
(and this is good!), so waiting until they are merged probably won't
really work.

The conflicts are minimal anyway and the driver fixes can be very
easily reconstructed over the unified codebase.

How about merging this?
Any objections? Any comments?


Re: [Nouveau] Gallium driver and compatibility issues

2010-03-12 Thread Luca Barbieri
It is not surprising that some (or most) 3D applications don't
actually work correctly with nouveau on nv3x right now.

The driver will probably improve in the future.


[Nouveau] Lost R300/NV40 development work you may have

2010-03-11 Thread Luca Barbieri
On the Radeon IRC channel
(http://www.radeonhd.org/?page=archive_display&c=radeon&m=1&y=2009&d=2009-1-26),
you expressed interest in Corbin Simpson's R300 LLVM work, at
git://anongit.freedesktop.org/~csimpson/llvm, which was based on
Stephane Marchesin's earlier NV40 work.

Both these projects have since been lost due to a hard drive crash.
If you happen to still have that git tree on your hard drive, it
would be greatly appreciated if you could provide it in whatever form
you prefer (for instance, a reply-to-all to this mail with a .tar.gz
attached).

Thanks a lot for your attention.

Best regards,
Luca Barbieri


Re: [Nouveau] making 0.0.16 into 1.0.0

2010-03-05 Thread Luca Barbieri
Another possible reason for breaking ABI that hasn't yet been
mentioned is the fact that right now any DRM client can trivially lock
up the GPU and/or corrupt GPU/GART memory belonging to other clients.
This happens often with GL driver bugs and is quite annoying for
developers and testers of them, so it may be desirable to alter the
driver to provide real protection and make this impossible.
Depending on the way it is done, it may require significant changes
and an ABI break.


Re: [Nouveau] [PATCH] renouveau/nv10: remove duplicate vertex buffer registers

2010-03-01 Thread Luca Barbieri
On Mon, Mar 1, 2010 at 2:34 AM, Francisco Jerez curroje...@riseup.net wrote:
 Luca Barbieri l...@luca-barbieri.com writes:

 NV10TCL defines the vertex buffer registers both as arrays and as
 individual named registers.

 This causes duplicate register definitions and the individual registers
 are not used either by the DDX or by the Mesa driver.

 Francisco Jerez said to remove them all.

 I forgot to ask: why do they bother you?

I am experimenting with rewriting renouveau-gen.c in Python, and
I store registers in a dictionary keyed by the register offset.

Having multiple registers at the same offset obviously causes issues
with that, and this is the only instance of the problem, so it seems
better to remove it than to add complexity to the tools to handle it.


[Nouveau] [PATCH 3/5] renouveau/nv40: set NV40TCL_LINE_STIPPLE_PATTERN to hexa like nv30

2010-02-26 Thread Luca Barbieri
---
 renouveau.xml |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/renouveau.xml b/renouveau.xml
index 2b6e8d7..d305a8e 100644
--- a/renouveau.xml
+++ b/renouveau.xml
@@ -4271,7 +4271,7 @@
 <reg32 offset="0x1db4" name="LINE_STIPPLE_ENABLE" type="boolean"/>
 <reg32 offset="0x1db8" name="LINE_STIPPLE_PATTERN" type="bitfield">
   <bitfield name="FACTOR" high="15" low="0" type="int"/>
-  <bitfield name="PATTERN" high="31" low="16" type="int"/>
+  <bitfield name="PATTERN" high="31" low="16" type="hexa"/>
 </reg32>
 <reg32 offset="0x1e40" name="VTX_ATTR_1F" size="16" type="float"/>
 <reg32 offset="0x1e94"/>
-- 
1.6.6.1.476.g01ddb



[Nouveau] [PATCH 4/5] renouveau/nv30: remove clip planes #6 and #7

2010-02-26 Thread Luca Barbieri
These are defined for nv30 and not nv40, and they probably don't
exist in the hardware.

Both DirectX and OpenGL nVidia drivers support only 6 clip planes on
pre-nv50 hardware.

Neither the DDX nor the Gallium driver support user clip planes at all
on nv30.

This makes the definition the same as nv40, so they can be unified.
---
 renouveau.xml |2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/renouveau.xml b/renouveau.xml
index d305a8e..f73ef04 100644
--- a/renouveau.xml
+++ b/renouveau.xml
@@ -3681,8 +3681,6 @@
   <bitfield name="PLANE3" high="13" low="13" type="boolean"/>
   <bitfield name="PLANE4" high="17" low="17" type="boolean"/>
   <bitfield name="PLANE5" high="21" low="21" type="boolean"/>
-  <bitfield name="PLANE6" high="25" low="25" type="boolean"/>
-  <bitfield name="PLANE7" high="29" low="29" type="boolean"/>
 </reg32>
 <reg32 offset="0x147c" name="POLYGON_STIPPLE_ENABLE" type="boolean"/>
 <reg32 offset="0x1480" name="POLYGON_STIPPLE_PATTERN" size="32" type="hexa"/>
-- 
1.6.6.1.476.g01ddb



[Nouveau] [PATCH] drm/nouveau: fix missing spin_unlock in failure path

2010-02-20 Thread Luca Barbieri
Found by sparse.

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_gem.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c 
b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 03d8935..d7ace31 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -557,11 +557,11 @@ nouveau_gem_pushbuf_reloc_apply(struct drm_device *dev,
 
 		spin_lock(&nvbo->bo.lock);
 		ret = ttm_bo_wait(&nvbo->bo, false, false, false);
+		spin_unlock(&nvbo->bo.lock);
 		if (ret) {
 			NV_ERROR(dev, "reloc wait_idle failed: %d\n", ret);
 			break;
 		}
-		spin_unlock(&nvbo->bo.lock);
 
 		nouveau_bo_wr32(nvbo, r->reloc_bo_offset >> 2, data);
 	}
-- 
1.6.6.1.476.g01ddb



[Nouveau] [PATCH 1/3] Introduce nouveau_bo_wait for waiting on a BO with a GPU channel (v2)

2010-02-09 Thread Luca Barbieri
Changes in v2:
- Addressed review comments

nouveau_bo_wait will make the GPU channel wait for the fence if possible,
otherwise falling back to waiting with the CPU using ttm_bo_wait.

The nouveau_fence_sync function currently returns -ENOSYS, and is
the focus of the next patch.

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_bo.c|   68 ++-
 drivers/gpu/drm/nouveau/nouveau_drv.h   |2 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |6 +++
 drivers/gpu/drm/nouveau/nouveau_gem.c   |   20 +
 4 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c 
b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 028719f..2da6acf 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -35,6 +35,70 @@
 
 #include <linux/log2.h>
 
+int
+nouveau_bo_wait(struct ttm_buffer_object *bo, struct nouveau_channel *chan)
+{
+	int ret = 0;
+
+	if (likely(!bo->sync_obj))
+		return 0;
+
+	spin_lock(&bo->lock);
+	if (chan) {
+		struct nouveau_fence *new_fence;
+		struct nouveau_channel *waited_chan;
+
+		do {
+			struct nouveau_fence *prev_fence;
+			prev_fence = bo->sync_obj;
+
+			waited_chan = nouveau_fence_channel(prev_fence);
+			if (likely(!waited_chan || waited_chan == chan))
+				break;
+
+			nouveau_fence_ref(prev_fence);
+
+			ret = ttm_bo_wait(bo, false, false, true);
+			if (!ret)
+				goto unref_break;
+
+			if (unlikely(prev_fence != bo->sync_obj))
+				goto unref_continue;
+
+			spin_unlock(&bo->lock);
+			new_fence = nouveau_fence_sync(prev_fence, chan);
+			spin_lock(&bo->lock);
+
+			if (likely(!IS_ERR(new_fence))) {
+				if (likely(bo->sync_obj)) {
+					if (unlikely(bo->sync_obj != prev_fence)) {
+						nouveau_fence_unref((void **)&new_fence);
+						continue;
+					}
+					nouveau_fence_unref((void **)&bo->sync_obj);
+				}
+				bo->sync_obj = new_fence;
+				ret = 0;
+unref_break:
+				nouveau_fence_unref((void **)&prev_fence);
+				break;
+			}
+
+			if (unlikely(prev_fence != bo->sync_obj)) {
+unref_continue:
+				nouveau_fence_unref((void **)&prev_fence);
+				continue;
+			}
+
+			nouveau_fence_unref((void **)&prev_fence);
+			ret = ttm_bo_wait(bo, false, false, false);
+		} while (0);
+	} else
+		ret = ttm_bo_wait(bo, false, false, false);
+	spin_unlock(&bo->lock);
+	return ret;
+}
+
 static void
 nouveau_bo_del_ttm(struct ttm_buffer_object *bo)
 {
@@ -469,8 +533,10 @@ nouveau_bo_move_accel_cleanup(struct nouveau_channel *chan,
 
 	ret = ttm_bo_move_accel_cleanup(&nvbo->bo, fence, NULL,
 					evict, no_wait, new_mem);
+
+	/* TODO: this should be redundant, since we do the check in validate */
 	if (nvbo->channel && nvbo->channel != chan)
-		ret = nouveau_fence_wait(fence, NULL, false, false);
+		ret = nouveau_bo_wait(&nvbo->bo, nvbo->channel);
 	nouveau_fence_unref((void *)&fence);
 	return ret;
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 64987a9..bb9024c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -,6 +,7 @@ extern int nv04_crtc_create(struct drm_device *, int index);
 
 /* nouveau_bo.c */
 extern struct ttm_bo_driver nouveau_bo_driver;
+extern int nouveau_bo_wait(struct ttm_buffer_object *bo, struct nouveau_channel *chan);
 extern int nouveau_bo_new(struct drm_device *, struct nouveau_channel *,
 			  int size, int align, uint32_t flags,
 			  uint32_t tile_mode, uint32_t tile_flags,
@@ -1136,6 +1137,7 @@ extern int nouveau_fence_emit(struct nouveau_fence *);
 struct nouveau_channel *nouveau_fence_channel(struct nouveau_fence *);
 extern bool nouveau_fence_signalled(void *obj, void *arg);
 extern int nouveau_fence_wait(void *obj, void *arg, bool lazy, bool intr);
+extern struct nouveau_fence *nouveau_fence_sync(struct nouveau_fence *, struct nouveau_channel *);
 extern int nouveau_fence_flush(void *obj, void *arg

[Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator (v2)

2010-02-09 Thread Luca Barbieri
Changes in v2:
- Addressed review comments
- Fixed lockless algorithm (must not dec if negative in addition to if 0)
- Made spinlock irqsave (fences are completed in IRQs)

This patch adds code to allocate semaphores in a dynamic way using
a lockless algorithm.

1. Semaphore BOs

Semaphore BOs are BOs containing semaphores. Each is 4KB large and
contains 1024 4-byte semaphores. They are pinned and mapped.

Semaphore BOs are allocated on-demand and freed at device takedown.
Those that are not fully allocated are kept on a free list.

Each is assigned a handle. DMA objects and references are created
on demand for each channel that needs to use a semaphore BO.
Those objects and references are automatically destroyed at channel
destruction time.

Typically only a single semaphore BO will be used.

2. Semaphore allocation

Each semaphore BO contains a bitmask of free semaphores within the BO.
Allocation is done in a lockless fashion using a count of free
semaphores and the bitmask.

Semaphores are released once the fence on the waiting side passed.
This is done by adding fields to nouveau_fence.

Semaphore values are zeroed when the semaphore BO is allocated, and
are afterwards only modified by the GPU.

This is done by storing a bitmask that allows alternating
between the values 0 and 1 for a given semaphore.
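
As a rough sketch of the fast path described above, using the kernel's
atomic and bitmap helpers (names and corner cases are simplified and do
not exactly match the patch; the real logic is in the nouveau_fence.c
hunk below):

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/types.h>

#define EXAMPLE_SEM_PER_BO 1024 /* 4KB BO / 4-byte semaphores */

struct example_sem_bo {
        atomic_t num_free; /* >= 0: num_free + 1 slots still free */
        DECLARE_BITMAP(free_slots, EXAMPLE_SEM_PER_BO);
};

static int example_sem_alloc(struct example_sem_bo *sem_bo)
{
        int old, num;

        /* reserve a slot: decrement the count only while it is non-negative */
        do {
                old = atomic_read(&sem_bo->num_free);
                if (old < 0)
                        return -1; /* all taken, caller tries another sem_bo */
        } while (atomic_cmpxchg(&sem_bo->num_free, old, old - 1) != old);

        /* the reservation guarantees a set bit stays available; claim one */
        do {
                num = find_first_bit(sem_bo->free_slots, EXAMPLE_SEM_PER_BO);
        } while (!test_and_clear_bit(num, sem_bo->free_slots));

        return num; /* semaphore index within the BO */
}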

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_drv.h   |9 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |  265 +++
 drivers/gpu/drm/nouveau/nouveau_state.c |4 +
 3 files changed, 278 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index bb9024c..93e5427 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -621,6 +621,13 @@ struct drm_nouveau_private {
struct {
struct dentry *channel_root;
} debugfs;
+
+   struct {
+   spinlock_t free_list_lock;
+   struct nouveau_sem_bo *free_list;
+   uint32_t handles;
+   uint32_t max_handles;
+   } sem;
 };
 
 static inline struct drm_nouveau_private *
@@ -1142,6 +1149,8 @@ extern int nouveau_fence_flush(void *obj, void *arg);
 extern void nouveau_fence_unref(void **obj);
 extern void *nouveau_fence_ref(void *obj);
 extern void nouveau_fence_handler(struct drm_device *dev, int channel);
+extern void nouveau_fence_device_init(struct drm_device *dev);
+extern void nouveau_fence_device_takedown(struct drm_device *dev);
 
 /* nouveau_gem.c */
 extern int nouveau_gem_new(struct drm_device *, struct nouveau_channel *,
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 9b1c2c3..7157148 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -32,6 +32,13 @@
 
 #define USE_REFCNT (dev_priv->card_type >= NV_10)
 
+#define NOUVEAU_SEM_BO_SIZE PAGE_SIZE
+
+/* reading fences can be very expensive
+ * use a threshold that would only use up half a single sem_bo
+ */
+#define NOUVEAU_SEM_MIN_THRESHOLD (NOUVEAU_SEM_BO_SIZE / (NOUVEAU_MAX_CHANNEL_NR * 2))
+
 struct nouveau_fence {
 	struct nouveau_channel *channel;
 	struct kref refcount;
@@ -47,6 +54,240 @@
 	return (struct nouveau_fence *)sync_obj;
 }
 
+struct nouveau_sem_bo {
+	struct nouveau_sem_bo *next;
+	struct nouveau_bo *bo;
+	uint32_t handle;
+
+	/* >= 0: num_free + 1 slots are free, sem_bo is or is about to be on free_list
+	   -1: all allocated, sem_bo is NOT on free_list
+	*/
+	atomic_t num_free;
+
+	DECLARE_BITMAP(free_slots, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+	DECLARE_BITMAP(values, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+	DECLARE_BITMAP(channels, NOUVEAU_MAX_CHANNEL_NR);
+};
+
+struct nouveau_sem {
+	struct nouveau_sem_bo *sem_bo;
+	unsigned num;
+	uint32_t value;
+};
+
+static struct nouveau_sem_bo*
+nouveau_sem_bo_alloc(struct drm_device *dev)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	struct nouveau_sem_bo *sem_bo;
+	struct nouveau_bo *bo;
+	int flags = TTM_PL_FLAG_VRAM;
+	int ret;
+	bool is_iomem;
+	void *mem;
+	unsigned handle;
+
+	do {
+		handle = dev_priv->sem.handles;
+		if (handle >= dev_priv->sem.max_handles)
+			return NULL;
+	} while (cmpxchg(&dev_priv->sem.handles, handle, handle + 1) != handle);
+
+	sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
+	if (!sem_bo)
+		return NULL;
+
+	sem_bo->handle = NvSem + handle;
+
+	ret = nouveau_bo_new(dev, NULL, NOUVEAU_SEM_BO_SIZE, 0, flags,
+			0, 0x, true, true, &bo);
+	if (ret)
+		goto out_free;
+
+	sem_bo->bo = bo;
+
+	ret = nouveau_bo_pin

[Nouveau] [PATCH 3/3] Use semaphores for fully on-GPU interchannel synchronization (v2)

2010-02-09 Thread Luca Barbieri
Changes in v2:
- Addressed review comments

This patch implements the nouveau_fence_sync interface introduced
in the first patch of this series, using the dynamically allocated
semaphores introduced in the second patch.

This is tested on NV40, but should work on NV17-NV50 (previous cards
will just fall back to CPU waiting).

Unlike a previously posted patch, this patch does not make any use of
software methods and is designed to do all work on the GPU, and be
as fast as possible.

To perform inter-channel synchronization, commands are emitted on
both channels involved.

First, a semaphore is allocated, and a valid handle for it is inserted
in the channel if necessary.

DMA_SEMAPHORE is set only if different from the last used one. This
is usually not the case, and thus SEMAPHORE interrupts only happen
once per channel usually.

After that, SEMAPHORE_OFFSET is set if changed and then either ACQUIRE
or RELEASE is used.

On the waiting channel, a fence is also emitted. Once that fence
expires, the semaphore is released and can be reused for any purpose.

This results in synchronization taking place fully on the GPU, with
no CPU waiting necessary.

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_drv.h   |7 ++
 drivers/gpu/drm/nouveau/nouveau_fence.c |  136 +--
 2 files changed, 136 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 93e5427..d3aa20e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -195,6 +195,8 @@ struct nouveau_channel {
uint32_t sequence;
uint32_t sequence_ack;
uint32_t last_sequence_irq;
+   atomic_t sem_count;
+   unsigned sem_threshold;
} fence;
 
/* DMA push buffer */
@@ -255,6 +257,11 @@ struct nouveau_channel {
char name[32];
struct drm_info_list info;
} debugfs;
+
+   struct {
+   unsigned handle;
+   unsigned num;
+   } sem;
 };
 
 struct nouveau_instmem_engine {
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 7157148..b4b016f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -46,6 +46,9 @@ struct nouveau_fence {
 
 	uint32_t sequence;
 	bool signalled;
+
+	struct nouveau_sem_bo *sem_bo;
+	int sem_num;
 };
 
 static inline struct nouveau_fence *
@@ -297,10 +300,123 @@ nouveau_fence_del(struct kref *ref)
 	kfree(fence);
 }
 
+static inline void
+nouveau_sem_emit(struct nouveau_channel *chan, struct nouveau_sem *sem, unsigned op)
+{
+	uint32_t handle = sem->sem_bo->handle;
+	if (chan->sem.handle != handle) {
+		BEGIN_RING(chan, NvSubSw, NV_SW_DMA_SEMAPHORE, 1);
+		OUT_RING(chan, handle);
+		chan->sem.handle = handle;
+	}
+	if (chan->sem.num != sem->num) {
+		BEGIN_RING(chan, NvSubSw, NV_SW_SEMAPHORE_OFFSET, 1);
+		OUT_RING(chan, sem->num << 2);
+		chan->sem.num = sem->num;
+	}
+	BEGIN_RING(chan, NvSubSw, op, 1);
+	OUT_RING(chan, sem->value);
+}
+
+/* Currently this ignores waited_fence->sequence and syncs the last fence on waited_fence->channel.
+ * If a better GPU synchronization mechanism is discovered, then the actual fence may be used.
+ * Note that sem_fence is a fence on the *waiting* channel, used to free the semaphore.
+ */
 struct nouveau_fence*
 nouveau_fence_sync(struct nouveau_fence *waited_fence, struct nouveau_channel *chan)
 {
-	return ERR_PTR(-ENOSYS);
+	struct nouveau_channel *waited_chan;
+	struct drm_device *dev;
+	struct drm_nouveau_private *dev_priv;
+	struct nouveau_sem sem;
+	uint32_t handle;
+	int ret;
+	struct nouveau_fence *sem_fence;
+	unsigned long flags;
+
+	dev = chan->dev;
+	dev_priv = chan->dev->dev_private;
+
+	if (dev_priv->chipset < 0x17)
+		return ERR_PTR(-ENOSYS);
+
+	waited_chan = waited_fence->channel;
+
+	ret = RING_SPACE(chan, 6 + 2);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = RING_SPACE(waited_chan, 6);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* try to reclaim semaphores when we hit the threshold
+	   this helps keeping a low number of active semaphores
+
+	   Note that in the DRI2 case this is never triggered
+	   since we wait for fences on both channels.
+
+	   However, if buffers were all different, this could be
+	   necessary.
+	*/
+	if (atomic_read(&chan->fence.sem_count) >= chan->fence.sem_threshold) {
+		spin_lock_irqsave(&chan->fence.lock, flags);
+		if (atomic_read(&chan->fence.sem_count) >= chan->fence.sem_threshold

Re: [Nouveau] [PATCH 1/2] libdrm/nouveau: new optimized libdrm pushbuffer ABI

2010-02-08 Thread Luca Barbieri
 IMO, the changes are good.  However, DRM_NOUVEAU_HEADER_PATCHLEVEL is
 used to indicate the version of the kernel interface that's supported,
 and not the libdrm API version.

OK.

Perhaps it would be useful to add a libdrm API version number as well?


[Nouveau] [PATCH] drm/nouveau: enlarge GART aperture (v2)

2010-02-08 Thread Luca Barbieri
Changes in v2:
- Compute size based on ramin_rsvd_size

This patch enlarges the PCI GART aperture to 512 MB, or the space
covered by a DMA object filling half RAMIN.
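
As a worked example of the sizing done in the hunk below (the 1 MiB of
reserved RAMIN is an assumed value, not taken from this mail):

#include <stdio.h>

int main(void)
{
        unsigned ramin_rsvd_vram = 1u << 20;       /* assumed: 1 MiB of RAMIN */
        unsigned limit = 512 * 1024;               /* cap the ctxdma at 512 KiB */
        unsigned aper_size = ramin_rsvd_vram >> 1; /* half of RAMIN for the ctxdma */

        if (aper_size > limit)
                aper_size = limit;
        aper_size <<= 10; /* each 4-byte entry maps a 4 KiB page */

        printf("aperture = %u MiB\n", aper_size >> 20); /* prints 512 */
        return 0;
}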

The current 64MB aperture is too small and should be enlarged.
The optimal amound may be card/system-dependent, so a more sophisticated
approach may be preferable.

Could anyone with an nv04 test whether this doesn't break there?

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_sgdma.c |   14 --
 1 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
index 4c7f1e4..5a52006 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
@@ -227,11 +227,21 @@ nouveau_sgdma_init(struct drm_device *dev)
 	int i, ret;
 
 	if (dev_priv->card_type < NV_50) {
-		aper_size = (64 * 1024 * 1024);
+		/* TODO: can we safely raise this to 1GB, 2GB or 4GB? */
+		unsigned limit = 512 * 1024;
+
+		/* use up half ramin for the GART ctxdma object */
+		aper_size = dev_priv->ramin_rsvd_vram >> 1;
+		if(aper_size > limit)
+			aper_size = limit;
+		aper_size <<= 10; /* each 4KB page needs a 4 byte entry */
+
 		obj_size  = (aper_size >> NV_CTXDMA_PAGE_SHIFT) * 4;
 		obj_size += 8; /* ctxdma header */
 	} else {
-		/* 1 entire VM page table */
+		/* 1 entire VM page table
+		 * TODO: can we raise this so that it can potentially cover all system memory?
+		 */
 		aper_size = (512 * 1024 * 1024);
 		obj_size  = (aper_size >> NV_CTXDMA_PAGE_SHIFT) * 8;
 	}
-- 
1.6.6.1.476.g01ddb



[Nouveau] [PATCH 1/3] Introduce nouveau_bo_wait for waiting on a BO with a GPU channel

2010-02-01 Thread Luca Barbieri
nouveau_bo_wait will make the GPU channel wait for the fence if possible,
otherwise falling back to waiting with the CPU using ttm_bo_wait.

The nouveau_fence_sync function currently returns -ENOSYS, and is
the focus of the next patch.

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_bo.c|   68 ++-
 drivers/gpu/drm/nouveau/nouveau_drv.h   |2 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |6 +++
 drivers/gpu/drm/nouveau/nouveau_gem.c   |   20 +
 4 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c 
b/drivers/gpu/drm/nouveau/nouveau_bo.c
index db0ed4c..8afc17e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -35,6 +35,70 @@
 
 #include linux/log2.h
 
+int
+nouveau_bo_wait(struct ttm_buffer_object *bo, struct nouveau_channel *chan)
+{
+   int ret = 0;
+
+   if (likely(!bo-sync_obj))
+   return 0;
+
+   spin_lock(bo-lock);
+   if (chan) {
+   struct nouveau_fence *new_fence;
+   struct nouveau_channel *waited_chan;
+
+   do {
+   struct nouveau_fence *prev_fence;
+   prev_fence = bo-sync_obj;
+
+   waited_chan = nouveau_fence_channel(prev_fence);
+   if (likely(!waited_chan || waited_chan == chan))
+   break;
+
+   nouveau_fence_ref(prev_fence);
+
+   ret = ttm_bo_wait(bo, false, false, true);
+   if (!ret)
+   goto unref_break;
+
+   if (unlikely(prev_fence != bo-sync_obj))
+   goto unref_continue;
+
+   spin_unlock(bo-lock);
+   new_fence = nouveau_fence_sync(prev_fence, chan);
+   spin_lock(bo-lock);
+
+   if (likely(!IS_ERR(new_fence))) {
+   if (likely(bo-sync_obj)) {
+   if (unlikely(bo-sync_obj != 
prev_fence)) {
+   nouveau_fence_unref((void 
**)new_fence);
+   continue;
+   }
+   nouveau_fence_unref((void 
**)bo-sync_obj);
+   }
+   bo-sync_obj = new_fence;
+   ret = 0;
+unref_break:
+   nouveau_fence_unref((void **)prev_fence);
+   break;
+   }
+
+   if (unlikely(prev_fence != bo-sync_obj)) {
+unref_continue:
+   nouveau_fence_unref((void **)prev_fence);
+   continue;
+   }
+
+   nouveau_fence_unref((void **)prev_fence);
+   ret = ttm_bo_wait(bo, false, false, false);
+   } while (0);
+   } else
+   ret = ttm_bo_wait(bo, false, false, false);
+   spin_unlock(bo-lock);
+   return ret;
+}
+
 static void
 nouveau_bo_del_ttm(struct ttm_buffer_object *bo)
 {
@@ -469,8 +533,10 @@ nouveau_bo_move_accel_cleanup(struct nouveau_channel *chan,
 
ret = ttm_bo_move_accel_cleanup(nvbo-bo, fence, NULL,
evict, no_wait, new_mem);
+
+   /* TODO: this should be redundant, since we do the check in validate */
if (nvbo-channel  nvbo-channel != chan)
-   ret = nouveau_fence_wait(fence, NULL, false, false);
+   ret = nouveau_bo_wait(nvbo-bo, nvbo-channel);
nouveau_fence_unref((void *)fence);
return ret;
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h 
b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 5445cef..2b78ee6 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -1110,6 +1110,7 @@ extern int nv04_crtc_create(struct drm_device *, int 
index);
 
 /* nouveau_bo.c */
 extern struct ttm_bo_driver nouveau_bo_driver;
+extern int nouveau_bo_wait(struct ttm_buffer_object *bo, struct 
nouveau_channel *chan);
 extern int nouveau_bo_new(struct drm_device *, struct nouveau_channel *,
  int size, int align, uint32_t flags,
  uint32_t tile_mode, uint32_t tile_flags,
@@ -1135,6 +1136,7 @@ extern int nouveau_fence_emit(struct nouveau_fence *);
 struct nouveau_channel *nouveau_fence_channel(struct nouveau_fence *);
 extern bool nouveau_fence_signalled(void *obj, void *arg);
 extern int nouveau_fence_wait(void *obj, void *arg, bool lazy, bool intr);
+extern struct nouveau_fence *nouveau_fence_sync(struct nouveau_fence *, struct 
nouveau_channel *);
 extern int nouveau_fence_flush(void *obj, void *arg);
 extern void nouveau_fence_unref(void

[Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator

2010-02-01 Thread Luca Barbieri
This patch adds code to allocate semaphores in a dynamic way using
an algorithm with a lockless fast path.

1. Semaphore BOs

Semaphore BOs are BOs containing semaphores. Each is 4KB large and
contains 1024 4-byte semaphores. They are pinned.

Semaphore BOs are allocated on-demand and freed at device takedown.
Those that are not fully allocated are kept on a free list.

Each is assigned a handle. DMA objects and references are created
on demand for each channel that needs to use a semaphore BO.
Those objects and references are automatically destroyed at channel
destruction time.

Typically only a single semaphore BO will be needed.

2. Semaphore allocation

Each semaphore BO contains a bitmask of free semaphores within the BO.
Allocation is done in a lockless fashion using a count of free
semaphores and the bitmask.

Semaphores are released once the fence on the waiting side passed.
This is done by adding fields to nouveau_fence.

Semaphore values are zeroed when the semaphore BO is allocated, and
are afterwards only modified by the GPU.

This is done by storing a bitmask that allows alternating
between the values 0 and 1 for a given semaphore.

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_dma.h   |1 +
 drivers/gpu/drm/nouveau/nouveau_drv.h   |7 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |  243 +++
 drivers/gpu/drm/nouveau/nouveau_state.c |4 +
 4 files changed, 255 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h 
b/drivers/gpu/drm/nouveau/nouveau_dma.h
index dabfd65..0658979 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dma.h
+++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
@@ -69,6 +69,7 @@ enum {
NvGdiRect   = 0x800c,
NvImageBlit = 0x800d,
NvSw= 0x800e,
+   NvSem   = 0x8100, /* range of 16M handles */
 
/* G80+ display objects */
NvEvoVRAM   = 0x0100,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h 
b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 2b78ee6..0a7abc7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -620,6 +620,11 @@ struct drm_nouveau_private {
struct {
struct dentry *channel_root;
} debugfs;
+
+   spinlock_t sem_bo_free_list_lock;
+   struct nouveau_sem_bo *sem_bo_free_list;
+   atomic_t sem_handles;
+   uint32_t sem_max_handles;
 };
 
 static inline struct drm_nouveau_private *
@@ -1141,6 +1146,8 @@ extern int nouveau_fence_flush(void *obj, void *arg);
 extern void nouveau_fence_unref(void **obj);
 extern void *nouveau_fence_ref(void *obj);
 extern void nouveau_fence_handler(struct drm_device *dev, int channel);
+extern void nouveau_fence_device_init(struct drm_device *dev);
+extern void nouveau_fence_device_takedown(struct drm_device *dev);
 
 /* nouveau_gem.c */
 extern int nouveau_gem_new(struct drm_device *, struct nouveau_channel *,
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c 
b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 9b1c2c3..01152f3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -32,6 +32,13 @@
 
 #define USE_REFCNT (dev_priv-card_type = NV_10)
 
+#define NOUVEAU_SEM_BO_SIZE PAGE_SIZE
+
+/* reading fences can be very expensive
+ * use a threshold that would only use up half a single sem_bo
+ */
+#define NOUVEAU_SEM_MIN_THRESHOLD (NOUVEAU_SEM_BO_SIZE / 
(NOUVEAU_MAX_CHANNEL_NR * 2))
+
 struct nouveau_fence {
struct nouveau_channel *channel;
struct kref refcount;
@@ -47,6 +54,218 @@ nouveau_fence(void *sync_obj)
return (struct nouveau_fence *)sync_obj;
 }
 
+struct nouveau_sem_bo {
+   struct nouveau_sem_bo *next;
+   struct nouveau_bo *bo;
+   uint32_t handle;
+
+   /* = 0: num_free + 1 slots are free, sem_bo is or is about to be on 
free_list
+   -1: all allocated, sem_bo is NOT on free_list
+   */
+   atomic_t num_free;
+
+   DECLARE_BITMAP(free_slots, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+   DECLARE_BITMAP(values, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+   DECLARE_BITMAP(channels, NOUVEAU_MAX_CHANNEL_NR);
+};
+
+struct nouveau_sem {
+   struct nouveau_sem_bo *sem_bo;
+   unsigned num;
+   uint32_t value;
+};
+
+struct nouveau_sem_bo*
+nouveau_sem_bo_alloc(struct drm_device *dev)
+{
+   struct nouveau_sem_bo *sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
+   struct nouveau_bo *bo;
+   int flags = TTM_PL_FLAG_VRAM;
+   int ret;
+   bool is_iomem;
+   void *mem;
+
+   sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
+
+   if (!sem_bo)
+   return 0;
+
+   ret = nouveau_bo_new(dev, NULL, NOUVEAU_SEM_BO_SIZE, 0, flags,
+   0, 0x, true, true, bo);
+   if (ret)
+   goto out_free;
+
+   sem_bo-bo = bo;
+
+   ret = nouveau_bo_pin

Re: [Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator

2010-02-01 Thread Luca Barbieri
 How often do we expect cross-channel sync to kick in? Maybe 2-3 times
 per frame? I suspect contentions will be rare enough to make spinlocks
 as fast as atomics for all real-life cases, and they don't have such a
 high maintainability cost. What do you guys think?

For the case of a single (or a few) GL application the requirements
are indeed modest.

I'm not sure that spinlocks or an otherwise reduced solution would be
much simpler.
You basically would just avoid the retrying code.

Also, if you have a multithreaded/multiprocess GPGPU application on a
large SMP machine, things may change, as you may have a lot of commands
and semaphores in flight, as well as high contention for anything
global.

Of course, currently we hold both the BKL and struct_mutex around
things, which makes it all moot, but hopefully we'll switch to
per-channel mutexes soon (I'm looking into that).


Re: [Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator

2010-02-01 Thread Luca Barbieri
 Sounds like premature optimization to me. I'm just stating my personal
 view here, but I have a feeling a patch with 60% of the lines could do
 the same very well for most realistic cases.

Perhaps, but really, the only thing you would probably save by using
spinlocks in the fast path is retrying in nouveau_sem_alloc, which
should be at most 10 lines saved.

You could save much more by supporting only a single static semaphore
BO, and still retain almost all flexibility by making it large.
However, it's somewhat inelegant, and why not just keep the
functionality so we never need to revisit this again?

 BTW, the kernel has some linked list helpers you might want to use for
 sem_bo_free_list
It is a singly linked list, and slist.h never got merged.
I could possibly make it doubly linked, even though it's a bit useless.

 and probably the best place for the sem stuff to live
 is dev_priv->fence instead of the root of drm_nouveau_private.
There is no fence currently in drm_nouveau_private.
Adding a sem or fence substructure could make sense though.


[Nouveau] [PATCH] drm/nouveau: dehexify nv50_fifo.c

2010-01-30 Thread Luca Barbieri
---
 drivers/gpu/drm/nouveau/nv50_fifo.c |   68 +-
 1 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nv50_fifo.c 
b/drivers/gpu/drm/nouveau/nv50_fifo.c
index 32b244b..f0cba1e 100644
--- a/drivers/gpu/drm/nouveau/nv50_fifo.c
+++ b/drivers/gpu/drm/nouveau/nv50_fifo.c
@@ -58,7 +58,7 @@ nv50_fifo_init_thingo(struct drm_device *dev)
 
nv_wr32(dev, 0x32f4, cur-instance  12);
nv_wr32(dev, 0x32ec, nr);
-   nv_wr32(dev, 0x2500, 0x101);
+   nv_wr32(dev, NV03_PFIFO_CACHES, 0x101);
 }
 
 static int
@@ -146,7 +146,7 @@ nv50_fifo_init_regs__nv(struct drm_device *dev)
 {
NV_DEBUG(dev, \n);
 
-   nv_wr32(dev, 0x250c, 0x6f3cfc34);
+   nv_wr32(dev, NV04_PFIFO_SIZE, 0x6f3cfc34);
 }
 
 static void
@@ -154,12 +154,12 @@ nv50_fifo_init_regs(struct drm_device *dev)
 {
NV_DEBUG(dev, \n);
 
-   nv_wr32(dev, 0x2500, 0);
-   nv_wr32(dev, 0x3250, 0);
-   nv_wr32(dev, 0x3220, 0);
-   nv_wr32(dev, 0x3204, 0);
-   nv_wr32(dev, 0x3210, 0);
-   nv_wr32(dev, 0x3270, 0);
+   nv_wr32(dev, NV03_PFIFO_CACHES, 0);
+   nv_wr32(dev, NV04_PFIFO_CACHE1_PULL0, 0);
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_PUSH, 0);
+   nv_wr32(dev, NV03_PFIFO_CACHE1_PUSH1, 0);
+   nv_wr32(dev, NV03_PFIFO_CACHE1_PUT, 0);
+   nv_wr32(dev, NV03_PFIFO_CACHE1_GET, 0);
 
/* Enable dummy channels setup by nv50_instmem.c */
nv50_fifo_channel_enable(dev, 0, true);
@@ -345,9 +345,9 @@ nv50_fifo_load_context(struct nouveau_channel *chan)
 
nv_wr32(dev, 0x3330, nv_ro32(dev, ramfc, 0x00/4));
nv_wr32(dev, 0x3334, nv_ro32(dev, ramfc, 0x04/4));
-   nv_wr32(dev, 0x3240, nv_ro32(dev, ramfc, 0x08/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_PUT, nv_ro32(dev, ramfc, 0x08/4));
nv_wr32(dev, 0x3320, nv_ro32(dev, ramfc, 0x0c/4));
-   nv_wr32(dev, 0x3244, nv_ro32(dev, ramfc, 0x10/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_GET, nv_ro32(dev, ramfc, 0x10/4));
nv_wr32(dev, 0x3328, nv_ro32(dev, ramfc, 0x14/4));
nv_wr32(dev, 0x3368, nv_ro32(dev, ramfc, 0x18/4));
nv_wr32(dev, 0x336c, nv_ro32(dev, ramfc, 0x1c/4));
@@ -355,27 +355,27 @@ nv50_fifo_load_context(struct nouveau_channel *chan)
nv_wr32(dev, 0x3374, nv_ro32(dev, ramfc, 0x24/4));
nv_wr32(dev, 0x3378, nv_ro32(dev, ramfc, 0x28/4));
nv_wr32(dev, 0x337c, nv_ro32(dev, ramfc, 0x2c/4));
-   nv_wr32(dev, 0x3228, nv_ro32(dev, ramfc, 0x30/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_STATE, nv_ro32(dev, ramfc, 0x30/4));
nv_wr32(dev, 0x3364, nv_ro32(dev, ramfc, 0x34/4));
nv_wr32(dev, 0x32a0, nv_ro32(dev, ramfc, 0x38/4));
-   nv_wr32(dev, 0x3224, nv_ro32(dev, ramfc, 0x3c/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_FETCH, nv_ro32(dev, ramfc, 0x3c/4));
nv_wr32(dev, 0x324c, nv_ro32(dev, ramfc, 0x40/4));
-   nv_wr32(dev, 0x2044, nv_ro32(dev, ramfc, 0x44/4));
-   nv_wr32(dev, 0x322c, nv_ro32(dev, ramfc, 0x48/4));
+   nv_wr32(dev, NV04_PFIFO_DMA_TIMESLICE, nv_ro32(dev, ramfc, 0x44/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_INSTANCE, nv_ro32(dev, ramfc, 
0x48/4));
nv_wr32(dev, 0x3234, nv_ro32(dev, ramfc, 0x4c/4));
nv_wr32(dev, 0x3340, nv_ro32(dev, ramfc, 0x50/4));
nv_wr32(dev, 0x3344, nv_ro32(dev, ramfc, 0x54/4));
-   nv_wr32(dev, 0x3280, nv_ro32(dev, ramfc, 0x58/4));
-   nv_wr32(dev, 0x3254, nv_ro32(dev, ramfc, 0x5c/4));
-   nv_wr32(dev, 0x3260, nv_ro32(dev, ramfc, 0x60/4));
-   nv_wr32(dev, 0x3264, nv_ro32(dev, ramfc, 0x64/4));
-   nv_wr32(dev, 0x3268, nv_ro32(dev, ramfc, 0x68/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_ENGINE, nv_ro32(dev, ramfc, 0x58/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_PULL1, nv_ro32(dev, ramfc, 0x5c/4));
+   nv_wr32(dev, NV10_PFIFO_CACHE1_ACQUIRE_TIMEOUT, nv_ro32(dev, ramfc, 
0x60/4));
+   nv_wr32(dev, NV10_PFIFO_CACHE1_ACQUIRE_TIMESTAMP, nv_ro32(dev, ramfc, 
0x64/4));
+   nv_wr32(dev, NV10_PFIFO_CACHE1_ACQUIRE_VALUE, nv_ro32(dev, ramfc, 
0x68/4));
nv_wr32(dev, 0x326c, nv_ro32(dev, ramfc, 0x6c/4));
nv_wr32(dev, 0x32e4, nv_ro32(dev, ramfc, 0x70/4));
-   nv_wr32(dev, 0x3248, nv_ro32(dev, ramfc, 0x74/4));
+   nv_wr32(dev, NV10_PFIFO_CACHE1_REF_CNT, nv_ro32(dev, ramfc, 0x74/4));
nv_wr32(dev, 0x2088, nv_ro32(dev, ramfc, 0x78/4));
nv_wr32(dev, 0x2058, nv_ro32(dev, ramfc, 0x7c/4));
-   nv_wr32(dev, 0x2210, nv_ro32(dev, ramfc, 0x80/4));
+   nv_wr32(dev, NV03_PFIFO_RAMHT, nv_ro32(dev, ramfc, 0x80/4));
 
cnt = nv_ro32(dev, ramfc, 0x84/4);
for (ptr = 0; ptr  cnt; ptr++) {
@@ -430,9 +430,9 @@ nv50_fifo_unload_context(struct drm_device *dev)
 
nv_wo32(dev, ramfc, 0x00/4, nv_rd32(dev, 0x3330));
nv_wo32(dev, ramfc, 0x04/4, nv_rd32(dev, 0x3334));
-   nv_wo32(dev, ramfc, 0x08/4, nv_rd32(dev, 0x3240));
+   nv_wo32(dev, ramfc, 0x08/4, nv_rd32(dev, NV04_PFIFO_CACHE1_DMA_PUT));

[Nouveau] [PATCH] drm/nouveau: dehexify nv50_fifo.c (v2)

2010-01-30 Thread Luca Barbieri
Merged the two patches and added signoff.

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nv50_fifo.c |   84 +-
 1 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nv50_fifo.c 
b/drivers/gpu/drm/nouveau/nv50_fifo.c
index 32b244b..550cabe 100644
--- a/drivers/gpu/drm/nouveau/nv50_fifo.c
+++ b/drivers/gpu/drm/nouveau/nv50_fifo.c
@@ -58,7 +58,7 @@ nv50_fifo_init_thingo(struct drm_device *dev)
 
nv_wr32(dev, 0x32f4, cur-instance  12);
nv_wr32(dev, 0x32ec, nr);
-   nv_wr32(dev, 0x2500, 0x101);
+   nv_wr32(dev, NV03_PFIFO_CACHES, 0x101);
 }
 
 static int
@@ -146,7 +146,7 @@ nv50_fifo_init_regs__nv(struct drm_device *dev)
 {
NV_DEBUG(dev, \n);
 
-   nv_wr32(dev, 0x250c, 0x6f3cfc34);
+   nv_wr32(dev, NV04_PFIFO_SIZE, 0x6f3cfc34);
 }
 
 static void
@@ -154,12 +154,12 @@ nv50_fifo_init_regs(struct drm_device *dev)
 {
NV_DEBUG(dev, \n);
 
-   nv_wr32(dev, 0x2500, 0);
-   nv_wr32(dev, 0x3250, 0);
-   nv_wr32(dev, 0x3220, 0);
-   nv_wr32(dev, 0x3204, 0);
-   nv_wr32(dev, 0x3210, 0);
-   nv_wr32(dev, 0x3270, 0);
+   nv_wr32(dev, NV03_PFIFO_CACHES, 0);
+   nv_wr32(dev, NV04_PFIFO_CACHE1_PULL0, 0);
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_PUSH, 0);
+   nv_wr32(dev, NV03_PFIFO_CACHE1_PUSH1, 0);
+   nv_wr32(dev, NV03_PFIFO_CACHE1_PUT, 0);
+   nv_wr32(dev, NV03_PFIFO_CACHE1_GET, 0);
 
/* Enable dummy channels setup by nv50_instmem.c */
nv50_fifo_channel_enable(dev, 0, true);
@@ -345,9 +345,9 @@ nv50_fifo_load_context(struct nouveau_channel *chan)
 
nv_wr32(dev, 0x3330, nv_ro32(dev, ramfc, 0x00/4));
nv_wr32(dev, 0x3334, nv_ro32(dev, ramfc, 0x04/4));
-   nv_wr32(dev, 0x3240, nv_ro32(dev, ramfc, 0x08/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_PUT, nv_ro32(dev, ramfc, 0x08/4));
nv_wr32(dev, 0x3320, nv_ro32(dev, ramfc, 0x0c/4));
-   nv_wr32(dev, 0x3244, nv_ro32(dev, ramfc, 0x10/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_GET, nv_ro32(dev, ramfc, 0x10/4));
nv_wr32(dev, 0x3328, nv_ro32(dev, ramfc, 0x14/4));
nv_wr32(dev, 0x3368, nv_ro32(dev, ramfc, 0x18/4));
nv_wr32(dev, 0x336c, nv_ro32(dev, ramfc, 0x1c/4));
@@ -355,27 +355,27 @@ nv50_fifo_load_context(struct nouveau_channel *chan)
nv_wr32(dev, 0x3374, nv_ro32(dev, ramfc, 0x24/4));
nv_wr32(dev, 0x3378, nv_ro32(dev, ramfc, 0x28/4));
nv_wr32(dev, 0x337c, nv_ro32(dev, ramfc, 0x2c/4));
-   nv_wr32(dev, 0x3228, nv_ro32(dev, ramfc, 0x30/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_STATE, nv_ro32(dev, ramfc, 0x30/4));
nv_wr32(dev, 0x3364, nv_ro32(dev, ramfc, 0x34/4));
-   nv_wr32(dev, 0x32a0, nv_ro32(dev, ramfc, 0x38/4));
-   nv_wr32(dev, 0x3224, nv_ro32(dev, ramfc, 0x3c/4));
-   nv_wr32(dev, 0x324c, nv_ro32(dev, ramfc, 0x40/4));
-   nv_wr32(dev, 0x2044, nv_ro32(dev, ramfc, 0x44/4));
-   nv_wr32(dev, 0x322c, nv_ro32(dev, ramfc, 0x48/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_DCOUNT, nv_ro32(dev, ramfc, 0x38/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_FETCH, nv_ro32(dev, ramfc, 0x3c/4));
+   nv_wr32(dev, NV10_PFIFO_CACHE1_DMA_SUBROUTINE, nv_ro32(dev, ramfc, 
0x40/4));
+   nv_wr32(dev, NV04_PFIFO_DMA_TIMESLICE, nv_ro32(dev, ramfc, 0x44/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_DMA_INSTANCE, nv_ro32(dev, ramfc, 
0x48/4));
nv_wr32(dev, 0x3234, nv_ro32(dev, ramfc, 0x4c/4));
nv_wr32(dev, 0x3340, nv_ro32(dev, ramfc, 0x50/4));
nv_wr32(dev, 0x3344, nv_ro32(dev, ramfc, 0x54/4));
-   nv_wr32(dev, 0x3280, nv_ro32(dev, ramfc, 0x58/4));
-   nv_wr32(dev, 0x3254, nv_ro32(dev, ramfc, 0x5c/4));
-   nv_wr32(dev, 0x3260, nv_ro32(dev, ramfc, 0x60/4));
-   nv_wr32(dev, 0x3264, nv_ro32(dev, ramfc, 0x64/4));
-   nv_wr32(dev, 0x3268, nv_ro32(dev, ramfc, 0x68/4));
-   nv_wr32(dev, 0x326c, nv_ro32(dev, ramfc, 0x6c/4));
-   nv_wr32(dev, 0x32e4, nv_ro32(dev, ramfc, 0x70/4));
-   nv_wr32(dev, 0x3248, nv_ro32(dev, ramfc, 0x74/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_ENGINE, nv_ro32(dev, ramfc, 0x58/4));
+   nv_wr32(dev, NV04_PFIFO_CACHE1_PULL1, nv_ro32(dev, ramfc, 0x5c/4));
+   nv_wr32(dev, NV10_PFIFO_CACHE1_ACQUIRE_TIMEOUT, nv_ro32(dev, ramfc, 
0x60/4));
+   nv_wr32(dev, NV10_PFIFO_CACHE1_ACQUIRE_TIMESTAMP, nv_ro32(dev, ramfc, 
0x64/4));
+   nv_wr32(dev, NV10_PFIFO_CACHE1_ACQUIRE_VALUE, nv_ro32(dev, ramfc, 
0x68/4));
+   nv_wr32(dev, NV10_PFIFO_CACHE1_SEMAPHORE, nv_ro32(dev, ramfc, 0x6c/4));
+   nv_wr32(dev, NV40_PFIFO_UNK32E4, nv_ro32(dev, ramfc, 0x70/4));
+   nv_wr32(dev, NV10_PFIFO_CACHE1_REF_CNT, nv_ro32(dev, ramfc, 0x74/4));
nv_wr32(dev, 0x2088, nv_ro32(dev, ramfc, 0x78/4));
nv_wr32(dev, 0x2058, nv_ro32(dev, ramfc, 0x7c/4));
-   nv_wr32(dev, 0x2210, nv_ro32(dev, ramfc, 0x80/4));
+   nv_wr32(dev, NV03_PFIFO_RAMHT, nv_ro32(dev, ramfc, 0x80/4

[Nouveau] [PATCH 1/2] libdrm/nouveau: new optimized libdrm pushbuffer ABI

2010-01-29 Thread Luca Barbieri
This patch changes the pushbuffer ABI to:

1. No longer use/expose nouveau_pushbuffer. Everything is directly
   in nouveau_channel. This saves the extra pushbuf pointer dereference.

2. Use cur/end pointers instead of tracking the remaining size.
   Pushing data now only needs to alter cur and not both cur and remaining.

The goal is to make the *_RING macros faster and make the interface simpler
and cleaner in the process.

The *_RING APIs are unchanged, but those are inlined and the ABI is changed.
The libdrm version is thus bumped.

Also, anything accessing pushbuf-remaining instead of using AVAIL_RING
will need to be fixed.
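
For illustration, with this ABI the inlined macros reduce to roughly
the following (a hedged sketch, not the literal libdrm code):

static inline void OUT_RING(struct nouveau_channel *chan, uint32_t data)
{
	*chan->cur++ = data;
}

static inline unsigned AVAIL_RING(struct nouveau_channel *chan)
{
	return chan->end - chan->cur;
}

static inline void WAIT_RING(struct nouveau_channel *chan, unsigned size)
{
	if (chan->cur + size > chan->end)
		nouveau_pushbuf_flush(chan, size);
}

Pushing a dword touches only cur, and the remaining space is derived
from the two pointers on demand.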
---
 include/drm/nouveau_drm.h |2 +-
 nouveau/nouveau_bo.c  |2 +-
 nouveau/nouveau_channel.h |5 ++-
 nouveau/nouveau_device.c  |2 +-
 nouveau/nouveau_private.h |3 --
 nouveau/nouveau_pushbuf.c |   47 
 nouveau/nouveau_pushbuf.h |   22 ++--
 7 files changed, 34 insertions(+), 49 deletions(-)

diff --git a/include/drm/nouveau_drm.h b/include/drm/nouveau_drm.h
index 1e67c44..f764174 100644
--- a/include/drm/nouveau_drm.h
+++ b/include/drm/nouveau_drm.h
@@ -25,7 +25,7 @@
 #ifndef __NOUVEAU_DRM_H__
 #define __NOUVEAU_DRM_H__
 
-#define NOUVEAU_DRM_HEADER_PATCHLEVEL 15
+#define NOUVEAU_DRM_HEADER_PATCHLEVEL 16
 
 struct drm_nouveau_channel_alloc {
uint32_t fb_ctxdma_handle;
diff --git a/nouveau/nouveau_bo.c b/nouveau/nouveau_bo.c
index 10cc8a6..ac1b37f 100644
--- a/nouveau/nouveau_bo.c
+++ b/nouveau/nouveau_bo.c
@@ -565,7 +565,7 @@ nouveau_bo_pending(struct nouveau_bo *bo)
 struct drm_nouveau_gem_pushbuf_bo *
 nouveau_bo_emit_buffer(struct nouveau_channel *chan, struct nouveau_bo *bo)
 {
-   struct nouveau_pushbuf_priv *nvpb = nouveau_pushbuf(chan-pushbuf);
+   struct nouveau_pushbuf_priv *nvpb = nouveau_channel(chan)-pb;
struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
struct drm_nouveau_gem_pushbuf_bo *pbbo;
struct nouveau_bo *ref = NULL;
diff --git a/nouveau/nouveau_channel.h b/nouveau/nouveau_channel.h
index 294f749..ddcf8e4 100644
--- a/nouveau/nouveau_channel.h
+++ b/nouveau/nouveau_channel.h
@@ -29,11 +29,12 @@ struct nouveau_subchannel {
 };
 
 struct nouveau_channel {
+   uint32_t *cur;
+   uint32_t *end;
+
struct nouveau_device *device;
int id;
 
-   struct nouveau_pushbuf *pushbuf;
-
struct nouveau_grobj *nullobj;
struct nouveau_grobj *vram;
struct nouveau_grobj *gart;
diff --git a/nouveau/nouveau_device.c b/nouveau/nouveau_device.c
index 0982d3b..14bf8bb 100644
--- a/nouveau/nouveau_device.c
+++ b/nouveau/nouveau_device.c
@@ -26,7 +26,7 @@
 
 #include nouveau_private.h
 
-#if NOUVEAU_DRM_HEADER_PATCHLEVEL != 15
+#if NOUVEAU_DRM_HEADER_PATCHLEVEL != 16
 #error nouveau_drm.h does not match expected patchlevel, update libdrm.
 #endif
 
diff --git a/nouveau/nouveau_private.h b/nouveau/nouveau_private.h
index 39758d1..0e526a1 100644
--- a/nouveau/nouveau_private.h
+++ b/nouveau/nouveau_private.h
@@ -39,8 +39,6 @@
 #define CALPB_BUFFERS 4
 #define CALPB_BUFSZ   16384
 struct nouveau_pushbuf_priv {
-   struct nouveau_pushbuf base;
-
int no_aper_update;
int use_cal;
uint32_t cal_suffix0;
@@ -50,7 +48,6 @@ struct nouveau_pushbuf_priv {
int current_offset;
 
unsigned *pushbuf;
-   unsigned  size;
 
unsigned marker;
unsigned marker_relocs;
diff --git a/nouveau/nouveau_pushbuf.c b/nouveau/nouveau_pushbuf.c
index 7da3a47..b6af216 100644
--- a/nouveau/nouveau_pushbuf.c
+++ b/nouveau/nouveau_pushbuf.c
@@ -37,12 +37,13 @@ nouveau_pushbuf_space_call(struct nouveau_channel *chan, 
unsigned min)
struct nouveau_pushbuf_priv *nvpb = nvchan-pb;
struct nouveau_bo *bo;
int ret;
+   unsigned size;
 
if (min  PB_MIN_USER_DWORDS)
min = PB_MIN_USER_DWORDS;
 
-   nvpb-current_offset = nvpb-base.cur - nvpb-pushbuf;
-   if (nvpb-current_offset + min + 2 = nvpb-size)
+   nvpb-current_offset = chan-cur - nvpb-pushbuf;
+   if (chan-cur + min + 2 = chan-end)
return 0;
 
nvpb-current++;
@@ -54,13 +55,12 @@ nouveau_pushbuf_space_call(struct nouveau_channel *chan, 
unsigned min)
if (ret)
return ret;
 
-   nvpb-size = (bo-size - 8) / 4;
+   size = (bo-size - 8) / 4;
nvpb-pushbuf = bo-map;
nvpb-current_offset = 0;
 
-   nvpb-base.channel = chan;
-   nvpb-base.remaining = nvpb-size;
-   nvpb-base.cur = nvpb-pushbuf;
+   chan-cur = nvpb-pushbuf;
+   chan-end = nvpb-pushbuf + size;
 
nouveau_bo_unmap(bo);
return 0;
@@ -71,6 +71,7 @@ nouveau_pushbuf_space(struct nouveau_channel *chan, unsigned 
min)
 {
struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
struct nouveau_pushbuf_priv *nvpb = nvchan-pb;
+   unsigned size;
 
if (nvpb-use_cal)
return 

[Nouveau] [PATCH 2/2] libdrm/nouveau: support writing out the pushbuffer in renouveau trace format

2010-01-29 Thread Luca Barbieri
This patch causes libdrm, when NOUVEAU_DUMP=1 is set, to write the
pushbuffer to stdout instead of submitting it to the card.

renouveau-parse can then be used to parse it and obtain a readable
trace.

This is very useful for debugging and optimizing the Gallium driver.
---
 nouveau/nouveau_private.h |1 +
 nouveau/nouveau_pushbuf.c |   13 +++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/nouveau/nouveau_private.h b/nouveau/nouveau_private.h
index 0e526a1..bed117a 100644
--- a/nouveau/nouveau_private.h
+++ b/nouveau/nouveau_private.h
@@ -40,6 +40,7 @@
 #define CALPB_BUFSZ   16384
 struct nouveau_pushbuf_priv {
int no_aper_update;
+   int use_dump;
int use_cal;
uint32_t cal_suffix0;
uint32_t cal_suffix1;
diff --git a/nouveau/nouveau_pushbuf.c b/nouveau/nouveau_pushbuf.c
index b6af216..4cb8985 100644
--- a/nouveau/nouveau_pushbuf.c
+++ b/nouveau/nouveau_pushbuf.c
@@ -148,7 +148,10 @@ nouveau_pushbuf_init(struct nouveau_channel *chan)
struct nouveau_pushbuf_priv *nvpb = nvchan-pb;
int ret;
 
-   nouveau_pushbuf_init_call(chan);
+   if(getenv(NOUVEAU_DUMP))
+   nvpb-use_dump = 1;
+   if(!nvpb-use_dump)
+   nouveau_pushbuf_init_call(chan);
 
ret = nouveau_pushbuf_space(chan, 0);
if (ret) {
@@ -190,7 +193,13 @@ nouveau_pushbuf_flush(struct nouveau_channel *chan, 
unsigned min)
if (chan-cur == nvpb-pushbuf)
return 0;
 
-   if (nvpb-use_cal) {
+   if (nvpb-use_dump) {
+   uint32_t* p;
+   printf(# begin %i dwords %i buffers %i relocs\n, chan-cur - 
nvpb-pushbuf, nvpb-nr_buffers, nvpb-nr_relocs);
+   for(p = nvpb-pushbuf; p != chan-cur; ++p)
+   printf(%08x\n, *p);   
+   printf(# end\n);
+   } else if (nvpb-use_cal) {
struct drm_nouveau_gem_pushbuf_call req;
 
*(chan-cur++) = nvpb-cal_suffix0;
-- 
1.6.6.1.476.g01ddb

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH] drm/nouveau: call ttm_bo_wait with the bo lock held to prevent hang

2010-01-28 Thread Luca Barbieri
Please apply or state objections to this patch.

Thanks.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] drm/nouveau: enlarge GART aperture

2010-01-28 Thread Luca Barbieri
This patch enlarges the PCI GART aperture to 512 MB.

The current 64MB aperture is too small and should be enlarged.
The optimal amount may be card/system-dependent, so a more sophisticated
approach may be preferable.
In particular, if RAMIN is less than 1MB, a 512MB aperture won't fit.
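(For scale: assuming 4 KiB GART pages, i.e. NV_CTXDMA_PAGE_SHIFT == 12,
a 512MB aperture needs 512MB / 4KiB = 131072 page table entries at
4 bytes each, i.e. 512KB of RAMIN for the ctxdma plus the 8-byte header,
hence the RAMIN concern above.)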

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_sgdma.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c 
b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
index 4c7f1e4..2ca44cc 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
@@ -227,7 +227,7 @@ nouveau_sgdma_init(struct drm_device *dev)
int i, ret;
 
if (dev_priv-card_type  NV_50) {
-   aper_size = (64 * 1024 * 1024);
+   aper_size = (512 * 1024 * 1024);
obj_size  = (aper_size  NV_CTXDMA_PAGE_SHIFT) * 4;
obj_size += 8; /* ctxdma header */
} else {
-- 
1.6.6.1.476.g01ddb

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH] drm/ttm: Fix race condition in ttm_bo_delayed_delete

2010-01-21 Thread Luca Barbieri
 At a first glance:

 1) We probably *will* need a delayed destroyed workqueue to avoid wasting
 memory that otherwise should be freed to the system. At the very least, the
 delayed delete process should optionally be run by a system shrinker.
You are right. For VRAM we don't care since we are the only user,
while for system backed memory some delayed destruction will be
needed.
The logical extension of the scheme would be for the Linux page
allocator/swapper to check for TTM buffers to destroy when it would
otherwise shrink caches, try to swap and/or wait on swap to happen.
Not sure whether there are existing hooks for this or where exactly to
hook this code.

 2) Fences in TTM are currently not necessarily strictly ordered, and
 sequence numbers are hidden from the bo code. This means, for a given FIFO,
 fence sequence 3 may expire before fence sequence 2, depending on the usage
 of the buffer.

My definition of channel (I sometimes used FIFO incorrectly as a
synonym of that) is exactly a set of fences that are strictly ordered.
If the card has multiple HW engines, each is considered a different
channel (so that a channel becomes a (fifo, engine) pair).

We may need however to add the concept of a sync domain that would
be a set of channels that support on-GPU synchronization against each
other.
This would model hardware where channels with the same FIFO can be
synchronized together but those with different FIFOs don't, and also
multi-core GPUs where synchronization might be available only inside
each core and not across cores.

To sum it up, a GPU consists of a set of sync domains, each consisting
of a set of channels, each consisting of a sequence of fences, with
the following rules:
1. Fences within the same channel expire in order
2. If channels A and B belong to the same sync domain, it's possible
to emit a fence on A that is guaranteed to expire after an arbitrary
fence of B

Whether channels have the same FIFO or not is essentially a driver
implementation detail, and what TTM cares about is if they are in the
same sync domain.

[I just made up sync domain here: is there a standard term?]

This assumes that the synchronizability graph is a disjoint union of
complete graphs. Is there any example where it is not so?
Also, does this actually model correctly Poulsbo, or am I wrong?

Note that we could use CPU mediation more than we currently do.
For instance now Nouveau, to do inter-channel synchronization, simply
waits on the fence with the CPU immediately synchronously, while it
could instead queue the commands in software, and with an
interrupt/delayed mechanism submit them to hardware once the fence to
be waited for is expired.
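
As a strawman, the bookkeeping this model asks for is small; the names
below are invented for illustration and nothing like them exists in TTM
today:

struct ttm_sync_domain {
	struct list_head channels;	/* channels that can sync on-GPU */
};

struct ttm_sync_channel {
	struct ttm_sync_domain *domain;	/* rule 2: same domain => orderable */
	struct list_head fences;	/* rule 1: expire in emission order */
	struct list_head head;		/* entry in domain->channels */
};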
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH] drm/ttm: Fix race condition in ttm_bo_delayed_delete

2010-01-21 Thread Luca Barbieri
 Nvidia cards have a synchronization primitive that could be used to
 synchronize several FIFOs in hardware (AKA semaphores, see [1] for an
 example).

Does this operate wholly on the GPU on all nVidia cards?

It seems that at least on some GPUs this will trigger software
methods that are basically a way for the GPU to trigger an interrupt
and stop the FIFO until the CPU handles the interrupt and restarts it.

Also, is there a way on nVidia cards to get interrupts on fences, but
only where the fence sequence number is higher than a dynamically set
value? (so that one could sleep for fence X without getting an
interrupt for every single fence before that)

If not, it could possibly be hacked around by reading from a DMA
object at the address of the fence sequence number and then resizing
the DMA object so that addresses from a certain point on would trigger
a protection fault interrupt.
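
To make the question concrete, the driver-side pattern wanted is
something like the sketch below; every name in it is hypothetical, it
only illustrates "one interrupt at or after the sequence number we care
about":

static void wait_fence_at_least(struct example_fence_hw *hw, uint32_t seq)
{
	/* sequence numbers wrap, so compare via signed difference */
	if ((int32_t)(read_hw_seq(hw) - seq) >= 0)
		return;				/* already expired, no IRQ */

	set_hw_irq_threshold(hw, seq);		/* "interrupt only for >= seq" */
	wait_event(hw->wait_queue,
		   (int32_t)(read_hw_seq(hw) - seq) >= 0);
	clear_hw_irq_threshold(hw);
}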
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH] drm/ttm: Fix race condition in ttm_bo_delayed_delete

2010-01-21 Thread Luca Barbieri
I'm not sure I understand your proposal correctly.
It seems your proposal is similar to mine, replacing the term "fence
nodes" with "ttm transactions", but I'm not sure if I understand it
correctly.

Here is some pseudocode for a improved, simplified version of my proposal.
It is modified so that there are no longer distinct alive/destroy
lists, but buffers are destroyed if their ref count is zero.

list_head ram_lru_list; /* list of bos */
list_head vram_unfenced_lru_list; /* list of bos */
list_head gart_unfenced_lru_list; /* list of bos */

atomic uint64_t next_seq_num;

// if read_list and write_list are empty, the buffer is unfenced and
MUST be in an unfenced lru list
// otherwise, it is fenced and MUST be, if not zombie, on some
read_list/write_list, or if zombie, on some unfenced_list
struct ttm_buffer_object
{
   kref_t ref;
   list_head read_list; /* list of bo_fence_nodes */
   list_head write_list; /* list of bo_fence_nodes */
   list_head unfenced_list; /* list entry in
[ram|[gart|vram]_unfenced]_lru_list */
   [...]
};

// TODO: we could embed just the first two members in the
ttm_buffer_object, and steal the lowest bit on the fence pointer to
signify that
// this would optimize for the very common single-fence case
struct ttm_bo_fence_node
{
list_head bo_list; /* list entry in bo.[read|write]_list */
struct ttm_fence_node* fence;

struct ttm_buffer_object* bo;
list_head fence_list; /* list entry in fence.[vram|gart|destroy]_list */
};

struct ttm_fence
{
void* sync_object; /* may also be turned into an object containing a
ttm_fence at the start */
uint64_t seq_num; /* unique value in order of kmalloc of this ttm_fence */
list_head bo_list; /* list of bo_fence_nodes */
};

struct ttm_channel
{
list_head fence_list; /* list of ttm_fence_node */
};

ttm_flush_fences()
{
list_head vram_list[MAX_CHANNELS];
list_head gart_list[MAX_CHANNELS];
foreach channel
{
foreach fence in channel
{
 if(not driver->fence_expired(fence->sync_object))
 break;
 foreach bo_fence_node in fence.bo_list
 {
 remove bo_fence_node
 if bo.read_list and bo.write_list are both empty
 {
 if bo.refcount is zero
 destroy
 else
 {
 append to [vram|gart]_list[channel]
 }
 }
 }
}
}

// this is the n-way merge of vram_list[]s into the lru list
while(vram_list[]s are not all empty)
{
// this can use either a simple scan, or an heap
find channel such that
first_entry(vram_list[channel]).fence.seq_num is smallest

remove first_entry(vram_list[channel]) and put the bo at the
recent end of vram_unfenced_lru_list
}

same thing for gart;
}

// assume buffer is reserved, use current mechanism or mutex for that
// channel may be null for CPU waits
ttm_bo_wait(bo, channel, wait_for_write)
{
 foreach fence_node in bo.write_list
 {
 if(fence_node.channel != channel)
 driver->wait_fence(fence_node.fence.sync_object);
 }

 if(!wait_for_write)
  return;

 foreach fence_node in bo.read_list
 {
 if(fence_node.channel != channel)
  driver->wait_fence(fence_node.fence.sync_object);
 }
}

ttm_out_of_memory() takes memory_alloc_mutex
{
retry:
ttm_flush_fences();
if(we have enough space now)
return;
foreach in [vram|gart]_unfenced_lru_list
{
evict that buffer if it's big enough, no need to check fences
this uses the current ghost mechanism for accelerated moves
emit evicted_buffer_fence for after emission
}
if we didn't find a big enough buffer, evict the biggest buffer
(also consider empty space around it in size)
if(we have enough space now)
return;
if(burn_cpu_time_but_have_lowest_latencies)
{
while(!driver->fence_expired(evicted_bo->sync_object) and we
don't have enough space)
{
driver->wait_for_any_fence_to_expire();
ttm_flush_fences();
}
}
else
ttm_bo_wait(evicted_bo 0)
goto retry;
}

// assume the buffer has already been put in the desired memory space
// also assume we already waited for the buffer fences
ttm_add_buffer_to_fence(fence, bo)
{
remove bo from unfenced lru list if it's on it
for the none or single bo_fence_node in bo.read_list or
bo.write_list with bo_fence_node.fence.channel == fence.channel
{
remove bo_fence_node from bo_fence_node.fence.[gart|vram]_list
if(bo_fence_node.fence has all lists empty)
remove from channel and free the fence bo_fence_node.fence
remove bo_fence_node from bo.[read|list]_list
}

create a new bo_fence node and use it to add the bo to the fence
}

ttm_bo_refcount_drops_to_zero(bo)

Re: [Nouveau] [PATCH] drm/ttm: Fix race condition in ttm_bo_delayed_delete

2010-01-21 Thread Luca Barbieri
 If not, it could possibly be hacked around by reading from a DMA
 object at the address of the fence sequence number and then resizing
 the DMA object so that addresses from a certain point on would trigger
 a protection fault interrupt.

 I don't think you can safely modify a DMA object without stalling the
 card completely, but i think you could use e.g. PGRAPH NOTIFY interrupts
 and disable them by flipping a bit in PGRAPH when you stop caring about
 them.

The problem is that one needs to disable them *before* the one he cares about.

Suppose the card is at fence 0 and we are interested in fence 1000 expiring.

If we just enable interrupts, then we are going to be interrupted
uselessly 1000 times.
Instead, we would like to tell the card "send me interrupts for
fences, but only for fence number 1000 (or higher)".

This way one could for instance render a whole scene, and then
desiring to read it into the CPU, just ask for an interrupt once
rendering is done (i.e. wait for the framebuffer fence) and get a
single interrupt, while we cleanly sleep undisturbed in the meantime.

Basically, it would just need some way of *conditionally* causing interrupts.
If there is none, then maybe we could insert a call to a fence
mini-pushbuffer filled with NOPs that could be overwritten with an
interrupt request on demand?
Or alternatively, construct such a pushbuffer with the 2D or 3D
engines, starting from our 1000 input and the fence value generated
by the 3D engine? (this is likely to be slow though).
Or some hack like the DMA object resizing? (would it crash the GPU? or
just not work due to it caching the previous size?)
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH] drm/ttm: Fix race condition in ttm_bo_delayed_delete

2010-01-20 Thread Luca Barbieri
Yes it's fine. I sent your patch to Dave with an expanded commit
comment for merging.

Here is a possible redesign of the mechanism inspired by this issue.
It seems that what we are racing against is buffer eviction, due to
delayed deletion buffers being still kept on the LRU list.

I'm wondering if the delayed deletion design could be changed as follows:
1. Remove to-be-deleted buffers from the LRU list before putting them
on delayed delete
2. Change buffer eviction to first do a delayed deletion pass. This
should be cheap (and cheaper than the current code) because delayed
deletion stops at the first unsignaled fence.
3. Add a new delayed deletion lock/semaphore. Then, have
ttm_bo_delayed_delete take it for reading and keep it across the
function.
4. Inside the delayed deletion lock, grab the LRU lock, copy the
delayed delete list head to a local variable, set it to empty and
release the lock.
5. Iterate on the privately held list with list_for_each_safe
6. At the end of ttm_bo_delayed_delete, retake the LRU lock and readd
the remaining part of our private list at the head of the global list

This would prevent uselessly trying to evict delayed-delete buffers
(which should be processed in fence order and not LRU order), and also
prevent concurrent delayed deletions, which should be more harmful
than useful.

Furthermore, it should be possible to get rid of list locking in the
following way:
1. BOs to be delayed-deleted are added to the head of the initial
delayed deletion single linked list, using atomic cmpxchg
2. ttm_bo_delayed_delete takes the delayed deletion lock and grabs the
list with an atomic xchg of the head with zero
3. It reverses the list in place, processes the entries and puts them
at the end of a second single linked list, the recurring delayed
deletion list
4. It processes the recurring delayed deletion list, cleaning up the BOs
5. Finally, the delayed deletion lock is released

This makes adding to the delayed deletion list lockless.

The LRU list instead inherently needs to be doubly linked, so only RCU
could make it lockless, and it seems that may require using an
external list node structure (so readers don't suddenly jump to the
most recent node), and that would not be a win (except with *lots* of
CPUs).
Besides, most DRM drivers (except vmware) are taking the BKL around
all ioctls and (except nouveau) use a single pushbuffer, so this is a
bit moot anyway.

What do you think?

Anyway, this, if done, would be for the next merge window, or later,
while the current fix ought to be merged now.
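
A minimal sketch of the lockless list manipulation (steps 1-3 above);
the names are made up and the LRU interaction is omitted:

struct ddel_node { struct ddel_node *next; };

static struct ddel_node *ddel_head;	/* initial delayed-deletion list */

static void ddel_add(struct ddel_node *n)	/* step 1: lockless add */
{
	struct ddel_node *old;

	do {
		old = ddel_head;
		n->next = old;
	} while (cmpxchg(&ddel_head, old, n) != old);
}

/* steps 2-3: called with the delayed deletion lock held */
static struct ddel_node *ddel_grab_and_reverse(void)
{
	struct ddel_node *list = xchg(&ddel_head, NULL);
	struct ddel_node *rev = NULL;

	while (list) {			/* reverse to oldest-first order */
		struct ddel_node *next = list->next;
		list->next = rev;
		rev = list;
		list = next;
	}
	return rev;			/* then clean up these BOs in order */
}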
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH] drm/ttm: Fix race condition in ttm_bo_delayed_delete

2010-01-20 Thread Luca Barbieri
 Also note that the delayed delete list is not in fence order but in
 deletion-time order, which perhaps gives room for more optimizations.
You are right.
I think then that ttm_bo_delayed_delete may still need to be changed,
because it stops when ttm_bo_cleanup_refs returns -EBUSY, which
happens when a fence has not been reached.
This means that a buffer will need to wait for all previously deleted
buffers to become unused, even if it is unused itself.
Is this acceptable?

What if we get rid of the delayed destroy list, and instead append
buffers to be deleted to their fence object, and delete them when the
fence is signaled?

This also allows to do it more naturally, since the fence object can
just keep a normal reference to the buffers it fences, and unreference
them on expiration.

Then there needs to be no special delayed destruction logic, and it
would work as if the GPU were keeping a reference to the buffer
itself, using fences as a proxy to have the CPU do that work for the
GPU.

Then the delayed work is no longer "periodically destroy buffers" but
rather "periodically check if fences are expired", naturally stopping
at the first unexpired one.
Drivers that support IRQs on fences could also do the work in the
interrupt handler/tasklet instead, avoiding the delay-jiffies magic
number. This may need a NAPI-like interrupt mitigation middle layer
for optimal results though.

 But isn't an atomic cmpxchg about as costly as a spinlock?
I think it's cheaper on all architectures, as otherwise it would be
mostly pointless to have it, since you can emulate it with a spinlock.
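
A sketch of that alternative, with invented names (ttm_bo_unref is the
existing refcount drop, everything else here is hypothetical): the fence
holds an ordinary reference per buffer, and dropping those references
when it signals is all the delayed destruction needed.

struct fenced_bo {
	struct list_head entry;			/* on fence->bos */
	struct ttm_buffer_object *bo;		/* reference held by the fence */
};

static void example_fence_signaled(struct example_fence *fence)
{
	struct fenced_bo *fbo, *tmp;

	list_for_each_entry_safe(fbo, tmp, &fence->bos, entry) {
		list_del(&fbo->entry);
		ttm_bo_unref(&fbo->bo);		/* last ref => freed here */
		kfree(fbo);
	}
}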
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH] drm/ttm: Fix race condition in ttm_bo_delayed_delete

2010-01-20 Thread Luca Barbieri
When designing this, we should also keep in mind that some drivers
(e.g. nouveau) have multiple FIFO channels, and thus we would like a
buffer to be referenced for reading by multiple channels at once (and
be destroyed only when all fences are expired, obviously).
Also, hardware may support on-GPU inter-channel synchronization, and
then multiple references may be for writing too.

If we use an external dynamically allocated channel/buffer list node,
we can support this (if the kernel allocators aren't fast enough,
which they should be, we can just keep released ones linked to the bo
to speed allocations).

Note that in nouveau there is a small hardware limit to channels (up
to 128 on nv50), but future hardware may possibly support unlimited
channels.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH 1/2] nv30-nv40: Rewrite primitive splitting and emission

2010-01-18 Thread Luca Barbieri
The current code for primitive splitting and emission on pre-nv50 is
severely broken.

In particular:
1. Quads and lines are totally broken because &= 3 should be &= ~3
and similar for lines
2. Triangle fans and polygons are broken because the first vertex
must be repeated for each split chunk
3. Line loops are broken because they must be converted to a line strip,
reemitting the first vertex at the end to close the loop
4. Quad strips are broken because 2 vertices must be repeated, and not 3

This patch rewrites the splitting code to work correctly for all
primitives, including those with adjacency.

It also rewrites the nv30/nv40 code to make use of the the new splitting
code and simplifies it by making more code common between the different
emission strategies (vbo, u8/u16/u32 indices, index buffer).

Note that the splitting helper is now independent of nouveau and could
be moved into gallium/auxiliary.
---
 src/gallium/drivers/nouveau/nouveau_util.h |  107 ---
 src/gallium/drivers/nv30/nv30_vbo.c|  287 
 src/gallium/drivers/nv40/nv40_vbo.c|  280 +++
 3 files changed, 403 insertions(+), 271 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nouveau_util.h 
b/src/gallium/drivers/nouveau/nouveau_util.h
index a10114b..a74d75d 100644
--- a/src/gallium/drivers/nouveau/nouveau_util.h
+++ b/src/gallium/drivers/nouveau/nouveau_util.h
@@ -1,63 +1,124 @@
 #ifndef __NOUVEAU_UTIL_H__
 #define __NOUVEAU_UTIL_H__
 
-/* Determine how many vertices can be pushed into the command stream.
- * Where the remaining space isn't large enough to represent all verices,
- * split the buffer at primitive boundaries.
+// output v[global_start] before the chunk
+#define SPLIT_BEGIN_WITH_FIRST 1
+
+// output v[global_start] before the chunk, with edgeflag off
+#define SPLIT_BEGIN_WITH_FIRST_EDGEFLAG_OFF 2
+
+// output v[global_start] after the chunk
+#define SPLIT_END_WITH_FIRST 4
+
+// output v[chunk_start - 4], v[chunk_start - 6], v[chunk_start - 2] before 
the chunk
+// output v[chunk_end + 1] after the chunk
+#define SPLIT_TRIANGLE_STRIP_ADJACENCY 8
+
+/* private flags for nouveau_vbuf_split to keep state */
+#define SPLIT_CLOSE_LOOP 0x4000
+
+/* If 0 is returned, you must flush and retry
  *
- * Returns a count of vertices that can be rendered, and an index to
- * restart drawing at after a flush.
+ * Otherwise:
+ * 1. Output special vertices at the beginning if the _old_ value of flags 
specifies that
+ * 2. Output the returned amount of vertices
+ * 3. Output special vertices at the end if the _new_ value of flags specifies 
that
+ * 3. Set start to *restart. If *restart == end, you are done
  */
 static INLINE unsigned
-nouveau_vbuf_split(unsigned remaining, unsigned overhead, unsigned vpp,
-  unsigned mode, unsigned start, unsigned count,
-  unsigned *restart)
+util_split_primitive(int max, unsigned* pmode, unsigned* pstart, unsigned end, 
unsigned* flags)
 {
-   int max, adj = 0;
+   unsigned mode = *pmode;
+   unsigned start = *pstart;
+   unsigned count = end - start;
+   int adj = 0;
 
-   max  = remaining - overhead;
if (max  0)
return 0;
 
-   max *= vpp;
if (max = count)
+   {
+   if(*flags  SPLIT_CLOSE_LOOP)
+   *flags |= SPLIT_END_WITH_FIRST;
+
+   *flags = ~SPLIT_TRIANGLE_STRIP_ADJACENCY;
+   *pstart = end;
+
return count;
+   }
 
switch (mode) {
case PIPE_PRIM_POINTS:
break;
case PIPE_PRIM_LINES:
-   max = max  1;
-   break;
-   case PIPE_PRIM_TRIANGLES:
-   max = max - (max % 3);
-   break;
-   case PIPE_PRIM_QUADS:
-   max = max  3;
+   max = ~1;
break;
case PIPE_PRIM_LINE_LOOP:
+   if (max  2)
+   return 0;
+   adj = 1;
+   *pmode = PIPE_PRIM_LINE_STRIP;
+   *flags |= SPLIT_CLOSE_LOOP;
+   break;
case PIPE_PRIM_LINE_STRIP:
if (max  2)
-   max = 0;
+   return 0;
adj = 1;
break;
-   case PIPE_PRIM_POLYGON:
+   case PIPE_PRIM_TRIANGLES:
+   max = max - (max % 3);
+   break;
case PIPE_PRIM_TRIANGLE_STRIP:
-   case PIPE_PRIM_TRIANGLE_FAN:
if (max  3)
max = 0;
adj = 2;
break;
+   case PIPE_PRIM_TRIANGLE_FAN:
+   if(max  3)
+   return 0;
+   adj = 1;
+   *flags |= SPLIT_BEGIN_WITH_FIRST;
+   break;
+   case PIPE_PRIM_QUADS:
+   max = ~3;
+   break;
case PIPE_PRIM_QUAD_STRIP:
+   max = ~1;
if (max  4)
-  

[Nouveau] [PATCH 2/2] nv40: output relocations on draw calls and not on flushes

2010-01-18 Thread Luca Barbieri
Currently we emit relocations on pushbuffer flushes.
However, this is wrong, because the pushbuffer flushes may be due to 2D
calls.
In particular, this leads to "-22: validating while mapped" errors in
dmesg, since the current vertex buffer can be mapped while a non-draw
(e.g. surface_copy) call is done.
If we relocate on flushes, the relocations cause those errors.

The solution is to only set a bitmask of the needed relocations on flush,
and lazily emit them before emitting primitives.

This should totally eliminate the "-22: validate while mapped" errors.

This patch requires the previous primitive splitting patch.

nv30 and nv50 ought to be fixed in a similar way.

nv50 had a fix for this, but I think this approach is much better.
---
 src/gallium/drivers/nouveau/nouveau_stateobj.h |   12 ++---
 src/gallium/drivers/nv40/nv40_context.c|3 -
 src/gallium/drivers/nv40/nv40_context.h|9 +++-
 src/gallium/drivers/nv40/nv40_screen.c |2 +
 src/gallium/drivers/nv40/nv40_screen.h |3 +
 src/gallium/drivers/nv40/nv40_state_emit.c |   57 
 src/gallium/drivers/nv40/nv40_vbo.c|   14 +++---
 7 files changed, 73 insertions(+), 27 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h 
b/src/gallium/drivers/nouveau/nouveau_stateobj.h
index e844f6a..06ab028 100644
--- a/src/gallium/drivers/nouveau/nouveau_stateobj.h
+++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h
@@ -273,7 +273,6 @@ static INLINE void
 so_emit_reloc_markers(struct nouveau_channel *chan, struct nouveau_stateobj 
*so)
 {
struct nouveau_pushbuf *pb = chan-pushbuf;
-   struct nouveau_grobj *gr = NULL;
unsigned i;
int ret = 0;
 
@@ -291,14 +290,11 @@ so_emit_reloc_markers(struct nouveau_channel *chan, 
struct nouveau_stateobj *so)
}
 #endif /* DEBUG_NOUVEAU_STATEOBJ */
 
-   /* The object needs to be bound and the system must know the
-* subchannel is being used. Otherwise it will discard it.
+   /* We don't need to autobind, since there are enough subchannels
+* for all objects we use. If this is changed, account for the 
extra
+* space in callers of this function.
 */
-   if (gr != r-gr) {
-   BEGIN_RING(chan, r-gr, 0x100, 1);
-   OUT_RING(chan, 0);
-   gr = r-gr;
-   }
+   assert(r-gr-bound != NOUVEAU_GROBJ_UNBOUND);
 
/* Some relocs really don't like to be hammered,
 * NOUVEAU_BO_DUMMY makes sure it only
diff --git a/src/gallium/drivers/nv40/nv40_context.c 
b/src/gallium/drivers/nv40/nv40_context.c
index f79ae4d..8fab88f 100644
--- a/src/gallium/drivers/nv40/nv40_context.c
+++ b/src/gallium/drivers/nv40/nv40_context.c
@@ -69,9 +69,6 @@ nv40_create(struct pipe_screen *pscreen, unsigned pctx_id)
nv40-pipe.is_texture_referenced = nouveau_is_texture_referenced;
nv40-pipe.is_buffer_referenced = nouveau_is_buffer_referenced;
 
-   screen-base.channel-user_private = nv40;
-   screen-base.channel-flush_notify = nv40_state_flush_notify;
-
nv40_init_query_functions(nv40);
nv40_init_surface_functions(nv40);
nv40_init_state_functions(nv40);
diff --git a/src/gallium/drivers/nv40/nv40_context.h 
b/src/gallium/drivers/nv40/nv40_context.h
index e219bb5..220cd27 100644
--- a/src/gallium/drivers/nv40/nv40_context.h
+++ b/src/gallium/drivers/nv40/nv40_context.h
@@ -100,6 +100,7 @@ struct nv40_state {
unsigned fp_samplers;
 
uint64_t dirty;
+   uint64_t emit_relocs;
struct nouveau_stateobj *hw[NV40_STATE_MAX];
 };
 
@@ -199,7 +200,13 @@ extern void nv40_fragtex_bind(struct nv40_context *);
 extern boolean nv40_state_validate(struct nv40_context *nv40);
 extern boolean nv40_state_validate_swtnl(struct nv40_context *nv40);
 extern void nv40_state_emit(struct nv40_context *nv40);
-extern void nv40_state_flush_notify(struct nouveau_channel *chan);
+extern void nv40_state_start(struct nv40_context *nv40, unsigned space);
+static inline void nv40_state_finish(struct nv40_context *nv40)
+{
+   /* if this triggers, it means we flushed in the meantime, which must 
not happen */
+   assert(!(nv40-screen-need_relocs  (1ULL  NV40_STATE_FB)));
+}
+
 extern struct nv40_state_entry nv40_state_rasterizer;
 extern struct nv40_state_entry nv40_state_scissor;
 extern struct nv40_state_entry nv40_state_stipple;
diff --git a/src/gallium/drivers/nv40/nv40_screen.c 
b/src/gallium/drivers/nv40/nv40_screen.c
index 21320ba..d57461c 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -180,6 +180,8 @@ nv40_screen_create(struct pipe_winsys *ws, struct 
nouveau_device *dev)
return NULL;
}
chan = screen-base.channel;
+   chan-user_private = screen;
+   chan-flush_notify = 

[Nouveau] [PATCH] nv40: add support for ARB_half_float_vertex

2010-01-18 Thread Luca Barbieri
This requires the arb_half_float_vertex Mesa branch, plus some unreleased
gallium support work by Dave Airlie.

You may need to fix an assertion in st_pipe_vertex_format too.
---
 src/gallium/drivers/nv40/nv40_vbo.c |   14 ++
 1 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/src/gallium/drivers/nv40/nv40_vbo.c 
b/src/gallium/drivers/nv40/nv40_vbo.c
index e85d6b4..211f3bd 100644
--- a/src/gallium/drivers/nv40/nv40_vbo.c
+++ b/src/gallium/drivers/nv40/nv40_vbo.c
@@ -11,6 +11,10 @@
 
 #define FORCE_SWTNL 0
 
+#ifndef NV40TCL_VTXFMT_TYPE_HALF
+#define NV40TCL_VTXFMT_TYPE_HALF 3
+#endif
+
 static INLINE int
 nv40_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp)
 {
@@ -21,6 +25,12 @@ nv40_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, 
unsigned *ncomp)
case PIPE_FORMAT_R32G32B32A32_FLOAT:
*fmt = NV40TCL_VTXFMT_TYPE_FLOAT;
break;
+   case PIPE_FORMAT_R16X16_FLOAT:
+   case PIPE_FORMAT_R16G16_FLOAT:
+   case PIPE_FORMAT_R16G16B16X16_FLOAT:
+   case PIPE_FORMAT_R16G16B16A16_FLOAT:
+   *fmt = NV40TCL_VTXFMT_TYPE_HALF;
+   break;
case PIPE_FORMAT_R8_UNORM:
case PIPE_FORMAT_R8G8_UNORM:
case PIPE_FORMAT_R8G8B8_UNORM:
@@ -41,21 +51,25 @@ nv40_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, 
unsigned *ncomp)
switch (pipe) {
case PIPE_FORMAT_R8_UNORM:
case PIPE_FORMAT_R32_FLOAT:
+   case PIPE_FORMAT_R16X16_FLOAT:
case PIPE_FORMAT_R16_SSCALED:
*ncomp = 1;
break;
case PIPE_FORMAT_R8G8_UNORM:
case PIPE_FORMAT_R32G32_FLOAT:
+   case PIPE_FORMAT_R16G16_FLOAT:
case PIPE_FORMAT_R16G16_SSCALED:
*ncomp = 2;
break;
case PIPE_FORMAT_R8G8B8_UNORM:
case PIPE_FORMAT_R32G32B32_FLOAT:
+   case PIPE_FORMAT_R16G16B16X16_FLOAT:
case PIPE_FORMAT_R16G16B16_SSCALED:
*ncomp = 3;
break;
case PIPE_FORMAT_R8G8B8A8_UNORM:
case PIPE_FORMAT_R32G32B32A32_FLOAT:
+   case PIPE_FORMAT_R16G16B16A16_FLOAT:
case PIPE_FORMAT_R16G16B16A16_SSCALED:
*ncomp = 4;
break;
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] nv40: add missing vertprog setcond instructions

2010-01-18 Thread Luca Barbieri
Trivially adds SEQ, SGT, SLE, SNE, SFL, STR and SSG which were missing.
---
 src/gallium/drivers/nv40/nv40_vertprog.c |   21 +
 1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c 
b/src/gallium/drivers/nv40/nv40_vertprog.c
index 80f2a87..578f10e 100644
--- a/src/gallium/drivers/nv40/nv40_vertprog.c
+++ b/src/gallium/drivers/nv40/nv40_vertprog.c
@@ -551,6 +551,27 @@ nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
case TGSI_OPCODE_SLT:
arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
break;
+   case TGSI_OPCODE_SEQ:
+   arith(vpc, 0, OP_SEQ, dst, mask, src[0], src[1], none);
+   break;
+   case TGSI_OPCODE_SGT:
+   arith(vpc, 0, OP_SGT, dst, mask, src[0], src[1], none);
+   break;
+   case TGSI_OPCODE_SLE:
+   arith(vpc, 0, OP_SLE, dst, mask, src[0], src[1], none);
+   break;
+   case TGSI_OPCODE_SNE:
+   arith(vpc, 0, OP_SNE, dst, mask, src[0], src[1], none);
+   break;
+   case TGSI_OPCODE_SFL:
+   arith(vpc, 0, OP_SFL, dst, mask, src[0], src[1], none);
+   break;
+   case TGSI_OPCODE_STR:
+   arith(vpc, 0, OP_STR, dst, mask, src[0], src[1], none);
+   break;
+   case TGSI_OPCODE_SSG:
+   arith(vpc, 0, OP_SSG, dst, mask, src[0], src[1], none);
+   break;
case TGSI_OPCODE_SUB:
arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
break;
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] nv40: add missing vertprog setcond instructions (v2)

2010-01-18 Thread Luca Barbieri
Trivially adds SEQ, SGT, SLE, SNE, SFL, STR and SSG which were missing.

Changed to preserv alphabetic order of cases.
---
 src/gallium/drivers/nv40/nv40_vertprog.c |   21 +
 1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c 
b/src/gallium/drivers/nv40/nv40_vertprog.c
index 80f2a87..2f86e64 100644
--- a/src/gallium/drivers/nv40/nv40_vertprog.c
+++ b/src/gallium/drivers/nv40/nv40_vertprog.c
@@ -545,12 +545,33 @@ nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
case TGSI_OPCODE_RSQ:
arith(vpc, 1, OP_RSQ, dst, mask, none, none, abs(src[0]));
break;
+   case TGSI_OPCODE_SEQ:
+   arith(vpc, 0, OP_SEQ, dst, mask, src[0], src[1], none);
+   break;
+   case TGSI_OPCODE_SFL:
+   arith(vpc, 0, OP_SFL, dst, mask, src[0], src[1], none);
+   break;
case TGSI_OPCODE_SGE:
arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none);
break;
+   case TGSI_OPCODE_SGT:
+   arith(vpc, 0, OP_SGT, dst, mask, src[0], src[1], none);
+   break;
+   case TGSI_OPCODE_SLE:
+   arith(vpc, 0, OP_SLE, dst, mask, src[0], src[1], none);
+   break;
case TGSI_OPCODE_SLT:
arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
break;
+   case TGSI_OPCODE_SNE:
+   arith(vpc, 0, OP_SNE, dst, mask, src[0], src[1], none);
+   break;
+   case TGSI_OPCODE_SSG:
+   arith(vpc, 0, OP_SSG, dst, mask, src[0], src[1], none);
+   break;
+   case TGSI_OPCODE_STR:
+   arith(vpc, 0, OP_STR, dst, mask, src[0], src[1], none);
+   break;
case TGSI_OPCODE_SUB:
arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
break;
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] nv30-nv40: support unlimited queries (v2)

2010-01-18 Thread Luca Barbieri
Currently on NV30/NV40 an assert will be triggered once 32 queries are
outstanding.

This violates the OpenGL/Gallium interface, which requires support for
an unlimited number of fences.

This patch fixes the problem by putting queries in a linked list and
waiting on the oldest one if allocation fails.

nVidia seems to use a similar strategy, but with 1024 instead of 32 fences.
The next patch will improve this.

Fixed indentation and added header for query_list.
---
 src/gallium/drivers/nv30/nv30_query.c  |   26 ++
 src/gallium/drivers/nv30/nv30_screen.c |2 ++
 src/gallium/drivers/nv30/nv30_screen.h |2 ++
 src/gallium/drivers/nv40/nv40_query.c  |   26 ++
 src/gallium/drivers/nv40/nv40_screen.c |2 ++
 src/gallium/drivers/nv40/nv40_screen.h |2 ++
 6 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/nv30/nv30_query.c 
b/src/gallium/drivers/nv30/nv30_query.c
index e27e9cc..eeab223 100644
--- a/src/gallium/drivers/nv30/nv30_query.c
+++ b/src/gallium/drivers/nv30/nv30_query.c
@@ -3,6 +3,7 @@
 #include nv30_context.h
 
 struct nv30_query {
+   struct list_head list;
struct nouveau_resource *object;
unsigned type;
boolean ready;
@@ -23,6 +24,8 @@ nv30_query_create(struct pipe_context *pipe, unsigned 
query_type)
q = CALLOC(1, sizeof(struct nv30_query));
q-type = query_type;
 
+   assert(q-type == PIPE_QUERY_OCCLUSION_COUNTER);
+
return (struct pipe_query *)q;
 }
 
@@ -32,7 +35,10 @@ nv30_query_destroy(struct pipe_context *pipe, struct 
pipe_query *pq)
struct nv30_query *q = nv30_query(pq);
 
if (q-object)
+   {
nouveau_resource_free(q-object);
+   LIST_DEL(q-list);
+   }
FREE(q);
 }
 
@@ -44,20 +50,25 @@ nv30_query_begin(struct pipe_context *pipe, struct 
pipe_query *pq)
struct nv30_screen *screen = nv30-screen;
struct nouveau_channel *chan = screen-base.channel;
struct nouveau_grobj *rankine = screen-rankine;
-
-   assert(q-type == PIPE_QUERY_OCCLUSION_COUNTER);
+   uint64_t tmp;
 
/* Happens when end_query() is called, then another begin_query()
 * without querying the result in-between.  For now we'll wait for
 * the existing query to notify completion, but it could be better.
 */
-   if (q-object) {
-   uint64_t tmp;
+   if (q-object)
pipe-get_query_result(pipe, pq, 1, tmp);
+
+   while (nouveau_resource_alloc(nv30-screen-query_heap, 1, NULL, 
q-object))
+   {
+   struct nv30_query* oldestq;
+   assert(!LIST_IS_EMPTY(nv30-screen-query_list));
+   oldestq = LIST_ENTRY(struct nv30_query, 
nv30-screen-query_list.next, list);
+   pipe-get_query_result(pipe, (struct pipe_query*)oldestq, 1, 
tmp);
}
 
-   if (nouveau_resource_alloc(nv30-screen-query_heap, 1, NULL, 
q-object))
-   assert(0);
+   LIST_ADDTAIL(q-list, nv30-screen-query_list);
+
nouveau_notifier_reset(nv30-screen-query, q-object-start);
 
BEGIN_RING(chan, rankine, NV34TCL_QUERY_RESET, 1);
@@ -90,8 +101,6 @@ nv30_query_result(struct pipe_context *pipe, struct 
pipe_query *pq,
struct nv30_context *nv30 = nv30_context(pipe);
struct nv30_query *q = nv30_query(pq);
 
-   assert(q-object  q-type == PIPE_QUERY_OCCLUSION_COUNTER);
-
if (!q-ready) {
unsigned status;
 
@@ -110,6 +119,7 @@ nv30_query_result(struct pipe_context *pipe, struct 
pipe_query *pq,
q-object-start);
q-ready = TRUE;
nouveau_resource_free(q-object);
+   LIST_DEL(q-list);
}
 
*result = q-result;
diff --git a/src/gallium/drivers/nv30/nv30_screen.c 
b/src/gallium/drivers/nv30/nv30_screen.c
index 9ed4817..755db43 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -261,6 +261,8 @@ nv30_screen_create(struct pipe_winsys *ws, struct 
nouveau_device *dev)
return NULL;
}
 
+   LIST_INITHEAD(screen-query_list);
+
/* Vtxprog resources */
if (nouveau_resource_init(screen-vp_exec_heap, 0, 256) ||
nouveau_resource_init(screen-vp_data_heap, 0, 256)) {
diff --git a/src/gallium/drivers/nv30/nv30_screen.h 
b/src/gallium/drivers/nv30/nv30_screen.h
index 5fbd998..4e8b55c 100644
--- a/src/gallium/drivers/nv30/nv30_screen.h
+++ b/src/gallium/drivers/nv30/nv30_screen.h
@@ -1,6 +1,7 @@
 #ifndef __NV30_SCREEN_H__
 #define __NV30_SCREEN_H__
 
+#include util/u_double_list.h
 #include nouveau/nouveau_screen.h
 
 #include nv04/nv04_surface_2d.h
@@ -20,6 +21,7 @@ struct nv30_screen {
/* Query object resources */
struct nouveau_notifier *query;
struct nouveau_resource *query_heap;
+   struct list_head query_list;
 
/* 

Re: [Nouveau] [Mesa3d-dev] [PATCH 2/2] st: don't assert on empty fragment program

2010-01-18 Thread Luca Barbieri
Breakpoint 3, _mesa_ProgramStringARB (target=34820, format=34933,
len=70, string=0x85922ba) at shader/arbprogram.c:434
434GET_CURRENT_CONTEXT(ctx);
$31 = 0x85922ba !!ARBfp1.0\n\nOPTION
ARB_precision_hint_fastest;\n\n\n\nEND\n

Not sure why Sauerbraten does this, but it does, at least on my system
(Ubuntu Karmic, nv40 driver) and it should be legal.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [Mesa3d-dev] [PATCH] glsl: put varyings in texcoord slots

2010-01-18 Thread Luca Barbieri
 If you get this patch in, then you'll still have to fight with every
 other state tracker that doesn't prettify their TGSI. It would be a
 much better approach to attempt to RE the routing tables.

I don't think there are any users of the Gallium interface that need more
than 8 vertex outputs/fragment inputs and don't use sequential values
starting at 0, except the GLSL linker without this patch.

ARB_fragment_program and ARB_vertex_program are limited to texcoord
slots, and Mesa should advertise only 8 of them.
Also users of this interface will likely only use as many as they
need, sequentially.

Vega, xorg seem to only use up to 2 slots.
g3dvl up to 8 (starting from 0, of course).

Cards with less than 8 slots may sometimes still have problems, but
such cards will probably be DX8 cards that don't work anyway.

Furthermore, even if you can route things, usings vertex outputs and
fragment inputs with lower indices may be more efficient anyway.

As for REing the tables, it may not be possible.
This is the code that apparently sets them up right now:
/* vtxprog output routing */
so_method(so, screen-curie, 0x1fc4, 1);
so_data  (so, 0x06144321);
so_method(so, screen-curie, 0x1fc8, 2);
so_data  (so, 0xedcba987);
so_data  (so, 0x0021);
so_method(so, screen-curie, 0x1fd0, 1);
so_data  (so, 0x00171615);
so_method(so, screen-curie, 0x1fd4, 1);
so_data  (so, 0x001b1a19);

This makes me think that only 4 bits might be used for the values
(look at the arithmetic progressions of 4-bit values), so that there
is a limit of 16 vertex output/fragment inputs.
If GLSL starts at index 10, we are still in trouble because less than
8 varyings will be available.

Also leaving vertex outputs/fragment inputs unused by starting at high
values may be bad for performance even if supported, as it may lead to
a bigger register file and thus fewer simultaneous GPU threads running.

In other words, having GLSL start at index 10 is easily avoided, and
causes problems nothing else causes, so why not just stop doing that?
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] libdrm/nouveau: Support nested bo mapping

2010-01-17 Thread Luca Barbieri
Most Gallium drivers support nested mapping by using a reference count.
We don't, and swtnl fallback triggers an error due to this.

This patch adds this support in libdrm.
---
 nouveau/nouveau_bo.c  |8 +++-
 nouveau/nouveau_private.h |1 +
 2 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/nouveau/nouveau_bo.c b/nouveau/nouveau_bo.c
index 10cc8a6..c1a9843 100644
--- a/nouveau/nouveau_bo.c
+++ b/nouveau/nouveau_bo.c
@@ -417,9 +417,12 @@ nouveau_bo_map_range(struct nouveau_bo *bo, uint32_t 
delta, uint32_t size,
struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
int ret;
 
-   if (!nvbo || bo-map)
+   if (!nvbo)
return -EINVAL;
 
+   if(nvbo-map_count++)
+   return 0;
+
if (!nouveau_bo_allocated(nvbo)) {
if (nvbo-flags  (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART)) {
ret = nouveau_bo_kalloc(nvbo, NULL);
@@ -470,6 +473,9 @@ nouveau_bo_unmap(struct nouveau_bo *bo)
 {
struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
 
+   if(--nvbo-map_count)
+   return 0;
+
if (bo-map  !nvbo-sysmem) {
struct nouveau_device_priv *nvdev = nouveau_device(bo-device);
struct drm_nouveau_gem_cpu_fini req;
diff --git a/nouveau/nouveau_private.h b/nouveau/nouveau_private.h
index 39758d1..512bc1e 100644
--- a/nouveau/nouveau_private.h
+++ b/nouveau/nouveau_private.h
@@ -115,6 +115,7 @@ struct nouveau_bo_priv {
drm_handle_t handle;
uint64_t map_handle;
void *map;
+   unsigned map_count;
 
/* Last known information from kernel on buffer status */
int pinned;
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH 1/2] nv40: don't crash on empty fragment program

2010-01-17 Thread Luca Barbieri
---
 src/gallium/drivers/nv40/nv40_fragprog.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c 
b/src/gallium/drivers/nv40/nv40_fragprog.c
index 1237066..209d211 100644
--- a/src/gallium/drivers/nv40/nv40_fragprog.c
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -843,7 +843,8 @@ nv40_fragprog_translate(struct nv40_context *nv40,
fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT;
 
/* Terminate final instruction */
-   fp->insn[fpc->inst_offset] |= 0x0001;
+   if(fp->insn)
+fp->insn[fpc->inst_offset] |= 0x0001;
 
/* Append NOP + END instruction, may or may not be necessary. */
fpc-inst_offset = fp-insn_len;
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH 2/2] st: don't assert on empty fragment program

2010-01-17 Thread Luca Barbieri
Sauerbraten triggers this assert.
---
 src/mesa/state_tracker/st_atom_shader.c |2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_shader.c 
b/src/mesa/state_tracker/st_atom_shader.c
index 176f3ea..fce533a 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -79,8 +79,6 @@ translate_fp(struct st_context *st,
 
   stfp->num_input_slots = numIn;
 
-  assert(stfp->Base.Base.NumInstructions > 1);
-
   st_translate_fragment_program(st, stfp, stfp->input_to_slot);
}
 }
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH 1/2] nv30-nv40: support unlimited queries

2010-01-17 Thread Luca Barbieri
Currently on NV30/NV40 an assert will be triggered once 32 queries are
outstanding.

This violates the OpenGL/Gallium interface, which requires support for
an unlimited number of queries.

This patch fixes the problem by putting queries in a linked list and
waiting on the oldest one if allocation fails.

nVidia seems to use a similar strategy, but with 1024 instead of 32 queries.
The next patch will improve this.
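
For clarity, this is the core of the new begin_query() allocation path
with the mail-mangled characters restored (same logic as the hunk below):

	/* try to grab a slot in the fixed-size notifier heap; if it is
	 * full, synchronously read back the oldest outstanding query,
	 * which frees its slot, then retry */
	while (nouveau_resource_alloc(nv30->screen->query_heap, 1, NULL, &q->object)) {
		struct nv30_query *oldestq;
		uint64_t tmp;

		assert(!LIST_IS_EMPTY(&nv30->screen->query_list));
		oldestq = LIST_ENTRY(struct nv30_query,
				     nv30->screen->query_list.next, list);
		pipe->get_query_result(pipe, (struct pipe_query *)oldestq, 1, &tmp);
	}
	LIST_ADDTAIL(&q->list, &nv30->screen->query_list);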
---
 src/gallium/drivers/nv30/nv30_query.c  |   26 ++
 src/gallium/drivers/nv30/nv30_screen.c |2 ++
 src/gallium/drivers/nv30/nv30_screen.h |1 +
 src/gallium/drivers/nv40/nv40_query.c  |   26 ++
 src/gallium/drivers/nv40/nv40_screen.c |2 ++
 src/gallium/drivers/nv40/nv40_screen.h |1 +
 6 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/nv30/nv30_query.c 
b/src/gallium/drivers/nv30/nv30_query.c
index e27e9cc..c0d192b 100644
--- a/src/gallium/drivers/nv30/nv30_query.c
+++ b/src/gallium/drivers/nv30/nv30_query.c
@@ -3,6 +3,7 @@
 #include nv30_context.h
 
 struct nv30_query {
+struct list_head list;
struct nouveau_resource *object;
unsigned type;
boolean ready;
@@ -23,6 +24,8 @@ nv30_query_create(struct pipe_context *pipe, unsigned 
query_type)
q = CALLOC(1, sizeof(struct nv30_query));
q->type = query_type;
 
+   assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+
return (struct pipe_query *)q;
 }
 
@@ -32,7 +35,10 @@ nv30_query_destroy(struct pipe_context *pipe, struct 
pipe_query *pq)
struct nv30_query *q = nv30_query(pq);
 
if (q->object)
+   {
nouveau_resource_free(&q->object);
+LIST_DEL(&q->list);
+   }
FREE(q);
 }
 
@@ -44,20 +50,25 @@ nv30_query_begin(struct pipe_context *pipe, struct 
pipe_query *pq)
struct nv30_screen *screen = nv30->screen;
struct nouveau_channel *chan = screen->base.channel;
struct nouveau_grobj *rankine = screen->rankine;
-
-   assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+uint64_t tmp;
 
/* Happens when end_query() is called, then another begin_query()
 * without querying the result in-between.  For now we'll wait for
 * the existing query to notify completion, but it could be better.
 */
-   if (q->object) {
-   uint64_t tmp;
+   if (q->object)
pipe->get_query_result(pipe, pq, 1, &tmp);
+
+   while (nouveau_resource_alloc(nv30->screen->query_heap, 1, NULL, 
&q->object))
+   {
+   struct nv30_query* oldestq;
+   assert(!LIST_IS_EMPTY(&nv30->screen->query_list));
+   oldestq = LIST_ENTRY(struct nv30_query, 
nv30->screen->query_list.next, list);
+   pipe->get_query_result(pipe, (struct pipe_query*)oldestq, 1, 
&tmp);
}
 
-   if (nouveau_resource_alloc(nv30->screen->query_heap, 1, NULL, 
&q->object))
-   assert(0);
+   LIST_ADDTAIL(&q->list, &nv30->screen->query_list);
+
nouveau_notifier_reset(nv30->screen->query, q->object->start);
 
BEGIN_RING(chan, rankine, NV34TCL_QUERY_RESET, 1);
@@ -90,8 +101,6 @@ nv30_query_result(struct pipe_context *pipe, struct 
pipe_query *pq,
struct nv30_context *nv30 = nv30_context(pipe);
struct nv30_query *q = nv30_query(pq);
 
-   assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER);
-
if (!q->ready) {
unsigned status;
 
@@ -110,6 +119,7 @@ nv30_query_result(struct pipe_context *pipe, struct 
pipe_query *pq,
q->object->start);
q->ready = TRUE;
nouveau_resource_free(&q->object);
+   LIST_DEL(&q->list);
}
 
*result = q-result;
diff --git a/src/gallium/drivers/nv30/nv30_screen.c 
b/src/gallium/drivers/nv30/nv30_screen.c
index 48a562e..2cd5d12 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -252,6 +252,8 @@ nv30_screen_create(struct pipe_winsys *ws, struct 
nouveau_device *dev)
return NULL;
}
 
+   LIST_INITHEAD(&screen->query_list);
+
/* Vtxprog resources */
if (nouveau_resource_init(screen-vp_exec_heap, 0, 256) ||
nouveau_resource_init(screen-vp_data_heap, 0, 256)) {
diff --git a/src/gallium/drivers/nv30/nv30_screen.h 
b/src/gallium/drivers/nv30/nv30_screen.h
index cbf945f..9190789 100644
--- a/src/gallium/drivers/nv30/nv30_screen.h
+++ b/src/gallium/drivers/nv30/nv30_screen.h
@@ -19,6 +19,7 @@ struct nv30_screen {
/* Query object resources */
struct nouveau_notifier *query;
struct nouveau_resource *query_heap;
+   struct list_head query_list;
 
/* Vtxprog resources */
struct nouveau_resource *vp_exec_heap;
diff --git a/src/gallium/drivers/nv40/nv40_query.c 
b/src/gallium/drivers/nv40/nv40_query.c
index 8ed4a67..01d35ea 100644
--- 

[Nouveau] [PATCH 2/2] nv30/nv40: allocate a bigger block for queries

2010-01-17 Thread Luca Barbieri
This patch allocates a bigger chunk of memory to store queries in,
increasing the (hidden) outstanding query limit from 32 to 125.

It also tries to make use of a 16KB notifier block if the kernel
supports that.

The blob supports 1024 queries due to their 16KB query block and
16-byte rather than 32-byte sized queries.
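
(The arithmetic behind those numbers, as I read it: each query takes a
32-byte notifier slot and three slots' worth of bytes are reserved, so
a 4 KB block gives 125 queries and a 16 KB block gives 509; the blob
reaches 1024 by using 16-byte entries in a 16 KB block.)

#include <stdio.h>

int main(void)
{
	unsigned sizes[] = { 16384, 4096 };
	unsigned i;

	for (i = 0; i < 2; ++i)
		printf("%5u-byte notifier block -> %u query slots\n",
		       sizes[i], (sizes[i] - 3 * 32) / 32);
	printf("blob: %u slots\n", 16384 / 16);	/* 16-byte queries */
	return 0;
}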
---
 src/gallium/drivers/nv30/nv30_screen.c |   13 +
 src/gallium/drivers/nv40/nv40_screen.c |   13 ++---
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/nv30/nv30_screen.c 
b/src/gallium/drivers/nv30/nv30_screen.c
index 2cd5d12..0f26d39 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -238,22 +238,27 @@ nv30_screen_create(struct pipe_winsys *ws, struct 
nouveau_device *dev)
}
 
/* Query objects */
-   ret = nouveau_notifier_alloc(chan, 0xbeef0302, 32, &screen->query);
+   unsigned query_sizes[] = {(16384 - 3 * 32) / 32, 15 * 1024 / 32, (4096 
- 32 * 3) / 32, 3 * 1024 / 32, 2 * 1024 / 32, 1024 / 32};
+   for(i = 0; i < sizeof(query_sizes) / sizeof(query_sizes[0]); ++i)
+   {
+   ret = nouveau_notifier_alloc(chan, 0xbeef0302, query_sizes[i], 
&screen->query);
+   if(!ret)
+   break;
+   }
+
if (ret) {
NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
nv30_screen_destroy(pscreen);
return NULL;
}
 
-   ret = nouveau_resource_init(&screen->query_heap, 0, 32);
+   nouveau_resource_init(&screen->query_heap, 0, query_sizes[i]);
if (ret) {
NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
nv30_screen_destroy(pscreen);
return NULL;
}
 
-   LIST_INITHEAD(screen-query_list);
-
/* Vtxprog resources */
if (nouveau_resource_init(screen-vp_exec_heap, 0, 256) ||
nouveau_resource_init(screen-vp_data_heap, 0, 256)) {
diff --git a/src/gallium/drivers/nv40/nv40_screen.c 
b/src/gallium/drivers/nv40/nv40_screen.c
index a8c14f9..4264d18 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -161,7 +161,7 @@ nv40_screen_create(struct pipe_winsys *ws, struct 
nouveau_device *dev)
struct pipe_screen *pscreen;
struct nouveau_stateobj *so;
unsigned curie_class = 0;
-   int ret;
+   int ret, i;
 
if (!screen)
return NULL;
@@ -223,14 +223,21 @@ nv40_screen_create(struct pipe_winsys *ws, struct 
nouveau_device *dev)
}
 
/* Query objects */
-   ret = nouveau_notifier_alloc(chan, 0xbeef0302, 32, &screen->query);
+   unsigned query_sizes[] = {(16384 - 3 * 32) / 32, 15 * 1024 / 32, (4096 
- 32 * 3) / 32, 3 * 1024 / 32, 2 * 1024 / 32, 1024 / 32};
+   for(i = 0; i < sizeof(query_sizes) / sizeof(query_sizes[0]); ++i)
+   {
+   ret = nouveau_notifier_alloc(chan, 0xbeef0302, query_sizes[i], 
&screen->query);
+   if(!ret)
+   break;
+   }
+
if (ret) {
NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
nv40_screen_destroy(pscreen);
return NULL;
}
 
-   nouveau_resource_init(&screen->query_heap, 0, 32);
+   nouveau_resource_init(&screen->query_heap, 0, query_sizes[i]);
if (ret) {
NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
nv40_screen_destroy(pscreen);
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] drm/nouveau: Evict buffers in VRAM before freeing sgdma

2010-01-16 Thread Luca Barbieri
Currently, we take down the sgdma engine without evicting all buffers
from VRAM.

The TTM device release will try to evict anything in VRAM to GART
memory, but this will fail since sgdma has already been taken down.

This causes an infinite loop in kernel mode on module unload.
It usually doesn't happen because there aren't any buffers left on close.
However, if the GPU is locked up, this condition is easily triggered.

This patch fixes it in the simplest way possible by cleaning VRAM
right before cleaning SGDMA memory.

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_mem.c   |1 -
 drivers/gpu/drm/nouveau/nouveau_state.c |1 +
 2 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c 
b/drivers/gpu/drm/nouveau/nouveau_mem.c
index 186f34b..8f3a12f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_mem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_mem.c
@@ -386,7 +386,6 @@ void nouveau_mem_close(struct drm_device *dev)
nouveau_bo_unpin(dev_priv->vga_ram);
nouveau_bo_ref(NULL, &dev_priv->vga_ram);
 
-   ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_VRAM);
ttm_bo_device_release(&dev_priv->ttm.bdev);
 
nouveau_ttm_global_release(dev_priv);
diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c 
b/drivers/gpu/drm/nouveau/nouveau_state.c
index 09b9a46..c212742 100644
--- a/drivers/gpu/drm/nouveau/nouveau_state.c
+++ b/drivers/gpu/drm/nouveau/nouveau_state.c
@@ -525,6 +525,7 @@ static void nouveau_card_takedown(struct drm_device *dev)
engine->mc.takedown(dev);
 
mutex_lock(&dev->struct_mutex);
+ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_VRAM);
ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_TT);
mutex_unlock(&dev->struct_mutex);
nouveau_sgdma_takedown(dev);
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] More on GART vertex buffer corruption

2010-01-14 Thread Luca Barbieri
I looked a bit more into the problem of vertex corruption with GART
vertex buffers that I'm experiencing on my card, which disappears when
the buffers are put in VRAM instead.
The system I'm seeing this on is a Dell Inspiron 9400 notebook with a
GeForce Go 7900 GS on a PCI Express Intel i945 chipset.

First, I've looked into the behavior of the nVidia driver:
1. On all NV3x and NV4x traces, and my system, vertex buffers are
always put in VRAM (renouveau uses GL_STATIC_DRAW)
2. On my system all vertex object usage flags still cause the vertex
buffer to be put in VRAM
3. On my system GL_NV_vertex_array_range causes the use of a GART vertex
buffer, and seems to be the only way to do so

Note that statement 2 may not be true for all NV3x/NV4x cards.
It would be interesting to see the output of the attached renouveau
tests on other cards.

glFlushVertexArrayRangeNV does the following:
# 14 NOPs
00043710  size 1, subchannel 1 (0xbeef3097),offset 0x1710,increment
NV40TCL[0x1710/4]
00043d6c  size 1, subchannel 1 (0xbeef3097),offset 0x1d6c,increment
0580NV40TCL[0x1d6c/4]
00043d70  size 1, subchannel 1 (0xbeef3097),offset 0x1d70,increment
000bNV40TCL[0x1d70/4]
00042104  size 1, subchannel 1 (0xbeef3097),offset 0x0104,increment
NV40TCL.NOTIFY
00042100  size 1, subchannel 1 (0xbeef3097),offset 0x0100,increment
NV40TCL.NOP
And probably waits for the fence afterwards.

0x1710 does something relating to vertex buffer caching.
0x1d6c and 0x1d70 are a 3D engine fence emission.

Adding 0x1710 and waiting for our usual fences both before and after
draw_elements did not fix the problem.
I haven't tried using the 3D fencing mechanism.

It seems we might be better off always putting vertex buffers in VRAM:
the blob does so, and while writing to them might be a bit slower, the
synchronization required for GART buffers may cost even more.

I attached a renouveau test for GL_NV_vertex_array_range and one for
all the glBufferData flags. The quality of the code is quite low but
it should do the job.
void test_nv_vertex_array_range(void)
{
#ifndef GL_NV_vertex_array_range
	printf("# Sorry, no GL_NV_vertex_array_range\n");
#else
	const char *extensions = (const char *)regl.GetString(GL_EXTENSIONS);
	if (!strstr(extensions, "GL_NV_vertex_array_range")) {
		fprintf(stderr, "No GL_NV_vertex_array_range extension.\n");
		return;
	}

	void * ( * glXAllocateMemoryNV) (GLsizei size, GLfloat readfreq, GLfloat writefreq, GLfloat priority) = SDL_GL_GetProcAddress("glXAllocateMemoryNV");
	GLfloat* memory = glXAllocateMemoryNV(65536, 0, 0, 0.5f);
	if(!memory)
	{
		fprintf(stderr, "glXAllocateMemoryNV failed.\n");
		return;
	}
	
	TEST_PROLOGUE;
	printf("# Testing GL_NV_vertex_array_range\n");

#define NUM_PRIMITIVES  8
#define NUM_VERTICES(4*NUM_PRIMITIVES)
#define NUM_CLIENTSTATE 6
	GLfloat vtxdata[NUM_VERTICES*NUM_CLIENTSTATE];
	
	glVertexArrayRangeNV(sizeof(vtxdata), memory);
	memcpy(memory, vtxdata, sizeof(vtxdata));

	dump_before();

	regl.EnableClientState(GL_VERTEX_ARRAY_RANGE_NV);
	regl.EnableClientState(GL_VERTEX_ARRAY);
	regl.EnableClientState(GL_COLOR_ARRAY);
	//regl.EnableClientState(GL_SECONDARY_COLOR_ARRAY);
	//regl.EnableClientState(GL_TEXTURE_COORD_ARRAY);
	//regl.EnableClientState(GL_NORMAL_ARRAY);
	//regl.EnableClientState(GL_FOG_COORD_ARRAY);

	regl.VertexPointer(3, GL_FLOAT, 0, NULL);
	regl.ColorPointer (3, GL_FLOAT, 0, NULL);
	//regl.SecondaryColorPointer (3, GL_FLOAT, 0, NULL);
	//regl.TexCoordPointer (4, GL_FLOAT, 0, NULL);
	//regl.NormalPointer (GL_FLOAT, 0, NULL);
	//regl.FogCoordPointer (GL_FLOAT, 0, NULL);
	dump_after(0);

int i;
for(i = 0; i < 3; ++i)
{
	printf("# Drawing\n");
	dump_before();

	regl.DrawArrays(GL_TRIANGLES, 0, NUM_VERTICES);

	dump_after(0);

	printf("# glFlushVertexArrayRangeNV\n");
	
	dump_before(0);
	glFlushVertexArrayRangeNV();
	dump_after(0);
}
	
	regl.DisableClientState(GL_VERTEX_ARRAY);
	regl.DisableClientState(GL_COLOR_ARRAY);
	regl.DisableClientState(GL_VERTEX_ARRAY_RANGE_NV);
	//regl.DisableClientState(GL_SECONDARY_COLOR_ARRAY);
	//regl.DisableClientState(GL_TEXTURE_COORD_ARRAY);
	//regl.DisableClientState(GL_NORMAL_ARRAY);
	//regl.DisableClientState(GL_FOG_COORD_ARRAY);

	glVertexArrayRangeNV(0, 0);
	TEST_EPILOGUE;
#endif
}

void test_arb_vertex_buffer_object_indexed(void)
{
#ifndef GL_ARB_vertex_buffer_object
	printf("# Sorry, no GL_ARB_vertex_buffer_object\n");
#else
	const char *extensions = (const char *)regl.GetString(GL_EXTENSIONS);
	if (!strstr(extensions, "GL_ARB_vertex_buffer_object")) {
		fprintf(stderr, "No GL_ARB_vertex_buffer_object extension.\n");
		return;
	}

	TEST_PROLOGUE;
const char* freqs[] = {"STREAM", "STATIC", "DYNAMIC", 0};
const char* nats[] = {"DRAW", "READ", "COPY", 0};
int freq, nat;
for(freq = 0; freq < 3; ++freq)
{
for(nat = 0; nat < 3; ++nat)
{
int usage = 0x88E0 + 4 * freq + nat;
	printf("# Testing ARB_vertex_buffer_object with GL_%s_%s\n", freqs[freq], nats[nat]);

#define NUM_TRIANGLES   4000
#define NUM_VERTICES(4*NUM_PRIMITIVES)
	
	uint16_t vtxdata[NUM_VERTICES*4];
	uint16_t 

Re: [Nouveau] [PATCH] drm/nouveau: Check pushbuffer bounds in system call

2010-01-13 Thread Luca Barbieri
Any issues with this patch?
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] nv40: Correct zsa so_new size

2010-01-13 Thread Luca Barbieri
Triggered by Doom 3.
---
 src/gallium/drivers/nv40/nv40_state.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/src/gallium/drivers/nv40/nv40_state.c 
b/src/gallium/drivers/nv40/nv40_state.c
index ed0ca9e..4e3a61f 100644
--- a/src/gallium/drivers/nv40/nv40_state.c
+++ b/src/gallium/drivers/nv40/nv40_state.c
@@ -445,7 +445,7 @@ nv40_depth_stencil_alpha_state_create(struct pipe_context 
*pipe,
 {
struct nv40_context *nv40 = nv40_context(pipe);
struct nv40_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
-   struct nouveau_stateobj *so = so_new(4, 21, 0);
+   struct nouveau_stateobj *so = so_new(4, 22, 0);
struct nouveau_grobj *curie = nv40->screen->curie;
 
so_method(so, curie, NV40TCL_DEPTH_FUNC, 3);
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] nv20-nv40: Add support for two sided color

2010-01-13 Thread Luca Barbieri
This patch adds support for two-sided vertex color to NV20, NV30 and NV40.
When set, the COLOR0/1 fs inputs on back faces will be wired to vs outputs 
BCOLOR0/1.
This makes OpenGL two sided lighting work, which can be tested with 
progs/demos/projtex.

This is already supported on NV50 and seems to be unsupported on NV04 and NV10.

The following defines need to be added to nouveau_class.h:

In renouveau.xml, for both NV30 and NV40:
<reg32 offset="0x142c" name="VERTEX_TWO_SIDE_ENABLE" type="boolean"/>

Tested on NV40 only.
---
 src/gallium/drivers/nv20/nv20_state.c  |3 ++-
 src/gallium/drivers/nv20/nv20_state.h  |2 ++
 src/gallium/drivers/nv20/nv20_state_emit.c |3 +++
 src/gallium/drivers/nv30/nv30_state.c  |6 --
 src/gallium/drivers/nv40/nv40_state.c  |8 ++--
 5 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/nv20/nv20_state.c 
b/src/gallium/drivers/nv20/nv20_state.c
index 3a82e63..a8a33b2 100644
--- a/src/gallium/drivers/nv20/nv20_state.c
+++ b/src/gallium/drivers/nv20/nv20_state.c
@@ -234,7 +234,6 @@ nv20_rasterizer_state_create(struct pipe_context *pipe,
int i;
 
/*XXX: ignored:
-*  light_twoside
 *  offset_cw/ccw -nohw
 *  scissor
 *  point_smooth -nohw
@@ -301,6 +300,8 @@ nv20_rasterizer_state_create(struct pipe_context *pipe,
} else {
rs->point_sprite = 0;
}
+   
+   rs->light_twoside = cso->light_twoside;
 
return (void *)rs;
 }
diff --git a/src/gallium/drivers/nv20/nv20_state.h 
b/src/gallium/drivers/nv20/nv20_state.h
index dde4106..b8b0366 100644
--- a/src/gallium/drivers/nv20/nv20_state.h
+++ b/src/gallium/drivers/nv20/nv20_state.h
@@ -39,6 +39,8 @@ struct nv20_rasterizer_state {
uint32_t cull_face_en;
 
uint32_t point_sprite;
+   
+   uint32_t light_twoside;
 
const struct pipe_rasterizer_state *templ;
 };
diff --git a/src/gallium/drivers/nv20/nv20_state_emit.c 
b/src/gallium/drivers/nv20/nv20_state_emit.c
index 6bbd1fd..a916788 100644
--- a/src/gallium/drivers/nv20/nv20_state_emit.c
+++ b/src/gallium/drivers/nv20/nv20_state_emit.c
@@ -68,6 +68,9 @@ static void nv20_state_emit_rast(struct nv20_context* nv20)
 
BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
OUT_RING  (chan, r->cull_face_en);
+
+BEGIN_RING(chan, kelvin, NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE, 1);
+OUT_RING  (chan, r->light_twoside);
 }
 
 static void nv20_state_emit_dsa(struct nv20_context* nv20)
diff --git a/src/gallium/drivers/nv30/nv30_state.c 
b/src/gallium/drivers/nv30/nv30_state.c
index a80dfb0..ab45199 100644
--- a/src/gallium/drivers/nv30/nv30_state.c
+++ b/src/gallium/drivers/nv30/nv30_state.c
@@ -300,11 +300,10 @@ nv30_rasterizer_state_create(struct pipe_context *pipe,
 {
struct nv30_context *nv30 = nv30_context(pipe);
struct nv30_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
-   struct nouveau_stateobj *so = so_new(9, 19, 0);
+   struct nouveau_stateobj *so = so_new(10, 20, 0);
struct nouveau_grobj *rankine = nv30-screen-rankine;
 
/*XXX: ignored:
-*  light_twoside
 *  point_smooth -nohw
 *  multisample
 */
@@ -313,6 +312,9 @@ nv30_rasterizer_state_create(struct pipe_context *pipe,
so_data  (so, cso->flatshade ? NV34TCL_SHADE_MODEL_FLAT :
   NV34TCL_SHADE_MODEL_SMOOTH);
 
+so_method(so, rankine, NV34TCL_VERTEX_TWO_SIDE_ENABLE, 1);
+so_data  (so, cso->light_twoside);
+
so_method(so, rankine, NV34TCL_LINE_WIDTH, 2);
so_data  (so, (unsigned char)(cso->line_width * 8.0) & 0xff);
so_data  (so, cso->line_smooth ? 1 : 0);
diff --git a/src/gallium/drivers/nv40/nv40_state.c 
b/src/gallium/drivers/nv40/nv40_state.c
index 4e3a61f..395f7aa 100644
--- a/src/gallium/drivers/nv40/nv40_state.c
+++ b/src/gallium/drivers/nv40/nv40_state.c
@@ -310,11 +310,10 @@ nv40_rasterizer_state_create(struct pipe_context *pipe,
 {
struct nv40_context *nv40 = nv40_context(pipe);
struct nv40_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
-   struct nouveau_stateobj *so = so_new(8, 18, 0);
+   struct nouveau_stateobj *so = so_new(10, 20, 0);
struct nouveau_grobj *curie = nv40-screen-curie;
 
/*XXX: ignored:
-*  light_twoside
 *  point_smooth -nohw
 *  multisample
 */
@@ -323,6 +322,11 @@ nv40_rasterizer_state_create(struct pipe_context *pipe,
so_data  (so, cso->flatshade ? NV40TCL_SHADE_MODEL_FLAT :
   NV40TCL_SHADE_MODEL_SMOOTH);
 
+/* the blob also sets 0x1428 to 0 in the same block.
+   Its purpose is unclear and it does not seem to have any discernible 
effects. */
+so_method(so, curie, NV40TCL_VERTEX_TWO_SIDE_ENABLE, 1);
+so_data  (so, cso->light_twoside);
+

Re: [Nouveau] [Discussion] User controls for PowerManagement

2010-01-09 Thread Luca Barbieri
How about taking inspiration from the cpufreq sysfs interface?

There are sysfs objects for drm cards at /sys/class/drm/cardnumber.
Mine, for instance, is at /sys/class/drm/card0, which links to
/sys/devices/pci:00/:00:01.0/:01:00.0/drm/card0.

A simple scaling approach could just look at the time of the last
pushbuffer submission on any non-Xserver channel (adding a flag for
the DDX, or possibly direct rendered desktop applications to specify
on channel creation). If it's more than N milliseconds ago, turn on
power saving, otherwise disable it, possibly with several time
thresholds for several modes. Of course, it should not be implemented
literally this way, but rather by using timers appropriately.
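
A literal-minded sketch of that heuristic (all names made up for
illustration; as said above, a real implementation would arm a timer
on each submission rather than poll):

enum pm_level { PM_PERFORMANCE, PM_BALANCED, PM_POWERSAVE };

static enum pm_level
pick_level(unsigned long now_ms, unsigned long last_submit_ms)
{
	unsigned long idle_ms = now_ms - last_submit_ms;

	if (idle_ms < 100)	/* a non-X channel submitted very recently */
		return PM_PERFORMANCE;
	if (idle_ms < 2000)	/* short pause */
		return PM_BALANCED;
	return PM_POWERSAVE;	/* long idle: clock the card down */
}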
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] Findings on pre-NV50 miptree layout

2010-01-08 Thread Luca Barbieri
I wrote a tool for automatically finding out the texture layout for Gallium
drivers.
You can find it attached to
http://sourceforge.net/mailarchive/forum.php?thread_name=ff13bc9a1001081140y18450c3ejdfac25c9260fd367%40mail.gmail.com&forum_name=mesa3d-dev
.
Here are the findings from running it.

The result is that our miptree layout code is partially broken, and overly
complex.
In particular:
1. 3D textures are broken because they are not laid out like cube maps, but
first by level and then by face
2. Swizzled 3D textures have all 3 texture coordinates swizzled together
3. Cube maps have their faces 128-byte aligned, not just 64-byte aligned as in
my patch, or unaligned as without it (not applied IIRC).
4. Swizzled 2D/3D/cube textures don't have any gaps, except for cube map
face alignment. The current code contains a strange dimension check.

I'm in the process of rewriting the miptree layout code and all the 2D
engine code to account for this (and to support all cases, including
unswizzling and 3D swizzling).

Here are the findings on NV40.

Not sure what happens with compressed textures (which may be currently
broken since Doom3 misrenders in non-Ultra quality).
I'll check that once the 2D code is otherwise finished and working

* Swizzled 1D/2D/3D textures
Mipmaps are laid out sequentially with no gap in between.
Each mipmap is stored swizzled.
To get the swizzled address of a texel, take an x bit, then a y bit, then a
z bit, and so on.
Once a dimension is exhausted, skip its bits.
This means in particular that 3D textures are 3D-swizzled: zyxzyxzyx...
The blob loads swizzled 3D textures with the CPU.
There seems to be no GPU hardware path that can write to swizzled 3D
textures (except, of course, mapping them as something else and computing in
a fragment shader).
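
A sketch of that addressing rule (my reading of it, assuming
power-of-two dimensions; the result is in texels, so multiply by the
bytes per texel):

static unsigned
swizzle_offset(unsigned x, unsigned y, unsigned z,
	       unsigned w, unsigned h, unsigned d)
{
	unsigned offset = 0, shift = 0;

	/* interleave one x bit, then one y bit, then one z bit, starting
	 * at the least significant bit; once a dimension is exhausted its
	 * coordinate stops contributing bits */
	while (w > 1 || h > 1 || d > 1) {
		if (w > 1) { offset |= (x & 1) << shift++; x >>= 1; w >>= 1; }
		if (h > 1) { offset |= (y & 1) << shift++; y >>= 1; h >>= 1; }
		if (d > 1) { offset |= (z & 1) << shift++; z >>= 1; d >>= 1; }
	}
	return offset;
}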

* Swizzled cube maps
Cube maps are sequences of 2D textures, each aligned to 128 bytes.
The hardware seems to have sampling support for non-square cube maps (not
supported by any API afaik) but lays them out as if their height were equal
to the width.

* Linear textures
In linear textures all images have the same pitch, which can apparently be
any value (even 4), aligned to bytes per pixel.
1D textures and 2D textures are laid out in the obvious way.
3D textures are laid with contiguous mipmaps, each containing a 2D texture
for each zslice.
Cube maps are laid with contiguous faces, each a 2D texture. No special face
alignment is used (they will still be aligned to the pitch of course).


# Alignment requirements
Render target pitch must be a multiple of 64 and greater than 0.
For swizzled targets, the blob sets a width * bpp pitch. However, the
hardware seems to ignore pitch for swizzled targets.
Furthermore, the hardware can actually render to any swizzled surface size!
The offset must however be a multiple of 64, even for swizzled surfaces.

# GdiSubRect does not seem to work with SWIZZLED_SURFACE set as surface

# SWIZZLED_SURFACE wants a 64-byte aligned destination
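
Put together as a quick check (my summary of the constraints above, not
driver code):

static int
rt_layout_ok(unsigned offset, unsigned pitch, int swizzled)
{
	if (offset & 63)		/* offsets must be 64-byte aligned */
		return 0;
	if (!swizzled && (pitch == 0 || pitch & 63))
		return 0;		/* linear pitch: multiple of 64, > 0 */
	return 1;
}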
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] Fix null deref in ttm_bo_mem_space caused by forgetting to set placement.busy_placement

2010-01-05 Thread Luca Barbieri
Set it to the same value of placement.placement

Triggered by running etracer under compiz.

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_bo.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c 
b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 0cad6d8..e0bc2cd 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -547,7 +547,7 @@ nouveau_bo_move_flipd(struct ttm_buffer_object *bo, bool 
evict, bool intr,
 
placement.fpfn = placement.lpfn = 0;
placement.num_placement = placement.num_busy_placement = 1;
-   placement.placement = placement_memtype;
+   placement.placement = placement.busy_placement = placement_memtype;
 
tmp_mem = *new_mem;
tmp_mem.mm_node = NULL;
@@ -585,7 +585,7 @@ nouveau_bo_move_flips(struct ttm_buffer_object *bo, bool 
evict, bool intr,
 
placement.fpfn = placement.lpfn = 0;
placement.num_placement = placement.num_busy_placement = 1;
-   placement.placement = placement_memtype;
+   placement.placement = placement.busy_placement = placement_memtype;
 
tmp_mem = *new_mem;
tmp_mem.mm_node = NULL;
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] Fix null deref in nouveau_fence_emit due to deleted fence

2010-01-05 Thread Luca Barbieri
Currently Nouveau will unvalidate all buffers if it is forced to wait on one, 
and then start revalidating from the beginning.
While doing so, it destroys the operation fence, causing nouveau_fence_emit to 
crash.

This patch fixes this bug by taking the fence object out of validate_op and 
creating it just before emit.
The fence pointer is initialized to 0 and unref'ed unconditionally.

In addition to fixing the bug, this prevents its reintroduction and simplifies 
the code.
---
 drivers/gpu/drm/nouveau/nouveau_gem.c |   35 ++--
 1 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c 
b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 18fd8ac..7c1ff14 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -220,7 +220,6 @@ nouveau_gem_set_domain(struct drm_gem_object *gem, uint32_t 
read_domains,
 }
 
 struct validate_op {
-   struct nouveau_fence *fence;
struct list_head vram_list;
struct list_head gart_list;
struct list_head both_list;
@@ -252,17 +251,11 @@ validate_fini_list(struct list_head *list, struct 
nouveau_fence *fence)
 }
 
 static void
-validate_fini(struct validate_op *op, bool success)
+validate_fini(struct validate_op *op, struct nouveau_fence* fence)
 {
-   struct nouveau_fence *fence = op->fence;
-
-   if (unlikely(!success))
-   op->fence = NULL;
-
-   validate_fini_list(&op->vram_list, op->fence);
-   validate_fini_list(&op->gart_list, op->fence);
-   validate_fini_list(&op->both_list, op->fence);
-   nouveau_fence_unref((void *)&fence);
+   validate_fini_list(&op->vram_list, fence);
+   validate_fini_list(&op->gart_list, fence);
+   validate_fini_list(&op->both_list, fence);
 }
 
 static int
@@ -420,10 +413,6 @@ nouveau_gem_pushbuf_validate(struct nouveau_channel *chan,
INIT_LIST_HEAD(op-gart_list);
INIT_LIST_HEAD(op-both_list);
 
-   ret = nouveau_fence_new(chan, op-fence, false);
-   if (ret)
-   return ret;
-
if (nr_buffers == 0)
return 0;
 
@@ -541,6 +530,7 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void 
*data,
struct drm_nouveau_gem_pushbuf_bo *bo = NULL;
struct nouveau_channel *chan;
struct validate_op op;
+   struct nouveau_fence* fence = 0;
uint32_t *pushbuf = NULL;
int ret = 0, do_reloc = 0, i;
 
@@ -597,7 +587,8 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void 
*data,
 
OUT_RINGp(chan, pushbuf, req-nr_dwords);
 
-   ret = nouveau_fence_emit(op.fence);
+   ret = nouveau_fence_new(chan, fence, false)
+   || nouveau_fence_emit(fence);
if (ret) {
NV_ERROR(dev, error fencing pushbuf: %d\n, ret);
WIND_RING(chan);
@@ -605,7 +596,7 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void 
*data,
}
 
if (nouveau_gem_pushbuf_sync(chan)) {
-   ret = nouveau_fence_wait(op.fence, NULL, false, false);
+   ret = nouveau_fence_wait(fence, NULL, false, false);
if (ret) {
for (i = 0; i  req-nr_dwords; i++)
NV_ERROR(dev, 0x%08x\n, pushbuf[i]);
@@ -614,7 +605,8 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void 
*data,
}
 
 out:
-   validate_fini(op, ret == 0);
+   validate_fini(op, fence);
+   nouveau_fence_unref((void**)fence);
mutex_unlock(dev-struct_mutex);
kfree(pushbuf);
kfree(bo);
@@ -634,6 +626,7 @@ nouveau_gem_ioctl_pushbuf_call(struct drm_device *dev, void 
*data,
struct drm_gem_object *gem;
struct nouveau_bo *pbbo;
struct validate_op op;
+   struct nouveau_fence* fence = 0;
int i, ret = 0, do_reloc = 0;
 
NOUVEAU_CHECK_INITIALISED_WITH_RETURN;
@@ -772,7 +765,8 @@ nouveau_gem_ioctl_pushbuf_call(struct drm_device *dev, void 
*data,
OUT_RING(chan, 0);
}
 
-   ret = nouveau_fence_emit(op.fence);
+   ret = nouveau_fence_new(chan, fence, false)
+   || nouveau_fence_emit(fence);
if (ret) {
NV_ERROR(dev, error fencing pushbuf: %d\n, ret);
WIND_RING(chan);
@@ -780,7 +774,8 @@ nouveau_gem_ioctl_pushbuf_call(struct drm_device *dev, void 
*data,
}
 
 out:
-   validate_fini(op, ret == 0);
+   validate_fini(op, fence);
+   nouveau_fence_unref((void**)fence);
mutex_unlock(dev-struct_mutex);
kfree(bo);
 
-- 
1.6.3.3

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] Print NOUVEAU_NO_SWIZZLE and NOUVEAU_NO_TRANSFER messages only once

2009-12-31 Thread Luca Barbieri
Currently we are continuously spewing messages about these variables,
since we call debug_get_bool_option every time we want to check their value.
This is annoying, slows things down due to terminal re-rendering, and obscures
useful messages.
This patch only calls debug_get_bool_option once and caches the result in a 
static variable.
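
The caching idiom, in isolation (debug_get_bool_option() is the
existing gallium utility; the rest is just the pattern the patch
repeats in each file):

	static int no_transfer = -1;
	if (no_transfer < 0)
		no_transfer = debug_get_bool_option("NOUVEAU_NO_TRANSFER", TRUE);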
---
 src/gallium/drivers/nv04/nv04_transfer.c |6 --
 src/gallium/drivers/nv10/nv10_transfer.c |6 --
 src/gallium/drivers/nv20/nv20_miptree.c  |5 -
 src/gallium/drivers/nv20/nv20_transfer.c |6 --
 src/gallium/drivers/nv30/nv30_miptree.c  |5 -
 src/gallium/drivers/nv30/nv30_transfer.c |6 --
 src/gallium/drivers/nv40/nv40_miptree.c  |5 -
 src/gallium/drivers/nv40/nv40_transfer.c |6 --
 8 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/nv04/nv04_transfer.c 
b/src/gallium/drivers/nv04/nv04_transfer.c
index 2dd2e14..f7a64f9 100644
--- a/src/gallium/drivers/nv04/nv04_transfer.c
+++ b/src/gallium/drivers/nv04/nv04_transfer.c
@@ -41,6 +41,9 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct 
pipe_texture *pt,
struct nv04_miptree *mt = (struct nv04_miptree *)pt;
struct nv04_transfer *tx;
struct pipe_texture tx_tex_template, *tx_tex;
+   static int no_transfer = -1;
+   if(no_transfer < 0)
+   no_transfer = debug_get_bool_option("NOUVEAU_NO_TRANSFER", 
TRUE/*XXX:FALSE*/);
 
tx = CALLOC_STRUCT(nv04_transfer);
if (!tx)
@@ -58,8 +61,7 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct 
pipe_texture *pt,
tx-base.zslice = zslice;
 
/* Direct access to texture */
-   if ((pt-tex_usage  PIPE_TEXTURE_USAGE_DYNAMIC ||
-debug_get_bool_option(NOUVEAU_NO_TRANSFER, TRUE/*XXX:FALSE*/)) 
+   if ((pt-tex_usage  PIPE_TEXTURE_USAGE_DYNAMIC || no_transfer) 
pt-tex_usage  NOUVEAU_TEXTURE_USAGE_LINEAR)
{
tx-direct = true;
diff --git a/src/gallium/drivers/nv10/nv10_transfer.c 
b/src/gallium/drivers/nv10/nv10_transfer.c
index eb04af9..d834638 100644
--- a/src/gallium/drivers/nv10/nv10_transfer.c
+++ b/src/gallium/drivers/nv10/nv10_transfer.c
@@ -41,6 +41,9 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct 
pipe_texture *pt,
struct nv10_miptree *mt = (struct nv10_miptree *)pt;
struct nv10_transfer *tx;
struct pipe_texture tx_tex_template, *tx_tex;
+   static int no_transfer = -1;
+   if(no_transfer < 0)
+   no_transfer = debug_get_bool_option("NOUVEAU_NO_TRANSFER", 
TRUE/*XXX:FALSE*/);
 
tx = CALLOC_STRUCT(nv10_transfer);
if (!tx)
@@ -58,8 +61,7 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct 
pipe_texture *pt,
tx-base.zslice = zslice;
 
/* Direct access to texture */
-   if ((pt-tex_usage  PIPE_TEXTURE_USAGE_DYNAMIC ||
-debug_get_bool_option(NOUVEAU_NO_TRANSFER, TRUE/*XXX:FALSE*/)) 
+   if ((pt-tex_usage  PIPE_TEXTURE_USAGE_DYNAMIC || no_transfer) 
pt-tex_usage  NOUVEAU_TEXTURE_USAGE_LINEAR)
{
tx-direct = true;
diff --git a/src/gallium/drivers/nv20/nv20_miptree.c 
b/src/gallium/drivers/nv20/nv20_miptree.c
index 8f7538e..c0ec60d 100644
--- a/src/gallium/drivers/nv20/nv20_miptree.c
+++ b/src/gallium/drivers/nv20/nv20_miptree.c
@@ -89,6 +89,9 @@ nv20_miptree_create(struct pipe_screen *screen, const struct 
pipe_texture *pt)
struct nv20_miptree *mt;
unsigned buf_usage = PIPE_BUFFER_USAGE_PIXEL |
 NOUVEAU_BUFFER_USAGE_TEXTURE;
+   static int no_swizzle = -1;
+   if(no_swizzle < 0)
+   no_swizzle = debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE);
 
mt = MALLOC(sizeof(struct nv20_miptree));
if (!mt)
@@ -116,7 +119,7 @@ nv20_miptree_create(struct pipe_screen *screen, const 
struct pipe_texture *pt)
case PIPE_FORMAT_X8R8G8B8_UNORM:
case PIPE_FORMAT_R16_SNORM:
{
-   if (debug_get_bool_option(NOUVEAU_NO_SWIZZLE, FALSE))
+   if (no_swizzle)
mt-base.tex_usage |= 
NOUVEAU_TEXTURE_USAGE_LINEAR;
break;
}
diff --git a/src/gallium/drivers/nv20/nv20_transfer.c 
b/src/gallium/drivers/nv20/nv20_transfer.c
index 699773e..2d06e79 100644
--- a/src/gallium/drivers/nv20/nv20_transfer.c
+++ b/src/gallium/drivers/nv20/nv20_transfer.c
@@ -41,6 +41,9 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct 
pipe_texture *pt,
struct nv20_miptree *mt = (struct nv20_miptree *)pt;
struct nv20_transfer *tx;
struct pipe_texture tx_tex_template, *tx_tex;
+   static int no_transfer = -1;
+   if(no_transfer < 0)
+   no_transfer = debug_get_bool_option("NOUVEAU_NO_TRANSFER", 
TRUE/*XXX:FALSE*/);
 
tx = CALLOC_STRUCT(nv20_transfer);
if (!tx)
@@ -58,8 +61,7 @@ 

[Nouveau] [PATCH] Autogenerate uureg opcode macros

2009-12-31 Thread Luca Barbieri
Also some missing _src()s and cosmetic changes.
---
 src/gallium/programs/galliumut/Makefile|5 +
 .../programs/galliumut/gen_uureg_opcodes.sh|   29 +++
 src/gallium/programs/galliumut/uureg.h |  196 
 3 files changed, 71 insertions(+), 159 deletions(-)
 create mode 100644 src/gallium/programs/galliumut/gen_uureg_opcodes.sh

diff --git a/src/gallium/programs/galliumut/Makefile 
b/src/gallium/programs/galliumut/Makefile
index ab0d684..4cb9d7c 100644
--- a/src/gallium/programs/galliumut/Makefile
+++ b/src/gallium/programs/galliumut/Makefile
@@ -9,3 +9,8 @@ LIBRARY_DEFINES = --std=gnu99
 C_SOURCES = egl_gallium.c image.c normal_gen.c
 
 include ../../Makefile.template
+
+default: uureg_opcodes.h
+
+uureg_opcodes.h: gen_uureg_opcodes.sh
+   bash $^ > $@
diff --git a/src/gallium/programs/galliumut/gen_uureg_opcodes.sh 
b/src/gallium/programs/galliumut/gen_uureg_opcodes.sh
new file mode 100644
index 000..3a56fcb
--- /dev/null
+++ b/src/gallium/programs/galliumut/gen_uureg_opcodes.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+cat - <<EOF
+#ifndef UUREG_OPCODES_H
+#define UUREG_OPCODES_H
+
+/* Autogenerated file, do not edit manually! Use make to regenerate. */
+
+EOF
+
+cat - << EOF|cpp -P -E - -I../../auxiliary|sed -re 's/^define /#define _/; s/ 
CAT /##/g;'
+#define OP00(op) define op() ureg_##op(ureg)
+#define OP01(op) define op(src) ureg_##op(ureg, _src(src))
+#define OP00_LBL(op) define op(label) ureg_##op(ureg, label)
+#define OP01_LBL(op) define op(src, label) ureg_##op(ureg, _src(src), label)
+#define OP10(op) define op(dst) ureg_##op(ureg, dst)
+#define OP11(op) define op(dst, src) ureg_##op(ureg, dst, _src(src))
+#define OP12(op) define op(dst, src0, src1) ureg_##op(ureg, dst, _src(src0), 
_src(src1))
+#define OP12_TEX(op) define op(dst, target, src0, src1) ureg_##op(ureg, dst, 
TGSI_TEXTURE_ CAT target, _src(src0), _src(src1))
+#define OP13(op) define op(dst, src0, src1, src2) ureg_##op(ureg, dst, 
_src(src0), _src(src1), _src(src2))
+#define OP14_TEX(op) define op(dst, target, src0, src1, src2, src3) 
ureg_##op(ureg, dst, TGSI_TEXTURE_ CAT target, _src(src0), _src(src1), 
_src(src2), _src(src3))
+
+#include tgsi/tgsi_opcode_tmp.h
+EOF
+
+cat - <<EOF
+
+#endif
+EOF
+
diff --git a/src/gallium/programs/galliumut/uureg.h 
b/src/gallium/programs/galliumut/uureg.h
index a2d07a7..d30e188 100644
--- a/src/gallium/programs/galliumut/uureg.h
+++ b/src/gallium/programs/galliumut/uureg.h
@@ -60,7 +60,7 @@ static inline struct ureg_src _src(const struct ureg_src 
src) {return src;}
 #define _OUTPUT(v, n, i) struct ureg_dst v = ureg_DECL_output(ureg, 
TGSI_SEMANTIC_##n, i)
 #define _CONST_(v, i) struct ureg_src v = ureg_DECL_constant(ureg, i)
 #define _CONST(v, s) UREG_CONST(v, ureg, s)
- #define _CONST_MAT3(v, s) UREG_CONST_MAT3(v, ureg, s)
+#define _CONST_MAT3(v, s) UREG_CONST_MAT3(v, ureg, s)
 #define _CONST_MAT4(v, s) UREG_CONST_MAT4(v, ureg, s)
 #define _ADDRESS(v) struct ureg_src v = ureg_DECL_address(ureg)
 #define _LOOP(v) struct ureg_src v = ureg_DECL_loop(ureg)
@@ -88,6 +88,41 @@ static inline struct ureg_src _src(const struct ureg_src 
src) {return src;}
 #define _zy(v) _swz(v, Z, Y, Z, Y)
 #define _zw(v) _swz(v, Z, W, Z, W)
 
+#define _ind(r, a) ureg_src_indirect(_src(r), _src(a))
+#define _abs(x) ureg_abs(_src(x))
+#define _neg(x) ureg_negate(_src(x))
+#define _undef ureg_src_undef()
+#define _is_undef(v) ureg_src_is_undef(_src(v))
+
+#define _X(v) ureg_writemask((v), TGSI_WRITEMASK_X)
+#define _Y(v) ureg_writemask((v), TGSI_WRITEMASK_Y)
+#define _Z(v) ureg_writemask((v), TGSI_WRITEMASK_Z)
+#define _W(v) ureg_writemask((v), TGSI_WRITEMASK_W)
+#define _XY(v) ureg_writemask((v), TGSI_WRITEMASK_X | TGSI_WRITEMASK_Y)
+#define _XZ(v) ureg_writemask((v), TGSI_WRITEMASK_X | TGSI_WRITEMASK_Z)
+#define _XW(v) ureg_writemask((v), TGSI_WRITEMASK_X | TGSI_WRITEMASK_W)
+#define _YZ(v) ureg_writemask((v), TGSI_WRITEMASK_Y | TGSI_WRITEMASK_Z)
+#define _YW(v) ureg_writemask((v), TGSI_WRITEMASK_Y | TGSI_WRITEMASK_W)
+#define _ZW(v) ureg_writemask((v), TGSI_WRITEMASK_Z | TGSI_WRITEMASK_W)
+#define _XYZ(v) ureg_writemask((v), TGSI_WRITEMASK_X | TGSI_WRITEMASK_Y | 
TGSI_WRITEMASK_Z)
+#define _XYW(v) ureg_writemask((v), TGSI_WRITEMASK_X | TGSI_WRITEMASK_Y | 
TGSI_WRITEMASK_W)
+#define _XZW(v) ureg_writemask((v), TGSI_WRITEMASK_Y | TGSI_WRITEMASK_Z | 
TGSI_WRITEMASK_W)
+#define _YZW(v) ureg_writemask((v), TGSI_WRITEMASK_Y | TGSI_WRITEMASK_Z | 
TGSI_WRITEMASK_W)
+#define _XYZW(v) ureg_writemask((v), TGSI_WRITEMASK_X | TGSI_WRITEMASK_Y | 
TGSI_WRITEMASK_Z | TGSI_WRITEMASK_W)
+
+#define _SAT(v) ureg_saturate(v)
+#define _PRED(v, n, x, y, z, w) ureg_predicate(v, n, x, y, z, w)
+#define _IND(r, a) ureg_dst_indirect(r, _src(a))
+#define _UNDEF ureg_dst_undef()
+#define _IS_UNDEF(v) ureg_dst_is_undef(v)
+
+#define _VERT struct ureg_program* ureg = ureg_create(TGSI_PROCESSOR_VERTEX)
+#define _FRAG struct ureg_program* ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT)
+
+#define 

Re: [Nouveau] [PATCH] Autogenerate uureg opcode macros

2009-12-31 Thread Luca Barbieri
This was supposed to go to mesa3d.


[Nouveau] [PATCH] Correct miptree layout for cubemaps on NV20-NV40

2009-12-30 Thread Luca Barbieri
It seems that the current miptree layout is incorrect because the combined
size of all the levels of each cube map face must be 64-byte aligned.
This patch fixes piglit cubemap and fbo-cubemap, which were broken.
This makes sense since otherwise all the levels would no longer be
64-byte aligned, which the GPU needs for 2D/3D targets.
Note that bin/cubemap and bin/fbo-cubemap still report errors on 2x2 and
1x1 mipmap levels but they also report some of them with softpipe and
swrast.
---
 src/gallium/drivers/nv20/nv20_miptree.c |1 +
 src/gallium/drivers/nv30/nv30_miptree.c |1 +
 src/gallium/drivers/nv40/nv40_miptree.c |1 +
 3 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/src/gallium/drivers/nv20/nv20_miptree.c 
b/src/gallium/drivers/nv20/nv20_miptree.c
index 8f7538e..ad61217 100644
--- a/src/gallium/drivers/nv20/nv20_miptree.c
+++ b/src/gallium/drivers/nv20/nv20_miptree.c
@@ -52,6 +52,7 @@ nv20_miptree_layout(struct nv20_miptree *nv20mt)
 
nv20mt->level[l].image_offset[f] = offset;
offset += nv20mt->level[l].pitch * u_minify(pt->height0, l);
+   offset = align(offset, 64);
}
 
nv20mt-total_size = offset;
diff --git a/src/gallium/drivers/nv30/nv30_miptree.c 
b/src/gallium/drivers/nv30/nv30_miptree.c
index 8fbba38..9850de8 100644
--- a/src/gallium/drivers/nv30/nv30_miptree.c
+++ b/src/gallium/drivers/nv30/nv30_miptree.c
@@ -54,6 +54,7 @@ nv30_miptree_layout(struct nv30_miptree *nv30mt)
 
nv30mt->level[l].image_offset[f] = offset;
offset += nv30mt->level[l].pitch * u_minify(pt->height0, l);
+   offset = align(offset, 64);
}
 
nv30mt-total_size = offset;
diff --git a/src/gallium/drivers/nv40/nv40_miptree.c 
b/src/gallium/drivers/nv40/nv40_miptree.c
index 89bd155..f7e8b55 100644
--- a/src/gallium/drivers/nv40/nv40_miptree.c
+++ b/src/gallium/drivers/nv40/nv40_miptree.c
@@ -56,6 +56,7 @@ nv40_miptree_layout(struct nv40_miptree *mt)
 
mt->level[l].image_offset[f] = offset;
offset += mt->level[l].pitch * u_minify(pt->height0, l);
+   offset = align(offset, 64);
}
 
mt-total_size = offset;
-- 
1.6.3.3



___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] Correct swizzled surfaces patch

2009-12-29 Thread Luca Barbieri
My swizzling fix incorrectly used the dimensions of the copy rectangle
instead of those of the destination surface. This patch fixes that.
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index ca0c433..481315e 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -168,10 +168,10 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	sub_w = MIN2(sub_w, w - x);
 
 	/* Must be 64-byte aligned */
-	assert(!((dst->offset + nv04_swizzle_bits(dx+x, dy+y, w, h) * util_format_get_blocksize(dst->texture->format)) & 63));
+	assert(!((dst->offset + nv04_swizzle_bits(dx+x, dy+y, dst->width, dst->height) * util_format_get_blocksize(dst->texture->format)) & 63));
 
 	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-	OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(dx+x, dy+y, w, h) * util_format_get_blocksize(dst->texture->format),
+	OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(dx+x, dy+y, dst->width, dst->height) * util_format_get_blocksize(dst->texture->format),
 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
  NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] Fix glTexSubImage on swizzled surfaces on <=NV40

2009-12-29 Thread Luca Barbieri
Currently in nvXX_transfer_new a temporary as large as the surface is created.
If the subrectangle is not the whole texture, we would need to read
back the whole texture, but we don't.
Thus, everything but the specified subrectangle is loaded as garbage.
This can be seen in progs/demos/ray.

This patch fixes the problem by creating a temporary that covers only
the desired subrectangle.

This causes us to hit an alignment assert in nv04_surface_2d.c.
This is fixed by specifying the start of the surface as the surface
offset, and using the _POINT registers to program the (x, y)
coordinates.
This also allows us to avoid computing swizzled addresses on the CPU.

This fixes progs/demos/ray and doesn't seem to introduce problems.

Patch is for all <=NV40 cards, but tested on NV40 only.
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index ca0c433..c14d76d 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -167,19 +167,19 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	  for (x = 0; x  w; x += sub_w) {
 	sub_w = MIN2(sub_w, w - x);
 
-	assert(!((dst->offset + nv04_swizzle_bits(dx+x, dy+y, w, h) * util_format_get_blocksize(dst->texture->format)) & 63));
+	assert(!(dst->offset & 63));
 
 	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-	OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(dx+x, dy+y, w, h) * util_format_get_blocksize(dst->texture->format),
+	OUT_RELOCl(chan, dst_bo, dst->offset,
 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
 	OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
 	OUT_RING  (chan, nv04_scaled_image_format(src->format));
 	OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
-	OUT_RING  (chan, 0);
+	OUT_RING  (chan, (dx + x) | ((dy + y) << NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
 	OUT_RING  (chan, sub_h << NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | sub_w);
-	OUT_RING  (chan, 0);
+	OUT_RING  (chan, (dx + x) | ((dy + y) << NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
 	OUT_RING  (chan, sub_h << NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | sub_w);
 	OUT_RING  (chan, 1 << 20);
 	OUT_RING  (chan, 1 << 20);
@@ -190,9 +189,9 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	OUT_RING  (chan, src_pitch |
 			 NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
 			 NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
-	OUT_RELOCl(chan, src_bo, src->offset + (sy+y) * src_pitch + (sx+x) * util_format_get_blocksize(src->texture->format),
+	OUT_RELOCl(chan, src_bo, src->offset,
 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	OUT_RING  (chan, 0);
+	OUT_RING  (chan, (sx + x) | ((sy + y) << NV04_SCALED_IMAGE_FROM_MEMORY_POINT_Y_SHIFT));
 	  }
 	}
 
diff --git a/src/gallium/drivers/nv40/nv40_transfer.c b/src/gallium/drivers/nv40/nv40_transfer.c
index adfd035..791ee68 100644
--- a/src/gallium/drivers/nv40/nv40_transfer.c
+++ b/src/gallium/drivers/nv40/nv40_transfer.c
@@ -16,14 +16,14 @@ struct nv40_transfer {
 };
 
 static void
-nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
  struct pipe_texture *template)
 {
 	memset(template, 0, sizeof(struct pipe_texture));
 	template-target = pt-target;
 	template-format = pt-format;
-	template-width0 = u_minify(pt-width0, level);
-	template-height0 = u_minify(pt-height0, level);
+	template-width0 = width;
+	template-height0 = height;
 	template-depth0 = 1;
 	template-last_level = 0;
 	template-nr_samples = pt-nr_samples;
@@ -71,7 +71,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx-direct = false;
 
-	nv40_compatible_transfer_tex(pt, level, tx_tex_template);
+	nv40_compatible_transfer_tex(pt, w, h, tx_tex_template);
 
 	tx_tex = pscreen-texture_create(pscreen, tx_tex_template);
 	if (!tx_tex)
@@ -80,6 +80,8 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
+	tx-base.stride = ((struct nv40_miptree*)tx_tex)-level[0].pitch;
+
 	tx-surface = pscreen-get_tex_surface(pscreen, tx_tex,
 	   0, 0, 0,
 	   pipe_transfer_buffer_flags(tx-base));
@@ -105,8 +107,8 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		/* TODO: Check if SIFM can un-swizzle */
 		nvscreen-eng2d-copy(nvscreen-eng2d,
 		  tx-surface, 0, 0,
-		  src, 0, 0,
-		  src-width, src-height);
+		  src, x, y,
+		  w, h);
 
 		pipe_surface_reference(src, NULL);
 	

Re: [Nouveau] [PATCH] Fix glTexSubImage on swizzled surfaces on <=NV40

2009-12-29 Thread Luca Barbieri
Ignore that patch. It's broken because we must set the offset for the
up to 1024x1024 chunk we are copying instead of the whole image.
The corrected patch is attached.
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index ca0c433..3193086 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -167,20 +167,19 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	  for (x = 0; x  w; x += sub_w) {
 	sub_w = MIN2(sub_w, w - x);
 
-	/* Must be 64-byte aligned */
-	assert(!((dst-offset + nv04_swizzle_bits(dx+x, dy+y, w, h) * util_format_get_blocksize(dst-texture-format))  63));
+	assert(!(dst-offset  63));
 
 	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-	OUT_RELOCl(chan, dst_bo, dst-offset + nv04_swizzle_bits(dx+x, dy+y, w, h) * util_format_get_blocksize(dst-texture-format),
+	OUT_RELOCl(chan, dst_bo, dst-offset + nv04_swizzle_bits(x, y, dst-width, dst-height) * util_format_get_blocksize(dst-texture-format),
  NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
 	OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
 	OUT_RING  (chan, nv04_scaled_image_format(src-format));
 	OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
-	OUT_RING  (chan, 0);
+	OUT_RING  (chan, dx | (dy  NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
 	OUT_RING  (chan, sub_h  NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | sub_w);
-	OUT_RING  (chan, 0);
+	OUT_RING  (chan, dx | (dy  NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
 	OUT_RING  (chan, sub_h  NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | sub_w);
 	OUT_RING  (chan, 1  20);
 	OUT_RING  (chan, 1  20);
@@ -190,9 +189,9 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	OUT_RING  (chan, src_pitch |
 			 NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
 			 NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
-	OUT_RELOCl(chan, src_bo, src-offset + (sy+y) * src_pitch + (sx+x) * util_format_get_blocksize(src-texture-format),
+-   OUT_RELOCl(chan, src_bo, src-offset + y * src_pitch + x * util_format_get_blocksize(src-texture-format),
  NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	OUT_RING  (chan, 0);
+	OUT_RING  (chan, sx | (sy  NV04_SCALED_IMAGE_FROM_MEMORY_POINT_Y_SHIFT));
 	  }
 	}
  
diff --git a/src/gallium/drivers/nv40/nv40_transfer.c b/src/gallium/drivers/nv40/nv40_transfer.c
index adfd035..791ee68 100644
--- a/src/gallium/drivers/nv40/nv40_transfer.c
+++ b/src/gallium/drivers/nv40/nv40_transfer.c
@@ -16,14 +16,14 @@ struct nv40_transfer {
 };
 
 static void
-nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
  struct pipe_texture *template)
 {
 	memset(template, 0, sizeof(struct pipe_texture));
 	template-target = pt-target;
 	template-format = pt-format;
-	template-width0 = u_minify(pt-width0, level);
-	template-height0 = u_minify(pt-height0, level);
+	template-width0 = width;
+	template-height0 = height;
 	template-depth0 = 1;
 	template-last_level = 0;
 	template-nr_samples = pt-nr_samples;
@@ -71,7 +71,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx-direct = false;
 
-	nv40_compatible_transfer_tex(pt, level, tx_tex_template);
+	nv40_compatible_transfer_tex(pt, w, h, tx_tex_template);
 
 	tx_tex = pscreen-texture_create(pscreen, tx_tex_template);
 	if (!tx_tex)
@@ -80,6 +80,8 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
+	tx-base.stride = ((struct nv40_miptree*)tx_tex)-level[0].pitch;
+
 	tx-surface = pscreen-get_tex_surface(pscreen, tx_tex,
 	   0, 0, 0,
 	   pipe_transfer_buffer_flags(tx-base));
@@ -105,8 +107,8 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		/* TODO: Check if SIFM can un-swizzle */
 		nvscreen-eng2d-copy(nvscreen-eng2d,
 		  tx-surface, 0, 0,
-		  src, 0, 0,
-		  src-width, src-height);
+		  src, x, y,
+		  w, h);
 
 		pipe_surface_reference(src, NULL);
 	}
@@ -130,9 +132,9 @@ nv40_transfer_del(struct pipe_transfer *ptx)
 
 		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
 		nvscreen-eng2d-copy(nvscreen-eng2d,
-		  dst, 0, 0,
+		  dst, tx-base.x, tx-base.y,
 		  tx-surface, 0, 0,
-		  dst-width, dst-height);
+		  tx-base.width, tx-base.height);
 
 		pipe_surface_reference(dst, NULL);

Re: [Nouveau] [PATCH] Fix glTexSubImage on swizzled surfaces on =NV40

2009-12-29 Thread Luca Barbieri
Third attempt, as the second one was logically wrong.
The problem in the first patch was actually that the source point
register has a 1024 limit.
This one leaves the way the source is set up alone and, like in the
first version, sets the whole surface as the destination, using the
point registers, but only on the destination side (where they do not
seem to have the 1024 limit).

Needs testing on NV40.
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index 40b538f..f0145c8 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -167,20 +190,19 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	  for (x = 0; x  w; x += sub_w) {
 	sub_w = MIN2(sub_w, w - x);
 
-	/* Must be 64-byte aligned */
-	assert(!((dst-offset + nv04_swizzle_bits(dx+x, dy+y, w, h) * util_format_get_blocksize(dst-texture-format))  63));
+	assert(!(dst-offset  63));
 
 	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-	OUT_RELOCl(chan, dst_bo, dst-offset + nv04_swizzle_bits(dx+x, dy+y, w, h) * util_format_get_blocksize(dst-texture-format),
+	OUT_RELOCl(chan, dst_bo, dst-offset,
  NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
 	OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
 	OUT_RING  (chan, nv04_scaled_image_format(src-format));
 	OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
-	OUT_RING  (chan, 0);
+	OUT_RING  (chan, (x + dx) | ((y + dy)  NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
 	OUT_RING  (chan, sub_h  NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | sub_w);
-	OUT_RING  (chan, 0);
+	OUT_RING  (chan, (x + dx) | ((y + dy)  NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
 	OUT_RING  (chan, sub_h  NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | sub_w);
 	OUT_RING  (chan, 1  20);
 	OUT_RING  (chan, 1  20);
diff --git a/src/gallium/drivers/nv40/nv40_transfer.c b/src/gallium/drivers/nv40/nv40_transfer.c
index adfd035..791ee68 100644
--- a/src/gallium/drivers/nv40/nv40_transfer.c
+++ b/src/gallium/drivers/nv40/nv40_transfer.c
@@ -16,14 +16,14 @@ struct nv40_transfer {
 };
 
 static void
-nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
  struct pipe_texture *template)
 {
 	memset(template, 0, sizeof(struct pipe_texture));
 	template-target = pt-target;
 	template-format = pt-format;
-	template-width0 = u_minify(pt-width0, level);
-	template-height0 = u_minify(pt-height0, level);
+	template-width0 = width;
+	template-height0 = height;
 	template-depth0 = 1;
 	template-last_level = 0;
 	template-nr_samples = pt-nr_samples;
@@ -71,7 +71,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx-direct = false;
 
-	nv40_compatible_transfer_tex(pt, level, tx_tex_template);
+	nv40_compatible_transfer_tex(pt, w, h, tx_tex_template);
 
 	tx_tex = pscreen-texture_create(pscreen, tx_tex_template);
 	if (!tx_tex)
@@ -80,6 +80,8 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
+	tx-base.stride = ((struct nv40_miptree*)tx_tex)-level[0].pitch;
+
 	tx-surface = pscreen-get_tex_surface(pscreen, tx_tex,
 	   0, 0, 0,
 	   pipe_transfer_buffer_flags(tx-base));
@@ -105,8 +107,8 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		/* TODO: Check if SIFM can un-swizzle */
 		nvscreen-eng2d-copy(nvscreen-eng2d,
 		  tx-surface, 0, 0,
-		  src, 0, 0,
-		  src-width, src-height);
+		  src, x, y,
+		  w, h);
 
 		pipe_surface_reference(src, NULL);
 	}
@@ -130,9 +132,9 @@ nv40_transfer_del(struct pipe_transfer *ptx)
 
 		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
 		nvscreen-eng2d-copy(nvscreen-eng2d,
-		  dst, 0, 0,
+		  dst, tx-base.x, tx-base.y,
 		  tx-surface, 0, 0,
-		  dst-width, dst-height);
+		  tx-base.width, tx-base.height);
 
 		pipe_surface_reference(dst, NULL);
 	}
@@ -151,8 +153,10 @@ nv40_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	void *map = pipe_buffer_map(pscreen, mt-buffer,
 	pipe_transfer_buffer_flags(ptx));
 
-	return map + ns-base.offset +
-	   ptx-y * ns-pitch + ptx-x * util_format_get_blocksize(ptx-texture-format);
+	if(!tx-direct)
+		return map + ns-base.offset;
+	else
+		return map + ns-base.offset + ptx-y * ns-pitch + ptx-x * util_format_get_blocksize(ptx-texture-format);
 }
 
 static void
diff --git 

[Nouveau] [PATCH] Fix surface_fill alpha

2009-12-29 Thread Luca Barbieri
Currently surface_fill sets alpha incorrectly to 1.0 when drawing to
A8R8G8B8 instead of the correct value.

xf86-video-nouveau has the following comment confirming the issue:
/* When SURFACE_FORMAT_A8R8G8B8 is used with GDI_RECTANGLE_TEXT, the
 * alpha channel gets forced to 0xFF for some reason.  We're using
 * SURFACE_FORMAT_Y32 as a workaround
 */

This patch fixes it by always using SURFACE_FORMAT_Y formats in surface_fill.

diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c
b/src/gallium/drivers/nv04/nv04_surface_2d.c
index 3193086..dfe30c0 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -34,26 +34,6 @@ nv04_surface_format(enum pipe_format format)
 }

 static INLINE int
-nv04_rect_format(enum pipe_format format)
-{
-   switch (format) {
-   case PIPE_FORMAT_A8_UNORM:
-   return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
-   case PIPE_FORMAT_R5G6B5_UNORM:
-   case PIPE_FORMAT_A8L8_UNORM:
-   case PIPE_FORMAT_Z16_UNORM:
-   return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
-   return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
-   default:
-   return -1;
-   }
-}
-
-static INLINE int
 nv04_scaled_image_format(enum pipe_format format)
 {
switch (format) {
@@ -319,13 +299,24 @@ nv04_surface_fill(struct nv04_surface_2d *ctx,
struct pipe_surface *dst,
struct nouveau_grobj *rect = ctx-rect;
struct nouveau_bo *dst_bo = nouveau_bo(ctx-buf(dst));
unsigned dst_pitch = ((struct nv04_surface *)dst)-pitch;
+   int bpp = util_format_get_blocksize(dst-format);
int cs2d_format, gdirect_format;

-   cs2d_format = nv04_surface_format(dst-format);
-   assert(cs2d_format = 0);
-
-   gdirect_format = nv04_rect_format(dst-format);
-   assert(gdirect_format = 0);
+   if(bpp == 1)
+   {
+   gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+   cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
+   }
+   else if(bpp == 2)
+   {
+   gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
+   cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y16;
+   }
+   else
+   {
+   gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+   cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
+   }

MARK_RING (chan, 16, 4);
BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] Synchronization mostly missing?

2009-12-28 Thread Luca Barbieri
It looks like there are two bugs.

One seems related to some kind of GPU cache of GART memory which does
not get flushed, causes significant corruption and is worked around by
putting buffers in VRAM, software TNL or immediate submission.
It may be related to the NV40TCL_VTX_CACHE_INVALIDATE which is in
nouveau_class.h but never used.
Synchronizing and/or waiting after draw_arrays seems to improve things
but does not fully solve them.

However, there is another one, which is still present with buffers in
VRAM but is eliminated if I add syncing with the DMA_FENCE mechanism
at the end of draw_arrays and draw_elements. This one may be more
widely reproducible.

Try running two or more copies of mesa/progs/demos/dinoshade, all visible.
Do you see a flashing yellow region on the floors?
I do. If I add NV40TCL_NOTIFY or DMA_FENCE based synchronization, the
problem disappears.
This also happens if you move around the window, presumably due to the X server.

It seems that kernel FIFO/M2MF-based fencing does indeed wait for
rendering or at least vertex fetch, but that somehow works only if
there is a single application running.
If there are multiple applications, then the DMA_FENCE-based mechanism
waits more and keeps working while kernel FIFO/M2MF-based fencing
fails.

I'm not sure why this is the case though.

Using nVidia ctxprogs had no effect.
The vram_pushbuf option caused an X lockup upon starting the demo.

Another thing that comes to mind (purely speculative) is that the
FIFO/M2MF synchronization may be due to the fact that the GPU component
that reads from the FIFO is the same one that reads the vertices or
other data and is prioritizing that over reading commands, but having
multiple active channels makes that no longer be the case.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] Synchronization mostly missing?

2009-12-28 Thread Luca Barbieri
It looks like there are two bugs.

One seems related to some kind of cache of GART memory which does not
get flushed, causes significant corruption and is worked around by
putting buffers in VRAM.
For some reason, adding syncing instead of putting buffers in VRAM
does seem to greatly reduce the symptoms of this bug and fully removes
them for some programs, but not for all.

However, there is another one, which is still present with buffers in
VRAM but is eliminated if I add syncing with the DMA_FENCE mechanism
at the end of draw_arrays and draw_elements. This one may be more
widely reproducible.

Try running two or more copies of mesa/progs/demos/dinoshade, all visible.
Do you see a flashing yellow region on the floor?
I do. If I add NV40_TCL_NOTIFY or DMA_FENCE based synchronization, the
problem disappears.
This also happens if you move around the window, presumably due to the X server.

It seems that M2MF/FIFO-based fencing does indeed work for our
purposes, but only if there is a single application running.
If there are multiple applications, then the 3D engine DMA_FENCE-based
mechanism somehow waits more and keeps working while FIFO/M2MF-based
fencing fails.

I'm not sure why this is the case though.
Using nVidia ctxprogs has no effect.

Another thing that comes to mind (purely speculative) is that the
FIFO synchronization may be due to the fact that the GPU component
that reads from the FIFO is the same one that reads the vertices or
other data and it prioritizes that over reading commands, but having
multiple contexts makes that no longer be the case.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] Synchronization mostly missing?

2009-12-27 Thread Luca Barbieri
It seems that Nouveau is assuming that once the FIFO pointer is past a
command, that command has finished executing, and all the buffers it
used are no longer needed.

However, this seems to be false at least on G71.
In particular, the card may not have even finished reading the input
vertex buffers when the pushbuffer fence triggers.
While Mesa does not reuse the buffer object itself, the current
allocator tends to return memory that has just been freed, resulting
in the buffer actually being reused.
Thus Mesa will overwrite the vertices before the GPU has used them.

This results in all kinds of artifacts, such as vertices going to
infinity, and random polygons appearing.
This can be seen in progs/demos/engine, progs/demos/dinoshade,
Blender, Extreme Tux Racer and probably any non-trivial OpenGL
software.

The problem can be significantly reduced by just adding a waiting loop
at the end of draw_arrays and draw_elements, or by synchronizing
drawing by adding and calling the following function instead of
pipe-flush in nv40_vbo.c:
I think the remaining artifacts may be due to missing 2D engine
synchronization, but I'm not sure how that works.
Note that this causes the CPU to wait for rendering, which is not the
correct solution

static void nv40_sync(struct nv40_context *nv40)
{
nouveau_notifier_reset(nv40->screen->sync, 0);

//  BEGIN_RING(curie, 0x1d6c, 1);
//  OUT_RING(0x5c0);

//  static int value = 0x23;
//  BEGIN_RING(curie, 0x1d70, 1);
//  OUT_RING(value++);

BEGIN_RING(curie, NV40TCL_NOTIFY, 1);
OUT_RING(0);

BEGIN_RING(curie, NV40TCL_NOP, 1);
OUT_RING(0);

FIRE_RING(NULL);

nouveau_notifier_wait_status(nv40->screen->sync, 0, 0, 0);
}

It seems that NV40TCL_NOTIFY (which must be followed by a nop for some
reason) triggers a notification of rendering completion.
Furthermore, the card will probably put the value set with 0x1d70
somewhere; what 0x1d6c is for is unknown.
The 1d70/1d6c pair is frequently used by the nVidia driver, with 0x1d70
being a sequence number and 0x1d6c always set to 0x5c0, while
NV40TCL_NOTIFY seems to be inserted on demand.
On my machine, setting 0x1d6c/0x1d70 like the nVidia driver does
causes a GPU lockup. That is probably because the location where the
GPU is supposed to put the value has not been set up correctly.

So it seems that the current model is wrong, and the current fence
should only be used to determine whether the pushbuffer itself can be
reused.
It seems that, after figuring out where the GPU writes the value and
how to use the mechanism properly, this should be used by the kernel
driver as the bo->sync_obj implementation.
This will delay destruction of the buffers, and thus prevent
reallocation of them, and artifacts, without synchronizing rendering.

I'm not sure why this hasn't been noticed before though.
Is everyone getting randomly misrendered OpenGL or is my machine
somehow more prone to reusing buffers?

What do you think? Is the analysis correct?
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] Synchronization mostly missing?

2009-12-27 Thread Luca Barbieri
I figured out the registers.

There is a fence/sync mechanism which apparently triggers after
rendering is finished.
There are two ways to use it, but they trigger at the same time
(spinning in a loop on the CPU checking them, they trigger at the same
iteration or in two successive iterations).

The first is the sync notifier, which involves a notifier object set
at NV40TCL_DMA_NOTIFY.
When NV40TCL_NOTIFY, with argument 0, followed by NV40TCL_NOP, with
argument 0 is inserted in the ring, the notifier object will be
notified when rendering is finished.
fbcon uses this to sync rendering.
Currently the Mesa driver sets an object but does not use it.
The renouveau traces use this mechanism only in the
EXT_framebuffer_object tests.
It's not clear what the purpose of the NOP is, but it seems necessary.

The second is the fence mechanism, which involves an object set at
NV40TCL_DMA_FENCE.
When register 0x1d70 is set, the value set there will be written to
the object at the offset programmed in 0x1d6c.
The offset in 0x1d6c must be 16-byte aligned, but the GPU seems to
only write 4 bytes with the sequence number.
Nouveau does not use this currently, and sets NV40TCL_DMA_FENCE to 0.
The nVidia driver uses this often. It allocates a 4KB object and asks
the GPU to put the sequence number always at offset 0x5c0. Why it does
this rather than allocating a 16 byte object and using offset 0 is
unknown.
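
To illustrate (this is only a sketch in the style of the nv40_sync
function quoted earlier, not driver code): assuming fence_map is a CPU
mapping of the object bound to NV40TCL_DMA_FENCE and seq is a
per-channel sequence counter (both names are made up here), the
mechanism could be exercised like this:

static void emit_and_wait_fence(volatile uint32_t *fence_map, uint32_t seq)
{
	BEGIN_RING(curie, 0x1d6c, 1); /* offset inside the fence object, 16-byte aligned */
	OUT_RING(0);

	BEGIN_RING(curie, 0x1d70, 1); /* sequence number the GPU will write there */
	OUT_RING(seq);

	FIRE_RING(NULL);

	/* the GPU stores seq at offset 0 once rendering has finished;
	 * a real implementation would wait in the kernel instead of spinning */
	while (fence_map[0] != seq)
		;
}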

IMHO the fence mechanism should be implemented in the kernel along
with the current FIFO fencing, and should protect the relocated buffer
object.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] Synchronization mostly missing?

2009-12-27 Thread Luca Barbieri
 Can you reproduce this with your vertex buffers in VRAM instead of GART?
 (to rule out that it's a fencing issue).

Putting the vertex buffers in VRAM makes things almost perfect, but
still with rare artifacts.
In particular, the yellow arrow in dinoshade sometimes becomes a
yellow polygon on the floor, which happens almost every frame if I
move the window around.
It does fix demos/engine and blender, and etracer is almost perfect.

Using my sync patch fixes demos/engine and demos/dinoshade, but still
leaves artifacts in blender when moving the rectangle and artifacts in
etracer.

Putting the vertex buffers in VRAM _AND_ adding my sync patch makes
things perfect on my system.

Using sync + a delay loop before drawing makes things better but still
problematic.

Also note that adding wbinvd in the kernel at the start of push
buffer submission, running with nopat and synchronizing with the
current fence in the kernel all had no effect on demos/engine artifacts.

Preventing loading of intel_agp did not seem to have any effect either
(but strangely, it still listed the aperture size, not sure what's up
there).

The last test I tried was, all together:
1. My nv40_sync patch
2. 3 wbinvd followed by spinning 1 times in the kernel at the
start of pushbuffer validation
3. Adding
BEGIN_RING(curie, NV40TCL_VTX_CACHE_INVALIDATE, 1);
OUT_RING(0);
before and after draw_elements and draw_arrays
4. Removing intel_agp

The logo on etracer's splash screen still, on some frames, flickered.
Only putting vertex buffers in VRAM fixed that.

I'm not really sure what is happening there.

It seems that there is the lack of synchronization plus some other problem.

Maybe there is indeed an on-GPU cache for AGP/PCI memory which isn't
getting flushed.
Maybe NV40TCL_VTX_CACHE_INVALIDATE should be used but not in the way I did.
I couldn't find it in renouveau traces; who reverse engineered that?
What does that do?

Also, what happens when I remove intel_agp? Does it use PCI DMA?

BTW, it seems to me that adding the fencing mechanism I described is
necessary even if the vertices are read before the FIFO continues,
since rendering is not completed and currently I don't see anything
preventing TTM from, for instance, evicting the render buffer while it
is being rendered to.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [MESA PATCH] Fix nv40_miptree_layout pitch

2009-12-26 Thread Luca Barbieri
On Sun, Dec 27, 2009 at 2:25 AM, Younes Manton youne...@gmail.com wrote:
 On Sat, Dec 26, 2009 at 1:22 AM, Luca Barbieri l...@luca-barbieri.com wrote:
 I just coded a patch that does this and seems to work fine. It must be
 fixed since it breaks OpenGL (or the state tracker can be changed, but
 it seems better to do it in the driver).

 The patch also fixes NV20 and NV30 in the same way. They compile but
 are untested.

 I would guess that using the 3D engine is faster for the larger
 levels, but the 2D engine is faster for the smaller ones (and lacks
 this issue).

 Hi, this patch and the other swizzle related one seem to have been
 mangled, they're wrapped with hard line breaks so I can't apply them.
 Can you resend? The cmp/scs one applied fine.


Sorry, I hoped gmail wouldn't mangle it, but apparently I was wrong.

Attached the patch, will test once I get the message back.

I'll post the other patch in reply to the other message.
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index 42c77e5..4c3e08a 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -23,6 +23,9 @@
 #define NOUVEAU_BUFFER_USAGE_ZETA (1  17)
 #define NOUVEAU_BUFFER_USAGE_TRANSFER (1  18)
 
+/* use along with GPU_WRITE for 2D-only writes */
+#define NOUVEAU_BUFFER_USAGE_NO_RENDER (1  19)
+
 extern struct pipe_screen *
 nv04_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
 
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index 12df7fd..40b538f 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -491,3 +500,49 @@ nv04_surface_2d_init(struct nouveau_screen *screen)
 	ctx-fill = nv04_surface_fill;
 	return ctx;
 }
+
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns)
+{
+	int temp_flags;
+
+	// printf(creating temp, flags is %i!\n, flags);
+
+	if(ns-base.usage  PIPE_BUFFER_USAGE_DISCARD)
+	{
+		temp_flags = ns-base.usage | PIPE_BUFFER_USAGE_GPU_READ;
+		ns-base.usage = PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER | PIPE_BUFFER_USAGE_DISCARD;
+	}
+	else
+	{
+		temp_flags = ns-base.usage | PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE;
+		ns-base.usage = PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER | PIPE_BUFFER_USAGE_GPU_READ;
+	}
+
+	struct nv40_screen* screen = (struct nv40_screen*)pscreen;
+	ns-base.usage = PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE;
+
+	struct pipe_texture templ;
+	memset(templ, 0, sizeof(templ));
+	templ.format = ns-base.texture-format;
+	templ.target = PIPE_TEXTURE_2D;
+	templ.width0 = ns-base.width;
+	templ.height0 = ns-base.height;
+	templ.depth0 = 1;
+	templ.last_level = 0;
+
+	// TODO: this is probably wrong and we should specifically handle multisampling somehow once it is implemented
+	templ.nr_samples = ns-base.texture-nr_samples;
+
+	templ.tex_usage = ns-base.texture-tex_usage | PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+	struct pipe_texture* temp_tex = pscreen-texture_create(pscreen, templ);
+	struct nv04_surface* temp_ns = (struct nv04_surface*)pscreen-get_tex_surface(pscreen, temp_tex, 0, 0, 0, temp_flags);
+	temp_ns-backing = ns;
+
+	if(ns-base.usage  PIPE_BUFFER_USAGE_GPU_READ)
+		eng2d-copy(eng2d, temp_ns-backing-base, 0, 0, ns-base, 0, 0, ns-base.width, ns-base.height);
+
+	return temp_ns;
+}
+
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.h b/src/gallium/drivers/nv04/nv04_surface_2d.h
index 02b3f56..ce696a1 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.h
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.h
@@ -4,6 +4,7 @@
 struct nv04_surface {
 	struct pipe_surface base;
 	unsigned pitch;
+	struct nv04_surface* backing;
 };
 
 struct nv04_surface_2d {
@@ -30,4 +31,7 @@ nv04_surface_2d_init(struct nouveau_screen *screen);
 void
 nv04_surface_2d_takedown(struct nv04_surface_2d **);
 
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns);
+
 #endif
diff --git a/src/gallium/drivers/nv20/nv20_miptree.c b/src/gallium/drivers/nv20/nv20_miptree.c
index d1291a9..8f7538e 100644
--- a/src/gallium/drivers/nv20/nv20_miptree.c
+++ b/src/gallium/drivers/nv20/nv20_miptree.c
@@ -6,6 +6,7 @@
 
 #include nv20_context.h
 #include nv20_screen.h
+#include ../nv04/nv04_surface_2d.h
 
 static void
 nv20_miptree_layout(struct nv20_miptree *nv20mt)
@@ -127,6 +128,12 @@ nv20_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt)
 	if (pt-tex_usage  PIPE_TEXTURE_USAGE_DYNAMIC)
 		buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE;
 
+	/* apparently we can't render to swizzled surfaces smaller than 64 bytes, so make them linear.
+	 * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy

Re: [Nouveau] Fix swizzling for copies to rectangular textures

2009-12-26 Thread Luca Barbieri
Patch was mangled, resent attached.
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index 12df7fd..40b538f 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -77,7 +77,7 @@ nv04_scaled_image_format(enum pipe_format format)
 }
 
 static INLINE unsigned
-nv04_swizzle_bits(unsigned x, unsigned y)
+nv04_swizzle_bits_square(unsigned x, unsigned y)
 {
 	unsigned u = (x  0x001)  0 |
 	 (x  0x002)  1 |
@@ -107,6 +107,15 @@ nv04_swizzle_bits(unsigned x, unsigned y)
 	return v | u;
 }
 
+/* rectangular swizzled textures are linear concatenations of swizzled square tiles */
+static INLINE unsigned
+nv04_swizzle_bits(unsigned x, unsigned y, unsigned w, unsigned h)
+{
+	unsigned s = MIN2(w, h);
+	unsigned m = s - 1;
+	return (((x | y)  ~m) * s) | nv04_swizzle_bits_square(x  m, y  m);
+}
+
 static int
 nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 			  struct pipe_surface *dst, int dx, int dy,
@@ -159,10 +168,10 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	sub_w = MIN2(sub_w, w - x);
 
 	/* Must be 64-byte aligned */
-	assert(!((dst-offset + nv04_swizzle_bits(dx+x, dy+y) * util_format_get_blocksize(dst-texture-format))  63));
+	assert(!((dst-offset + nv04_swizzle_bits(dx+x, dy+y, w, h) * util_format_get_blocksize(dst-texture-format))  63));
 
 	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-	OUT_RELOCl(chan, dst_bo, dst-offset + nv04_swizzle_bits(dx+x, dy+y) * util_format_get_blocksize(dst-texture-format),
+	OUT_RELOCl(chan, dst_bo, dst-offset + nv04_swizzle_bits(dx+x, dy+y, w, h) * util_format_get_blocksize(dst-texture-format),
  NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [MESA PATCH] Fix nv40_miptree_layout pitch

2009-12-25 Thread Luca Barbieri
This patch fixes two issues in nv40_miptree_layout.

First, pt->width0 is used, which is the size of the whole texture,
while width, which is the size of the mipmap level, should be used.

Second, the current code does not 64-byte align the pitch of swizzled
textures. However, on my NV40 this causes a pgraph error regarding the
pitch register (and sometimes a system lockup too), which is fixed by
this patch.
I'm not sure how small mipmaps could have worked with the previous code.

Also the offset code below may need some review.
And furthermore, wide_pitch is set for any kind of texture usage, so
maybe it should be made unconditional (what's the point of allocating
a texture that the GPU can't use in any way?).

diff --git a/src/gallium/drivers/nv40/nv40_miptree.c
b/src/gallium/drivers/nv40/nv40_miptree.c
index b974e68..9f54187 100644
--- a/src/gallium/drivers/nv40/nv40_miptree.c
+++ b/src/gallium/drivers/nv40/nv40_miptree.c
@@ -31,8 +31,8 @@ nv40_miptree_layout(struct nv40_miptree *mt)
}

for (l = 0; l = pt-last_level; l++) {
-   if (wide_pitch  (pt-tex_usage  
NOUVEAU_TEXTURE_USAGE_LINEAR))
-   mt-level[l].pitch = 
align(util_format_get_stride(pt-format,
pt-width0), 64);
+   if (wide_pitch)
+   mt-level[l].pitch = 
align(util_format_get_stride(pt-format, width), 64);
else
mt-level[l].pitch = util_format_get_stride(pt-format, 
width);
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [MESA PATCH] Fix nv40_miptree_layout pitch

2009-12-25 Thread Luca Barbieri
You are right. The patch is wrong. Both changes fix my program, but do
break OpenGL (e.g. redbook/mipmap).

I managed to reproduce the problem with perf/genmipmap.

When run, it causes several instances of one of these 3 errors (using
swizzled textures):
[12949.125732] [drm] nouveau :01:00.0: PGRAPH_ERROR - nSource:
DATA_ERROR, nStatus: BAD_ARGUMENT
[12949.125738] [drm] nouveau :01:00.0: PGRAPH_ERROR - Ch 3/7 Class
0x4097 Mthd 0x020c Data 0x:0x0008
[12949.214750] [drm] nouveau :01:00.0: PGRAPH_ERROR - nSource:
DATA_ERROR, nStatus: BAD_ARGUMENT
[12949.214757] [drm] nouveau :01:00.0: PGRAPH_ERROR - Ch 3/7 Class
0x4097 Mthd 0x020c Data 0x:0x0010
[12951.752081] [drm] nouveau :01:00.0: PGRAPH_ERROR - nSource:
DATA_ERROR, nStatus: BAD_ARGUMENT
[12951.752088] [drm] nouveau :01:00.0: PGRAPH_ERROR - Ch 3/7 Class
0x4097 Mthd 0x020c Data 0x:0x0020

It seems they are due to PGRAPH not liking an 8/16/32 pitch.
In my program I got these as well and narrowed it down to doing mipmap
generation on the small levels that have pitch set that way.

This patch does make them go away but breaks progs/mipmap (both
changes are wrong).

Apparently the miptree layout is correct, but the lowest mipmap levels
cannot be rendered to with the current code (which nevertheless tries
to, resulting in the errors), possibly due to a hardware limitation.

I guess a possible solution could be to modify
nv40_miptree_surface_new to allocate temporary surfaces for pitch < 64
levels (i.e. 8x, 4x and 1x mipmap levels for RGBA) and then do a copy
with the 2D engine in nv40_miptree_surface_del.

Alternatively, for square POT textures, it may be possible to map the
8x8, 4x4, 2x2 and 1x1 mipmaps as a single 16x16 pitch 64 swizzled
texture in which they should appear as rectangles, and then restrict
drawing to the rectangle by adjusting the viewport (and finding a way
to make bypass work too). Not sure if this works and whether it can be
generalized to non-square-POT textures.

How does the nVidia driver implement glGenerateMipmap?
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [MESA PATCH] Fix nv40_miptree_layout pitch

2009-12-25 Thread Luca Barbieri
I just coded a patch that does this and seems to work fine. It must be
fixed since it breaks OpenGL (or the state tracker can be changed, but
it seems better to do it in the driver).

The patch also fixes NV20 and NV30 in the same way. They compile but
are untested.

I would guess that using the 3D engine is faster for the larger
levels, but the 2D engine is faster for the smaller ones (and lacks
this issue).

diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h
b/src/gallium/drivers/nouveau/nouveau_winsys.h
index 42c77e5..4c3e08a 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -23,6 +23,9 @@
 #define NOUVEAU_BUFFER_USAGE_ZETA (1  17)
 #define NOUVEAU_BUFFER_USAGE_TRANSFER (1  18)

+/* use along GPU_WRITE for 2D-only writes */
+#define NOUVEAU_BUFFER_USAGE_NO_RENDER (1  19)
+
 extern struct pipe_screen *
 nv04_screen_create(struct pipe_winsys *ws, struct nouveau_device *);

diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c
b/src/gallium/drivers/nv04/nv04_surface_2d.c
index 12df7fd..16e8379 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -491,3 +501,49 @@ nv04_surface_2d_init(struct nouveau_screen *screen)
ctx-fill = nv04_surface_fill;
return ctx;
 }
+
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct
nv04_surface_2d* eng2d, struct nv04_surface* ns)
+{
+   int temp_flags;
+
+   // printf(creating temp, flags is %i!\n, flags);
+
+   if(ns-base.usage  PIPE_BUFFER_USAGE_DISCARD)
+   {
+   temp_flags = ns-base.usage | PIPE_BUFFER_USAGE_GPU_READ;
+   ns-base.usage = PIPE_BUFFER_USAGE_GPU_WRITE |
NOUVEAU_BUFFER_USAGE_NO_RENDER | PIPE_BUFFER_USAGE_DISCARD;
+   }
+   else
+   {
+   temp_flags = ns-base.usage | PIPE_BUFFER_USAGE_GPU_READ |
PIPE_BUFFER_USAGE_GPU_WRITE;
+   ns-base.usage = PIPE_BUFFER_USAGE_GPU_WRITE |
NOUVEAU_BUFFER_USAGE_NO_RENDER | PIPE_BUFFER_USAGE_GPU_READ;
+   }
+
+   struct nv40_screen* screen = (struct nv40_screen*)pscreen;
+   ns-base.usage = PIPE_BUFFER_USAGE_GPU_READ | 
PIPE_BUFFER_USAGE_GPU_WRITE;
+
+   struct pipe_texture templ;
+   memset(templ, 0, sizeof(templ));
+   templ.format = ns-base.texture-format;
+   templ.target = PIPE_TEXTURE_2D;
+   templ.width0 = ns-base.width;
+   templ.height0 = ns-base.height;
+   templ.depth0 = 1;
+   templ.last_level = 0;
+
+   // TODO: this is probably wrong and we should specifically handle
multisampling somehow once it is implemented
+   templ.nr_samples = ns-base.texture-nr_samples;
+
+   templ.tex_usage = ns-base.texture-tex_usage |
PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+   struct pipe_texture* temp_tex = pscreen-texture_create(pscreen, 
templ);
+   struct nv04_surface* temp_ns = (struct
nv04_surface*)pscreen-get_tex_surface(pscreen, temp_tex, 0, 0, 0,
temp_flags);
+   temp_ns-backing = ns;
+
+   if(ns-base.usage  PIPE_BUFFER_USAGE_GPU_READ)
+   eng2d-copy(eng2d, temp_ns-backing-base, 0, 0, ns-base, 0, 
0,
ns-base.width, ns-base.height);
+
+   return temp_ns;
+}
+
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.h
b/src/gallium/drivers/nv04/nv04_surface_2d.h
index 02b3f56..ce696a1 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.h
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.h
@@ -4,6 +4,7 @@
 struct nv04_surface {
struct pipe_surface base;
unsigned pitch;
+   struct nv04_surface* backing;
 };

 struct nv04_surface_2d {
@@ -30,4 +31,7 @@ nv04_surface_2d_init(struct nouveau_screen *screen);
 void
 nv04_surface_2d_takedown(struct nv04_surface_2d **);

+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct
nv04_surface_2d* eng2d, struct nv04_surface* ns);
+
 #endif
diff --git a/src/gallium/drivers/nv20/nv20_miptree.c
b/src/gallium/drivers/nv20/nv20_miptree.c
index d1291a9..8f7538e 100644
--- a/src/gallium/drivers/nv20/nv20_miptree.c
+++ b/src/gallium/drivers/nv20/nv20_miptree.c
@@ -6,6 +6,7 @@

 #include nv20_context.h
 #include nv20_screen.h
+#include ../nv04/nv04_surface_2d.h

 static void
 nv20_miptree_layout(struct nv20_miptree *nv20mt)
@@ -127,6 +128,12 @@ nv20_miptree_create(struct pipe_screen *screen,
const struct pipe_texture *pt)
if (pt-tex_usage  PIPE_TEXTURE_USAGE_DYNAMIC)
buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE;

+   /* apparently we can't render to swizzled surfaces smaller than 64
bytes, so make them linear.
+* If the user did not ask for a render target, they can still
render to it, but it will cost them an extra copy.
+* This also happens for small mipmaps of large textures. */
+   if (pt-tex_usage  PIPE_TEXTURE_USAGE_RENDER_TARGET 
util_format_get_stride(pt-format, pt-width0)  64)
+   mt-base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+

[Nouveau] Fix swizzling for copies to rectangular textures

2009-12-25 Thread Luca Barbieri
nVidia hardware seems to swizzle rectangular texture (with width !=
height) coordinates by swizzling the lower bits and then adding the
higher bits from the larger dimension.
However, nv04_swizzle_bits ignores width and height and just
interleaves everything.
This causes problems with rectangular POT textures with height or
width 2048 or 4096 (but not 2048x1024 where it works by chance) since
the driver swizzles them in 1024x1024 chunks and gets the start
position for the non-first chunks wrong.
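
As an illustration of the layout (standalone C, not part of the patch;
morton2d here stands in for the bit interleaving that
nv04_swizzle_bits_square does in the patch below): a rectangular
swizzled texture is a linear run of square MIN2(w, h)-sized
Morton-order tiles, so e.g. in a 4096x1024 texture, texel (x, y) lives
in tile x/1024, at Morton offset (x % 1024, y) inside it.

static unsigned morton2d(unsigned x, unsigned y)
{
	unsigned r = 0, i;
	for (i = 0; i < 16; i++)
		r |= ((x >> i) & 1) << (2 * i) | ((y >> i) & 1) << (2 * i + 1);
	return r;
}

static unsigned swizzle_rect(unsigned x, unsigned y, unsigned w, unsigned h)
{
	unsigned s = w < h ? w : h; /* side of the square tiles */
	unsigned m = s - 1;
	/* tile index times texels per tile, plus the Z-order offset inside the tile */
	return ((x | y) & ~m) * s + morton2d(x & m, y & m);
}
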
The following patch seems to fix those problems.

diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c
b/src/gallium/drivers/nv04/nv04_surface_2d.c
index 12df7fd..40b538f 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -77,7 +77,7 @@ nv04_scaled_image_format(enum pipe_format format)
 }

 static INLINE unsigned
-nv04_swizzle_bits(unsigned x, unsigned y)
+nv04_swizzle_bits_square(unsigned x, unsigned y)
 {
unsigned u = (x  0x001)  0 |
 (x  0x002)  1 |
@@ -107,6 +107,15 @@ nv04_swizzle_bits(unsigned x, unsigned y)
return v | u;
 }

+/* rectangular swizzled textures are linear concatenations of
swizzled square tiles */
+static INLINE unsigned
+nv04_swizzle_bits(unsigned x, unsigned y, unsigned w, unsigned h)
+{
+   unsigned s = MIN2(w, h);
+   unsigned m = s - 1;
+   return (((x | y)  ~m) * s) | nv04_swizzle_bits_square(x  m, y  m);
+}
+
 static int
 nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
  struct pipe_surface *dst, int dx, int dy,
@@ -159,10 +168,10 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
sub_w = MIN2(sub_w, w - x);

/* Must be 64-byte aligned */
-   assert(!((dst-offset + nv04_swizzle_bits(dx+x, dy+y) *
util_format_get_blocksize(dst-texture-format))  63));
+   assert(!((dst-offset + nv04_swizzle_bits(dx+x, dy+y, w, h) *
util_format_get_blocksize(dst-texture-format))  63));

BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-   OUT_RELOCl(chan, dst_bo, dst-offset + nv04_swizzle_bits(dx+x,
dy+y) * util_format_get_blocksize(dst-texture-format),
+   OUT_RELOCl(chan, dst_bo, dst-offset + nv04_swizzle_bits(dx+x,
dy+y, w, h) * util_format_get_blocksize(dst-texture-format),
  NOUVEAU_BO_GART | NOUVEAU_BO_VRAM |
NOUVEAU_BO_WR);

BEGIN_RING(chan, sifm, 
NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH] NV30/NV40 CMP and SCS src == dst handling

2009-12-25 Thread Luca Barbieri
CMP and SCS can produce incorrect results if the source and
destination are the same.
This patch should fix the issues.
CMP is fixed by predicating both moves.
SCS is fixed by changing the order if the source component is X.
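
To see why the CMP ordering matters, here is a plain-C analogy of the
hazard (illustration only, not driver code; the real fix below works on
condition codes and predicated MOVs). TGSI CMP computes
dst = (src0 < 0) ? src1 : src2:

static void cmp_old(float *dst, const float *src0, float src1, float src2)
{
	*dst = src2;      /* if dst aliases src0, the condition value is lost here */
	if (*src0 < 0.0f) /* ...so this may test the freshly written src2 instead  */
		*dst = src1;
}

static void cmp_fixed(float *dst, const float *src0, float src1, float src2)
{
	float cc = *src0; /* read the condition first, then write under both predicates */
	if (cc >= 0.0f)
		*dst = src2;
	if (cc < 0.0f)
		*dst = src1;
}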


diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c
b/src/gallium/drivers/nv30/nv30_fragprog.c
index 40965a9..dc4e583 100644
--- a/src/gallium/drivers/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nv30/nv30_fragprog.c
@@ -435,10 +435,11 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
break;
case TGSI_OPCODE_CMP:
-   tmp = temp(fpc);
-   arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+   tmp = nv30_sr(NV30SR_NONE, 0);
tmp.cc_update = 1;
arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+   dst.cc_test = NV30_VP_INST_COND_GE;
+   arith(fpc, sat, MOV, dst, mask, src[2], none, none);
dst.cc_test = NV30_VP_INST_COND_LT;
arith(fpc, sat, MOV, dst, mask, src[1], none, none);
break;
@@ -517,13 +518,28 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
arith(fpc, sat, RSQ, dst, mask, abs(swz(src[0], X, X, X, X)), 
none, none);
break;
case TGSI_OPCODE_SCS:
-   if (mask  MASK_X) {
-   arith(fpc, sat, COS, dst, MASK_X,
- swz(src[0], X, X, X, X), none, none);
+   /* avoid overwriting the source */
+   if(src[0].swz[SWZ_X] != SWZ_X)
+   {
+   if (mask  MASK_X) {
+   arith(fpc, sat, COS, dst, MASK_X,
+ swz(src[0], X, X, X, X), none, none);
+   }
+   if (mask  MASK_Y) {
+   arith(fpc, sat, SIN, dst, MASK_Y,
+ swz(src[0], X, X, X, X), none, none);
+   }
}
-   if (mask  MASK_Y) {
-   arith(fpc, sat, SIN, dst, MASK_Y,
- swz(src[0], X, X, X, X), none, none);
+   else
+   {
+   if (mask  MASK_Y) {
+   arith(fpc, sat, SIN, dst, MASK_Y,
+ swz(src[0], X, X, X, X), none, none);
+   }
+   if (mask  MASK_X) {
+   arith(fpc, sat, COS, dst, MASK_X,
+ swz(src[0], X, X, X, X), none, none);
+   }
}
break;
case TGSI_OPCODE_SIN:
diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c
b/src/gallium/drivers/nv40/nv40_fragprog.c
index 1bf1672..468d350 100644
--- a/src/gallium/drivers/nv40/nv40_fragprog.c
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -445,10 +445,11 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
break;
case TGSI_OPCODE_CMP:
-   tmp = temp(fpc);
-   arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+   tmp = nv40_sr(NV40SR_NONE, 0);
tmp.cc_update = 1;
arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+   dst.cc_test = NV40_VP_INST_COND_GE;
+   arith(fpc, sat, MOV, dst, mask, src[2], none, none);
dst.cc_test = NV40_VP_INST_COND_LT;
arith(fpc, sat, MOV, dst, mask, src[1], none, none);
break;
@@ -573,13 +574,28 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
  neg(swz(tmp, X, X, X, X)), none, none);
break;
case TGSI_OPCODE_SCS:
-   if (mask  MASK_X) {
-   arith(fpc, sat, COS, dst, MASK_X,
- swz(src[0], X, X, X, X), none, none);
+   /* avoid overwriting the source */
+   if(src[0].swz[SWZ_X] != SWZ_X)
+   {
+   if (mask  MASK_X) {
+   arith(fpc, sat, COS, dst, MASK_X,
+ swz(src[0], X, X, X, X), none, none);
+   }
+   if (mask  MASK_Y) {
+   arith(fpc, sat, SIN, dst, MASK_Y,
+ swz(src[0], X, X, X, X), none, none);
+   }
}
-   if (mask  MASK_Y) {
-   arith(fpc, sat, SIN, dst, MASK_Y,
- swz(src[0], X, X, X, X), none, none);
+   else
+   {
+   if (mask  MASK_Y) {
+   arith(fpc, sat, SIN, dst, MASK_Y,
+ swz(src[0],